Production scraping pipelines need to handle CAPTCHAs automatically — no manual intervention. This guide shows how to build automated scrapers with CaptchaAI integrated for CAPTCHA solving, error recovery, and scheduling.
Architecture Overview
[Scheduler] → [URL Queue] → [Scraper Workers] → [CAPTCHA Solver] → [Data Store]
↕
[Proxy Rotator]
Each component:
- Scheduler: Triggers scraping jobs (cron, task queue)
- URL Queue: Manages URLs to scrape
- Scraper Workers: Fetch pages, detect CAPTCHAs
- CAPTCHA Solver: CaptchaAI API handles all CAPTCHA types
- Proxy Rotator: Distributes requests across IPs
Core Scraper with CAPTCHA Handling
import requests
import time
import logging
from bs4 import BeautifulSoup
# Configure the root logger once at import time so worker output is visible.
logging.basicConfig(level=logging.INFO)
# Module-level logger, standard PEP 282 convention.
logger = logging.getLogger(__name__)
# CaptchaAI API key — replace with your real key; keep it out of version control.
API_KEY = "YOUR_API_KEY"
class AutomatedScraper:
    """Fetch pages, detect CAPTCHAs, and solve them via the CaptchaAI HTTP API.

    Retries failed fetches with exponential backoff and keeps rough
    pages/captchas/errors counters in ``self.stats``.
    """

    # CaptchaAI 2captcha-compatible endpoints (submit task / poll result).
    SUBMIT_URL = "https://ocr.captchaai.com/in.php"
    RESULT_URL = "https://ocr.captchaai.com/res.php"

    def __init__(self, api_key, max_retries=3):
        """
        Args:
            api_key: CaptchaAI API key.
            max_retries: fetch attempts per URL before giving up.
        """
        self.api_key = api_key
        self.max_retries = max_retries
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })
        # NOTE: incremented from worker threads; fine for rough counters but
        # not strictly atomic — don't rely on exact values under concurrency.
        self.stats = {"pages": 0, "captchas": 0, "errors": 0}

    def scrape(self, url):
        """Fetch *url*, solving a CAPTCHA if one is detected.

        Returns the page HTML. Raises the last exception after
        ``max_retries`` failed attempts.
        """
        for attempt in range(self.max_retries):
            try:
                resp = self.session.get(url, timeout=30)
                if self._is_captcha(resp.text):
                    self.stats["captchas"] += 1
                    logger.info(f"CAPTCHA detected on {url}")
                    resp = self._solve_and_retry(resp.text, url)
                self.stats["pages"] += 1
                return resp.text
            except Exception as e:
                self.stats["errors"] += 1
                logger.error(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt == self.max_retries - 1:
                    raise
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...
        # Only reachable when max_retries <= 0; fail loudly instead of
        # silently returning None.
        raise RuntimeError(f"scrape() called with max_retries={self.max_retries}")

    def _is_captcha(self, html):
        """Heuristic: does the page contain a known CAPTCHA widget marker?

        NOTE: the bare "captcha" marker can false-positive on pages that
        merely mention the word.
        """
        return any(m in html.lower() for m in
                   ["g-recaptcha", "cf-turnstile", "h-captcha", "captcha"])

    def _solve_and_retry(self, html, url):
        """Identify the CAPTCHA widget in *html*, solve it, and resubmit.

        Raises if no recognized widget is found.
        """
        soup = BeautifulSoup(html, "html.parser")
        # reCAPTCHA v2 widget
        rc = soup.find("div", class_="g-recaptcha")
        if rc:
            token = self._solve("userrecaptcha", {
                "googlekey": rc["data-sitekey"],
                "pageurl": url
            })
            return self.session.post(url, data={"g-recaptcha-response": token})
        # Cloudflare Turnstile widget
        ts = soup.find("div", class_="cf-turnstile")
        if ts:
            token = self._solve("turnstile", {
                "sitekey": ts["data-sitekey"],
                "pageurl": url
            })
            return self.session.post(url, data={"cf-turnstile-response": token})
        raise Exception("Unrecognized CAPTCHA type")

    def _solve(self, method, params):
        """Submit a solve task to CaptchaAI and poll until a token is ready.

        Args:
            method: CaptchaAI method name (e.g. "userrecaptcha", "turnstile").
            params: method-specific parameters; not mutated.

        Returns the solved token. Raises on API errors, TimeoutError after
        ~5 minutes of polling.
        """
        # Copy so we don't mutate the caller's dict.
        payload = dict(params, key=self.api_key, method=method)
        # timeout= prevents a hung API call from stalling a worker forever.
        resp = requests.get(self.SUBMIT_URL, params=payload, timeout=30)
        if not resp.text.startswith("OK|"):
            raise Exception(f"Submit error: {resp.text}")
        task_id = resp.text.split("|")[1]
        for _ in range(60):  # poll every 5s for up to 5 minutes
            time.sleep(5)
            result = requests.get(self.RESULT_URL, params={
                "key": self.api_key, "action": "get", "id": task_id
            }, timeout=30)
            if result.text == "CAPCHA_NOT_READY":  # sic — actual API spelling
                continue
            if result.text.startswith("OK|"):
                return result.text.split("|")[1]
            raise Exception(f"Solve error: {result.text}")
        raise TimeoutError("Solve timed out")

    def get_stats(self):
        """Return the pages/captchas/errors counter dict."""
        return self.stats
Batch Processing with Queue
from queue import Empty, Queue
from threading import Thread
def worker(scraper, url_queue, results, delay=2):
    """Drain *url_queue*, scraping each URL and appending an outcome dict
    to *results* ({"url", "html"/"error", "status"}).

    Fixes the check-then-get race of ``while not q.empty(): q.get()``:
    with several workers, another thread can take the last item between
    the ``empty()`` check and ``get()``, leaving this worker blocked
    forever. ``get_nowait()`` + ``Empty`` drains the queue safely.

    Args:
        delay: politeness pause (seconds) between requests; default keeps
            the original 2-second behavior.
    """
    while True:
        try:
            url = url_queue.get_nowait()
        except Empty:
            break  # queue drained — worker exits cleanly
        try:
            html = scraper.scrape(url)
            results.append({"url": url, "html": html, "status": "success"})
        except Exception as e:
            results.append({"url": url, "error": str(e), "status": "failed"})
        finally:
            url_queue.task_done()
        time.sleep(delay)
def scrape_batch(urls, num_workers=3):
    """Scrape every URL in *urls* concurrently with a shared scraper.

    Spawns *num_workers* worker threads over a shared queue, waits for
    all of them, logs the scraper's counters, and returns the list of
    per-URL result dicts.
    """
    scraper = AutomatedScraper(API_KEY)
    pending = Queue()
    for target in urls:
        pending.put(target)

    collected = []
    pool = [
        Thread(target=worker, args=(scraper, pending, collected))
        for _ in range(num_workers)
    ]
    for thread in pool:
        thread.start()
    for thread in pool:
        thread.join()

    logger.info(f"Stats: {scraper.get_stats()}")
    return collected
Scheduling with Cron
Create a script that runs on a schedule:
# scheduled_scrape.py
import json
import sys
def run_scheduled_scrape():
    """Run one scheduled batch scrape and persist results to a JSON file.

    Writes ``results_<unix-timestamp>.json`` in the working directory and
    prints a success/failure summary for the cron log.
    """
    # Local import: the original standalone snippet only imported json/sys,
    # so `time` would raise NameError at runtime.
    import time

    urls = [
        "https://example.com/page/1",
        "https://example.com/page/2",
        "https://example.com/page/3",
    ]
    # NOTE(review): when this file runs standalone, scrape_batch must be
    # importable (e.g. `from scraper import scrape_batch`) — confirm.
    results = scrape_batch(urls)

    # Save results with a unique, timestamped filename.
    with open(f"results_{int(time.time())}.json", "w") as f:
        json.dump(results, f, indent=2)

    # Report stats for the cron log.
    success = sum(1 for r in results if r["status"] == "success")
    failed = sum(1 for r in results if r["status"] == "failed")
    print(f"Completed: {success} success, {failed} failed")


if __name__ == "__main__":
    run_scheduled_scrape()
Add to crontab:
0 */6 * * * cd /path/to/scraper && python scheduled_scrape.py
Error Recovery Patterns
def scrape_with_recovery(scraper, urls, checkpoint_file="checkpoint.json"):
    """Scrape *urls*, checkpointing completed ones so a crashed run resumes.

    The checkpoint file holds a JSON list of finished URLs; it is rewritten
    after each success so at most one URL is re-scraped after a crash.
    Failures are logged and skipped, never fatal.

    Args:
        scraper: object exposing ``scrape(url) -> html``.
        urls: iterable of URLs to process.
        checkpoint_file: path of the JSON checkpoint.
    """
    # Local imports: the original snippet used os/json/logger without
    # importing or defining them, so it failed standalone.
    import json
    import logging
    import os

    log = loggingging = logging.getLogger(__name__)

    # Load checkpoint of already-completed URLs, if any.
    completed = set()
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file) as f:
            completed = set(json.load(f))

    remaining = [u for u in urls if u not in completed]
    log.info(f"Resuming: {len(remaining)} URLs remaining")

    for url in remaining:
        try:
            html = scraper.scrape(url)
            # Process html...
            completed.add(url)
            # Persist progress after every success.
            with open(checkpoint_file, "w") as f:
                json.dump(list(completed), f)
        except Exception as e:
            log.error(f"Failed: {url} - {e}")
            continue
FAQ
How do I handle different CAPTCHA types in one pipeline?
The AutomatedScraper class above detects the CAPTCHA type automatically and uses the correct CaptchaAI method. Add detection for each CAPTCHA type your target sites use.
What's the optimal number of concurrent workers?
Start with 3-5 workers. More workers mean more concurrent requests, which increases CAPTCHA frequency. Balance speed against CAPTCHA cost.
How do I monitor my scraping pipeline?
Track three metrics: pages scraped, CAPTCHAs solved, and errors. The stats dict in the scraper class provides this. For production, export to a monitoring system.
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In. No comments yet.