Building a Custom Scraping Framework with CaptchaAI

When existing frameworks don't fit, build your own. This guide creates a modular scraping framework with CaptchaAI built in.

Framework Architecture

┌──────────────┐
│  URL Queue   │
└──────┬───────┘
       │
┌──────▼───────┐
│  Downloader  │ → Fetch pages
└──────┬───────┘
       │
┌──────▼───────┐
│  Middleware  │ → CAPTCHA detection → CaptchaAI solve
└──────┬───────┘
       │
┌──────▼───────┐
│   Parser     │ → Extract data
└──────┬───────┘
       │
┌──────▼───────┐
│   Output     │ → Store results
└──────────────┘

Core Framework

import requests
import time
import logging
from collections import deque
from urllib.parse import urlparse
import re

# Module-level logger used by the framework, middleware, and solver classes in this file.
logger = logging.getLogger(__name__)


class ScrapingFramework:
    """Modular scraping framework with CAPTCHA support.

    URLs are taken from a FIFO queue, fetched through one shared
    ``requests.Session``, passed through the registered middlewares in
    registration order, and finally handed to a caller-supplied parser.

    Attributes:
        queue: Pending URLs (``collections.deque``).
        visited: URLs that have already been dequeued for processing.
        results: Records collected from the parser across all URLs.
        session: ``requests.Session`` used for every page fetch.
        middlewares: Objects exposing ``process(url, response, framework)``.
        captcha_solver: ``CaptchaSolver`` instance, or ``None`` when no API
            key was supplied.
        delay: Seconds to pause between consecutive URLs.
        max_retries: Total number of fetch attempts per URL.
        timeout: Per-request timeout in seconds.
    """

    def __init__(self, captchaai_key=None):
        """Create an empty pipeline.

        Args:
            captchaai_key: CaptchaAI API key. When falsy, no solver is
                created and ``captcha_solver`` is ``None``.
        """
        self.queue = deque()
        self.visited = set()
        self.results = []
        self.session = requests.Session()
        self.middlewares = []
        self.captcha_solver = CaptchaSolver(captchaai_key) if captchaai_key else None

        # Defaults
        self.delay = 3.0
        self.max_retries = 3
        self.timeout = 30

    def add_middleware(self, middleware):
        """Append a middleware; middlewares run in the order they are added."""
        self.middlewares.append(middleware)

    def add_urls(self, urls):
        """Add URLs to the scraping queue.

        Args:
            urls: An iterable of URL strings, or a single URL string.
                URLs already in ``visited`` are ignored.
        """
        if isinstance(urls, str):
            # A bare string would otherwise be iterated character by character.
            urls = [urls]
        for url in urls:
            if url not in self.visited:
                self.queue.append(url)

    def run(self, parser_func):
        """Execute the scraping pipeline until the queue is empty.

        Args:
            parser_func: Callable ``parser_func(url, response)`` returning a
                record, a list of records, or a falsy value for "nothing".

        Returns:
            The accumulated ``results`` list.

        Any exception raised while fetching, running middlewares, or parsing
        a URL is logged and that URL is skipped; the loop continues.
        """
        while self.queue:
            url = self.queue.popleft()
            if url in self.visited:
                continue

            self.visited.add(url)
            logger.info("Processing: %s", url)

            try:
                response = self._apply_middlewares(url, self._fetch(url))

                # Falsy covers None and, for requests.Response objects, an
                # error status (Response truthiness follows ``ok``).
                if response:
                    data = parser_func(url, response)
                    if data:
                        self.results.extend(data if isinstance(data, list) else [data])

            except Exception as e:
                logger.error("Failed %s: %s", url, e)

            # Only pause when there is another URL to be polite towards.
            if self.queue:
                time.sleep(self.delay)

        return self.results

    def _fetch(self, url):
        """Fetch ``url`` with retries and exponential backoff.

        Makes up to ``max_retries`` attempts, sleeping ``2 ** attempt``
        seconds between them. Non-2xx statuses count as failures because
        ``raise_for_status()`` is called on every response.

        Returns:
            The successful response, or ``None`` when ``max_retries`` is
            less than 1 (no attempt is made).

        Raises:
            requests.RequestException: The error from the final attempt.
        """
        for attempt in range(self.max_retries):
            try:
                resp = self.session.get(url, timeout=self.timeout)
                resp.raise_for_status()
                return resp
            except requests.RequestException:
                if attempt == self.max_retries - 1:
                    raise
                time.sleep(2 ** attempt)
        return None

    def _apply_middlewares(self, url, response):
        """Run ``response`` through the middleware pipeline.

        The pipeline stops as soon as the response is ``None`` -- either
        because nothing was fetched or because a middleware dropped it -- so
        middlewares never receive ``None``.

        Returns:
            The response produced by the last middleware that ran, or
            ``None`` if the response was dropped.
        """
        for mw in self.middlewares:
            if response is None:
                break
            response = mw.process(url, response, self)
        return response

CAPTCHA Middleware

class CaptchaMiddleware:
    """Detect and solve CAPTCHAs automatically.

    The response HTML is scanned with ``CAPTCHA_PATTERNS``; the first pattern
    that matches decides the CAPTCHA type. If the framework's solver returns
    a token, the token is POSTed back to the same URL and that response
    replaces the original one.
    """

    # (regex, captcha type) pairs, checked in order. The Turnstile pattern
    # must come before the generic ``data-sitekey`` one: Turnstile widgets
    # also carry ``data-sitekey`` and would otherwise be taken for reCAPTCHA.
    CAPTCHA_PATTERNS = [
        (r'cf-turnstile.*?data-sitekey="([^"]+)"', "turnstile"),
        (r'data-sitekey="([^"]+)"', "recaptcha"),
        (r"geetest_challenge", "geetest"),
    ]

    # Form field used to submit the solved token for each CAPTCHA type.
    TOKEN_FIELDS = {
        "recaptcha": "g-recaptcha-response",
        "turnstile": "cf-turnstile-response",
    }

    def process(self, url, response, framework):
        """Check for a CAPTCHA and solve it if found.

        Args:
            url: URL the response was fetched from.
            response: Object with a ``text`` attribute holding the HTML.
            framework: Object providing ``captcha_solver`` and ``session``
                (and optionally ``timeout``).

        Returns:
            The response of the token POST when a CAPTCHA was detected and
            solved; otherwise the original ``response`` unchanged.
        """
        if not framework.captcha_solver:
            return response

        detected = self._detect(response.text)
        if detected is None:
            return response

        captcha_type, sitekey = detected
        logger.info(f"CAPTCHA ({captcha_type}) detected on {url}")

        # Only the first detected type is attempted: retrying the same
        # sitekey under a different type would submit a second, wrong task.
        token = framework.captcha_solver.solve(captcha_type, sitekey, url)
        if not token:
            return response

        # Re-submit the page with the solved token in the field that the
        # detected CAPTCHA type expects.
        field = self.TOKEN_FIELDS.get(captcha_type, "g-recaptcha-response")
        return framework.session.post(
            url,
            data={field: token},
            timeout=getattr(framework, "timeout", 30),
        )

    def _detect(self, html):
        """Return ``(captcha_type, sitekey)`` for the first matching pattern.

        ``sitekey`` is ``None`` for patterns without a capture group (the
        geetest marker). Returns ``None`` when no pattern matches.
        """
        for pattern, captcha_type in self.CAPTCHA_PATTERNS:
            match = re.search(pattern, html)
            if match:
                sitekey = match.group(1) if match.re.groups else None
                return captcha_type, sitekey
        return None


class CaptchaSolver:
    """CaptchaAI solver.

    Submits a task to the CaptchaAI ``in.php`` endpoint and polls
    ``res.php`` until a token is returned, an error is reported, or the
    polling budget is exhausted.
    """

    SUBMIT_URL = "https://ocr.captchaai.com/in.php"
    RESULT_URL = "https://ocr.captchaai.com/res.php"

    def __init__(self, api_key, initial_delay=10, poll_interval=5, max_polls=24):
        """Store the API key and polling schedule.

        Args:
            api_key: CaptchaAI API key sent with every request.
            initial_delay: Seconds to wait after submitting before the
                first poll.
            poll_interval: Seconds to wait between polls.
            max_polls: Maximum number of polls before giving up.
        """
        self.api_key = api_key
        self.initial_delay = initial_delay
        self.poll_interval = poll_interval
        self.max_polls = max_polls

    def solve(self, captcha_type, sitekey, pageurl):
        """Solve a CAPTCHA via CaptchaAI.

        Args:
            captcha_type: ``"recaptcha"`` or ``"turnstile"``.
            sitekey: Site key extracted from the page.
            pageurl: URL of the page showing the CAPTCHA.

        Returns:
            The solution token, or ``None`` when the type is unsupported,
            the submission is rejected, the service reports an error, or no
            solution arrives within ``max_polls`` polls.

        Network and JSON-decoding errors from ``requests`` are not caught
        here and propagate to the caller.
        """
        method_map = {
            "recaptcha": {"method": "userrecaptcha", "googlekey": sitekey},
            "turnstile": {"method": "turnstile", "sitekey": sitekey},
        }

        params = method_map.get(captcha_type)
        if not params:
            logger.warning(f"Unsupported CAPTCHA type: {captcha_type}")
            return None

        payload = {
            "key": self.api_key,
            "pageurl": pageurl,
            "json": 1,
            **params,
        }

        resp = requests.post(self.SUBMIT_URL, data=payload, timeout=30)
        result = resp.json()

        if result.get("status") != 1:
            logger.error(f"Submit error: {result.get('request')}")
            return None

        task_id = result["request"]
        time.sleep(self.initial_delay)

        for _ in range(self.max_polls):
            resp = requests.get(self.RESULT_URL, params={
                "key": self.api_key, "action": "get",
                "id": task_id, "json": 1,
            }, timeout=15)
            poll = resp.json()

            if poll.get("status") == 1:
                return poll["request"]

            # NOTE(review): the "CAPCHA" spelling is kept exactly as in the
            # original comparison; presumably it matches the API's literal
            # "not ready" marker -- confirm against the CaptchaAI docs.
            status_text = poll.get("request")
            if status_text != "CAPCHA_NOT_READY":
                logger.error("Solve error for task %s: %s", task_id, status_text)
                return None
            time.sleep(self.poll_interval)

        logger.error("Gave up on task %s after %s polls", task_id, self.max_polls)
        return None

Additional Middlewares

class RateLimitMiddleware:
    """Detect and handle rate limiting.

    On an HTTP 429 response, waits for the period announced in the
    ``Retry-After`` header and then fetches the URL again.

    NOTE(review): ``ScrapingFramework._fetch`` calls ``raise_for_status()``,
    so a 429 from the initial fetch is raised there and never reaches the
    middlewares. This class only sees 429 responses that arrive by another
    route (for example one returned by an earlier middleware).
    """

    # Seconds to wait when Retry-After is absent or cannot be parsed.
    DEFAULT_WAIT = 60

    def process(self, url, response, framework):
        """Return ``response`` unchanged unless it is a 429.

        For a 429, sleep for the ``Retry-After`` period and return the
        result of ``framework._fetch(url)``.
        """
        if response.status_code != 429:
            return response

        retry_after = self._retry_after_seconds(response.headers)
        logger.warning(f"Rate limited, waiting {retry_after}s")
        time.sleep(retry_after)
        return framework._fetch(url)

    @classmethod
    def _retry_after_seconds(cls, headers):
        """Convert a ``Retry-After`` header into a non-negative wait in seconds.

        Accepts both forms of the header: a number of seconds or an
        HTTP-date. Falls back to ``DEFAULT_WAIT`` when the header is missing
        or unparseable; negative values and dates in the past yield 0.
        """
        import math
        from datetime import datetime, timezone
        from email.utils import parsedate_to_datetime

        raw = headers.get("Retry-After")
        if raw is None:
            return cls.DEFAULT_WAIT

        try:
            return max(0, int(raw))
        except (TypeError, ValueError):
            pass  # Not a plain number of seconds; try the HTTP-date form.

        try:
            when = parsedate_to_datetime(str(raw))
        except (TypeError, ValueError):
            return cls.DEFAULT_WAIT

        if when.tzinfo is None:
            # A date without zone information is treated as UTC.
            when = when.replace(tzinfo=timezone.utc)
        remaining = (when - datetime.now(timezone.utc)).total_seconds()
        return max(0, math.ceil(remaining))


class UserAgentMiddleware:
    """Cycle the session's User-Agent header through a fixed list.

    Each call to ``process`` installs the next entry of ``UAS`` on the
    framework's session and wraps around at the end of the list. The
    response passed in has already been fetched, so the new header applies
    to later requests made through that session.
    """

    UAS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
    ]

    def __init__(self):
        # Count of process() calls so far; selects the next entry of UAS.
        self._index = 0

    def process(self, url, response, framework):
        """Set the next User-Agent on the session; return ``response`` as is."""
        slot = self._index % len(self.UAS)
        chosen_agent = self.UAS[slot]
        framework.session.headers["User-Agent"] = chosen_agent
        self._index = self._index + 1
        return response

Usage Example

import os
from bs4 import BeautifulSoup


def parse_product_page(url, response):
    """Extract product data from a listing page.

    Args:
        url: URL the page was fetched from; copied into every record.
        response: Object with a ``text`` attribute holding the page HTML.

    Returns:
        A list of dicts with ``name``, ``price`` and ``url`` keys, one per
        ``.product-card`` element that contains both a ``.name`` and a
        ``.price`` element. Cards missing either are skipped.
    """
    soup = BeautifulSoup(response.text, "html.parser")
    products = []

    for card in soup.select(".product-card"):
        name_el = card.select_one(".name")
        price_el = card.select_one(".price")
        if name_el is None or price_el is None:
            # select_one returns None when nothing matches; without this
            # guard one incomplete card would raise AttributeError and
            # discard every product on the page.
            continue

        products.append({
            "name": name_el.get_text(strip=True),
            "price": price_el.get_text(strip=True),
            "url": url,
        })

    return products


# Create the framework; the API key is read from the environment
framework = ScrapingFramework(captchaai_key=os.environ["CAPTCHAAI_API_KEY"])
framework.delay = 3.0

# Register middlewares in pipeline order: user agent, rate limit, CAPTCHA
for middleware in (UserAgentMiddleware(), RateLimitMiddleware(), CaptchaMiddleware()):
    framework.add_middleware(middleware)

# Queue listing pages 1 through 3
framework.add_urls(
    [f"https://example.com/products?page={page}" for page in range(1, 4)]
)

# Scrape everything and report how many products were collected
results = framework.run(parse_product_page)
print(f"Scraped {len(results)} products")

FAQ

Should I build a custom framework or use an existing one?

Use Scrapy or Crawlee for standard scraping. Build custom when you need specific CAPTCHA handling patterns, unusual workflow requirements, or tight integration with your existing systems.

How do I add concurrency?

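Use concurrent.futures.ThreadPoolExecutor for parallel downloads. The framework as written shares one requests.Session, the visited set, and the results list across all URLs without any locking, so give each worker thread its own session and its own solver instance, and guard the shared queue, visited set, and results list with a lock.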

Can I add database storage?

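Yes. In the code above, middlewares run on the response before the parser is called, so storage needs a small extension: add an output step (for example an OutputMiddleware with a save method) that run() calls with the parsed records after parsing, or simply persist the list returned by framework.run(). The pipeline structure makes it easy to add a processing step like this.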

Build your custom framework — add CaptchaAI.

Full Working Code

Complete runnable examples for this article in Python, Node.js, PHP, Go, Java, C#, Ruby, Rust, Kotlin & Bash.

View on GitHub →

Building a Custom Scraping Framework with CaptchaAI

Framework Architecture

Core Framework

CAPTCHA Middleware

Additional Middlewares

Usage Example

FAQ

Should I build a custom framework or use an existing one?

How do I add concurrency?

Can I add database storage?

Discussions (0)

Complete Guide: CAPTCHA Solving from Basics to Production

ScrapingBee vs Building with CaptchaAI: When to Use Which

Dynamic CAPTCHA Loading: Detecting Lazy-Loaded CAPTCHAs

Rate Limiting CAPTCHA Solving Workflows

User-Agent Management for CAPTCHA Solving Workflows

IP Reputation and CAPTCHA Solving: Best Practices

Framework Architecture

Core Framework

CAPTCHA Middleware

Additional Middlewares

Usage Example

FAQ

Should I build a custom framework or use an existing one?

How do I add concurrency?

Can I add database storage?

Related Guides

Discussions (0)

Join the conversation

Related Posts

Complete Guide: CAPTCHA Solving from Basics to Production

ScrapingBee vs Building with CaptchaAI: When to Use Which

Dynamic CAPTCHA Loading: Detecting Lazy-Loaded CAPTCHAs

Rate Limiting CAPTCHA Solving Workflows

User-Agent Management for CAPTCHA Solving Workflows

IP Reputation and CAPTCHA Solving: Best Practices