API Tutorials

Building a Custom Scraping Framework with CaptchaAI

When existing frameworks don't fit, build your own. This guide creates a modular scraping framework with CaptchaAI built in.


Framework Architecture

┌─────────────┐
│  URL Queue   │
└──────┬──────┘
       │
┌──────▼──────┐
│  Downloader  │ → Fetch pages
└──────┬──────┘
       │
┌──────▼──────┐
│  Middleware  │ → CAPTCHA detection → CaptchaAI solve
└──────┬──────┘
       │
┌──────▼──────┐
│   Parser     │ → Extract data
└──────┬──────┘
       │
┌──────▼──────┐
│   Output     │ → Store results
└─────────────┘

Core Framework

import requests
import time
import logging
from collections import deque
from urllib.parse import urlparse
import re

logger = logging.getLogger(__name__)


class ScrapingFramework:
    """Sequential, modular scraping pipeline with optional CAPTCHA support.

    URLs are taken from a FIFO queue, downloaded through one shared
    ``requests.Session``, passed through the registered middlewares in
    registration order and finally handed to a parser callback.

    Attributes:
        queue: ``deque`` of URLs waiting to be processed.
        visited: Set of URLs that have already been taken from the queue.
        results: Flat list of everything the parser has returned so far.
        session: ``requests.Session`` used for every download.
        middlewares: Objects exposing ``process(url, response, framework)``.
        captcha_solver: ``CaptchaSolver`` instance, or ``None`` when no API
            key was supplied.
        delay: Seconds to pause between two consecutive URLs.
        max_retries: Maximum number of download attempts per URL.
        timeout: Per-request timeout in seconds.
    """

    def __init__(self, captchaai_key=None):
        """Create an empty pipeline.

        Args:
            captchaai_key: CaptchaAI API key. When omitted (or empty) no
                solver is created and ``captcha_solver`` stays ``None``.
        """
        self.queue = deque()
        self.visited = set()
        self.results = []
        self.session = requests.Session()
        self.middlewares = []
        self.captcha_solver = CaptchaSolver(captchaai_key) if captchaai_key else None

        # Defaults
        self.delay = 3.0
        self.max_retries = 3
        self.timeout = 30

    def add_middleware(self, middleware):
        """Append *middleware* to the end of the processing pipeline."""
        self.middlewares.append(middleware)

    def add_urls(self, urls):
        """Queue URLs that have not been visited yet.

        Args:
            urls: An iterable of URL strings. A single string is accepted
                too and treated as one URL (iterating it would otherwise
                queue every character separately).
        """
        if isinstance(urls, str):
            urls = [urls]
        for url in urls:
            if url not in self.visited:
                self.queue.append(url)

    def run(self, parser_func):
        """Process the queue until it is empty.

        Args:
            parser_func: Callable ``parser_func(url, response)``. A returned
                list is merged into ``results``; any other truthy value is
                appended as a single item; falsy values are ignored.

        Returns:
            The accumulated ``results`` list.
        """
        while self.queue:
            url = self.queue.popleft()
            if url in self.visited:
                continue

            self.visited.add(url)
            logger.info("Processing: %s", url)

            try:
                response = self._fetch(url)
                response = self._apply_middlewares(url, response)

                # Truthiness, not ``is not None``: a requests.Response is
                # falsy for 4xx/5xx statuses, so an error response handed
                # back by a middleware is skipped as well.
                if response:
                    data = parser_func(url, response)
                    if data:
                        self.results.extend(data if isinstance(data, list) else [data])

            except Exception as exc:
                # Best effort by design: one failing URL must not stop the run.
                logger.error("Failed %s: %s", url, exc)

            # Throttle only when another URL is waiting; pausing after the
            # final URL would just delay returning the results.
            if self.queue:
                time.sleep(self.delay)

        return self.results

    def _fetch(self, url):
        """Download *url*, retrying with exponential backoff (1s, 2s, ...).

        ``raise_for_status()`` turns every 4xx/5xx answer into an exception,
        so such responses are retried like network errors and never reach
        the middlewares.

        Returns:
            The successful response.

        Raises:
            requests.RequestException: When the last attempt fails.
        """
        # Always try at least once, even if max_retries was set to 0.
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            try:
                resp = self.session.get(url, timeout=self.timeout)
                resp.raise_for_status()
                return resp
            except requests.RequestException:
                if attempt == attempts - 1:
                    raise
                time.sleep(2 ** attempt)

    def _apply_middlewares(self, url, response):
        """Pass *response* through each middleware in registration order.

        A middleware that returns ``None`` drops the response: the remaining
        middlewares are skipped and ``None`` is returned.
        """
        for mw in self.middlewares:
            response = mw.process(url, response, self)
            if response is None:
                break
        return response

CAPTCHA Middleware

class CaptchaMiddleware:
    """Detect a CAPTCHA widget in the page HTML and re-submit with a token.

    The first pattern in ``CAPTCHA_PATTERNS`` that matches decides the
    CAPTCHA type. The framework's solver is asked for a token, and on
    success the token is POSTed back to the same URL.
    """

    # (regex, type) pairs, tried in order. A Turnstile widget carries a
    # ``data-sitekey`` attribute too, so its more specific pattern has to come
    # before the generic reCAPTCHA one. ``[^>]*?`` keeps the match inside one
    # tag while still allowing attributes on separate lines. A pattern without
    # a capture group yields no sitekey.
    CAPTCHA_PATTERNS = [
        (r'cf-turnstile[^>]*?data-sitekey="([^"]+)"', "turnstile"),
        (r'data-sitekey="([^"]+)"', "recaptcha"),
        (r"geetest_challenge", "geetest"),
    ]

    # Form field under which each widget type submits its token.
    TOKEN_FIELDS = {
        "recaptcha": "g-recaptcha-response",
        "turnstile": "cf-turnstile-response",
    }
    DEFAULT_TOKEN_FIELD = "g-recaptcha-response"

    def process(self, url, response, framework):
        """Solve a detected CAPTCHA and return the follow-up response.

        Args:
            url: URL the response was downloaded from.
            response: Object with a ``text`` attribute holding the HTML.
            framework: Provides ``captcha_solver``, ``session`` and
                ``timeout``.

        Returns:
            The response of the token POST when a CAPTCHA was found and
            solved; otherwise *response* unchanged (no solver configured,
            no CAPTCHA found, or solving failed).
        """
        if not framework.captcha_solver:
            return response

        captcha_type, sitekey = self._detect(response.text)
        if captcha_type is None:
            return response

        logger.info("CAPTCHA (%s) detected on %s", captcha_type, url)

        token = framework.captcha_solver.solve(captcha_type, sitekey, url)
        if not token:
            # Do not fall through to the remaining patterns: they would only
            # re-match the same widget and trigger another solve attempt.
            return response

        field = self.TOKEN_FIELDS.get(captcha_type, self.DEFAULT_TOKEN_FIELD)
        # Re-fetch with token
        return framework.session.post(
            url,
            data={field: token},
            timeout=getattr(framework, "timeout", 30),
        )

    def _detect(self, html):
        """Return ``(captcha_type, sitekey)`` for the first matching pattern.

        ``sitekey`` is ``None`` for patterns without a capture group; both
        values are ``None`` when nothing matches.
        """
        for pattern, captcha_type in self.CAPTCHA_PATTERNS:
            match = re.search(pattern, html)
            if match:
                sitekey = match.group(1) if match.re.groups else None
                return captcha_type, sitekey
        return None, None


class CaptchaSolver:
    """Client for the CaptchaAI submit/poll HTTP API.

    A task is submitted to ``in.php`` and its result is polled from
    ``res.php``. With the default timing a solve waits at most roughly
    10 + 24 * 5 = 130 seconds, not counting request time.
    """

    SUBMIT_URL = "https://ocr.captchaai.com/in.php"
    RESULT_URL = "https://ocr.captchaai.com/res.php"

    def __init__(self, api_key, initial_wait=10, poll_interval=5, max_polls=24):
        """Store the API key and the polling schedule.

        Args:
            api_key: CaptchaAI API key sent with every request.
            initial_wait: Seconds to wait after submitting before the
                first poll.
            poll_interval: Seconds to wait between two polls.
            max_polls: Maximum number of polls before giving up.
        """
        self.api_key = api_key
        self.initial_wait = initial_wait
        self.poll_interval = poll_interval
        self.max_polls = max_polls

    def solve(self, captcha_type, sitekey, pageurl):
        """Solve a CAPTCHA and return its token.

        Args:
            captcha_type: ``"recaptcha"`` or ``"turnstile"``.
            sitekey: Site key found on the page.
            pageurl: URL of the page that shows the CAPTCHA.

        Returns:
            The token string, or ``None`` on any failure: unsupported type,
            rejected submission, network/JSON error while submitting, an
            error reported while polling, or no result within ``max_polls``.
        """
        method_map = {
            "recaptcha": {"method": "userrecaptcha", "googlekey": sitekey},
            "turnstile": {"method": "turnstile", "sitekey": sitekey},
        }

        params = method_map.get(captcha_type)
        if not params:
            logger.warning("Unsupported CAPTCHA type: %s", captcha_type)
            return None

        payload = {
            "key": self.api_key,
            "pageurl": pageurl,
            "json": 1,
            **params,
        }

        # Failures are reported as None, like every other error path here,
        # instead of raising into the scraping loop.
        try:
            resp = requests.post(self.SUBMIT_URL, data=payload, timeout=30)
            result = resp.json()
        except (requests.RequestException, ValueError) as exc:
            logger.error("Submit failed: %s", exc)
            return None

        if result.get("status") != 1:
            logger.error("Submit error: %s", result.get("request"))
            return None

        task_id = result["request"]
        time.sleep(self.initial_wait)

        for _ in range(self.max_polls):
            try:
                resp = requests.get(self.RESULT_URL, params={
                    "key": self.api_key, "action": "get",
                    "id": task_id, "json": 1,
                }, timeout=15)
                answer = resp.json()
            except (requests.RequestException, ValueError) as exc:
                # The task is already submitted; a failed poll should not
                # throw it away, so try again on the next round.
                logger.warning("Poll failed, retrying: %s", exc)
                time.sleep(self.poll_interval)
                continue

            if answer.get("status") == 1:
                return answer["request"]
            # "CAPCHA_NOT_READY" (spelled like this) is the only status
            # treated as "keep polling"; anything else ends the solve.
            if answer.get("request") != "CAPCHA_NOT_READY":
                logger.error("Solve error: %s", answer.get("request"))
                return None
            time.sleep(self.poll_interval)

        logger.error("No result for task %s after %s polls", task_id, self.max_polls)
        return None

Additional Middlewares

class RateLimitMiddleware:
    """Wait and re-fetch when a response has status 429 (Too Many Requests)."""

    # Seconds to wait when Retry-After is missing or cannot be parsed.
    DEFAULT_WAIT = 60

    def process(self, url, response, framework):
        """Honour ``Retry-After`` on a 429 response, then fetch *url* again.

        Args:
            url: URL the response belongs to.
            response: Object with ``status_code`` and ``headers``.
            framework: Provides ``_fetch(url)`` for the second download.

        Returns:
            *response* unchanged unless its status is 429; otherwise whatever
            ``framework._fetch(url)`` returns after the wait.
        """
        if response.status_code != 429:
            return response

        retry_after = self._retry_after_seconds(response.headers.get("Retry-After"))
        logger.warning("Rate limited, waiting %ss", retry_after)
        time.sleep(retry_after)
        return framework._fetch(url)

    def _retry_after_seconds(self, value):
        """Convert a ``Retry-After`` header value to a non-negative int.

        The header may hold a number of seconds or an HTTP-date. A missing
        or unparseable value yields ``DEFAULT_WAIT``; negative numbers and
        dates in the past yield 0, because ``time.sleep`` rejects negatives.
        """
        import math
        from datetime import datetime, timezone
        from email.utils import parsedate_to_datetime

        if value is None:
            return self.DEFAULT_WAIT

        text = str(value).strip()
        try:
            return max(0, int(text))
        except ValueError:
            pass  # not a plain number of seconds; try the HTTP-date form

        try:
            when = parsedate_to_datetime(text)
        except (TypeError, ValueError):
            return self.DEFAULT_WAIT

        if when.tzinfo is None:
            # A date parsed without zone information is taken as UTC.
            when = when.replace(tzinfo=timezone.utc)
        remaining = (when - datetime.now(timezone.utc)).total_seconds()
        return max(0, math.ceil(remaining))


class UserAgentMiddleware:
    """Round-robin the session's ``User-Agent`` header over ``UAS``.

    The header is written while a response is being processed, so it
    applies to requests the session sends after this call.
    """

    UAS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
    ]

    def __init__(self):
        # Number of responses handled so far; selects the next UA string.
        self._index = 0

    def process(self, url, response, framework):
        """Install the next user agent on the session; return *response* as is."""
        slot = self._index % len(self.UAS)
        chosen = self.UAS[slot]
        framework.session.headers.update({"User-Agent": chosen})
        self._index = self._index + 1
        return response

Usage Example

import os
from bs4 import BeautifulSoup


def parse_product_page(url, response):
    """Extract product data from a listing page.

    Args:
        url: URL of the page; copied into every product record.
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        A list of ``{"name", "price", "url"}`` dicts, one per
        ``.product-card`` element that contains both a ``.name`` and a
        ``.price`` element. Cards missing either are skipped.
    """
    soup = BeautifulSoup(response.text, "html.parser")
    products = []

    for item in soup.select(".product-card"):
        name = item.select_one(".name")
        price = item.select_one(".price")
        # select_one() returns None when nothing matches; skipping the
        # incomplete card keeps one bad card from failing the whole page.
        if name is None or price is None:
            continue
        products.append({
            "name": name.get_text(strip=True),
            "price": price.get_text(strip=True),
            "url": url,
        })

    return products


# Build the framework; the API key is read from the environment
api_key = os.environ["CAPTCHAAI_API_KEY"]
framework = ScrapingFramework(captchaai_key=api_key)
framework.delay = 3.0

# Register middlewares in the order they should run
for middleware in (UserAgentMiddleware(), RateLimitMiddleware(), CaptchaMiddleware()):
    framework.add_middleware(middleware)

# Queue the listing pages
start_urls = [
    "https://example.com/products?page=1",
    "https://example.com/products?page=2",
    "https://example.com/products?page=3",
]
framework.add_urls(start_urls)

# Scrape everything and report the total
results = framework.run(parse_product_page)
print(f"Scraped {len(results)} products")

FAQ

Should I build a custom framework or use an existing one?

Use Scrapy or Crawlee for standard scraping. Build custom when you need specific CAPTCHA handling patterns, unusual workflow requirements, or tight integration with your existing systems.

How do I add concurrency?

Use concurrent.futures.ThreadPoolExecutor for parallel downloads. Ensure the CAPTCHA middleware is thread-safe by using a dedicated solver instance per thread.

Can I add database storage?

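Yes. Note that middlewares in this framework run before the parser, so they only see raw responses. To store parsed data, write the list returned by run() to your database, or add an output step that is called after the parser. The modular pipeline makes it easy to add any processing step.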



Build your custom framework — add CaptchaAI.

Discussions (0)

No comments yet.

Related Posts

Comparisons ScrapingBee vs Building with CaptchaAI: When to Use Which
Compare ScrapingBee's all-in-one scraping API with building your own solution using CaptchaAI.

Compare ScrapingBee's all-in-one scraping API with building your own solution using CaptchaAI. Cost, flexibi...

Python All CAPTCHA Types Web Scraping
Mar 16, 2026
Explainers Rate Limiting CAPTCHA Solving Workflows
Sending too many requests too fast triggers blocks, bans, and wasted CAPTCHA solves.

Sending too many requests too fast triggers blocks, bans, and wasted CAPTCHA solves. Smart rate limiting keeps...

Automation Python All CAPTCHA Types
Apr 04, 2026
Tutorials Dynamic CAPTCHA Loading: Detecting Lazy-Loaded CAPTCHAs
Detect and solve CAPTCHAs that load dynamically after user interaction — Mutation Observer, scroll triggers, and event-based rendering.

Detect and solve CAPTCHAs that load dynamically after user interaction — Mutation Observer, scroll triggers, a...

Python All CAPTCHA Types Web Scraping
Apr 03, 2026
Reference Complete Guide: CAPTCHA Solving from Basics to Production
End-to-end guide covering CAPTCHA fundamentals, solving approaches, API integration, error handling, scaling, and production deployment with Captcha AI.

End-to-end guide covering CAPTCHA fundamentals, solving approaches, API integration, error handling, scaling,...

Python All CAPTCHA Types Web Scraping
Jan 13, 2026
Explainers IP Reputation and CAPTCHA Solving: Best Practices
Manage IP reputation for CAPTCHA solving workflows.

Manage IP reputation for CAPTCHA solving workflows. Understand IP scoring, proxy rotation, and how IP quality...

Python All CAPTCHA Types Web Scraping
Mar 23, 2026
Explainers User-Agent Management for CAPTCHA Solving Workflows
Manage user-agent strings for CAPTCHA solving workflows.

Manage user-agent strings for CAPTCHA solving workflows. Avoid detection with proper UA rotation, consistency,...

Automation Python All CAPTCHA Types
Mar 09, 2026
DevOps & Scaling Ansible Playbooks for CaptchaAI Worker Deployment
Deploy and manage Captcha AI workers with Ansible — playbooks for provisioning, configuration, rolling updates, and health checks across your server fleet.

Deploy and manage Captcha AI workers with Ansible — playbooks for provisioning, configuration, rolling updates...

Automation Python All CAPTCHA Types
Apr 07, 2026
Troubleshooting Turnstile Token Invalid After Solving: Diagnosis and Fixes
Fix Cloudflare Turnstile tokens that come back invalid after solving with Captcha AI.

Fix Cloudflare Turnstile tokens that come back invalid after solving with Captcha AI. Covers token expiry, sit...

Python Cloudflare Turnstile Web Scraping
Apr 08, 2026
DevOps & Scaling Blue-Green Deployment for CAPTCHA Solving Infrastructure
Implement blue-green deployments for CAPTCHA solving infrastructure — zero-downtime upgrades, traffic switching, and rollback strategies with Captcha AI.

Implement blue-green deployments for CAPTCHA solving infrastructure — zero-downtime upgrades, traffic switchin...

Automation Python All CAPTCHA Types
Apr 07, 2026
Troubleshooting CaptchaAI API Error Handling: Complete Decision Tree
Complete decision tree for every Captcha AI API error.

Complete decision tree for every Captcha AI API error. Learn which errors are retryable, which need parameter...

Automation Python All CAPTCHA Types
Mar 17, 2026
API Tutorials How to Solve reCAPTCHA v2 Callback Using API
Learn how to solve reCAPTCHA v2 callback implementations using the CaptchaAI API.

Learn how to solve reCAPTCHA v2 callback implementations using the CaptchaAI API. Detect the callback function,...

Automation reCAPTCHA v2 Webhooks
Mar 01, 2026
API Tutorials Solve GeeTest v3 CAPTCHA with Python and CaptchaAI
Step-by-step Python tutorial for solving GeeTest v3 slide puzzle CAPTCHAs using the CaptchaAI API.

Step-by-step Python tutorial for solving GeeTest v3 slide puzzle CAPTCHAs using the CaptchaAI API. Includes...

Automation Python Testing
Mar 23, 2026
API Tutorials Case-Sensitive CAPTCHA API Parameter Guide
How to use the regsense parameter for case-sensitive CAPTCHA solving with Captcha AI.

How to use the regsense parameter for case-sensitive CAPTCHA solving with Captcha AI. Covers when to use, comm...

Python Web Scraping Image OCR
Apr 09, 2026