Integrations

Scrapy Spider Middleware for CaptchaAI: Advanced Patterns

Scrapy's middleware system lets you intercept requests and responses. Here's how to build a downloader middleware that automatically solves CAPTCHAs with CaptchaAI.


Downloader Middleware

# middlewares.py
import logging
import re
import time
from urllib.parse import urlencode

import requests as http_requests
from scrapy import signals
from scrapy.http import HtmlResponse

logger = logging.getLogger(__name__)


class CaptchaAIMiddleware:
    """Scrapy downloader middleware for automatic CAPTCHA solving.

    Every HTML response is scanned for a known CAPTCHA widget. When one is
    found, the sitekey is sent to the CaptchaAI HTTP API and, once a token
    comes back, the original request is re-issued as a form POST carrying
    that token.

    Solving is synchronous: ``_solve`` uses blocking HTTP calls and
    ``time.sleep`` while it polls for the result.
    """

    # Order matters: Turnstile widgets also carry a ``data-sitekey``
    # attribute, so the specific Turnstile pattern has to be tried before
    # the generic one or Turnstile pages would be classified as reCAPTCHA.
    CAPTCHA_PATTERNS = [
        (r"cf-turnstile.*?data-sitekey=\"([^\"]+)\"", "turnstile"),
        (r'data-sitekey="([^"]+)"', "recaptcha"),
    ]

    # Form field each widget type expects the solved token to be posted in.
    TOKEN_FIELDS = {
        "recaptcha": "g-recaptcha-response",
        "turnstile": "cf-turnstile-response",
    }

    def __init__(self, api_key, max_retries=2):
        """Store the API key, the retry budget and zeroed statistics.

        Args:
            api_key: CaptchaAI API key sent with every API call.
            max_retries: How many solved re-requests a single request chain
                may consume before the CAPTCHA page is passed through.
        """
        self.api_key = api_key
        self.max_retries = max_retries
        self.stats = {"detected": 0, "solved": 0, "failed": 0}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from crawler settings.

        Reads ``CAPTCHAAI_API_KEY`` (required) and ``CAPTCHAAI_MAX_RETRIES``
        (default 2), and hooks ``spider_closed`` so statistics are logged
        when the spider finishes.

        Raises:
            ValueError: If ``CAPTCHAAI_API_KEY`` is missing or empty.
        """
        api_key = crawler.settings.get("CAPTCHAAI_API_KEY")
        if not api_key:
            raise ValueError("CAPTCHAAI_API_KEY setting is required")

        middleware = cls(
            api_key=api_key,
            max_retries=crawler.settings.getint("CAPTCHAAI_MAX_RETRIES", 2),
        )

        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def _detect(self, body):
        """Return ``(captcha_type, sitekey)`` for the first matching pattern.

        Patterns are tried in ``CAPTCHA_PATTERNS`` order; ``None`` is
        returned when no pattern matches ``body``.
        """
        for pattern, captcha_type in self.CAPTCHA_PATTERNS:
            match = re.search(pattern, body)
            if match:
                return captcha_type, match.group(1)
        return None

    def _build_form_body(self, captcha_type, token):
        """Return the URL-encoded form body that submits ``token``.

        Unknown CAPTCHA types fall back to the reCAPTCHA field name.
        """
        field = self.TOKEN_FIELDS.get(captcha_type, "g-recaptcha-response")
        return urlencode({field: token})

    def process_response(self, request, response, spider):
        """Check the response for a CAPTCHA and solve it if one is found.

        Returns:
            The unchanged ``response`` when it is not HTML, contains no
            CAPTCHA, the retry budget is exhausted, or solving failed;
            otherwise a new POST request carrying the solved token.
        """
        if not isinstance(response, HtmlResponse):
            return response

        detected = self._detect(response.text)
        if detected is None:
            return response

        captcha_type, sitekey = detected
        self.stats["detected"] += 1

        retries = request.meta.get("captcha_retries", 0)
        if retries >= self.max_retries:
            self.stats["failed"] += 1
            logger.error(f"Max CAPTCHA retries on {response.url}")
            return response

        logger.info(
            f"CAPTCHA ({captcha_type}) on {response.url}, solving..."
        )

        token = self._solve(captcha_type, sitekey, response.url)
        if not token:
            # One detection, one failure: do not fall through to another
            # pattern and pay for a second solve of the same page.
            self.stats["failed"] += 1
            return response

        self.stats["solved"] += 1
        # Request.body is read-only in Scrapy, so build the re-request with
        # replace() instead of assigning attributes on a copy.
        new_request = request.replace(
            method="POST",
            body=self._build_form_body(captcha_type, token),
            dont_filter=True,
        )
        new_request.meta["captcha_retries"] = retries + 1
        new_request.meta["captcha_token"] = token
        new_request.headers[b"Content-Type"] = b"application/x-www-form-urlencoded"
        return new_request

    def _solve(self, captcha_type, sitekey, pageurl):
        """Solve a CAPTCHA via CaptchaAI and return the token.

        Submits the task to ``in.php``, waits 10 seconds, then polls
        ``res.php`` up to 24 times with 5 seconds between attempts.

        Args:
            captcha_type: ``"recaptcha"`` or ``"turnstile"``.
            sitekey: Sitekey extracted from the page.
            pageurl: URL of the page that showed the CAPTCHA.

        Returns:
            The solved token, or ``None`` for an unsupported type, an API
            error, a polling timeout, or any exception raised on the way.
        """
        method_map = {
            "recaptcha": {"method": "userrecaptcha", "googlekey": sitekey},
            "turnstile": {"method": "turnstile", "sitekey": sitekey},
        }

        params = method_map.get(captcha_type)
        if not params:
            return None

        try:
            resp = http_requests.post("https://ocr.captchaai.com/in.php", data={
                "key": self.api_key,
                "pageurl": pageurl,
                "json": 1,
                **params,
            }, timeout=30)
            result = resp.json()

            if result.get("status") != 1:
                logger.error(f"Submit error: {result.get('request')}")
                return None

            task_id = result["request"]
            time.sleep(10)

            for _ in range(24):
                resp = http_requests.get("https://ocr.captchaai.com/res.php", params={
                    "key": self.api_key, "action": "get",
                    "id": task_id, "json": 1,
                }, timeout=15)
                data = resp.json()

                if data.get("status") == 1:
                    return data["request"]
                # "CAPCHA_NOT_READY" is compared verbatim against the API
                # reply; anything else is treated as a terminal error.
                if data.get("request") != "CAPCHA_NOT_READY":
                    logger.error(f"Solve error: {data.get('request')}")
                    return None
                time.sleep(5)

            logger.error(f"Solve timed out for task {task_id}")

        except Exception as e:
            # Best effort: a solver failure must not crash the crawl.
            logger.error(f"Solve exception: {e}")

        return None

    def spider_closed(self, spider):
        """Log CAPTCHA statistics on spider close."""
        logger.info(
            f"CAPTCHA Stats — Detected: {self.stats['detected']}, "
            f"Solved: {self.stats['solved']}, "
            f"Failed: {self.stats['failed']}"
        )

Scrapy Settings

# settings.py
import os

# CaptchaAI configuration
# The key is read from the environment so it does not have to be committed;
# the placeholder is only used when CAPTCHAAI_API_KEY is not exported.
CAPTCHAAI_API_KEY = os.environ.get("CAPTCHAAI_API_KEY", "YOUR_API_KEY")
CAPTCHAAI_MAX_RETRIES = 2

# Enable the middleware
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.CaptchaAIMiddleware": 600,
}

# Increase timeouts for CAPTCHA solving
DOWNLOAD_TIMEOUT = 180

# Rate limiting
DOWNLOAD_DELAY = 3
RANDOMIZE_DOWNLOAD_DELAY = True
CONCURRENT_REQUESTS = 4
CONCURRENT_REQUESTS_PER_DOMAIN = 2

Spider Example

# spiders/product_spider.py
import scrapy


class ProductSpider(scrapy.Spider):
    """Crawl a product listing, its pagination and each product page."""

    name = "products"
    start_urls = ["https://example.com/products"]

    def parse(self, response):
        """Parse a product listing page.

        Yields one summary item per product card, a request for each
        product detail page (handled by ``parse_product``), and a request
        for the next listing page when a pagination link exists.
        """
        # The middleware handles CAPTCHAs automatically
        # This method only deals with parsing

        for product in response.css("div.product-card"):
            href = product.css("a::attr(href)").get()
            yield {
                "name": product.css(".name::text").get("").strip(),
                "price": product.css(".price::text").get("").strip(),
                # urljoin("") would return the listing URL itself, so a card
                # without a link reports an empty URL instead.
                "url": response.urljoin(href) if href else "",
            }
            if href:
                # Without this request parse_product is never scheduled.
                yield scrapy.Request(
                    response.urljoin(href), callback=self.parse_product
                )

        # Follow pagination
        next_page = response.css("a.next-page::attr(href)").get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page))

    def parse_product(self, response):
        """Parse an individual product page and yield its details."""
        # Access CAPTCHA token if middleware solved one
        token = response.meta.get("captcha_token")
        if token:
            self.logger.info(f"Page accessed after CAPTCHA solve: {response.url}")

        yield {
            "title": response.css("h1::text").get("").strip(),
            "description": response.css(".description::text").get("").strip(),
            "price": response.css(".price::text").get("").strip(),
        }

Token-Passing Spider Middleware

For spiders that need the token in the parse method:

class CaptchaTokenSpiderMiddleware:
    """Pass CAPTCHA tokens to spider callbacks."""

    def process_spider_input(self, response, spider):
        """Log when the incoming response carries a CAPTCHA token.

        The response is not modified. Always returns ``None``.
        """
        token = response.meta.get("captcha_token")
        if token:
            spider.logger.debug(f"CAPTCHA token available for {response.url}")
        return None

    def process_spider_output(self, response, result, spider):
        """Forward the response's token to requests the callback produced.

        Every element of ``result`` is yielded in order. When the response
        has a ``captcha_token`` in its meta, each request gets it as
        ``parent_captcha_token`` unless that key is already set.
        """
        token = response.meta.get("captcha_token")
        if not token:
            # Nothing to forward: pass the callback output through untouched.
            yield from result
            return

        # Imported here so this snippet does not depend on a module-level
        # ``scrapy`` name being present wherever it is pasted.
        from scrapy import Request

        for item_or_request in result:
            if isinstance(item_or_request, Request):
                item_or_request.meta.setdefault("parent_captcha_token", token)
            yield item_or_request

FAQ

Does the middleware block Scrapy's event loop?

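Yes. The synchronous HTTP calls to CaptchaAI's API and the `time.sleep` waits between polls block Scrapy while a CAPTCHA is being solved, which can take up to roughly two minutes per solve. For high-concurrency spiders, consider using scrapy-playwright with async CAPTCHA solving instead.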

Can I use this middleware with Scrapy-Splash?

Yes. The middleware intercepts responses regardless of how they were rendered. It works with Splash, Playwright, and standard HTTP responses.

How do I test the middleware?

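Build fake responses in unit tests, for example with a small `fake_response` helper that wraps `scrapy.http.HtmlResponse` around a saved HTML fixture. Mock the CaptchaAI API responses to test CAPTCHA detection and retry logic without making real API calls.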



Add CaptchaAI to Scrapy — get your API key.

Discussions (0)

No comments yet.

Related Posts

Tutorials Handling Multiple CAPTCHAs on a Single Page
Learn how to detect and solve multiple CAPTCHAs on a single web page using CaptchaAI.

Learn how to detect and solve multiple CAPTCHAs on a single web page using Captcha AI. Covers multi-iframe ext...

Python Cloudflare Turnstile reCAPTCHA v2
Apr 09, 2026
Tutorials Extracting reCAPTCHA Parameters from Page Source
Extract reCAPTCHA parameters from any web page — sitekey, action, data-s, enterprise flag, and version — using regex, DOM queries, and network interception.

Extract all re CAPTCHA parameters from any web page — sitekey, action, data-s, enterprise flag, and version —...

Python reCAPTCHA v2 Web Scraping
Apr 07, 2026
Use Cases Academic Research Web Scraping with CAPTCHA Solving
How researchers can collect data from academic databases, journals, and citation sources protected by CAPTCHAs using Captcha AI.

How researchers can collect data from academic databases, journals, and citation sources protected by CAPTCHAs...

Python Cloudflare Turnstile reCAPTCHA v2
Apr 06, 2026
Use Cases Multi-Step Workflow Automation with CaptchaAI
Manage workflows across multiple accounts on CAPTCHA-protected platforms — login, action, and data collection at scale.

Manage workflows across multiple accounts on CAPTCHA-protected platforms — login, action, and data collection at sc...

Python Automation Cloudflare Turnstile
Apr 06, 2026
Explainers Mobile Proxies for CAPTCHA Solving: Higher Success Rates Explained
Why mobile proxies produce the lowest CAPTCHA trigger rates and how to use them with Captcha AI for maximum success.

Why mobile proxies produce the lowest CAPTCHA trigger rates and how to use them with Captcha AI for maximum su...

Python Cloudflare Turnstile reCAPTCHA v2
Apr 03, 2026
Troubleshooting CAPTCHA Appears After Login: Mid-Session CAPTCHA Handling
Handle CAPTCHAs that appear mid-session after login — triggered by suspicious activity, rate limits, or session age.

Handle CAPTCHAs that appear mid-session after login — triggered by suspicious activity, rate limits, or session age....

Python Cloudflare Turnstile reCAPTCHA v2
Apr 01, 2026
Troubleshooting ERROR_PROXY_NOT_AUTHORIZED: Proxy Authentication Fixes
Fix ERROR_PROXY_NOT_AUTHORIZED when using Captcha AI with proxies.

Fix ERROR_PROXY_NOT_AUTHORIZED when using Captcha AI with proxies. Diagnose proxy format, authentication, whit...

Python Cloudflare Turnstile reCAPTCHA v2
Mar 30, 2026
Troubleshooting CaptchaAI Proxy Connection Failures: Diagnosis and Fixes
Troubleshoot proxy connection failures when using Captcha AI.

Troubleshoot proxy connection failures when using Captcha AI. Fix timeout errors, authentication issues, and p...

Python reCAPTCHA v2 Web Scraping
Mar 27, 2026
Integrations Axios + CaptchaAI: Solve CAPTCHAs Without a Browser
Use Axios and CaptchaAI to solve reCAPTCHA, Turnstile, and image CAPTCHAs in Node.js without launching a browser.

Use Axios and Captcha AI to solve re CAPTCHA, Turnstile, and image CAPTCHAs in Node.js without launching a bro...

Automation All CAPTCHA Types
Apr 08, 2026
Integrations Scrapy + CaptchaAI Integration Guide
Integrate Captcha AI into Scrapy spiders to automatically solve CAPTCHAs during web crawling with middleware and signal handlers.

Integrate Captcha AI into Scrapy spiders to automatically solve CAPTCHAs during web crawling with middleware a...

Automation reCAPTCHA v2 Scrapy
Jan 27, 2026
Integrations Puppeteer Stealth + CaptchaAI: Reliable Browser Automation
Standard Puppeteer gets detected immediately by anti-bot systems.

Standard Puppeteer gets detected immediately by anti-bot systems. `puppeteer-extra-plugin-stealth` patches the...

Automation Cloudflare Turnstile reCAPTCHA v2
Apr 05, 2026