Scrapy's middleware system lets you intercept requests and responses. Here's how to build a downloader middleware that automatically solves CAPTCHAs with CaptchaAI.
Downloader Middleware
# middlewares.py
import logging
import re
import time
from urllib.parse import urlencode

import requests as http_requests
from scrapy import signals
from scrapy.http import HtmlResponse
logger = logging.getLogger(__name__)
class CaptchaAIMiddleware:
    """Scrapy downloader middleware for automatic CAPTCHA solving.

    Every HTML response is scanned for a known CAPTCHA widget. When one is
    found, the sitekey is sent to CaptchaAI and, once a token comes back, the
    original request is re-issued as a form POST carrying that token.

    NOTE: solving is synchronous (``requests`` + ``time.sleep``), so the
    crawl is paused while a solve is in progress.
    """

    # Order matters: a Turnstile widget also carries a ``data-sitekey``
    # attribute, so the specific pattern must be tried before the generic one.
    # ``[^>]*?`` keeps the Turnstile match inside a single tag (and, unlike
    # ``.``, also spans attributes that are split across lines).
    CAPTCHA_PATTERNS = [
        (r'cf-turnstile[^>]*?data-sitekey="([^"]+)"', "turnstile"),
        (r'data-sitekey="([^"]+)"', "recaptcha"),
    ]

    # Form field each provider expects the solved token to be posted in.
    TOKEN_FIELDS = {
        "recaptcha": "g-recaptcha-response",
        "turnstile": "cf-turnstile-response",
    }

    # Polling schedule for the CaptchaAI result endpoint (seconds / attempts).
    INITIAL_WAIT = 10
    POLL_INTERVAL = 5
    MAX_POLLS = 24

    def __init__(self, api_key, max_retries=2):
        """Store the CaptchaAI key and the per-request solve limit."""
        self.api_key = api_key
        self.max_retries = max_retries
        self.stats = {"detected": 0, "solved": 0, "failed": 0}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from crawler settings.

        Raises:
            ValueError: if ``CAPTCHAAI_API_KEY`` is missing or empty.
        """
        api_key = crawler.settings.get("CAPTCHAAI_API_KEY")
        if not api_key:
            raise ValueError("CAPTCHAAI_API_KEY setting is required")
        middleware = cls(
            api_key=api_key,
            max_retries=crawler.settings.getint("CAPTCHAAI_MAX_RETRIES", 2),
        )
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    @classmethod
    def _detect(cls, body):
        """Return ``(captcha_type, sitekey)`` for the first matching pattern, else ``None``."""
        for pattern, captcha_type in cls.CAPTCHA_PATTERNS:
            match = re.search(pattern, body)
            if match:
                return captcha_type, match.group(1)
        return None

    def process_response(self, request, response, spider):
        """Check response for CAPTCHA and solve if found.

        Returns the untouched ``response`` when it is not HTML, contains no
        CAPTCHA, the retry budget is exhausted, or solving failed. Otherwise
        returns a new POST request that carries the solved token.
        """
        if not isinstance(response, HtmlResponse):
            return response

        detected = self._detect(response.text)
        if detected is None:
            # Always hand a response back: Scrapy rejects a ``None`` result.
            return response

        captcha_type, sitekey = detected
        self.stats["detected"] += 1
        logger.info(
            f"CAPTCHA ({captcha_type}) on {response.url}, solving..."
        )

        retries = request.meta.get("captcha_retries", 0)
        if retries >= self.max_retries:
            self.stats["failed"] += 1
            logger.error(f"Max CAPTCHA retries on {response.url}")
            return response

        token = self._solve(captcha_type, sitekey, response.url)
        if not token:
            self.stats["failed"] += 1
            return response

        self.stats["solved"] += 1
        return self._build_retry_request(request, captcha_type, token, retries)

    def _build_retry_request(self, request, captcha_type, token, retries):
        """Clone ``request`` as a form POST that submits the solved token."""
        field = self.TOKEN_FIELDS.get(captcha_type, "g-recaptcha-response")
        # ``Request.body`` is read-only, so the clone has to be produced with
        # ``replace()`` rather than by assigning attributes on a copy.
        retry = request.replace(
            method="POST",
            body=urlencode({field: token}),
            dont_filter=True,  # same URL again: bypass the duplicate filter
            meta={
                **request.meta,
                "captcha_retries": retries + 1,
                "captcha_token": token,
            },
        )
        retry.headers[b"Content-Type"] = b"application/x-www-form-urlencoded"
        return retry

    def _solve(self, captcha_type, sitekey, pageurl):
        """Solve CAPTCHA via CaptchaAI.

        Submits the task, waits ``INITIAL_WAIT`` seconds, then polls up to
        ``MAX_POLLS`` times, ``POLL_INTERVAL`` seconds apart.

        Returns:
            The solved token, or ``None`` for an unknown ``captcha_type``, an
            API error, a timeout, or any exception raised along the way.
        """
        method_map = {
            "recaptcha": {"method": "userrecaptcha", "googlekey": sitekey},
            "turnstile": {"method": "turnstile", "sitekey": sitekey},
        }
        params = method_map.get(captcha_type)
        if not params:
            return None

        try:
            resp = http_requests.post("https://ocr.captchaai.com/in.php", data={
                "key": self.api_key,
                "pageurl": pageurl,
                "json": 1,
                **params,
            }, timeout=30)
            result = resp.json()
            if result.get("status") != 1:
                logger.error(f"Submit error: {result.get('request')}")
                return None
            task_id = result["request"]

            time.sleep(self.INITIAL_WAIT)
            for attempt in range(self.MAX_POLLS):
                if attempt:
                    # Sleep between polls only, never after the final one.
                    time.sleep(self.POLL_INTERVAL)
                resp = http_requests.get("https://ocr.captchaai.com/res.php", params={
                    "key": self.api_key, "action": "get",
                    "id": task_id, "json": 1,
                }, timeout=15)
                data = resp.json()
                if data.get("status") == 1:
                    return data["request"]
                if data.get("request") != "CAPCHA_NOT_READY":
                    logger.error(f"Solve error: {data.get('request')}")
                    return None
            logger.error(f"Solve timed out for task {task_id}")
        except Exception as e:
            # Deliberately broad: a solver failure must never crash the
            # crawl; the caller simply receives the original response.
            logger.error(f"Solve exception: {e}")
        return None

    def spider_closed(self, spider):
        """Log CAPTCHA statistics on spider close."""
        logger.info(
            f"CAPTCHA Stats — Detected: {self.stats['detected']}, "
            f"Solved: {self.stats['solved']}, "
            f"Failed: {self.stats['failed']}"
        )
Scrapy Settings
# settings.py
import os

# CaptchaAI configuration
# Read the key from the environment so it never lands in version control;
# the placeholder remains only as a fallback for local experiments.
CAPTCHAAI_API_KEY = os.environ.get("CAPTCHAAI_API_KEY", "YOUR_API_KEY")
CAPTCHAAI_MAX_RETRIES = 2

# Enable the middleware
# process_response() hooks run in DECREASING priority order, so the value
# must be below HttpCompressionMiddleware (590) for this middleware to see
# the decompressed HTML body instead of the raw compressed bytes.
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.CaptchaAIMiddleware": 585,
}

# Downloader timeout per request, in seconds
DOWNLOAD_TIMEOUT = 180

# Rate limiting
DOWNLOAD_DELAY = 3
RANDOMIZE_DOWNLOAD_DELAY = True
CONCURRENT_REQUESTS = 4
CONCURRENT_REQUESTS_PER_DOMAIN = 2
Spider Example
# spiders/product_spider.py
import scrapy
class ProductSpider(scrapy.Spider):
    """Spider that scrapes product cards and walks the listing pagination."""

    name = "products"
    start_urls = ["https://example.com/products"]

    def parse(self, response):
        """Yield one item per product card, then a request for the next page.

        No CAPTCHA logic lives here; this callback only extracts data.
        """
        cards = response.css("div.product-card")
        for card in cards:
            item = {
                "name": card.css(".name::text").get("").strip(),
                "price": card.css(".price::text").get("").strip(),
            }
            href = card.css("a::attr(href)").get("")
            item["url"] = response.urljoin(href)
            yield item

        # Pagination: stop quietly when there is no "next" link.
        next_href = response.css("a.next-page::attr(href)").get()
        if not next_href:
            return
        yield scrapy.Request(response.urljoin(next_href))

    def parse_product(self, response):
        """Yield title, description and price from a product detail page.

        NOTE(review): nothing in this spider schedules requests with this
        callback; it has to be wired up by the caller.
        """
        # ``captcha_token`` is present in meta when a CAPTCHA was solved
        # for this request.
        if response.meta.get("captcha_token"):
            self.logger.info(f"Page accessed after CAPTCHA solve: {response.url}")

        selectors = {
            "title": "h1::text",
            "description": ".description::text",
            "price": ".price::text",
        }
        yield {
            field: response.css(query).get("").strip()
            for field, query in selectors.items()
        }
Token-Passing Spider Middleware
For spiders that need the token in the parse method:
class CaptchaTokenSpiderMiddleware:
    """Spider middleware that hands solved CAPTCHA tokens on to follow-up requests."""

    def process_spider_input(self, response, spider):
        """Log when the incoming response carries a CAPTCHA token.

        Nothing is modified; ``None`` is always returned.
        """
        if response.meta.get("captcha_token"):
            spider.logger.debug(f"CAPTCHA token available for {response.url}")
        return None

    def process_spider_output(self, response, result, spider):
        """Re-yield every callback result, tagging requests with the parent token.

        Requests receive ``meta["parent_captcha_token"]`` unless they already
        define that key; items pass through untouched.
        """
        parent_token = response.meta.get("captcha_token")
        for entry in result:
            is_request = isinstance(entry, scrapy.Request)
            if is_request and parent_token:
                entry.meta.setdefault("parent_captcha_token", parent_token)
            yield entry
FAQ
Does the middleware block Scrapy's event loop?
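Yes. The synchronous HTTP calls to CaptchaAI's API, and the sleeps between polls, run inside process_response, so they block Scrapy's event loop and pause every other in-flight request until the solve finishes. For high-concurrency spiders, consider using scrapy-playwright with async CAPTCHA solving instead.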
Can I use this middleware with Scrapy-Splash?
Yes. The middleware intercepts responses regardless of how they were rendered. It works with Splash, Playwright, and standard HTTP responses.
How do I test the middleware?
Write a small fake_response helper that wraps sample HTML in a scrapy.http.HtmlResponse (Scrapy does not ship one) and use it in unit tests. Mock the CaptchaAI API responses to test CAPTCHA detection and retry logic without making real API calls.
Related Guides
Add CaptchaAI to Scrapy — get your API key.
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In
No comments yet.