When existing frameworks don't fit, build your own. This guide creates a modular scraping framework with CaptchaAI built in.
Framework Architecture
┌─────────────┐
│ URL Queue │
└──────┬──────┘
│
┌──────▼──────┐
│ Downloader │ → Fetch pages
└──────┬──────┘
│
┌──────▼──────┐
│ Middleware │ → CAPTCHA detection → CaptchaAI solve
└──────┬──────┘
│
┌──────▼──────┐
│ Parser │ → Extract data
└──────┬──────┘
│
┌──────▼──────┐
│ Output │ → Store results
└─────────────┘
Core Framework
import requests
import time
import logging
from collections import deque
from urllib.parse import urlparse
import re
logger = logging.getLogger(__name__)
class ScrapingFramework:
    """Modular scraping framework with CAPTCHA support.

    URLs are taken from a FIFO queue, fetched through a shared
    ``requests.Session``, passed through the registered middlewares in
    registration order, and finally handed to a caller-supplied parser whose
    output is accumulated in ``results``.
    """

    def __init__(self, captchaai_key=None):
        """Create an empty framework.

        Args:
            captchaai_key: CaptchaAI API key. When falsy, no solver is
                created and ``captcha_solver`` stays ``None``.
        """
        self.queue = deque()
        self.visited = set()
        self.results = []
        self.session = requests.Session()
        self.middlewares = []
        self.captcha_solver = CaptchaSolver(captchaai_key) if captchaai_key else None
        # Defaults
        self.delay = 3.0  # seconds slept between processed URLs
        self.max_retries = 3  # fetch attempts per URL
        self.timeout = 30  # timeout (seconds) passed to each GET

    def add_middleware(self, middleware):
        """Add a processing middleware.

        Args:
            middleware: Object exposing ``process(url, response, framework)``
                that returns a response, or ``None`` to drop the page.
        """
        self.middlewares.append(middleware)

    def add_urls(self, urls):
        """Add URLs to the scraping queue.

        Args:
            urls: Iterable of URL strings, or a single URL string. URLs
                that were already visited are ignored.
        """
        if isinstance(urls, str):
            # Iterating a bare string would enqueue one entry per character.
            urls = [urls]
        for url in urls:
            if url not in self.visited:
                self.queue.append(url)

    def run(self, parser_func):
        """Execute the scraping pipeline until the queue is empty.

        Args:
            parser_func: Callable ``parser_func(url, response)`` returning a
                single item, a list of items, or a falsy value for "nothing".

        Returns:
            The accumulated ``results`` list.
        """
        while self.queue:
            url = self.queue.popleft()
            if url in self.visited:
                continue
            self.visited.add(url)
            logger.info(f"Processing: {url}")
            try:
                response = self._fetch(url)
                response = self._apply_middlewares(url, response)
                # NOTE(review): a requests Response is truthy only when its
                # status is OK, so error-status responses returned by a
                # middleware are skipped here as well as ``None``.
                if response:
                    data = parser_func(url, response)
                    if data:
                        self.results.extend(data if isinstance(data, list) else [data])
            except Exception as e:
                # Best-effort crawl: one failing URL must not stop the run.
                logger.error(f"Failed {url}: {e}")
            # Politeness delay between requests; pointless once nothing is left.
            if self.queue:
                time.sleep(self.delay)
        return self.results

    def _fetch(self, url):
        """Fetch URL with retries and exponential backoff (1s, 2s, 4s, ...).

        Returns:
            The response of the first successful attempt, or ``None`` when
            ``max_retries`` is zero or negative (no attempt is made).

        Raises:
            requests.RequestException: If the final attempt fails. HTTP
                error statuses count as failures because of
                ``raise_for_status()``, so middlewares never receive a 4xx
                or 5xx response from this method.
        """
        for attempt in range(self.max_retries):
            try:
                resp = self.session.get(url, timeout=self.timeout)
                resp.raise_for_status()
                return resp
            except requests.RequestException:
                if attempt == self.max_retries - 1:
                    raise
                time.sleep(2 ** attempt)
        return None

    def _apply_middlewares(self, url, response):
        """Run response through middleware pipeline.

        The pipeline stops as soon as the response is ``None``, either
        because nothing was fetched or because a middleware dropped it.

        Returns:
            The response produced by the last middleware that ran, or
            ``None`` if the page was dropped.
        """
        for mw in self.middlewares:
            if response is None:
                break
            response = mw.process(url, response, self)
        return response
CAPTCHA Middleware
class CaptchaMiddleware:
    """Detect and solve CAPTCHAs automatically."""

    # Patterns are tried in order. The Turnstile pattern must come before the
    # generic ``data-sitekey`` one: both match a Turnstile widget, and the
    # generic pattern would otherwise always win and report "recaptcha".
    CAPTCHA_PATTERNS = [
        (r'cf-turnstile.*?data-sitekey="([^"]+)"', "turnstile"),
        (r'data-sitekey="([^"]+)"', "recaptcha"),
        (r"geetest_challenge", "geetest"),
    ]

    # Form field in which each CAPTCHA type expects the solved token.
    TOKEN_FIELDS = {
        "recaptcha": "g-recaptcha-response",
        "turnstile": "cf-turnstile-response",
    }

    def _detect(self, html):
        """Return ``(captcha_type, sitekey)`` for the first matching pattern.

        ``sitekey`` is ``None`` for patterns without a capture group.
        Returns ``None`` when no pattern matches.
        """
        for pattern, captcha_type in self.CAPTCHA_PATTERNS:
            match = re.search(pattern, html)
            if match:
                # Not every pattern captures a sitekey (e.g. geetest);
                # group(1) would raise IndexError for those.
                sitekey = match.group(1) if match.re.groups else None
                return captcha_type, sitekey
        return None

    def process(self, url, response, framework):
        """Check for CAPTCHA and solve if found.

        Args:
            url: URL the response was fetched from.
            response: Response object exposing ``.text``.
            framework: Framework providing ``captcha_solver`` and ``session``.

        Returns:
            The response to the token POST when a CAPTCHA was detected and
            solved; otherwise the original ``response`` unchanged.
        """
        if not framework.captcha_solver:
            return response
        detected = self._detect(response.text)
        if detected is None:
            return response
        captcha_type, sitekey = detected
        logger.info(f"CAPTCHA ({captcha_type}) detected on {url}")
        token = framework.captcha_solver.solve(captcha_type, sitekey, url)
        if not token:
            # Solving failed or the type is unsupported; pass the page on.
            return response
        # Re-fetch with token, using the field name that matches the widget.
        field = self.TOKEN_FIELDS.get(captcha_type, "g-recaptcha-response")
        return framework.session.post(
            url,
            data={field: token},
            timeout=getattr(framework, "timeout", 30),
        )
class CaptchaSolver:
    """CaptchaAI solver.

    Submits a task to the CaptchaAI ``in.php`` endpoint and polls
    ``res.php`` until a token is returned, an error is reported, or the
    polling budget is used up.
    """

    SUBMIT_URL = "https://ocr.captchaai.com/in.php"
    RESULT_URL = "https://ocr.captchaai.com/res.php"

    def __init__(self, api_key, *, initial_wait=10, poll_interval=5, max_polls=24):
        """Store the API key and polling schedule.

        Args:
            api_key: CaptchaAI API key sent with every request.
            initial_wait: Seconds to wait after submitting before the first poll.
            poll_interval: Seconds to wait between polls.
            max_polls: Maximum number of polls before giving up.
        """
        self.api_key = api_key
        self.initial_wait = initial_wait
        self.poll_interval = poll_interval
        self.max_polls = max_polls

    def solve(self, captcha_type, sitekey, pageurl):
        """Solve CAPTCHA via CaptchaAI.

        Args:
            captcha_type: ``"recaptcha"`` or ``"turnstile"``.
            sitekey: Site key extracted from the page.
            pageurl: URL of the page showing the CAPTCHA.

        Returns:
            The solved token, or ``None`` when the type is unsupported, the
            submission is rejected, the service reports an error, or no
            result arrives within ``max_polls`` polls.
        """
        method_map = {
            "recaptcha": {"method": "userrecaptcha", "googlekey": sitekey},
            "turnstile": {"method": "turnstile", "sitekey": sitekey},
        }
        params = method_map.get(captcha_type)
        if not params:
            logger.warning(f"Unsupported CAPTCHA type: {captcha_type}")
            return None

        payload = {
            "key": self.api_key,
            "pageurl": pageurl,
            "json": 1,
            **params,
        }
        resp = requests.post(self.SUBMIT_URL, data=payload, timeout=30)
        submitted = resp.json()
        if submitted.get("status") != 1:
            logger.error(f"Submit error: {submitted.get('request')}")
            return None
        task_id = submitted["request"]

        time.sleep(self.initial_wait)
        for _ in range(self.max_polls):
            resp = requests.get(self.RESULT_URL, params={
                "key": self.api_key, "action": "get",
                "id": task_id, "json": 1,
            }, timeout=15)
            polled = resp.json()
            if polled.get("status") == 1:
                return polled["request"]
            # "CAPCHA_NOT_READY" is the literal value compared against the
            # service reply; any other reply ends the polling.
            if polled.get("request") != "CAPCHA_NOT_READY":
                logger.error(f"Solve error: {polled.get('request')}")
                return None
            time.sleep(self.poll_interval)
        logger.error(f"Solve timed out after {self.max_polls} polls")
        return None
Additional Middlewares
class RateLimitMiddleware:
    """Detect and handle rate limiting."""

    # Seconds to wait when the server gives no usable Retry-After value.
    DEFAULT_WAIT = 60

    @staticmethod
    def _parse_retry_after(value, default=60):
        """Convert a ``Retry-After`` header value into seconds to wait.

        The header may carry either a number of seconds or an HTTP-date.

        Args:
            value: Raw header value, or ``None`` when the header is absent.
            default: Seconds returned when the value is missing or unparsable.

        Returns:
            A non-negative integer number of seconds.
        """
        if value is None:
            return default
        text = str(value).strip()
        try:
            seconds = int(text)
        except ValueError:
            # Not a plain number: try the HTTP-date form.
            import math
            from datetime import datetime, timezone
            from email.utils import parsedate_to_datetime

            try:
                when = parsedate_to_datetime(text)
            except (TypeError, ValueError):
                return default
            if when.tzinfo is None:
                when = when.replace(tzinfo=timezone.utc)
            remaining = (when - datetime.now(timezone.utc)).total_seconds()
            seconds = math.ceil(remaining)
        # A past date or negative number means "retry now"; time.sleep
        # rejects negative values.
        return max(0, seconds)

    def process(self, url, response, framework):
        """Wait and re-fetch when the response has status 429.

        NOTE(review): ``ScrapingFramework._fetch`` calls
        ``raise_for_status()``, so responses it returns never carry status
        429, and the re-fetch below raises if the server answers 429 again.

        Returns:
            The re-fetched response for a 429, otherwise ``response`` unchanged.
        """
        if response.status_code != 429:
            return response
        retry_after = self._parse_retry_after(
            response.headers.get("Retry-After"), self.DEFAULT_WAIT
        )
        logger.warning(f"Rate limited, waiting {retry_after}s")
        time.sleep(retry_after)
        return framework._fetch(url)
class UserAgentMiddleware:
    """Cycle the session's User-Agent header through a fixed list.

    Each call to ``process`` installs the next entry of ``UAS`` on the
    framework's session, so the header applies to requests sent afterwards.
    """

    UAS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
    ]

    def __init__(self):
        # Count of process() calls so far; selects the next UAS entry.
        self._index = 0

    def process(self, url, response, framework):
        """Set the next User-Agent on the session and pass the response through."""
        pool = self.UAS
        chosen = pool[self._index % len(pool)]
        framework.session.headers.update({"User-Agent": chosen})
        self._index += 1
        return response
Usage Example
import os
from bs4 import BeautifulSoup
def parse_product_page(url, response):
    """Extract product data from page.

    Args:
        url: URL the page was fetched from; copied into every record.
        response: Response object exposing the page HTML as ``.text``.

    Returns:
        A list with one dict per ``.product-card`` element, holding the keys
        ``name``, ``price`` and ``url``. ``name`` or ``price`` is ``None``
        when the card has no matching element.
    """

    def text_or_none(node):
        # select_one() yields None when nothing matches; calling get_text()
        # on it would abort parsing of the whole page.
        return node.get_text(strip=True) if node is not None else None

    soup = BeautifulSoup(response.text, "html.parser")
    return [
        {
            "name": text_or_none(item.select_one(".name")),
            "price": text_or_none(item.select_one(".price")),
            "url": url,
        }
        for item in soup.select(".product-card")
    ]
# Create the framework; the CaptchaAI key is read from the environment.
framework = ScrapingFramework(captchaai_key=os.environ["CAPTCHAAI_API_KEY"])
framework.delay = 3.0

# Register the middlewares in the order they should process each response.
for middleware in (UserAgentMiddleware(), RateLimitMiddleware(), CaptchaMiddleware()):
    framework.add_middleware(middleware)

# Queue the product listing pages.
start_urls = [
    "https://example.com/products?page=1",
    "https://example.com/products?page=2",
    "https://example.com/products?page=3",
]
framework.add_urls(start_urls)

# Crawl every queued page and report how many products were collected.
results = framework.run(parse_product_page)
print(f"Scraped {len(results)} products")
FAQ
Should I build a custom framework or use an existing one?
Use Scrapy or Crawlee for standard scraping. Build custom when you need specific CAPTCHA handling patterns, unusual workflow requirements, or tight integration with your existing systems.
How do I add concurrency?
Use concurrent.futures.ThreadPoolExecutor for parallel downloads. Ensure the CAPTCHA middleware is thread-safe by using a dedicated solver instance per thread.
Can I add database storage?
Yes, but note that middlewares run on the raw response before the parser, so they never see parsed items. To store results, write to your database inside your parser function, or save the list returned by framework.run() once the run completes.
Related Guides
Build your custom framework — add CaptchaAI.
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In
No comments yet.