Production scraping pipelines need to handle CAPTCHAs automatically — no manual intervention. This guide shows how to build automated scrapers with CaptchaAI integrated for CAPTCHA solving, error recovery, and scheduling.
Architecture Overview
[Scheduler] → [URL Queue] → [Scraper Workers] → [CAPTCHA Solver] → [Data Store]
↕
[Proxy Rotator]
Each component:
- Scheduler: Triggers scraping jobs (cron, task queue)
- URL Queue: Manages URLs to scrape
- Scraper Workers: Fetch pages, detect CAPTCHAs
- CAPTCHA Solver: CaptchaAI API handles all CAPTCHA types
- Proxy Rotator: Distributes requests across IPs
Core Scraper with CAPTCHA Handling
import requests
import time
import logging
from bs4 import BeautifulSoup
# Configure the root logger once at import time so worker output is visible.
logging.basicConfig(level=logging.INFO)
# Module-level logger, standard PEP 282 convention.
logger = logging.getLogger(__name__)
# CaptchaAI API key — replace with your real key; keep it out of version control.
API_KEY = "YOUR_API_KEY"
class AutomatedScraper:
    """Fetch pages, detect CAPTCHAs, and solve them via the CaptchaAI HTTP API.

    Retries failed fetches with exponential backoff and keeps rough
    pages/captchas/errors counters in ``self.stats``.
    """

    # CaptchaAI 2captcha-compatible endpoints (submit task / poll result).
    SUBMIT_URL = "https://ocr.captchaai.com/in.php"
    RESULT_URL = "https://ocr.captchaai.com/res.php"

    def __init__(self, api_key, max_retries=3):
        """
        Args:
            api_key: CaptchaAI API key.
            max_retries: fetch attempts per URL before giving up.
        """
        self.api_key = api_key
        self.max_retries = max_retries
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })
        # NOTE: incremented from worker threads; fine for rough counters but
        # not strictly atomic — don't rely on exact values under concurrency.
        self.stats = {"pages": 0, "captchas": 0, "errors": 0}

    def scrape(self, url):
        """Fetch *url*, solving a CAPTCHA if one is detected.

        Returns the page HTML. Raises the last exception after
        ``max_retries`` failed attempts.
        """
        for attempt in range(self.max_retries):
            try:
                resp = self.session.get(url, timeout=30)
                if self._is_captcha(resp.text):
                    self.stats["captchas"] += 1
                    logger.info(f"CAPTCHA detected on {url}")
                    resp = self._solve_and_retry(resp.text, url)
                self.stats["pages"] += 1
                return resp.text
            except Exception as e:
                self.stats["errors"] += 1
                logger.error(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt == self.max_retries - 1:
                    raise
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...
        # Only reachable when max_retries <= 0; fail loudly instead of
        # silently returning None.
        raise RuntimeError(f"scrape() called with max_retries={self.max_retries}")

    def _is_captcha(self, html):
        """Heuristic: does the page contain a known CAPTCHA widget marker?

        NOTE: the bare "captcha" marker can false-positive on pages that
        merely mention the word.
        """
        return any(m in html.lower() for m in
                   ["g-recaptcha", "cf-turnstile", "h-captcha", "captcha"])

    def _solve_and_retry(self, html, url):
        """Identify the CAPTCHA widget in *html*, solve it, and resubmit.

        Raises if no recognized widget is found.
        """
        soup = BeautifulSoup(html, "html.parser")
        # reCAPTCHA v2 widget
        rc = soup.find("div", class_="g-recaptcha")
        if rc:
            token = self._solve("userrecaptcha", {
                "googlekey": rc["data-sitekey"],
                "pageurl": url
            })
            return self.session.post(url, data={"g-recaptcha-response": token})
        # Cloudflare Turnstile widget
        ts = soup.find("div", class_="cf-turnstile")
        if ts:
            token = self._solve("turnstile", {
                "sitekey": ts["data-sitekey"],
                "pageurl": url
            })
            return self.session.post(url, data={"cf-turnstile-response": token})
        raise Exception("Unrecognized CAPTCHA type")

    def _solve(self, method, params):
        """Submit a solve task to CaptchaAI and poll until a token is ready.

        Args:
            method: CaptchaAI method name (e.g. "userrecaptcha", "turnstile").
            params: method-specific parameters; not mutated.

        Returns the solved token. Raises on API errors, TimeoutError after
        ~5 minutes of polling.
        """
        # Copy so we don't mutate the caller's dict.
        payload = dict(params, key=self.api_key, method=method)
        # timeout= prevents a hung API call from stalling a worker forever.
        resp = requests.get(self.SUBMIT_URL, params=payload, timeout=30)
        if not resp.text.startswith("OK|"):
            raise Exception(f"Submit error: {resp.text}")
        task_id = resp.text.split("|")[1]
        for _ in range(60):  # poll every 5s for up to 5 minutes
            time.sleep(5)
            result = requests.get(self.RESULT_URL, params={
                "key": self.api_key, "action": "get", "id": task_id
            }, timeout=30)
            if result.text == "CAPCHA_NOT_READY":  # sic — actual API spelling
                continue
            if result.text.startswith("OK|"):
                return result.text.split("|")[1]
            raise Exception(f"Solve error: {result.text}")
        raise TimeoutError("Solve timed out")

    def get_stats(self):
        """Return the pages/captchas/errors counter dict."""
        return self.stats
Batch Processing with Queue
from queue import Empty, Queue
from threading import Thread
def worker(scraper, url_queue, results, delay=2):
    """Drain *url_queue*, scraping each URL and appending an outcome dict
    to *results* ({"url", "html"/"error", "status"}).

    Fixes the check-then-get race of ``while not q.empty(): q.get()``:
    with several workers, another thread can take the last item between
    the ``empty()`` check and ``get()``, leaving this worker blocked
    forever. ``get_nowait()`` + ``Empty`` drains the queue safely.

    Args:
        delay: politeness pause (seconds) between requests; default keeps
            the original 2-second behavior.
    """
    while True:
        try:
            url = url_queue.get_nowait()
        except Empty:
            break  # queue drained — worker exits cleanly
        try:
            html = scraper.scrape(url)
            results.append({"url": url, "html": html, "status": "success"})
        except Exception as e:
            results.append({"url": url, "error": str(e), "status": "failed"})
        finally:
            url_queue.task_done()
        time.sleep(delay)
def scrape_batch(urls, num_workers=3):
    """Scrape every URL in *urls* concurrently with a shared scraper.

    Spawns *num_workers* worker threads over a shared queue, waits for
    all of them, logs the scraper's counters, and returns the list of
    per-URL result dicts.
    """
    scraper = AutomatedScraper(API_KEY)
    pending = Queue()
    for target in urls:
        pending.put(target)

    collected = []
    pool = [
        Thread(target=worker, args=(scraper, pending, collected))
        for _ in range(num_workers)
    ]
    for thread in pool:
        thread.start()
    for thread in pool:
        thread.join()

    logger.info(f"Stats: {scraper.get_stats()}")
    return collected
Scheduling with Cron
Create a script that runs on a schedule:
# scheduled_scrape.py
import json
import sys
def run_scheduled_scrape():
    """Run one scheduled batch scrape and persist results to a JSON file.

    Writes ``results_<unix-timestamp>.json`` in the working directory and
    prints a success/failure summary for the cron log.
    """
    # Local import: the original standalone snippet only imported json/sys,
    # so `time` would raise NameError at runtime.
    import time

    urls = [
        "https://example.com/page/1",
        "https://example.com/page/2",
        "https://example.com/page/3",
    ]
    # NOTE(review): when this file runs standalone, scrape_batch must be
    # importable (e.g. `from scraper import scrape_batch`) — confirm.
    results = scrape_batch(urls)

    # Save results with a unique, timestamped filename.
    with open(f"results_{int(time.time())}.json", "w") as f:
        json.dump(results, f, indent=2)

    # Report stats for the cron log.
    success = sum(1 for r in results if r["status"] == "success")
    failed = sum(1 for r in results if r["status"] == "failed")
    print(f"Completed: {success} success, {failed} failed")


if __name__ == "__main__":
    run_scheduled_scrape()
Add to crontab:
0 */6 * * * cd /path/to/scraper && python scheduled_scrape.py
Error Recovery Patterns
def scrape_with_recovery(scraper, urls, checkpoint_file="checkpoint.json"):
    """Scrape *urls*, checkpointing completed ones so a crashed run resumes.

    The checkpoint file holds a JSON list of finished URLs; it is rewritten
    after each success so at most one URL is re-scraped after a crash.
    Failures are logged and skipped, never fatal.

    Args:
        scraper: object exposing ``scrape(url) -> html``.
        urls: iterable of URLs to process.
        checkpoint_file: path of the JSON checkpoint.
    """
    # Local imports: the original snippet used os/json/logger without
    # importing or defining them, so it failed standalone.
    import json
    import logging
    import os

    log = loggingging = logging.getLogger(__name__)

    # Load checkpoint of already-completed URLs, if any.
    completed = set()
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file) as f:
            completed = set(json.load(f))

    remaining = [u for u in urls if u not in completed]
    log.info(f"Resuming: {len(remaining)} URLs remaining")

    for url in remaining:
        try:
            html = scraper.scrape(url)
            # Process html...
            completed.add(url)
            # Persist progress after every success.
            with open(checkpoint_file, "w") as f:
                json.dump(list(completed), f)
        except Exception as e:
            log.error(f"Failed: {url} - {e}")
            continue
FAQ
How do I handle different CAPTCHA types in one pipeline?
The AutomatedScraper class above detects the CAPTCHA type automatically and uses the correct CaptchaAI method. Add detection for each CAPTCHA type your target sites use.
What's the optimal number of concurrent workers?
Start with 3-5 workers. More workers mean more concurrent requests, which increases CAPTCHA frequency. Balance speed against CAPTCHA cost.
How do I monitor my scraping pipeline?
Track three metrics: pages scraped, CAPTCHAs solved, and errors. The stats dict in the scraper class provides this. For production, export to a monitoring system.
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In. No comments yet.