ParseHub is a visual scraping tool that struggles with CAPTCHAs. Here's how to use CaptchaAI to pre-solve CAPTCHAs and feed authenticated sessions into ParseHub.
The Challenge
ParseHub's visual selector can't interact with CAPTCHAs:
| Issue | Impact |
|---|---|
| reCAPTCHA blocks scraping | ParseHub can't click checkboxes |
| Login CAPTCHAs | Can't authenticate |
| Rate-limit CAPTCHAs | Extraction stops mid-run |
| Cloudflare challenges | Page never loads data |
Solution: Pre-Authentication Script
import requests
import time
import json
def pre_authenticate(api_key, login_url, sitekey, credentials):
    """
    Solve CAPTCHA, authenticate, and export cookies for ParseHub.

    Run this script before starting your ParseHub extraction.

    Args:
        api_key: CaptchaAI API key, forwarded to ``solve_captcha``.
        login_url: URL fetched first for initial cookies and then used as
            the target of the login POST; also passed as the CAPTCHA page URL.
        sitekey: reCAPTCHA site key, forwarded to ``solve_captcha``.
        credentials: Mapping of login form fields merged into the POST body.

    Returns:
        dict mapping cookie name to cookie value, read from the session
        after the login attempt. The cookies are returned even when the
        login looks like it failed; only a warning is printed in that case.
    """
    # The context manager closes the session (and its pooled connections)
    # once the cookies have been copied out.
    with requests.Session() as session:
        # Set realistic headers
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
        })

        # Visit login page for initial cookies
        print("1. Loading login page...")
        session.get(login_url, timeout=15)

        # Solve CAPTCHA
        print("2. Solving CAPTCHA...")
        token = solve_captcha(api_key, sitekey, login_url)
        print(f" Token received ({len(token)} chars)")

        # Submit login: the solved token travels in the standard
        # reCAPTCHA form field next to the caller's credentials.
        print("3. Submitting login form...")
        resp = session.post(login_url, data={
            **credentials,
            "g-recaptcha-response": token,
        }, timeout=30)

        # Heuristic only: a non-200 status or the word "error" anywhere in
        # the page is reported as a possible failure, not raised.
        if resp.status_code != 200 or "error" in resp.text.lower():
            print(f" Login may have failed (status: {resp.status_code})")
        else:
            print(" Login successful")

        # Export cookies as a plain dict before the session is closed
        cookies = {c.name: c.value for c in session.cookies}

    print(f"4. Exported {len(cookies)} cookies")
    return cookies
def solve_captcha(api_key, sitekey, pageurl,
                  initial_wait=15, poll_interval=5, max_polls=24):
    """Solve reCAPTCHA via CaptchaAI.

    Submits the task to ``in.php`` and then polls ``res.php`` until the
    token is ready.

    Args:
        api_key: CaptchaAI API key.
        sitekey: reCAPTCHA site key (sent as ``googlekey``).
        pageurl: URL of the page that shows the CAPTCHA.
        initial_wait: Seconds to wait after submitting before the first poll.
        poll_interval: Seconds to wait between two polls.
        max_polls: Maximum number of polls before giving up.

    Returns:
        The solved token taken from the ``request`` field of the response.

    Raises:
        RuntimeError: If the submit is rejected, or a poll returns anything
            other than a solution or the not-ready marker.
        TimeoutError: If no solution arrives within ``max_polls`` polls.
    """
    resp = requests.post("https://ocr.captchaai.com/in.php", data={
        "key": api_key,
        "method": "userrecaptcha",
        "googlekey": sitekey,
        "pageurl": pageurl,
        "json": 1,
    }, timeout=30)
    result = resp.json()
    if result.get("status") != 1:
        raise RuntimeError(f"Submit error: {result.get('request')}")
    task_id = result["request"]

    time.sleep(initial_wait)
    for attempt in range(max_polls):
        # Sleep only between polls, so a timeout is reported right after
        # the last unsuccessful poll instead of one interval later.
        if attempt:
            time.sleep(poll_interval)
        resp = requests.get("https://ocr.captchaai.com/res.php", params={
            "key": api_key, "action": "get",
            "id": task_id, "json": 1,
        }, timeout=15)
        data = resp.json()
        answer = data.get("request")
        if data.get("status") == 1:
            if answer is None:
                # Malformed success response: fail with a clear message
                # rather than handing None back as a token.
                raise RuntimeError(f"Unexpected response: {data!r}")
            return answer
        # "CAPCHA_NOT_READY" (sic) is the marker this code treats as
        # "keep polling"; any other value aborts the solve.
        if answer != "CAPCHA_NOT_READY":
            if answer is None:
                raise RuntimeError(f"Unexpected response: {data!r}")
            raise RuntimeError(answer)
    raise TimeoutError("Solve timeout")
# Run before ParseHub: log in once and capture the session cookies
cookies = pre_authenticate(
    credentials={"username": "user", "password": "pass"},
    sitekey="6Le-wvkSAAAAAPBMRTvw0Q4Muexq9bi0DJwx_mJ-",
    login_url="https://example.com/login",
    api_key="YOUR_API_KEY",
)

# Save for ParseHub: serialize first, then write the text in one call
serialized_cookies = json.dumps(cookies, indent=2)
with open("parsehub_cookies.json", "w") as cookie_file:
    cookie_file.write(serialized_cookies)

print("\nCookies saved to parsehub_cookies.json")
print("Import these cookies in ParseHub project settings")
Using ParseHub API with Pre-Solved Cookies
import requests
import json
def run_parsehub_with_cookies(project_token, api_key, cookies):
    """Start a ParseHub run, sending pre-authenticated cookies along.

    The cookie mapping is flattened into a single ``name=value; ...``
    string and posted together with the API key and start URL.

    Returns:
        The decoded JSON response on HTTP 200, otherwise ``None``.
    """
    endpoint = f"https://www.parsehub.com/api/v2/projects/{project_token}/run"
    payload = {
        "api_key": api_key,
        "start_url": "https://example.com/dashboard",
        # Cookie-header style: pairs joined with "; "
        "cookies": "; ".join(
            f"{name}={value}" for name, value in cookies.items()
        ),
    }
    response = requests.post(endpoint, data=payload, timeout=30)

    # Anything but 200 is reported and signalled with None
    if response.status_code != 200:
        print(f"Failed to start run: {response.status_code}")
        return None

    run_data = response.json()
    print(f"ParseHub run started: {run_data.get('run_token')}")
    return run_data
def get_parsehub_results(run_token, api_key):
    """Fetch the data of a ParseHub run and return the decoded JSON body."""
    data_url = f"https://www.parsehub.com/api/v2/runs/{run_token}/data"
    query = {"api_key": api_key, "format": "json"}
    response = requests.get(data_url, params=query, timeout=60)
    return response.json()
Scheduled Workflow
import schedule
import time
def captcha_then_parsehub():
    """Pre-authenticate through the CAPTCHA, then kick off a ParseHub run."""
    # Step 1: log in and collect fresh session cookies
    fresh_cookies = pre_authenticate(
        api_key="YOUR_CAPTCHAAI_KEY",
        login_url="https://example.com/login",
        sitekey="SITEKEY",
        credentials={"username": "user", "password": "pass"},
    )

    # Step 2: hand those cookies to a new ParseHub run
    run_info = run_parsehub_with_cookies(
        project_token="PARSEHUB_PROJECT_TOKEN",
        api_key="PARSEHUB_API_KEY",
        cookies=fresh_cookies,
    )

    # Nothing to report when the run did not start
    if not run_info:
        return
    print(f"Extraction started at {time.strftime('%H:%M')}")
# Run daily at 6 AM: build the daily trigger, then attach the job to it
daily_at_six = schedule.every().day.at("06:00")
daily_at_six.do(captcha_then_parsehub)

# Check for due jobs once a minute, forever
while True:
    schedule.run_pending()
    time.sleep(60)
FAQ
Can ParseHub solve CAPTCHAs on its own?
ParseHub has limited CAPTCHA handling for simple image CAPTCHAs. For reCAPTCHA, Turnstile, or other modern CAPTCHAs, use CaptchaAI to pre-solve the challenge and pass the resulting session cookies to ParseHub.
How long do pre-solved cookies last?
Session cookies typically last 30 minutes to 24 hours. Run the pre-authentication script before each ParseHub extraction for reliability.
Should I switch from ParseHub to a coded solution?
If most of your target sites have CAPTCHAs, a Python script with CaptchaAI gives more reliability. Use ParseHub for simple, low-CAPTCHA sites.
Related Guides
Handle CAPTCHAs in ParseHub — try CaptchaAI.
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In
No comments yet.