Beautiful Soup parses HTML. CaptchaAI solves CAPTCHAs. Together with requests, they form the fastest scraping stack for CAPTCHA-protected pages — no browser required.
This approach works when the site serves HTML directly (server-side rendered). For JavaScript-heavy SPAs, use Selenium or Playwright instead.
Prerequisites
pip install beautifulsoup4 requests lxml
The workflow
- Fetch the page HTML with requests
- Parse with Beautiful Soup to extract CAPTCHA parameters
- Send parameters to CaptchaAI to solve
- Submit the form with the CAPTCHA token via requests
- Parse the result page with Beautiful Soup
Extracting reCAPTCHA sitekeys with Beautiful Soup
import requests
from bs4 import BeautifulSoup
def extract_recaptcha_sitekey(url):
    """Extract a reCAPTCHA v2 sitekey from a page's HTML, or None if absent."""
    import re

    html = requests.get(url, timeout=30).text
    doc = BeautifulSoup(html, "lxml")

    # Method 1: the standard g-recaptcha container div
    container = doc.find("div", class_="g-recaptcha")
    if container and container.get("data-sitekey"):
        return container["data-sitekey"]

    # Method 2: a data-sitekey attribute on any element
    tagged = doc.find(attrs={"data-sitekey": True})
    if tagged:
        return tagged["data-sitekey"]

    # Method 3: the render= parameter in a script src (sitekeys are 40 chars)
    pattern = re.compile(r"render=([A-Za-z0-9_-]{40})")
    for script in doc.find_all("script", src=True):
        found = pattern.search(script["src"])
        if found:
            return found.group(1)
    return None
sitekey = extract_recaptcha_sitekey("https://example.com/login")
print(f"Sitekey: {sitekey}")
Extracting Turnstile sitekeys
def extract_turnstile_sitekey(url):
    """Extract a Cloudflare Turnstile sitekey from page HTML.

    Tries, in order: the standard ``cf-turnstile`` container div, any element
    with a ``data-sitekey`` whose value carries the Turnstile ``0x`` prefix,
    and inline ``sitekey: "..."`` script configuration.

    Returns:
        The sitekey string, or None when nothing is found.
    """
    import re

    resp = requests.get(url, timeout=30)
    soup = BeautifulSoup(resp.text, "lxml")

    # Method 1: Turnstile div
    turnstile_div = soup.find("div", class_="cf-turnstile")
    if turnstile_div and turnstile_div.get("data-sitekey"):
        return turnstile_div["data-sitekey"]

    # Method 2: any element with a Turnstile-looking sitekey
    # (the "0x" prefix filters out reCAPTCHA keys, which start with "6L")
    element = soup.find(attrs={"data-sitekey": True})
    if element:
        sitekey = element["data-sitekey"]
        if sitekey.startswith("0x"):
            return sitekey

    # Method 3: inline script config. The original pattern [0-9x][A-Za-z0-9_-]+
    # matched ANY key starting with a digit — including reCAPTCHA "6L..." keys
    # defined inline — so anchor on the literal Turnstile "0x" prefix instead.
    for script in soup.find_all("script"):
        if script.string:
            match = re.search(r"sitekey\s*:\s*['\"](0x[A-Za-z0-9_-]+)['\"]", script.string)
            if match:
                return match.group(1)
    return None
Extracting form fields
Always extract hidden form fields — they often contain CSRF tokens and other parameters the server expects:
def extract_form_data(soup, form_selector="form"):
    """Extract submittable field names and values from the first matching form.

    Collects hidden inputs (CSRF tokens, etc.) plus text-like inputs.
    Inputs with NO type attribute are included too: browsers treat them as
    type="text" and submit them, so skipping them (as the original two-pass
    version did) can break form replay on some sites.

    Args:
        soup: BeautifulSoup document containing the form.
        form_selector: CSS selector locating the form (default: first <form>).

    Returns:
        dict mapping field name -> current value ("" when no value attribute);
        empty dict when no form matches the selector.
    """
    form = soup.select_one(form_selector)
    if not form:
        return {}
    # Input types whose current value should be replayed on submit;
    # a missing type attribute defaults to "text" in HTML.
    replayed_types = {"hidden", "text", "email", "password"}
    data = {}
    for inp in form.find_all("input"):
        name = inp.get("name")
        if not name:
            continue
        input_type = (inp.get("type") or "text").lower()
        if input_type in replayed_types:
            data[name] = inp.get("value", "")
    return data
Complete reCAPTCHA scraping flow
import time
import requests
from bs4 import BeautifulSoup
API_KEY = "YOUR_API_KEY"  # CaptchaAI account key used by solve_captcha()

def solve_captcha(method, poll_interval=5, max_attempts=30, **params):
    """Solve a CAPTCHA via CaptchaAI's two-step HTTP API.

    Submits the task to in.php, then polls res.php until the answer is ready.

    Args:
        method: CaptchaAI method name (e.g. "userrecaptcha", "base64").
        poll_interval: seconds between polls (default 5, same as before).
        max_attempts: number of polls before giving up (default 30).
        **params: method-specific parameters (googlekey, pageurl, body, ...).

    Returns:
        The solved token / answer string.

    Raises:
        Exception: on a submit error or a terminal solve error.
        TimeoutError: when no answer arrives within the polling budget.
    """
    submit = requests.post("https://ocr.captchaai.com/in.php", data={
        "key": API_KEY, "method": method, "json": 1, **params,
    }, timeout=30).json()
    if submit.get("status") != 1:
        raise Exception(f"Submit error: {submit.get('request')}")
    task_id = submit["request"]
    for _ in range(max_attempts):
        time.sleep(poll_interval)
        result = requests.get("https://ocr.captchaai.com/res.php", params={
            "key": API_KEY, "action": "get", "id": task_id, "json": 1,
        }, timeout=30).json()
        if result.get("status") == 1:
            return result["request"]
        # The original silently kept polling on terminal errors such as
        # ERROR_CAPTCHA_UNSOLVABLE, burning the whole 150 s budget.
        # Only the "not ready yet" reply should continue the loop.
        if result.get("request") != "CAPCHA_NOT_READY":
            raise Exception(f"Solve error: {result.get('request')}")
    raise TimeoutError("Solve timed out")
def scrape_protected_page(url, credentials=None):
    """Scrape a reCAPTCHA-protected page — no browser needed.

    Fetches the page, extracts the sitekey and hidden form fields, solves
    the CAPTCHA through CaptchaAI, submits the form, and returns the parsed
    result page together with the authenticated session.
    """
    sess = requests.Session()
    sess.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    })

    # Step 1: fetch and parse the login page
    page = BeautifulSoup(sess.get(url, timeout=30).text, "lxml")

    # Step 2: locate the sitekey (any element carrying data-sitekey)
    holder = page.find(attrs={"data-sitekey": True})
    sitekey = holder["data-sitekey"] if holder else None
    if not sitekey:
        raise ValueError("No CAPTCHA sitekey found")
    print(f"Sitekey: {sitekey}")

    # Step 3: pick up hidden fields such as CSRF tokens
    payload = extract_form_data(page)
    print(f"Form fields: {list(payload.keys())}")

    # Step 4: merge in login credentials, if any
    if credentials:
        payload.update(credentials)

    # Step 5: solve the CAPTCHA and attach the token
    payload["g-recaptcha-response"] = solve_captcha(
        "userrecaptcha", googlekey=sitekey, pageurl=url
    )

    # Step 6: work out where and how to submit the form
    form = page.find("form")
    target = form.get("action", url) if form else url
    if not target.startswith("http"):
        from urllib.parse import urljoin
        target = urljoin(url, target)
    verb = (form.get("method", "POST") if form else "POST").upper()
    if verb == "POST":
        reply = sess.post(target, data=payload, timeout=30)
    else:
        reply = sess.get(target, params=payload, timeout=30)

    # Step 7: hand back the parsed result plus the live session
    return BeautifulSoup(reply.text, "lxml"), sess
# Usage
# One call: fetch the login page, solve its CAPTCHA, and submit credentials.
result_soup, session = scrape_protected_page(
    "https://example.com/login",
    credentials={"username": "user@example.com", "password": "pass123"},
)
# Now use the authenticated session to scrape protected content
# (cookies set during login are replayed automatically by the Session).
dashboard = session.get("https://example.com/dashboard", timeout=30)
dashboard_soup = BeautifulSoup(dashboard.text, "lxml")
print(dashboard_soup.title.string)  # NOTE: raises AttributeError if the page has no <title>
Scraping search results behind CAPTCHA
def scrape_search_results(search_url, query):
    """Scrape search results from a CAPTCHA-protected search engine."""
    sess = requests.Session()
    sess.headers["User-Agent"] = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
    )

    # Run the search and parse whatever comes back
    reply = sess.get(search_url, params={"q": query}, timeout=30)
    doc = BeautifulSoup(reply.text, "lxml")

    # A data-sitekey attribute anywhere on the page means a CAPTCHA wall
    gate = doc.find(attrs={"data-sitekey": True})
    if gate:
        token = solve_captcha(
            "userrecaptcha", googlekey=gate["data-sitekey"], pageurl=reply.url
        )
        # Replay the form with the solved token and the original query
        payload = extract_form_data(doc)
        payload["g-recaptcha-response"] = token
        payload["q"] = query
        reply = sess.post(reply.url, data=payload, timeout=30)
        doc = BeautifulSoup(reply.text, "lxml")

    # Pull title / url / snippet out of the common result containers
    hits = []
    for item in doc.select(".result, .search-result, .g"):
        title_el = item.select_one("h3, .title")
        link_el = item.select_one("a")
        snippet_el = item.select_one(".snippet, .description, .st")
        if not (title_el and link_el):
            continue
        hits.append({
            "title": title_el.get_text(strip=True),
            "url": link_el.get("href", ""),
            "snippet": snippet_el.get_text(strip=True) if snippet_el else "",
        })
    return hits
Image CAPTCHA extraction with Beautiful Soup
import base64
from urllib.parse import urljoin
def solve_image_captcha_bs4(url, captcha_img_selector="img.captcha"):
    """Extract, solve, and submit an image CAPTCHA.

    Downloads (or decodes, for data: URIs) the CAPTCHA image, sends it to
    CaptchaAI as base64, fills the answer into the form's captcha field,
    and POSTs the form.

    Args:
        url: page containing the CAPTCHA form.
        captcha_img_selector: CSS selector for the CAPTCHA <img>.

    Returns:
        (result_soup, session) — the parsed response page and the live session.

    Raises:
        ValueError: when the image, its src, or the answer input is missing.
    """
    session = requests.Session()
    resp = session.get(url, timeout=30)
    soup = BeautifulSoup(resp.text, "lxml")

    # Find the CAPTCHA image element
    img = soup.select_one(captcha_img_selector)
    if not img:
        raise ValueError("CAPTCHA image not found")

    img_url = img.get("src", "")
    if not img_url:
        # An <img> with no src cannot be fetched — previously urljoin(url, "")
        # would silently download the page itself as the "image".
        raise ValueError("CAPTCHA image has no src attribute")

    if img_url.startswith("data:image"):
        # Inline base64 image: the payload is everything after the comma
        img_base64 = img_url.split(",", 1)[1]
    else:
        # Regular URL — resolve against the page and download it
        img_resp = session.get(urljoin(url, img_url), timeout=30)
        img_base64 = base64.b64encode(img_resp.content).decode()

    # Solve via CaptchaAI's base64 image endpoint
    answer = solve_captcha("base64", body=img_base64)
    print(f"CAPTCHA answer: {answer}")

    # Fill the answer into the form. The original silently submitted the form
    # WITHOUT the answer when no captcha input was found; fail loudly instead.
    form_data = extract_form_data(soup)
    captcha_input = soup.select_one("input[name*='captcha'], input[name*='code']")
    if not captcha_input:
        raise ValueError("CAPTCHA answer input field not found")
    form_data[captcha_input["name"]] = answer

    form = soup.find("form")
    action = urljoin(url, form.get("action", "")) if form else url
    result = session.post(action, data=form_data, timeout=30)
    return BeautifulSoup(result.text, "lxml"), session
Production scraper class
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
class ProtectedScraper:
    """Scrape CAPTCHA-protected pages without a browser.

    Wraps a requests.Session carrying a realistic User-Agent, and
    transparently solves reCAPTCHA challenges via CaptchaAI whenever a
    fetched page contains an element with a data-sitekey attribute.
    """

    def __init__(self, api_key):
        # api_key: CaptchaAI key used by _solve().
        self.api_key = api_key
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        })

    def get(self, url):
        """Fetch and parse a page, solving CAPTCHAs automatically."""
        resp = self.session.get(url, timeout=30)
        soup = BeautifulSoup(resp.text, "lxml")
        # A data-sitekey attribute anywhere on the page signals a CAPTCHA wall
        sitekey_el = soup.find(attrs={"data-sitekey": True})
        if sitekey_el:
            soup = self._handle_captcha(soup, resp.url, sitekey_el)
        return soup

    def login(self, url, credentials):
        """Log in through a CAPTCHA-protected form; returns the parsed response."""
        resp = self.session.get(url, timeout=30)
        soup = BeautifulSoup(resp.text, "lxml")
        form_data = self._extract_form(soup)
        form_data.update(credentials)
        sitekey_el = soup.find(attrs={"data-sitekey": True})
        if sitekey_el:
            form_data["g-recaptcha-response"] = self._solve(
                sitekey_el["data-sitekey"], url
            )
        return self._submit_form(soup, url, form_data)

    def _handle_captcha(self, soup, url, sitekey_el):
        # Solve the page's CAPTCHA and re-submit its form with the token.
        form_data = self._extract_form(soup)
        form_data["g-recaptcha-response"] = self._solve(
            sitekey_el["data-sitekey"], url
        )
        return self._submit_form(soup, url, form_data)

    def _submit_form(self, soup, url, form_data):
        # POST form_data to the form's action, resolved against the page URL.
        # (Shared by login() and _handle_captcha(), which previously
        # duplicated this logic.)
        form = soup.find("form")
        action = urljoin(url, form.get("action", "")) if form else url
        resp = self.session.post(action, data=form_data, timeout=30)
        return BeautifulSoup(resp.text, "lxml")

    def _extract_form(self, soup):
        # Hidden inputs carry CSRF tokens and other server-expected fields.
        data = {}
        for inp in soup.select("form input[type='hidden']"):
            if inp.get("name"):
                data[inp["name"]] = inp.get("value", "")
        return data

    def _solve(self, sitekey, url):
        """Solve a reCAPTCHA via CaptchaAI; returns the response token."""
        submit = requests.post("https://ocr.captchaai.com/in.php", data={
            "key": self.api_key, "method": "userrecaptcha",
            "googlekey": sitekey, "pageurl": url, "json": 1,
        }, timeout=30).json()
        if submit.get("status") != 1:
            raise Exception(f"Error: {submit.get('request')}")
        task_id = submit["request"]
        for _ in range(30):
            time.sleep(5)
            result = requests.get("https://ocr.captchaai.com/res.php", params={
                "key": self.api_key, "action": "get", "id": task_id, "json": 1,
            }, timeout=30).json()
            if result.get("status") == 1:
                return result["request"]
            # Anything other than "not ready yet" is terminal — stop polling
            # instead of silently burning the full 150 s budget (original bug).
            if result.get("request") != "CAPCHA_NOT_READY":
                raise Exception(f"Error: {result.get('request')}")
        raise TimeoutError("Solve timed out")
# Usage
scraper = ProtectedScraper("YOUR_API_KEY")
# Login and scrape
# (any CAPTCHA on the login form is detected and solved automatically)
scraper.login("https://example.com/login", {
    "email": "user@example.com",
    "password": "pass123",
})
# Now scrape authenticated pages
soup = scraper.get("https://example.com/dashboard")
# Dump every table row as a list of its cell strings
for row in soup.select("table tr"):
    cells = [td.get_text(strip=True) for td in row.select("td")]
    print(cells)
When to use Beautiful Soup vs browser automation
| Scenario | Use BS4 + requests | Use Selenium/Playwright |
|---|---|---|
| Server-rendered HTML | Yes | Overkill |
| JavaScript-rendered content | No | Yes |
| Complex multi-step form | Maybe | Preferred |
| High-volume scraping | Yes (faster) | Slower |
| Sites with JS fingerprinting | No | Yes |
| Simple login + scrape | Yes | Not needed |
Troubleshooting
| Symptom | Cause | Fix |
|---|---|---|
| Sitekey extraction returns None | CAPTCHA loaded via JavaScript | Switch to Selenium/Playwright |
| Form submission returns login page | Missing CSRF token | Extract all hidden inputs with extract_form_data() |
| 403 after form POST | Bot detection on headers | Add realistic User-Agent and Referer headers |
| Token rejected | Wrong pageurl parameter | Use the exact URL shown in the browser |
| Cookies lost between requests | Not using requests.Session() | Always use a session object |
Frequently asked questions
Can Beautiful Soup solve CAPTCHAs?
No — Beautiful Soup is an HTML parser. It extracts CAPTCHA parameters (sitekeys, image URLs). CaptchaAI does the actual solving. requests handles the HTTP communication.
When should I use a browser instead?
When the page requires JavaScript to render content, when the CAPTCHA is loaded dynamically, or when the site uses JavaScript-based fingerprinting.
Is this faster than Selenium?
Yes. requests + Beautiful Soup skips browser startup, JavaScript execution, and rendering, making it 5-10x faster per page.
Summary
Python Beautiful Soup + CaptchaAI provides the fastest scraping stack for CAPTCHA-protected pages that serve HTML directly. Parse sitekeys with BS4, solve with the API, and submit via requests.Session().
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In · No comments yet.