Legal databases, court filing systems, and case law repositories protect their data with CAPTCHAs. Law firms, legal tech companies, and compliance teams need automated access to search case law, monitor filings, and aggregate regulatory data. CaptchaAI handles CAPTCHA challenges on these portals.
Legal Data Sources and CAPTCHAs
| Source | CAPTCHA Type | Data | Users |
|---|---|---|---|
| PACER | reCAPTCHA v2 | Federal court filings | Litigation teams |
| State court systems | Image CAPTCHA / reCAPTCHA | State case records | Attorneys |
| SEC EDGAR | reCAPTCHA v2 | Corporate filings | Compliance |
| Patent databases | reCAPTCHA v2 | Patent records | IP researchers |
| Regulatory portals | Image CAPTCHA | Rules, guidance | Compliance |
| Legal citation databases | reCAPTCHA v2 | Case citations | Legal tech |
| Bar association directories | reCAPTCHA v2 | Attorney records | Due diligence |
Case Law Search Engine
import base64
import csv
import re
import time
from urllib.parse import quote_plus, urljoin

import requests
from bs4 import BeautifulSoup
# CaptchaAI account key — replace the placeholder with your real API key.
CAPTCHAAI_KEY = "YOUR_API_KEY"
# Base URL of the CaptchaAI HTTP API (2captcha-compatible in.php / res.php endpoints).
CAPTCHAAI_URL = "https://ocr.captchaai.com"
def solve_recaptcha(sitekey, pageurl):
    """Solve a reCAPTCHA v2 challenge through the CaptchaAI API.

    Args:
        sitekey: The target page's reCAPTCHA public key (``data-sitekey``).
        pageurl: URL of the page hosting the CAPTCHA widget.

    Returns:
        str: The solved ``g-recaptcha-response`` token.

    Raises:
        RuntimeError: If the API rejects the task or reports a solve error.
        TimeoutError: If no solution arrives within ~5 minutes.
    """
    resp = requests.post(f"{CAPTCHAAI_URL}/in.php", data={
        "key": CAPTCHAAI_KEY, "method": "userrecaptcha",
        "googlekey": sitekey, "pageurl": pageurl, "json": 1,
    }, timeout=30)
    submission = resp.json()
    # status == 1 means the task was accepted; otherwise "request" carries an
    # error code (e.g. ERROR_WRONG_USER_KEY) that must not be treated as an id.
    if submission.get("status") != 1:
        raise RuntimeError(f"CaptchaAI submit failed: {submission.get('request')}")
    task_id = submission["request"]
    for _ in range(60):  # poll every 5 s, up to ~5 minutes
        time.sleep(5)
        result = requests.get(f"{CAPTCHAAI_URL}/res.php", params={
            "key": CAPTCHAAI_KEY, "action": "get",
            "id": task_id, "json": 1,
        }, timeout=30)
        data = result.json()
        if data["request"] == "CAPCHA_NOT_READY":  # (sic) spelling per the API
            continue
        # Error results (e.g. ERROR_CAPTCHA_UNSOLVABLE) also end polling but
        # arrive with status 0 — previously they were returned as if tokens.
        if data.get("status") != 1:
            raise RuntimeError(f"CaptchaAI solve failed: {data['request']}")
        return data["request"]
    raise TimeoutError("Timeout")
def solve_image_captcha(image_bytes):
    """Solve a classic image/text CAPTCHA through the CaptchaAI API.

    Args:
        image_bytes: Raw image bytes of the CAPTCHA to solve.

    Returns:
        str: The recognized CAPTCHA text.

    Raises:
        RuntimeError: If the API rejects the task or reports a solve error.
        TimeoutError: If no solution arrives within ~1 minute.
    """
    img_b64 = base64.b64encode(image_bytes).decode()
    resp = requests.post(f"{CAPTCHAAI_URL}/in.php", data={
        "key": CAPTCHAAI_KEY, "method": "base64",
        "body": img_b64, "json": 1,
    }, timeout=30)
    submission = resp.json()
    # status == 1 means the task was accepted; otherwise "request" is an error
    # code, not a task id — fail fast instead of polling with garbage.
    if submission.get("status") != 1:
        raise RuntimeError(f"CaptchaAI submit failed: {submission.get('request')}")
    task_id = submission["request"]
    for _ in range(20):  # poll every 3 s, up to ~1 minute
        time.sleep(3)
        result = requests.get(f"{CAPTCHAAI_URL}/res.php", params={
            "key": CAPTCHAAI_KEY, "action": "get",
            "id": task_id, "json": 1,
        }, timeout=30)
        data = result.json()
        if data["request"] == "CAPCHA_NOT_READY":  # (sic) spelling per the API
            continue
        # status 0 here signals a solve error (e.g. ERROR_CAPTCHA_UNSOLVABLE);
        # previously the error string was returned as if it were the answer.
        if data.get("status") != 1:
            raise RuntimeError(f"CaptchaAI solve failed: {data['request']}")
        return data["request"]
    raise TimeoutError("Timeout")
class LegalResearchScraper:
    """Scrape case-law search portals and dockets that sit behind CAPTCHAs.

    Maintains one ``requests.Session`` (optionally proxied) across requests
    and defers CAPTCHA solving to the module-level CaptchaAI helpers.
    """

    def __init__(self, proxy=None):
        """Set up the HTTP session.

        Args:
            proxy: Optional proxy URL applied to both http and https traffic.
        """
        self.session = requests.Session()
        if proxy:
            self.session.proxies = {"http": proxy, "https": proxy}
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
        })

    def search_cases(self, search_url, query, sitekey=None, max_pages=5):
        """Search a case-law database page by page, solving CAPTCHAs as needed.

        Args:
            search_url: Search endpoint URL (without query string).
            query: Free-text query; URL-encoded before being placed in the URL.
            sitekey: Optional reCAPTCHA sitekey. When absent, an image-CAPTCHA
                flow is attempted instead.
            max_pages: Maximum number of result pages to walk.

        Returns:
            list[dict]: Parsed case records from all fetched pages.
        """
        all_cases = []
        for page in range(max_pages):
            # quote_plus guards against spaces, '&' and '#' in the query —
            # previously the raw text could corrupt the query string.
            url = f"{search_url}?q={quote_plus(query)}&page={page + 1}"
            resp = self.session.get(url, timeout=30)
            if self._has_captcha(resp.text):
                if sitekey:
                    token = solve_recaptcha(sitekey, url)
                    resp = self.session.post(url, data={
                        "q": query,
                        "g-recaptcha-response": token,
                    }, timeout=30)
                else:
                    resp = self._solve_image_and_retry(resp.text, url, query)
            cases = self._parse_cases(resp.text)
            if not cases:  # empty page — assume no further results
                break
            all_cases.extend(cases)
            print(f"Page {page + 1}: {len(cases)} cases")
            time.sleep(5)  # polite inter-page delay
        return all_cases

    def get_case_details(self, case_url):
        """Fetch full case details.

        Args:
            case_url: Absolute URL of the case detail page.

        Returns:
            dict: title/citation/court/date/judge/summary/url fields (empty
            strings where a selector matches nothing).
        """
        resp = self.session.get(case_url, timeout=30)
        if self._has_captcha(resp.text):
            # The sitekey is unknown up front for arbitrary detail pages,
            # so extract it from the CAPTCHA page itself.
            sitekey = self._extract_sitekey(resp.text)
            if sitekey:
                token = solve_recaptcha(sitekey, case_url)
                resp = self.session.post(case_url, data={
                    "g-recaptcha-response": token,
                }, timeout=30)
        soup = BeautifulSoup(resp.text, "html.parser")
        return {
            "title": self._text(soup, "h1, .case-title"),
            "citation": self._text(soup, ".citation, .case-cite"),
            "court": self._text(soup, ".court, .jurisdiction"),
            "date": self._text(soup, ".decision-date, .date-decided"),
            "judge": self._text(soup, ".judge, .authored-by"),
            "summary": self._text(soup, ".summary, .headnote"),
            "url": case_url,
        }

    def monitor_docket(self, docket_url, case_number, sitekey=None):
        """Monitor a specific case docket for new filings.

        Args:
            docket_url: Docket search/lookup endpoint.
            case_number: Case number submitted with the lookup form.
            sitekey: Optional reCAPTCHA sitekey for the docket page.

        Returns:
            list[dict]: Parsed docket entries.
        """
        resp = self.session.get(docket_url, timeout=30)
        data = {"case_number": case_number}
        if sitekey and self._has_captcha(resp.text):
            data["g-recaptcha-response"] = solve_recaptcha(sitekey, docket_url)
        resp = self.session.post(docket_url, data=data, timeout=30)
        return self._parse_docket(resp.text)

    def export_results(self, cases, filename):
        """Export case results to CSV.

        Column order follows the keys of the first record; does nothing
        when ``cases`` is empty.
        """
        if not cases:
            return
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=cases[0].keys())
            writer.writeheader()
            writer.writerows(cases)

    def _has_captcha(self, html):
        # Heuristic: any of these markers in the page suggests a CAPTCHA
        # interstitial rather than real results.
        return any(tag in html.lower() for tag in [
            'data-sitekey', 'g-recaptcha', 'captcha',
        ])

    def _extract_sitekey(self, html):
        """Return the first data-sitekey attribute value, or None."""
        match = re.search(r'data-sitekey="([^"]+)"', html)
        return match.group(1) if match else None

    def _solve_image_and_retry(self, html, url, query):
        """Solve an embedded image CAPTCHA and resubmit the search.

        Falls back to a plain re-fetch when no CAPTCHA image is present.
        """
        match = re.search(r'src="(/captcha[^"]+)"', html)
        if match:
            # urljoin resolves the root-relative /captcha... path against the
            # host; the old string concatenation wrongly appended it to the
            # search path (e.g. /search/captcha...).
            img_url = urljoin(url, match.group(1))
            img = self.session.get(img_url, timeout=30)
            answer = solve_image_captcha(img.content)
            return self.session.post(url, data={
                "q": query,
                "captcha": answer,
            }, timeout=30)
        return self.session.get(url, timeout=30)

    def _parse_cases(self, html):
        """Parse a search-results page into a list of case dicts."""
        soup = BeautifulSoup(html, "html.parser")
        cases = []
        # Selectors cover the common markup variants across portals.
        for item in soup.select(".case-result, .search-result, tr.result"):
            title_el = item.select_one("a, .case-name")
            if title_el:
                cases.append({
                    "title": title_el.get_text(strip=True),
                    "url": title_el.get("href", ""),
                    "citation": self._text(item, ".citation, .cite"),
                    "date": self._text(item, ".date"),
                    "court": self._text(item, ".court"),
                })
        return cases

    def _parse_docket(self, html):
        """Parse a docket page into a list of filing-entry dicts."""
        soup = BeautifulSoup(html, "html.parser")
        entries = []
        for row in soup.select(".docket-entry, tr.filing"):
            entries.append({
                "date": self._text(row, ".date, td:first-child"),
                "entry": self._text(row, ".description, td:nth-child(2)"),
                "filed_by": self._text(row, ".filer, td:nth-child(3)"),
            })
        return entries

    def _text(self, el, selector):
        """Return stripped text of the first selector match, or ''. """
        found = el.select_one(selector)
        return found.get_text(strip=True) if found else ""
# Usage example
client = LegalResearchScraper(
    proxy="http://user:pass@residential.proxy.com:5000"
)

# Run a case-law search
results = client.search_cases(
    search_url="https://caselaw.example.com/search",
    query="data privacy GDPR",
    max_pages=5,
)

# Pull full details for the first ten hits that carry a link
for hit in results[:10]:
    if not hit["url"]:
        continue
    detail = client.get_case_details(hit["url"])
    print(f"{detail['citation']}: {detail['title']}")
    time.sleep(3)

# Write everything out as CSV
client.export_results(results, "gdpr_cases.csv")
Regulatory Monitoring
class RegulatoryMonitor:
    """Poll a set of regulatory feeds and surface filings not seen before."""

    def __init__(self, proxy=None):
        # One shared scraper for every feed; dedupe state persists across calls.
        self.scraper = LegalResearchScraper(proxy=proxy)
        self.seen_entries = set()

    def check_new_filings(self, feeds):
        """Check regulatory portals for new filings."""
        fresh = []
        for feed in feeds:
            try:
                # Shallow scan: two pages per feed is enough to spot new items.
                for entry in self.scraper.search_cases(
                    feed["url"], feed["query"],
                    sitekey=feed.get("sitekey"),
                    max_pages=2,
                ):
                    dedupe_key = entry.get("citation") or entry.get("title")
                    if not dedupe_key or dedupe_key in self.seen_entries:
                        continue
                    self.seen_entries.add(dedupe_key)
                    entry["source"] = feed["name"]
                    fresh.append(entry)
            except Exception as exc:
                # Best-effort: one broken feed must not abort the sweep.
                print(f"Error checking {feed['name']}: {exc}")
            time.sleep(5)
        return fresh
Troubleshooting
| Issue | Cause | Fix |
|---|---|---|
| Image CAPTCHA fails repeatedly | Distorted text | Report the bad answer to the API and retry — a fresh image is served |
| PACER blocks access | Rate limit exceeded | Wait 30 min, reduce request frequency |
| Case details incomplete | Behind paywall | Pay per-page fees where required |
| Search returns no results | CAPTCHA page returned instead | Check for CAPTCHA before parsing |
| Docket monitoring misses filings | Check interval too long | Increase check frequency |
FAQ
Is automated access to court records legal?
Public court records are generally accessible. PACER charges per-page fees regardless of access method. Always respect system rate limits and terms of use.
How do I handle PACER fees?
PACER charges per page. Budget for fees when planning bulk data collection. Consider RECAP (free archive) for already-downloaded documents.
Which legal databases have the most CAPTCHAs?
State court systems are worst — many use older image CAPTCHAs. Federal courts (PACER) use reCAPTCHA v2. Patent databases also frequently trigger CAPTCHAs.
Related Guides
Streamline legal research — get your CaptchaAI key and automate case law and filing searches.
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In. No comments yet.