Healthcare data portals — provider directories, drug pricing databases, and clinical trial registries — use CAPTCHAs to prevent automated data harvesting. Researchers and health-tech platforms need this data for analytics, compliance, and informed decision-making.
Where CAPTCHAs Appear
| Source | CAPTCHA Type | Data | Use Case |
|---|---|---|---|
| Provider directories (NPI) | Image CAPTCHA | Doctor/facility lookup | Network adequacy |
| Drug pricing portals | reCAPTCHA v2 | Medication prices | Price transparency |
| Clinical trial registries | reCAPTCHA v2 | Trial data, results | Research analysis |
| Insurance formularies | reCAPTCHA v2 | Drug coverage lists | Formulary comparison |
| State licensing boards | Image CAPTCHA | License verification | Credential checks |
| Hospital quality ratings | Cloudflare Turnstile | Quality metrics | Performance analysis |
Provider Directory Scraper
import base64
import csv
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# CaptchaAI credentials and endpoint (2captcha-compatible in.php/res.php API).
CAPTCHAAI_KEY = "YOUR_API_KEY"
CAPTCHAAI_URL = "https://ocr.captchaai.com"
def solve_recaptcha(sitekey, pageurl):
    """Solve a reCAPTCHA v2 via the CaptchaAI (2captcha-compatible) API.

    Submits the task to ``in.php`` and polls ``res.php`` every 5 seconds,
    up to 5 minutes.

    Args:
        sitekey: The target page's reCAPTCHA site key (``data-sitekey``).
        pageurl: URL of the page hosting the CAPTCHA.

    Returns:
        The solved ``g-recaptcha-response`` token string.

    Raises:
        RuntimeError: If the API rejects the submission or cannot solve
            the challenge (e.g. ERROR_ZERO_BALANCE, ERROR_CAPTCHA_UNSOLVABLE).
        TimeoutError: If no answer arrives within the polling window.
    """
    resp = requests.post(f"{CAPTCHAAI_URL}/in.php", data={
        "key": CAPTCHAAI_KEY, "method": "userrecaptcha",
        "googlekey": sitekey, "pageurl": pageurl, "json": 1,
    }, timeout=30)
    submit = resp.json()
    # With json=1 the API answers {"status": 1, "request": <task id>} on
    # success; status 0 puts an error code in "request". The original code
    # skipped this check and would poll with an error string as the task id.
    if submit.get("status") != 1:
        raise RuntimeError(f"CAPTCHA submit failed: {submit.get('request')}")
    task_id = submit["request"]
    for _ in range(60):  # 60 polls x 5 s = 5 minute budget
        time.sleep(5)
        result = requests.get(f"{CAPTCHAAI_URL}/res.php", params={
            "key": CAPTCHAAI_KEY, "action": "get",
            "id": task_id, "json": 1,
        }, timeout=30)
        data = result.json()
        # "CAPCHA_NOT_READY" (sic — the API misspells it) means keep waiting.
        if data["request"] == "CAPCHA_NOT_READY":
            continue
        if data.get("status") == 1:
            return data["request"]
        # Any other reply is an error code, not a token — surface it instead
        # of returning it to the caller as if it were a solution.
        raise RuntimeError(f"CAPTCHA solve failed: {data['request']}")
    raise TimeoutError("Timeout")
def solve_image_captcha(image_bytes):
    """Solve a classic image CAPTCHA via the CaptchaAI base64 upload method.

    Args:
        image_bytes: Raw bytes of the CAPTCHA image.

    Returns:
        The recognized CAPTCHA text.

    Raises:
        RuntimeError: If the API rejects the upload or cannot read the image.
        TimeoutError: If no answer arrives within the polling window.
    """
    img_b64 = base64.b64encode(image_bytes).decode()
    resp = requests.post(f"{CAPTCHAAI_URL}/in.php", data={
        "key": CAPTCHAAI_KEY, "method": "base64",
        "body": img_b64, "json": 1,
    }, timeout=30)
    submit = resp.json()
    # status 0 means the upload itself failed; "request" carries the error
    # code, not a task id — fail fast rather than polling with it.
    if submit.get("status") != 1:
        raise RuntimeError(f"CAPTCHA submit failed: {submit.get('request')}")
    task_id = submit["request"]
    for _ in range(20):  # 20 polls x 3 s = 60 second budget
        time.sleep(3)
        result = requests.get(f"{CAPTCHAAI_URL}/res.php", params={
            "key": CAPTCHAAI_KEY, "action": "get",
            "id": task_id, "json": 1,
        }, timeout=30)
        data = result.json()
        # "CAPCHA_NOT_READY" (sic) means the worker is still solving.
        if data["request"] == "CAPCHA_NOT_READY":
            continue
        if data.get("status") == 1:
            return data["request"]
        # Distinguish API errors (e.g. ERROR_CAPTCHA_UNSOLVABLE) from answers.
        raise RuntimeError(f"CAPTCHA solve failed: {data['request']}")
    raise TimeoutError("Timeout")
class HealthcareDataCollector:
    """Session-based scraper for public healthcare data portals.

    Wraps a ``requests.Session`` (optionally proxied) and delegates CAPTCHA
    handling to the module-level ``solve_recaptcha`` / ``solve_image_captcha``
    helpers. Parsing relies on a handful of CSS selectors that match the
    common portal layouts described in this guide.
    """

    def __init__(self, proxy=None):
        """Create the HTTP session.

        Args:
            proxy: Proxy URL (e.g. ``http://user:pass@host:port``) applied to
                both HTTP and HTTPS traffic, or ``None`` for direct access.
        """
        self.session = requests.Session()
        if proxy:
            self.session.proxies = {"http": proxy, "https": proxy}
        # Browser-like UA; some portals reject the default python-requests UA.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
        })

    def search_providers(self, portal_url, specialty, location, sitekey=None):
        """Search a provider directory with CAPTCHA handling.

        Args:
            portal_url: Search page URL (GET to load, POST to submit).
            specialty: Specialty search term.
            location: Location search term.
            sitekey: reCAPTCHA v2 site key if the portal uses one; when
                ``None``, an embedded image CAPTCHA is looked for instead.

        Returns:
            List of provider dicts (see ``_parse_providers``).
        """
        resp = self.session.get(portal_url, timeout=30)
        data = {"specialty": specialty, "location": location}
        if sitekey:
            # reCAPTCHA v2: solve out-of-band and submit the token.
            data["g-recaptcha-response"] = solve_recaptcha(sitekey, portal_url)
        else:
            # Fall back to an inline image CAPTCHA if the page embeds one.
            captcha_img = re.search(r'src="(/captcha[^"]+)"', resp.text)
            if captcha_img:
                # urljoin resolves the root-relative src against the page URL.
                # The previous concatenation (portal_url.rstrip("/") + path)
                # produced a wrong URL whenever portal_url contained a path
                # such as "/search".
                img_url = urljoin(portal_url, captcha_img.group(1))
                img = self.session.get(img_url, timeout=30)
                data["captcha"] = solve_image_captcha(img.content)
        resp = self.session.post(portal_url, data=data, timeout=30)
        return self._parse_providers(resp.text)

    def lookup_drug_prices(self, pricing_url, drug_name, zip_code, sitekey):
        """Look up drug prices behind a reCAPTCHA v2 gate.

        Args:
            pricing_url: Price-comparison endpoint.
            drug_name: Medication name to search.
            zip_code: ZIP code for localized pricing.
            sitekey: The portal's reCAPTCHA v2 site key.

        Returns:
            List of price dicts, or an empty list on a non-200 response.
        """
        # Load the search page first so the session picks up any cookies.
        self.session.get(pricing_url, timeout=30)
        token = solve_recaptcha(sitekey, pricing_url)
        resp = self.session.post(pricing_url, data={
            "drug": drug_name,
            "zip": zip_code,
            "g-recaptcha-response": token,
        }, timeout=30)
        if resp.status_code == 200:
            return self._parse_prices(resp.text)
        return []

    def batch_provider_lookup(self, portal_url, specialties, locations, output_file):
        """Batch-search every (specialty, location) pair and export to CSV.

        Failures for individual pairs are logged and skipped so one bad
        search does not abort the whole batch.

        Returns:
            The combined list of provider dicts (also written to
            ``output_file`` when non-empty).
        """
        all_providers = []
        for specialty in specialties:
            for location in locations:
                try:
                    providers = self.search_providers(
                        portal_url, specialty, location,
                    )
                    # Tag each row with the search terms that produced it.
                    for p in providers:
                        p["specialty_search"] = specialty
                        p["location_search"] = location
                    all_providers.extend(providers)
                    print(f"{specialty} / {location}: {len(providers)} providers")
                    time.sleep(5)  # polite pacing between searches
                except Exception as e:
                    print(f"Error: {specialty} / {location}: {e}")
        if all_providers:
            # Union of keys in first-seen order: DictWriter raises on rows
            # with fields missing from the header, so don't trust row 0 alone.
            keys = list(dict.fromkeys(k for p in all_providers for k in p))
            with open(output_file, "w", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=keys)
                writer.writeheader()
                writer.writerows(all_providers)
        return all_providers

    def _parse_providers(self, html):
        """Extract provider rows from a results page into a list of dicts."""
        soup = BeautifulSoup(html, "html.parser")
        providers = []
        # Selectors cover the card, list, and table layouts seen in the wild.
        for card in soup.select(".provider-card, .doctor-result, tr.provider"):
            providers.append({
                "name": self._text(card, ".name, .provider-name"),
                "specialty": self._text(card, ".specialty"),
                "address": self._text(card, ".address"),
                "phone": self._text(card, ".phone"),
                "accepting": self._text(card, ".accepting-patients"),
            })
        return providers

    def _parse_prices(self, html):
        """Extract pharmacy price rows from a results page."""
        soup = BeautifulSoup(html, "html.parser")
        prices = []
        for row in soup.select(".pharmacy-row, .price-result"):
            prices.append({
                "pharmacy": self._text(row, ".pharmacy-name"),
                "price": self._text(row, ".price, .drug-price"),
                "quantity": self._text(row, ".quantity"),
            })
        return prices

    def _text(self, el, selector):
        """Return stripped text of the first match of *selector*, or ""."""
        found = el.select_one(selector)
        return found.get_text(strip=True) if found else ""
# Usage
# NOTE: replace the proxy credentials, URLs, and sitekey below with real
# values before running — these are placeholders and will not resolve.
collector = HealthcareDataCollector(
    proxy="http://user:pass@residential.proxy.com:5000"
)
# Provider search
# No sitekey passed, so an embedded image CAPTCHA (if any) is handled instead.
providers = collector.search_providers(
    portal_url="https://provider-directory.example.com/search",
    specialty="Cardiology",
    location="New York, NY",
)
# Drug pricing
# sitekey is the portal's reCAPTCHA v2 site key (the data-sitekey attribute).
prices = collector.lookup_drug_prices(
    pricing_url="https://drug-prices.example.com/compare",
    drug_name="atorvastatin",
    zip_code="10001",
    sitekey="6Lc_xxxxxxx",
)
Clinical Trial Data Collection
def collect_clinical_trials(search_url, condition, sitekey,
                            proxy="http://user:pass@residential.proxy.com:5000"):
    """Collect clinical trial data for a medical condition.

    Args:
        search_url: Registry search endpoint protected by reCAPTCHA v2.
        condition: Condition to search for (e.g. "diabetes").
        sitekey: The registry's reCAPTCHA v2 site key.
        proxy: Proxy URL for the collector session. Defaults to the same
            placeholder used elsewhere in this guide; previously this value
            was hard-coded inside the function.

    Returns:
        List of dicts with trial metadata (title, status, sponsor, phase,
        enrollment, location); empty list when the request fails.
    """
    collector = HealthcareDataCollector(proxy=proxy)
    token = solve_recaptcha(sitekey, search_url)
    resp = collector.session.post(search_url, data={
        "condition": condition,
        "status": "recruiting",
        "g-recaptcha-response": token,
    }, timeout=30)
    if resp.status_code != 200:
        return []
    soup = BeautifulSoup(resp.text, "html.parser")
    trials = []
    # Selectors cover the two result layouts common on registry portals.
    for item in soup.select(".trial-item, .study-result"):
        trials.append({
            "title": collector._text(item, ".title, h3"),
            "status": collector._text(item, ".status"),
            "sponsor": collector._text(item, ".sponsor"),
            "phase": collector._text(item, ".phase"),
            "enrollment": collector._text(item, ".enrollment"),
            "location": collector._text(item, ".location"),
        })
    return trials
Data Privacy Considerations
| Data Type | Sensitivity | Recommendation |
|---|---|---|
| Provider directories | Low (public info) | Generally safe to collect |
| Drug pricing | Low (public pricing) | Permitted for transparency |
| Clinical trial metadata | Low (public registries) | Research use appropriate |
| Patient reviews | Medium | Anonymize before analysis |
| Insurance plan details | Low (published rates) | Permitted for comparison |
Important: Never attempt to collect protected health information (PHI). Focus only on publicly available, non-patient-specific data.
Troubleshooting
| Issue | Cause | Fix |
|---|---|---|
| Image CAPTCHA unreadable | Low-quality image | Retry — new image generated |
| Provider search returns empty | CAPTCHA blocked the search | Solve CAPTCHA before submitting |
| Drug price varies by location | Geo-based pricing | Match proxy location to zip code |
| Session expires on multi-page | Portal timeout | Complete searches quickly |
| Rate limited on batch lookups | Too many requests | Add 5-10 sec delays |
FAQ
Is collecting healthcare pricing data allowed?
Drug pricing transparency is encouraged by regulation (CMS Price Transparency Rule). Public provider directory data is generally accessible.
Can I compare drug prices across pharmacies?
Yes. Services like GoodRx do this at scale. CaptchaAI handles the CAPTCHAs that pricing portals use to limit automated access.
How do I handle HIPAA when scraping healthcare sites?
HIPAA applies to protected health information (PHI). Public data like provider directories, drug prices, and clinical trial registries are not PHI. Never scrape individual patient records.
Related Guides
Collect healthcare data efficiently — get your CaptchaAI key and automate provider and pricing lookups.
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In. No comments yet.