Market research requires data from competitor websites, review platforms, job boards, and industry directories — most of which are CAPTCHA-protected. CaptchaAI automates the solving so your data pipeline stays unbroken.
Common Data Sources and Their CAPTCHAs
| Source Type | Examples | CAPTCHA Type |
|---|---|---|
| Review sites | G2, Trustpilot, Yelp | reCAPTCHA v2/v3 |
| Job boards | LinkedIn, Indeed | Cloudflare Challenge |
| Business directories | YellowPages, Crunchbase | Turnstile, reCAPTCHA |
| Social media | Twitter/X, Reddit | Various |
| Patent databases | USPTO, Google Patents | reCAPTCHA v2 |
| Government data | SEC filings, census | Image CAPTCHA |
Data Collection Pipeline
import requests
import time
import re
import csv
import os
# CaptchaAI API key is read from the environment so it never lands in source
# control; a KeyError here means CAPTCHAAI_API_KEY is not set.
API_KEY = os.environ["CAPTCHAAI_API_KEY"]
def solve_captcha(params):
    """Submit a CAPTCHA task to CaptchaAI and poll until it is solved.

    Args:
        params: Task parameters for the ``in.php`` endpoint (e.g. ``method``,
            ``sitekey``/``googlekey``, ``pageurl``). The API key is injected
            automatically; the caller's dict is not modified.

    Returns:
        The solved CAPTCHA token as a string.

    Raises:
        Exception: If the submit or poll response reports an API error.
        TimeoutError: If the task is not solved within ~300 seconds.
    """
    # Copy instead of mutating the caller's dict (the original wrote
    # params["key"] in place, leaking the key into the caller's object).
    params = {**params, "key": API_KEY}
    # timeout= prevents a hung connection from stalling the whole pipeline.
    resp = requests.get(
        "https://ocr.captchaai.com/in.php", params=params, timeout=30
    )
    if not resp.text.startswith("OK|"):
        raise Exception(f"Submit: {resp.text}")
    task_id = resp.text.split("|")[1]
    # Poll every 5 s, up to 60 times (~300 s total).
    for _ in range(60):
        time.sleep(5)
        result = requests.get(
            "https://ocr.captchaai.com/res.php",
            params={"key": API_KEY, "action": "get", "id": task_id},
            timeout=30,
        )
        # NOTE: the API really spells it "CAPCHA_NOT_READY" (sic) — do not
        # "fix" this string.
        if result.text == "CAPCHA_NOT_READY":
            continue
        if result.text.startswith("OK|"):
            return result.text.split("|", 1)[1]
        raise Exception(f"Solve: {result.text}")
    raise TimeoutError(f"CAPTCHA task {task_id} not solved within 300 seconds")
class MarketResearchCollector:
    """Collects market-research data (reviews, company profiles) from
    CAPTCHA-protected pages, solving challenges via CaptchaAI as needed.
    """

    # Per-request timeout in seconds; prevents a hung request stalling a run.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.session = requests.Session()
        # Browser-like User-Agent reduces trivial bot blocks.
        self.session.headers["User-Agent"] = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 Chrome/120.0.0.0"
        )

    def fetch(self, url):
        """Fetch a page, solving CAPTCHAs as needed.

        Returns the final HTML text after any CAPTCHA round-trips.
        """
        resp = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
        # Detect reCAPTCHA (any data-sitekey attribute on the page).
        match = re.search(
            r'data-sitekey=["\']([A-Za-z0-9_-]+)["\']', resp.text
        )
        if match:
            token = solve_captcha({
                "method": "userrecaptcha",
                "googlekey": match.group(1),
                "pageurl": url,
            })
            resp = self.session.post(url, data={
                "g-recaptcha-response": token,
            }, timeout=self.REQUEST_TIMEOUT)
        # Detect Cloudflare Turnstile (checked via the cf-turnstile marker).
        match = re.search(
            r'data-sitekey=["\']([0-9x][A-Za-z0-9_-]+)["\']', resp.text
        )
        if match and "cf-turnstile" in resp.text:
            token = solve_captcha({
                "method": "turnstile",
                "sitekey": match.group(1),
                "pageurl": url,
            })
            resp = self.session.post(url, data={
                "cf-turnstile-response": token,
            }, timeout=self.REQUEST_TIMEOUT)
        return resp.text

    def collect_reviews(self, urls):
        """Collect review data from multiple pages.

        Errors on individual pages are logged and skipped so one bad URL
        does not abort the whole run.
        """
        reviews = []
        for url in urls:
            try:
                html = self.fetch(url)
                page_reviews = self._parse_reviews(html)
                reviews.extend(page_reviews)
                print(f"  Collected {len(page_reviews)} reviews from {url}")
                time.sleep(2)  # Polite delay between pages
            except Exception as e:
                print(f"  Error on {url}: {e}")
        return reviews

    def collect_company_profiles(self, urls):
        """Collect company profile data, one profile dict per page."""
        profiles = []
        for url in urls:
            try:
                html = self.fetch(url)
                profile = self._parse_profile(html)
                if profile:
                    profiles.append(profile)
                    print(f"  Collected: {profile.get('name', 'Unknown')}")
                time.sleep(2)  # Polite delay between pages
            except Exception as e:
                print(f"  Error on {url}: {e}")
        return profiles

    def _parse_reviews(self, html):
        """Extract review dicts from HTML via a generic class-name pattern."""
        reviews = []
        for match in re.finditer(
            r'class="review-text"[^>]*>(.*?)</div>', html, re.DOTALL
        ):
            reviews.append({
                # Cap review length to keep exports manageable.
                "text": match.group(1).strip()[:500],
            })
        return reviews

    def _parse_profile(self, html):
        """Extract a company profile from HTML.

        Returns a dict with ``name`` and ``description`` (either may be
        None), or None when neither field was found — so callers'
        ``if profile:`` guard actually filters empty pages.
        """
        name = re.search(r'<h1[^>]*>(.*?)</h1>', html)
        desc = re.search(
            r'class="description"[^>]*>(.*?)</div>', html, re.DOTALL
        )
        if not name and not desc:
            # Original returned a truthy all-None dict here, defeating the
            # caller's filter.
            return None
        return {
            "name": name.group(1).strip() if name else None,
            "description": desc.group(1).strip()[:300] if desc else None,
        }

    def export_csv(self, data, filename):
        """Export collected data (a list of dicts) to a CSV file."""
        if not data:
            return
        # Union of keys across all rows, preserving first-seen order, so
        # heterogeneous dicts don't raise ValueError in DictWriter.
        keys = list(dict.fromkeys(k for row in data for k in row))
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(data)
        # Bug fix: original printed a literal placeholder instead of the
        # actual target filename.
        print(f"Exported {len(data)} records to {filename}")
Usage
# Example usage: gather competitor reviews and company profiles, then
# persist each dataset as CSV.
collector = MarketResearchCollector()

# Competitor review pages to harvest.
review_pages = [
    "https://example-reviews.com/product/competitor-a",
    "https://example-reviews.com/product/competitor-b",
    "https://example-reviews.com/product/competitor-c",
]
collector.export_csv(
    collector.collect_reviews(review_pages), "competitor_reviews.csv"
)

# Business-directory pages with company profiles.
directory_pages = [
    "https://example-directory.com/company/alpha-corp",
    "https://example-directory.com/company/beta-inc",
]
collector.export_csv(
    collector.collect_company_profiles(directory_pages), "company_profiles.csv"
)
Use Cases
Competitive Pricing Intelligence
Monitor competitor pricing across e-commerce platforms. Track price changes, promotions, and stock levels.
Brand Sentiment Analysis
Collect reviews and ratings from review platforms. Aggregate sentiment data across multiple sources.
Job Market Analysis
Scrape job postings to understand hiring trends, salary ranges, and skill demand in your industry.
Patent Landscape Research
Collect patent filings from public databases to track innovation trends and competitor R&D activity.
Scaling Tips
| Factor | Recommendation |
|---|---|
| Request spacing | 2-5 seconds between pages |
| Concurrent collectors | 5-10 for moderate scale |
| Proxy rotation | Required for 100+ pages/hour |
| Data deduplication | Hash-based dedup before storage |
| Scheduling | Run daily or weekly for trend data |
FAQ
Is scraping market data legal?
Public data scraping is generally permitted. Always check the site's terms of service and comply with local regulations. Don't scrape personal data without consent.
How much does this cost with CaptchaAI?
Typical market research scraping triggers CAPTCHAs on roughly 30% of pages. For 1,000 pages/day, expect around 300 solves, costing approximately $0.50–$3.00 in total.
How do I handle sites that block scrapers entirely?
Combine CaptchaAI with proxy rotation and realistic request patterns. See our Scraping Without Getting Blocked guide.
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In

No comments yet.