Salary databases, job boards, and government labor portals protect compensation data with Cloudflare Turnstile and reCAPTCHA. CAPTCHAs trigger when querying salary ranges by role, location, or industry — especially during bulk data collection across many job titles. Here's how to handle them.
CAPTCHA Patterns on Salary Portals
| Source type | CAPTCHA | Trigger |
|---|---|---|
| Salary comparison sites | Cloudflare Turnstile | Repeated search queries |
| Job board salary filters | reCAPTCHA v2 | Multiple salary lookups |
| Government labor statistics | Image CAPTCHA | Data download requests |
| Corporate salary pages | Cloudflare Challenge | Bulk page views |
| HR survey platforms | reCAPTCHA v3 | Form submissions |
Salary Data Collector
import requests
import time
import re
from dataclasses import dataclass
@dataclass
class SalaryRecord:
    """One salary entry extracted from a portal results page."""

    # Job title and location as displayed in the result row.
    title: str
    location: str

    # Salary figures as plain numbers (currency symbols and separators removed).
    min_salary: float
    max_salary: float
    median_salary: float

    # Count reported alongside the figures; presumably the number of data
    # points behind them -- confirm against the portal being scraped.
    sample_size: int

    # Identifier of where the record came from (the collector passes the portal URL).
    source: str
class SalaryCollector:
    """Fetch salary listings from a portal, solving Cloudflare Turnstile when challenged.

    Turnstile tokens are obtained from the CaptchaAI HTTP API (``in.php`` to
    submit a task, ``res.php`` to poll for the result).
    """

    SOLVER_SUBMIT_URL = "https://ocr.captchaai.com/in.php"
    SOLVER_RESULT_URL = "https://ocr.captchaai.com/res.php"
    REQUEST_TIMEOUT = 30  # seconds; applied to every HTTP call so none can hang forever
    POLL_INTERVAL = 3     # seconds between solver polls
    MAX_POLLS = 60        # give up after MAX_POLLS * POLL_INTERVAL seconds

    def __init__(self, api_key):
        """Store the CaptchaAI key and open a cookie-persisting HTTP session."""
        self.api_key = api_key
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })

    def collect_salary_data(self, portal_url, job_title, location):
        """Search for salary data, solving CAPTCHAs as needed.

        Returns:
            A list of ``SalaryRecord`` parsed from the response page.

        Raises:
            ValueError: a challenge page was detected but no sitekey was found.
            RuntimeError: the solver rejected or failed the task.
            TimeoutError: the solver did not answer in time.
        """
        params = {"title": job_title, "location": location}
        response = self.session.get(
            portal_url, params=params, timeout=self.REQUEST_TIMEOUT
        )
        if self._is_turnstile_challenge(response):
            # Forward the search fields so the retried request still carries the query.
            response = self._solve_turnstile_and_retry(response, portal_url, params)
        return self._parse_salary_data(response.text, portal_url)

    def collect_bulk(self, portal_url, job_titles, locations, delay=2):
        """Collect salary data for multiple job title + location combos.

        Args:
            portal_url: Search endpoint of the portal.
            job_titles: Iterable of job titles to query.
            locations: Iterable of locations; every title is paired with each.
            delay: Seconds to wait between consecutive requests.

        Returns:
            All records from the combinations that succeeded. Failures are
            reported on stdout and skipped.
        """
        locations = list(locations)  # re-iterated per title; tolerate generators
        results = []
        first_request = True
        for title in job_titles:
            for location in locations:
                # Respectful delay between requests -- also after a failure,
                # so an erroring portal is not hit in a tight loop.
                if not first_request:
                    time.sleep(delay)
                first_request = False
                try:
                    results.extend(
                        self.collect_salary_data(portal_url, title, location)
                    )
                except Exception as e:  # best-effort batch: report and move on
                    print(f"Failed for {title} in {location}: {e}")
        return results

    def _is_turnstile_challenge(self, response):
        """Return True when the response is a 403 or contains Turnstile markers."""
        return (
            response.status_code == 403 or
            "cf-turnstile" in response.text or
            "challenges.cloudflare.com" in response.text
        )

    def _solve_turnstile_and_retry(self, response, url, params=None):
        """Solve the Turnstile challenge found in ``response`` and resubmit to ``url``.

        Args:
            response: The challenge response; its HTML must contain ``data-sitekey``.
            url: Page URL, sent to the solver and used for the retry POST.
            params: Optional form fields (e.g. the original search query) to
                send together with the solved token.

        Returns:
            The portal's response to the POST carrying the token.

        Raises:
            ValueError: no Turnstile sitekey in the challenge page.
            RuntimeError: the solver rejected the task or reported a failure.
            TimeoutError: no solution within ``MAX_POLLS`` polls.
        """
        match = re.search(r'data-sitekey="(0x[^"]+)"', response.text)
        if not match:
            raise ValueError("Turnstile sitekey not found")

        submit = requests.post(self.SOLVER_SUBMIT_URL, data={
            "key": self.api_key,
            "method": "turnstile",
            "sitekey": match.group(1),
            "pageurl": url,
            "json": 1
        }, timeout=self.REQUEST_TIMEOUT).json()
        # On failure "request" holds an error code, not a task id; polling it
        # would just burn the whole timeout.
        if submit.get("status") != 1:
            raise RuntimeError(f"CaptchaAI rejected the task: {submit.get('request')}")
        task_id = submit["request"]

        for _ in range(self.MAX_POLLS):
            time.sleep(self.POLL_INTERVAL)
            data = requests.get(self.SOLVER_RESULT_URL, params={
                "key": self.api_key,
                "action": "get",
                "id": task_id,
                "json": 1
            }, timeout=self.REQUEST_TIMEOUT).json()
            if data.get("status") == 1:
                form = dict(params or {})
                form["cf-turnstile-response"] = data["request"]
                return self.session.post(
                    url, data=form, timeout=self.REQUEST_TIMEOUT
                )
            # Anything other than "not ready yet" is a terminal solver error.
            if data.get("request") != "CAPCHA_NOT_READY":
                raise RuntimeError(f"CaptchaAI solve failed: {data.get('request')}")
        raise TimeoutError("Turnstile solve timed out")

    def _parse_salary_data(self, html, source):
        """Parse salary rows out of ``html`` into ``SalaryRecord`` objects.

        Missing cells default to ``""`` / ``0.0`` / ``0``; rows whose numeric
        cells cannot be converted are skipped.
        """
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")
        records = []
        for row in soup.select(".salary-row, .compensation-entry, tr[data-salary]"):
            try:
                count_text = self._node_text(row, ".count, .sample")
                records.append(SalaryRecord(
                    title=self._node_text(row, ".job-title, .title").strip(),
                    location=self._node_text(row, ".location").strip(),
                    min_salary=self._parse_amount(
                        self._node_text(row, ".min-salary, .low")
                    ),
                    max_salary=self._parse_amount(
                        self._node_text(row, ".max-salary, .high")
                    ),
                    median_salary=self._parse_amount(
                        self._node_text(row, ".median, .mid")
                    ),
                    sample_size=int(count_text.replace(",", "").strip() or 0),
                    source=source
                ))
            except ValueError:
                # Non-numeric amount or count: skip this row, keep the rest.
                continue
        return records

    @staticmethod
    def _node_text(row, selector):
        """Return the text of the first node under ``row`` matching ``selector``, or ``""``."""
        node = row.select_one(selector)
        return node.text if node is not None else ""

    def _parse_amount(self, text):
        """Convert a displayed amount such as ``"$120,000"`` to a float.

        Returns ``0.0`` for empty input or text with no digits. Raises
        ``ValueError`` if the remaining digits and dots are not a valid number.
        """
        if not text:
            return 0.0
        cleaned = re.sub(r'[^\d.]', '', text)
        return float(cleaned) if cleaned else 0.0
# Usage: query three roles across three cities and print one line per record.
collector = SalaryCollector("YOUR_API_KEY")
data = collector.collect_bulk(
    "https://salary.example.com/search",
    job_titles=["Software Engineer", "Data Analyst", "Product Manager"],
    locations=["San Francisco", "New York", "Austin"],
)
for record in data:
    heading = f"{record.title} in {record.location}: "
    salary_range = f"${record.min_salary:,.0f}–${record.max_salary:,.0f} "
    median = f"(median: ${record.median_salary:,.0f})"
    print(heading + salary_range + median)
Multi-Source Aggregation (JavaScript)
/**
 * Queries several salary portals for the same role and location and combines
 * their median figures. Cloudflare Turnstile challenges are solved through the
 * CaptchaAI HTTP API (in.php to submit, res.php to poll).
 */
class SalaryAggregator {
  /** @param {string} apiKey CaptchaAI API key. */
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.sources = [];
  }

  /**
   * Register a portal to query.
   * @param {string} name Label reported in results.
   * @param {string} searchUrl Search endpoint accepting `title` and `location`.
   */
  addSource(name, searchUrl) {
    this.sources.push({ name, searchUrl });
  }

  /**
   * Query every registered source in sequence and aggregate the medians.
   * A failing source is recorded with its error message and does not abort the run.
   * @returns {Promise<object|null>} Aggregate, or null when no source yielded a median.
   */
  async collectForRole(jobTitle, location) {
    const results = [];
    for (const source of this.sources) {
      try {
        const data = await this.querySource(source, jobTitle, location);
        results.push({ source: source.name, ...data });
      } catch (error) {
        results.push({ source: source.name, error: error.message });
      }
    }
    return this.aggregateResults(results, jobTitle, location);
  }

  /**
   * Fetch one source's result page, solving Turnstile if the page is a challenge.
   * @returns {Promise<{min: ?number, max: ?number, median: ?number}>}
   */
  async querySource(source, jobTitle, location) {
    const url = `${source.searchUrl}?title=${encodeURIComponent(jobTitle)}&location=${encodeURIComponent(location)}`;
    const response = await fetch(url);
    const html = await response.text();
    if (html.includes('cf-turnstile') || response.status === 403) {
      return this.solveAndRetry(source.searchUrl, html, jobTitle, location);
    }
    return this.parseSalaryData(html);
  }

  /**
   * Solve the Turnstile challenge in `html` and resubmit the search as a POST.
   * @throws {Error} When the sitekey is missing, the solver rejects or fails
   *   the task, or no solution arrives within 60 polls.
   */
  async solveAndRetry(baseUrl, html, jobTitle, location) {
    const match = html.match(/data-sitekey="(0x[^"]+)"/);
    if (!match) throw new Error('Turnstile sitekey not found');

    const submitResp = await fetch('https://ocr.captchaai.com/in.php', {
      method: 'POST',
      body: new URLSearchParams({
        key: this.apiKey,
        method: 'turnstile',
        sitekey: match[1],
        pageurl: baseUrl,
        json: '1'
      })
    });
    const submit = await submitResp.json();
    // On failure `request` carries an error code rather than a task id.
    if (submit.status !== 1) {
      throw new Error(`CaptchaAI rejected the task: ${submit.request}`);
    }
    const taskId = submit.request;

    // URLSearchParams percent-encodes the key and id.
    const pollQuery = new URLSearchParams({
      key: this.apiKey,
      action: 'get',
      id: taskId,
      json: '1'
    });
    for (let i = 0; i < 60; i++) {
      await new Promise(r => setTimeout(r, 3000));
      const result = await fetch(`https://ocr.captchaai.com/res.php?${pollQuery}`);
      const data = await result.json();
      if (data.status === 1) {
        const response = await fetch(baseUrl, {
          method: 'POST',
          body: new URLSearchParams({
            'cf-turnstile-response': data.request,
            title: jobTitle,
            location: location
          })
        });
        return this.parseSalaryData(await response.text());
      }
      // Anything other than "not ready yet" is a terminal solver error.
      if (data.request !== 'CAPCHA_NOT_READY') {
        throw new Error(`CaptchaAI solve failed: ${data.request}`);
      }
    }
    throw new Error('Turnstile solve timed out');
  }

  /**
   * Minimal, dependency-free extraction of salary figures from result HTML.
   * For each figure it takes the text of the first element whose class list
   * contains one of the given class names and strips everything except
   * digits and dots. Override this for portals with different markup.
   * @param {string} html
   * @returns {{min: ?number, max: ?number, median: ?number}} null for a
   *   figure that is absent or not numeric.
   */
  parseSalaryData(html) {
    const amount = (classNames) => {
      const pattern = new RegExp(
        `<[^>]*\\bclass="(?:[^"]*\\s)?(?:${classNames.join('|')})(?:\\s[^"]*)?"[^>]*>([^<]*)`,
        'i'
      );
      const found = html.match(pattern);
      if (!found) return null;
      const value = parseFloat(found[1].replace(/[^\d.]/g, ''));
      return Number.isFinite(value) ? value : null;
    };
    return {
      min: amount(['min-salary', 'low']),
      max: amount(['max-salary', 'high']),
      median: amount(['median', 'mid'])
    };
  }

  /**
   * Combine per-source results into one summary.
   * Sources with an error or without a (non-zero) median are ignored.
   * @returns {object|null} null when no source contributed a median.
   */
  aggregateResults(results, jobTitle, location) {
    const valid = results.filter(r => !r.error && r.median);
    if (valid.length === 0) return null;
    const medians = valid.map(r => r.median);
    return {
      jobTitle,
      location,
      avgMedian: medians.reduce((a, b) => a + b, 0) / medians.length,
      sources: valid.length,
      range: { min: Math.min(...medians), max: Math.max(...medians) }
    };
  }
}
// Usage
const aggregator = new SalaryAggregator('YOUR_API_KEY');
aggregator.addSource('SalaryDB', 'https://salarydb.example.com/search');
aggregator.addSource('PayScale', 'https://payscale.example.com/lookup');
const result = await aggregator.collectForRole('Software Engineer', 'San Francisco');
// collectForRole resolves to null when no source returned a usable median.
if (result) {
  console.log(`Median salary: $${result.avgMedian.toLocaleString()} (${result.sources} sources)`);
} else {
  console.log('No salary data available from any source');
}
Data Collection Strategy
| Approach | Volume per day | CAPTCHA frequency | Best for |
|---|---|---|---|
| Sequential with delays | 100–500 queries | Low | Small surveys |
| Proxy rotation | 500–2,000 queries | Moderate | Regional analysis |
| Multi-session parallel | 2,000–10,000 queries | High | Comprehensive datasets |
Troubleshooting
| Issue | Cause | Fix |
|---|---|---|
| Turnstile on every search | Session expired | Persist cf_clearance cookie |
| Salary data shows "Login required" | Portal requires authentication | Authenticate before searching |
| Empty results after CAPTCHA solve | POST parameters missing | Include all hidden form fields |
| Data inconsistent between runs | Portal shows different ranges | Use consistent query parameters |
FAQ
How many salary queries can I make per day?
It depends on the portal's rate limits, not on CaptchaAI. CaptchaAI solves Turnstile with a 100% success rate. Space requests 2–5 seconds apart and rotate proxies for high-volume collection.
Should I use proxies for salary data collection?
Yes, especially for bulk collection across thousands of job titles. Residential proxies reduce CAPTCHA frequency significantly compared to datacenter IPs.
Can I collect real-time salary data?
Most salary portals update data monthly or quarterly, so real-time collection is unnecessary. Schedule weekly or monthly collection runs for comprehensive datasets.
Related Articles
- Market Research Data Collection
- Geetest Vs Cloudflare Turnstile Comparison
- Cloudflare Turnstile 403 After Token Fix
Next Steps
Collect compensation data reliably — get your CaptchaAI API key and handle salary portal CAPTCHAs automatically.
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In
No comments yet.