Job boards like Indeed, LinkedIn, and Glassdoor deploy CAPTCHAs when they detect automated access patterns. Recruitment platforms, market researchers, and HR analytics tools need reliable CAPTCHA solving to collect job listing data at scale.
CAPTCHAs on Major Job Boards
| Platform | CAPTCHA Type | Trigger | Data Available |
|---|---|---|---|
| Indeed | reCAPTCHA v2 | High request volume | Job listings, salaries |
| LinkedIn | Cloudflare Challenge | Bot detection | Jobs, company data |
| Glassdoor | reCAPTCHA v2 | Scraping detection | Reviews, salaries, jobs |
| ZipRecruiter | Cloudflare Turnstile | Automated access | Job listings |
| Monster | reCAPTCHA v2 | Search pages | Job listings |
| CareerBuilder | reCAPTCHA v3 | Login, search | Job listings, resume search |
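Keeping that platform-to-method mapping in code saves a lookup later. A minimal sketch, assuming CaptchaAI's 2captcha-compatible method names (`SOLVER_METHODS` and `method_for` are illustrative names, not part of any API; verify the method strings against the CaptchaAI docs):

```python
# Hypothetical mapping from board domain to CaptchaAI solver method.
# Method names follow the 2captcha-compatible convention.
SOLVER_METHODS = {
    "indeed.com": "userrecaptcha",         # reCAPTCHA v2
    "glassdoor.com": "userrecaptcha",      # reCAPTCHA v2
    "ziprecruiter.com": "turnstile",       # Cloudflare Turnstile
    "monster.com": "userrecaptcha",        # reCAPTCHA v2
    "careerbuilder.com": "userrecaptcha",  # reCAPTCHA v3: same method,
                                           # typically with version=v3
}

def method_for(domain):
    """Pick a solver method, defaulting to the most common type."""
    return SOLVER_METHODS.get(domain, "userrecaptcha")
```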
Job Board Scraper with CAPTCHA Handling
```python
import re
import time

import requests
from bs4 import BeautifulSoup

CAPTCHAAI_KEY = "YOUR_API_KEY"
CAPTCHAAI_URL = "https://ocr.captchaai.com"


def solve_captcha(method, sitekey, pageurl, **kwargs):
    """Submit a CAPTCHA to CaptchaAI and poll until a token comes back."""
    # The 2captcha-compatible API expects "sitekey" for Turnstile tasks
    # and "googlekey" for reCAPTCHA tasks.
    key_field = "sitekey" if method == "turnstile" else "googlekey"
    data = {
        "key": CAPTCHAAI_KEY,
        "method": method,
        key_field: sitekey,
        "pageurl": pageurl,
        "json": 1,
    }
    data.update(kwargs)
    resp = requests.post(f"{CAPTCHAAI_URL}/in.php", data=data, timeout=30)
    submit = resp.json()
    if submit.get("status") != 1:
        raise RuntimeError(f"Submit failed: {submit.get('request')}")
    task_id = submit["request"]

    # Poll for up to 5 minutes (60 attempts, 5 s apart).
    for _ in range(60):
        time.sleep(5)
        result = requests.get(f"{CAPTCHAAI_URL}/res.php", params={
            "key": CAPTCHAAI_KEY, "action": "get",
            "id": task_id, "json": 1,
        }, timeout=30)
        r = result.json()
        # "CAPCHA_NOT_READY" (sic) is the literal status string the API returns.
        if r["request"] != "CAPCHA_NOT_READY":
            return r["request"]
    raise TimeoutError("CAPTCHA solve timed out")

class JobBoardScraper:
    def __init__(self, proxy=None):
        self.session = requests.Session()
        if proxy:
            self.session.proxies = {"http": proxy, "https": proxy}
        # One realistic User-Agent for the whole session avoids
        # fingerprint mismatches between requests.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/126.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        })

    def search_jobs(self, base_url, query, location, pages=5):
        """Search job listings across multiple result pages."""
        all_jobs = []
        for page in range(pages):
            # Let requests URL-encode the query parameters.
            resp = self.session.get(
                f"{base_url}/jobs",
                params={"q": query, "l": location, "start": page * 10},
                timeout=30,
            )
            # If a CAPTCHA page came back, solve it and retry.
            if self._has_captcha(resp.text):
                resp = self._solve_and_retry(resp.text, resp.url)
            if resp.status_code == 200:
                jobs = self._parse_listings(resp.text)
                all_jobs.extend(jobs)
                print(f"Page {page + 1}: {len(jobs)} jobs found")
            else:
                print(f"Page {page + 1}: request failed ({resp.status_code})")
            time.sleep(3)  # basic rate limit between pages
        return all_jobs

    def _has_captcha(self, html):
        """Heuristic check for common CAPTCHA markers in a response body."""
        indicators = [
            "data-sitekey=",     # reCAPTCHA / Turnstile widget attribute
            "g-recaptcha",       # reCAPTCHA container class
            "cf-turnstile",      # Cloudflare Turnstile widget
            "captcha-delivery",  # DataDome challenge script
        ]
        html = html.lower()
        return any(ind in html for ind in indicators)

    def _solve_and_retry(self, html, url):
        """Solve the detected CAPTCHA and re-submit the request with its token."""
        match = re.search(r'data-sitekey="([^"]+)"', html)
        if match:
            sitekey = match.group(1)
            # Distinguish Cloudflare Turnstile from reCAPTCHA by widget class.
            if "cf-turnstile" in html:
                token = solve_captcha("turnstile", sitekey, url)
                field = "cf-turnstile-response"
            else:
                token = solve_captcha("userrecaptcha", sitekey, url)
                field = "g-recaptcha-response"
            # The exact verification endpoint varies by site; posting the
            # token back to the challenge URL covers the common case.
            return self.session.post(url, data={field: token})
        # No site key found -- retry the plain request.
        return self.session.get(url, timeout=30)

    def _parse_listings(self, html):
        """Extract job cards. Selectors are Indeed-style and will need
        adjusting for other boards."""
        soup = BeautifulSoup(html, "html.parser")
        jobs = []
        for card in soup.select(".job_seen_beacon, .jobsearch-ResultsList > li"):
            title_el = card.select_one("h2 a, .jobTitle a")
            company_el = card.select_one(".companyName, [data-testid='company-name']")
            location_el = card.select_one(".companyLocation, [data-testid='text-location']")
            salary_el = card.select_one(".salary-snippet, .estimated-salary")
            if title_el:
                jobs.append({
                    "title": title_el.get_text(strip=True),
                    "company": company_el.get_text(strip=True) if company_el else "",
                    "location": location_el.get_text(strip=True) if location_el else "",
                    "salary": salary_el.get_text(strip=True) if salary_el else "",
                    "url": title_el.get("href", ""),  # often a relative path
                })
        return jobs

# Usage
scraper = JobBoardScraper(
    proxy="http://user:pass@residential.proxy.com:5000"
)
jobs = scraper.search_jobs(
    base_url="https://jobs.example.com",
    query="python developer",
    location="New York",
    pages=10,
)
print(f"Total jobs collected: {len(jobs)}")
```
Salary Data Collection
```python
import csv


def collect_salary_data(titles, locations, output_file):
    """Collect salary data across job titles and locations."""
    scraper = JobBoardScraper(
        proxy="http://user:pass@residential.proxy.com:5000"
    )
    results = []
    for title in titles:
        for location in locations:
            try:
                jobs = scraper.search_jobs(
                    "https://jobs.example.com",
                    title, location, pages=3,
                )
                salaries = [j["salary"] for j in jobs if j["salary"]]
                results.append({
                    "title": title,
                    "location": location,
                    "listings": len(jobs),
                    "with_salary": len(salaries),
                    "salary_samples": "; ".join(salaries[:5]),
                })
                time.sleep(5)  # pause between title/location pairs
            except Exception as e:
                results.append({
                    "title": title,
                    "location": location,
                    "error": str(e),
                })
    # DictWriter fills missing keys (e.g. "error") with empty cells.
    with open(output_file, "w", newline="") as f:
        writer = csv.DictWriter(
            f, fieldnames=["title", "location", "listings",
                           "with_salary", "salary_samples", "error"],
        )
        writer.writeheader()
        writer.writerows(results)
    return results


# Collect salary data for market analysis
collect_salary_data(
    titles=["Data Engineer", "ML Engineer", "DevOps Engineer"],
    locations=["San Francisco", "New York", "Austin", "Remote"],
    output_file="salary_data.csv",
)
```
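The salary_samples column stores raw snippets like "$90,000 - $120,000 a year". For actual market analysis you need numbers; here is a hedged sketch of a normalizer (`parse_salary` is illustrative and only covers common US dollar formats; hourly rates and other currencies would need extra rules):

```python
import re

SALARY_RE = re.compile(r"\$([\d,]+)(?:\s*-\s*\$([\d,]+))?")

def parse_salary(text):
    """Extract a (low, high) salary tuple from a raw snippet."""
    match = SALARY_RE.search(text)
    if not match:
        return None
    low = int(match.group(1).replace(",", ""))
    high = int(match.group(2).replace(",", "")) if match.group(2) else low
    return (low, high)

print(parse_salary("$90,000 - $120,000 a year"))  # (90000, 120000)
```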
Stealth Configuration Tips for Job Boards
| Technique | Why It Helps |
|---|---|
| Rotating residential proxies | Distributes requests across real IPs |
| 3-5 second delays between pages | Mimics human browsing speed |
| Consistent User-Agent per session | Avoids fingerprint mismatches |
| Accept cookies | Job boards track sessions via cookies |
| Randomize search order | Avoid sequential page patterns |
| Limit to ~200 pages/day per IP | Stays below per-IP detection thresholds (see the sketch below) |
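The delay and daily-cap rows translate directly into code. A minimal sketch, assuming a single-process scraper (`polite_get` and `MAX_PAGES_PER_DAY` are illustrative names):

```python
import random
import time

MAX_PAGES_PER_DAY = 200  # per-IP budget from the table above

def polite_get(session, url, pages_fetched):
    """Fetch a page with a jittered, human-like delay and a daily cap."""
    if pages_fetched >= MAX_PAGES_PER_DAY:
        raise RuntimeError("Daily page budget exhausted; rotate IP or stop")
    time.sleep(random.uniform(3.0, 5.0))  # 3-5 s, randomized per request
    return session.get(url, timeout=30)
```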
Troubleshooting
| Issue | Cause | Fix |
|---|---|---|
| CAPTCHA on every search | IP flagged or rate exceeded | Switch IP, add longer delays |
| Empty results page | CAPTCHA block returned instead | Detect CAPTCHA before parsing |
| "Please verify you're human" | Bot detection triggered | Use residential proxy + realistic UA |
| Login required for salary data | Platform gating content | Implement authenticated session |
| Different results than browser | Location/cookie differences | Match Accept-Language and geo proxy |
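The last row is the easiest to get wrong: if the proxy exits in one country while the session sends another country's headers, results diverge from what a local browser sees. A sketch of a geo-consistent session builder (`geo_session` and the proxy URL are placeholders):

```python
import requests

def geo_session(proxy_url, language="en-US,en;q=0.9"):
    """Build a session whose headers match the proxy's exit location."""
    session = requests.Session()
    session.proxies = {"http": proxy_url, "https": proxy_url}
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/126.0.0.0 Safari/537.36",
        "Accept-Language": language,  # match the proxy's country
    })
    return session

# A US residential exit pairs with en-US; a UK exit would use en-GB.
session = geo_session("http://user:pass@us.residential.proxy.com:5000")
```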
FAQ
How many job listings can I scrape per day?
With rotating residential proxies and proper delays, 500-2,000 pages per domain per day is achievable without hitting persistent CAPTCHAs: the load is spread across many IPs, so each individual IP stays under its own detection threshold.
Do job boards block scraping?
Most job boards have terms discouraging automated access but vary in enforcement. CAPTCHAs are their primary defense, which CaptchaAI handles.
Which proxy type works best for job boards?
Rotating residential proxies are the best balance of cost and success rate. Datacenter IPs are frequently blocked by LinkedIn and Glassdoor.
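Rotation itself can be as simple as cycling scraper instances over a proxy pool. A minimal sketch using the JobBoardScraper class above (the proxy URLs and `next_scraper` are placeholders):

```python
import itertools

PROXY_POOL = [
    "http://user:pass@res1.proxy.example:5000",
    "http://user:pass@res2.proxy.example:5000",
    "http://user:pass@res3.proxy.example:5000",
]
_proxies = itertools.cycle(PROXY_POOL)

def next_scraper():
    """Return a JobBoardScraper bound to the next proxy in the pool."""
    return JobBoardScraper(proxy=next(_proxies))
```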
Collect job market data at scale — get your CaptchaAI key for automated CAPTCHA solving.