Use Cases

Job Board Scraping with CAPTCHA Handling Using CaptchaAI

Job boards like Indeed, LinkedIn, and Glassdoor deploy CAPTCHAs when they detect automated access patterns. Recruitment platforms, market researchers, and HR analytics tools need reliable CAPTCHA solving to collect job listing data at scale.


CAPTCHAs on Major Job Boards

Platform CAPTCHA Type Trigger Data Available
Indeed reCAPTCHA v2 High request volume Job listings, salaries
LinkedIn Cloudflare Challenge Bot detection Jobs, company data
Glassdoor reCAPTCHA v2 Scraping detection Reviews, salaries, jobs
ZipRecruiter Cloudflare Turnstile Automated access Job listings
Monster reCAPTCHA v2 Search pages Job listings
CareerBuilder reCAPTCHA v3 Login, search Job listings, resume search

Job Board Scraper with CAPTCHA Handling

import requests
import time
import re
from bs4 import BeautifulSoup

CAPTCHAAI_KEY = "YOUR_API_KEY"
CAPTCHAAI_URL = "https://ocr.captchaai.com"


def solve_captcha(method, sitekey, pageurl, **kwargs):
    data = {
        "key": CAPTCHAAI_KEY,
        "method": method,
        "googlekey": sitekey,
        "pageurl": pageurl,
        "json": 1,
    }
    data.update(kwargs)
    resp = requests.post(f"{CAPTCHAAI_URL}/in.php", data=data)
    task_id = resp.json()["request"]

    for _ in range(60):
        time.sleep(5)
        result = requests.get(f"{CAPTCHAAI_URL}/res.php", params={
            "key": CAPTCHAAI_KEY, "action": "get",
            "id": task_id, "json": 1,
        })
        r = result.json()
        if r["request"] != "CAPCHA_NOT_READY":
            return r["request"]
    raise TimeoutError("Solve timeout")


class JobBoardScraper:
    def __init__(self, proxy=None):
        self.session = requests.Session()
        if proxy:
            self.session.proxies = {"http": proxy, "https": proxy}
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        })

    def search_jobs(self, base_url, query, location, pages=5):
        """Search job listings across multiple pages."""
        all_jobs = []

        for page in range(pages):
            url = f"{base_url}/jobs?q={query}&l={location}&start={page * 10}"
            resp = self.session.get(url, timeout=30)

            # Check for CAPTCHA
            if self._has_captcha(resp.text):
                resp = self._solve_and_retry(resp.text, url)

            if resp.status_code == 200:
                jobs = self._parse_listings(resp.text)
                all_jobs.extend(jobs)
                print(f"Page {page + 1}: {len(jobs)} jobs found")
            else:
                print(f"Page {page + 1}: Request failed ({resp.status_code})")

            time.sleep(3)  # Rate limit

        return all_jobs

    def _has_captcha(self, html):
        indicators = [
            'data-sitekey=',
            'g-recaptcha',
            'cf-turnstile',
            'captcha-delivery',
        ]
        return any(ind in html.lower() for ind in indicators)

    def _solve_and_retry(self, html, url):
        # Try reCAPTCHA first
        match = re.search(r'data-sitekey="([^"]+)"', html)
        if match:
            sitekey = match.group(1)

            # Detect Turnstile vs reCAPTCHA
            if 'cf-turnstile' in html:
                token = solve_captcha("turnstile", sitekey, url)
                field = "cf-turnstile-response"
            else:
                token = solve_captcha("userrecaptcha", sitekey, url)
                field = "g-recaptcha-response"

            return self.session.post(url, data={field: token})

        return self.session.get(url)

    def _parse_listings(self, html):
        soup = BeautifulSoup(html, "html.parser")
        jobs = []

        for card in soup.select(".job_seen_beacon, .jobsearch-ResultsList > li"):
            title_el = card.select_one("h2 a, .jobTitle a")
            company_el = card.select_one(".companyName, [data-testid='company-name']")
            location_el = card.select_one(".companyLocation, [data-testid='text-location']")
            salary_el = card.select_one(".salary-snippet, .estimated-salary")

            if title_el:
                jobs.append({
                    "title": title_el.get_text(strip=True),
                    "company": company_el.get_text(strip=True) if company_el else "",
                    "location": location_el.get_text(strip=True) if location_el else "",
                    "salary": salary_el.get_text(strip=True) if salary_el else "",
                    "url": title_el.get("href", ""),
                })

        return jobs


# Usage
scraper = JobBoardScraper(
    proxy="http://user:pass@residential.proxy.com:5000"
)
jobs = scraper.search_jobs(
    base_url="https://jobs.example.com",
    query="python developer",
    location="New York",
    pages=10,
)
print(f"Total jobs collected: {len(jobs)}")

Salary Data Collection

import csv


def collect_salary_data(titles, locations, output_file):
    """Collect salary data across job titles and locations."""
    scraper = JobBoardScraper(
        proxy="http://user:pass@residential.proxy.com:5000"
    )

    results = []
    for title in titles:
        for location in locations:
            try:
                jobs = scraper.search_jobs(
                    "https://jobs.example.com",
                    title, location, pages=3,
                )
                salaries = [j["salary"] for j in jobs if j["salary"]]
                results.append({
                    "title": title,
                    "location": location,
                    "listings": len(jobs),
                    "with_salary": len(salaries),
                    "salary_samples": "; ".join(salaries[:5]),
                })
                time.sleep(5)
            except Exception as e:
                results.append({
                    "title": title,
                    "location": location,
                    "error": str(e),
                })

    with open(output_file, "w", newline="") as f:
        writer = csv.DictWriter(
            f, fieldnames=["title", "location", "listings",
                           "with_salary", "salary_samples", "error"],
        )
        writer.writeheader()
        writer.writerows(results)

    return results


# Collect salary data for market analysis
collect_salary_data(
    titles=["Data Engineer", "ML Engineer", "DevOps Engineer"],
    locations=["San Francisco", "New York", "Austin", "Remote"],
    output_file="salary_data.csv",
)

Stealth-Configuredion Tips for Job Boards

Technique Why It Helps
Rotating residential proxies Distributes requests across real IPs
3-5 second delays between pages Mimics human browsing speed
Consistent User-Agent per session Avoids fingerprint mismatches
Accept cookies Job boards track sessions via cookies
Randomize search order Avoid sequential page patterns
Limit to 200 pages/day per domain Stay below detection thresholds

Troubleshooting

Issue Cause Fix
CAPTCHA on every search IP flagged or rate exceeded Switch IP, add longer delays
Empty results page CAPTCHA block returned instead Detect CAPTCHA before parsing
"Please verify you're human" Bot detection triggered Use residential proxy + realistic UA
Login required for salary data Platform gating content Implement authenticated session
Different results than browser Location/cookie differences Match Accept-Language and geo proxy

FAQ

How many job listings can I scrape per day?

With rotating residential proxies and proper delays, 500-2000 pages per domain is achievable without hitting persistent CAPTCHAs.

Do job boards block scraping?

Most job boards have terms discouraging automated access but vary in enforcement. CAPTCHAs are their primary defense, which CaptchaAI handles.

Which proxy type works best for job boards?

Rotating residential proxies are the best balance of cost and success rate. Datacenter IPs are frequently blocked by LinkedIn and Glassdoor.



Collect job market data at scale — get your CaptchaAI key for automated CAPTCHA solving.

Discussions (0)

No comments yet.

Related Posts

Explainers How Proxy Quality Affects CAPTCHA Solve Success Rate
Understand how proxy quality, IP reputation, and configuration affect CAPTCHA frequency and solve success rates with Captcha AI.

Understand how proxy quality, IP reputation, and configuration affect CAPTCHA frequency and solve success rate...

Python reCAPTCHA v2 Cloudflare Turnstile
Feb 06, 2026
Use Cases Academic Research Web Scraping with CAPTCHA Solving
How researchers can collect data from academic databases, journals, and citation sources protected by CAPTCHAs using Captcha AI.

How researchers can collect data from academic databases, journals, and citation sources protected by CAPTCHAs...

Python reCAPTCHA v2 Cloudflare Turnstile
Apr 06, 2026
Explainers Mobile Proxies for CAPTCHA Solving: Higher Success Rates Explained
Why mobile proxies produce the lowest CAPTCHA trigger rates and how to use them with Captcha AI for maximum success.

Why mobile proxies produce the lowest CAPTCHA trigger rates and how to use them with Captcha AI for maximum su...

Python reCAPTCHA v2 Cloudflare Turnstile
Apr 03, 2026
Explainers Rotating Residential Proxies: Best Practices for CAPTCHA Solving
Best practices for using rotating residential proxies with Captcha AI to reduce CAPTCHA frequency and maintain high solve rates.

Best practices for using rotating residential proxies with Captcha AI to reduce CAPTCHA frequency and maintain...

Python reCAPTCHA v2 Cloudflare Turnstile
Mar 01, 2026
Reference CAPTCHA Token Injection Methods Reference
Complete reference for injecting solved CAPTCHA tokens into web pages.

Complete reference for injecting solved CAPTCHA tokens into web pages. Covers re CAPTCHA, Turnstile, and Cloud...

Automation Python reCAPTCHA v2
Apr 08, 2026
Reference Browser Session Persistence for CAPTCHA Workflows
Manage browser sessions, cookies, and storage across CAPTCHA-solving runs to reduce repeat challenges and maintain authenticated state.

Manage browser sessions, cookies, and storage across CAPTCHA-solving runs to reduce repeat challenges and main...

Automation Python reCAPTCHA v2
Feb 24, 2026
Use Cases CAPTCHA Solving in Ticket Purchase Automation
How to handle CAPTCHAs on ticketing platforms Ticketmaster, AXS, and event sites using Captcha AI for automated purchasing workflows.

How to handle CAPTCHAs on ticketing platforms Ticketmaster, AXS, and event sites using Captcha AI for automate...

Automation Python reCAPTCHA v2
Feb 25, 2026
Tutorials Caching CAPTCHA Tokens for Reuse
Cache and reuse CAPTCHA tokens with Captcha AI to reduce API calls and costs.

Cache and reuse CAPTCHA tokens with Captcha AI to reduce API calls and costs. Covers token lifetimes, cache st...

Automation Python reCAPTCHA v2
Feb 15, 2026
Explainers Reducing CAPTCHA Solve Costs: 10 Strategies
Cut CAPTCHA solving costs with Captcha AI using 10 practical strategies — from skipping unnecessary solves to batching and caching tokens.

Cut CAPTCHA solving costs with Captcha AI using 10 practical strategies — from skipping unnecessary solves to...

Python reCAPTCHA v2 Cloudflare Turnstile
Mar 11, 2026
Tutorials Extracting reCAPTCHA Parameters from Page Source
Extract re CAPTCHA parameters from any web page — sitekey, action, data-s, enterprise flag, and version — using regex, DOM queries, and network interception.

Extract all re CAPTCHA parameters from any web page — sitekey, action, data-s, enterprise flag, and version —...

Python reCAPTCHA v2 Web Scraping
Apr 07, 2026
Use Cases Retail Site Data Collection with CAPTCHA Handling
Amazon uses image CAPTCHAs to block automated access.

Amazon uses image CAPTCHAs to block automated access. When you hit their anti-bot threshold, you'll see a page...

Web Scraping Image OCR
Apr 07, 2026
Use Cases Event Ticket Monitoring with CAPTCHA Handling
Build an event ticket availability monitor that handles CAPTCHAs using Captcha AI.

Build an event ticket availability monitor that handles CAPTCHAs using Captcha AI. Python workflow for checkin...

Automation Python reCAPTCHA v2
Jan 17, 2026
Use Cases Automated Form Submission with CAPTCHA Handling
Complete guide to automating web form submissions that include CAPTCHA challenges — re CAPTCHA, Turnstile, and image CAPTCHAs with Captcha AI.

Complete guide to automating web form submissions that include CAPTCHA challenges — re CAPTCHA, Turnstile, and...

Python reCAPTCHA v2 Cloudflare Turnstile
Mar 21, 2026