Tutorials

Build a Job Listing Aggregator with CaptchaAI

Scrape job listings from multiple boards, handle CAPTCHA challenges with CaptchaAI, normalize the data, and store it for search and analysis.


Architecture

[Job Board A] ──┐
[Job Board B] ──┼──> Scraper + CAPTCHA Solver ──> Normalizer ──> SQLite DB
[Job Board C] ──┘

Job Data Model

# models.py
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
import sqlite3
import json


@dataclass
class JobListing:
    """A normalized job posting scraped from one of the source boards.

    Field order matters: it defines the positional-argument order of the
    generated dataclass constructor, so new fields must be appended.
    """

    title: str
    company: str
    location: str
    url: str  # also the dedup key (UNIQUE column in the jobs table)
    source: str  # human-readable name of the board this came from
    salary_min: Optional[float] = None  # parsed lower salary bound, if found
    salary_max: Optional[float] = None  # parsed upper salary bound, if found
    posted_date: Optional[str] = None  # as reported by the board, not parsed
    description: str = ""
    tags: list[str] = field(default_factory=list)  # free-form tag strings
    # ISO-8601 timestamp captured when this object is created
    scraped_at: str = field(default_factory=lambda: datetime.now().isoformat())


class JobDatabase:
    """SQLite-backed store for scraped job listings.

    Deduplication relies on the UNIQUE constraint on ``url`` combined
    with ``INSERT OR IGNORE``: re-inserting a listing with a known URL
    is a silent no-op at the SQL level.
    """

    def __init__(self, db_path="jobs.db"):
        self.conn = sqlite3.connect(db_path)
        self._create_table()

    def _create_table(self):
        """Create the jobs table if it does not already exist."""
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS jobs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                company TEXT NOT NULL,
                location TEXT,
                url TEXT UNIQUE,
                source TEXT,
                salary_min REAL,
                salary_max REAL,
                posted_date TEXT,
                description TEXT,
                tags TEXT,
                scraped_at TEXT
            )
        """)
        self.conn.commit()

    def insert(self, job: "JobListing"):
        """Insert *job*, skipping duplicates by URL.

        Returns True if a row was actually inserted, False if it was
        ignored as a duplicate.

        Note: ``INSERT OR IGNORE`` resolves constraint conflicts by
        skipping the row, so sqlite3.IntegrityError is not raised for a
        duplicate URL — the previous try/except around this statement was
        dead code that could only have masked genuine integrity problems.
        """
        cursor = self.conn.execute(
            """INSERT OR IGNORE INTO jobs
               (title, company, location, url, source,
                salary_min, salary_max, posted_date,
                description, tags, scraped_at)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (job.title, job.company, job.location, job.url,
             job.source, job.salary_min, job.salary_max,
             job.posted_date, job.description,
             json.dumps(job.tags), job.scraped_at),
        )
        self.conn.commit()
        return cursor.rowcount > 0

    def search(self, keyword, location=None):
        """Return all rows whose title contains *keyword*.

        If *location* is given, additionally filter on the location
        column. Results come back newest-scraped first.
        """
        query = "SELECT * FROM jobs WHERE title LIKE ?"
        params = [f"%{keyword}%"]
        if location:
            query += " AND location LIKE ?"
            params.append(f"%{location}%")
        query += " ORDER BY scraped_at DESC"
        return self.conn.execute(query, params).fetchall()

    def close(self):
        """Release the underlying SQLite connection."""
        self.conn.close()

CAPTCHA-Aware Scraper Base

# scraper_base.py
import requests
import re
import time
import os


class BaseScraper:
    """Requests-based scraper that transparently solves reCAPTCHA
    challenges through the CaptchaAI HTTP API.

    Subclasses call :meth:`fetch` and always receive page HTML, whether
    or not a CAPTCHA wall was encountered on the way.
    """

    # Read with .get() so an unset variable no longer crashes at import
    # time; it is validated in _solve_captcha when a solve is attempted.
    API_KEY = os.environ.get("CAPTCHAAI_API_KEY", "")

    def __init__(self, source_name):
        self.source = source_name
        # A persistent session keeps cookies, so a solved CAPTCHA is
        # remembered by the site across subsequent requests.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
        })

    def fetch(self, url):
        """GET *url*; if the response embeds a CAPTCHA, solve it,
        re-submit the token, and return the resulting HTML.

        Deliberately no raise_for_status() here: CAPTCHA walls are often
        served with 403/503 status codes and the body still needs to be
        inspected for the widget.
        """
        resp = self.session.get(url, timeout=20)

        if self._has_captcha(resp.text):
            token = self._solve_captcha(url, resp.text)
            resp = self.session.post(url, data={
                "g-recaptcha-response": token,
            }, timeout=30)

        return resp.text

    def _has_captcha(self, html):
        """Heuristic check: does the page embed a reCAPTCHA widget?"""
        return "data-sitekey" in html or "g-recaptcha" in html

    def _solve_captcha(self, url, html):
        """Submit the page's sitekey to CaptchaAI and poll for the token.

        Raises:
            RuntimeError: missing API key, rejected submission, or a
                solver-side error code.
            ValueError: no sitekey found in *html*.
            TimeoutError: solver did not answer within ~2 minutes.
        """
        if not self.API_KEY:
            raise RuntimeError("CAPTCHAAI_API_KEY environment variable is not set")

        match = re.search(r'data-sitekey="([^"]+)"', html)
        if not match:
            raise ValueError("No sitekey found")
        sitekey = match.group(1)

        resp = requests.post("https://ocr.captchaai.com/in.php", data={
            "key": self.API_KEY,
            "method": "userrecaptcha",
            "googlekey": sitekey,
            "pageurl": url,
            "json": 1,
        }, timeout=30)
        submit = resp.json()
        # A failed submission (bad key, zero balance, ...) returns
        # status != 1 with an error code in "request". Without this check
        # the error string would be polled as a task id.
        if submit.get("status") != 1:
            raise RuntimeError(f"CaptchaAI submit failed: {submit.get('request')}")
        task_id = submit["request"]

        time.sleep(15)  # solvers rarely answer faster than this

        for _ in range(24):  # 24 polls * 5 s = up to ~2 minutes
            resp = requests.get("https://ocr.captchaai.com/res.php", params={
                "key": self.API_KEY, "action": "get",
                "id": task_id, "json": 1,
            }, timeout=15)
            data = resp.json()
            if data.get("status") == 1:
                return data["request"]
            # "CAPCHA_NOT_READY" (sic, 2captcha-compatible spelling)
            # means keep waiting; anything else is a hard error.
            if data.get("request") != "CAPCHA_NOT_READY":
                raise RuntimeError(data.get("request"))
            time.sleep(5)

        raise TimeoutError("CAPTCHA solve timeout")

Job Board Scraper

# scrapers.py
from bs4 import BeautifulSoup
from scraper_base import BaseScraper
from models import JobListing
import re


class GenericJobScraper(BaseScraper):
    """Scrape a job board search results page.

    Board-specific details (URL template and CSS selectors) are injected
    via the constructor, so one class serves every board in BOARDS.
    """

    def __init__(self, source_name, base_url, selectors):
        super().__init__(source_name)
        self.base_url = base_url  # template with {keyword}/{location}/{page}
        self.selectors = selectors  # CSS selectors: card/title/company/...

    def scrape_search(self, keyword, location="", max_pages=3):
        """Fetch up to *max_pages* result pages and return JobListings.

        Stops early as soon as a page yields no job cards (end of
        results).
        """
        jobs = []

        for page in range(1, max_pages + 1):
            url = self.base_url.format(
                keyword=keyword.replace(" ", "+"),
                location=location.replace(" ", "+"),
                page=page,
            )
            html = self.fetch(url)
            page_jobs = self._parse_listings(html)

            if not page_jobs:
                break
            jobs.extend(page_jobs)

        return jobs

    def _parse_listings(self, html):
        """Parse one results page into a list of JobListing objects.

        Cards missing a title or company element are skipped — they are
        usually ads or layout fragments, not listings.
        """
        soup = BeautifulSoup(html, "html.parser")
        cards = soup.select(self.selectors["card"])
        jobs = []

        for card in cards:
            title_el = card.select_one(self.selectors["title"])
            company_el = card.select_one(self.selectors["company"])
            location_el = card.select_one(self.selectors.get("location", ".location"))
            link_el = card.select_one(self.selectors.get("link", "a"))

            if not title_el or not company_el:
                continue

            salary = self._extract_salary(card.get_text())

            jobs.append(JobListing(
                title=title_el.get_text(strip=True),
                company=company_el.get_text(strip=True),
                location=location_el.get_text(strip=True) if location_el else "",
                # NOTE(review): href may be relative; presumably boards emit
                # absolute links — confirm, else join against the board root.
                url=link_el["href"] if link_el else "",
                source=self.source,
                salary_min=salary[0],
                salary_max=salary[1],
            ))

        return jobs

    def _extract_salary(self, text):
        """Extract a ``(min, max)`` salary pair from free text.

        Matches ranges like "$50,000 - $70,000" or "60,000 to 80,000".
        Returns (None, None) when no range is found.

        Bug fix: the original separator pattern ``[-–to]+`` was a
        character class (any mix of '-', '–', 't', 'o'), not the intended
        alternatives; it is now a real alternation.
        """
        match = re.search(
            r'\$?([\d,]+)\s*(?:-|–|to)\s*\$?([\d,]+)', text
        )
        if match:
            return (
                float(match.group(1).replace(",", "")),
                float(match.group(2).replace(",", "")),
            )
        return (None, None)

Runner

# main.py
import time
from models import JobDatabase
from scrapers import GenericJobScraper

# Board registry: each entry supplies a URL template (with {keyword},
# {location}, and {page} placeholders filled in by GenericJobScraper)
# plus the CSS selectors that locate listing elements in that board's
# markup. Add a dict here to aggregate a new board.
BOARDS = [
    {
        "name": "Board A",
        "base_url": "https://board-a.example.com/search?q={keyword}&l={location}&p={page}",
        "selectors": {
            "card": ".job-card",
            "title": ".job-title",
            "company": ".company-name",
            "location": ".job-location",
            "link": "a.job-link",
        },
    },
]


def main():
    """Scrape every configured board for each keyword and persist results."""
    db = JobDatabase()
    search_terms = ["python developer", "data engineer"]

    for board in BOARDS:
        scraper = GenericJobScraper(
            board["name"], board["base_url"], board["selectors"]
        )

        for term in search_terms:
            print(f"Scraping {board['name']} for '{term}'...")

            for listing in scraper.scrape_search(term, location="Remote"):
                db.insert(listing)
                print(f"  {listing.title} at {listing.company}")

            # Be polite between keyword searches.
            time.sleep(5)

    # Search example
    results = db.search("python", "Remote")
    print(f"\nFound {len(results)} matching jobs")


if __name__ == "__main__":
    main()

Troubleshooting

Issue Cause Fix
Duplicate listings Same job on multiple pages URL-based dedup via UNIQUE constraint
Salary extraction fails Non-standard format Customize _extract_salary regex per board
CAPTCHA on every page Session not persisted Reuse self.session across requests
Listings empty after solve CAPTCHA form needs JS Switch to Selenium + CaptchaAI

FAQ

How do I handle pagination?

The scraper loops through pages 1 to max_pages. If a page returns no job cards, it stops early.

Can I add new job boards easily?

Yes. Add a new entry to BOARDS with the board's URL template and CSS selectors.

How do I avoid getting blocked?

Rate-limit requests with time.sleep(), rotate user agents, and keep sessions consistent.



Aggregate job data — start with CaptchaAI.

Discussions (0)

No comments yet.

Related Posts

Reference CAPTCHA Token Injection Methods Reference
Complete reference for injecting solved CAPTCHA tokens into web pages.

Complete reference for injecting solved CAPTCHA tokens into web pages. Covers reCAPTCHA, Turnstile, and Cloud...

Python Automation Cloudflare Turnstile
Apr 08, 2026
Explainers reCAPTCHA v2 Invisible: Trigger Detection and Solving
Detect and solve reCAPTCHA v2 Invisible challenges with CaptchaAI — identify triggers, extract parameters, and handle auto-invoked CAPTCHAs.

Detect and solve reCAPTCHA v2 Invisible challenges with CaptchaAI — identify triggers, extract parameters,...

Python Automation reCAPTCHA v2
Apr 07, 2026
Tutorials Handling Multiple CAPTCHAs on a Single Page
Learn how to detect and solve multiple CAPTCHAs on a single web page using CaptchaAI.

Learn how to detect and solve multiple CAPTCHAs on a single web page using CaptchaAI. Covers multi-iframe ext...

Python Cloudflare Turnstile reCAPTCHA v2
Apr 09, 2026
Tutorials Pytest Fixtures for CaptchaAI API Testing
Build reusable pytest fixtures to test CAPTCHA-solving workflows with Captcha AI.

Build reusable pytest fixtures to test CAPTCHA-solving workflows with Captcha AI. Covers mocking, live integra...

Python Automation Cloudflare Turnstile
Apr 08, 2026
API Tutorials How to Solve reCAPTCHA v2 Enterprise with Python
Solve reCAPTCHA v2 Enterprise using Python and the CaptchaAI API.

Solve reCAPTCHA v2 Enterprise using Python and the CaptchaAI API. Complete guide with sitekey extraction, task...

Python Automation reCAPTCHA v2
Apr 08, 2026
Tutorials Extracting reCAPTCHA Parameters from Page Source
Extract reCAPTCHA parameters from any web page — sitekey, action, data-s, enterprise flag, and version — using regex, DOM queries, and network interception.

Extract all reCAPTCHA parameters from any web page — sitekey, action, data-s, enterprise flag, and version —...

Python reCAPTCHA v2 Web Scraping
Apr 07, 2026
Troubleshooting ERROR_PAGEURL: URL Mismatch Troubleshooting Guide
Fix ERROR_PAGEURL when using Captcha AI.

Fix ERROR_PAGEURL when using Captcha AI. Diagnose URL mismatch issues, handle redirects, SPAs, and dynamic URL...

Python Automation Cloudflare Turnstile
Mar 23, 2026
Integrations Scrapy Spider Middleware for CaptchaAI: Advanced Patterns
Build advanced Scrapy middleware for automatic Captcha AI CAPTCHA solving.

Build advanced Scrapy middleware for automatic Captcha AI CAPTCHA solving. Downloader middleware, signal handl...

Python reCAPTCHA v2 Web Scraping
Apr 04, 2026
Troubleshooting Handling reCAPTCHA v2 and Cloudflare Turnstile on the Same Site
Solve both reCAPTCHA v2 and Cloudflare Turnstile on sites that use multiple CAPTCHA providers — detect which type appears, solve each correctly, and handle pr...

Solve both reCAPTCHA v2 and Cloudflare Turnstile on sites that use multiple CAPTCHA providers — detect which...

Python Automation Cloudflare Turnstile
Mar 23, 2026
Tutorials Streaming Batch Results: Processing CAPTCHA Solutions as They Arrive
Process CAPTCHA solutions the moment they arrive instead of waiting for tasks to complete — use async generators, event emitters, and callback patterns for stre...

Process CAPTCHA solutions the moment they arrive instead of waiting for all tasks to complete — use async gene...

Python Automation All CAPTCHA Types
Apr 07, 2026
Tutorials Bulkhead Pattern: Isolating CAPTCHA Solving Failures
Apply the bulkhead pattern to isolate CAPTCHA solving failures — partition resources into independent pools so a slow or failing solver type doesn't starve othe...

Apply the bulkhead pattern to isolate CAPTCHA solving failures — partition resources into independent pools so...

Python Automation All CAPTCHA Types
Apr 07, 2026