Scrape job listings from multiple boards, handle CAPTCHA challenges with CaptchaAI, normalize the data, and store it for search and analysis.
Architecture
[Job Board A] ──┐
[Job Board B] ──┼──> Scraper + CAPTCHA Solver ──> Normalizer ──> SQLite DB
[Job Board C] ──┘
Job Data Model
# models.py
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
import sqlite3
import json
@dataclass
class JobListing:
    """One normalized job posting scraped from a single source board."""

    title: str
    company: str
    location: str
    url: str        # canonical listing URL; the DB uses it as the dedup key (UNIQUE)
    source: str     # name of the board the listing came from
    salary_min: Optional[float] = None   # parsed lower bound, if a range was found
    salary_max: Optional[float] = None   # parsed upper bound, if a range was found
    posted_date: Optional[str] = None    # board-supplied date string; format varies per board
    description: str = ""
    # default_factory avoids the shared-mutable-default pitfall
    tags: list = field(default_factory=list)
    # capture time is fixed per instance at construction, ISO-8601 local time
    scraped_at: str = field(default_factory=lambda: datetime.now().isoformat())
class JobDatabase:
    """SQLite-backed store for JobListing rows, deduplicated by URL.

    Usable as a context manager so the connection is always closed:

        with JobDatabase("jobs.db") as db:
            db.insert(job)
    """

    def __init__(self, db_path="jobs.db"):
        self.conn = sqlite3.connect(db_path)
        self._create_table()

    def _create_table(self):
        """Create the jobs table if it does not exist (idempotent)."""
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS jobs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                company TEXT NOT NULL,
                location TEXT,
                url TEXT UNIQUE,
                source TEXT,
                salary_min REAL,
                salary_max REAL,
                posted_date TEXT,
                description TEXT,
                tags TEXT,
                scraped_at TEXT
            )
        """)
        self.conn.commit()

    def insert(self, job: "JobListing"):
        """Insert one listing; rows with an already-stored url are skipped.

        Returns True if a new row was inserted, False for a duplicate.
        """
        # INSERT OR IGNORE already swallows UNIQUE-constraint conflicts at
        # the SQL level, so no IntegrityError handling is needed here (the
        # previous try/except was dead code).
        cur = self.conn.execute(
            """INSERT OR IGNORE INTO jobs
               (title, company, location, url, source,
                salary_min, salary_max, posted_date,
                description, tags, scraped_at)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (job.title, job.company, job.location, job.url,
             job.source, job.salary_min, job.salary_max,
             job.posted_date, job.description,
             json.dumps(job.tags), job.scraped_at),  # tags stored as a JSON string
        )
        self.conn.commit()
        return cur.rowcount > 0

    def search(self, keyword, location=None):
        """Return rows whose title matches keyword (and optionally location).

        Matching is a substring LIKE; newest scrapes first.
        """
        query = "SELECT * FROM jobs WHERE title LIKE ?"
        params = [f"%{keyword}%"]
        if location:
            query += " AND location LIKE ?"
            params.append(f"%{location}%")
        query += " ORDER BY scraped_at DESC"
        cursor = self.conn.execute(query, params)
        return cursor.fetchall()

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False
CAPTCHA-Aware Scraper Base
# scraper_base.py
import requests
import re
import time
import os
class BaseScraper:
    """Shared HTTP session plus CaptchaAI reCAPTCHA solving for scrapers.

    Subclasses call fetch(); a CAPTCHA wall is detected heuristically and
    solved via the CaptchaAI 2captcha-compatible HTTP API.
    """

    # .get() instead of [] so a missing key no longer crashes at import
    # time; it only fails when a CAPTCHA actually has to be solved.
    API_KEY = os.environ.get("CAPTCHAAI_API_KEY", "")

    def __init__(self, source_name):
        self.source = source_name
        # One persistent session: cookies set after a solve carry over, so
        # subsequent requests should not re-trigger the challenge.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
        })

    def fetch(self, url):
        """GET url; if the page is a CAPTCHA wall, solve it and re-submit."""
        resp = self.session.get(url, timeout=20)
        if self._has_captcha(resp.text):
            token = self._solve_captcha(url, resp.text)
            resp = self.session.post(url, data={
                "g-recaptcha-response": token,
            }, timeout=30)
        return resp.text

    def _has_captcha(self, html):
        # Heuristic: reCAPTCHA widgets embed data-sitekey / g-recaptcha.
        return "data-sitekey" in html or "g-recaptcha" in html

    def _solve_captcha(self, url, html):
        """Submit the page's sitekey to CaptchaAI and poll for the token.

        Raises ValueError if no sitekey is present, RuntimeError on API
        errors, TimeoutError if the solve never completes.
        """
        if not self.API_KEY:
            raise RuntimeError("CAPTCHAAI_API_KEY environment variable is not set")
        match = re.search(r'data-sitekey="([^"]+)"', html)
        if not match:
            raise ValueError("No sitekey found")
        sitekey = match.group(1)
        resp = requests.post("https://ocr.captchaai.com/in.php", data={
            "key": self.API_KEY,
            "method": "userrecaptcha",
            "googlekey": sitekey,
            "pageurl": url,
            "json": 1,
        }, timeout=30)
        submit = resp.json()
        # in.php reports failure as status == 0 with an error code in
        # "request"; without this check the error string would be polled
        # forever as if it were a task id.
        if submit.get("status") != 1:
            raise RuntimeError(f"CaptchaAI submit failed: {submit.get('request')}")
        task_id = submit["request"]
        time.sleep(15)  # typical minimum solve time before first poll
        for _ in range(24):  # up to ~2 further minutes of polling
            resp = requests.get("https://ocr.captchaai.com/res.php", params={
                "key": self.API_KEY, "action": "get",
                "id": task_id, "json": 1,
            }, timeout=15)
            data = resp.json()
            if data.get("status") == 1:
                return data["request"]
            if data["request"] != "CAPCHA_NOT_READY":  # sic: API's own spelling
                raise RuntimeError(data["request"])
            time.sleep(5)
        raise TimeoutError("CAPTCHA solve timeout")
Job Board Scraper
# scrapers.py
import re
from urllib.parse import quote_plus, urljoin

from bs4 import BeautifulSoup

from models import JobListing
from scraper_base import BaseScraper
class GenericJobScraper(BaseScraper):
    """Scrape a job board's search results pages using per-board CSS selectors."""

    def __init__(self, source_name, base_url, selectors):
        super().__init__(source_name)
        self.base_url = base_url      # URL template with {keyword}/{location}/{page}
        self.selectors = selectors    # CSS selectors: card/title/company[/location/link]

    def scrape_search(self, keyword, location="", max_pages=3):
        """Fetch up to max_pages of results; stops early on an empty page."""
        jobs = []
        for page in range(1, max_pages + 1):
            # quote_plus instead of replace(" ", "+"): also escapes &, =, #
            # and other characters that would corrupt the query string.
            url = self.base_url.format(
                keyword=quote_plus(keyword),
                location=quote_plus(location),
                page=page,
            )
            html = self.fetch(url)
            page_jobs = self._parse_listings(html, page_url=url)
            if not page_jobs:
                break
            jobs.extend(page_jobs)
        return jobs

    def _parse_listings(self, html, page_url=None):
        """Parse one results page into JobListing objects.

        page_url, when given, resolves relative listing links to absolute
        URLs — boards often emit relative hrefs, which previously broke
        both navigation and URL-based dedup.
        """
        soup = BeautifulSoup(html, "html.parser")
        jobs = []
        for card in soup.select(self.selectors["card"]):
            title_el = card.select_one(self.selectors["title"])
            company_el = card.select_one(self.selectors["company"])
            if not title_el or not company_el:
                continue  # malformed card (ad/promo); skip it
            location_el = card.select_one(self.selectors.get("location", ".location"))
            link_el = card.select_one(self.selectors.get("link", "a"))
            href = link_el["href"] if link_el else ""
            if href and page_url:
                href = urljoin(page_url, href)
            salary_min, salary_max = self._extract_salary(card.get_text())
            jobs.append(JobListing(
                title=title_el.get_text(strip=True),
                company=company_el.get_text(strip=True),
                location=location_el.get_text(strip=True) if location_el else "",
                url=href,
                source=self.source,
                salary_min=salary_min,
                salary_max=salary_max,
            ))
        return jobs

    def _extract_salary(self, text):
        """Return (min, max) as floats, or (None, None) if no range found.

        Matches "$50,000 - $70,000" / "$50,000 to $70,000". The separator is
        an alternation, not a character class: the old [-–to]+ class let a
        stray 't' or 'o' between two numbers count as a range separator.
        """
        match = re.search(r'\$?([\d,]+)\s*(?:[-–]|to)\s*\$?([\d,]+)', text)
        if match:
            return (
                float(match.group(1).replace(",", "")),
                float(match.group(2).replace(",", "")),
            )
        return (None, None)
Runner
# main.py
import time
from models import JobDatabase
from scrapers import GenericJobScraper
# Board registry: one dict per job board, holding the search-URL template
# and the CSS selectors GenericJobScraper needs to extract each field.
# Add new boards here — no code changes required.
BOARDS = [
    {
        "name": "Board A",
        # {keyword}, {location} and {page} are filled in by scrape_search().
        "base_url": "https://board-a.example.com/search?q={keyword}&l={location}&p={page}",
        "selectors": {
            "card": ".job-card",          # one element per listing
            "title": ".job-title",
            "company": ".company-name",
            "location": ".job-location",
            "link": "a.job-link",         # anchor whose href is the detail page
        },
    },
]
def main():
    """Scrape every configured board for each keyword, store the results,
    then run a demo search against the database."""
    db = JobDatabase()
    keywords = ["python developer", "data engineer"]
    for board in BOARDS:
        scraper = GenericJobScraper(
            board["name"], board["base_url"], board["selectors"]
        )
        for keyword in keywords:
            print(f"Scraping {board['name']} for '{keyword}'...")
            # Insert and report each listing as it comes back.
            for job in scraper.scrape_search(keyword, location="Remote"):
                db.insert(job)
                print(f" {job.title} at {job.company}")
            time.sleep(5)  # be polite between keyword searches
    # Search example
    results = db.search("python", "Remote")
    print(f"\nFound {len(results)} matching jobs")


if __name__ == "__main__":
    main()
Troubleshooting
| Issue | Cause | Fix |
|---|---|---|
| Duplicate listings | Same job on multiple pages | URL-based dedup via UNIQUE constraint |
| Salary extraction fails | Non-standard format | Customize _extract_salary regex per board |
| CAPTCHA on every page | Session not persisted | Reuse self.session across requests |
| Listings empty after solve | CAPTCHA form needs JS | Switch to Selenium + CaptchaAI |
FAQ
How do I handle pagination?
The scraper loops through pages 1 to max_pages. If a page returns no job cards, it stops early.
Can I add new job boards easily?
Yes. Add a new entry to BOARDS with the board's URL template and CSS selectors.
How do I avoid getting blocked?
Rate-limit requests with time.sleep(), rotate user agents, and keep sessions consistent.
Related Guides
Aggregate job data — start with CaptchaAI.
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In · No comments yet.