Tutorials

Build a Lead Generation Pipeline with CaptchaAI

Scrape business directories, handle CAPTCHA challenges with CaptchaAI, and produce CRM-ready lead data with contact info, categories, and location.


Pipeline Overview

Search Query ──> Directory Scraper ──> CAPTCHA Solver ──> Data Normalizer ──> CSV/CRM Export

Lead Data Model

# models.py
from dataclasses import dataclass, asdict
from typing import Optional
import csv


@dataclass
class Lead:
    """A single scraped business listing, flattened for CSV/CRM export.

    Contact and location fields default to "" so that every field is
    always present as a column when exported, even when the source page
    omits it.
    """

    business_name: str
    category: str
    phone: str = ""
    email: str = ""
    website: str = ""
    address: str = ""
    city: str = ""
    state: str = ""
    source: str = ""  # human-readable name of the directory being scraped
    url: str = ""  # search-results URL this lead was parsed from


class LeadExporter:
    """Utilities for exporting and cleaning collections of Lead records."""

    @staticmethod
    def to_csv(leads, filename="leads.csv"):
        """Write *leads* to *filename* as CSV with a header row.

        Does nothing when *leads* is empty. Columns are taken from the
        first lead's dataclass fields; utf-8 keeps non-ASCII names intact.
        """
        if not leads:
            return
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=asdict(leads[0]).keys())
            writer.writeheader()
            for lead in leads:
                writer.writerow(asdict(lead))
        # BUG FIX: the original printed a literal placeholder instead of
        # interpolating the actual output filename.
        print(f"Exported {len(leads)} leads to {filename}")

    @staticmethod
    def deduplicate(leads):
        """Return *leads* with duplicates removed, preserving first-seen order.

        Two leads are duplicates when they share the same case-insensitive
        business name AND the same phone string.
        """
        seen = set()
        unique = []
        for lead in leads:
            key = (lead.business_name.lower(), lead.phone)
            if key not in seen:
                seen.add(key)
                unique.append(lead)
        return unique

CAPTCHA Solver

# solver.py
import requests
import time
import os


class CaptchaSolver:
    """Thin client for the CaptchaAI reCAPTCHA v2 solving API.

    Uses the 2captcha-compatible in.php/res.php endpoints: submit the
    sitekey + page URL, then poll until a token is ready.
    """

    def __init__(self):
        # Fails fast with KeyError if the key is not configured.
        self.api_key = os.environ["CAPTCHAAI_API_KEY"]

    def solve(self, sitekey, pageurl):
        """Solve a reCAPTCHA v2 and return the g-recaptcha-response token.

        Raises RuntimeError on an API error response and TimeoutError if
        no token is produced within roughly two minutes of polling.
        """
        resp = requests.post("https://ocr.captchaai.com/in.php", data={
            "key": self.api_key,
            "method": "userrecaptcha",
            "googlekey": sitekey,
            "pageurl": pageurl,
            "json": 1,
        }, timeout=30)
        submit = resp.json()
        # BUG FIX: the original ignored submission failures, so an API
        # error string (e.g. ERROR_WRONG_USER_KEY) was then polled as if
        # it were a task id, producing confusing downstream errors.
        if submit.get("status") != 1:
            raise RuntimeError(f"Submit failed: {submit.get('request')}")
        task_id = submit["request"]

        time.sleep(15)  # typical minimum solve time before first poll
        for _ in range(24):  # 24 polls x 5s ≈ 2 minutes on top of the 15s
            resp = requests.get("https://ocr.captchaai.com/res.php", params={
                "key": self.api_key, "action": "get",
                "id": task_id, "json": 1,
            }, timeout=15)
            data = resp.json()
            if data.get("status") == 1:
                return data["request"]
            # CAPCHA_NOT_READY (sic) is the API's literal "keep waiting" value.
            if data["request"] != "CAPCHA_NOT_READY":
                raise RuntimeError(data["request"])
            time.sleep(5)

        raise TimeoutError("Solve timeout")

Directory Scraper

# scraper.py
import re
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup

from models import Lead
from solver import CaptchaSolver


class DirectoryScraper:
    """Scrape a business directory's search results into Lead records.

    CAPTCHA walls encountered while fetching are solved through
    CaptchaSolver, and the solved token is POSTed back to the same URL
    to unlock the page.
    """

    def __init__(self, source_name):
        self.source = source_name
        self.solver = CaptchaSolver()
        self.session = requests.Session()
        # A browser-like User-Agent lowers the chance of an instant block.
        self.session.headers["User-Agent"] = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36"
        )

    def search(self, query, location, max_pages=5):
        """Return leads for *query* in *location*.

        Pages through results until a page yields no leads or *max_pages*
        pages have been fetched.
        """
        leads = []

        for page in range(1, max_pages + 1):
            url = self._build_url(query, location, page)
            html = self._fetch(url)
            page_leads = self._parse_results(html, url)

            if not page_leads:
                break
            leads.extend(page_leads)

        return leads

    def _build_url(self, query, location, page):
        # BUG FIX: the original replaced only spaces with '+', so queries
        # containing reserved characters ('&', '=', '#', ',') built broken
        # URLs. urlencode percent-escapes everything (spaces become '+').
        params = urlencode({"q": query, "loc": location, "page": page})
        return f"https://directory.example.com/search?{params}"

    def _fetch(self, url):
        """Fetch *url* and return its HTML, solving a CAPTCHA if one appears."""
        resp = self.session.get(url, timeout=20)

        # A data-sitekey attribute means the server returned a reCAPTCHA
        # wall instead of results. No raise_for_status() on purpose:
        # challenge pages are often served with 403/503 statuses.
        if "data-sitekey" in resp.text:
            match = re.search(r'data-sitekey="([^"]+)"', resp.text)
            if match:
                token = self.solver.solve(match.group(1), url)
                resp = self.session.post(url, data={
                    "g-recaptcha-response": token,
                }, timeout=30)

        return resp.text

    def _parse_results(self, html, source_url):
        """Parse one results page into a list of Lead objects."""
        soup = BeautifulSoup(html, "html.parser")
        # Multiple selectors cover common directory card markup variants.
        cards = soup.select(".listing-card, .result-item, .business-card")
        leads = []

        for card in cards:
            name_el = card.select_one(".business-name, .listing-name, h3")
            if not name_el:
                continue  # skip ads/decorative cards with no business name

            lead = Lead(
                business_name=name_el.get_text(strip=True),
                category=self._extract_text(card, ".category, .business-type"),
                phone=self._extract_phone(card),
                email=self._extract_email(card),
                website=self._extract_link(card, "a.website-link, a[href*='http']"),
                address=self._extract_text(card, ".address, .street"),
                city=self._extract_text(card, ".city"),
                state=self._extract_text(card, ".state"),
                source=self.source,
                url=source_url,
            )
            leads.append(lead)

        return leads

    def _extract_text(self, card, selector):
        """Stripped text of the first element matching *selector*, or ""."""
        el = card.select_one(selector)
        return el.get_text(strip=True) if el else ""

    def _extract_phone(self, card):
        """Phone from a tel: link or .phone element, else a regex scan of the card."""
        el = card.select_one(".phone, a[href^='tel:']")
        if el:
            href = el.get("href", "")
            if href.startswith("tel:"):
                return href.replace("tel:", "")
            return el.get_text(strip=True)
        # Regex fallback: US-style numbers like (415) 555-1234 / 415.555.1234.
        match = re.search(r'\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}', card.get_text())
        return match.group() if match else ""

    def _extract_email(self, card):
        """Email from a mailto: link, else a regex scan of the card text."""
        el = card.select_one("a[href^='mailto:']")
        if el:
            return el["href"].replace("mailto:", "")
        match = re.search(r'[\w.+-]+@[\w-]+\.[\w.]+', card.get_text())
        return match.group() if match else ""

    def _extract_link(self, card, selector):
        """href of the first element matching *selector*, or ""."""
        el = card.select_one(selector)
        return el["href"] if el and el.get("href") else ""

Pipeline Runner

# main.py
import time
from scraper import DirectoryScraper
from models import LeadExporter

# Each entry is one directory search to run; results from all searches
# are merged and deduplicated before export.
SEARCHES = [
    {"query": "plumber", "location": "San Francisco, CA"},
    {"query": "electrician", "location": "San Francisco, CA"},
    {"query": "dentist", "location": "San Francisco, CA"},
]


def main():
    """Run every configured search, dedupe the results, and export to CSV."""
    scraper = DirectoryScraper("Business Directory Example")
    collected = []

    for spec in SEARCHES:
        print(f"Searching: {spec['query']} in {spec['location']}")
        found = scraper.search(spec["query"], spec["location"])
        print(f"  Found {len(found)} leads")
        collected.extend(found)
        time.sleep(5)  # polite pause between searches

    # Collapse duplicates that appear across multiple category searches.
    deduped = LeadExporter.deduplicate(collected)
    print(f"\nTotal: {len(collected)} raw → {len(deduped)} unique leads")

    LeadExporter.to_csv(deduped, "leads_output.csv")


if __name__ == "__main__":
    main()

Troubleshooting

| Issue | Cause | Fix |
| --- | --- | --- |
| Empty phone/email | Not visible on list page | Scrape individual listing pages for full details |
| Duplicates across searches | Same business in multiple categories | Use name+phone key for dedup |
| CAPTCHA on every page load | Session expired or IP flagged | Add delays and rotate proxies |
| CSV encoding errors | Non-ASCII characters in names | Use `encoding="utf-8"` in the CSV writer |

FAQ

How do I import leads into a CRM?

Most CRMs support CSV import. Match column names to CRM fields (Name, Phone, Email, Address). HubSpot, Salesforce, and Pipedrive all support direct CSV upload.

Can I scrape more detail per lead?

Yes. After collecting URLs from search results, scrape each listing page individually for description, hours, reviews, and social links.

How do I handle rate limiting?

Add 3-5 second delays between requests. For heavy scraping, rotate proxies and distribute requests across sessions.



Build your lead pipeline — start with CaptchaAI.

Discussions (0)

No comments yet.

Related Posts

Reference CAPTCHA Token Injection Methods Reference
Complete reference for injecting solved CAPTCHA tokens into web pages.

Complete reference for injecting solved CAPTCHA tokens into web pages. Covers reCAPTCHA, Turnstile, and Cloud...

Automation Python reCAPTCHA v2
Apr 08, 2026
Tutorials Pytest Fixtures for CaptchaAI API Testing
Build reusable pytest fixtures to test CAPTCHA-solving workflows with Captcha AI.

Build reusable pytest fixtures to test CAPTCHA-solving workflows with Captcha AI. Covers mocking, live integra...

Automation Python reCAPTCHA v2
Apr 08, 2026
Reference Browser Session Persistence for CAPTCHA Workflows
Manage browser sessions, cookies, and storage across CAPTCHA-solving runs to reduce repeat challenges and maintain authenticated state.

Manage browser sessions, cookies, and storage across CAPTCHA-solving runs to reduce repeat challenges and main...

Automation Python reCAPTCHA v2
Feb 24, 2026
Integrations Browser Profile Isolation + CaptchaAI Integration
Browser profile isolation tools create distinct browser environments with unique fingerprints per session.

Browser profile isolation tools create distinct browser environments with unique fingerprints per session. Com...

Automation Python reCAPTCHA v2
Feb 21, 2026
Tutorials Securing CaptchaAI Credentials in Environment Variables
Store Captcha AI API keys securely using environment variables, .env files, Docker secrets, and cloud secret managers instead of hardcoding.

Store Captcha AI API keys securely using environment variables, .env files, Docker secrets, and cloud secret m...

Automation Python reCAPTCHA v2
Feb 12, 2026
Comparisons WebDriver vs Chrome DevTools Protocol for CAPTCHA Automation
Compare WebDriver and Chrome DevTools Protocol (CDP) for CAPTCHA automation — detection, performance, capabilities, and when to use each with CaptchaAI.

Compare Web Driver and Chrome Dev Tools Protocol (CDP) for CAPTCHA automation — detection, performance, capabi...

Automation Python reCAPTCHA v2
Mar 27, 2026
Use Cases CAPTCHA Solving in Ticket Purchase Automation
How to handle CAPTCHAs on ticketing platforms Ticketmaster, AXS, and event sites using Captcha AI for automated purchasing workflows.

How to handle CAPTCHAs on ticketing platforms Ticketmaster, AXS, and event sites using Captcha AI for automate...

Automation Python reCAPTCHA v2
Feb 25, 2026
Tutorials Caching CAPTCHA Tokens for Reuse
Cache and reuse CAPTCHA tokens with Captcha AI to reduce API calls and costs.

Cache and reuse CAPTCHA tokens with Captcha AI to reduce API calls and costs. Covers token lifetimes, cache st...

Automation Python reCAPTCHA v2
Feb 15, 2026
Use Cases Event Ticket Monitoring with CAPTCHA Handling
Build an event ticket availability monitor that handles CAPTCHAs using Captcha AI.

Build an event ticket availability monitor that handles CAPTCHAs using Captcha AI. Python workflow for checkin...

Automation Python reCAPTCHA v2
Jan 17, 2026
Explainers reCAPTCHA v2 Invisible: Trigger Detection and Solving
Detect and solve reCAPTCHA v2 Invisible challenges with CaptchaAI — identify triggers, extract parameters, and handle auto-invoked CAPTCHAs.

Detect and solve re CAPTCHA v 2 Invisible challenges with Captcha AI — identify triggers, extract parameters,...

Automation Python reCAPTCHA v2
Apr 07, 2026
Tutorials Using Fiddler to Inspect CaptchaAI API Traffic
How to use Fiddler Everywhere and Fiddler Classic to capture, inspect, and debug Captcha AI API requests and responses — filters, breakpoints, and replay for tr...

How to use Fiddler Everywhere and Fiddler Classic to capture, inspect, and debug Captcha AI API requests and r...

Automation Python All CAPTCHA Types
Mar 05, 2026
Tutorials GeeTest Token Injection in Browser Automation Frameworks
How to inject GeeTest v3 solution tokens into Playwright, Puppeteer, and Selenium — including the three-value response, callback triggering, and form submissi...

Learn how to inject Gee Test v 3 solution tokens into Playwright, Puppeteer, and Selenium — including the thre...

Automation Python Testing
Jan 18, 2026