Tutorials

Build a Content Change Monitoring Bot with CaptchaAI

Track changes on web pages protected by CAPTCHAs. Detect updates to pricing, terms, product listings, or any page content — and get notified instantly.


Architecture

Scheduler ──> Page Fetcher ──> CAPTCHA Handler ──> Content Extractor
                                                        │
                                                   Diff Engine
                                                        │
                                               Alert Dispatcher

Content Snapshot Store

# store.py
import hashlib
import json
import os
from datetime import datetime


class SnapshotStore:
    """Keep the most recent extracted content of each URL as a JSON file.

    Each URL maps to one file inside ``data_dir``. The file name is the MD5
    hex digest of the URL; the payload stores the URL, the content, the
    SHA-256 hex digest of the content and the time the snapshot was written.
    """

    def __init__(self, data_dir="snapshots"):
        """Remember ``data_dir`` and create the directory if it is missing."""
        self.data_dir = data_dir
        os.makedirs(data_dir, exist_ok=True)

    def _key(self, url):
        """Return the file-name stem for ``url``."""
        # MD5 only derives a fixed-length, filesystem-safe name here.
        return hashlib.md5(url.encode()).hexdigest()

    def _path(self, url):
        """Return the snapshot file path for ``url``."""
        return os.path.join(self.data_dir, f"{self._key(url)}.json")

    @staticmethod
    def _digest(content):
        """Return the SHA-256 hex digest used to compare content versions."""
        return hashlib.sha256(content.encode()).hexdigest()

    def save(self, url, content):
        """Write (or overwrite) the snapshot for ``url``.

        The JSON is written to a sibling ``.tmp`` file and then moved over
        the real path, so an interrupted run cannot leave a half-written
        snapshot that would make the next ``load`` fail.
        """
        path = self._path(url)
        data = {
            "url": url,
            "content": content,
            "hash": self._digest(content),
            "timestamp": datetime.now().isoformat(),
        }
        tmp_path = f"{path}.tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            os.replace(tmp_path, path)
        finally:
            # Only still present when the write or the replace failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    def load(self, url):
        """Return the stored snapshot dict for ``url``, or ``None`` if absent."""
        try:
            with open(self._path(url), "r", encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            return None

    def has_changed(self, url, new_content):
        """Return ``True`` when ``new_content`` differs from the stored snapshot.

        A URL without a snapshot counts as changed, so the first check
        always reports ``True``.
        """
        prev = self.load(url)
        if prev is None:
            return True  # First check
        return prev["hash"] != self._digest(new_content)

CAPTCHA-Aware Fetcher

# fetcher.py
import requests
import re
import time
import os


class PageFetcher:
    """Fetch pages through one HTTP session, solving CAPTCHAs via CaptchaAI.

    The API key is read from the ``CAPTCHAAI_API_KEY`` environment variable.
    The timing attributes below control how long a solve is awaited and can
    be overridden on a subclass or instance.
    """

    API_SUBMIT_URL = "https://ocr.captchaai.com/in.php"
    API_RESULT_URL = "https://ocr.captchaai.com/res.php"
    INITIAL_WAIT = 15   # seconds to wait before the first result poll
    POLL_INTERVAL = 5   # seconds between result polls
    MAX_POLLS = 24      # polls attempted before giving up

    # Accept both double- and single-quoted attribute values.
    _SITEKEY_RE = re.compile(r"""data-sitekey=["']([^"']+)["']""")

    def __init__(self):
        """Read the API key and set up a session with a browser User-Agent.

        Raises:
            KeyError: if ``CAPTCHAAI_API_KEY`` is not set.
        """
        self.api_key = os.environ["CAPTCHAAI_API_KEY"]
        self.session = requests.Session()
        self.session.headers["User-Agent"] = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36"
        )

    def fetch(self, url):
        """Return the HTML of ``url``, solving a detected CAPTCHA first.

        Turnstile is checked before reCAPTCHA because the reCAPTCHA check
        looks only for ``data-sitekey``, which Turnstile widgets also carry.
        """
        resp = self.session.get(url, timeout=20)

        if self._has_turnstile(resp.text):
            resp = self._solve_turnstile(url, resp.text)
        elif self._has_recaptcha(resp.text):
            resp = self._solve_recaptcha(url, resp.text)

        return resp.text

    def _has_turnstile(self, html):
        """Return ``True`` when ``html`` contains a Turnstile marker."""
        return "cf-turnstile" in html or "challenges.cloudflare.com/turnstile" in html

    def _has_recaptcha(self, html):
        """Return ``True`` when ``html`` contains a ``data-sitekey`` attribute."""
        return "data-sitekey" in html

    @classmethod
    def _extract_sitekey(cls, html):
        """Return the first ``data-sitekey`` value in ``html``, or ``None``."""
        match = cls._SITEKEY_RE.search(html)
        return match.group(1) if match else None

    @staticmethod
    def _task_id(payload):
        """Return the task id from a submit response.

        Raises:
            RuntimeError: if the response status is not 1; the message is
                the API's ``request`` field (its error code).
        """
        if payload.get("status") != 1:
            raise RuntimeError(payload.get("request", "CAPTCHA task submission failed"))
        return payload["request"]

    def _submit_and_retry(self, url, html, method, key_field):
        """Submit a solve task for the sitekey in ``html`` and retry the page.

        ``method`` is the API method name and ``key_field`` the form field
        that carries the sitekey. When no sitekey is found the page is
        simply fetched again.
        """
        sitekey = self._extract_sitekey(html)
        if sitekey is None:
            return self.session.get(url, timeout=20)

        resp = requests.post(self.API_SUBMIT_URL, data={
            "key": self.api_key,
            "method": method,
            key_field: sitekey,
            "pageurl": url,
            "json": 1,
        }, timeout=30)
        # Fail here with the API's error code instead of polling a bogus id.
        task_id = self._task_id(resp.json())

        return self._poll_and_retry(url, task_id)

    def _solve_turnstile(self, url, html):
        """Solve a Turnstile challenge found in ``html`` and return the retry response."""
        return self._submit_and_retry(url, html, "turnstile", "sitekey")

    def _solve_recaptcha(self, url, html):
        """Solve a reCAPTCHA challenge found in ``html`` and return the retry response."""
        return self._submit_and_retry(url, html, "userrecaptcha", "googlekey")

    def _poll_and_retry(self, url, task_id):
        """Wait for the solved token, then POST it to ``url``.

        The token is sent under both the reCAPTCHA and the Turnstile field
        names so the same call serves either challenge type.

        Raises:
            RuntimeError: if the API answers with anything other than
                success or ``CAPCHA_NOT_READY``.
            TimeoutError: if no token arrives within ``MAX_POLLS`` polls.
        """
        time.sleep(self.INITIAL_WAIT)
        for _ in range(self.MAX_POLLS):
            resp = requests.get(self.API_RESULT_URL, params={
                "key": self.api_key, "action": "get",
                "id": task_id, "json": 1,
            }, timeout=15)
            data = resp.json()
            if data.get("status") == 1:
                token = data["request"]
                return self.session.post(url, data={
                    "g-recaptcha-response": token,
                    "cf-turnstile-response": token,
                }, timeout=30)
            if data["request"] != "CAPCHA_NOT_READY":
                raise RuntimeError(data["request"])
            time.sleep(self.POLL_INTERVAL)
        raise TimeoutError("CAPTCHA solve timeout")

Diff Engine

# differ.py
import difflib


def compute_diff(old_content, new_content, context_lines=3):
    """Compute a unified diff between old and new content.

    Args:
        old_content: Previous text.
        new_content: Current text.
        context_lines: Number of unchanged lines shown around each change.

    Returns:
        The diff as a single string with every line newline-terminated, or
        an empty string when the two texts have the same lines.
    """
    # Split without line endings and terminate each diff line ourselves:
    # text whose last line has no trailing newline would otherwise produce
    # "-old+new" glued together on one output line.
    diff_lines = difflib.unified_diff(
        old_content.splitlines(),
        new_content.splitlines(),
        fromfile="previous",
        tofile="current",
        n=context_lines,
        lineterm="",
    )
    return "".join(f"{line}\n" for line in diff_lines)


def extract_changes(old_content, new_content):
    """Extract added and removed lines.

    A line counts as added when it occurs in ``new_content`` but nowhere in
    ``old_content``, and as removed in the opposite case. Each distinct line
    is reported once, in the order it first appears in its text, so the
    result is deterministic.

    Returns:
        A dict with the lists ``added`` and ``removed`` and their lengths
        ``added_count`` and ``removed_count``.
    """
    old_lines = old_content.splitlines()
    new_lines = new_content.splitlines()
    old_seen = set(old_lines)
    new_seen = set(new_lines)

    # dict.fromkeys drops duplicates while keeping first-appearance order.
    added = [line for line in dict.fromkeys(new_lines) if line not in old_seen]
    removed = [line for line in dict.fromkeys(old_lines) if line not in new_seen]

    return {
        "added": added,
        "removed": removed,
        "added_count": len(added),
        "removed_count": len(removed),
    }

Alert Dispatcher

# alerter.py
import json
import requests


def send_slack_alert(webhook_url, url, changes):
    """Send change alert to Slack.

    Args:
        webhook_url: Webhook URL the message is POSTed to.
        url: The monitored page that changed.
        changes: Dict with ``added_count``, ``removed_count`` and the list
            ``added``; up to five added lines are quoted as a sample.

    Raises:
        requests.HTTPError: if the webhook answers with a 4xx/5xx status, so
            a rejected alert is not mistaken for a delivered one.
    """
    text = (
        f"*Content Change Detected*\n"
        f"URL: {url}\n"
        f"Added lines: {changes['added_count']}\n"
        f"Removed lines: {changes['removed_count']}\n"
    )

    if changes["added"]:
        sample = "\n".join(changes["added"][:5])
        text += f"\n*Sample additions:*\n```{sample}```"

    response = requests.post(webhook_url, json={"text": text}, timeout=10)
    response.raise_for_status()


def send_email_alert(
    to_email,
    url,
    diff_text,
    *,
    smtp_host="smtp.example.com",
    smtp_port=587,
    sender="monitor@example.com",
    password="YOUR_PASSWORD",
):
    """Send change alert via email (using any SMTP).

    Args:
        to_email: Recipient address.
        url: The monitored page that changed; used in subject and body.
        diff_text: Text appended to the message body.
        smtp_host: SMTP server host name.
        smtp_port: SMTP server port; the connection is upgraded with STARTTLS.
        sender: ``From`` address, also used as the SMTP login name.
        password: SMTP login password. The default is a placeholder; pass
            the real value from configuration rather than editing this file.
    """
    import smtplib
    from email.mime.text import MIMEText

    msg = MIMEText(f"Changes detected on {url}:\n\n{diff_text}")
    msg["Subject"] = f"Content Change: {url}"
    msg["To"] = to_email
    msg["From"] = sender

    with smtplib.SMTP(smtp_host, smtp_port) as server:
        server.starttls()
        server.login(sender, password)
        server.send_message(msg)

Content Extractor

# extractor.py
from bs4 import BeautifulSoup


def extract_content(html, selector=None):
    """Return the text of the page's main content area.

    Script, style, nav, footer, header and aside elements are removed first.
    The text is then taken from the first match of ``selector`` when one is
    given and matches, otherwise from the first ``main``/``article``/
    ``.content``/``#content`` element, otherwise from the whole document.
    Text pieces are stripped and joined with newlines.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Drop boilerplate elements before looking for the content area.
    for boilerplate in soup.find_all(
        ["script", "style", "nav", "footer", "header", "aside"]
    ):
        boilerplate.decompose()

    # Caller's selector first (if any), then the generic content containers.
    candidates = [selector] if selector else []
    candidates.append("main, article, .content, #content")

    for css in candidates:
        node = soup.select_one(css)
        if node:
            return node.get_text(separator="\n", strip=True)

    # Nothing matched: fall back to the text of the whole document.
    return soup.get_text(separator="\n", strip=True)

Main Monitor

# main.py
import time
import os
from fetcher import PageFetcher
from store import SnapshotStore
from extractor import extract_content
from differ import compute_diff, extract_changes
from alerter import send_slack_alert

# Pages to monitor. Each entry holds the page URL and the CSS selector of
# the region whose text is compared between runs.
PAGES = [
    {"url": page_url, "selector": css}
    for page_url, css in (
        ("https://example.com/pricing", ".pricing-table"),
        ("https://example.com/terms", ".terms-content"),
        ("https://example.com/products", ".product-grid"),
    )
]

# Slack webhook URL taken from the environment; empty string when unset.
SLACK_WEBHOOK = os.getenv("SLACK_WEBHOOK_URL", "")


def _check_page(fetcher, store, page):
    """Fetch one page, report whether its content changed, and save it."""
    url = page["url"]
    html = fetcher.fetch(url)
    content = extract_content(html, page.get("selector"))

    if not store.has_changed(url, content):
        print("  No changes")
        return

    prev = store.load(url)
    if prev:
        changes = extract_changes(prev["content"], content)
        print(f"  CHANGED — +{changes['added_count']} / -{changes['removed_count']} lines")

        if SLACK_WEBHOOK:
            send_slack_alert(SLACK_WEBHOOK, url, changes)
    else:
        print("  First snapshot saved")

    # Saved only after the alert went out, so a failed alert is retried on
    # the next run instead of being lost.
    store.save(url, content)


def main():
    """Check every page in ``PAGES`` once, pausing 5 seconds between pages.

    A failure on one page is printed and does not stop the remaining pages.
    """
    fetcher = PageFetcher()
    store = SnapshotStore()

    for index, page in enumerate(PAGES):
        # Pause between pages only; there is nothing to wait for after the last.
        if index:
            time.sleep(5)

        print(f"Checking: {page['url']}")

        try:
            _check_page(fetcher, store, page)
        except Exception as e:
            # Per-page boundary: report and move on to the next page.
            print(f"  Error: {e}")


if __name__ == "__main__":
    main()

FAQ

How often should I run the monitor?

For pricing pages, every 1-4 hours. For terms/legal pages, daily. For product listings, every 6-12 hours. Adjust based on how fast the content changes.

How do I ignore minor layout changes?

Use CSS selectors to target only the meaningful content area. The extractor strips headers, footers, and navigation automatically.

Can I track changes over time?

Yes. Modify SnapshotStore to keep historical snapshots instead of overwriting. Store each version with a timestamp for a full change history.



Never miss a content change — start with CaptchaAI.

Discussions (0)

No comments yet.

Related Posts

Use Cases Event Ticket Monitoring with CAPTCHA Handling
Build an event ticket availability monitor that handles CAPTCHAs using Captcha AI.

Build an event ticket availability monitor that handles CAPTCHAs using Captcha AI. Python workflow for checkin...

Automation Python reCAPTCHA v2
Jan 17, 2026
Reference CAPTCHA Token Injection Methods Reference
Complete reference for injecting solved CAPTCHA tokens into web pages.

Complete reference for injecting solved CAPTCHA tokens into web pages. Covers reCAPTCHA, Turnstile, and Cloud...

Automation Python reCAPTCHA v2
Apr 08, 2026
Tutorials Pytest Fixtures for CaptchaAI API Testing
Build reusable pytest fixtures to test CAPTCHA-solving workflows with Captcha AI.

Build reusable pytest fixtures to test CAPTCHA-solving workflows with Captcha AI. Covers mocking, live integra...

Automation Python reCAPTCHA v2
Apr 08, 2026
Reference Browser Session Persistence for CAPTCHA Workflows
Manage browser sessions, cookies, and storage across CAPTCHA-solving runs to reduce repeat challenges and maintain authenticated state.

Manage browser sessions, cookies, and storage across CAPTCHA-solving runs to reduce repeat challenges and main...

Automation Python reCAPTCHA v2
Feb 24, 2026
Integrations Browser Profile Isolation + CaptchaAI Integration
Browser profile isolation tools create distinct browser environments with unique fingerprints per session.

Browser profile isolation tools create distinct browser environments with unique fingerprints per session. Com...

Automation Python reCAPTCHA v2
Feb 21, 2026
Tutorials Image CAPTCHA Confidence Scores: Using CaptchaAI Quality Metrics
Learn how to use CaptchaAI's confidence indicators for image CAPTCHA solutions — assess answer quality, implement confidence-based retry logic, and optimize solve ra...

Learn how to use Captcha AI's confidence indicators for image CAPTCHA solutions — assess answer quality, imple...

Automation Python Image OCR
Mar 30, 2026
Comparisons WebDriver vs Chrome DevTools Protocol for CAPTCHA Automation
Compare WebDriver and Chrome DevTools Protocol (CDP) for CAPTCHA automation — detection, performance, capabilities, and when to use each with CaptchaAI.

Compare Web Driver and Chrome Dev Tools Protocol (CDP) for CAPTCHA automation — detection, performance, capabi...

Automation Python reCAPTCHA v2
Mar 27, 2026
Tutorials Using Fiddler to Inspect CaptchaAI API Traffic
How to use Fiddler Everywhere and Fiddler Classic to capture, inspect, and debug Captcha AI API requests and responses — filters, breakpoints, and replay for tr...

How to use Fiddler Everywhere and Fiddler Classic to capture, inspect, and debug Captcha AI API requests and r...

Automation Python All CAPTCHA Types
Mar 05, 2026
Tutorials GeeTest Token Injection in Browser Automation Frameworks
Learn how to inject GeeTest v3 solution tokens into Playwright, Puppeteer, and Selenium — including the three-value response, callback triggering, and form submissi...

Learn how to inject Gee Test v 3 solution tokens into Playwright, Puppeteer, and Selenium — including the thre...

Automation Python Testing
Jan 18, 2026
Tutorials CAPTCHA Handling in Mobile Apps with Appium
Handle CAPTCHAs in mobile app automation using Appium and CaptchaAI — extract WebView sitekeys, solve, and inject tokens on Android and iOS.

Handle CAPTCHAs in mobile app automation using Appium and Captcha AI — extract Web View sitekeys, solve, and i...

Automation Python All CAPTCHA Types
Feb 13, 2026