Build a bot that scrapes prices from multiple websites, handles CAPTCHAs automatically, and outputs a comparison report.
Project Structure
price_bot/
├── config.py # API keys and store configs
├── solver.py # CaptchaAI integration
├── scraper.py # Per-store scrapers
├── compare.py # Price comparison logic
└── main.py # Entry point
CAPTCHA Solver Module
# solver.py
import requests
import time
class CaptchaSolver:
    """Thin client for the CaptchaAI reCAPTCHA-solving service.

    The API is 2captcha-compatible: a task is submitted to ``in.php`` and
    the result is polled from ``res.php`` until it is ready.
    """

    def __init__(self, api_key):
        # API key issued by the solving service; sent with every request.
        self.api_key = api_key

    def solve_recaptcha(self, sitekey, pageurl):
        """Submit a reCAPTCHA v2 task and block until a token is returned.

        Args:
            sitekey: the page's ``data-sitekey`` attribute value.
            pageurl: URL of the page hosting the CAPTCHA.

        Returns:
            The g-recaptcha-response token string.

        Raises:
            RuntimeError: the service rejected the task or reported an error.
            TimeoutError: no solution arrived within the polling window.
        """
        resp = requests.post("https://ocr.captchaai.com/in.php", data={
            "key": self.api_key,
            "method": "userrecaptcha",
            "googlekey": sitekey,
            "pageurl": pageurl,
            "json": 1,
        }, timeout=30)
        result = resp.json()
        if result.get("status") != 1:
            # On failure "request" carries the error code (e.g. ERROR_WRONG_USER_KEY).
            raise RuntimeError(result.get("request", "unknown submit error"))
        task_id = result["request"]
        # Solves typically take 15-30s; the API asks clients not to poll sooner.
        time.sleep(15)
        for _ in range(24):  # 24 polls x 5s = up to ~2 more minutes
            resp = requests.get("https://ocr.captchaai.com/res.php", params={
                "key": self.api_key, "action": "get",
                "id": task_id, "json": 1,
            }, timeout=15)
            data = resp.json()
            if data.get("status") == 1:
                return data["request"]
            # Use .get here too: the old data["request"] raised KeyError on a
            # malformed response. The literal "CAPCHA_NOT_READY" (sic) is the
            # API's actual pending marker.
            if data.get("request") != "CAPCHA_NOT_READY":
                raise RuntimeError(data.get("request", "malformed poll response"))
            time.sleep(5)
        raise TimeoutError("Solve timeout")
Store Scraper
# scraper.py
import requests
import re
import time
from bs4 import BeautifulSoup
class StoreScraper:
    """Generic store scraper with CAPTCHA handling."""

    def __init__(self, solver):
        # solver: object exposing solve_recaptcha(sitekey, pageurl) -> token.
        self.solver = solver
        self.session = requests.Session()
        # Desktop-browser UA: many stores block the default python-requests UA.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
        })

    def scrape_price(self, url):
        """Fetch page, handle CAPTCHA if present, extract price."""
        resp = self.session.get(url, timeout=15)
        html = resp.text
        # A data-sitekey attribute signals a CAPTCHA interstitial page.
        if 'data-sitekey' in html:
            retried = self._solve_and_retry(url, html)
            # BUG FIX: _solve_and_retry returns None when no sitekey can be
            # extracted; the old code then crashed on None.text. Fall back to
            # the original page so extraction still gets a chance.
            if retried is not None:
                html = retried.text
        return self._extract_price(html, url)

    def _solve_and_retry(self, url, html):
        """Solve CAPTCHA and re-fetch. Returns None if no sitekey is found."""
        match = re.search(r'data-sitekey="([^"]+)"', html)
        if not match:
            return None
        sitekey = match.group(1)
        token = self.solver.solve_recaptcha(sitekey, url)
        # Submit the solved token the way the interstitial form would.
        resp = self.session.post(url, data={
            "g-recaptcha-response": token,
        }, timeout=30)
        return resp

    def _extract_price(self, html, url):
        """Extract price from HTML.

        Returns a dict with "price" (float or None), "currency", and "url".
        """
        soup = BeautifulSoup(html, "html.parser")
        # Try common price selectors used by typical storefront themes.
        selectors = [
            ".price", ".product-price", "[data-price]",
            ".price-current", ".offer-price", "#priceblock_ourprice",
        ]
        for selector in selectors:
            el = soup.select_one(selector)
            if el:
                price_text = el.get_text(strip=True)
                # Thousands separators are stripped before matching so the
                # captured group parses cleanly as a float.
                match = re.search(r'[\d,]+\.?\d*', price_text.replace(",", ""))
                if match:
                    return {
                        "price": float(match.group()),
                        "currency": self._detect_currency(price_text),
                        "url": url,
                    }
        return {"price": None, "currency": None, "url": url}

    def _detect_currency(self, text):
        # Symbol sniffing only; defaults to USD when no known symbol appears.
        if "$" in text: return "USD"
        if "€" in text: return "EUR"
        if "£" in text: return "GBP"
        return "USD"
Price Comparison Engine
# compare.py
from datetime import datetime
def compare_prices(product_name, price_data):
    """Compare prices from multiple sources.

    Args:
        product_name: display name used in the report.
        price_data: dicts with at least "price" (float or None) and "url".

    Returns:
        A summary dict with best/worst prices and savings, or a dict
        containing an "error" key when no source produced a price.
    """
    valid = [p for p in price_data if p.get("price") is not None]
    if not valid:
        return {"product": product_name, "error": "No prices found"}
    sorted_prices = sorted(valid, key=lambda x: x["price"])
    best = sorted_prices[0]
    worst = sorted_prices[-1]
    # BUG FIX: guard the percentage against a zero "worst" price (all sources
    # reporting 0.0), which previously raised ZeroDivisionError.
    if worst["price"]:
        savings_pct = round((1 - best["price"] / worst["price"]) * 100, 1)
    else:
        savings_pct = 0.0
    return {
        "product": product_name,
        "best_price": best["price"],
        "best_source": best["url"],
        "worst_price": worst["price"],
        "savings": round(worst["price"] - best["price"], 2),
        "savings_pct": savings_pct,
        "all_prices": sorted_prices,
        "checked_at": datetime.now().isoformat(),
    }
def format_report(comparisons):
    """Render the comparison results as a plain-text report.

    Entries carrying an "error" key are shown as a single line; all other
    entries get a per-product section listing best price, source, savings,
    and every price found.
    """
    divider = "=" * 60
    out = [divider, "Price Comparison Report", divider, ""]
    for entry in comparisons:
        if "error" in entry:
            out.append(f"{entry['product']}: {entry['error']}")
            continue
        out.append(f"Product: {entry['product']}")
        out.append(f" Best: ${entry['best_price']:.2f}")
        out.append(f" Source: {entry['best_source']}")
        out.append(f" Savings: ${entry['savings']:.2f} ({entry['savings_pct']}%)")
        out.extend(f" ${row['price']:.2f} — {row['url']}" for row in entry["all_prices"])
        out.append("")
    return "\n".join(out)
Main Runner
# main.py
import os
import time
from solver import CaptchaSolver
from scraper import StoreScraper
from compare import compare_prices, format_report
# Products to monitor: each entry maps a display "name" to the list of store
# "urls" selling that product. Add entries here to track more products.
PRODUCTS = [
    {
        "name": "Wireless Headphones",
        "urls": [
            "https://store-a.example.com/headphones-xyz",
            "https://store-b.example.com/product/headphones-xyz",
            "https://store-c.example.com/electronics/headphones-xyz",
        ],
    },
]
def main():
    """Scrape every configured product, compare prices, and write a report.

    Reads the solver API key from the CAPTCHAAI_API_KEY environment variable
    (raises KeyError if unset), prints progress and the final report to
    stdout, and saves the report to price_report.txt.
    """
    api_key = os.environ["CAPTCHAAI_API_KEY"]
    solver = CaptchaSolver(api_key)
    scraper = StoreScraper(solver)
    comparisons = []
    for product in PRODUCTS:
        print(f"Checking prices for: {product['name']}")
        prices = []
        for url in product["urls"]:
            try:
                price = scraper.scrape_price(url)
                prices.append(price)
                # BUG FIX: the old f-string used price.get('price', 'N/A'),
                # but the key is always present, so a missing price printed
                # "$None". Format the two cases explicitly instead.
                value = price.get("price")
                shown = f"${value:.2f}" if value is not None else "N/A"
                print(f" {url}: {shown}")
            except Exception as e:
                print(f" {url}: Error — {e}")
            # Polite delay between requests to reduce the chance of blocks.
            time.sleep(3)
        comparison = compare_prices(product["name"], prices)
        comparisons.append(comparison)
    report = format_report(comparisons)
    print(report)
    # Persist the report; utf-8 because prices/URLs may contain non-ASCII.
    with open("price_report.txt", "w", encoding="utf-8") as f:
        f.write(report)


if __name__ == "__main__":
    main()
FAQ
How often should I check prices?
Daily checks are sufficient for most products; for flash sales or competitive monitoring, check every 4-6 hours. Rate-limit your requests to avoid being blocked.
How do I handle different currencies?
Convert to a base currency using a free exchange rate API before comparing. Store the original currency for reference.
Can I run this as a scheduled job?
Yes. Use cron (Linux), Task Scheduler (Windows), or a cloud scheduler to run the bot daily.
Related Guides
Build your price bot — start with CaptchaAI.
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign in. No comments yet.