News outlets and media platforms use CAPTCHAs to protect content from automated aggregation. Media monitors, PR firms, and research organizations need to collect articles programmatically. CaptchaAI handles CAPTCHA challenges across news sources.
CAPTCHAs on News Platforms
| Source Type | CAPTCHA | Trigger | Content |
|---|---|---|---|
| Major news outlets | Cloudflare Turnstile | Bot detection | Articles, headlines |
| Wire services (AP, Reuters) | reCAPTCHA v2 | Bulk access | Breaking news |
| Paywalled publications | reCAPTCHA v3 | Access attempts | Premium articles |
| Local news sites | reCAPTCHA v2 | Rate limiting | Regional news |
| News aggregators | Cloudflare Challenge | Scraping detection | Aggregated feeds |
| Press release sites | Image CAPTCHA | Download pages | PR content |
News Aggregator
import json
import re
import time
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# CaptchaAI credentials and endpoint (2captcha-compatible in.php/res.php API).
CAPTCHAAI_KEY = "YOUR_API_KEY"
CAPTCHAAI_URL = "https://ocr.captchaai.com"
def solve_captcha(method, sitekey, pageurl, **kwargs):
    """Submit a CAPTCHA task to CaptchaAI and poll until a token is returned.

    Args:
        method: API method name, e.g. "userrecaptcha" or "turnstile".
        sitekey: the site key scraped from the challenge page.
        pageurl: URL of the page showing the challenge.
        **kwargs: extra form fields forwarded verbatim to in.php.

    Returns:
        The solved token string.

    Raises:
        RuntimeError: the API reported an error (bad key, zero balance, ...).
        TimeoutError: no solution after ~5 minutes of polling.
    """
    data = {
        "key": CAPTCHAAI_KEY, "method": method,
        # NOTE(review): "googlekey" is the reCAPTCHA field name; confirm the
        # API also accepts it for turnstile, some 2captcha-style APIs expect
        # "sitekey" for that method.
        "googlekey": sitekey, "pageurl": pageurl, "json": 1,
    }
    data.update(kwargs)
    resp = requests.post(f"{CAPTCHAAI_URL}/in.php", data=data, timeout=30)
    submit = resp.json()
    # Failures come back as {"status": 0, "request": "ERROR_..."}; without this
    # check the error string would be polled as a task id forever.
    if submit.get("status") != 1:
        raise RuntimeError(f"CaptchaAI submit failed: {submit.get('request')}")
    task_id = submit["request"]
    for _ in range(60):  # 60 polls * 5 s = 5 min budget
        time.sleep(5)
        result = requests.get(f"{CAPTCHAAI_URL}/res.php", params={
            "key": CAPTCHAAI_KEY, "action": "get",
            "id": task_id, "json": 1,
        }, timeout=30)
        r = result.json()
        if r["request"] == "CAPCHA_NOT_READY":
            continue
        # Any other non-success response is a terminal API error, not a token.
        if r.get("status") != 1:
            raise RuntimeError(f"CaptchaAI solve failed: {r['request']}")
        return r["request"]
    raise TimeoutError("Timeout")
class NewsAggregator:
    """Collects headlines and article bodies from news sites.

    One requests.Session is reused so cookies set after a solved challenge
    persist across requests. Pages that respond with a reCAPTCHA or
    Cloudflare Turnstile challenge are retried with a token obtained from
    solve_captcha().
    """

    def __init__(self, proxy=None):
        """proxy: optional "http://user:pass@host:port", applied to both schemes."""
        self.session = requests.Session()
        if proxy:
            self.session.proxies = {"http": proxy, "https": proxy}
        # Browser-like headers; a bare python-requests UA is blocked by most CDNs.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        })

    def collect_headlines(self, source_url, section=None):
        """Collect headlines from a news source.

        Returns a list of dicts: title, url (absolute), source, collected_at.
        Duplicate URLs are collapsed because the broad selector can match
        both a container (e.g. <article>) and an anchor inside it.
        """
        url = f"{source_url}/{section}" if section else source_url
        resp = self.session.get(url, timeout=30)
        if self._has_captcha(resp.text):
            resp = self._solve_and_retry(resp.text, url)
        soup = BeautifulSoup(resp.text, "html.parser")
        articles, seen = [], set()
        for item in soup.select("article, .story, .headline-item, h2 a, h3 a"):
            link = item if item.name == "a" else item.select_one("a")
            if not link:
                continue
            abs_url = self._abs_url(source_url, link.get("href", ""))
            if abs_url in seen:
                continue
            seen.add(abs_url)
            articles.append({
                "title": link.get_text(strip=True),
                "url": abs_url,
                "source": source_url,
                "collected_at": datetime.now().isoformat(),
            })
        return articles

    def get_article(self, article_url):
        """Fetch full article content.

        Returns a dict with url, title, author, date, content and word_count.
        Empty strings / 0 when the page has no recognizable article element.
        """
        resp = self.session.get(article_url, timeout=30)
        if self._has_captcha(resp.text):
            resp = self._solve_and_retry(resp.text, article_url)
        soup = BeautifulSoup(resp.text, "html.parser")
        # Strip page chrome and ads so get_text() yields only article prose.
        for tag in soup.select("script, style, nav, footer, .ad, .sidebar"):
            tag.decompose()
        content_el = soup.select_one(
            "article, .article-body, .story-body, .entry-content"
        )
        # Extract once, with a separator: the previous second get_text() call
        # (no separator) fused words across adjacent tags and undercounted.
        content = content_el.get_text(separator="\n", strip=True) if content_el else ""
        return {
            "url": article_url,
            "title": self._text(soup, "h1, .article-title"),
            "author": self._text(soup, ".author, .byline, [rel='author']"),
            "date": self._text(soup, "time, .publish-date, .article-date"),
            "content": content,
            "word_count": len(content.split()),
        }

    def aggregate_sources(self, sources, max_articles_per=20):
        """Aggregate headlines across multiple sources.

        sources: iterable of {"name", "url", optional "section"} dicts.
        Per-source failures are printed and skipped so one bad source does
        not abort the whole run.
        """
        all_articles = []
        for source in sources:
            try:
                articles = self.collect_headlines(source["url"], source.get("section"))
                all_articles.extend(articles[:max_articles_per])
                print(f"{source['name']}: {len(articles)} headlines")
            except Exception as e:
                print(f"{source['name']}: Error - {e}")
            time.sleep(3)  # polite inter-source delay
        return all_articles

    def _has_captcha(self, html):
        # Cheap marker scan covering reCAPTCHA widgets and Cloudflare Turnstile.
        return any(tag in html.lower() for tag in [
            'data-sitekey', 'g-recaptcha', 'cf-turnstile',
        ])

    def _solve_and_retry(self, html, url):
        """Solve the challenge embedded in html and re-request url with the token."""
        match = re.search(r'data-sitekey="([^"]+)"', html)
        if not match:
            # Challenge page without an extractable sitekey: plain retry.
            return self.session.get(url, timeout=30)
        sitekey = match.group(1)
        if 'cf-turnstile' in html:
            token = solve_captcha("turnstile", sitekey, url)
            return self.session.post(
                url, data={"cf-turnstile-response": token}, timeout=30
            )
        token = solve_captcha("userrecaptcha", sitekey, url)
        return self.session.post(
            url, data={"g-recaptcha-response": token}, timeout=30
        )

    def _text(self, soup, selector):
        """Stripped text of the first element matching selector, else ""."""
        el = soup.select_one(selector)
        return el.get_text(strip=True) if el else ""

    def _abs_url(self, base, href):
        # urljoin correctly resolves relative paths, "../", query-only and
        # protocol-relative ("//cdn...") hrefs — the previous string
        # concatenation mangled all of those cases.
        return urljoin(base, href)
# Usage
# Residential proxy recommended: major outlets sit behind Cloudflare, which
# blocks datacenter IP ranges aggressively (see Troubleshooting below).
aggregator = NewsAggregator(
proxy="http://user:pass@residential.proxy.com:5000"
)
# Each source: display name, base URL, and an optional section path that is
# appended to the URL by collect_headlines().
sources = [
{"name": "Tech News A", "url": "https://technews-a.example.com", "section": "latest"},
{"name": "Business B", "url": "https://business-b.example.com", "section": "tech"},
{"name": "Industry C", "url": "https://industry-c.example.com"},
]
# Collects up to 20 headlines per source (the default cap).
headlines = aggregator.aggregate_sources(sources)
print(f"Total: {len(headlines)} headlines collected")
Keyword-Based News Monitoring
class NewsMonitor:
    """Watches news sources and surfaces headlines containing tracked keywords.

    Matching is a case-insensitive substring test against the headline title;
    URLs already reported once are remembered in ``seen_urls`` and skipped.
    """

    def __init__(self, keywords, sources, proxy=None):
        self.keywords = [word.lower() for word in keywords]
        self.aggregator = NewsAggregator(proxy=proxy)
        self.sources = sources
        self.seen_urls = set()

    def scan(self):
        """Scan for articles matching keywords."""
        matches = []
        for article in self.aggregator.aggregate_sources(self.sources):
            url = article["url"]
            if url in self.seen_urls:
                continue
            title = article["title"].lower()
            hits = [word for word in self.keywords if word in title]
            if not hits:
                continue
            article["matched_keywords"] = hits
            matches.append(article)
            self.seen_urls.add(url)
        return matches

    def continuous_monitor(self, interval_min=30):
        """Run continuous monitoring with alerts."""
        while True:
            found = self.scan()
            if not found:
                print(f"No new matches at {datetime.now().strftime('%H:%M')}")
            else:
                print(f"\n=== {len(found)} new matches found ===")
                for hit in found:
                    print(f"  [{', '.join(hit['matched_keywords'])}] {hit['title']}")
                    print(f"    {hit['url']}")
            time.sleep(interval_min * 60)
# Monitor for specific topics
# NOTE(review): `sources` reuses the list from the aggregator usage example
# earlier in the file.
monitor = NewsMonitor(
keywords=["captcha", "bot detection", "web scraping", "automation"],
sources=sources,
proxy="http://user:pass@residential.proxy.com:5000",
)
# One-off scan; use monitor.continuous_monitor() for periodic polling.
matches = monitor.scan()
Troubleshooting
| Issue | Cause | Fix |
|---|---|---|
| Cloudflare blocking all requests | Aggressive bot detection | Use residential proxy + realistic UA |
| Paywall instead of article | Content behind subscription | Detect paywall, skip or handle |
| CAPTCHA on every page | IP flagged | Rotate proxy, add 5+ sec delays |
| Article content empty | JS-rendered content | Use Selenium/Puppeteer for SPA sites |
| Duplicate articles | Same story from multiple sources | Deduplicate by title similarity |
FAQ
Is news aggregation legal?
Collecting headlines and metadata for research or monitoring is common practice. Reproducing full copyrighted articles without permission is not. Use snippets and link back to the original source.
How do I handle paywalled content?
Detect the paywall (look for paywall CSS classes or limited content length) and flag it. Only access content you're authorized to view.
Which proxy type works best for news sites?
Rotating residential proxies work best. Major news outlets use Cloudflare, which blocks datacenter IPs aggressively.
Related Guides
Aggregate news from any source — get your CaptchaAI key and automate content collection.
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In · No comments yet.