MongoDB's flexible schema and aggregation framework make it a strong fit for CAPTCHA solve tracking. Store every solve attempt with metadata, then query patterns across time, CAPTCHA types, and error rates.
Why MongoDB for CAPTCHA Data
CAPTCHA solve records have variable fields depending on the type — reCAPTCHA needs googlekey, hCaptcha needs sitekey, image CAPTCHAs need body. MongoDB's schemaless documents handle these naturally without schema migrations.
Document Schema
{
"_id": "ObjectId",
"captcha_id": "12345678",
"type": "recaptcha_v2",
"method": "userrecaptcha",
"sitekey": "6Le-wvkSAAAAAPBMRTvw0Q4Muexq9bi0DJwx_mJ-",
"pageurl": "https://example.com/form",
"status": "solved",
"solution": "03AGdBq26...",
"error": null,
"submitted_at": "2026-04-04T10:15:30.000Z",
"solved_at": "2026-04-04T10:15:45.000Z",
"elapsed_ms": 15000,
"polls": 3,
"proxy_used": true,
"cost": 0.00299,
"metadata": {
"project": "price-monitor",
"worker_id": "worker-3",
"target_domain": "example.com"
}
}
Python Implementation
Setup and Connection
import os
import time
from datetime import datetime, timezone
from pymongo import MongoClient, ASCENDING, DESCENDING
import requests
# Connection settings: MONGO_URI falls back to a local instance;
# the CaptchaAI key is required (KeyError at import time if unset).
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://localhost:27017")
API_KEY = os.environ["CAPTCHAAI_API_KEY"]
# Module-level handles shared by every function below.
client = MongoClient(MONGO_URI)
db = client["captcha_tracking"]
solves = db["solves"]
Create Indexes
def setup_indexes():
    """Create the indexes the analytics queries below rely on.

    Safe to call repeatedly: ``create_index`` is a no-op when an
    identical index already exists.
    """
    plain_index_keys = [
        [("submitted_at", DESCENDING)],                      # newest-first listings
        [("type", ASCENDING), ("status", ASCENDING)],        # per-type success/error
        [("metadata.project", ASCENDING)],
        [("metadata.target_domain", ASCENDING)],
    ]
    for keys in plain_index_keys:
        solves.create_index(keys)

    # TTL index: MongoDB's background monitor deletes documents once
    # submitted_at is older than 90 days.
    solves.create_index(
        [("submitted_at", ASCENDING)],
        expireAfterSeconds=90 * 24 * 3600,
        name="ttl_cleanup",
    )


setup_indexes()
Solve and Store
def solve_and_store(sitekey, pageurl, captcha_type="recaptcha_v2", metadata=None):
    """Submit a CAPTCHA to CaptchaAI, poll until solved, and persist
    every state transition on a MongoDB document.

    Args:
        sitekey: The target site's reCAPTCHA sitekey (sent as ``googlekey``).
        pageurl: URL of the page hosting the CAPTCHA.
        captcha_type: Stored in the record's ``type`` field for analytics.
        metadata: Optional dict of caller context (project, worker, domain).

    Returns:
        The solution token string, or ``None`` on error/timeout. The
        failure reason is recorded on the Mongo document either way, so
        the record never dangles in "submitted"/"polling".
    """
    record = {
        "type": captcha_type,
        "method": "userrecaptcha",
        "sitekey": sitekey,
        "pageurl": pageurl,
        "status": "submitted",
        "submitted_at": datetime.now(timezone.utc),
        "metadata": metadata or {},
    }
    doc_id = solves.insert_one(record).inserted_id

    # Submit to CaptchaAI. An explicit timeout keeps a dead connection
    # from hanging the worker forever; a thrown exception is recorded
    # instead of leaving the record stuck in "submitted".
    try:
        resp = requests.post(
            "https://ocr.captchaai.com/in.php",
            data={
                "key": API_KEY,
                "method": "userrecaptcha",
                "googlekey": sitekey,
                "pageurl": pageurl,
                "json": 1,
            },
            timeout=30,
        )
        data = resp.json()
    except (requests.RequestException, ValueError) as exc:
        solves.update_one(
            {"_id": doc_id},
            {"$set": {"status": "error", "error": f"submit_failed: {exc}"}},
        )
        return None

    if data.get("status") != 1:
        solves.update_one(
            {"_id": doc_id},
            {"$set": {"status": "error", "error": data.get("request")}},
        )
        return None

    captcha_id = data["request"]
    solves.update_one(
        {"_id": doc_id},
        {"$set": {"captcha_id": captcha_id, "status": "polling"}},
    )

    # Poll for the result: up to 60 attempts, 5 s apart (~5 minutes).
    polls = 0
    for _ in range(60):
        time.sleep(5)
        polls += 1
        try:
            poll_resp = requests.get(
                "https://ocr.captchaai.com/res.php",
                params={"key": API_KEY, "action": "get", "id": captcha_id, "json": 1},
                timeout=30,
            ).json()
        except (requests.RequestException, ValueError):
            # Transient network/parse failure — treat like "not ready"
            # and retry on the next poll.
            continue

        if poll_resp.get("status") == 1:
            solved_at = datetime.now(timezone.utc)
            elapsed_ms = int(
                (solved_at - record["submitted_at"]).total_seconds() * 1000
            )
            solves.update_one({"_id": doc_id}, {"$set": {
                "status": "solved",
                "solution": poll_resp["request"],
                "solved_at": solved_at,
                "elapsed_ms": elapsed_ms,
                "polls": polls,
            }})
            return poll_resp["request"]

        # "CAPCHA_NOT_READY" (sic — the API's actual spelling) means
        # keep polling; any other non-success response is terminal.
        if poll_resp.get("request") != "CAPCHA_NOT_READY":
            solves.update_one({"_id": doc_id}, {"$set": {
                "status": "error",
                "error": poll_resp.get("request"),
                "polls": polls,
            }})
            return None

    solves.update_one({"_id": doc_id}, {"$set": {
        "status": "timeout", "polls": polls,
    }})
    return None
Analytics Queries
def get_success_rate(hours=24):
    """Percentage (0-100) of attempts in the last *hours* that were solved."""
    from datetime import timedelta

    since = datetime.now(timezone.utc) - timedelta(hours=hours)
    counts_by_status = {
        row["_id"]: row["count"]
        for row in solves.aggregate([
            {"$match": {"submitted_at": {"$gte": since}}},
            {"$group": {"_id": "$status", "count": {"$sum": 1}}},
        ])
    }
    attempts = sum(counts_by_status.values())
    if not attempts:
        return 0
    return counts_by_status.get("solved", 0) / attempts * 100
def get_avg_solve_time_by_type():
    """Per-type solve-time stats (avg/min/max ms and count), busiest first."""
    stats_stage = {
        "_id": "$type",
        "avg_time_ms": {"$avg": "$elapsed_ms"},
        "min_time_ms": {"$min": "$elapsed_ms"},
        "max_time_ms": {"$max": "$elapsed_ms"},
        "count": {"$sum": 1},
    }
    cursor = solves.aggregate([
        {"$match": {"status": "solved"}},
        {"$group": stats_stage},
        {"$sort": {"count": -1}},
    ])
    return list(cursor)
def get_hourly_solve_volume(days=7):
    """Per-hour attempt and solved counts for the last *days* days,
    ordered chronologically (for charting)."""
    from datetime import timedelta

    window_start = datetime.now(timezone.utc) - timedelta(days=days)
    hour_bucket = {
        "date": {"$dateToString": {"format": "%Y-%m-%d", "date": "$submitted_at"}},
        "hour": {"$hour": "$submitted_at"},
    }
    solved_flag = {"$cond": [{"$eq": ["$status", "solved"]}, 1, 0]}
    return list(solves.aggregate([
        {"$match": {"submitted_at": {"$gte": window_start}}},
        {"$group": {
            "_id": hour_bucket,
            "total": {"$sum": 1},
            "solved": {"$sum": solved_flag},
        }},
        {"$sort": {"_id.date": 1, "_id.hour": 1}},
    ]))
def get_error_breakdown(hours=24):
    """Counts of each distinct error code among failed attempts in the
    last *hours*, most frequent first."""
    from datetime import timedelta

    since = datetime.now(timezone.utc) - timedelta(hours=hours)
    return list(solves.aggregate([
        {"$match": {"submitted_at": {"$gte": since}, "status": "error"}},
        {"$group": {"_id": "$error", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
    ]))
JavaScript Implementation
const { MongoClient } = require("mongodb");
const axios = require("axios");
// MONGO_URI falls back to a local instance.
const MONGO_URI = process.env.MONGO_URI || "mongodb://localhost:27017";
// NOTE(review): no guard here — API_KEY is undefined if the env var is unset.
const API_KEY = process.env.CAPTCHAAI_API_KEY;
// Module-level handles assigned by connect(); call connect() before anything else.
let db, solves;
/**
 * Connect to MongoDB and create the indexes the analytics queries
 * depend on. Must be awaited before calling any other function here.
 * Idempotent: createIndex is a no-op for an identical existing index.
 */
async function connect() {
  const client = await MongoClient.connect(MONGO_URI);
  db = client.db("captcha_tracking");
  solves = db.collection("solves");
  await solves.createIndex({ submitted_at: -1 });            // newest-first listings
  await solves.createIndex({ type: 1, status: 1 });          // per-type success/error
  await solves.createIndex({ "metadata.project": 1 });
  await solves.createIndex({ "metadata.target_domain": 1 }); // per-domain queries
  // TTL: the background monitor deletes documents 90 days after submitted_at.
  await solves.createIndex(
    { submitted_at: 1 },
    { expireAfterSeconds: 90 * 24 * 3600, name: "ttl_cleanup" }
  );
}
/**
 * Submit a CAPTCHA to CaptchaAI, poll until solved, and persist every
 * state transition on a MongoDB document.
 *
 * @param {string} sitekey - Target site's reCAPTCHA sitekey (sent as googlekey).
 * @param {string} pageurl - URL of the page hosting the CAPTCHA.
 * @param {string} [type] - Stored in the record's `type` field for analytics.
 * @param {object} [metadata] - Caller context (project, worker, domain).
 * @returns {Promise<string|null>} Solution token, or null on error/timeout;
 *   the failure reason is recorded on the document either way.
 */
async function solveAndStore(sitekey, pageurl, type = "recaptcha_v2", metadata = {}) {
  const submittedAt = new Date();
  const { insertedId } = await solves.insertOne({
    type, method: "userrecaptcha", sitekey, pageurl,
    status: "submitted", submitted_at: submittedAt, metadata,
  });

  // Persist a terminal failure so the record never dangles in an
  // intermediate "submitted"/"polling" state.
  const markError = async (error) => {
    await solves.updateOne({ _id: insertedId }, { $set: { status: "error", error } });
    return null;
  };

  let submit;
  try {
    submit = await axios.post("https://ocr.captchaai.com/in.php", null, {
      params: { key: API_KEY, method: "userrecaptcha", googlekey: sitekey, pageurl, json: 1 },
      timeout: 30000, // keep a dead connection from hanging the worker
    });
  } catch (err) {
    return markError(`submit_failed: ${err.message}`);
  }
  if (submit.data.status !== 1) {
    return markError(submit.data.request);
  }

  const captchaId = submit.data.request;
  await solves.updateOne({ _id: insertedId }, { $set: { captcha_id: captchaId, status: "polling" } });

  // Poll for the result: up to 60 attempts, 5 s apart (~5 minutes).
  let polls = 0;
  for (let i = 0; i < 60; i++) {
    await new Promise((r) => setTimeout(r, 5000));
    polls++;
    let poll;
    try {
      poll = await axios.get("https://ocr.captchaai.com/res.php", {
        params: { key: API_KEY, action: "get", id: captchaId, json: 1 },
        timeout: 30000,
      });
    } catch {
      continue; // transient network failure — retry on the next poll
    }
    if (poll.data.status === 1) {
      const solvedAt = new Date();
      await solves.updateOne({ _id: insertedId }, { $set: {
        status: "solved",
        solution: poll.data.request,
        solved_at: solvedAt,
        elapsed_ms: solvedAt.getTime() - submittedAt.getTime(), // explicit ms math
        polls,
      }});
      return poll.data.request;
    }
    // "CAPCHA_NOT_READY" (sic — the API's actual spelling) means keep
    // polling; any other non-success response is terminal.
    if (poll.data.request !== "CAPCHA_NOT_READY") {
      await solves.updateOne({ _id: insertedId }, { $set: { status: "error", error: poll.data.request, polls } });
      return null;
    }
  }
  await solves.updateOne({ _id: insertedId }, { $set: { status: "timeout", polls } });
  return null;
}
/**
 * Success rate over the last `hours`.
 *
 * @param {number} [hours=24] - Size of the lookback window.
 * @returns {Promise<number>} Percentage (0-100) rounded to one decimal.
 *   Always a number — the previous version returned a string from
 *   toFixed() on success but the number 0 for an empty window.
 */
async function getSuccessRate(hours = 24) {
  const cutoff = new Date(Date.now() - hours * 3600 * 1000);
  const results = await solves.aggregate([
    { $match: { submitted_at: { $gte: cutoff } } },
    { $group: { _id: "$status", count: { $sum: 1 } } },
  ]).toArray();
  const total = results.reduce((sum, row) => sum + row.count, 0);
  const solved = results.find((row) => row._id === "solved")?.count ?? 0;
  return total ? Number(((solved / total) * 100).toFixed(1)) : 0;
}
Data Retention
| Strategy | TTL Index | Use Case |
|---|---|---|
| 30-day retention | expireAfterSeconds: 2592000 | Development/testing |
| 90-day retention | expireAfterSeconds: 7776000 | Production analytics |
| Permanent (with archival) | No TTL; use capped collection or cold storage | Compliance/audit |
Troubleshooting
| Issue | Cause | Fix |
|---|---|---|
| Slow aggregation queries | Missing indexes on submitted_at and type | Run setup_indexes() — see index section above |
| Documents growing large | Storing full solutions in every record | Store solution hashes or truncate after use |
| TTL not deleting old records | TTL monitor runs every 60 seconds; large backlogs take time | Wait for background cleanup; check index with db.solves.getIndexes() |
| Connection pool exhaustion | Too many concurrent solve operations | Set maxPoolSize in connection string |
FAQ
Should I store the full CAPTCHA solution token?
For debugging, store tokens for 24–48 hours then let the TTL index clean them up. For long-term analytics, store only metadata (type, time, status, error) — tokens are useless after expiration anyway.
How much storage does this use?
Each solve record is roughly 500 bytes to 2 KB depending on metadata. At 10,000 solves/day with 90-day retention, expect about 1–2 GB. MongoDB handles this easily.
Can I use MongoDB Atlas (cloud)?
Yes. Atlas supports TTL indexes and aggregation pipelines. Use the connection string from your Atlas dashboard in MONGO_URI.
Next Steps
Track every CAPTCHA solve and spot issues before they impact your pipeline — get your CaptchaAI API key.
Related guides:
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In
No comments yet.