Fifty workers simultaneously fail to solve CAPTCHAs — the API has a brief issue. All fifty retry immediately. Then again. The API, already struggling, gets hammered with 150 requests in seconds. This is a retry storm, and it turns a brief hiccup into a sustained outage for your system. Prevention requires coordinated backoff, jitter, and global retry limits.
How Retry Storms Form
Normal: 50 workers → 50 requests → 50 responses
Brief issue: 50 workers → 50 failures → 50 immediate retries
Storm: 50 workers → 150 requests in 3 seconds → API overwhelmed
Cascade: API returns 429/500 → even more retries → complete outage
Python: Backoff with Jitter and Global Limiter
import requests
import time
import random
import threading
from dataclasses import dataclass
# Placeholder credential sent as the "key" field of every request;
# replace it with a real key before running.
API_KEY = "YOUR_API_KEY"
# Endpoint that _submit_and_poll POSTs new tasks to.
SUBMIT_URL = "https://ocr.captchaai.com/in.php"
# Endpoint that _submit_and_poll GETs task results from.
RESULT_URL = "https://ocr.captchaai.com/res.php"
class RetryBudget:
    """Global retry budget — limits total retries across all workers.

    Implemented as a token bucket: tokens refill continuously at
    ``max_retries_per_second`` and every granted retry spends one token.
    The bucket starts full, so a short burst is allowed before the
    sustained rate applies. All state is guarded by a lock so one instance
    can be shared between threads.
    """

    def __init__(self, max_retries_per_second: float = 10.0):
        """Create a full bucket.

        Args:
            max_retries_per_second: Sustained refill rate. Rates between 0
                and 1 are honoured (one retry every ``1 / rate`` seconds);
                a rate of zero or below never grants a retry.
        """
        self._max_rate = max_retries_per_second
        # A bucket that holds less than one token could never grant a
        # retry, so fractional rates still get room for a single token.
        # Non-positive rates keep a zero capacity: retries stay disabled.
        if max_retries_per_second > 0:
            self._capacity = max(1.0, max_retries_per_second)
        else:
            self._capacity = 0.0
        self._tokens = self._capacity
        self._last_refill = time.monotonic()
        self._lock = threading.Lock()
        self._total_consumed = 0
        self._total_rejected = 0

    def acquire(self) -> bool:
        """Try to acquire a retry token. Returns False if budget exhausted."""
        with self._lock:
            now = time.monotonic()
            elapsed = now - self._last_refill
            self._last_refill = now
            # Credit the tokens earned since the previous call, never
            # exceeding the bucket capacity.
            self._tokens = min(
                self._capacity, self._tokens + elapsed * self._max_rate
            )
            if self._tokens >= 1.0:
                self._tokens -= 1.0
                self._total_consumed += 1
                return True
            self._total_rejected += 1
            return False

    @property
    def stats(self) -> dict:
        """Return the number of granted and rejected retries so far."""
        # Read both counters under the lock so they form one consistent
        # snapshot even while other threads are acquiring.
        with self._lock:
            return {
                "consumed": self._total_consumed,
                "rejected": self._total_rejected,
            }
# Shared global budget: a single module-level instance that every
# solve_with_retry call draws from before it is allowed to retry.
retry_budget = RetryBudget(max_retries_per_second=10.0)
def backoff_with_jitter(attempt: int, base: float = 1.0, max_delay: float = 60.0) -> float:
    """Exponential backoff with full jitter.

    Args:
        attempt: Zero-based attempt number; the delay ceiling doubles with
            each increment.
        base: Ceiling for attempt 0, in seconds.
        max_delay: Upper bound for the ceiling, in seconds.

    Returns:
        A delay drawn uniformly between 0 and
        ``min(base * 2 ** attempt, max_delay)``.
    """
    # 2 ** 1024 no longer fits in a float, so the multiplication below would
    # raise OverflowError for very large attempt numbers. By that point the
    # ceiling is max_delay for any practical base, so clamping the exponent
    # leaves the result unchanged while keeping the call safe.
    exponent = min(attempt, 1023)
    exp_delay = min(base * (2 ** exponent), max_delay)
    return random.uniform(0, exp_delay)
@dataclass
class RetryConfig:
    """Retry settings read by solve_with_retry."""

    # Total number of tries, counting the initial (non-retry) attempt.
    max_attempts: int = 3
    # Backoff base passed to backoff_with_jitter, in seconds.
    base_delay: float = 1.0
    # Upper bound passed to backoff_with_jitter, in seconds.
    max_delay: float = 30.0
    # NOTE(review): not read anywhere in the visible code; error
    # classification is hard-coded in _submit_and_poll — confirm whether
    # this field is still needed.
    retryable_errors: tuple = ("ERROR_NO_SLOT_AVAILABLE", "CAPCHA_NOT_READY")
def solve_with_retry(params: dict, config: "RetryConfig | None" = None) -> str:
    """Solve a CAPTCHA with retry storm prevention.

    The first attempt is sent immediately. Every later attempt must first
    obtain a token from the shared ``retry_budget`` and then sleeps for a
    jittered exponential backoff before resubmitting.

    Args:
        params: Task parameters forwarded unchanged to ``_submit_and_poll``.
        config: Retry settings; a fresh ``RetryConfig()`` is used when
            omitted.

    Returns:
        The value returned by ``_submit_and_poll`` on the first success.

    Raises:
        RuntimeError: If the global retry budget is exhausted, or if every
            attempt ended in a ``RetryableError``. The last retryable error
            is attached as ``__cause__`` so its traceback is not lost.
        PermanentError: Propagated as-is; permanent errors are never
            retried.
    """
    if config is None:
        # Built per call, so no RetryConfig instance is created at import
        # time and shared between callers.
        config = RetryConfig()

    last_error = None
    for attempt in range(config.max_attempts):
        if attempt > 0:
            # Check global retry budget
            if not retry_budget.acquire():
                raise RuntimeError(
                    f"Retry budget exhausted — too many retries system-wide. "
                    f"Stats: {retry_budget.stats}"
                ) from last_error
            delay = backoff_with_jitter(attempt, config.base_delay, config.max_delay)
            print(f"[RETRY] Attempt {attempt + 1}/{config.max_attempts}, "
                  f"waiting {delay:.1f}s")
            time.sleep(delay)
        try:
            return _submit_and_poll(params)
        except RetryableError as e:
            last_error = e
            print(f"[RETRY] Retryable error: {e}")
        except PermanentError:
            raise  # Don't retry permanent errors
    raise RuntimeError(
        f"All {config.max_attempts} attempts failed: {last_error}"
    ) from last_error
class RetryableError(Exception):
    """Failure that solve_with_retry catches and retries."""
    pass
class PermanentError(Exception):
    """Failure that solve_with_retry re-raises immediately without retrying."""
    pass
def _submit_and_poll(params: dict) -> str:
    """Submit and poll with error classification.

    Posts the task to ``SUBMIT_URL``, then polls ``RESULT_URL`` every five
    seconds for up to 180 seconds.

    Args:
        params: Task parameters; ``key`` and ``json`` are added before
            sending.

    Returns:
        The ``request`` field of the first poll response whose ``status``
        is 1.

    Raises:
        RetryableError: For transport failures (timeouts, connection
            errors, non-JSON bodies), for transient or unknown API error
            codes, and when polling runs past the deadline.
        PermanentError: For API error codes that a retry cannot fix.
    """
    # Codes that signal temporary capacity or rate limiting.
    transient = ("ERROR_NO_SLOT_AVAILABLE", "ERROR_TOO_MUCH_REQUESTS")
    # Codes that will fail again no matter how often the task is resent.
    permanent = ("ERROR_WRONG_USER_KEY", "ERROR_KEY_DOES_NOT_EXIST",
                 "ERROR_ZERO_BALANCE", "ERROR_ZERO_CAPTCHA_FILESIZE")

    submit_params = {**params, "key": API_KEY, "json": 1}
    try:
        resp = requests.post(SUBMIT_URL, data=submit_params, timeout=30).json()
    except (requests.RequestException, ValueError) as exc:
        # Timeouts, refused connections and HTML error pages are exactly
        # what a struggling API produces, so they must go through the
        # budgeted retry path instead of escaping as raw exceptions.
        raise RetryableError(f"Submit request failed: {exc}") from exc

    error = resp.get("request", "")
    if resp.get("status") != 1:
        if error in permanent:
            raise PermanentError(error)
        raise RetryableError(error)  # Default to retryable for unknown errors

    task_id = resp["request"]
    start = time.monotonic()
    while time.monotonic() - start < 180:
        time.sleep(5)
        try:
            poll = requests.get(RESULT_URL, params={
                "key": API_KEY, "action": "get", "id": task_id, "json": 1,
            }, timeout=15).json()
        except (requests.RequestException, ValueError) as exc:
            raise RetryableError(f"Poll request failed: {exc}") from exc
        if poll.get("request") == "CAPCHA_NOT_READY":
            continue
        if poll.get("status") == 1:
            return poll["request"]
        poll_error = poll.get("request", "")
        # Rate-limit codes are as temporary while polling as they are at
        # submit time, so they are classified the same way in both phases.
        if poll_error == "ERROR_CAPTCHA_UNSOLVABLE" or poll_error in transient:
            raise RetryableError(poll_error)
        raise PermanentError(poll_error)
    raise RetryableError("Timeout")
# --- Usage ---
# Module-level example: submits one Turnstile task through
# solve_with_retry, then prints the first 30 characters of the result
# and the shared budget's counters.
token = solve_with_retry({
    "method": "turnstile",
    "sitekey": "0x4XXXXXXXXXXXXXXXXX",
    "pageurl": "https://example.com/login",
})
print(f"Token: {token[:30]}...")
print(f"Retry budget stats: {retry_budget.stats}")
JavaScript: Coordinated Retry Limiter
// Placeholder credential sent as the "key" parameter of every request;
// replace it with a real key before running.
const API_KEY = "YOUR_API_KEY";
// Endpoint that submitAndPoll POSTs new tasks to.
const SUBMIT_URL = "https://ocr.captchaai.com/in.php";
// Endpoint that submitAndPoll polls for task results.
const RESULT_URL = "https://ocr.captchaai.com/res.php";
/**
 * Global retry budget implemented as a token bucket.
 *
 * Tokens refill continuously at `maxPerSecond` and every granted retry
 * spends one token. The bucket starts full. `consumed` and `rejected`
 * count granted and refused retries.
 */
class RetryBudget {
  #tokens;
  #maxRate;
  #capacity;
  #lastRefill;
  consumed = 0;
  rejected = 0;

  /**
   * @param {number} maxPerSecond Sustained refill rate. Rates between 0 and 1
   *   are honoured (one retry every 1 / rate seconds); a rate of zero or
   *   below never grants a retry.
   */
  constructor(maxPerSecond = 10) {
    this.#maxRate = maxPerSecond;
    // A bucket that holds less than one token could never grant a retry,
    // so fractional rates still get room for a single token.
    this.#capacity = maxPerSecond > 0 ? Math.max(1, maxPerSecond) : 0;
    this.#tokens = this.#capacity;
    // performance.now() is monotonic; Date.now() can jump backwards when the
    // system clock is adjusted, which would silently drain the bucket.
    this.#lastRefill = performance.now();
  }

  /**
   * Try to take one retry token.
   * @returns {boolean} true when a retry is allowed, false when the budget
   *   is exhausted.
   */
  acquire() {
    const now = performance.now();
    const elapsed = (now - this.#lastRefill) / 1000;
    this.#lastRefill = now;
    // Credit the tokens earned since the previous call, never exceeding
    // the bucket capacity.
    this.#tokens = Math.min(this.#capacity, this.#tokens + elapsed * this.#maxRate);
    if (this.#tokens >= 1) {
      this.#tokens--;
      this.consumed++;
      return true;
    }
    this.rejected++;
    return false;
  }
}
// Single budget instance that solveWithRetry consults before every retry.
const retryBudget = new RetryBudget(10);
// Error codes treated as retryable when returned while polling. At submit
// time any code that is not in PERMANENT is retried as well.
const RETRYABLE = new Set([
  "ERROR_NO_SLOT_AVAILABLE",
  "ERROR_TOO_MUCH_REQUESTS",
  "ERROR_CAPTCHA_UNSOLVABLE",
]);
// Error codes that make submitAndPoll throw a non-retryable
// "Permanent: ..." Error at submit time.
const PERMANENT = new Set([
  "ERROR_WRONG_USER_KEY",
  "ERROR_KEY_DOES_NOT_EXIST",
  "ERROR_ZERO_BALANCE",
]);
/**
 * Full-jitter exponential backoff.
 *
 * @param {number} attempt Zero-based attempt number; the ceiling doubles
 *   with each increment.
 * @param {number} base Ceiling for attempt 0, in seconds.
 * @param {number} maxDelay Upper bound for the ceiling, in seconds.
 * @returns {number} A random delay between 0 and the capped ceiling.
 */
function backoffWithJitter(attempt, base = 1, maxDelay = 30) {
  const uncapped = base * Math.pow(2, attempt);
  const ceiling = Math.min(uncapped, maxDelay);
  return Math.random() * ceiling;
}
/**
 * Submit one task and poll for its result, classifying every failure.
 *
 * Posts the task to SUBMIT_URL, then polls RESULT_URL every five seconds,
 * up to 60 times.
 *
 * @param {Object} params Task parameters; `key` and `json` are added first,
 *   so same-named entries in `params` override them.
 * @returns {Promise<string>} The `request` field of the first poll response
 *   whose `status` is 1.
 * @throws {RetryableError} For transport failures (timeouts, network
 *   errors, non-JSON bodies), for retryable or unknown API error codes, and
 *   when polling runs out of attempts.
 * @throws {Error} With a "Permanent: ..." message for error codes that a
 *   retry cannot fix.
 */
async function submitAndPoll(params) {
  // Timeouts, network errors and HTML error pages are exactly what a
  // struggling API produces. Without this wrapper they surface as plain
  // TypeError/SyntaxError objects, which solveWithRetry treats as permanent.
  const requestJson = async (url, init, timeoutMs) => {
    try {
      const response = await fetch(url, {
        ...init,
        // Bound every request so a hung connection cannot stall the worker.
        signal: AbortSignal.timeout(timeoutMs),
      });
      return await response.json();
    } catch (cause) {
      const failure = new RetryableError(`Request failed: ${cause?.message ?? cause}`);
      failure.cause = cause;
      throw failure;
    }
  };

  const body = new URLSearchParams({ key: API_KEY, json: "1", ...params });
  const resp = await requestJson(SUBMIT_URL, { method: "POST", body }, 30000);
  if (resp.status !== 1) {
    if (PERMANENT.has(resp.request)) throw new Error(`Permanent: ${resp.request}`);
    // Listed retryable codes and unknown codes are both retried.
    throw new RetryableError(resp.request);
  }

  const taskId = resp.request;
  for (let i = 0; i < 60; i++) {
    await new Promise((r) => setTimeout(r, 5000));
    // URLSearchParams percent-encodes the key and task id.
    const query = new URLSearchParams({
      key: API_KEY,
      action: "get",
      id: taskId,
      json: "1",
    });
    const poll = await requestJson(`${RESULT_URL}?${query}`, {}, 15000);
    if (poll.request === "CAPCHA_NOT_READY") continue;
    if (poll.status === 1) return poll.request;
    if (RETRYABLE.has(poll.request)) throw new RetryableError(poll.request);
    throw new Error(`Permanent: ${poll.request}`);
  }
  throw new RetryableError("Timeout");
}
/**
 * Error whose `retryable` flag tells solveWithRetry that another attempt
 * is allowed.
 */
class RetryableError extends Error {
  retryable = true;

  constructor(msg) {
    super(msg);
  }
}
/**
 * Solve a CAPTCHA with retry storm prevention.
 *
 * The first attempt is sent immediately. Every later attempt must first
 * obtain a token from the shared `retryBudget` and then waits for a
 * jittered exponential backoff before resubmitting.
 *
 * @param {Object} params Task parameters forwarded to submitAndPoll.
 * @param {number} maxAttempts Total number of tries, counting the first.
 * @returns {Promise<string>} The value resolved by submitAndPoll.
 * @throws {Error} When the retry budget is exhausted or every attempt
 *   failed with a retryable error; the last retryable error is attached as
 *   `cause`. Errors without a truthy `retryable` flag are rethrown
 *   unchanged.
 */
async function solveWithRetry(params, maxAttempts = 3) {
  let lastError;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    if (attempt > 0) {
      if (!retryBudget.acquire()) {
        throw new Error(
          `Retry budget exhausted: consumed=${retryBudget.consumed}`,
          { cause: lastError },
        );
      }
      const delay = backoffWithJitter(attempt);
      console.log(`[RETRY] Attempt ${attempt + 1}, waiting ${delay.toFixed(1)}s`);
      await new Promise((r) => setTimeout(r, delay * 1000));
    }
    try {
      return await submitAndPoll(params);
    } catch (e) {
      if (!e.retryable) throw e;
      lastError = e;
    }
  }
  // lastError is still undefined when maxAttempts is 0 (the loop never
  // ran); optional chaining avoids a TypeError in that case.
  const reason = lastError?.message ?? "no attempt was made";
  throw new Error(`All ${maxAttempts} attempts failed: ${reason}`, { cause: lastError });
}
// Usage
// Top-level await: this snippet must run as an ES module. It submits one
// Turnstile task through solveWithRetry.
const token = await solveWithRetry({
  method: "turnstile",
  sitekey: "0x4XXXXXXXXXXXXXXXXX",
  pageurl: "https://example.com/login",
});
Backoff Strategies Compared
| Strategy | Formula | Storm risk | Convergence |
|---|---|---|---|
| Fixed delay | delay = 5s | High — all retry together | None |
| Linear backoff | delay = attempt × 5s | Medium — still clustered | Slow |
| Exponential | delay = 2^attempt × base | Medium | Fast |
| Exponential + full jitter | random(0, 2^attempt × base) | Low — randomized | Fast |
| Decorrelated jitter | min(cap, random(base, prev × 3)) | Lowest | Fast |
Troubleshooting
| Issue | Cause | Fix |
|---|---|---|
| All workers retry at the same time | No jitter in backoff | Use random.uniform(0, delay) instead of fixed delay |
| Retry budget rejected too many requests | Rate too low for traffic | Increase max_retries_per_second proportional to worker count |
| Permanent errors trigger retries | No error classification | Classify ERROR_WRONG_USER_KEY and ERROR_ZERO_BALANCE as permanent |
| Retries succeed but token expired | Too many retries took too long | Reduce max attempts; solve fresh instead of retrying stale tasks |
| System recovers slowly after outage | Backoff delays too long | Cap max delay at 30s; reduce base delay |
FAQ
What's the right retry budget for my system?
Start with 10–20% of your normal request rate. If you normally send 100 requests/second, allow 10–20 retries/second. Monitor rejection rates and adjust. Too generous defeats the purpose; too strict causes unnecessary failures.
Should I retry ERROR_CAPTCHA_UNSOLVABLE?
Yes — this error usually means the specific attempt failed, not that the CAPTCHA is impossible. Retry with a new submission. If the same CAPTCHA fails 3+ times, it may genuinely be unsolvable.
How does this differ from a circuit breaker?
Retry storm prevention limits how fast retries happen. A circuit breaker stops all requests when failures exceed a threshold. Use both together: the circuit breaker prevents sending requests to a known-failed service, and retry storm prevention controls the rate when the service is degraded but not fully down.
Next Steps
Protect your CAPTCHA pipeline from retry storms — get your CaptchaAI API key and implement coordinated retry limits.
Related guides:
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In
No comments yet.