R users collecting data from web sources — government portals, research databases, public registries — frequently encounter CAPTCHAs that block automated access. CaptchaAI's HTTP API integrates with R's httr2 and httr packages, letting you solve CAPTCHAs directly from your data collection scripts.
This guide covers reCAPTCHA v2, Cloudflare Turnstile, and image CAPTCHA solving with R functions you can embed in any scraping or data pipeline.
Why R for CAPTCHA-Protected Data Collection
- Research workflows — R is the standard tool in academia, data science, and policy research
- rvest + httr2 — powerful web scraping ecosystem already in place
- Reproducibility — R scripts document exact data collection procedures
- Statistical integration — process collected data immediately with tidyverse
- RMarkdown — combine data collection, CAPTCHA solving, and analysis in one document
Prerequisites
install.packages(c("httr2", "httr", "jsonlite", "rvest", "base64enc"))
- R 4.0+
- CaptchaAI API key (get one here)
Core Solver Functions
Using httr2 (Recommended)
library(httr2)
library(jsonlite)
#' Send a new CAPTCHA task to the CaptchaAI `in.php` endpoint
#'
#' @param api_key CaptchaAI API key
#' @param params Named list of task-specific form fields
#' @return Task ID string reported by the API
submit_captcha <- function(api_key, params) {
  # Credentials and the JSON flag come first, task fields are appended.
  form_fields <- c(list(key = api_key, json = 1), params)

  req <- request("https://ocr.captchaai.com/in.php")
  req <- req_method(req, "POST")
  req <- req_body_form(req, !!!form_fields)
  req <- req_timeout(req, 30)

  reply <- resp_body_json(req_perform(req))

  # status == 1 means the task was accepted; otherwise `request` holds the
  # error code returned by the service.
  if (reply$status == 1) {
    return(reply$request)
  }
  stop(paste("Submit failed:", reply$request))
}
#' Wait for a submitted CAPTCHA task to be solved
#'
#' Sleeps `interval` seconds, queries `res.php`, and repeats until the task
#' is solved, the API reports an error, or `max_wait` seconds have elapsed.
#'
#' @param api_key CaptchaAI API key
#' @param task_id Task ID from submit
#' @param max_wait Maximum seconds to wait (default 300)
#' @param interval Poll interval in seconds (default 5)
#' @return Solution string
poll_captcha <- function(api_key, task_id, max_wait = 300, interval = 5) {
  give_up_at <- Sys.time() + max_wait

  # The query never changes between polls, so build the request once.
  result_req <- req_url_query(
    request("https://ocr.captchaai.com/res.php"),
    key = api_key,
    action = "get",
    id = task_id,
    json = 1
  )

  repeat {
    if (Sys.time() >= give_up_at) {
      break
    }
    Sys.sleep(interval)

    answer <- resp_body_json(req_perform(result_req))

    # "CAPCHA_NOT_READY" (sic) is the API's still-working marker.
    if (answer$request != "CAPCHA_NOT_READY") {
      if (answer$status == 1) {
        return(answer$request)
      }
      stop(paste("Solve failed:", answer$request))
    }
  }

  stop("Timeout: CAPTCHA not solved within time limit")
}
Solving reCAPTCHA v2
#' Solve a reCAPTCHA v2 challenge end to end
#'
#' Submits a `userrecaptcha` task, then polls until the token is available,
#' reporting progress with `message()`.
#'
#' @param api_key CaptchaAI API key
#' @param site_url Page URL containing the CAPTCHA
#' @param sitekey Google sitekey
#' @return reCAPTCHA token string
solve_recaptcha_v2 <- function(api_key, site_url, sitekey) {
  task_params <- list(
    method = "userrecaptcha",
    googlekey = sitekey,
    pageurl = site_url
  )

  message("Submitting reCAPTCHA v2 task...")
  id <- submit_captcha(api_key, task_params)
  message(paste("Task ID:", id))

  message("Polling for solution...")
  solved <- poll_captcha(api_key, id)

  # Only a prefix of the token is logged; the full token is returned.
  message(paste("Solved! Token:", substr(solved, 1, 50), "..."))
  solved
}
# Usage: solve the reCAPTCHA v2 widget on a login page
api_key <- "YOUR_API_KEY"
token <- solve_recaptcha_v2(
  api_key = api_key,
  site_url = "https://example.com/login",
  sitekey = "6Le-wvkSAAAAAPBMRTvw0Q4Muexq9bi0DJwx_mJ-"
)
Solving Cloudflare Turnstile
#' Solve a Cloudflare Turnstile challenge
#'
#' @param api_key CaptchaAI API key
#' @param site_url Page URL containing the Turnstile widget
#' @param sitekey Turnstile sitekey taken from the widget's `data-sitekey`
#' @return Turnstile token string
solve_turnstile <- function(api_key, site_url, sitekey) {
  # The widget key must go in `sitekey`: the form field `key` is already
  # used for the account API key, and sending both under the same name
  # produces a request with two conflicting `key` fields.
  task_id <- submit_captcha(api_key, list(
    method = "turnstile",
    sitekey = sitekey,
    pageurl = site_url
  ))
  poll_captcha(api_key, task_id)
}
# Usage: solve the Turnstile widget on a form page
token <- solve_turnstile(
  api_key = "YOUR_API_KEY",
  site_url = "https://example.com/form",
  sitekey = "0x4AAAAAAAB5..."
)
Solving Image CAPTCHAs
library(base64enc)
#' Read the text of an image CAPTCHA stored on disk
#'
#' The image is base64-encoded and submitted with the `base64` method.
#'
#' @param api_key CaptchaAI API key
#' @param image_path Path to CAPTCHA image file
#' @return Decoded text string
solve_image_captcha <- function(api_key, image_path) {
  found <- file.exists(image_path)
  if (!found) {
    stop(paste("File not found:", image_path))
  }

  task_params <- list(
    method = "base64",
    body = base64encode(image_path)
  )
  poll_captcha(api_key, submit_captcha(api_key, task_params))
}
#' Solve image CAPTCHA from URL
#'
#' Downloads the image to a temporary file, solves it with
#' `solve_image_captcha()`, and removes the temporary file afterwards.
#'
#' @param api_key CaptchaAI API key
#' @param image_url URL of the CAPTCHA image
#' @return Decoded text string
solve_image_from_url <- function(api_key, image_url) {
  tmp <- tempfile(fileext = ".png")
  # Register cleanup before downloading so a failed or partial download
  # does not leave the file behind; add = TRUE keeps any other handlers.
  on.exit(unlink(tmp), add = TRUE)
  download.file(image_url, tmp, mode = "wb", quiet = TRUE)
  solve_image_captcha(api_key, tmp)
}
# Usage: solve a local image CAPTCHA and print the recognised text
text <- solve_image_captcha(api_key = "YOUR_API_KEY", image_path = "captcha.png")
cat("CAPTCHA text:", text, "\n")
Integrating with rvest for Web Scraping
library(rvest)
library(httr2)
#' Extract sitekey from a web page
#'
#' Looks for a CAPTCHA widget in the page markup and, failing that, for a
#' `sitekey` assignment inside inline scripts.
#'
#' @param url Page URL
#' @return List with `type` ("turnstile", "recaptcha" or "unknown") and
#'   `key`, or `NULL` when no sitekey is found
extract_sitekey <- function(url) {
  page <- read_html(url)

  # Turnstile is checked first: its widget also carries data-sitekey, so
  # the generic selector below would otherwise classify it as reCAPTCHA
  # and the Turnstile branch would never be reached.
  turnstile <- html_element(page, ".cf-turnstile[data-sitekey]")
  if (!inherits(turnstile, "xml_missing")) {
    return(list(
      type = "turnstile",
      key = html_attr(turnstile, "data-sitekey")
    ))
  }

  # Any remaining element with data-sitekey is treated as reCAPTCHA.
  recaptcha <- html_element(page, "[data-sitekey]")
  if (!inherits(recaptcha, "xml_missing")) {
    return(list(
      type = "recaptcha",
      key = html_attr(recaptcha, "data-sitekey")
    ))
  }

  # Fall back to inline scripts; the widget type cannot be told from a
  # bare key, so it is reported as "unknown".
  scripts <- html_text2(html_elements(page, "script"))
  for (script in scripts) {
    match <- regmatches(script, regexpr("sitekey['\": ]+['\"]([A-Za-z0-9_-]{20,})", script))
    if (length(match) > 0) {
      key <- sub(".*['\"]([A-Za-z0-9_-]{20,}).*", "\\1", match)
      return(list(type = "unknown", key = key))
    }
  }

  NULL
}
#' Scrape a CAPTCHA-protected page
#'
#' Detects the CAPTCHA on `url`, solves it, and POSTs the token back to the
#' same URL in the form field that the detected widget type uses.
#'
#' @param api_key CaptchaAI API key
#' @param url Page URL to scrape
#' @return Parsed HTML document
scrape_protected_page <- function(api_key, url) {
  # Step 1: Extract sitekey
  captcha_info <- extract_sitekey(url)
  if (is.null(captcha_info)) {
    message("No CAPTCHA found, scraping directly")
    return(read_html(url))
  }
  message(paste("Found", captcha_info$type, "CAPTCHA"))

  # Step 2: Solve CAPTCHA (unsupported types stop here)
  token <- switch(captcha_info$type,
    "recaptcha" = solve_recaptcha_v2(api_key, url, captcha_info$key),
    "turnstile" = solve_turnstile(api_key, url, captcha_info$key),
    stop(paste("Unsupported CAPTCHA type:", captcha_info$type))
  )

  # Step 3: Submit form with token. Each widget posts its token under its
  # own field name, so a Turnstile token must not go in the reCAPTCHA field.
  token_field <- switch(captcha_info$type,
    "recaptcha" = "g-recaptcha-response",
    "turnstile" = "cf-turnstile-response"
  )
  form <- list(token)
  names(form) <- token_field

  resp <- request(url) |>
    req_method("POST") |>
    req_body_form(!!!form) |>
    req_perform()

  read_html(resp_body_string(resp))
}
Research Data Collection Pipeline
Complete example: collecting data from a CAPTCHA-protected government portal.
library(tidyverse)
library(httr2)
#' Collect records from a CAPTCHA-protected search portal
#'
#' For each search term a fresh reCAPTCHA token is solved and posted to
#' `<base_url>/search`; the `table.results` table in the response is parsed
#' and tagged with the term and collection time. A failure for one term is
#' reported as a warning and does not stop the remaining terms.
#'
#' @param api_key CaptchaAI API key
#' @param search_terms Character vector of search terms
#' @param base_url Portal base URL
#' @param sitekey CAPTCHA sitekey
#' @return Tibble of collected records
collect_records <- function(api_key, search_terms, base_url, sitekey) {
  search_url <- paste0(base_url, "/search")

  # Collect one result per term and bind once at the end instead of
  # growing a tibble inside the loop.
  per_term <- vector("list", length(search_terms))

  for (i in seq_along(search_terms)) {
    term <- search_terms[[i]]
    message(paste("Searching for:", term))

    result <- tryCatch({
      # Solve CAPTCHA
      token <- solve_recaptcha_v2(api_key, base_url, sitekey)

      # Submit search with token
      resp <- request(search_url) |>
        req_method("POST") |>
        req_body_form(
          query = term,
          `g-recaptcha-response` = token
        ) |>
        req_perform()

      # Parse results
      page <- read_html(resp_body_string(resp))
      table_node <- html_element(page, "table.results")
      if (inherits(table_node, "xml_missing")) {
        # Fail with a readable reason rather than a method-dispatch error.
        stop("No 'table.results' element in response")
      }
      rows <- html_table(table_node)

      found <- nrow(rows)
      if (found > 0) {
        rows$search_term <- term
        rows$collected_at <- Sys.time()
      }
      message(paste(" Found", found, "records"))

      # Polite delay between requests
      Sys.sleep(2)

      # Empty tables contribute nothing to the combined result.
      if (found > 0) rows else NULL
    }, error = function(e) {
      warning(paste("Failed for term:", term, "-", conditionMessage(e)))
      NULL
    })

    # Assign only non-NULL results; assigning NULL would drop the slot.
    if (!is.null(result)) {
      per_term[[i]] <- result
    }
  }

  # bind_rows() skips the NULL slots left by empty or failed terms.
  bind_rows(tibble(), per_term)
}
# Usage: run the pipeline for three search terms
records <- collect_records(
  api_key = "YOUR_API_KEY",
  search_terms = c("term1", "term2", "term3"),
  base_url = "https://portal.example.gov",
  sitekey = "6Le-wvkSAAAAAPBMRTvw0Q4Muexq9bi0DJwx_mJ-"
)

# Save results and report how many rows were written
write_csv(records, "collected_data.csv")
message(paste("Collected", nrow(records), "total records"))
Check Balance
#' Query the account balance
#'
#' @param api_key CaptchaAI API key
#' @return Balance as a numeric value
check_balance <- function(api_key) {
  balance_req <- req_url_query(
    request("https://ocr.captchaai.com/res.php"),
    key = api_key,
    action = "getbalance",
    json = 1
  )
  reply <- resp_body_json(req_perform(balance_req))

  # The API returns the amount in the `request` field.
  as.numeric(reply$request)
}
# Usage: print the current balance with two decimals
balance <- check_balance(api_key = "YOUR_API_KEY")
cat(sprintf("Balance: $%.2f\n", balance))
Error Handling with Retry
#' Run a solver, retrying transient CaptchaAI errors with backoff
#'
#' Errors whose message contains `ERROR_NO_SLOT_AVAILABLE` or
#' `ERROR_CAPTCHA_UNSOLVABLE` are retried after an exponentially growing
#' delay with random jitter; any other error is re-raised immediately.
#'
#' @param api_key CaptchaAI API key, passed as the first argument of
#'   `solve_fn`
#' @param solve_fn Solver function, called as `solve_fn(api_key, ...)`
#' @param ... Further arguments forwarded to `solve_fn`
#' @param max_retries Number of retries after the first attempt (default 3)
#' @return The first non-NULL value returned by `solve_fn`
solve_with_retry <- function(api_key, solve_fn, ..., max_retries = 3) {
  retryable <- c("ERROR_NO_SLOT_AVAILABLE", "ERROR_CAPTCHA_UNSOLVABLE")

  for (attempt in seq_len(max_retries + 1)) {
    if (attempt > 1) {
      # Exponential backoff plus up to 2s of jitter.
      delay <- 2^(attempt - 1) + runif(1, 0, 2)
      message(sprintf("Retry %d/%d after %.1fs", attempt - 1, max_retries, delay))
      Sys.sleep(delay)
    }

    result <- tryCatch(
      solve_fn(api_key, ...),
      error = function(e) {
        # Error codes are literal text, so match them as fixed strings;
        # vapply() keeps the result a logical vector for any input.
        is_retryable <- any(vapply(
          retryable,
          grepl,
          logical(1),
          x = conditionMessage(e),
          fixed = TRUE
        ))
        # Re-raise the original condition when it is not transient or the
        # retry budget is exhausted.
        if (!is_retryable || attempt > max_retries) stop(e)
        NULL
      }
    )

    if (!is.null(result)) {
      return(result)
    }
  }

  stop("Max retries exceeded")
}
# Usage: wrap the reCAPTCHA solver; named arguments are forwarded to it
token <- solve_with_retry(
  api_key = "YOUR_API_KEY",
  solve_fn = solve_recaptcha_v2,
  site_url = "https://example.com",
  sitekey = "SITEKEY"
)
Parallel Solving with future
library(future)
library(future.apply)
# Run futures in four background R sessions.
plan(multisession, workers = 4)

#' Solve several reCAPTCHA v2 tasks concurrently
#'
#' @param api_key CaptchaAI API key
#' @param tasks List of tasks, each a list with `url` and `sitekey`
#' @return List with one entry per task: the token, or an "Error: ..."
#'   string when that task failed
parallel_solve <- function(api_key, tasks) {
  # A failure is turned into a string so one bad task does not abort the
  # whole batch.
  solve_one <- function(task) {
    tryCatch(
      solve_recaptcha_v2(api_key, task$url, task$sitekey),
      error = function(e) paste("Error:", e$message)
    )
  }
  future_lapply(tasks, solve_one)
}
# Usage: one entry per protected page, each with its own sitekey
tasks <- list(
  list(url = "https://site-a.com", sitekey = "KEY_A"),
  list(url = "https://site-b.com", sitekey = "KEY_B"),
  list(url = "https://site-c.com", sitekey = "KEY_C")
)
results <- parallel_solve(api_key = "YOUR_API_KEY", tasks = tasks)
Troubleshooting
| Error | Cause | Fix |
|---|---|---|
| `ERROR_WRONG_USER_KEY` | Invalid API key | Verify key at dashboard |
| `ERROR_ZERO_BALANCE` | No funds | Top up account |
| `SSL certificate problem` | CA bundle issue | Update the CA bundle; keep verification enabled with `httr::set_config(httr::config(ssl_verifypeer = TRUE))` |
| `could not find function` | Package not loaded | Run `library(httr2)` |
| `Timeout was reached` | Slow network | Increase `req_timeout()` value |
| `Error in base64encode` | Missing package | `install.packages("base64enc")` |
FAQ
Can I use this in RMarkdown or Quarto?
Yes. Embed the solver functions in a setup chunk and call them in data collection chunks. Results integrate directly into your analysis pipeline.
Which HTTP package should I use?
Use httr2 for new projects — it has a cleaner pipe-based API. Use httr if your existing code already depends on it.
Is this suitable for large-scale data collection?
For small-to-medium research projects, yes. For large-scale production scraping, consider Python or Node.js, which have more robust async capabilities.
How do I store the API key securely?
Use Sys.getenv("CAPTCHAAI_KEY") and set the environment variable in .Renviron. Never hardcode keys in scripts you share.
Related Guides
Unlock CAPTCHA-protected data for your research — get your API key and start collecting.
Discussions (0)
Join the conversation
Sign in to share your opinion.
Sign In
No comments yet.