API Tutorials

Solving CAPTCHAs with R for Research Data Collection

R users collecting data from web sources — government portals, research databases, public registries — frequently encounter CAPTCHAs that block automated access. CaptchaAI's HTTP API integrates with R's httr2 and httr packages, letting you solve CAPTCHAs directly from your data collection scripts.

This guide covers reCAPTCHA v2, Cloudflare Turnstile, and image CAPTCHA solving with R functions you can embed in any scraping or data pipeline.


Why R for CAPTCHA-Protected Data Collection

  • Research workflows — R is the standard tool in academia, data science, and policy research
  • rvest + httr2 — powerful web scraping ecosystem already in place
  • Reproducibility — R scripts document exact data collection procedures
  • Statistical integration — process collected data immediately with tidyverse
  • RMarkdown — combine data collection, CAPTCHA solving, and analysis in one document

Prerequisites

install.packages(c("httr2", "httr", "jsonlite", "rvest", "base64enc"))

Core Solver Functions

library(httr2)
library(jsonlite)

#' Create a solving task on the CaptchaAI `in.php` endpoint
#'
#' The API key and `json = 1` are sent first, followed by every entry of
#' `params`, as a form-encoded POST body.
#'
#' @param api_key CaptchaAI API key
#' @param params Named list of task parameters
#' @return Task ID string
submit_captcha <- function(api_key, params) {
  form_fields <- c(list(key = api_key, json = 1), params)

  submit_req <- request("https://ocr.captchaai.com/in.php")
  submit_req <- req_method(submit_req, "POST")
  submit_req <- req_body_form(submit_req, !!!form_fields)
  submit_req <- req_timeout(submit_req, 30)

  reply <- resp_body_json(req_perform(submit_req))

  # Any status other than 1 is treated as a rejected submission; the
  # `request` field is then reported as the failure reason.
  if (reply$status != 1) {
    stop(paste("Submit failed:", reply$request))
  }

  reply$request
}

#' Poll the CaptchaAI `res.php` endpoint until a task is solved
#'
#' Sleeps `interval` seconds before every poll (including the first), then
#' asks the API for the result of `task_id`. Polling continues while the API
#' answers `CAPCHA_NOT_READY` and the deadline has not passed.
#'
#' @param api_key CaptchaAI API key
#' @param task_id Task ID from submit
#' @param max_wait Maximum seconds to wait (default 300)
#' @param interval Poll interval in seconds (default 5)
#' @return Solution string
poll_captcha <- function(api_key, task_id, max_wait = 300, interval = 5) {
  deadline <- Sys.time() + max_wait

  while (Sys.time() < deadline) {
    Sys.sleep(interval)

    resp <- request("https://ocr.captchaai.com/res.php") |>
      req_url_query(
        key = api_key,
        action = "get",
        id = task_id,
        json = 1
      ) |>
      # Bound every poll, as submit_captcha() does for its request, so a
      # stalled connection cannot block far beyond `max_wait`.
      req_timeout(30) |>
      req_perform()

    data <- resp |> resp_body_json()

    # NOTE(review): "CAPCHA_NOT_READY" is kept exactly as the original code
    # spells it; confirm against the API reference before "correcting" it.
    # isTRUE() keeps the condition scalar even if a field is missing.
    if (isTRUE(data$request == "CAPCHA_NOT_READY")) next

    if (!isTRUE(data$status == 1)) {
      stop(paste("Solve failed:", data$request))
    }

    return(data$request)
  }

  stop("Timeout: CAPTCHA not solved within time limit")
}

Solving reCAPTCHA v2

#' Solve a reCAPTCHA v2 challenge end to end
#'
#' Submits a `userrecaptcha` task, waits for the result with
#' `poll_captcha()`, and reports progress through `message()`.
#'
#' @param api_key CaptchaAI API key
#' @param site_url Page URL containing the CAPTCHA
#' @param sitekey Google sitekey
#' @return reCAPTCHA token string
solve_recaptcha_v2 <- function(api_key, site_url, sitekey) {
  message("Submitting reCAPTCHA v2 task...")
  task_params <- list(
    method = "userrecaptcha",
    googlekey = sitekey,
    pageurl = site_url
  )
  task_id <- submit_captcha(api_key, task_params)
  message(paste("Task ID:", task_id))

  message("Polling for solution...")
  token <- poll_captcha(api_key, task_id)

  # Only the first 50 characters of the token are logged.
  token_preview <- substr(token, 1, 50)
  message(paste("Solved! Token:", token_preview, "..."))

  token
}

# Example: request a reCAPTCHA v2 token for a login page.
# The key and sitekey below are placeholders.
api_key <- "YOUR_API_KEY"
token <- solve_recaptcha_v2(
  api_key = api_key,
  sitekey = "6Le-wvkSAAAAAPBMRTvw0Q4Muexq9bi0DJwx_mJ-",
  site_url = "https://example.com/login"
)

Solving Cloudflare Turnstile

#' Solve a Cloudflare Turnstile challenge
#'
#' Submits a `turnstile` task through `submit_captcha()` and waits for the
#' result with `poll_captcha()`.
#'
#' @param api_key CaptchaAI API key
#' @param site_url Page URL containing the Turnstile widget
#' @param sitekey Turnstile sitekey
#' @return Turnstile token string
solve_turnstile <- function(api_key, site_url, sitekey) {
  # The widget key is sent as `sitekey`. The field name `key` is already
  # used by submit_captcha() for the API key, so reusing it here would put
  # two conflicting `key` fields in the request.
  task_id <- submit_captcha(api_key, list(
    method = "turnstile",
    sitekey = sitekey,
    pageurl = site_url
  ))

  poll_captcha(api_key, task_id)
}

# Example: request a Turnstile token for a form page (placeholder values).
token <- solve_turnstile(
  api_key = "YOUR_API_KEY",
  sitekey = "0x4AAAAAAAB5...",
  site_url = "https://example.com/form"
)

Solving Image CAPTCHAs

library(base64enc)

#' Solve an image CAPTCHA stored in a local file
#'
#' The file is base64-encoded and submitted as a `base64` task; the function
#' stops early when the file does not exist.
#'
#' @param api_key CaptchaAI API key
#' @param image_path Path to CAPTCHA image file
#' @return Decoded text string
solve_image_captcha <- function(api_key, image_path) {
  image_missing <- !file.exists(image_path)
  if (image_missing) {
    stop(paste("File not found:", image_path))
  }

  task_params <- list(
    method = "base64",
    body = base64encode(image_path)
  )
  task_id <- submit_captcha(api_key, task_params)

  poll_captcha(api_key, task_id)
}

#' Solve image CAPTCHA from URL
#'
#' Downloads the image to a temporary `.png` file, solves it with
#' `solve_image_captcha()`, and removes the temporary file on exit.
#'
#' @param api_key CaptchaAI API key
#' @param image_url URL of the CAPTCHA image
#' @return Decoded text string
solve_image_from_url <- function(api_key, image_url) {
  tmp <- tempfile(fileext = ".png")
  # Register cleanup before downloading so a failed or partial download
  # does not leave the temporary file behind.
  on.exit(unlink(tmp), add = TRUE)

  download.file(image_url, tmp, mode = "wb", quiet = TRUE)

  solve_image_captcha(api_key, tmp)
}

# Example: decode a local image and print the recognised text.
text <- solve_image_captcha(api_key = "YOUR_API_KEY", image_path = "captcha.png")
cat("CAPTCHA text:", text, "\n")

Integrating with rvest for Web Scraping

library(rvest)
library(httr2)

#' Extract a CAPTCHA sitekey from a web page
#'
#' Looks for a Cloudflare Turnstile widget first, then for any other element
#' carrying `data-sitekey` (reported as reCAPTCHA), and finally scans inline
#' `<script>` text for a `sitekey` assignment.
#'
#' @param url Page URL
#' @return List with `type` ("turnstile", "recaptcha" or "unknown") and
#'   `key`, or `NULL` when no sitekey is found
extract_sitekey <- function(url) {
  page <- read_html(url)

  # html_element() yields an `xml_missing` object when nothing matches;
  # testing the class keeps the `if` condition a single TRUE/FALSE.
  is_found <- function(node) !inherits(node, "xml_missing")

  # Turnstile must be tested before the generic selector: a
  # `.cf-turnstile[data-sitekey]` element also matches `[data-sitekey]`, so
  # checking the generic selector first would label every Turnstile widget
  # as reCAPTCHA and never reach this branch.
  turnstile <- page |> html_element(".cf-turnstile[data-sitekey]")
  if (is_found(turnstile)) {
    return(list(
      type = "turnstile",
      key = html_attr(turnstile, "data-sitekey")
    ))
  }

  # reCAPTCHA (any remaining element with a data-sitekey attribute)
  recaptcha <- page |> html_element("[data-sitekey]")
  if (is_found(recaptcha)) {
    return(list(
      type = "recaptcha",
      key = html_attr(recaptcha, "data-sitekey")
    ))
  }

  # Check inline scripts for `sitekey: "<20+ key characters>"`-style text
  sitekey_pattern <- "sitekey['\": ]+['\"]([A-Za-z0-9_-]{20,})"
  scripts <- page |> html_elements("script") |> html_text2()
  for (script in scripts) {
    match <- regmatches(script, regexpr(sitekey_pattern, script))
    if (length(match) > 0) {
      # Keep only the key itself (the capture group after the quote).
      key <- sub(".*['\"]([A-Za-z0-9_-]{20,}).*", "\\1", match)
      return(list(type = "unknown", key = key))
    }
  }

  NULL
}

#' Scrape a CAPTCHA-protected page
#'
#' Detects the CAPTCHA on `url` with `extract_sitekey()`, solves it, and
#' POSTs the token back to `url`. Pages without a detectable CAPTCHA are
#' read directly.
#'
#' @param api_key CaptchaAI API key
#' @param url Page URL
#' @return Parsed HTML document of the resulting page
scrape_protected_page <- function(api_key, url) {
  # Step 1: Extract sitekey
  captcha_info <- extract_sitekey(url)
  if (is.null(captcha_info)) {
    message("No CAPTCHA found, scraping directly")
    return(read_html(url))
  }

  message(paste("Found", captcha_info$type, "CAPTCHA"))

  # Step 2: Solve CAPTCHA (unsupported types stop here)
  token <- switch(captcha_info$type,
    "recaptcha" = solve_recaptcha_v2(api_key, url, captcha_info$key),
    "turnstile" = solve_turnstile(api_key, url, captcha_info$key),
    stop(paste("Unsupported CAPTCHA type:", captcha_info$type))
  )

  # Step 3: Submit form with token. Each widget type uses its own form
  # field name, so a Turnstile token must not be sent as
  # `g-recaptcha-response`.
  token_field <- switch(captcha_info$type,
    "recaptcha" = "g-recaptcha-response",
    "turnstile" = "cf-turnstile-response"
  )
  form_fields <- list(token)
  names(form_fields) <- token_field

  # NOTE(review): assumes the protected form posts back to `url` with only
  # the token field; confirm the action URL and fields for the target site.
  resp <- request(url) |>
    req_method("POST") |>
    req_body_form(!!!form_fields) |>
    req_perform()

  read_html(resp |> resp_body_string())
}

Research Data Collection Pipeline

Complete example: collecting data from a CAPTCHA-protected government portal.

library(tidyverse)
library(httr2)

#' Collect records from a CAPTCHA-protected search portal
#'
#' For every search term a fresh reCAPTCHA v2 token is solved, the search
#' form at `<base_url>/search` is submitted, and the `table.results` table of
#' the response is parsed. A failure for one term is reported as a warning
#' and does not stop the remaining terms.
#'
#' @param api_key CaptchaAI API key
#' @param search_terms Character vector of search terms
#' @param base_url Portal base URL
#' @param sitekey CAPTCHA sitekey
#' @return Tibble of collected records, with `search_term` and
#'   `collected_at` columns added
collect_records <- function(api_key, search_terms, base_url, sitekey) {
  search_url <- paste0(base_url, "/search")

  # One slot per term; results are combined once at the end instead of
  # re-copying a growing tibble on every iteration.
  collected <- vector("list", length(search_terms))

  for (i in seq_along(search_terms)) {
    term <- search_terms[[i]]
    message(paste("Searching for:", term))

    tryCatch({
      # Solve CAPTCHA
      token <- solve_recaptcha_v2(api_key, base_url, sitekey)

      # Submit search with token
      resp <- request(search_url) |>
        req_method("POST") |>
        req_body_form(
          query = term,
          `g-recaptcha-response` = token
        ) |>
        req_perform()

      # Parse results
      page <- read_html(resp |> resp_body_string())
      results_table <- page |> html_element("table.results")

      # Fail with a readable reason when the response has no results table
      # (reported through the warning below).
      if (inherits(results_table, "xml_missing")) {
        stop("results table not found in response")
      }

      rows <- html_table(results_table)

      if (nrow(rows) > 0) {
        rows$search_term <- term
        rows$collected_at <- Sys.time()
        collected[[i]] <- rows
      }

      message(paste("  Found", nrow(rows), "records"))

      # Polite delay between requests
      Sys.sleep(2)

    }, error = function(e) {
      warning(paste("Failed for term:", term, "-", conditionMessage(e)))
    })
  }

  # Slots of failed or empty searches are still NULL; drop them.
  collected <- Filter(Negate(is.null), collected)
  if (length(collected) == 0) {
    return(tibble())
  }

  bind_rows(collected)
}

# Example run against a placeholder portal.
records <- collect_records(
  api_key = "YOUR_API_KEY",
  base_url = "https://portal.example.gov",
  sitekey = "6Le-wvkSAAAAAPBMRTvw0Q4Muexq9bi0DJwx_mJ-",
  search_terms = c("term1", "term2", "term3")
)

# Persist the combined results and report how many rows were gathered.
write_csv(records, "collected_data.csv")
total_records <- nrow(records)
message(paste("Collected", total_records, "total records"))

Check Balance

#' Fetch the current CaptchaAI account balance
#'
#' @param api_key CaptchaAI API key
#' @return Account balance as a single numeric value
check_balance <- function(api_key) {
  resp <- request("https://ocr.captchaai.com/res.php") |>
    req_url_query(
      key = api_key,
      action = "getbalance",
      json = 1
    ) |>
    req_timeout(30) |>
    req_perform()

  data <- resp |> resp_body_json()

  # A reply whose `request` field is not a number (for example an error
  # code) is raised as an error instead of being returned as NA.
  balance <- suppressWarnings(as.numeric(data$request))
  if (length(balance) != 1 || is.na(balance)) {
    stop(paste("Balance check failed:", data$request))
  }

  balance
}

# Example: print the remaining account balance.
balance <- check_balance(api_key = "YOUR_API_KEY")
cat(sprintf("Balance: $%.2f\n", balance))

Error Handling with Retry

#' Run a solver function, retrying on transient API errors
#'
#' `solve_fn(api_key, ...)` is attempted up to `max_retries + 1` times.
#' Before each retry the function waits `2^retry + U(0, 2)` seconds. Only
#' errors whose message contains `ERROR_NO_SLOT_AVAILABLE` or
#' `ERROR_CAPTCHA_UNSOLVABLE` are retried; any other error, or a retryable
#' error on the final attempt, is re-raised unchanged.
#'
#' @param api_key CaptchaAI API key, passed as first argument to `solve_fn`
#' @param solve_fn Solver function to call
#' @param ... Further arguments forwarded to `solve_fn`
#' @param max_retries Maximum number of retries after the first attempt
#' @return The first non-NULL value returned by `solve_fn`
solve_with_retry <- function(api_key, solve_fn, ..., max_retries = 3) {
  retryable <- c("ERROR_NO_SLOT_AVAILABLE", "ERROR_CAPTCHA_UNSOLVABLE")

  # TRUE when the message mentions one of the retryable API codes.
  mentions_retryable_code <- function(msg) {
    hits <- vapply(retryable, function(code) grepl(code, msg), logical(1))
    any(hits)
  }

  for (attempt in seq_len(max_retries + 1)) {
    retry_number <- attempt - 1

    # Back off (exponential plus jitter) before every attempt but the first.
    if (retry_number >= 1) {
      delay <- 2^retry_number + runif(1, 0, 2)
      message(sprintf("Retry %d/%d after %.1fs", retry_number, max_retries, delay))
      Sys.sleep(delay)
    }

    result <- tryCatch(
      solve_fn(api_key, ...),
      error = function(e) {
        give_up <- !mentions_retryable_code(e$message) || attempt > max_retries
        if (give_up) {
          stop(e)
        }
        # NULL tells the loop to try again.
        NULL
      }
    )

    if (!is.null(result)) {
      return(result)
    }
  }

  stop("Max retries exceeded")
}

# Example: wrap the reCAPTCHA solver with the default retry budget.
# Named arguments after the solver are forwarded to it unchanged.
token <- solve_with_retry(
  api_key = "YOUR_API_KEY",
  solve_fn = solve_recaptcha_v2,
  site_url = "https://example.com",
  sitekey = "SITEKEY"
)

Parallel Solving with future

library(future)
library(future.apply)

# Select the `multisession` future plan with 4 workers; this must run before
# the future_lapply() call in parallel_solve() below.
plan(multisession, workers = 4)

#' Solve several reCAPTCHA v2 tasks through future_lapply()
#'
#' @param api_key CaptchaAI API key
#' @param tasks List of tasks, each a list with `url` and `sitekey`
#' @return List with one entry per task: the token, or a string starting
#'   with "Error:" when that task failed
parallel_solve <- function(api_key, tasks) {
  # A failed task yields its error text instead of aborting the whole batch.
  solve_one <- function(task) {
    tryCatch(
      solve_recaptcha_v2(api_key, task$url, task$sitekey),
      error = function(e) paste("Error:", e$message)
    )
  }

  future_lapply(tasks, solve_one)
}

# Example: three placeholder sites solved in one batch.
tasks <- list(
  list(sitekey = "KEY_A", url = "https://site-a.com"),
  list(sitekey = "KEY_B", url = "https://site-b.com"),
  list(sitekey = "KEY_C", url = "https://site-c.com")
)

results <- parallel_solve(api_key = "YOUR_API_KEY", tasks = tasks)

Troubleshooting

| Error | Cause | Fix |
|---|---|---|
| ERROR_WRONG_USER_KEY | Invalid API key | Verify key at dashboard |
| ERROR_ZERO_BALANCE | No funds | Top up account |
| SSL certificate problem | CA bundle issue | Update the CA bundle; keep certificate verification on (`httr::set_config(httr::config(ssl_verifypeer = TRUE))`) rather than disabling it |
| could not find function | Package not loaded | Run `library(httr2)` |
| Timeout was reached | Slow network | Increase the `req_timeout()` value |
| Error in base64encode | Missing package | `install.packages("base64enc")` |

FAQ

Can I use this in RMarkdown or Quarto?

Yes. Embed the solver functions in a setup chunk and call them in data collection chunks. Results integrate directly into your analysis pipeline.

Which HTTP package should I use?

Use httr2 for new projects — it has a cleaner pipe-based API. Use httr if your existing code already depends on it.

Is this suitable for large-scale data collection?

For small-to-medium research projects, yes. For large-scale production scraping, consider Python or Node.js which have more robust async capabilities.

How do I store the API key securely?

Use Sys.getenv("CAPTCHAAI_KEY") and set the environment variable in .Renviron. Never hardcode keys in scripts you share.



Unlock CAPTCHA-protected data for your research — get your API key and start collecting.

Discussions (0)

No comments yet.

Related Posts

Use Cases Automated Form Submission with CAPTCHA Handling
Complete guide to automating web form submissions that include CAPTCHA challenges — reCAPTCHA, Turnstile, and image CAPTCHAs with CaptchaAI.

Complete guide to automating web form submissions that include CAPTCHA challenges — re CAPTCHA, Turnstile, and...

Python reCAPTCHA v2 Cloudflare Turnstile
Mar 21, 2026
Explainers Reducing CAPTCHA Solve Costs: 10 Strategies
Cut CAPTCHA solving costs with Captcha AI using 10 practical strategies — from skipping unnecessary solves to batching and caching tokens.

Cut CAPTCHA solving costs with Captcha AI using 10 practical strategies — from skipping unnecessary solves to...

Python reCAPTCHA v2 Cloudflare Turnstile
Mar 11, 2026
Use Cases Supply Chain Monitoring with CAPTCHA Handling
Monitor supply chain data from manufacturer sites, logistics portals, and inventory systems protected by CAPTCHAs using Captcha AI.

Monitor supply chain data from manufacturer sites, logistics portals, and inventory systems protected by CAPTC...

Python reCAPTCHA v2 Cloudflare Turnstile
Jan 15, 2026
Tutorials CAPTCHA Solving Fallback Chains
Implement fallback chains for CAPTCHA solving with Captcha AI.

Implement fallback chains for CAPTCHA solving with Captcha AI. Cascade through solver methods, proxy pools, an...

Automation Python reCAPTCHA v2
Apr 06, 2026
API Tutorials CaptchaAI API Latency Optimization: Faster Solves
Reduce CAPTCHA solve latency with Captcha AI by optimizing poll intervals, connection pooling, prefetching, and proxy selection.

Reduce CAPTCHA solve latency with Captcha AI by optimizing poll intervals, connection pooling, prefetching, an...

Automation Python reCAPTCHA v2
Feb 27, 2026
Use Cases Shipping and Logistics Rate Scraping with CAPTCHA Solving
Scrape shipping rates, tracking data, and logistics information from carrier websites protected by CAPTCHAs using Captcha AI.

Scrape shipping rates, tracking data, and logistics information from carrier websites protected by CAPTCHAs us...

Python reCAPTCHA v2 Cloudflare Turnstile
Jan 25, 2026
API Tutorials Building a Python Wrapper Library for CaptchaAI API
Build a reusable Python wrapper library for the Captcha AI API with type hints, retry logic, context managers, and support for CAPTCHA types.

Build a reusable Python wrapper library for the Captcha AI API with type hints, retry logic, context managers,...

Automation Python reCAPTCHA v2
Jan 31, 2026
API Tutorials Solving CAPTCHAs with Swift and CaptchaAI API
Complete guide to solving re CAPTCHA, Turnstile, and image CAPTCHAs in Swift using Captcha AI's HTTP API with URLSession, async/await, and Alamofire.

Complete guide to solving re CAPTCHA, Turnstile, and image CAPTCHAs in Swift using Captcha AI's HTTP API with...

Automation reCAPTCHA v2 Cloudflare Turnstile
Apr 05, 2026
Getting Started Migrate from CapMonster Cloud to CaptchaAI
Step-by-step guide to migrate from Cap Monster Cloud to Captcha AI — endpoint mapping, parameter changes, and code migration examples.

Step-by-step guide to migrate from Cap Monster Cloud to Captcha AI — endpoint mapping, parameter changes, and...

Python reCAPTCHA v2 Cloudflare Turnstile
Mar 29, 2026
API Tutorials Solving CAPTCHAs with Kotlin and CaptchaAI API
Complete guide to solving re CAPTCHA, Turnstile, and image CAPTCHAs in Kotlin using Captcha AI's HTTP API with Ok Http, Ktor client, and coroutines.

Complete guide to solving re CAPTCHA, Turnstile, and image CAPTCHAs in Kotlin using Captcha AI's HTTP API with...

Automation reCAPTCHA v2 Cloudflare Turnstile
Mar 06, 2026
API Tutorials How to Solve reCAPTCHA v2 Callback Using API
Learn how to solve reCAPTCHA v2 callback implementations using the CaptchaAI API.

Learn how to solve re CAPTCHA v 2 callback implementations using Captcha AI API. Detect the callback function,...

Automation reCAPTCHA v2 Webhooks
Mar 01, 2026
API Tutorials Solve GeeTest v3 CAPTCHA with Python and CaptchaAI
Step-by-step Python tutorial for solving GeeTest v3 slide puzzle CAPTCHAs using the CaptchaAI API.

Step-by-step Python tutorial for solving Gee Test v 3 slide puzzle CAPTCHAs using the Captcha AI API. Includes...

Automation Python Testing
Mar 23, 2026
API Tutorials Case-Sensitive CAPTCHA API Parameter Guide
How to use the regsense parameter for case-sensitive CAPTCHA solving with Captcha AI.

How to use the regsense parameter for case-sensitive CAPTCHA solving with Captcha AI. Covers when to use, comm...

Python Web Scraping Image OCR
Apr 09, 2026