feat(eval): NopeCHA CAPTCHA solver integration (#537)

* feat(eval): show mean score instead of pass/fail in report and viewer

* feat(eval): integrate NopeCHA CAPTCHA solver into eval pipeline

Add CAPTCHA detection and waiting so screenshots capture post-solve state.
Run headed with xvfb on CI since headless breaks extension content scripts.

- Add CaptchaWaiter module (detect reCAPTCHA/hCaptcha/Turnstile, poll until solved)
- Add optional `captcha` config block to EvalConfigSchema
- Wait for CAPTCHA solve before screenshot in single-agent and orchestrator-executor
- Patch NopeCHA manifest with API key before launching workers
- Fix CAPTCHA_EXT_DIR path (was pointing one level too high)
- Remove --incognito (extensions don't run in incognito; fresh user-data-dir isolates)
- CI: install xvfb, run headed via xvfb-run, pass NOPECHA_API_KEY secret
This commit is contained in:
shivammittal274
2026-03-24 00:14:16 +05:30
committed by GitHub
parent 1270b5b55c
commit 0babc05077
11 changed files with 503 additions and 5 deletions

View File

@@ -43,6 +43,9 @@ jobs:
working-directory: packages/browseros-agent
run: bun install --ignore-scripts && bun run build:agent-sdk
- name: Install xvfb
run: sudo apt-get update && sudo apt-get install -y xvfb
- name: Install captcha solver extension
working-directory: packages/browseros-agent/apps/eval
run: |
@@ -55,11 +58,12 @@ jobs:
env:
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
BROWSEROS_BINARY: /usr/bin/browseros
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
run: |
echo "Running eval with config: $EVAL_CONFIG"
bun run src/index.ts -c "$EVAL_CONFIG"
xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts -c "$EVAL_CONFIG"
- name: Upload runs to R2
if: success()

View File

@@ -16,7 +16,10 @@
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": true
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",

View File

@@ -11,6 +11,7 @@
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import { Browser } from '@browseros/server/browser'
import { CdpBackend } from '@browseros/server/browser/backends/cdp'
import { CaptchaWaiter } from '../../capture/captcha-waiter'
import { DEFAULT_TIMEOUT_MS } from '../../constants'
import type {
EvalConfig,
@@ -161,6 +162,13 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
const browser = new Browser(cdp, CONTROLLER_STUB)
capture.screenshot.setBrowser(browser)
const captchaWaiter = config.captcha
? new CaptchaWaiter({
waitTimeoutMs: config.captcha.wait_timeout_ms,
pollIntervalMs: config.captcha.poll_interval_ms,
})
: null
try {
// Build capture callbacks (same pattern as single-agent.ts)
const callbacks: ExecutorCallbacks = {
@@ -172,6 +180,12 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
},
onToolCallFinish: async () => {
try {
if (captchaWaiter) {
await captchaWaiter.waitIfCaptchaPresent(
browser,
capture.getActivePageId(),
)
}
const screenshotNum = await capture.screenshot.capture(
capture.getActivePageId(),
)

View File

@@ -7,6 +7,7 @@ import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import { Browser } from '@browseros/server/browser'
import { CdpBackend } from '@browseros/server/browser/backends/cdp'
import { registry } from '@browseros/server/tools/registry'
import { CaptchaWaiter } from '../capture/captcha-waiter'
import { DEFAULT_TIMEOUT_MS } from '../constants'
import type { EvalConfig, TaskMetadata } from '../types'
import { resolveProviderConfig } from '../utils/resolve-provider-config'
@@ -78,6 +79,13 @@ export class SingleAgentEvaluator implements AgentEvaluator {
}
: undefined
const captchaWaiter = config.captcha
? new CaptchaWaiter({
waitTimeoutMs: config.captcha.wait_timeout_ms,
pollIntervalMs: config.captcha.poll_interval_ms,
})
: null
let agent: AiSdkAgent | null = null
try {
@@ -112,6 +120,12 @@ export class SingleAgentEvaluator implements AgentEvaluator {
experimental_onToolCallFinish: async () => {
try {
if (captchaWaiter) {
await captchaWaiter.waitIfCaptchaPresent(
browser,
capture.getActivePageId(),
)
}
const screenshotNum = await capture.screenshot.capture(
capture.getActivePageId(),
)

View File

@@ -0,0 +1,115 @@
import type { Browser } from '@browseros/server/browser'
/** Outcome of a single `CaptchaWaiter.waitIfCaptchaPresent` call. */
export interface CaptchaWaitResult {
/** True when the initial check found a CAPTCHA widget; false when none was found or that check failed. */
detected: boolean
/** Provider identified by the initial check, or 'none' when nothing was detected. */
type: 'recaptcha' | 'hcaptcha' | 'turnstile' | 'none'
/** True only when the provider's response field was seen with a non-empty value; stays false on timeout or when the widget simply disappeared. */
solved: boolean
/** Milliseconds elapsed between entering the call and returning from it. */
waitDurationMs: number
}
/** Timing knobs for CaptchaWaiter's polling loop. */
interface CaptchaWaiterConfig {
/** Polling budget in milliseconds, measured from the start of the call; once exceeded, no further poll is started. */
waitTimeoutMs: number
/** Delay in milliseconds between two consecutive detection checks. */
pollIntervalMs: number
}
// JavaScript source evaluated inside the page (via browser.evaluate) to find a CAPTCHA.
// Providers are probed in a fixed order - reCAPTCHA, hCaptcha, Cloudflare Turnstile -
// and the first one whose iframe is present wins. A provider counts as "solved" when
// its response field exists and holds a non-empty value. With no matching iframe the
// script yields { type: 'none', solved: false }.
// NOTE(review): the reCAPTCHA selector matches any iframe whose src contains
// "recaptcha" - presumably that includes invisible/v3 badge iframes, which could cause
// a full-timeout wait on pages that never show a challenge. Confirm against real pages.
const DETECTION_SCRIPT = `(() => {
const recaptcha = document.querySelector('iframe[src*="recaptcha"]')
if (recaptcha) {
const response = document.getElementById('g-recaptcha-response')
return { type: 'recaptcha', solved: !!(response && response.value) }
}
const hcaptcha = document.querySelector('iframe[src*="hcaptcha"]')
if (hcaptcha) {
const response = document.querySelector('[name="h-captcha-response"]')
return { type: 'hcaptcha', solved: !!(response && response.value) }
}
const turnstile = document.querySelector('iframe[src*="challenges.cloudflare.com"]')
if (turnstile) {
const response = document.querySelector('[name="cf-turnstile-response"]')
return { type: 'turnstile', solved: !!(response && response.value) }
}
return { type: 'none', solved: false }
})()`
/**
 * Delays the caller until a CAPTCHA on the given page is solved, disappears,
 * or the configured timeout elapses.
 *
 * The waiter never throws: any failure while talking to the browser is
 * treated as "nothing more to wait for" so that screenshot capture can
 * proceed.
 */
export class CaptchaWaiter {
  /** Provider names the in-page detection script is allowed to report. */
  private static readonly KNOWN_PROVIDERS: ReadonlySet<string> = new Set([
    'recaptcha',
    'hcaptcha',
    'turnstile',
  ])

  private readonly config: CaptchaWaiterConfig

  /**
   * @param config Polling budget (`waitTimeoutMs`) and cadence (`pollIntervalMs`).
   */
  constructor(config: CaptchaWaiterConfig) {
    this.config = config
  }

  /**
   * Checks the page for a CAPTCHA and, if one is present and unsolved, polls
   * until it is solved, vanishes, or the timeout is reached.
   *
   * @param browser Browser used to evaluate the detection script.
   * @param pageId Page to inspect.
   * @returns What was detected, whether it was solved, and how long the call took.
   *   If the browser fails after a CAPTCHA was already detected, the result
   *   still reports that CAPTCHA (unsolved) rather than pretending none existed.
   */
  async waitIfCaptchaPresent(
    browser: Browser,
    pageId: number,
  ): Promise<CaptchaWaitResult> {
    const start = Date.now()
    // Remembered outside the try so the catch branch can still report it.
    let provider: CaptchaWaitResult['type'] = 'none'
    const report = (solved: boolean): CaptchaWaitResult => ({
      detected: provider !== 'none',
      type: provider,
      solved,
      waitDurationMs: Date.now() - start,
    })

    try {
      const initial = await this.detect(browser, pageId)
      provider = initial.type
      if (provider === 'none') return report(false)
      if (initial.solved) return report(true)

      // Poll until solved, gone, or out of time. Each sleep is clamped to the
      // remaining budget so a large poll interval cannot overshoot the timeout.
      let remaining = this.config.waitTimeoutMs - (Date.now() - start)
      while (remaining > 0) {
        await sleep(Math.min(this.config.pollIntervalMs, remaining))
        const check = await this.detect(browser, pageId)
        if (check.solved || check.type === 'none') return report(check.solved)
        remaining = this.config.waitTimeoutMs - (Date.now() - start)
      }
      return report(false)
    } catch {
      // Best-effort by design: a broken page/connection must not block the
      // caller. Whatever was detected before the failure is preserved.
      return report(false)
    }
  }

  /**
   * Runs the detection script once and normalises its answer.
   * Evaluation errors, non-object values and unrecognised provider names all
   * collapse to `{ type: 'none', solved: false }`.
   */
  private async detect(
    browser: Browser,
    pageId: number,
  ): Promise<{ type: CaptchaWaitResult['type']; solved: boolean }> {
    const result = await browser.evaluate(pageId, DETECTION_SCRIPT)
    const raw: unknown = result.value
    if (result.error || typeof raw !== 'object' || raw === null) {
      return { type: 'none', solved: false }
    }
    const { type, solved } = raw as { type?: unknown; solved?: unknown }
    if (typeof type !== 'string' || !CaptchaWaiter.KNOWN_PROVIDERS.has(type)) {
      return { type: 'none', solved: false }
    }
    // Safe after the membership check above.
    return { type: type as CaptchaWaitResult['type'], solved: solved === true }
  }
}
/** Returns a promise that settles (with no value) once `ms` milliseconds have passed. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms)
  })
}

View File

@@ -1,4 +1,5 @@
export { callMcpTool } from '../utils/mcp-client'
export { CaptchaWaiter } from './captcha-waiter'
export { CaptureContext } from './context'
export { MessageLogger } from './message-logger'
export { ScreenshotCapture } from './screenshot'

View File

@@ -14,7 +14,13 @@
* Each worker gets isolated ports: base + workerIndex offset.
*/
import { existsSync, mkdtempSync, rmSync } from 'node:fs'
import {
existsSync,
mkdtempSync,
readFileSync,
rmSync,
writeFileSync,
} from 'node:fs'
import { dirname, join } from 'node:path'
import { fileURLToPath } from 'node:url'
import { type Subprocess, spawn, spawnSync } from 'bun'
@@ -37,7 +43,7 @@ const BROWSEROS_BINARY =
const CONTROLLER_EXT_DIR = join(MONOREPO_ROOT, 'apps/controller-ext/dist')
const CAPTCHA_EXT_DIR = join(
dirname(fileURLToPath(import.meta.url)),
'../../../extensions/nopecha',
'../../extensions/nopecha',
)
export class BrowserOSAppManager {
@@ -149,7 +155,6 @@ export class BrowserOSAppManager {
'--use-mock-keychain',
'--disable-browseros-server',
'--disable-browseros-extensions',
'--incognito',
...(this.headless ? ['--headless=new'] : []),
'--window-size=1440,900',
`--remote-debugging-port=${cdp}`,
@@ -319,4 +324,22 @@ export class BrowserOSAppManager {
})
return (result.stdout?.toString().trim() ?? '').length > 0
}
/**
 * Writes the given API key into the NopeCHA extension's manifest.json
 * (under `nopecha.key`, keeping any other `nopecha` entries).
 * Intended to run once, before any worker is launched, because all workers
 * share the same extension directory. Does nothing but log when the manifest
 * file is absent.
 */
static patchNopechaApiKey(apiKey: string): void {
  const target = join(CAPTCHA_EXT_DIR, 'manifest.json')
  if (existsSync(target)) {
    const parsed = JSON.parse(readFileSync(target, 'utf-8'))
    parsed.nopecha = Object.assign({}, parsed.nopecha, { key: apiKey })
    const serialized = JSON.stringify(parsed, null, 2)
    writeFileSync(target, serialized)
    console.log('[BROWSEROS] NopeCHA API key patched')
  } else {
    console.log(
      '[BROWSEROS] NopeCHA extension not found, skipping API key patch',
    )
  }
}
}

View File

@@ -93,6 +93,15 @@ export class ParallelExecutor {
BrowserOSAppManager.buildExtensions()
}
// Patch NopeCHA API key before launching any workers
const captchaConfig = this.config.config.captcha
if (captchaConfig) {
const apiKey = process.env[captchaConfig.api_key_env]
if (apiKey) {
BrowserOSAppManager.patchNopechaApiKey(apiKey)
}
}
this.queue = new TaskQueue(tasks)
const totalTasks = tasks.length

View File

@@ -71,6 +71,13 @@ export const EvalConfigSchema = z.object({
grader_api_key_env: z.string().optional(),
grader_base_url: z.string().url().optional(),
timeout_ms: z.number().int().min(30000).max(3600000).optional(),
captcha: z
.object({
api_key_env: z.string().default('NOPECHA_API_KEY'),
wait_timeout_ms: z.number().int().min(1000).max(120000).default(30000),
poll_interval_ms: z.number().int().min(200).max(5000).default(1000),
})
.optional(),
})
export type SingleAgentConfig = z.infer<typeof SingleAgentConfigSchema>

View File

@@ -0,0 +1,136 @@
import { beforeEach, describe, expect, it, mock } from 'bun:test'
import { CaptchaWaiter } from '../../src/capture/captcha-waiter'
/**
 * Builds a stand-in browser whose `evaluate` replays the given results in
 * order, one per call. Once the list is exhausted the final entry is
 * returned for every further call.
 */
function createMockBrowser(
  evaluateResults: Array<{ value?: unknown; error?: string }>,
) {
  let served = 0
  const evaluate = mock(async (_page: number, _expr: string) => {
    const scriptedResult = evaluateResults[served]
    served += 1
    return scriptedResult ?? evaluateResults.at(-1)!
  })
  return { evaluate } as any
}
// Unit tests for CaptchaWaiter driven by a scripted mock browser. Each array entry
// passed to createMockBrowser is the answer to one evaluate() call; the last entry
// repeats once the list runs out.
describe('CaptchaWaiter', () => {
let waiter: CaptchaWaiter
// Fresh waiter per test: 5 s budget, 100 ms between polls.
beforeEach(() => {
waiter = new CaptchaWaiter({
waitTimeoutMs: 5000,
pollIntervalMs: 100,
})
})
// No CAPTCHA on the first check: a single evaluate() call and no polling.
it('returns immediately when no CAPTCHA detected', async () => {
const browser = createMockBrowser([
{ value: { type: 'none', solved: false } },
])
const result = await waiter.waitIfCaptchaPresent(browser, 1)
expect(result.detected).toBe(false)
expect(result.type).toBe('none')
expect(result.solved).toBe(false)
expect(browser.evaluate).toHaveBeenCalledTimes(1)
})
// CAPTCHA present but already solved on the first check: no polling either.
it('returns immediately when CAPTCHA already solved', async () => {
const browser = createMockBrowser([
{ value: { type: 'recaptcha', solved: true } },
])
const result = await waiter.waitIfCaptchaPresent(browser, 1)
expect(result.detected).toBe(true)
expect(result.type).toBe('recaptcha')
expect(result.solved).toBe(true)
expect(browser.evaluate).toHaveBeenCalledTimes(1)
})
// Initial check plus two polls; the second poll reports solved.
it('polls until CAPTCHA is solved', async () => {
const browser = createMockBrowser([
{ value: { type: 'hcaptcha', solved: false } },
{ value: { type: 'hcaptcha', solved: false } },
{ value: { type: 'hcaptcha', solved: true } },
])
const result = await waiter.waitIfCaptchaPresent(browser, 1)
expect(result.detected).toBe(true)
expect(result.type).toBe('hcaptcha')
expect(result.solved).toBe(true)
expect(browser.evaluate).toHaveBeenCalledTimes(3)
})
// The widget vanishing ends the wait; the result keeps the originally detected
// type but is not marked solved.
it('polls until CAPTCHA disappears', async () => {
const browser = createMockBrowser([
{ value: { type: 'turnstile', solved: false } },
{ value: { type: 'turnstile', solved: false } },
{ value: { type: 'none', solved: false } },
])
const result = await waiter.waitIfCaptchaPresent(browser, 1)
expect(result.detected).toBe(true)
expect(result.type).toBe('turnstile')
expect(result.solved).toBe(false)
expect(browser.evaluate).toHaveBeenCalledTimes(3)
})
// The single unsolved answer repeats for every poll, so the 300 ms budget runs out.
// The 250 ms lower bound leaves slack for timer jitter.
it('times out if CAPTCHA never solves', async () => {
const shortWaiter = new CaptchaWaiter({
waitTimeoutMs: 300,
pollIntervalMs: 100,
})
const browser = createMockBrowser([
{ value: { type: 'recaptcha', solved: false } },
])
const result = await shortWaiter.waitIfCaptchaPresent(browser, 1)
expect(result.detected).toBe(true)
expect(result.type).toBe('recaptcha')
expect(result.solved).toBe(false)
expect(result.waitDurationMs).toBeGreaterThanOrEqual(250)
})
// evaluate() resolving with an `error` field is treated as "no CAPTCHA".
it('handles browser.evaluate errors gracefully', async () => {
const browser = createMockBrowser([{ error: 'Page crashed' }])
const result = await waiter.waitIfCaptchaPresent(browser, 1)
expect(result.detected).toBe(false)
expect(result.type).toBe('none')
expect(result.solved).toBe(false)
})
// evaluate() rejecting on the first call must not propagate out of the waiter.
it('handles browser.evaluate throwing', async () => {
const browser = {
evaluate: mock(async () => {
throw new Error('Connection lost')
}),
} as any
const result = await waiter.waitIfCaptchaPresent(browser, 1)
expect(result.detected).toBe(false)
expect(result.type).toBe('none')
expect(result.solved).toBe(false)
})
// Two 100 ms polls happen before the solved answer; 150 ms is a jitter-tolerant floor.
it('tracks wait duration', async () => {
const browser = createMockBrowser([
{ value: { type: 'recaptcha', solved: false } },
{ value: { type: 'recaptcha', solved: false } },
{ value: { type: 'recaptcha', solved: true } },
])
const result = await waiter.waitIfCaptchaPresent(browser, 1)
expect(result.waitDurationMs).toBeGreaterThanOrEqual(150)
})
})

View File

@@ -0,0 +1,172 @@
/**
* End-to-end test for CAPTCHA solver integration.
*
* Runs a single eval task against Google's reCAPTCHA demo page:
* 1. Launches BrowserOS (headed) with NopeCHA extension loaded
* 2. Agent navigates to reCAPTCHA demo, fills form
* 3. CaptchaWaiter polls until NopeCHA solves the CAPTCHA
* 4. Screenshot is captured AFTER solve
* 5. Verifies: task completed, screenshots exist, metadata saved
*
* Prerequisites:
* - NOPECHA_API_KEY env var set
* - FIREWORKS_API_KEY env var set (or swap agent config)
* - NopeCHA extension at extensions/nopecha/ (run the install step from CI)
* - BrowserOS binary available
*
* Run:
* bun --env-file=apps/eval/.env.development apps/eval/tests/e2e/captcha-e2e.ts
*/
import { existsSync, readdirSync, readFileSync, rmSync } from 'node:fs'
import { dirname, join } from 'node:path'
import { fileURLToPath } from 'node:url'
import { BrowserOSAppManager } from '../../src/runner/browseros-app-manager'
import { createTaskExecutor } from '../../src/runner/task-executor'
import { EvalConfigSchema } from '../../src/types/config'
import { TaskSchema } from '../../src/types/task'
// Directory containing this script; all relative paths below are resolved from it.
const HERE = dirname(fileURLToPath(import.meta.url))
// Task artifacts (metadata.json, screenshots/) are looked up under <HERE>/results/<query_id>.
const OUTPUT_DIR = join(HERE, 'results')
// Raw eval configuration; validated with EvalConfigSchema.parse() in main().
const EVAL_CONFIG = {
agent: {
type: 'single' as const,
provider: 'openai-compatible' as const,
model: 'accounts/fireworks/models/kimi-k2p5',
// NOTE(review): presumably the NAME of the env var holding the key, not the key itself - confirm against the agent config schema.
apiKey: 'FIREWORKS_API_KEY',
baseUrl: 'https://api.fireworks.ai/inference/v1',
supportsImages: true,
},
dataset: 'inline',
num_workers: 1,
restart_server_per_task: true,
browseros: {
// Placeholder: main() replaces this with app.getServerUrl() before running the task.
server_url: 'http://127.0.0.1:9110',
base_cdp_port: 9010,
base_server_port: 9110,
base_extension_port: 9310,
load_extensions: false,
// Headed run, as stated in this file's header.
headless: false,
},
// Name of the env var that main() reads the NopeCHA key from.
captcha: { api_key_env: 'NOPECHA_API_KEY' },
timeout_ms: 120000,
}
// The single task executed by this script; validated with TaskSchema.parse() in main().
const TASK = {
query_id: 'captcha-e2e-1',
dataset: 'captcha-test',
query:
"Go to the Google reCAPTCHA demo page. Wait for the CAPTCHA to appear. Click the 'I'm not a robot' checkbox. Once the CAPTCHA is solved, fill in the 'Name' field with 'Test User' and the 'Email' field with 'test@example.com'. Then click the Submit button.",
start_url: 'https://www.google.com/recaptcha/api2/demo',
metadata: { original_task_id: 'captcha-e2e-1' },
}
// ── Helpers ────────────────────────────────────────────────────────────
/** Prints a progress line to stdout, tagged with this script's prefix. */
function log(msg: string) {
  const line = '[captcha-e2e] ' + msg
  console.log(line)
}
/**
 * Reports a failed check on stderr and terminates the process with exit
 * code 1. Never returns.
 */
function fail(msg: string): never {
  const banner = '\n[FAIL] ' + msg
  console.error(banner)
  process.exit(1)
}
/** Prints the final success line to stdout, preceded by a blank line. */
function pass(msg: string) {
  const banner = '\n[PASS] ' + msg
  console.log(banner)
}
/**
 * Verifies the prerequisites for the run and aborts via fail() on the first
 * one that is missing: the two API-key environment variables, then the
 * NopeCHA extension manifest on disk.
 */
function preflight() {
  const requiredEnv: Array<[string, string]> = [
    ['NOPECHA_API_KEY', 'NOPECHA_API_KEY env var not set'],
    [
      'FIREWORKS_API_KEY',
      'FIREWORKS_API_KEY env var not set — needed for the agent LLM',
    ],
  ]
  for (const [name, complaint] of requiredEnv) {
    if (!process.env[name]) {
      fail(complaint)
    }
  }

  const extDir = join(HERE, '../../extensions/nopecha')
  const manifestPresent = existsSync(join(extDir, 'manifest.json'))
  if (!manifestPresent) {
    fail(`NopeCHA extension not found at ${extDir}`)
  }
}
// ── Main ──────────────────────────────────────────────────────────────
/**
 * Runs the CAPTCHA end-to-end scenario:
 *   1. checks prerequisites and validates the inline config/task,
 *   2. patches the NopeCHA manifest with the API key,
 *   3. starts BrowserOS, executes the task, and
 *   4. verifies that the task did not fail and that metadata.json and at
 *      least one screenshot were written.
 *
 * Checks that fail while BrowserOS is running are raised as exceptions and
 * only turned into fail() (which exits the process) after the `finally`
 * block has shut BrowserOS down. Calling fail() directly inside the `try`
 * would exit immediately and skip the shutdown.
 */
async function main() {
  preflight()

  const config = EvalConfigSchema.parse(EVAL_CONFIG)
  const task = TaskSchema.parse(TASK)

  // Remove artifacts of a previous run so they cannot satisfy the checks below.
  const taskDir = join(OUTPUT_DIR, task.query_id)
  if (existsSync(taskDir)) {
    rmSync(taskDir, { recursive: true, force: true })
  }

  // Nothing has been started yet, so exiting directly via fail() is safe here.
  const captcha = config.captcha
  if (!captcha) fail('captcha config block missing')
  const apiKey = process.env[captcha.api_key_env]
  if (!apiKey) fail(`${captcha.api_key_env} env var is empty`)
  BrowserOSAppManager.patchNopechaApiKey(apiKey)

  const app = new BrowserOSAppManager(
    0,
    {
      cdp: config.browseros.base_cdp_port,
      server: config.browseros.base_server_port,
      extension: config.browseros.base_extension_port,
    },
    config.browseros.load_extensions,
    config.browseros.headless,
  )

  // Marks a failed verification so it can be told apart from unexpected errors.
  class CheckFailure extends Error {}
  let failureMessage: string | null = null

  try {
    log('Starting BrowserOS stack (headed + NopeCHA extension)...')
    await app.restart()
    log(`BrowserOS ready at ${app.getServerUrl()}`)

    // Point the executor at the server that was actually started.
    const runConfig = {
      ...config,
      browseros: { ...config.browseros, server_url: app.getServerUrl() },
    }
    const executor = createTaskExecutor(runConfig, OUTPUT_DIR, null)

    log(`Running task: ${task.query_id}`)
    log(` start_url: ${task.start_url}`)
    const result = await executor.execute(task)
    log(`\nTask status: ${result.status}`)

    if (result.status === 'failed') {
      const err = 'error' in result ? result.error : null
      throw new CheckFailure(
        `Task failed: ${err?.message ?? 'unknown error'}`,
      )
    }

    const metadataPath = join(taskDir, 'metadata.json')
    if (!existsSync(metadataPath)) {
      throw new CheckFailure('metadata.json not found')
    }
    const metadata = JSON.parse(readFileSync(metadataPath, 'utf-8'))
    log(` Duration: ${metadata.total_duration_ms}ms`)
    log(` Steps: ${metadata.total_steps}`)
    log(` Termination: ${metadata.termination_reason}`)

    const screenshotDir = join(taskDir, 'screenshots')
    const screenshots = existsSync(screenshotDir)
      ? readdirSync(screenshotDir).filter((f) => f.endsWith('.png'))
      : []
    log(` Screenshots: ${screenshots.length}`)
    if (screenshots.length === 0) {
      throw new CheckFailure('No screenshots captured')
    }

    pass(
      `${screenshots.length} screenshots, ${metadata.total_steps} steps, ${metadata.total_duration_ms}ms`,
    )
  } catch (caught: unknown) {
    // Unexpected errors keep propagating to the caller (after cleanup).
    if (!(caught instanceof CheckFailure)) throw caught
    failureMessage = caught.message
  } finally {
    log('Shutting down BrowserOS...')
    await app.killApp()
  }

  // Report the failed check only now that the browser has been shut down.
  if (failureMessage !== null) fail(failureMessage)
}
// Script entry point: any rejection escaping main() is printed and turned
// into a non-zero exit code.
main().then(undefined, (reason) => {
  console.error(reason)
  process.exit(1)
})