From 0babc050772d44c93bf8f1fa4964aecabe78dea8 Mon Sep 17 00:00:00 2001 From: shivammittal274 <56757235+shivammittal274@users.noreply.github.com> Date: Tue, 24 Mar 2026 00:14:16 +0530 Subject: [PATCH] feat(eval): NopeCHA CAPTCHA solver integration (#537) * feat(eval): show mean score instead of pass/fail in report and viewer * feat(eval): integrate NopeCHA CAPTCHA solver into eval pipeline Add CAPTCHA detection and waiting so screenshots capture post-solve state. Run headed with xvfb on CI since headless breaks extension content scripts. - Add CaptchaWaiter module (detect reCAPTCHA/hCaptcha/Turnstile, poll until solved) - Add optional `captcha` config block to EvalConfigSchema - Wait for CAPTCHA solve before screenshot in single-agent and orchestrator-executor - Patch NopeCHA manifest with API key before launching workers - Fix CAPTCHA_EXT_DIR path (was pointing one level too high) - Remove --incognito (extensions don't run in incognito; fresh user-data-dir isolates) - CI: install xvfb, run headed via xvfb-run, pass NOPECHA_API_KEY secret --- .github/workflows/eval-weekly.yml | 6 +- .../eval/configs/browseros-agent-weekly.json | 5 +- .../src/agents/orchestrator-executor/index.ts | 14 ++ .../apps/eval/src/agents/single-agent.ts | 14 ++ .../apps/eval/src/capture/captcha-waiter.ts | 115 ++++++++++++ .../apps/eval/src/capture/index.ts | 1 + .../eval/src/runner/browseros-app-manager.ts | 29 ++- .../apps/eval/src/runner/parallel-executor.ts | 9 + .../apps/eval/src/types/config.ts | 7 + .../eval/tests/capture/captcha-waiter.test.ts | 136 ++++++++++++++ .../apps/eval/tests/e2e/captcha-e2e.ts | 172 ++++++++++++++++++ 11 files changed, 503 insertions(+), 5 deletions(-) create mode 100644 packages/browseros-agent/apps/eval/src/capture/captcha-waiter.ts create mode 100644 packages/browseros-agent/apps/eval/tests/capture/captcha-waiter.test.ts create mode 100644 packages/browseros-agent/apps/eval/tests/e2e/captcha-e2e.ts diff --git a/.github/workflows/eval-weekly.yml 
b/.github/workflows/eval-weekly.yml index f4c938ca6..e47fa16a9 100644 --- a/.github/workflows/eval-weekly.yml +++ b/.github/workflows/eval-weekly.yml @@ -43,6 +43,9 @@ jobs: working-directory: packages/browseros-agent run: bun install --ignore-scripts && bun run build:agent-sdk + - name: Install xvfb + run: sudo apt-get update && sudo apt-get install -y xvfb + - name: Install captcha solver extension working-directory: packages/browseros-agent/apps/eval run: | @@ -55,11 +58,12 @@ jobs: env: FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }} CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }} BROWSEROS_BINARY: /usr/bin/browseros EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }} run: | echo "Running eval with config: $EVAL_CONFIG" - bun run src/index.ts -c "$EVAL_CONFIG" + xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts -c "$EVAL_CONFIG" - name: Upload runs to R2 if: success() diff --git a/packages/browseros-agent/apps/eval/configs/browseros-agent-weekly.json b/packages/browseros-agent/apps/eval/configs/browseros-agent-weekly.json index 9e605e6a2..2cc27ff6c 100644 --- a/packages/browseros-agent/apps/eval/configs/browseros-agent-weekly.json +++ b/packages/browseros-agent/apps/eval/configs/browseros-agent-weekly.json @@ -16,7 +16,10 @@ "base_server_port": 9110, "base_extension_port": 9310, "load_extensions": false, - "headless": true + "headless": false + }, + "captcha": { + "api_key_env": "NOPECHA_API_KEY" }, "graders": ["performance_grader"], "grader_api_key_env": "OPENROUTER_API_KEY", diff --git a/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/index.ts b/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/index.ts index e7eee7bb7..25fbc3114 100644 --- a/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/index.ts +++ 
b/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/index.ts @@ -11,6 +11,7 @@ import type { ResolvedAgentConfig } from '@browseros/server/agent/types' import { Browser } from '@browseros/server/browser' import { CdpBackend } from '@browseros/server/browser/backends/cdp' +import { CaptchaWaiter } from '../../capture/captcha-waiter' import { DEFAULT_TIMEOUT_MS } from '../../constants' import type { EvalConfig, @@ -161,6 +162,13 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator { const browser = new Browser(cdp, CONTROLLER_STUB) capture.screenshot.setBrowser(browser) + const captchaWaiter = config.captcha + ? new CaptchaWaiter({ + waitTimeoutMs: config.captcha.wait_timeout_ms, + pollIntervalMs: config.captcha.poll_interval_ms, + }) + : null + try { // Build capture callbacks (same pattern as single-agent.ts) const callbacks: ExecutorCallbacks = { @@ -172,6 +180,12 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator { }, onToolCallFinish: async () => { try { + if (captchaWaiter) { + await captchaWaiter.waitIfCaptchaPresent( + browser, + capture.getActivePageId(), + ) + } const screenshotNum = await capture.screenshot.capture( capture.getActivePageId(), ) diff --git a/packages/browseros-agent/apps/eval/src/agents/single-agent.ts b/packages/browseros-agent/apps/eval/src/agents/single-agent.ts index a0cd92d65..0d50acce1 100644 --- a/packages/browseros-agent/apps/eval/src/agents/single-agent.ts +++ b/packages/browseros-agent/apps/eval/src/agents/single-agent.ts @@ -7,6 +7,7 @@ import type { ResolvedAgentConfig } from '@browseros/server/agent/types' import { Browser } from '@browseros/server/browser' import { CdpBackend } from '@browseros/server/browser/backends/cdp' import { registry } from '@browseros/server/tools/registry' +import { CaptchaWaiter } from '../capture/captcha-waiter' import { DEFAULT_TIMEOUT_MS } from '../constants' import type { EvalConfig, TaskMetadata } from '../types' import { 
resolveProviderConfig } from '../utils/resolve-provider-config' @@ -78,6 +79,13 @@ export class SingleAgentEvaluator implements AgentEvaluator { } : undefined + const captchaWaiter = config.captcha + ? new CaptchaWaiter({ + waitTimeoutMs: config.captcha.wait_timeout_ms, + pollIntervalMs: config.captcha.poll_interval_ms, + }) + : null + let agent: AiSdkAgent | null = null try { @@ -112,6 +120,12 @@ export class SingleAgentEvaluator implements AgentEvaluator { experimental_onToolCallFinish: async () => { try { + if (captchaWaiter) { + await captchaWaiter.waitIfCaptchaPresent( + browser, + capture.getActivePageId(), + ) + } const screenshotNum = await capture.screenshot.capture( capture.getActivePageId(), ) diff --git a/packages/browseros-agent/apps/eval/src/capture/captcha-waiter.ts b/packages/browseros-agent/apps/eval/src/capture/captcha-waiter.ts new file mode 100644 index 000000000..bc7b17e7a --- /dev/null +++ b/packages/browseros-agent/apps/eval/src/capture/captcha-waiter.ts @@ -0,0 +1,115 @@ +import type { Browser } from '@browseros/server/browser' + +export interface CaptchaWaitResult { + detected: boolean + type: 'recaptcha' | 'hcaptcha' | 'turnstile' | 'none' + solved: boolean + waitDurationMs: number +} + +interface CaptchaWaiterConfig { + waitTimeoutMs: number + pollIntervalMs: number +} + +const DETECTION_SCRIPT = `(() => { + const recaptcha = document.querySelector('iframe[src*="recaptcha"]') + if (recaptcha) { + const response = document.getElementById('g-recaptcha-response') + return { type: 'recaptcha', solved: !!(response && response.value) } + } + const hcaptcha = document.querySelector('iframe[src*="hcaptcha"]') + if (hcaptcha) { + const response = document.querySelector('[name="h-captcha-response"]') + return { type: 'hcaptcha', solved: !!(response && response.value) } + } + const turnstile = document.querySelector('iframe[src*="challenges.cloudflare.com"]') + if (turnstile) { + const response = 
document.querySelector('[name="cf-turnstile-response"]') + return { type: 'turnstile', solved: !!(response && response.value) } + } + return { type: 'none', solved: false } +})()` + +export class CaptchaWaiter { + private readonly config: CaptchaWaiterConfig + + constructor(config: CaptchaWaiterConfig) { + this.config = config + } + + async waitIfCaptchaPresent( + browser: Browser, + pageId: number, + ): Promise<CaptchaWaitResult> { + const start = Date.now() + + try { + const initial = await this.detect(browser, pageId) + if (initial.type === 'none') { + return { + detected: false, + type: 'none', + solved: false, + waitDurationMs: Date.now() - start, + } + } + + if (initial.solved) { + return { + detected: true, + type: initial.type, + solved: true, + waitDurationMs: Date.now() - start, + } + } + + // Poll until solved or timeout + while (Date.now() - start < this.config.waitTimeoutMs) { + await sleep(this.config.pollIntervalMs) + const check = await this.detect(browser, pageId) + if (check.solved || check.type === 'none') { + return { + detected: true, + type: initial.type, + solved: check.solved, + waitDurationMs: Date.now() - start, + } + } + } + + return { + detected: true, + type: initial.type, + solved: false, + waitDurationMs: Date.now() - start, + } + } catch { + return { + detected: false, + type: 'none', + solved: false, + waitDurationMs: Date.now() - start, + } + } + } + + private async detect( + browser: Browser, + pageId: number, + ): Promise<{ type: CaptchaWaitResult['type']; solved: boolean }> { + const result = await browser.evaluate(pageId, DETECTION_SCRIPT) + if (result.error || !result.value) { + return { type: 'none', solved: false } + } + const val = result.value as { type: string; solved: boolean } + return { + type: (val.type as CaptchaWaitResult['type']) ?? 'none', + solved: val.solved ?? 
false, + } + } +} + +function sleep(ms: number): Promise<void> { + return new Promise((resolve) => setTimeout(resolve, ms)) +} diff --git a/packages/browseros-agent/apps/eval/src/capture/index.ts b/packages/browseros-agent/apps/eval/src/capture/index.ts index a2818bd6d..806d5705d 100644 --- a/packages/browseros-agent/apps/eval/src/capture/index.ts +++ b/packages/browseros-agent/apps/eval/src/capture/index.ts @@ -1,4 +1,5 @@ export { callMcpTool } from '../utils/mcp-client' +export { CaptchaWaiter } from './captcha-waiter' export { CaptureContext } from './context' export { MessageLogger } from './message-logger' export { ScreenshotCapture } from './screenshot' diff --git a/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts b/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts index 7898297a3..e70b8f3f9 100644 --- a/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts +++ b/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts @@ -14,7 +14,13 @@ * Each worker gets isolated ports: base + workerIndex offset. */ -import { existsSync, mkdtempSync, rmSync } from 'node:fs' +import { + existsSync, + mkdtempSync, + readFileSync, + rmSync, + writeFileSync, +} from 'node:fs' import { dirname, join } from 'node:path' import { fileURLToPath } from 'node:url' import { type Subprocess, spawn, spawnSync } from 'bun' @@ -37,7 +43,7 @@ const BROWSEROS_BINARY = const CONTROLLER_EXT_DIR = join(MONOREPO_ROOT, 'apps/controller-ext/dist') const CAPTCHA_EXT_DIR = join( dirname(fileURLToPath(import.meta.url)), - '../../../extensions/nopecha', + '../../extensions/nopecha', ) export class BrowserOSAppManager { @@ -149,7 +155,6 @@ export class BrowserOSAppManager { '--use-mock-keychain', '--disable-browseros-server', '--disable-browseros-extensions', - '--incognito', ...(this.headless ? 
['--headless=new'] : []), '--window-size=1440,900', `--remote-debugging-port=${cdp}`, @@ -319,4 +324,22 @@ export class BrowserOSAppManager { }) return (result.stdout?.toString().trim() ?? '').length > 0 } + + /** + * Patch NopeCHA extension manifest with API key. + * Call once before launching any workers — the extension directory is shared. + */ + static patchNopechaApiKey(apiKey: string): void { + const manifestPath = join(CAPTCHA_EXT_DIR, 'manifest.json') + if (!existsSync(manifestPath)) { + console.log( + '[BROWSEROS] NopeCHA extension not found, skipping API key patch', + ) + return + } + const manifest = JSON.parse(readFileSync(manifestPath, 'utf-8')) + manifest.nopecha = { ...manifest.nopecha, key: apiKey } + writeFileSync(manifestPath, JSON.stringify(manifest, null, 2)) + console.log('[BROWSEROS] NopeCHA API key patched') + } } diff --git a/packages/browseros-agent/apps/eval/src/runner/parallel-executor.ts b/packages/browseros-agent/apps/eval/src/runner/parallel-executor.ts index 01d34f627..263eb966c 100644 --- a/packages/browseros-agent/apps/eval/src/runner/parallel-executor.ts +++ b/packages/browseros-agent/apps/eval/src/runner/parallel-executor.ts @@ -93,6 +93,15 @@ export class ParallelExecutor { BrowserOSAppManager.buildExtensions() } + // Patch NopeCHA API key before launching any workers + const captchaConfig = this.config.config.captcha + if (captchaConfig) { + const apiKey = process.env[captchaConfig.api_key_env] + if (apiKey) { + BrowserOSAppManager.patchNopechaApiKey(apiKey) + } + } + this.queue = new TaskQueue(tasks) const totalTasks = tasks.length diff --git a/packages/browseros-agent/apps/eval/src/types/config.ts b/packages/browseros-agent/apps/eval/src/types/config.ts index 1eafff7d0..bf181535b 100644 --- a/packages/browseros-agent/apps/eval/src/types/config.ts +++ b/packages/browseros-agent/apps/eval/src/types/config.ts @@ -71,6 +71,13 @@ export const EvalConfigSchema = z.object({ grader_api_key_env: z.string().optional(), grader_base_url: 
z.string().url().optional(), timeout_ms: z.number().int().min(30000).max(3600000).optional(), + captcha: z + .object({ + api_key_env: z.string().default('NOPECHA_API_KEY'), + wait_timeout_ms: z.number().int().min(1000).max(120000).default(30000), + poll_interval_ms: z.number().int().min(200).max(5000).default(1000), + }) + .optional(), }) export type SingleAgentConfig = z.infer diff --git a/packages/browseros-agent/apps/eval/tests/capture/captcha-waiter.test.ts b/packages/browseros-agent/apps/eval/tests/capture/captcha-waiter.test.ts new file mode 100644 index 000000000..dc4cd8f3c --- /dev/null +++ b/packages/browseros-agent/apps/eval/tests/capture/captcha-waiter.test.ts @@ -0,0 +1,136 @@ +import { beforeEach, describe, expect, it, mock } from 'bun:test' +import { CaptchaWaiter } from '../../src/capture/captcha-waiter' + +function createMockBrowser( + evaluateResults: Array<{ value?: unknown; error?: string }>, +) { + let callIndex = 0 + return { + evaluate: mock(async (_page: number, _expr: string) => { + const result = evaluateResults[callIndex] ?? evaluateResults.at(-1)! 
+ callIndex++ + return result + }), + } as any +} + +describe('CaptchaWaiter', () => { + let waiter: CaptchaWaiter + + beforeEach(() => { + waiter = new CaptchaWaiter({ + waitTimeoutMs: 5000, + pollIntervalMs: 100, + }) + }) + + it('returns immediately when no CAPTCHA detected', async () => { + const browser = createMockBrowser([ + { value: { type: 'none', solved: false } }, + ]) + + const result = await waiter.waitIfCaptchaPresent(browser, 1) + + expect(result.detected).toBe(false) + expect(result.type).toBe('none') + expect(result.solved).toBe(false) + expect(browser.evaluate).toHaveBeenCalledTimes(1) + }) + + it('returns immediately when CAPTCHA already solved', async () => { + const browser = createMockBrowser([ + { value: { type: 'recaptcha', solved: true } }, + ]) + + const result = await waiter.waitIfCaptchaPresent(browser, 1) + + expect(result.detected).toBe(true) + expect(result.type).toBe('recaptcha') + expect(result.solved).toBe(true) + expect(browser.evaluate).toHaveBeenCalledTimes(1) + }) + + it('polls until CAPTCHA is solved', async () => { + const browser = createMockBrowser([ + { value: { type: 'hcaptcha', solved: false } }, + { value: { type: 'hcaptcha', solved: false } }, + { value: { type: 'hcaptcha', solved: true } }, + ]) + + const result = await waiter.waitIfCaptchaPresent(browser, 1) + + expect(result.detected).toBe(true) + expect(result.type).toBe('hcaptcha') + expect(result.solved).toBe(true) + expect(browser.evaluate).toHaveBeenCalledTimes(3) + }) + + it('polls until CAPTCHA disappears', async () => { + const browser = createMockBrowser([ + { value: { type: 'turnstile', solved: false } }, + { value: { type: 'turnstile', solved: false } }, + { value: { type: 'none', solved: false } }, + ]) + + const result = await waiter.waitIfCaptchaPresent(browser, 1) + + expect(result.detected).toBe(true) + expect(result.type).toBe('turnstile') + expect(result.solved).toBe(false) + expect(browser.evaluate).toHaveBeenCalledTimes(3) + }) + + it('times out 
if CAPTCHA never solves', async () => { + const shortWaiter = new CaptchaWaiter({ + waitTimeoutMs: 300, + pollIntervalMs: 100, + }) + + const browser = createMockBrowser([ + { value: { type: 'recaptcha', solved: false } }, + ]) + + const result = await shortWaiter.waitIfCaptchaPresent(browser, 1) + + expect(result.detected).toBe(true) + expect(result.type).toBe('recaptcha') + expect(result.solved).toBe(false) + expect(result.waitDurationMs).toBeGreaterThanOrEqual(250) + }) + + it('handles browser.evaluate errors gracefully', async () => { + const browser = createMockBrowser([{ error: 'Page crashed' }]) + + const result = await waiter.waitIfCaptchaPresent(browser, 1) + + expect(result.detected).toBe(false) + expect(result.type).toBe('none') + expect(result.solved).toBe(false) + }) + + it('handles browser.evaluate throwing', async () => { + const browser = { + evaluate: mock(async () => { + throw new Error('Connection lost') + }), + } as any + + const result = await waiter.waitIfCaptchaPresent(browser, 1) + + expect(result.detected).toBe(false) + expect(result.type).toBe('none') + expect(result.solved).toBe(false) + }) + + it('tracks wait duration', async () => { + const browser = createMockBrowser([ + { value: { type: 'recaptcha', solved: false } }, + { value: { type: 'recaptcha', solved: false } }, + { value: { type: 'recaptcha', solved: true } }, + ]) + + const result = await waiter.waitIfCaptchaPresent(browser, 1) + + expect(result.waitDurationMs).toBeGreaterThanOrEqual(150) + }) +}) diff --git a/packages/browseros-agent/apps/eval/tests/e2e/captcha-e2e.ts b/packages/browseros-agent/apps/eval/tests/e2e/captcha-e2e.ts new file mode 100644 index 000000000..071375d84 --- /dev/null +++ b/packages/browseros-agent/apps/eval/tests/e2e/captcha-e2e.ts @@ -0,0 +1,172 @@ +/** + * End-to-end test for CAPTCHA solver integration. + * + * Runs a single eval task against Google's reCAPTCHA demo page: + * 1. Launches BrowserOS (headed) with NopeCHA extension loaded + * 2. 
Agent navigates to reCAPTCHA demo, fills form + * 3. CaptchaWaiter polls until NopeCHA solves the CAPTCHA + * 4. Screenshot is captured AFTER solve + * 5. Verifies: task completed, screenshots exist, metadata saved + * + * Prerequisites: + * - NOPECHA_API_KEY env var set + * - FIREWORKS_API_KEY env var set (or swap agent config) + * - NopeCHA extension at extensions/nopecha/ (run the install step from CI) + * - BrowserOS binary available + * + * Run: + * bun --env-file=apps/eval/.env.development apps/eval/tests/e2e/captcha-e2e.ts + */ + +import { existsSync, readdirSync, readFileSync, rmSync } from 'node:fs' +import { dirname, join } from 'node:path' +import { fileURLToPath } from 'node:url' +import { BrowserOSAppManager } from '../../src/runner/browseros-app-manager' +import { createTaskExecutor } from '../../src/runner/task-executor' +import { EvalConfigSchema } from '../../src/types/config' +import { TaskSchema } from '../../src/types/task' + +const HERE = dirname(fileURLToPath(import.meta.url)) +const OUTPUT_DIR = join(HERE, 'results') + +const EVAL_CONFIG = { + agent: { + type: 'single' as const, + provider: 'openai-compatible' as const, + model: 'accounts/fireworks/models/kimi-k2p5', + apiKey: 'FIREWORKS_API_KEY', + baseUrl: 'https://api.fireworks.ai/inference/v1', + supportsImages: true, + }, + dataset: 'inline', + num_workers: 1, + restart_server_per_task: true, + browseros: { + server_url: 'http://127.0.0.1:9110', + base_cdp_port: 9010, + base_server_port: 9110, + base_extension_port: 9310, + load_extensions: false, + headless: false, + }, + captcha: { api_key_env: 'NOPECHA_API_KEY' }, + timeout_ms: 120000, +} + +const TASK = { + query_id: 'captcha-e2e-1', + dataset: 'captcha-test', + query: + "Go to the Google reCAPTCHA demo page. Wait for the CAPTCHA to appear. Click the 'I'm not a robot' checkbox. Once the CAPTCHA is solved, fill in the 'Name' field with 'Test User' and the 'Email' field with 'test@example.com'. 
Then click the Submit button.", + start_url: 'https://www.google.com/recaptcha/api2/demo', + metadata: { original_task_id: 'captcha-e2e-1' }, +} + +// ── Helpers ──────────────────────────────────────────────────────────── + +function log(msg: string) { + console.log(`[captcha-e2e] ${msg}`) +} + +function fail(msg: string): never { + console.error(`\n[FAIL] ${msg}`) + process.exit(1) +} + +function pass(msg: string) { + console.log(`\n[PASS] ${msg}`) +} + +function preflight() { + if (!process.env.NOPECHA_API_KEY) { + fail('NOPECHA_API_KEY env var not set') + } + if (!process.env.FIREWORKS_API_KEY) { + fail('FIREWORKS_API_KEY env var not set — needed for the agent LLM') + } + const extDir = join(HERE, '../../extensions/nopecha') + if (!existsSync(join(extDir, 'manifest.json'))) { + fail(`NopeCHA extension not found at ${extDir}`) + } +} + +// ── Main ────────────────────────────────────────────────────────────── + +async function main() { + preflight() + + const config = EvalConfigSchema.parse(EVAL_CONFIG) + const task = TaskSchema.parse(TASK) + const taskDir = join(OUTPUT_DIR, task.query_id) + + if (existsSync(taskDir)) { + rmSync(taskDir, { recursive: true, force: true }) + } + + const captcha = config.captcha + if (!captcha) fail('captcha config block missing') + const apiKey = process.env[captcha.api_key_env] + if (!apiKey) fail(`${captcha.api_key_env} env var is empty`) + BrowserOSAppManager.patchNopechaApiKey(apiKey) + + const app = new BrowserOSAppManager( + 0, + { + cdp: config.browseros.base_cdp_port, + server: config.browseros.base_server_port, + extension: config.browseros.base_extension_port, + }, + config.browseros.load_extensions, + config.browseros.headless, + ) + + try { + log('Starting BrowserOS stack (headed + NopeCHA extension)...') + await app.restart() + log(`BrowserOS ready at ${app.getServerUrl()}`) + + const runConfig = { + ...config, + browseros: { ...config.browseros, server_url: app.getServerUrl() }, + } + + const executor = 
createTaskExecutor(runConfig, OUTPUT_DIR, null) + log(`Running task: ${task.query_id}`) + log(` start_url: ${task.start_url}`) + + const result = await executor.execute(task) + + log(`\nTask status: ${result.status}`) + + if (result.status === 'failed') { + const err = 'error' in result ? result.error : null + fail(`Task failed: ${err?.message ?? 'unknown error'}`) + } + + const metadataPath = join(taskDir, 'metadata.json') + if (!existsSync(metadataPath)) fail('metadata.json not found') + const metadata = JSON.parse(readFileSync(metadataPath, 'utf-8')) + log(` Duration: ${metadata.total_duration_ms}ms`) + log(` Steps: ${metadata.total_steps}`) + log(` Termination: ${metadata.termination_reason}`) + + const screenshotDir = join(taskDir, 'screenshots') + const screenshots = existsSync(screenshotDir) + ? readdirSync(screenshotDir).filter((f) => f.endsWith('.png')) + : [] + log(` Screenshots: ${screenshots.length}`) + if (screenshots.length === 0) fail('No screenshots captured') + + pass( + `${screenshots.length} screenshots, ${metadata.total_steps} steps, ${metadata.total_duration_ms}ms`, + ) + } finally { + log('Shutting down BrowserOS...') + await app.killApp() + } +} + +main().catch((err) => { + console.error(err) + process.exit(1) +})