mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
feat(eval): NopeCHA CAPTCHA solver integration (#537)
* feat(eval): show mean score instead of pass/fail in report and viewer
* feat(eval): integrate NopeCHA CAPTCHA solver into eval pipeline

Add CAPTCHA detection and waiting so screenshots capture post-solve state. Run headed with xvfb on CI since headless breaks extension content scripts.

- Add CaptchaWaiter module (detect reCAPTCHA/hCaptcha/Turnstile, poll until solved)
- Add optional `captcha` config block to EvalConfigSchema
- Wait for CAPTCHA solve before screenshot in single-agent and orchestrator-executor
- Patch NopeCHA manifest with API key before launching workers
- Fix CAPTCHA_EXT_DIR path (was pointing one level too high)
- Remove --incognito (extensions don't run in incognito; fresh user-data-dir isolates)
- CI: install xvfb, run headed via xvfb-run, pass NOPECHA_API_KEY secret
This commit is contained in:
6
.github/workflows/eval-weekly.yml
vendored
6
.github/workflows/eval-weekly.yml
vendored
@@ -43,6 +43,9 @@ jobs:
|
||||
working-directory: packages/browseros-agent
|
||||
run: bun install --ignore-scripts && bun run build:agent-sdk
|
||||
|
||||
- name: Install xvfb
|
||||
run: sudo apt-get update && sudo apt-get install -y xvfb
|
||||
|
||||
- name: Install captcha solver extension
|
||||
working-directory: packages/browseros-agent/apps/eval
|
||||
run: |
|
||||
@@ -55,11 +58,12 @@ jobs:
|
||||
env:
|
||||
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
|
||||
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
|
||||
NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
|
||||
BROWSEROS_BINARY: /usr/bin/browseros
|
||||
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
|
||||
run: |
|
||||
echo "Running eval with config: $EVAL_CONFIG"
|
||||
bun run src/index.ts -c "$EVAL_CONFIG"
|
||||
xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts -c "$EVAL_CONFIG"
|
||||
|
||||
- name: Upload runs to R2
|
||||
if: success()
|
||||
|
||||
@@ -16,7 +16,10 @@
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": true
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["performance_grader"],
|
||||
"grader_api_key_env": "OPENROUTER_API_KEY",
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
|
||||
import { Browser } from '@browseros/server/browser'
|
||||
import { CdpBackend } from '@browseros/server/browser/backends/cdp'
|
||||
import { CaptchaWaiter } from '../../capture/captcha-waiter'
|
||||
import { DEFAULT_TIMEOUT_MS } from '../../constants'
|
||||
import type {
|
||||
EvalConfig,
|
||||
@@ -161,6 +162,13 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
|
||||
const browser = new Browser(cdp, CONTROLLER_STUB)
|
||||
capture.screenshot.setBrowser(browser)
|
||||
|
||||
const captchaWaiter = config.captcha
|
||||
? new CaptchaWaiter({
|
||||
waitTimeoutMs: config.captcha.wait_timeout_ms,
|
||||
pollIntervalMs: config.captcha.poll_interval_ms,
|
||||
})
|
||||
: null
|
||||
|
||||
try {
|
||||
// Build capture callbacks (same pattern as single-agent.ts)
|
||||
const callbacks: ExecutorCallbacks = {
|
||||
@@ -172,6 +180,12 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
|
||||
},
|
||||
onToolCallFinish: async () => {
|
||||
try {
|
||||
if (captchaWaiter) {
|
||||
await captchaWaiter.waitIfCaptchaPresent(
|
||||
browser,
|
||||
capture.getActivePageId(),
|
||||
)
|
||||
}
|
||||
const screenshotNum = await capture.screenshot.capture(
|
||||
capture.getActivePageId(),
|
||||
)
|
||||
|
||||
@@ -7,6 +7,7 @@ import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
|
||||
import { Browser } from '@browseros/server/browser'
|
||||
import { CdpBackend } from '@browseros/server/browser/backends/cdp'
|
||||
import { registry } from '@browseros/server/tools/registry'
|
||||
import { CaptchaWaiter } from '../capture/captcha-waiter'
|
||||
import { DEFAULT_TIMEOUT_MS } from '../constants'
|
||||
import type { EvalConfig, TaskMetadata } from '../types'
|
||||
import { resolveProviderConfig } from '../utils/resolve-provider-config'
|
||||
@@ -78,6 +79,13 @@ export class SingleAgentEvaluator implements AgentEvaluator {
|
||||
}
|
||||
: undefined
|
||||
|
||||
const captchaWaiter = config.captcha
|
||||
? new CaptchaWaiter({
|
||||
waitTimeoutMs: config.captcha.wait_timeout_ms,
|
||||
pollIntervalMs: config.captcha.poll_interval_ms,
|
||||
})
|
||||
: null
|
||||
|
||||
let agent: AiSdkAgent | null = null
|
||||
|
||||
try {
|
||||
@@ -112,6 +120,12 @@ export class SingleAgentEvaluator implements AgentEvaluator {
|
||||
|
||||
experimental_onToolCallFinish: async () => {
|
||||
try {
|
||||
if (captchaWaiter) {
|
||||
await captchaWaiter.waitIfCaptchaPresent(
|
||||
browser,
|
||||
capture.getActivePageId(),
|
||||
)
|
||||
}
|
||||
const screenshotNum = await capture.screenshot.capture(
|
||||
capture.getActivePageId(),
|
||||
)
|
||||
|
||||
115
packages/browseros-agent/apps/eval/src/capture/captcha-waiter.ts
Normal file
115
packages/browseros-agent/apps/eval/src/capture/captcha-waiter.ts
Normal file
@@ -0,0 +1,115 @@
|
||||
import type { Browser } from '@browseros/server/browser'
|
||||
|
||||
/** Outcome of a single `CaptchaWaiter.waitIfCaptchaPresent` call. */
export interface CaptchaWaitResult {
  /**
   * `true` when the first probe of the page found a CAPTCHA widget.
   * `false` when no widget was found or when probing threw.
   */
  detected: boolean
  /** Kind of widget found by the first probe, or `'none'`. */
  type: 'recaptcha' | 'hcaptcha' | 'turnstile' | 'none'
  /** `true` when a probe saw a non-empty value in the widget's response field. */
  solved: boolean
  /** Milliseconds elapsed between entering the call and producing this result. */
  waitDurationMs: number
}

/** Timing settings for `CaptchaWaiter`. */
interface CaptchaWaiterConfig {
  /** Polling stops once this many milliseconds have elapsed since the call started. */
  waitTimeoutMs: number
  /** Delay in milliseconds between two consecutive probes of the page. */
  pollIntervalMs: number
}
|
||||
|
||||
/**
 * Self-invoking script that is evaluated inside the page to look for a
 * CAPTCHA widget.
 *
 * It checks, in order, for a reCAPTCHA, hCaptcha and Cloudflare Turnstile
 * iframe (matched by a substring of the iframe `src`). For the first one
 * found it returns `{ type, solved }`, where `solved` is `true` when the
 * widget's response field exists and has a non-empty value. When no iframe
 * matches it returns `{ type: 'none', solved: false }`.
 *
 * NOTE(review): `iframe[src*="recaptcha"]` presumably also matches the badge
 * iframe of invisible/v3 reCAPTCHA, whose response field may stay empty; such
 * pages would then be polled until the timeout — confirm against real pages.
 */
const DETECTION_SCRIPT = `(() => {
  const recaptcha = document.querySelector('iframe[src*="recaptcha"]')
  if (recaptcha) {
    const response = document.getElementById('g-recaptcha-response')
    return { type: 'recaptcha', solved: !!(response && response.value) }
  }
  const hcaptcha = document.querySelector('iframe[src*="hcaptcha"]')
  if (hcaptcha) {
    const response = document.querySelector('[name="h-captcha-response"]')
    return { type: 'hcaptcha', solved: !!(response && response.value) }
  }
  const turnstile = document.querySelector('iframe[src*="challenges.cloudflare.com"]')
  if (turnstile) {
    const response = document.querySelector('[name="cf-turnstile-response"]')
    return { type: 'turnstile', solved: !!(response && response.value) }
  }
  return { type: 'none', solved: false }
})()`
|
||||
|
||||
/**
 * Detects a CAPTCHA widget on a page and waits for it to be solved.
 *
 * The waiter only observes the page by evaluating `DETECTION_SCRIPT`; it does
 * not interact with the widget itself.
 */
export class CaptchaWaiter {
  private readonly config: CaptchaWaiterConfig

  /**
   * @param config - Timeout and poll interval (milliseconds) used by
   *   {@link CaptchaWaiter.waitIfCaptchaPresent}.
   */
  constructor(config: CaptchaWaiterConfig) {
    this.config = config
  }

  /**
   * Probes the page once and, if an unsolved CAPTCHA is found, keeps probing
   * until it is solved, disappears, or `waitTimeoutMs` has elapsed.
   *
   * This method never throws: any error raised while probing yields a
   * "nothing detected" result.
   *
   * @param browser - Browser used to evaluate the detection script.
   * @param pageId - Identifier of the page to probe.
   * @returns What the first probe found (`detected`, `type`), whether a solved
   *   state was observed (`solved`), and the time spent in this call.
   */
  async waitIfCaptchaPresent(
    browser: Browser,
    pageId: number,
  ): Promise<CaptchaWaitResult> {
    const start = Date.now()
    const elapsed = (): number => Date.now() - start

    try {
      const initial = await this.detect(browser, pageId)

      if (initial.type === 'none') {
        return {
          detected: false,
          type: 'none',
          solved: false,
          waitDurationMs: elapsed(),
        }
      }

      if (initial.solved) {
        return {
          detected: true,
          type: initial.type,
          solved: true,
          waitDurationMs: elapsed(),
        }
      }

      // Poll until solved, gone, or timed out. The sleep is clamped to the
      // time left so the last wait does not run past waitTimeoutMs.
      const { waitTimeoutMs, pollIntervalMs } = this.config
      let remaining = waitTimeoutMs - elapsed()
      while (remaining > 0) {
        await sleep(Math.min(pollIntervalMs, remaining))
        const check = await this.detect(browser, pageId)
        // `detect` reports 'none' both when the widget is gone and when the
        // evaluation returned an error; either way there is nothing to wait on.
        if (check.solved || check.type === 'none') {
          return {
            detected: true,
            type: initial.type,
            solved: check.solved,
            waitDurationMs: elapsed(),
          }
        }
        remaining = waitTimeoutMs - elapsed()
      }

      return {
        detected: true,
        type: initial.type,
        solved: false,
        waitDurationMs: elapsed(),
      }
    } catch {
      // Deliberately best-effort: a failed probe is reported as "no CAPTCHA"
      // instead of propagating to the caller.
      return {
        detected: false,
        type: 'none',
        solved: false,
        waitDurationMs: elapsed(),
      }
    }
  }

  /**
   * Runs the detection script once and normalises its result.
   *
   * An evaluation error, an empty value, or an unrecognised `type` is
   * reported as `{ type: 'none', solved: false }`.
   */
  private async detect(
    browser: Browser,
    pageId: number,
  ): Promise<{ type: CaptchaWaitResult['type']; solved: boolean }> {
    const result = await browser.evaluate(pageId, DETECTION_SCRIPT)
    if (result.error || !result.value) {
      return { type: 'none', solved: false }
    }

    // The value comes from the page, so validate it rather than trusting a cast.
    const { type, solved } = result.value as {
      type?: unknown
      solved?: unknown
    }
    if (!isActiveCaptchaType(type)) {
      return { type: 'none', solved: false }
    }
    return { type, solved: solved === true }
  }
}

/** Type guard: `value` names one of the CAPTCHA kinds the waiter can wait on. */
function isActiveCaptchaType(
  value: unknown,
): value is Exclude<CaptchaWaitResult['type'], 'none'> {
  return value === 'recaptcha' || value === 'hcaptcha' || value === 'turnstile'
}
|
||||
|
||||
/**
 * Returns a promise that resolves once a `setTimeout` of `ms` milliseconds fires.
 *
 * @param ms - Delay passed to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms))
}
|
||||
@@ -1,4 +1,5 @@
|
||||
export { callMcpTool } from '../utils/mcp-client'
|
||||
export { CaptchaWaiter } from './captcha-waiter'
|
||||
export { CaptureContext } from './context'
|
||||
export { MessageLogger } from './message-logger'
|
||||
export { ScreenshotCapture } from './screenshot'
|
||||
|
||||
@@ -14,7 +14,13 @@
|
||||
* Each worker gets isolated ports: base + workerIndex offset.
|
||||
*/
|
||||
|
||||
import { existsSync, mkdtempSync, rmSync } from 'node:fs'
|
||||
import {
|
||||
existsSync,
|
||||
mkdtempSync,
|
||||
readFileSync,
|
||||
rmSync,
|
||||
writeFileSync,
|
||||
} from 'node:fs'
|
||||
import { dirname, join } from 'node:path'
|
||||
import { fileURLToPath } from 'node:url'
|
||||
import { type Subprocess, spawn, spawnSync } from 'bun'
|
||||
@@ -37,7 +43,7 @@ const BROWSEROS_BINARY =
|
||||
const CONTROLLER_EXT_DIR = join(MONOREPO_ROOT, 'apps/controller-ext/dist')
|
||||
const CAPTCHA_EXT_DIR = join(
|
||||
dirname(fileURLToPath(import.meta.url)),
|
||||
'../../../extensions/nopecha',
|
||||
'../../extensions/nopecha',
|
||||
)
|
||||
|
||||
export class BrowserOSAppManager {
|
||||
@@ -149,7 +155,6 @@ export class BrowserOSAppManager {
|
||||
'--use-mock-keychain',
|
||||
'--disable-browseros-server',
|
||||
'--disable-browseros-extensions',
|
||||
'--incognito',
|
||||
...(this.headless ? ['--headless=new'] : []),
|
||||
'--window-size=1440,900',
|
||||
`--remote-debugging-port=${cdp}`,
|
||||
@@ -319,4 +324,22 @@ export class BrowserOSAppManager {
|
||||
})
|
||||
return (result.stdout?.toString().trim() ?? '').length > 0
|
||||
}
|
||||
|
||||
/**
 * Patch NopeCHA extension manifest with API key.
 * Call once before launching any workers — the extension directory is shared.
 *
 * Reads `manifest.json` from `CAPTCHA_EXT_DIR`, sets `nopecha.key` to
 * `apiKey` (other entries already under `nopecha` are kept) and rewrites the
 * file with 2-space indentation. When the manifest does not exist, a message
 * is logged and nothing is written.
 *
 * @param apiKey - NopeCHA API key to store in the manifest.
 * @throws If the manifest exists but cannot be read, parsed as JSON, or written.
 */
static patchNopechaApiKey(apiKey: string): void {
  const manifestPath = join(CAPTCHA_EXT_DIR, 'manifest.json')
  if (!existsSync(manifestPath)) {
    console.log(
      '[BROWSEROS] NopeCHA extension not found, skipping API key patch',
    )
    return
  }

  const manifest = JSON.parse(readFileSync(manifestPath, 'utf-8'))
  manifest.nopecha = { ...manifest.nopecha, key: apiKey }
  writeFileSync(manifestPath, JSON.stringify(manifest, null, 2))
  console.log('[BROWSEROS] NopeCHA API key patched')
}
|
||||
}
|
||||
|
||||
@@ -93,6 +93,15 @@ export class ParallelExecutor {
|
||||
BrowserOSAppManager.buildExtensions()
|
||||
}
|
||||
|
||||
// Patch NopeCHA API key before launching any workers
|
||||
const captchaConfig = this.config.config.captcha
|
||||
if (captchaConfig) {
|
||||
const apiKey = process.env[captchaConfig.api_key_env]
|
||||
if (apiKey) {
|
||||
BrowserOSAppManager.patchNopechaApiKey(apiKey)
|
||||
}
|
||||
}
|
||||
|
||||
this.queue = new TaskQueue(tasks)
|
||||
const totalTasks = tasks.length
|
||||
|
||||
|
||||
@@ -71,6 +71,13 @@ export const EvalConfigSchema = z.object({
|
||||
grader_api_key_env: z.string().optional(),
|
||||
grader_base_url: z.string().url().optional(),
|
||||
timeout_ms: z.number().int().min(30000).max(3600000).optional(),
|
||||
captcha: z
|
||||
.object({
|
||||
api_key_env: z.string().default('NOPECHA_API_KEY'),
|
||||
wait_timeout_ms: z.number().int().min(1000).max(120000).default(30000),
|
||||
poll_interval_ms: z.number().int().min(200).max(5000).default(1000),
|
||||
})
|
||||
.optional(),
|
||||
})
|
||||
|
||||
export type SingleAgentConfig = z.infer<typeof SingleAgentConfigSchema>
|
||||
|
||||
@@ -0,0 +1,136 @@
|
||||
import { beforeEach, describe, expect, it, mock } from 'bun:test'
|
||||
import { CaptchaWaiter } from '../../src/capture/captcha-waiter'
|
||||
|
||||
/**
 * Builds a stand-in browser whose `evaluate` replays canned results.
 *
 * Each call returns the next entry of `evaluateResults`; once the list is
 * exhausted the last entry is returned for every further call.
 *
 * @param evaluateResults - Results to hand out, in call order.
 */
function createMockBrowser(
  evaluateResults: Array<{ value?: unknown; error?: string }>,
) {
  let callIndex = 0
  return {
    evaluate: mock(async (_page: number, _expr: string) => {
      const result = evaluateResults[callIndex] ?? evaluateResults.at(-1)!
      callIndex++
      return result
    }),
    // Only `evaluate` is stubbed, so the object is cast loosely for the tests.
  } as any
}
|
||||
|
||||
// Unit tests for CaptchaWaiter, driven by a mock browser with scripted probe results.
describe('CaptchaWaiter', () => {
  let waiter: CaptchaWaiter

  beforeEach(() => {
    waiter = new CaptchaWaiter({
      waitTimeoutMs: 5000,
      pollIntervalMs: 100,
    })
  })

  it('returns immediately when no CAPTCHA detected', async () => {
    const browser = createMockBrowser([
      { value: { type: 'none', solved: false } },
    ])

    const result = await waiter.waitIfCaptchaPresent(browser, 1)

    expect(result.detected).toBe(false)
    expect(result.type).toBe('none')
    expect(result.solved).toBe(false)
    expect(browser.evaluate).toHaveBeenCalledTimes(1)
  })

  it('returns immediately when CAPTCHA already solved', async () => {
    const browser = createMockBrowser([
      { value: { type: 'recaptcha', solved: true } },
    ])

    const result = await waiter.waitIfCaptchaPresent(browser, 1)

    expect(result.detected).toBe(true)
    expect(result.type).toBe('recaptcha')
    expect(result.solved).toBe(true)
    expect(browser.evaluate).toHaveBeenCalledTimes(1)
  })

  it('polls until CAPTCHA is solved', async () => {
    const browser = createMockBrowser([
      { value: { type: 'hcaptcha', solved: false } },
      { value: { type: 'hcaptcha', solved: false } },
      { value: { type: 'hcaptcha', solved: true } },
    ])

    const result = await waiter.waitIfCaptchaPresent(browser, 1)

    expect(result.detected).toBe(true)
    expect(result.type).toBe('hcaptcha')
    expect(result.solved).toBe(true)
    expect(browser.evaluate).toHaveBeenCalledTimes(3)
  })

  it('polls until CAPTCHA disappears', async () => {
    const browser = createMockBrowser([
      { value: { type: 'turnstile', solved: false } },
      { value: { type: 'turnstile', solved: false } },
      { value: { type: 'none', solved: false } },
    ])

    const result = await waiter.waitIfCaptchaPresent(browser, 1)

    expect(result.detected).toBe(true)
    expect(result.type).toBe('turnstile')
    expect(result.solved).toBe(false)
    expect(browser.evaluate).toHaveBeenCalledTimes(3)
  })

  it('times out if CAPTCHA never solves', async () => {
    const shortWaiter = new CaptchaWaiter({
      waitTimeoutMs: 300,
      pollIntervalMs: 100,
    })

    const browser = createMockBrowser([
      { value: { type: 'recaptcha', solved: false } },
    ])

    const result = await shortWaiter.waitIfCaptchaPresent(browser, 1)

    expect(result.detected).toBe(true)
    expect(result.type).toBe('recaptcha')
    expect(result.solved).toBe(false)
    expect(result.waitDurationMs).toBeGreaterThanOrEqual(250)
  })

  it('handles browser.evaluate errors gracefully', async () => {
    const browser = createMockBrowser([{ error: 'Page crashed' }])

    const result = await waiter.waitIfCaptchaPresent(browser, 1)

    expect(result.detected).toBe(false)
    expect(result.type).toBe('none')
    expect(result.solved).toBe(false)
  })

  it('handles browser.evaluate throwing', async () => {
    const browser = {
      evaluate: mock(async () => {
        throw new Error('Connection lost')
      }),
    } as any

    const result = await waiter.waitIfCaptchaPresent(browser, 1)

    expect(result.detected).toBe(false)
    expect(result.type).toBe('none')
    expect(result.solved).toBe(false)
  })

  it('tracks wait duration', async () => {
    const browser = createMockBrowser([
      { value: { type: 'recaptcha', solved: false } },
      { value: { type: 'recaptcha', solved: false } },
      { value: { type: 'recaptcha', solved: true } },
    ])

    const result = await waiter.waitIfCaptchaPresent(browser, 1)

    expect(result.waitDurationMs).toBeGreaterThanOrEqual(150)
  })
})
|
||||
172
packages/browseros-agent/apps/eval/tests/e2e/captcha-e2e.ts
Normal file
172
packages/browseros-agent/apps/eval/tests/e2e/captcha-e2e.ts
Normal file
@@ -0,0 +1,172 @@
|
||||
/**
 * End-to-end test for CAPTCHA solver integration.
 *
 * Runs a single eval task against Google's reCAPTCHA demo page:
 *   1. Launches BrowserOS (headed) with NopeCHA extension loaded
 *   2. Agent navigates to reCAPTCHA demo, fills form
 *   3. CaptchaWaiter polls until NopeCHA solves the CAPTCHA
 *   4. Screenshot is captured AFTER solve
 *   5. Verifies: task completed, screenshots exist, metadata saved
 *
 * Prerequisites:
 *   - NOPECHA_API_KEY env var set
 *   - FIREWORKS_API_KEY env var set (or swap agent config)
 *   - NopeCHA extension at extensions/nopecha/ (run the install step from CI)
 *   - BrowserOS binary available
 *
 * Run:
 *   bun --env-file=apps/eval/.env.development apps/eval/tests/e2e/captcha-e2e.ts
 */
|
||||
|
||||
import { existsSync, readdirSync, readFileSync, rmSync } from 'node:fs'
|
||||
import { dirname, join } from 'node:path'
|
||||
import { fileURLToPath } from 'node:url'
|
||||
import { BrowserOSAppManager } from '../../src/runner/browseros-app-manager'
|
||||
import { createTaskExecutor } from '../../src/runner/task-executor'
|
||||
import { EvalConfigSchema } from '../../src/types/config'
|
||||
import { TaskSchema } from '../../src/types/task'
|
||||
|
||||
const HERE = dirname(fileURLToPath(import.meta.url))
|
||||
const OUTPUT_DIR = join(HERE, 'results')
|
||||
|
||||
/**
 * Raw eval configuration for this test; `main` validates it with
 * `EvalConfigSchema.parse` before use.
 *
 * Single worker, headed browser, and a `captcha` block naming the env var
 * that holds the NopeCHA API key.
 */
const EVAL_CONFIG = {
  agent: {
    type: 'single' as const,
    provider: 'openai-compatible' as const,
    model: 'accounts/fireworks/models/kimi-k2p5',
    apiKey: 'FIREWORKS_API_KEY',
    baseUrl: 'https://api.fireworks.ai/inference/v1',
    supportsImages: true,
  },
  dataset: 'inline',
  num_workers: 1,
  restart_server_per_task: true,
  browseros: {
    server_url: 'http://127.0.0.1:9110',
    base_cdp_port: 9010,
    base_server_port: 9110,
    base_extension_port: 9310,
    load_extensions: false,
    headless: false,
  },
  captcha: { api_key_env: 'NOPECHA_API_KEY' },
  timeout_ms: 120000,
}
|
||||
|
||||
/**
 * The single task run by this test: solve the reCAPTCHA on Google's demo page
 * and submit the form. `main` validates it with `TaskSchema.parse`.
 */
const TASK = {
  query_id: 'captcha-e2e-1',
  dataset: 'captcha-test',
  query:
    "Go to the Google reCAPTCHA demo page. Wait for the CAPTCHA to appear. Click the 'I'm not a robot' checkbox. Once the CAPTCHA is solved, fill in the 'Name' field with 'Test User' and the 'Email' field with 'test@example.com'. Then click the Submit button.",
  start_url: 'https://www.google.com/recaptcha/api2/demo',
  metadata: { original_task_id: 'captcha-e2e-1' },
}
|
||||
|
||||
// ── Helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
/** Prints `msg` to stdout with the `[captcha-e2e]` prefix. */
function log(msg: string): void {
  console.log(`[captcha-e2e] ${msg}`)
}
|
||||
|
||||
/**
 * Prints a `[FAIL]` line to stderr and terminates the process with exit code 1.
 *
 * Because the process exits inside this call, code after it (including any
 * enclosing `finally` block) does not run.
 */
function fail(msg: string): never {
  console.error(`\n[FAIL] ${msg}`)
  process.exit(1)
}
|
||||
|
||||
/** Prints a `[PASS]` line, preceded by a blank line, to stdout. */
function pass(msg: string): void {
  console.log(`\n[PASS] ${msg}`)
}
|
||||
|
||||
/**
 * Checks the prerequisites of the test and exits via `fail` on the first one
 * that is missing: the `NOPECHA_API_KEY` and `FIREWORKS_API_KEY` env vars and
 * the NopeCHA extension manifest under `../../extensions/nopecha` relative to
 * this file.
 */
function preflight(): void {
  if (!process.env.NOPECHA_API_KEY) {
    fail('NOPECHA_API_KEY env var not set')
  }
  if (!process.env.FIREWORKS_API_KEY) {
    fail('FIREWORKS_API_KEY env var not set — needed for the agent LLM')
  }
  const extDir = join(HERE, '../../extensions/nopecha')
  if (!existsSync(join(extDir, 'manifest.json'))) {
    fail(`NopeCHA extension not found at ${extDir}`)
  }
}
|
||||
|
||||
// ── Main ──────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Raised inside `main`'s try block for an expected verification failure, so
 * the failure is reported only after BrowserOS has been shut down.
 */
class VerificationFailure extends Error {}

/**
 * Runs the CAPTCHA end-to-end check.
 *
 * Steps: run preflight checks, parse config and task, clear any previous
 * output for the task, patch the NopeCHA API key, start BrowserOS, execute
 * the task, then verify that it did not fail, that `metadata.json` exists and
 * that at least one `.png` screenshot was written.
 *
 * Verification failures are collected and passed to `fail` only after the
 * `finally` block has awaited `app.killApp()`. Calling `fail` (which calls
 * `process.exit`) directly inside the `try` would terminate the process
 * before cleanup and leave BrowserOS running. Unexpected errors still
 * propagate to the caller after cleanup.
 */
async function main(): Promise<void> {
  preflight()

  const config = EvalConfigSchema.parse(EVAL_CONFIG)
  const task = TaskSchema.parse(TASK)
  const taskDir = join(OUTPUT_DIR, task.query_id)

  // Start from a clean output directory so stale artifacts cannot pass the checks.
  if (existsSync(taskDir)) {
    rmSync(taskDir, { recursive: true, force: true })
  }

  // BrowserOS is not running yet, so exiting directly via fail() is safe here.
  const captcha = config.captcha
  if (!captcha) fail('captcha config block missing')
  const apiKey = process.env[captcha.api_key_env]
  if (!apiKey) fail(`${captcha.api_key_env} env var is empty`)
  BrowserOSAppManager.patchNopechaApiKey(apiKey)

  const app = new BrowserOSAppManager(
    0,
    {
      cdp: config.browseros.base_cdp_port,
      server: config.browseros.base_server_port,
      extension: config.browseros.base_extension_port,
    },
    config.browseros.load_extensions,
    config.browseros.headless,
  )

  let failureMessage: string | null = null

  try {
    log('Starting BrowserOS stack (headed + NopeCHA extension)...')
    await app.restart()
    log(`BrowserOS ready at ${app.getServerUrl()}`)

    // Point the run at the server URL reported by the started app.
    const runConfig = {
      ...config,
      browseros: { ...config.browseros, server_url: app.getServerUrl() },
    }

    const executor = createTaskExecutor(runConfig, OUTPUT_DIR, null)
    log(`Running task: ${task.query_id}`)
    log(`  start_url: ${task.start_url}`)

    const result = await executor.execute(task)

    log(`\nTask status: ${result.status}`)

    if (result.status === 'failed') {
      const err = 'error' in result ? result.error : null
      throw new VerificationFailure(
        `Task failed: ${err?.message ?? 'unknown error'}`,
      )
    }

    const metadataPath = join(taskDir, 'metadata.json')
    if (!existsSync(metadataPath)) {
      throw new VerificationFailure('metadata.json not found')
    }
    const metadata = JSON.parse(readFileSync(metadataPath, 'utf-8'))
    log(`  Duration: ${metadata.total_duration_ms}ms`)
    log(`  Steps: ${metadata.total_steps}`)
    log(`  Termination: ${metadata.termination_reason}`)

    const screenshotDir = join(taskDir, 'screenshots')
    const screenshots = existsSync(screenshotDir)
      ? readdirSync(screenshotDir).filter((f) => f.endsWith('.png'))
      : []
    log(`  Screenshots: ${screenshots.length}`)
    if (screenshots.length === 0) {
      throw new VerificationFailure('No screenshots captured')
    }

    pass(
      `${screenshots.length} screenshots, ${metadata.total_steps} steps, ${metadata.total_duration_ms}ms`,
    )
  } catch (err) {
    // Only expected verification failures are handled here.
    if (!(err instanceof VerificationFailure)) throw err
    failureMessage = err.message
  } finally {
    log('Shutting down BrowserOS...')
    await app.killApp()
  }

  if (failureMessage !== null) fail(failureMessage)
}
|
||||
|
||||
// Entry point: any error escaping main() is printed and turned into exit code 1.
main().catch((err) => {
  console.error(err)
  process.exit(1)
})
|
||||
Reference in New Issue
Block a user