Compare commits

...

1 Commits

Author SHA1 Message Date
shivammittal274
38f2d1ba88 fix(eval): clean up eval configs and add test-clado-api script
Consolidate 13 configs down to 7 with uniform settings:
- 3 weekly (CI): browseros-agent, browseros-oe-agent, browseros-oe-clado
- 4 test (local): test_gemini-computer-use, test_yutori-navigator, test_webvoyager, test_mind2web
- All configs: headless=false, captcha block, full browseros ports, restart_server_per_task

Deleted: debug-test, mind2web-test, tool-loop-test, orchestrator-executor-test,
orchestrator-executor-clado-test, fireworks-minimax-m2, webvoyager-test

Added: test-clado-api.ts script, browseros-oe-agent-weekly.json (OE with AI SDK executor)
2026-03-24 01:25:18 +05:30
13 changed files with 273 additions and 164 deletions

View File

@@ -2,24 +2,21 @@
"agent": {
"type": "orchestrator-executor",
"orchestrator": {
"type": "single",
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1",
"supportsImages": true
"baseUrl": "https://api.fireworks.ai/inference/v1"
},
"executor": {
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1",
"supportsImages": true
"baseUrl": "https://api.fireworks.ai/inference/v1"
}
},
"dataset": "../data/webvoyager_e2e_test.jsonl",
"output_dir": "../results/orchestrator-executor-webvoyager-test",
"num_workers": 3,
"dataset": "../data/webbench-2of4-50.jsonl",
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
@@ -28,8 +25,12 @@
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1200000
"timeout_ms": 1800000
}

View File

@@ -23,7 +23,10 @@
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": true
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",

View File

@@ -1,23 +0,0 @@
{
"agent": {
"type": "orchestrator-executor",
"orchestrator": {
"provider": "openrouter",
"model": "openai/gpt-4o",
"apiKey": "OPENROUTER_API_KEY",
"maxTurns": 3
},
"executor": {
"provider": "openrouter",
"model": "openai/gpt-4o",
"apiKey": "OPENROUTER_API_KEY"
}
},
"dataset": "../data/webvoyager_e2e_test.jsonl",
"output_dir": "../results/debug-test",
"num_workers": 1,
"browseros": {
"server_url": "http://127.0.0.1:9110"
},
"timeout_ms": 90000
}

View File

@@ -1,21 +0,0 @@
{
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1",
"supportsImages": true
},
"dataset": "../data/test-set.jsonl",
"output_dir": "../results/fireworks-minimax-k2p5-test-set",
"num_workers": 1,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110"
},
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/o4-mini-high",
"timeout_ms": 3600000
}

View File

@@ -1,18 +0,0 @@
{
"agent": {
"type": "single",
"provider": "openrouter",
"model": "openai/gpt-4.1",
"apiKey": "OPENROUTER_API_KEY"
},
"dataset": "../data/mind2web_e2e_test.jsonl",
"output_dir": "../results/mind2web-test",
"num_workers": 5,
"browseros": {
"server_url": "http://127.0.0.1:9110"
},
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 300000
}

View File

@@ -1,32 +0,0 @@
{
"agent": {
"type": "orchestrator-executor",
"orchestrator": {
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1"
},
"executor": {
"provider": "clado-action",
"model": "qwen3-vl-30b-a3b-instruct",
"apiKey": "",
"baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run"
}
},
"dataset": "../data/webvoyager_e2e_test.jsonl",
"output_dir": "../results/orchestrator-executor-clado-webvoyager-test",
"num_workers": 3,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": true
},
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1200000
}

View File

@@ -9,12 +9,20 @@
"turnLimit": 100
},
"dataset": "../data/test-set.jsonl",
"output_dir": "../results/gemini-computer-use-test-set2",
"num_workers": 1,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110"
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",

View File

@@ -6,11 +6,20 @@
"apiKey": "OPENROUTER_API_KEY"
},
"dataset": "../data/mind2web.jsonl",
"output_dir": "../results/mind2web-full",
"num_workers": 5,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110"
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",

View File

@@ -8,16 +8,20 @@
"supportsImages": true
},
"dataset": "../data/webvoyager.jsonl",
"output_dir": "../results/webvoyager-cdp-server",
"num_workers": 3,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": true
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",

View File

@@ -9,14 +9,22 @@
"turnLimit": 100
},
"dataset": "../data/test-set.jsonl",
"output_dir": "../results/yutori-navigator",
"num_workers": 1,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110"
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"timeout_ms": 1200000,
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1"
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1200000
}

View File

@@ -1,25 +0,0 @@
{
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1",
"supportsImages": true
},
"dataset": "../data/webvoyager_e2e_test.jsonl",
"output_dir": "../results/tool-loop-webvoyager-test",
"num_workers": 3,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": true
},
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1200000
}

View File

@@ -1,25 +0,0 @@
{
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1",
"supportsImages": true
},
"dataset": "../data/webvoyager_e2e_test.jsonl",
"output_dir": "../results/webvoyager-test",
"num_workers": 3,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": true
},
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1200000
}

View File

@@ -0,0 +1,220 @@
/**
* Test script for Clado API endpoints (grounding + action models)
*
* Usage:
* bun apps/eval/scripts/test-clado-api.ts [screenshot-path]
*
* If no screenshot provided, captures one from a running BrowserOS server.
*/
import { readFile } from 'node:fs/promises'
import { resolve } from 'node:path'
const ACTION_URL =
'https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run'
const ACTION_HEALTH_URL =
'https://clado-ai--clado-browseros-action-actionmodel-health.modal.run'
const GROUNDING_URL =
'https://clado-ai--clado-browseros-grounding-groundingmodel-generate.modal.run'
const GROUNDING_HEALTH_URL =
'https://clado-ai--clado-browseros-grounding-groundingmodel-health.modal.run'
async function checkHealth(name: string, url: string): Promise<boolean> {
console.log(`\n--- ${name} health check ---`)
console.log(` URL: ${url}`)
const start = performance.now()
try {
const resp = await fetch(url, { signal: AbortSignal.timeout(30_000) })
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
const body = await resp.text()
console.log(` Status: ${resp.status} (${elapsed}s)`)
console.log(` Body: ${body.slice(0, 200)}`)
return resp.ok
} catch (err) {
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
console.log(
` FAILED (${elapsed}s): ${err instanceof Error ? err.message : err}`,
)
return false
}
}
async function testGenerate(
name: string,
url: string,
payload: Record<string, unknown>,
): Promise<Record<string, unknown> | null> {
console.log(`\n--- ${name} generate ---`)
console.log(` URL: ${url}`)
console.log(` Instruction: ${payload.instruction}`)
console.log(
` Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`,
)
if (payload.history) console.log(` History: ${payload.history}`)
const start = performance.now()
try {
const resp = await fetch(url, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload),
signal: AbortSignal.timeout(120_000),
})
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
if (!resp.ok) {
const body = await resp.text()
console.log(` FAILED: HTTP ${resp.status} (${elapsed}s)`)
console.log(` Body: ${body.slice(0, 400)}`)
return null
}
const result = (await resp.json()) as Record<string, unknown>
console.log(` Status: ${resp.status} (${elapsed}s)`)
console.log(` Action: ${result.action}`)
if (result.x !== null && result.x !== undefined)
console.log(` Coordinates: (${result.x}, ${result.y})`)
if (result.text)
console.log(` Text: ${(result.text as string).slice(0, 100)}`)
if (result.key) console.log(` Key: ${result.key}`)
if (result.inference_time_seconds)
console.log(` Inference: ${result.inference_time_seconds}s`)
// Show thinking if present
const raw = result.raw_response as string | undefined
if (raw) {
const thinkMatch = raw.match(/<thinking>([\s\S]*?)<\/thinking>/)
if (thinkMatch) {
const thinking = thinkMatch[1].trim()
console.log(
` Thinking: ${thinking.slice(0, 200)}${thinking.length > 200 ? '...' : ''}`,
)
}
}
return result
} catch (err) {
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
console.log(
` FAILED (${elapsed}s): ${err instanceof Error ? err.message : err}`,
)
return null
}
}
async function loadScreenshot(path?: string): Promise<string> {
if (path) {
const resolved = resolve(path)
console.log(`Loading screenshot: ${resolved}`)
const data = await readFile(resolved)
return data.toString('base64')
}
// Try to capture from a running BrowserOS server
const serverUrl = process.env.BROWSEROS_URL || 'http://127.0.0.1:9110'
console.log(
`No screenshot path provided. Trying to capture from ${serverUrl}...`,
)
const { Client } = await import('@modelcontextprotocol/sdk/client/index.js')
const { StreamableHTTPClientTransport } = await import(
'@modelcontextprotocol/sdk/client/streamableHttp.js'
)
const client = new Client({ name: 'clado-test', version: '1.0.0' })
const transport = new StreamableHTTPClientTransport(
new URL(`${serverUrl}/mcp`),
{ requestInit: { headers: { 'X-BrowserOS-Source': 'sdk-internal' } } },
)
try {
await client.connect(transport)
const result = (await client.callTool({
name: 'take_screenshot',
arguments: { format: 'png', page: 1 },
})) as { content: Array<{ type: string; data?: string }> }
const imageContent = result.content?.find((c) => c.type === 'image')
if (!imageContent?.data)
throw new Error('No image data in screenshot response')
console.log(
`Captured screenshot (${(imageContent.data.length / 1024).toFixed(0)} KB base64)`,
)
return imageContent.data
} finally {
try {
await transport.close()
} catch {}
}
}
async function main() {
const screenshotPath = process.argv[2]
console.log('=== Clado API Test ===\n')
// Health checks (parallel)
const [actionHealthy, groundingHealthy] = await Promise.all([
checkHealth('Action Model', ACTION_HEALTH_URL),
checkHealth('Grounding Model', GROUNDING_HEALTH_URL),
])
if (!actionHealthy && !groundingHealthy) {
console.log('\nBoth endpoints are down. Exiting.')
process.exit(1)
}
// Load screenshot
let imageBase64: string
try {
imageBase64 = await loadScreenshot(screenshotPath)
} catch (err) {
console.log(
`\nFailed to load screenshot: ${err instanceof Error ? err.message : err}`,
)
console.log(
'Provide a screenshot path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
)
process.exit(1)
}
const instruction = 'Click on the search button or search bar'
// Test grounding model
if (groundingHealthy) {
await testGenerate('Grounding Model', GROUNDING_URL, {
instruction,
image_base64: imageBase64,
})
} else {
console.log('\nSkipping grounding model (unhealthy)')
}
// Test action model (no history)
if (actionHealthy) {
const result = await testGenerate('Action Model (step 1)', ACTION_URL, {
instruction,
image_base64: imageBase64,
history: 'None',
})
// Test action model with history (simulate multi-turn)
if (result && result.action === 'click') {
await testGenerate('Action Model (step 2, with history)', ACTION_URL, {
instruction: 'Type "hello world" in the search bar',
image_base64: imageBase64,
history: `click(${result.x}, ${result.y})`,
})
}
} else {
console.log('\nSkipping action model (unhealthy)')
}
console.log('\n=== Done ===')
}
main().catch((err) => {
console.error('Fatal:', err)
process.exit(1)
})