mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 23:53:25 +00:00
Compare commits
1 Commits
fix/linux-
...
fix/eval-c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
38f2d1ba88 |
@@ -2,24 +2,21 @@
|
||||
"agent": {
|
||||
"type": "orchestrator-executor",
|
||||
"orchestrator": {
|
||||
"type": "single",
|
||||
"provider": "openai-compatible",
|
||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||
"apiKey": "FIREWORKS_API_KEY",
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"supportsImages": true
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1"
|
||||
},
|
||||
"executor": {
|
||||
"provider": "openai-compatible",
|
||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||
"apiKey": "FIREWORKS_API_KEY",
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"supportsImages": true
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1"
|
||||
}
|
||||
},
|
||||
"dataset": "../data/webvoyager_e2e_test.jsonl",
|
||||
"output_dir": "../results/orchestrator-executor-webvoyager-test",
|
||||
"num_workers": 3,
|
||||
"dataset": "../data/webbench-2of4-50.jsonl",
|
||||
"num_workers": 10,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
@@ -28,8 +25,12 @@
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["performance_grader"],
|
||||
"grader_api_key_env": "OPENROUTER_API_KEY",
|
||||
"grader_base_url": "https://openrouter.ai/api/v1",
|
||||
"grader_model": "openai/gpt-4.1",
|
||||
"timeout_ms": 1200000
|
||||
"timeout_ms": 1800000
|
||||
}
|
||||
@@ -23,7 +23,10 @@
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": true
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["performance_grader"],
|
||||
"grader_api_key_env": "OPENROUTER_API_KEY",
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "orchestrator-executor",
|
||||
"orchestrator": {
|
||||
"provider": "openrouter",
|
||||
"model": "openai/gpt-4o",
|
||||
"apiKey": "OPENROUTER_API_KEY",
|
||||
"maxTurns": 3
|
||||
},
|
||||
"executor": {
|
||||
"provider": "openrouter",
|
||||
"model": "openai/gpt-4o",
|
||||
"apiKey": "OPENROUTER_API_KEY"
|
||||
}
|
||||
},
|
||||
"dataset": "../data/webvoyager_e2e_test.jsonl",
|
||||
"output_dir": "../results/debug-test",
|
||||
"num_workers": 1,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110"
|
||||
},
|
||||
"timeout_ms": 90000
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "single",
|
||||
"provider": "openai-compatible",
|
||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||
"apiKey": "FIREWORKS_API_KEY",
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/test-set.jsonl",
|
||||
"output_dir": "../results/fireworks-minimax-k2p5-test-set",
|
||||
"num_workers": 1,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110"
|
||||
},
|
||||
"grader_api_key_env": "OPENROUTER_API_KEY",
|
||||
"grader_base_url": "https://openrouter.ai/api/v1",
|
||||
"grader_model": "openai/o4-mini-high",
|
||||
"timeout_ms": 3600000
|
||||
}
|
||||
@@ -1,18 +0,0 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "single",
|
||||
"provider": "openrouter",
|
||||
"model": "openai/gpt-4.1",
|
||||
"apiKey": "OPENROUTER_API_KEY"
|
||||
},
|
||||
"dataset": "../data/mind2web_e2e_test.jsonl",
|
||||
"output_dir": "../results/mind2web-test",
|
||||
"num_workers": 5,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110"
|
||||
},
|
||||
"grader_api_key_env": "OPENROUTER_API_KEY",
|
||||
"grader_base_url": "https://openrouter.ai/api/v1",
|
||||
"grader_model": "openai/gpt-4.1",
|
||||
"timeout_ms": 300000
|
||||
}
|
||||
@@ -1,32 +0,0 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "orchestrator-executor",
|
||||
"orchestrator": {
|
||||
"provider": "openai-compatible",
|
||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||
"apiKey": "FIREWORKS_API_KEY",
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1"
|
||||
},
|
||||
"executor": {
|
||||
"provider": "clado-action",
|
||||
"model": "qwen3-vl-30b-a3b-instruct",
|
||||
"apiKey": "",
|
||||
"baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run"
|
||||
}
|
||||
},
|
||||
"dataset": "../data/webvoyager_e2e_test.jsonl",
|
||||
"output_dir": "../results/orchestrator-executor-clado-webvoyager-test",
|
||||
"num_workers": 3,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": true
|
||||
},
|
||||
"grader_api_key_env": "OPENROUTER_API_KEY",
|
||||
"grader_base_url": "https://openrouter.ai/api/v1",
|
||||
"grader_model": "openai/gpt-4.1",
|
||||
"timeout_ms": 1200000
|
||||
}
|
||||
@@ -9,12 +9,20 @@
|
||||
"turnLimit": 100
|
||||
},
|
||||
"dataset": "../data/test-set.jsonl",
|
||||
"output_dir": "../results/gemini-computer-use-test-set2",
|
||||
"num_workers": 1,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110"
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["performance_grader"],
|
||||
"grader_api_key_env": "OPENROUTER_API_KEY",
|
||||
"grader_base_url": "https://openrouter.ai/api/v1",
|
||||
"grader_model": "openai/gpt-4.1",
|
||||
@@ -6,11 +6,20 @@
|
||||
"apiKey": "OPENROUTER_API_KEY"
|
||||
},
|
||||
"dataset": "../data/mind2web.jsonl",
|
||||
"output_dir": "../results/mind2web-full",
|
||||
"num_workers": 5,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110"
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["performance_grader"],
|
||||
"grader_api_key_env": "OPENROUTER_API_KEY",
|
||||
"grader_base_url": "https://openrouter.ai/api/v1",
|
||||
"grader_model": "openai/gpt-4.1",
|
||||
@@ -8,16 +8,20 @@
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/webvoyager.jsonl",
|
||||
"output_dir": "../results/webvoyager-cdp-server",
|
||||
"num_workers": 3,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": true
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["performance_grader"],
|
||||
"grader_api_key_env": "OPENROUTER_API_KEY",
|
||||
"grader_base_url": "https://openrouter.ai/api/v1",
|
||||
"grader_model": "openai/gpt-4.1",
|
||||
@@ -9,14 +9,22 @@
|
||||
"turnLimit": 100
|
||||
},
|
||||
"dataset": "../data/test-set.jsonl",
|
||||
"output_dir": "../results/yutori-navigator",
|
||||
"num_workers": 1,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110"
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"timeout_ms": 1200000,
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["performance_grader"],
|
||||
"grader_api_key_env": "OPENROUTER_API_KEY",
|
||||
"grader_base_url": "https://openrouter.ai/api/v1",
|
||||
"grader_model": "openai/gpt-4.1"
|
||||
"grader_model": "openai/gpt-4.1",
|
||||
"timeout_ms": 1200000
|
||||
}
|
||||
@@ -1,25 +0,0 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "single",
|
||||
"provider": "openai-compatible",
|
||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||
"apiKey": "FIREWORKS_API_KEY",
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/webvoyager_e2e_test.jsonl",
|
||||
"output_dir": "../results/tool-loop-webvoyager-test",
|
||||
"num_workers": 3,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": true
|
||||
},
|
||||
"grader_api_key_env": "OPENROUTER_API_KEY",
|
||||
"grader_base_url": "https://openrouter.ai/api/v1",
|
||||
"grader_model": "openai/gpt-4.1",
|
||||
"timeout_ms": 1200000
|
||||
}
|
||||
@@ -1,25 +0,0 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "single",
|
||||
"provider": "openai-compatible",
|
||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||
"apiKey": "FIREWORKS_API_KEY",
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/webvoyager_e2e_test.jsonl",
|
||||
"output_dir": "../results/webvoyager-test",
|
||||
"num_workers": 3,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": true
|
||||
},
|
||||
"grader_api_key_env": "OPENROUTER_API_KEY",
|
||||
"grader_base_url": "https://openrouter.ai/api/v1",
|
||||
"grader_model": "openai/gpt-4.1",
|
||||
"timeout_ms": 1200000
|
||||
}
|
||||
220
packages/browseros-agent/apps/eval/scripts/test-clado-api.ts
Normal file
220
packages/browseros-agent/apps/eval/scripts/test-clado-api.ts
Normal file
@@ -0,0 +1,220 @@
|
||||
/**
 * Test script for the Clado API endpoints (grounding + action models).
 *
 * Usage:
 *   bun apps/eval/scripts/test-clado-api.ts [screenshot-path]
 *
 * If no screenshot path is provided, one is captured from a running BrowserOS
 * server (BROWSEROS_URL environment variable, default http://127.0.0.1:9110).
 */
|
||||
|
||||
import { readFile } from 'node:fs/promises'
|
||||
import { resolve } from 'node:path'
|
||||
|
||||
/** Action model inference endpoint; the script POSTs a JSON payload to it. */
const ACTION_URL =
  'https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run'

/** Action model health endpoint, probed before any inference request. */
const ACTION_HEALTH_URL =
  'https://clado-ai--clado-browseros-action-actionmodel-health.modal.run'

/** Grounding model inference endpoint; the script POSTs a JSON payload to it. */
const GROUNDING_URL =
  'https://clado-ai--clado-browseros-grounding-groundingmodel-generate.modal.run'

/** Grounding model health endpoint, probed before any inference request. */
const GROUNDING_HEALTH_URL =
  'https://clado-ai--clado-browseros-grounding-groundingmodel-health.modal.run'
|
||||
|
||||
/**
 * Probes a health endpoint with a GET request and logs the outcome.
 *
 * Never throws: every failure (network error, timeout, body read error) is
 * logged and reported as `false`.
 *
 * @param name - Label printed in the log header.
 * @param url - Health endpoint to request.
 * @returns `true` when the response has a 2xx status, otherwise `false`.
 */
async function checkHealth(name: string, url: string): Promise<boolean> {
  console.log(`\n--- ${name} health check ---`)
  console.log(` URL: ${url}`)
  const start = performance.now()
  try {
    // Abort the request if the endpoint does not answer within 30 seconds.
    const resp = await fetch(url, { signal: AbortSignal.timeout(30_000) })
    const elapsed = ((performance.now() - start) / 1000).toFixed(2)
    const body = await resp.text()
    console.log(` Status: ${resp.status} (${elapsed}s)`)
    // Only the first 200 characters are shown to keep the log readable.
    console.log(` Body: ${body.slice(0, 200)}`)
    return resp.ok
  } catch (err) {
    const elapsed = ((performance.now() - start) / 1000).toFixed(2)
    const reason = err instanceof Error ? err.message : String(err)
    console.log(` FAILED (${elapsed}s): ${reason}`)
    return false
  }
}
|
||||
|
||||
/**
 * POSTs `payload` as JSON to a model's generate endpoint and logs a summary
 * of the response (action, coordinates, text, key, inference time and any
 * `<thinking>` section found in `raw_response`).
 *
 * Never throws: HTTP errors, network errors, the 120 s timeout, invalid JSON
 * and non-object JSON bodies are all logged and reported as `null`.
 *
 * @param name - Label printed in the log header.
 * @param url - Generate endpoint to POST to.
 * @param payload - Request body; `instruction`, `image_base64` and `history`
 *   are read for logging only and are not required to be present.
 * @returns The parsed JSON object on success, otherwise `null`.
 */
async function testGenerate(
  name: string,
  url: string,
  payload: Record<string, unknown>,
): Promise<Record<string, unknown> | null> {
  console.log(`\n--- ${name} generate ---`)
  console.log(` URL: ${url}`)
  console.log(` Instruction: ${payload.instruction}`)
  // The payload is loosely typed, so narrow before measuring: a missing or
  // non-string image must not crash the script before the request is sent.
  const image = payload.image_base64
  if (typeof image === 'string') {
    console.log(` Image size: ${(image.length / 1024).toFixed(0)} KB (base64)`)
  } else {
    console.log(' Image size: n/a (payload has no image_base64 string)')
  }
  if (payload.history) console.log(` History: ${payload.history}`)

  const start = performance.now()
  try {
    const resp = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(120_000),
    })
    const elapsed = ((performance.now() - start) / 1000).toFixed(2)

    if (!resp.ok) {
      const body = await resp.text()
      console.log(` FAILED: HTTP ${resp.status} (${elapsed}s)`)
      console.log(` Body: ${body.slice(0, 400)}`)
      return null
    }

    // Validate the shape instead of asserting it: `null`, arrays and
    // primitives are valid JSON but not a usable result object.
    const parsed: unknown = await resp.json()
    if (
      typeof parsed !== 'object' ||
      parsed === null ||
      Array.isArray(parsed)
    ) {
      console.log(
        ` FAILED: expected a JSON object in the response body (${elapsed}s)`,
      )
      return null
    }
    const result = parsed as Record<string, unknown>
    const {
      text,
      key,
      inference_time_seconds: inferenceSeconds,
      raw_response: raw,
    } = result

    console.log(` Status: ${resp.status} (${elapsed}s)`)
    console.log(` Action: ${result.action}`)
    if (result.x !== null && result.x !== undefined)
      console.log(` Coordinates: (${result.x}, ${result.y})`)
    if (typeof text === 'string' && text)
      console.log(` Text: ${text.slice(0, 100)}`)
    if (key) console.log(` Key: ${key}`)
    // Compare against null/undefined so a reported time of 0 is still shown.
    if (inferenceSeconds !== null && inferenceSeconds !== undefined)
      console.log(` Inference: ${inferenceSeconds}s`)

    // Show thinking if present
    if (typeof raw === 'string') {
      const thinking = raw
        .match(/<thinking>([\s\S]*?)<\/thinking>/)?.[1]
        ?.trim()
      if (thinking !== undefined) {
        console.log(
          ` Thinking: ${thinking.slice(0, 200)}${thinking.length > 200 ? '...' : ''}`,
        )
      }
    }

    return result
  } catch (err) {
    const elapsed = ((performance.now() - start) / 1000).toFixed(2)
    console.log(
      ` FAILED (${elapsed}s): ${err instanceof Error ? err.message : String(err)}`,
    )
    return null
  }
}
|
||||
|
||||
/**
 * Returns a base64-encoded screenshot to send to the models.
 *
 * With `path`, the file at that path is read and encoded. Without it, a PNG
 * is requested from the `take_screenshot` MCP tool of a BrowserOS server at
 * `BROWSEROS_URL` (default `http://127.0.0.1:9110`).
 *
 * @param path - Optional path to an image file on disk.
 * @returns The image as a base64 string.
 * @throws If the file cannot be read or is empty, if the server cannot be
 *   reached, or if the tool response contains no image data.
 */
async function loadScreenshot(path?: string): Promise<string> {
  if (path) {
    const resolved = resolve(path)
    console.log(`Loading screenshot: ${resolved}`)
    const data = await readFile(resolved)
    // An empty file would otherwise be sent to the models as an empty image.
    if (data.length === 0) {
      throw new Error(`Screenshot file is empty: ${resolved}`)
    }
    return data.toString('base64')
  }

  // Try to capture from a running BrowserOS server.
  // `||` (not `??`) so that an empty BROWSEROS_URL also falls back to the
  // default; trailing slashes are dropped so the MCP URL is not `…//mcp`.
  const serverUrl = (
    process.env.BROWSEROS_URL || 'http://127.0.0.1:9110'
  ).replace(/\/+$/, '')
  console.log(
    `No screenshot path provided. Trying to capture from ${serverUrl}...`,
  )

  // Loaded lazily so the SDK is only required when no screenshot path is given.
  const { Client } = await import('@modelcontextprotocol/sdk/client/index.js')
  const { StreamableHTTPClientTransport } = await import(
    '@modelcontextprotocol/sdk/client/streamableHttp.js'
  )

  const client = new Client({ name: 'clado-test', version: '1.0.0' })
  const transport = new StreamableHTTPClientTransport(
    new URL(`${serverUrl}/mcp`),
    { requestInit: { headers: { 'X-BrowserOS-Source': 'sdk-internal' } } },
  )

  try {
    await client.connect(transport)
    const result = (await client.callTool({
      name: 'take_screenshot',
      arguments: { format: 'png', page: 1 },
    })) as { content: Array<{ type: string; data?: string }> }

    const imageContent = result.content?.find((c) => c.type === 'image')
    if (!imageContent?.data)
      throw new Error('No image data in screenshot response')

    console.log(
      `Captured screenshot (${(imageContent.data.length / 1024).toFixed(0)} KB base64)`,
    )
    return imageContent.data
  } finally {
    try {
      await transport.close()
    } catch {
      // Best-effort cleanup: a close failure must not mask the screenshot
      // result or the original error.
    }
  }
}
|
||||
|
||||
/**
 * Runs the smoke test: health-checks both models, loads a screenshot (from
 * `process.argv[2]` or a running BrowserOS server), then exercises the
 * generate endpoint of every healthy model.
 *
 * Exits the process with code 1 when both endpoints are unhealthy or when no
 * screenshot can be loaded.
 */
async function main(): Promise<void> {
  const screenshotPath = process.argv[2]

  console.log('=== Clado API Test ===\n')

  // Health checks (parallel)
  const [actionHealthy, groundingHealthy] = await Promise.all([
    checkHealth('Action Model', ACTION_HEALTH_URL),
    checkHealth('Grounding Model', GROUNDING_HEALTH_URL),
  ])

  if (!actionHealthy && !groundingHealthy) {
    console.log('\nBoth endpoints are down. Exiting.')
    process.exit(1)
  }

  // Load screenshot
  let imageBase64: string
  try {
    imageBase64 = await loadScreenshot(screenshotPath)
  } catch (err) {
    console.log(
      `\nFailed to load screenshot: ${err instanceof Error ? err.message : err}`,
    )
    console.log(
      'Provide a screenshot path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
    )
    process.exit(1)
  }

  const instruction = 'Click on the search button or search bar'

  // Test grounding model
  if (groundingHealthy) {
    await testGenerate('Grounding Model', GROUNDING_URL, {
      instruction,
      image_base64: imageBase64,
    })
  } else {
    console.log('\nSkipping grounding model (unhealthy)')
  }

  // Test action model (no history)
  if (actionHealthy) {
    const result = await testGenerate('Action Model (step 1)', ACTION_URL, {
      instruction,
      image_base64: imageBase64,
      history: 'None',
    })

    // Test action model with history (simulate multi-turn). Step 2 replays
    // the step-1 click as history, so it only runs when that click actually
    // carries coordinates; otherwise the history would read
    // `click(undefined, undefined)`.
    const hasClickTarget =
      result !== null &&
      result.action === 'click' &&
      result.x !== null &&
      result.x !== undefined &&
      result.y !== null &&
      result.y !== undefined
    if (result && hasClickTarget) {
      await testGenerate('Action Model (step 2, with history)', ACTION_URL, {
        instruction: 'Type "hello world" in the search bar',
        image_base64: imageBase64,
        history: `click(${result.x}, ${result.y})`,
      })
    }
  } else {
    console.log('\nSkipping action model (unhealthy)')
  }

  console.log('\n=== Done ===')
}

// Script entry point: any unexpected rejection is reported and turned into a
// non-zero exit code.
main().catch((err) => {
  console.error('Fatal:', err)
  process.exit(1)
})
|
||||
Reference in New Issue
Block a user