mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
* feat: add deterministic eval graders (AGI SDK + WebArena-Infinity) Two new benchmark integrations with programmatic grading — no LLM judge. AGI SDK / REAL Bench (52 tasks): - 11 React/Next.js clones of consumer apps (DoorDash, Amazon, Gmail, etc.) - Grader navigates browser to /finish, extracts state diff from <pre> tag - Python verifier checks exact values via jmespath queries WebArena-Infinity (50 hard tasks): - 13 LLM-generated SaaS clones (Gmail, GitLab, Linear, Figma, etc.) - InfinityAppManager starts fresh app server per task per worker - Python verifier calls /api/state and asserts on JSON state Infrastructure: - GraderInput extended with mcpUrl + infinityAppUrl for parallel workers - Each worker gets isolated ports (no cross-worker state contamination) - CI workflow: pip install agisdk, clone webarena-infinity repo * chore: switch eval configs back to kimi-k2p5 * fix: register deterministic graders in pass rate calculation Add agisdk_state_diff and infinity_state to PASS_FAIL_GRADER_ORDER in both runner types and weekly report script, so scores show correctly in the dashboard. * chore: temp switch to opus 4.6 for eval run * chore: restore kimi-k2p5 as default eval config * ci: add timeout and continue-on-error for trend report step
27 lines
684 B
JSON
Vendored
27 lines
684 B
JSON
Vendored
{
  "agent": {
    "type": "single",
    "provider": "openai-compatible",
    "model": "accounts/fireworks/models/kimi-k2p5",
    "apiKey": "FIREWORKS_API_KEY",
    "baseUrl": "https://api.fireworks.ai/inference/v1",
    "supportsImages": true
  },
  "dataset": "../data/webarena-infinity-hard-50.jsonl",
  "num_workers": 10,
  "restart_server_per_task": true,
  "browseros": {
    "server_url": "http://127.0.0.1:9110",
    "base_cdp_port": 9010,
    "base_server_port": 9110,
    "base_extension_port": 9310,
    "load_extensions": false,
    "headless": false
  },
  "captcha": {
    "api_key_env": "NOPECHA_API_KEY"
  },
  "graders": ["infinity_state"],
  "timeout_ms": 1800000
}