mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-14 16:14:28 +00:00
Compare commits
2 Commits
fix/github
...
fix/eval-3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6956434b4f | ||
|
|
565aea10f3 |
3
packages/browseros-agent/apps/eval/README.md
vendored
3
packages/browseros-agent/apps/eval/README.md
vendored
@@ -140,7 +140,8 @@ Each worker gets its own Chrome instance. Worker N uses `base_port + N` for CDP
|
||||
| `webvoyager.jsonl` | 643 | Full WebVoyager benchmark |
|
||||
| `mind2web.jsonl` | 300 | Online-Mind2Web |
|
||||
| `webbench-{0,1,2}of4-50.jsonl` | 50 each | WebBench shards (50-task subsets) |
|
||||
| `agisdk-real.jsonl` | 40 | AGI SDK / REAL Bench (action-only tasks) |
|
||||
| `agisdk-real-smoke.jsonl` | 1 | AGI SDK / REAL Bench smoke task |
|
||||
| `agisdk-real.jsonl` | 36 | AGI SDK / REAL Bench (action-only tasks) |
|
||||
| `webarena-infinity-hard-50.jsonl` | 50 | WebArena-Infinity hard set |
|
||||
| `browsecomp-medium-hard-50.jsonl` | 50 | BrowseComp medium-hard |
|
||||
| `browsecomp-very-hard-50.jsonl` | 50 | BrowseComp very-hard |
|
||||
|
||||
@@ -2,13 +2,13 @@
|
||||
"agent": {
|
||||
"type": "single",
|
||||
"provider": "openai-compatible",
|
||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||
"apiKey": "FIREWORKS_API_KEY",
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"model": "moonshotai/kimi-k2.5",
|
||||
"apiKey": "OPENROUTER_API_KEY",
|
||||
"baseUrl": "https://openrouter.ai/api/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/agisdk-real.jsonl",
|
||||
"num_workers": 4,
|
||||
"dataset": "../data/agisdk-real-smoke.jsonl",
|
||||
"num_workers": 1,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
|
||||
26
packages/browseros-agent/apps/eval/configs/agisdk-real.json
vendored
Normal file
26
packages/browseros-agent/apps/eval/configs/agisdk-real.json
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "single",
|
||||
"provider": "openai-compatible",
|
||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||
"apiKey": "FIREWORKS_API_KEY",
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/agisdk-real.jsonl",
|
||||
"num_workers": 4,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["agisdk_state_diff"],
|
||||
"timeout_ms": 1800000
|
||||
}
|
||||
1
packages/browseros-agent/apps/eval/data/agisdk-real-smoke.jsonl
vendored
Normal file
1
packages/browseros-agent/apps/eval/data/agisdk-real-smoke.jsonl
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"query_id": "agisdk-dashdish-10", "dataset": "agisdk-real", "query": "Place an order from \"Souvla\" for a \"Medium Classic Cheeseburger\" and a \"Small Bacon Double Cheeseburger\" with \"Standard Delivery\" as the method with the default charged options.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-10", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
|
||||
@@ -32,7 +32,8 @@ Preset configs in configs/:
|
||||
- browseros-agent-weekly.json Weekly eval (single agent)
|
||||
- browseros-oe-agent-weekly.json Weekly eval (orchestrator + LLM executor)
|
||||
- browseros-oe-clado-weekly.json Weekly eval (orchestrator + Clado executor)
|
||||
- agisdk-real-smoke.json AGI SDK smoke run
|
||||
- agisdk-real-smoke.json AGI SDK smoke run (1 task)
|
||||
- agisdk-real.json AGI SDK full run (36 tasks)
|
||||
- infinity-hard-50.json WebArena-Infinity hard-50 set
|
||||
- test-webvoyager.json WebVoyager test
|
||||
- test-mind2web.json Mind2Web test
|
||||
|
||||
Reference in New Issue
Block a user