Compare commits

...

2 Commits

Author SHA1 Message Date
Nikhil Sonti
6956434b4f fix(eval): default agisdk smoke to openrouter 2026-04-29 13:38:21 -07:00
Nikhil Sonti
565aea10f3 fix(eval): split agisdk smoke and full configs 2026-04-29 13:34:47 -07:00
5 changed files with 36 additions and 7 deletions

View File

@@ -140,7 +140,8 @@ Each worker gets its own Chrome instance. Worker N uses `base_port + N` for CDP
| `webvoyager.jsonl` | 643 | Full WebVoyager benchmark |
| `mind2web.jsonl` | 300 | Online-Mind2Web |
| `webbench-{0,1,2}of4-50.jsonl` | 50 each | WebBench shards (50-task subsets) |
| `agisdk-real.jsonl` | 40 | AGI SDK / REAL Bench (action-only tasks) |
| `agisdk-real-smoke.jsonl` | 1 | AGI SDK / REAL Bench smoke task |
| `agisdk-real.jsonl` | 36 | AGI SDK / REAL Bench (action-only tasks) |
| `webarena-infinity-hard-50.jsonl` | 50 | WebArena-Infinity hard set |
| `browsecomp-medium-hard-50.jsonl` | 50 | BrowseComp medium-hard |
| `browsecomp-very-hard-50.jsonl` | 50 | BrowseComp very-hard |

View File

@@ -2,13 +2,13 @@
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1",
"model": "moonshotai/kimi-k2.5",
"apiKey": "OPENROUTER_API_KEY",
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
},
"dataset": "../data/agisdk-real.jsonl",
"num_workers": 4,
"dataset": "../data/agisdk-real-smoke.jsonl",
"num_workers": 1,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",

View File

@@ -0,0 +1,26 @@
{
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1",
"supportsImages": true
},
"dataset": "../data/agisdk-real.jsonl",
"num_workers": 4,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["agisdk_state_diff"],
"timeout_ms": 1800000
}

View File

@@ -0,0 +1 @@
{"query_id": "agisdk-dashdish-10", "dataset": "agisdk-real", "query": "Place an order from \"Souvla\" for a \"Medium Classic Cheeseburger\" and a \"Small Bacon Double Cheeseburger\" with \"Standard Delivery\" as the method with the default charged options.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-10", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}

View File

@@ -32,7 +32,8 @@ Preset configs in configs/:
- browseros-agent-weekly.json Weekly eval (single agent)
- browseros-oe-agent-weekly.json Weekly eval (orchestrator + LLM executor)
- browseros-oe-clado-weekly.json Weekly eval (orchestrator + Clado executor)
- agisdk-real-smoke.json AGI SDK smoke run
- agisdk-real-smoke.json AGI SDK smoke run (1 task)
- agisdk-real.json AGI SDK full run (36 tasks)
- infinity-hard-50.json WebArena-Infinity hard-50 set
- test-webvoyager.json WebVoyager test
- test-mind2web.json Mind2Web test