mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 23:53:25 +00:00
Compare commits
17 Commits
fix/debug-
...
fix/eval-4
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6ee306236e | ||
|
|
0afc59cda1 | ||
|
|
eb8faa931a | ||
|
|
be70170313 | ||
|
|
0661197f5b | ||
|
|
c4e7824266 | ||
|
|
22f71a36c5 | ||
|
|
d49986d0b3 | ||
|
|
acdd394585 | ||
|
|
219fdf1e28 | ||
|
|
014f71d227 | ||
|
|
876dea4d56 | ||
|
|
fca7d4cbcb | ||
|
|
e1bfadb075 | ||
|
|
aa0d9b96ef | ||
|
|
1c9604b5fa | ||
|
|
685266a1d8 |
29
.github/workflows/eval-weekly.yml
vendored
29
.github/workflows/eval-weekly.yml
vendored
@@ -14,7 +14,7 @@ on:
|
||||
config:
|
||||
description: 'Eval config file (relative to apps/eval/)'
|
||||
required: false
|
||||
default: 'configs/browseros-agent-weekly.json'
|
||||
default: 'configs/legacy/browseros-agent-weekly.json'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -62,36 +62,27 @@ jobs:
|
||||
curl -sL -o /tmp/nopecha.zip https://github.com/NopeCHALLC/nopecha-extension/releases/latest/download/chromium_automation.zip
|
||||
unzip -qo /tmp/nopecha.zip -d extensions/nopecha
|
||||
|
||||
- name: Run eval
|
||||
- name: Run eval and publish to R2
|
||||
working-directory: packages/browseros-agent/apps/eval
|
||||
env:
|
||||
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
|
||||
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
|
||||
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
|
||||
NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
|
||||
BROWSEROS_BINARY: /usr/bin/browseros
|
||||
WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
|
||||
# OpenClaw container runtime is macOS-only; opt the Linux runner
|
||||
# into the no-op stub so the server can boot and the eval can run.
|
||||
BROWSEROS_SKIP_OPENCLAW: '1'
|
||||
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
|
||||
run: |
|
||||
echo "Running eval with config: $EVAL_CONFIG"
|
||||
xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts -c "$EVAL_CONFIG"
|
||||
|
||||
- name: Upload runs to R2
|
||||
if: success()
|
||||
working-directory: packages/browseros-agent/apps/eval
|
||||
env:
|
||||
EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
|
||||
EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
|
||||
EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
|
||||
EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
|
||||
EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
|
||||
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
|
||||
BROWSEROS_BINARY: /usr/bin/browseros
|
||||
WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
|
||||
# OpenClaw container runtime is macOS-only; opt the Linux runner
|
||||
# into the no-op stub so the server can boot and the eval can run.
|
||||
BROWSEROS_SKIP_OPENCLAW: '1'
|
||||
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
|
||||
run: |
|
||||
CONFIG_NAME=$(basename "$EVAL_CONFIG" .json)
|
||||
bun scripts/upload-run.ts "results/$CONFIG_NAME"
|
||||
echo "Running eval with config: $EVAL_CONFIG"
|
||||
xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts suite --config "$EVAL_CONFIG" --publish r2
|
||||
|
||||
- name: Generate trend report
|
||||
if: success()
|
||||
|
||||
51
packages/browseros-agent/apps/eval/.env.example
vendored
Normal file
51
packages/browseros-agent/apps/eval/.env.example
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
# Copy to .env.development for local eval runs.
|
||||
|
||||
# Provider keys used by existing config files.
|
||||
OPENROUTER_API_KEY=
|
||||
FIREWORKS_API_KEY=
|
||||
ANTHROPIC_API_KEY=
|
||||
OPENAI_API_KEY=
|
||||
GOOGLE_GENERATIVE_AI_API_KEY=
|
||||
|
||||
# Claude Agent SDK token used by performance_grader.
|
||||
CLAUDE_CODE_OAUTH_TOKEN=
|
||||
|
||||
# Suite-mode model selection.
|
||||
EVAL_VARIANT=local
|
||||
EVAL_AGENT_PROVIDER=openai-compatible
|
||||
EVAL_AGENT_MODEL=
|
||||
EVAL_AGENT_API_KEY=
|
||||
EVAL_AGENT_BASE_URL=
|
||||
EVAL_AGENT_SUPPORTS_IMAGES=true
|
||||
|
||||
# Optional suite-mode executor override for orchestrator suites.
|
||||
EVAL_EXECUTOR_MODEL=
|
||||
EVAL_EXECUTOR_API_KEY=
|
||||
EVAL_EXECUTOR_BASE_URL=
|
||||
|
||||
# Clado visual action executor.
|
||||
CLADO_ACTION_MODEL=
|
||||
CLADO_ACTION_API_KEY=
|
||||
CLADO_ACTION_BASE_URL=
|
||||
# Backward-compatible alias used by older local scripts.
|
||||
CLADO_ACTION_URL=
|
||||
|
||||
# BrowserOS runner.
|
||||
BROWSEROS_BINARY=/Applications/BrowserOS.app/Contents/MacOS/BrowserOS
|
||||
BROWSEROS_SERVER_URL=http://127.0.0.1:9110
|
||||
BROWSEROS_SERVER_LOG_DIR=/tmp/browseros-server-logs
|
||||
BROWSEROS_CONFIG_URL=
|
||||
|
||||
# Captcha solver extension.
|
||||
NOPECHA_API_KEY=
|
||||
|
||||
# WebArena-Infinity.
|
||||
WEBARENA_INFINITY_DIR=
|
||||
INFINITY_APP_URL=
|
||||
|
||||
# R2 publishing and weekly report.
|
||||
EVAL_R2_ACCOUNT_ID=
|
||||
EVAL_R2_ACCESS_KEY_ID=
|
||||
EVAL_R2_SECRET_ACCESS_KEY=
|
||||
EVAL_R2_BUCKET=browseros-eval
|
||||
EVAL_R2_CDN_BASE_URL=https://eval.browseros.com
|
||||
81
packages/browseros-agent/apps/eval/README.md
vendored
81
packages/browseros-agent/apps/eval/README.md
vendored
@@ -14,6 +14,7 @@ Evaluation framework for BrowserOS browser automation agents. Runs tasks from st
|
||||
|
||||
```bash
|
||||
cd apps/eval
|
||||
cp .env.example .env.development
|
||||
# Edit .env.development with your keys, then:
|
||||
bun run eval
|
||||
```
|
||||
@@ -23,11 +24,55 @@ Opens the eval dashboard at `http://localhost:9900` in config mode. From there:
|
||||
### CLI mode
|
||||
|
||||
```bash
|
||||
bun run eval -c configs/browseros-agent-weekly.json
|
||||
bun run eval -c configs/legacy/browseros-agent-weekly.json
|
||||
bun run eval suite --config configs/legacy/browseros-agent-weekly.json --publish r2
|
||||
```
|
||||
|
||||
Runs immediately. Dashboard still available at `http://localhost:9900` for live progress.
|
||||
|
||||
The `suite` command is the workflow-compatible full loop: execute tasks, run graders, write artifacts, and optionally publish to R2. The old `-c` form remains supported during migration.
|
||||
|
||||
```bash
|
||||
bun run eval run --config configs/legacy/browseros-agent-weekly.json
|
||||
bun run eval suite --suite configs/suites/agisdk-daily-10.json --variant kimi-fireworks --publish r2
|
||||
bun run eval grade --run results/browseros-agent-weekly/2026-04-29-1430
|
||||
bun run eval publish --run results/browseros-agent-weekly/2026-04-29-1430 --target r2
|
||||
```
|
||||
|
||||
Config files live in two groups:
|
||||
|
||||
```txt
|
||||
configs/legacy/ # Complete EvalConfig files used by older workflows and the dashboard
|
||||
configs/suites/ # Suite definitions; model/provider comes from CLI flags or env
|
||||
```
|
||||
|
||||
Suite mode takes model settings from CLI flags first, then env:
|
||||
|
||||
```bash
|
||||
EVAL_VARIANT=kimi-fireworks \
|
||||
EVAL_AGENT_PROVIDER=openai-compatible \
|
||||
EVAL_AGENT_MODEL=accounts/fireworks/models/kimi-k2p5 \
|
||||
EVAL_AGENT_API_KEY=$FIREWORKS_API_KEY \
|
||||
EVAL_AGENT_BASE_URL=https://api.fireworks.ai/inference/v1 \
|
||||
bun run eval suite --suite configs/suites/agisdk-daily-10.json --publish r2
|
||||
```
|
||||
|
||||
### Suites and variants
|
||||
|
||||
A **suite** is what we run: the task dataset, graders, worker count, timeout, and browser settings. For example, `agisdk-daily-10` means "run these 10 AGI SDK tasks and grade them with `agisdk_state_diff`."
|
||||
|
||||
A **variant** is the model setup we are testing on that suite. `EVAL_VARIANT` is just the human-readable name for that setup. The actual model connection still comes from `EVAL_AGENT_PROVIDER`, `EVAL_AGENT_MODEL`, `EVAL_AGENT_API_KEY`, and `EVAL_AGENT_BASE_URL`.
|
||||
|
||||
This lets us run the same suite against multiple model setups without copying the benchmark config:
|
||||
|
||||
```txt
|
||||
agisdk-daily-10 + kimi-fireworks
|
||||
agisdk-daily-10 + claude-sonnet
|
||||
agisdk-daily-10 + clado-action-000159
|
||||
```
|
||||
|
||||
For `orchestrator-executor` suites, there can also be an executor model/backend. The `EVAL_AGENT_*` vars describe the main agent or orchestrator. The optional `EVAL_EXECUTOR_*` or `CLADO_ACTION_*` vars describe the delegated executor.
|
||||
|
||||
## Agent types
|
||||
|
||||
| Type | Description |
|
||||
@@ -96,6 +141,20 @@ The `apiKey` field supports two formats:
|
||||
- **Env var name**: `"OPENAI_API_KEY"` — resolved from `.env.development` at runtime
|
||||
- **Direct value**: `"sk-xxxxx"` — used as-is (not recommended)
|
||||
|
||||
### Environment variables
|
||||
|
||||
| Variable | Used for |
|
||||
|----------|----------|
|
||||
| `EVAL_AGENT_PROVIDER`, `EVAL_AGENT_MODEL`, `EVAL_AGENT_API_KEY`, `EVAL_AGENT_BASE_URL`, `EVAL_AGENT_SUPPORTS_IMAGES` | Suite variant model selection |
|
||||
| `FIREWORKS_API_KEY`, `OPENROUTER_API_KEY`, `ANTHROPIC_API_KEY`, provider-specific keys | Config-file or provider-backed model calls |
|
||||
| `EVAL_EXECUTOR_MODEL`, `EVAL_EXECUTOR_API_KEY`, `EVAL_EXECUTOR_BASE_URL` | Suite-mode orchestrator executor override |
|
||||
| `CLADO_ACTION_MODEL`, `CLADO_ACTION_API_KEY`, `CLADO_ACTION_BASE_URL` | Clado executor defaults |
|
||||
| `BROWSEROS_BINARY` | BrowserOS binary path in CI/local smoke runs |
|
||||
| `BROWSEROS_SERVER_URL` | Optional grader MCP URL override |
|
||||
| `WEBARENA_INFINITY_DIR` | Local WebArena-Infinity checkout for Infinity tasks |
|
||||
| `NOPECHA_API_KEY` | CAPTCHA solver extension |
|
||||
| `EVAL_R2_ACCOUNT_ID`, `EVAL_R2_ACCESS_KEY_ID`, `EVAL_R2_SECRET_ACCESS_KEY`, `EVAL_R2_BUCKET`, `EVAL_R2_CDN_BASE_URL` | R2 upload and viewer URL |
|
||||
|
||||
### Supported providers
|
||||
|
||||
| Provider | `provider` value | Requires `baseUrl` |
|
||||
@@ -110,6 +169,20 @@ The `apiKey` field supports two formats:
|
||||
| Ollama | `ollama` | No |
|
||||
| Clado Action (executor only) | `clado-action` | Yes |
|
||||
|
||||
### R2 publishing
|
||||
|
||||
`suite --config ... --publish r2` and `publish --target r2` upload the run artifacts plus `viewer.html` to the viewer-compatible R2 layout:
|
||||
|
||||
```bash
|
||||
export EVAL_R2_ACCOUNT_ID=...
|
||||
export EVAL_R2_ACCESS_KEY_ID=...
|
||||
export EVAL_R2_SECRET_ACCESS_KEY=...
|
||||
export EVAL_R2_BUCKET=browseros-eval
|
||||
export EVAL_R2_CDN_BASE_URL=https://eval.browseros.com
|
||||
```
|
||||
|
||||
Published runs are available at `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
|
||||
|
||||
### BrowserOS infrastructure
|
||||
|
||||
```json
|
||||
@@ -137,6 +210,7 @@ Each worker gets its own Chrome instance. Worker N uses `base_port + N` for CDP
|
||||
|
||||
| File | Tasks | Description |
|
||||
|------|-------|-------------|
|
||||
| `agisdk-daily-10.jsonl` | 10 | Daily AGI SDK / REAL Bench subset |
|
||||
| `webvoyager.jsonl` | 643 | Full WebVoyager benchmark |
|
||||
| `mind2web.jsonl` | 300 | Online-Mind2Web |
|
||||
| `webbench-{0,1,2}of4-50.jsonl` | 50 each | WebBench shards (50-task subsets) |
|
||||
@@ -168,14 +242,19 @@ results/
|
||||
browseros-agent-weekly/
|
||||
2026-04-29-1430/
|
||||
Amazon--0/
|
||||
attempt.json # Stable attempt summary for viewer/reporting
|
||||
metadata.json # Task result, timing, grader scores
|
||||
grades.json # Compact grader results
|
||||
messages.jsonl # Full message log
|
||||
grader-artifacts/ # Grader-specific inputs/outputs/stderr
|
||||
screenshots/
|
||||
001.png # Step-by-step screenshots
|
||||
002.png
|
||||
summary.json # Aggregate pass rates
|
||||
```
|
||||
|
||||
R2 publishing preserves the same task files under `runs/<run-id>/...`, writes `runs/<run-id>/manifest.json`, and uploads `viewer.html` at the bucket root. The viewer URL is `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**BrowserOS not found**: Expects `/Applications/BrowserOS.app/Contents/MacOS/BrowserOS`. Set `BROWSEROS_BINARY` to override.
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
"baseUrl": "https://openrouter.ai/api/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/agisdk-real-smoke.jsonl",
|
||||
"dataset": "../../data/agisdk-real-smoke.jsonl",
|
||||
"num_workers": 1,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
@@ -7,7 +7,7 @@
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/agisdk-real.jsonl",
|
||||
"dataset": "../../data/agisdk-real.jsonl",
|
||||
"num_workers": 4,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
@@ -7,7 +7,7 @@
|
||||
"baseUrl": "https://openrouter.ai/api/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/webbench-2of4-50.jsonl",
|
||||
"dataset": "../../data/webbench-2of4-50.jsonl",
|
||||
"num_workers": 10,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
@@ -14,7 +14,7 @@
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1"
|
||||
}
|
||||
},
|
||||
"dataset": "../data/webbench-2of4-50.jsonl",
|
||||
"dataset": "../../data/webbench-2of4-50.jsonl",
|
||||
"num_workers": 10,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
@@ -14,7 +14,7 @@
|
||||
"baseUrl": "https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run"
|
||||
}
|
||||
},
|
||||
"dataset": "../data/agisdk-real.jsonl",
|
||||
"dataset": "../../data/agisdk-real.jsonl",
|
||||
"num_workers": 10,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
@@ -7,7 +7,7 @@
|
||||
"baseUrl": "https://openrouter.ai/api/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/webarena-infinity-hard-50.jsonl",
|
||||
"dataset": "../../data/webarena-infinity-hard-50.jsonl",
|
||||
"num_workers": 10,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
@@ -5,7 +5,7 @@
|
||||
"model": "openai/gpt-4.1",
|
||||
"apiKey": "OPENROUTER_API_KEY"
|
||||
},
|
||||
"dataset": "../data/mind2web.jsonl",
|
||||
"dataset": "../../data/mind2web.jsonl",
|
||||
"num_workers": 5,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
@@ -7,7 +7,7 @@
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/webvoyager.jsonl",
|
||||
"dataset": "../../data/webvoyager.jsonl",
|
||||
"num_workers": 3,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
22
packages/browseros-agent/apps/eval/configs/suites/agisdk-daily-10.json
vendored
Normal file
22
packages/browseros-agent/apps/eval/configs/suites/agisdk-daily-10.json
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"id": "agisdk-daily-10",
|
||||
"dataset": "../../data/agisdk-daily-10.jsonl",
|
||||
"agent": {
|
||||
"type": "single"
|
||||
},
|
||||
"graders": ["agisdk_state_diff"],
|
||||
"workers": 1,
|
||||
"restartBrowserPerTask": true,
|
||||
"timeoutMs": 1800000,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": true
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
}
|
||||
}
|
||||
22
packages/browseros-agent/apps/eval/configs/suites/agisdk-real-smoke.json
vendored
Normal file
22
packages/browseros-agent/apps/eval/configs/suites/agisdk-real-smoke.json
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"id": "agisdk-real-smoke",
|
||||
"dataset": "../../data/agisdk-real-smoke.jsonl",
|
||||
"agent": {
|
||||
"type": "single"
|
||||
},
|
||||
"graders": ["agisdk_state_diff"],
|
||||
"workers": 1,
|
||||
"restartBrowserPerTask": true,
|
||||
"timeoutMs": 1800000,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
}
|
||||
}
|
||||
22
packages/browseros-agent/apps/eval/configs/suites/agisdk-real.json
vendored
Normal file
22
packages/browseros-agent/apps/eval/configs/suites/agisdk-real.json
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"id": "agisdk-real",
|
||||
"dataset": "../../data/agisdk-real.jsonl",
|
||||
"agent": {
|
||||
"type": "single"
|
||||
},
|
||||
"graders": ["agisdk_state_diff"],
|
||||
"workers": 1,
|
||||
"restartBrowserPerTask": true,
|
||||
"timeoutMs": 1800000,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
}
|
||||
}
|
||||
10
packages/browseros-agent/apps/eval/data/agisdk-daily-10.jsonl
vendored
Normal file
10
packages/browseros-agent/apps/eval/data/agisdk-daily-10.jsonl
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
{"query_id": "agisdk-dashdish-10", "dataset": "agisdk-real", "query": "Place an order from \"Souvla\" for a \"Medium Classic Cheeseburger\" and a \"Small Bacon Double Cheeseburger\" with \"Standard Delivery\" as the method with the default charged options.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-10", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
|
||||
{"query_id": "agisdk-fly-unified-5", "dataset": "agisdk-real", "query": "Find me the cheapest fare for a flight from Orlando to Milwaukee on December 5th, 2024 and book it.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/30, Security Code: 420 Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-5", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "United Airlines"}}}
|
||||
{"query_id": "agisdk-udriver-10", "dataset": "agisdk-real", "query": "Order me a ride for 4pm, I'll be at the de Young muesum headed to the Waterbar, fanciest option possible please.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-10", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
|
||||
{"query_id": "agisdk-udriver-9", "dataset": "agisdk-real", "query": "Book me a ride from the thai restaurant I last took a ride to for later today at 2pm, I'll be at 333 Apartments on Fremont", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-9", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-9", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Uber"}}}
|
||||
{"query_id": "agisdk-topwork-4", "dataset": "agisdk-real", "query": "Create a job post for a UI/UX Designer with expertise in Figma, Sketch, and Adobe Creative Suite, including project details, timeline, and required skills (Wireframing, Prototyping, Responsive Design).", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-4", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
|
||||
{"query_id": "agisdk-gocalendar-4", "dataset": "agisdk-real", "query": "Change the \"Team Check-In\" event on July 18, 2024, name to \"Project Kickoff\" and update the location to \"Zoom\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-4", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
|
||||
{"query_id": "agisdk-staynb-6", "dataset": "agisdk-real", "query": "Find and book the stay with the best value for money (cheapest stay with the best reviews) for 1 day. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-6", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-6", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
|
||||
{"query_id": "agisdk-udriver-11", "dataset": "agisdk-real", "query": "I need to go from Pacific Catch on Chestnut back home to 333 Fremont now. If the fancy version is within ten dollars of the regular one, book that.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-11", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-11", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
|
||||
{"query_id": "agisdk-networkin-5", "dataset": "agisdk-real", "query": "Send a connection request to John Smith.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-5", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-5", "challenge_type": "action", "difficulty": "easy", "similar_to": "LinkedIn"}}}
|
||||
{"query_id": "agisdk-zilloft-6", "dataset": "agisdk-real", "query": "Select a property listed in San Francisco as \"Condos\" within a price range under $300,000 and request a tour for tomorrow at 4:00 PM. Use these contact details: Name: Sarah Brown, Email: sarahbrown@example.com, Phone: 555-987-6543.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-6", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "Zillow"}}}
|
||||
@@ -1,349 +1,43 @@
|
||||
#!/usr/bin/env bun
|
||||
|
||||
/**
|
||||
* Upload eval runs to R2.
|
||||
*
|
||||
* Two modes:
|
||||
* bun scripts/upload-run.ts results/browseros-agent-weekly/2026-03-21-1730
|
||||
* → uploads that specific run
|
||||
*
|
||||
* bun scripts/upload-run.ts results/browseros-agent-weekly
|
||||
* → finds all timestamped subfolders, uploads any not yet in R2
|
||||
*
|
||||
* Env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY
|
||||
* EVAL_R2_BUCKET (default: browseros-eval)
|
||||
* EVAL_R2_CDN_BASE_URL (default: https://eval.browseros.com)
|
||||
*/
|
||||
|
||||
import { readdir, readFile, stat } from 'node:fs/promises'
|
||||
import { basename, dirname, extname, join } from 'node:path'
|
||||
import {
|
||||
GetObjectCommand,
|
||||
PutObjectCommand,
|
||||
S3Client,
|
||||
} from '@aws-sdk/client-s3'
|
||||
loadR2ConfigFromEnv,
|
||||
R2Publisher,
|
||||
} from '../src/publishing/r2-publisher'
|
||||
|
||||
const CONCURRENCY = 20
|
||||
|
||||
const CONTENT_TYPES: Record<string, string> = {
|
||||
'.json': 'application/json',
|
||||
'.jsonl': 'application/x-ndjson',
|
||||
'.png': 'image/png',
|
||||
}
|
||||
|
||||
interface R2Config {
|
||||
accountId: string
|
||||
accessKeyId: string
|
||||
secretAccessKey: string
|
||||
bucket: string
|
||||
cdnBaseUrl: string
|
||||
}
|
||||
|
||||
function loadConfig(): R2Config {
|
||||
const accountId = process.env.EVAL_R2_ACCOUNT_ID
|
||||
const accessKeyId = process.env.EVAL_R2_ACCESS_KEY_ID
|
||||
const secretAccessKey = process.env.EVAL_R2_SECRET_ACCESS_KEY
|
||||
|
||||
if (!accountId || !accessKeyId || !secretAccessKey) {
|
||||
console.error(
|
||||
'Missing required env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY',
|
||||
)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
return {
|
||||
accountId,
|
||||
accessKeyId,
|
||||
secretAccessKey,
|
||||
bucket: process.env.EVAL_R2_BUCKET || 'browseros-eval',
|
||||
cdnBaseUrl: (
|
||||
process.env.EVAL_R2_CDN_BASE_URL || 'https://eval.browseros.com'
|
||||
).replace(/\/+$/, ''),
|
||||
}
|
||||
}
|
||||
|
||||
function createClient(config: R2Config): S3Client {
|
||||
return new S3Client({
|
||||
region: 'auto',
|
||||
endpoint: `https://${config.accountId}.r2.cloudflarestorage.com`,
|
||||
credentials: {
|
||||
accessKeyId: config.accessKeyId,
|
||||
secretAccessKey: config.secretAccessKey,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
async function upload(
|
||||
client: S3Client,
|
||||
bucket: string,
|
||||
key: string,
|
||||
body: Buffer,
|
||||
contentType: string,
|
||||
) {
|
||||
await client.send(
|
||||
new PutObjectCommand({
|
||||
Bucket: bucket,
|
||||
Key: key,
|
||||
Body: body,
|
||||
ContentType: contentType,
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
async function collectFiles(dir: string): Promise<string[]> {
|
||||
const files: string[] = []
|
||||
const entries = await readdir(dir, { withFileTypes: true })
|
||||
for (const entry of entries) {
|
||||
const full = join(dir, entry.name)
|
||||
if (entry.isDirectory()) {
|
||||
files.push(...(await collectFiles(full)))
|
||||
} else {
|
||||
files.push(full)
|
||||
}
|
||||
}
|
||||
return files
|
||||
}
|
||||
|
||||
async function runPool<T>(
|
||||
items: T[],
|
||||
concurrency: number,
|
||||
fn: (item: T) => Promise<void>,
|
||||
) {
|
||||
let i = 0
|
||||
const workers = Array.from({ length: concurrency }, async () => {
|
||||
while (i < items.length) {
|
||||
const idx = i++
|
||||
await fn(items[idx])
|
||||
}
|
||||
})
|
||||
await Promise.all(workers)
|
||||
}
|
||||
|
||||
// Check if a run has already been uploaded to R2
|
||||
async function isUploaded(
|
||||
client: S3Client,
|
||||
bucket: string,
|
||||
runId: string,
|
||||
): Promise<boolean> {
|
||||
try {
|
||||
await client.send(
|
||||
new GetObjectCommand({
|
||||
Bucket: bucket,
|
||||
Key: `runs/${runId}/manifest.json`,
|
||||
}),
|
||||
)
|
||||
return true
|
||||
} catch {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Detect if a directory is a run dir (has task subdirs with metadata.json)
|
||||
// vs a config dir (has timestamped subdirs like 2026-03-21-1730/)
|
||||
async function isRunDir(dir: string): Promise<boolean> {
|
||||
const entries = await readdir(dir, { withFileTypes: true })
|
||||
const subdirs = entries.filter((e) => e.isDirectory())
|
||||
for (const subdir of subdirs) {
|
||||
const metaPath = join(dir, subdir.name, 'metadata.json')
|
||||
const metaStat = await stat(metaPath).catch(() => null)
|
||||
if (metaStat?.isFile()) return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
async function uploadSingleRun(
|
||||
runDir: string,
|
||||
runId: string,
|
||||
r2Config: R2Config,
|
||||
client: S3Client,
|
||||
): Promise<void> {
|
||||
const taskDirs = await readdir(runDir, { withFileTypes: true })
|
||||
const taskEntries = taskDirs.filter((d) => d.isDirectory())
|
||||
|
||||
if (taskEntries.length === 0) {
|
||||
console.warn(` No task subdirectories in ${runId}, skipping`)
|
||||
return
|
||||
}
|
||||
|
||||
const manifestTasks: Record<string, unknown>[] = []
|
||||
const jobs: { key: string; filePath: string; contentType: string }[] = []
|
||||
|
||||
// Extract agent config from first task
|
||||
let agentConfig: Record<string, unknown> | undefined
|
||||
let dataset: string | undefined
|
||||
|
||||
for (const taskDir of taskEntries) {
|
||||
const taskId = taskDir.name
|
||||
const taskPath = join(runDir, taskId)
|
||||
const metaPath = join(taskPath, 'metadata.json')
|
||||
|
||||
let meta: Record<string, unknown> = {}
|
||||
try {
|
||||
meta = JSON.parse(await readFile(metaPath, 'utf-8'))
|
||||
} catch {
|
||||
continue
|
||||
}
|
||||
|
||||
if (!agentConfig && meta.agent_config)
|
||||
agentConfig = meta.agent_config as Record<string, unknown>
|
||||
if (!dataset && meta.dataset) dataset = meta.dataset as string
|
||||
|
||||
const files = await collectFiles(taskPath)
|
||||
let screenshotCount = 0
|
||||
|
||||
for (const file of files) {
|
||||
const relative = file.slice(taskPath.length + 1)
|
||||
const ext = extname(file)
|
||||
if (relative.startsWith('screenshots/') && ext === '.png')
|
||||
screenshotCount++
|
||||
|
||||
jobs.push({
|
||||
key: `runs/${runId}/${taskId}/${relative}`,
|
||||
filePath: file,
|
||||
contentType: CONTENT_TYPES[ext] || 'application/octet-stream',
|
||||
})
|
||||
}
|
||||
|
||||
manifestTasks.push({
|
||||
queryId: meta.query_id || taskId,
|
||||
query: meta.query || '',
|
||||
startUrl: meta.start_url || '',
|
||||
status:
|
||||
meta.termination_reason === 'completed'
|
||||
? 'completed'
|
||||
: meta.termination_reason || 'unknown',
|
||||
durationMs: meta.total_duration_ms || 0,
|
||||
screenshotCount: (meta.screenshot_count as number) || screenshotCount,
|
||||
graderResults: meta.grader_results || {},
|
||||
})
|
||||
}
|
||||
|
||||
if (manifestTasks.length === 0) {
|
||||
console.warn(` No completed tasks in ${runId}, skipping`)
|
||||
return
|
||||
}
|
||||
|
||||
console.log(
|
||||
` Uploading ${jobs.length} files across ${manifestTasks.length} tasks...`,
|
||||
)
|
||||
|
||||
let uploaded = 0
|
||||
await runPool(jobs, CONCURRENCY, async (job) => {
|
||||
const body = await readFile(job.filePath)
|
||||
await upload(client, r2Config.bucket, job.key, body, job.contentType)
|
||||
uploaded++
|
||||
if (uploaded % 50 === 0 || uploaded === jobs.length) {
|
||||
console.log(` ${uploaded}/${jobs.length}`)
|
||||
}
|
||||
})
|
||||
|
||||
// Read summary.json if it exists
|
||||
let summaryData: Record<string, unknown> | undefined
|
||||
try {
|
||||
summaryData = JSON.parse(
|
||||
await readFile(join(runDir, 'summary.json'), 'utf-8'),
|
||||
)
|
||||
} catch {}
|
||||
|
||||
// Upload manifest
|
||||
const manifest = {
|
||||
runId,
|
||||
uploadedAt: new Date().toISOString(),
|
||||
agentConfig,
|
||||
dataset,
|
||||
summary: summaryData
|
||||
? {
|
||||
passRate: summaryData.passRate,
|
||||
avgDurationMs: summaryData.avgDurationMs,
|
||||
}
|
||||
: undefined,
|
||||
tasks: manifestTasks,
|
||||
}
|
||||
const manifestBody = Buffer.from(JSON.stringify(manifest, null, 2))
|
||||
await upload(
|
||||
client,
|
||||
r2Config.bucket,
|
||||
`runs/${runId}/manifest.json`,
|
||||
manifestBody,
|
||||
'application/json',
|
||||
)
|
||||
|
||||
// Upload viewer.html to bucket root
|
||||
const viewerPath = join(
|
||||
import.meta.dir,
|
||||
'..',
|
||||
'src',
|
||||
'dashboard',
|
||||
'viewer.html',
|
||||
)
|
||||
const viewerBody = await readFile(viewerPath)
|
||||
await upload(client, r2Config.bucket, 'viewer.html', viewerBody, 'text/html')
|
||||
|
||||
console.log(` Uploaded ${uploaded + 2} files`)
|
||||
console.log(` ${r2Config.cdnBaseUrl}/viewer.html?run=${runId}`)
|
||||
}
|
||||
|
||||
async function main() {
|
||||
async function main(): Promise<void> {
|
||||
const inputDir = process.argv[2]
|
||||
if (!inputDir) {
|
||||
console.error(
|
||||
throw new Error(
|
||||
'Usage:\n' +
|
||||
' bun scripts/upload-run.ts results/config-name/2026-03-21-1730 (specific run)\n' +
|
||||
' bun scripts/upload-run.ts results/config-name (all un-uploaded runs)',
|
||||
)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
const dirStat = await stat(inputDir).catch(() => null)
|
||||
if (!dirStat?.isDirectory()) {
|
||||
console.error(`Not a directory: ${inputDir}`)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
const r2Config = loadConfig()
|
||||
const client = createClient(r2Config)
|
||||
|
||||
if (await isRunDir(inputDir)) {
|
||||
// Single run: results/config-name/2026-03-21-1730
|
||||
const timestamp = basename(inputDir)
|
||||
const configName = basename(dirname(inputDir))
|
||||
const runId = `${configName}-${timestamp}`
|
||||
console.log(`Uploading run: ${runId}`)
|
||||
await uploadSingleRun(inputDir, runId, r2Config, client)
|
||||
} else {
|
||||
// Config dir: results/config-name/ — upload all un-uploaded runs
|
||||
const configName = basename(inputDir)
|
||||
const entries = await readdir(inputDir, { withFileTypes: true })
|
||||
const runDirs = entries
|
||||
.filter((e) => e.isDirectory())
|
||||
.map((e) => e.name)
|
||||
.sort()
|
||||
|
||||
if (runDirs.length === 0) {
|
||||
console.error('No run subdirectories found')
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
console.log(
|
||||
`Found ${runDirs.length} runs for config "${configName}", checking R2...`,
|
||||
)
|
||||
|
||||
let uploadedCount = 0
|
||||
for (const dir of runDirs) {
|
||||
const runId = `${configName}-${dir}`
|
||||
const alreadyUploaded = await isUploaded(client, r2Config.bucket, runId)
|
||||
if (alreadyUploaded) {
|
||||
console.log(` ${runId}: already uploaded, skipping`)
|
||||
continue
|
||||
}
|
||||
|
||||
console.log(` ${runId}: uploading...`)
|
||||
await uploadSingleRun(join(inputDir, dir), runId, r2Config, client)
|
||||
uploadedCount++
|
||||
}
|
||||
|
||||
console.log(
|
||||
`\nDone. Uploaded ${uploadedCount} new run(s), ${runDirs.length - uploadedCount} already in R2.`,
|
||||
' bun scripts/upload-run.ts results/config-name/2026-03-21-1730\n' +
|
||||
' bun scripts/upload-run.ts results/config-name',
|
||||
)
|
||||
}
|
||||
|
||||
const publisher = new R2Publisher({ config: loadR2ConfigFromEnv() })
|
||||
const result = await publisher.publishPath(inputDir)
|
||||
for (const run of result.uploadedRuns) {
|
||||
console.log(`Uploaded ${run.uploadedFiles} files for ${run.runId}`)
|
||||
console.log(run.viewerUrl)
|
||||
}
|
||||
for (const runId of result.skippedRuns) {
|
||||
console.log(`${runId}: already uploaded, skipping`)
|
||||
}
|
||||
console.log(
|
||||
`Done. Uploaded ${result.uploadedRuns.length} run(s), skipped ${result.skippedRuns.length}.`,
|
||||
)
|
||||
}
|
||||
|
||||
main()
|
||||
main().catch((error) => {
|
||||
console.error(error instanceof Error ? error.message : String(error))
|
||||
process.exit(1)
|
||||
})
|
||||
|
||||
191
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/clado-actions.ts
vendored
Normal file
191
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/clado-actions.ts
vendored
Normal file
@@ -0,0 +1,191 @@
|
||||
import type {
|
||||
CladoAction,
|
||||
CladoActionResponse,
|
||||
RawCladoActionPayload,
|
||||
} from './types'
|
||||
|
||||
/** Parses Clado's structured response plus any raw `<answer>` blocks into executable actions. */
|
||||
export function parseCladoActions(
|
||||
prediction: CladoActionResponse,
|
||||
): CladoAction[] {
|
||||
const actionFromField =
|
||||
typeof prediction.action === 'string' ? prediction.action : null
|
||||
|
||||
const rawActions = parseCladoActionsFromRawResponse(prediction.raw_response)
|
||||
const primaryFromRaw = rawActions[0] ?? null
|
||||
const mergedPrimary = {
|
||||
...primaryFromRaw,
|
||||
...prediction,
|
||||
action: actionFromField ?? primaryFromRaw?.action,
|
||||
}
|
||||
|
||||
const normalized: CladoAction[] = []
|
||||
const primary = normalizeCladoActionPayload(mergedPrimary)
|
||||
if (primary) normalized.push(primary)
|
||||
|
||||
for (const candidate of rawActions.slice(1)) {
|
||||
const parsed = normalizeCladoActionPayload(candidate)
|
||||
if (!parsed) continue
|
||||
const prev = normalized[normalized.length - 1]
|
||||
if (
|
||||
!prev ||
|
||||
getCladoActionSignature(prev) !== getCladoActionSignature(parsed)
|
||||
) {
|
||||
normalized.push(parsed)
|
||||
}
|
||||
}
|
||||
|
||||
return normalized
|
||||
}
|
||||
|
||||
export function normalizeCladoActionPayload(
|
||||
payload: RawCladoActionPayload,
|
||||
): CladoAction | null {
|
||||
if (!payload.action || typeof payload.action !== 'string') {
|
||||
return null
|
||||
}
|
||||
return {
|
||||
action: payload.action,
|
||||
x: typeof payload.x === 'number' ? payload.x : undefined,
|
||||
y: typeof payload.y === 'number' ? payload.y : undefined,
|
||||
text: typeof payload.text === 'string' ? payload.text : undefined,
|
||||
key: typeof payload.key === 'string' ? payload.key : undefined,
|
||||
direction:
|
||||
typeof payload.direction === 'string' ? payload.direction : undefined,
|
||||
startX: typeof payload.startX === 'number' ? payload.startX : undefined,
|
||||
startY: typeof payload.startY === 'number' ? payload.startY : undefined,
|
||||
endX: typeof payload.endX === 'number' ? payload.endX : undefined,
|
||||
endY: typeof payload.endY === 'number' ? payload.endY : undefined,
|
||||
amount: typeof payload.amount === 'number' ? payload.amount : undefined,
|
||||
time: typeof payload.time === 'number' ? payload.time : undefined,
|
||||
final_answer:
|
||||
typeof payload.final_answer === 'string'
|
||||
? payload.final_answer
|
||||
: undefined,
|
||||
}
|
||||
}
|
||||
|
||||
export function parseCladoActionsFromRawResponse(
|
||||
rawResponse: string | undefined,
|
||||
): RawCladoActionPayload[] {
|
||||
if (!rawResponse) return []
|
||||
const matches = [
|
||||
...rawResponse.matchAll(/<answer>\s*([\s\S]*?)\s*<\/answer>/gi),
|
||||
]
|
||||
const parsed: RawCladoActionPayload[] = []
|
||||
for (const match of matches) {
|
||||
try {
|
||||
parsed.push(JSON.parse(match[1]) as RawCladoActionPayload)
|
||||
} catch {
|
||||
// Ignore malformed answer blocks so one bad block does not drop the whole prediction.
|
||||
}
|
||||
}
|
||||
return parsed
|
||||
}
|
||||
|
||||
export function extractCladoThinking(
|
||||
rawResponse: string | undefined,
|
||||
): string | undefined {
|
||||
if (!rawResponse) return undefined
|
||||
const matches = [
|
||||
...rawResponse.matchAll(/<thinking>\s*([\s\S]*?)\s*<\/thinking>/gi),
|
||||
]
|
||||
if (matches.length === 0) return undefined
|
||||
|
||||
const merged = matches
|
||||
.map((match) => match[1]?.replace(/\s+/g, ' ').trim() ?? '')
|
||||
.filter((value) => value.length > 0)
|
||||
.join(' ')
|
||||
|
||||
if (!merged) return undefined
|
||||
return merged
|
||||
}
|
||||
|
||||
export function summarizeCladoPrediction(
|
||||
prediction: CladoActionResponse,
|
||||
): Record<string, unknown> {
|
||||
const preview =
|
||||
typeof prediction.raw_response === 'string' &&
|
||||
prediction.raw_response.length > 0
|
||||
? prediction.raw_response.slice(0, 240)
|
||||
: undefined
|
||||
|
||||
return {
|
||||
action: prediction.action,
|
||||
x: prediction.x,
|
||||
y: prediction.y,
|
||||
text: prediction.text,
|
||||
key: prediction.key,
|
||||
direction: prediction.direction,
|
||||
startX: prediction.startX,
|
||||
startY: prediction.startY,
|
||||
endX: prediction.endX,
|
||||
endY: prediction.endY,
|
||||
amount: prediction.amount,
|
||||
time: prediction.time,
|
||||
inference_time_seconds: prediction.inference_time_seconds,
|
||||
raw_response_preview: preview,
|
||||
}
|
||||
}
|
||||
|
||||
export function getCladoActionSignature(action: CladoAction): string {
|
||||
switch (action.action) {
|
||||
case 'click':
|
||||
case 'double_click':
|
||||
case 'right_click':
|
||||
case 'hover':
|
||||
return `${action.action}:${action.x ?? 'x'}:${action.y ?? 'y'}`
|
||||
case 'type':
|
||||
return `${action.action}:${(action.text ?? '').slice(0, 16)}`
|
||||
case 'press_key':
|
||||
return `${action.action}:${action.key ?? 'key'}`
|
||||
case 'scroll':
|
||||
return `${action.action}:${action.direction ?? 'down'}:${action.amount ?? 500}`
|
||||
case 'drag':
|
||||
return `${action.action}:${action.startX}:${action.startY}:${action.endX}:${action.endY}`
|
||||
case 'wait':
|
||||
return `${action.action}:${action.time ?? 1}`
|
||||
case 'end':
|
||||
return action.final_answer
|
||||
? `end(${action.final_answer.slice(0, 32)})`
|
||||
: 'end()'
|
||||
case 'invalid':
|
||||
return `invalid(${(action.text ?? '').slice(0, 40)})`
|
||||
default:
|
||||
return action.action
|
||||
}
|
||||
}
|
||||
|
||||
export function formatCladoHistory(actions: CladoAction[]): string {
|
||||
if (actions.length === 0) return 'None'
|
||||
|
||||
const parts = actions.map((action) => {
|
||||
switch (action.action) {
|
||||
case 'click':
|
||||
case 'double_click':
|
||||
case 'right_click':
|
||||
case 'hover':
|
||||
return `${action.action}(${Math.round(action.x ?? 500)}, ${Math.round(action.y ?? 500)})`
|
||||
case 'type': {
|
||||
const text = (action.text ?? '').replace(/'/g, "\\'")
|
||||
return `type('${text}')`
|
||||
}
|
||||
case 'press_key':
|
||||
return `press_key('${action.key ?? 'Enter'}')`
|
||||
case 'scroll':
|
||||
return `scroll(${action.direction ?? 'down'})`
|
||||
case 'drag':
|
||||
return `drag(${Math.round(action.startX ?? 500)},${Math.round(action.startY ?? 500)} -> ${Math.round(action.endX ?? 500)},${Math.round(action.endY ?? 500)})`
|
||||
case 'wait':
|
||||
return `wait(${Math.round(action.time ?? 1)}s)`
|
||||
case 'end':
|
||||
return 'end()'
|
||||
case 'invalid':
|
||||
return 'invalid()'
|
||||
default:
|
||||
return action.action
|
||||
}
|
||||
})
|
||||
|
||||
return parts.join(' -> ')
|
||||
}
|
||||
123
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/clado-browser-driver.ts
vendored
Normal file
123
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/clado-browser-driver.ts
vendored
Normal file
@@ -0,0 +1,123 @@
|
||||
import {
|
||||
CLADO_PAGE_SCOPED_TOOLS,
|
||||
type CladoActionPoint,
|
||||
type CladoViewport,
|
||||
} from './types'
|
||||
|
||||
export function clampCladoNormalizedCoordinate(value: number): number {
|
||||
return Math.min(999, Math.max(0, Math.round(value)))
|
||||
}
|
||||
|
||||
/** Converts Clado's 0-1000 normalized coordinate space into BrowserOS viewport pixels. */
|
||||
export function resolveCladoPoint(
|
||||
viewport: CladoViewport,
|
||||
normalizedX: number | undefined,
|
||||
normalizedY: number | undefined,
|
||||
): CladoActionPoint {
|
||||
const nx = clampCladoNormalizedCoordinate(normalizedX ?? 500)
|
||||
const ny = clampCladoNormalizedCoordinate(normalizedY ?? 500)
|
||||
|
||||
return {
|
||||
x: Math.round((nx / 1000) * viewport.width),
|
||||
y: Math.round((ny / 1000) * viewport.height),
|
||||
}
|
||||
}
|
||||
|
||||
/** Adapts Clado action tool arguments to the BrowserOS MCP tool argument contract. */
|
||||
export function prepareCladoToolArgs(
|
||||
toolName: string,
|
||||
args: Record<string, unknown>,
|
||||
pageId: number,
|
||||
): Record<string, unknown> {
|
||||
const prepared: Record<string, unknown> = { ...args }
|
||||
|
||||
if (
|
||||
toolName === 'evaluate_script' &&
|
||||
typeof prepared.function === 'string' &&
|
||||
prepared.expression === undefined
|
||||
) {
|
||||
prepared.expression = toCladoEvaluateExpression(prepared.function)
|
||||
delete prepared.function
|
||||
}
|
||||
|
||||
if (
|
||||
toolName === 'click_at' &&
|
||||
typeof prepared.dblClick === 'boolean' &&
|
||||
prepared.clickCount === undefined
|
||||
) {
|
||||
prepared.clickCount = prepared.dblClick ? 2 : 1
|
||||
delete prepared.dblClick
|
||||
}
|
||||
|
||||
if (
|
||||
CLADO_PAGE_SCOPED_TOOLS.has(toolName) &&
|
||||
typeof prepared.page !== 'number'
|
||||
) {
|
||||
prepared.page = pageId
|
||||
}
|
||||
|
||||
return prepared
|
||||
}
|
||||
|
||||
export function toCladoEvaluateExpression(rawFunction: unknown): string {
|
||||
const source = String(rawFunction).trim()
|
||||
if (source.startsWith('() =>') || source.startsWith('async () =>')) {
|
||||
return `(${source})()`
|
||||
}
|
||||
if (source.startsWith('function')) {
|
||||
return `(${source})()`
|
||||
}
|
||||
return source
|
||||
}
|
||||
|
||||
export function normalizeCladoPressKey(key: string | undefined): string {
|
||||
const raw = (key ?? '').trim()
|
||||
if (!raw) throw new Error('press_key action missing key field')
|
||||
|
||||
const map: Record<string, string> = {
|
||||
'C-a': 'Control+A',
|
||||
'C-c': 'Control+C',
|
||||
'C-v': 'Control+V',
|
||||
'C-x': 'Control+X',
|
||||
'C-z': 'Control+Z',
|
||||
'C-y': 'Control+Y',
|
||||
'C-s': 'Control+S',
|
||||
'C-t': 'Control+T',
|
||||
'C-w': 'Control+W',
|
||||
'C-h': 'Control+H',
|
||||
'C-f': 'Control+F',
|
||||
'C-+': 'Control++',
|
||||
'C--': 'Control+-',
|
||||
'C-tab': 'Control+Tab',
|
||||
'C-S-tab': 'Control+Shift+Tab',
|
||||
'C-S-n': 'Control+Shift+N',
|
||||
'C-down': 'Control+ArrowDown',
|
||||
'M-a': 'Meta+A',
|
||||
'M-c': 'Meta+C',
|
||||
'M-v': 'Meta+V',
|
||||
'M-x': 'Meta+X',
|
||||
'M-f4': 'Alt+F4',
|
||||
}
|
||||
return map[raw] ?? raw
|
||||
}
|
||||
|
||||
export function normalizeCladoDirection(
|
||||
direction: string | undefined,
|
||||
): 'up' | 'down' | 'left' | 'right' {
|
||||
if (
|
||||
direction === 'up' ||
|
||||
direction === 'down' ||
|
||||
direction === 'left' ||
|
||||
direction === 'right'
|
||||
) {
|
||||
return direction
|
||||
}
|
||||
return 'down'
|
||||
}
|
||||
|
||||
export function normalizeCladoScrollAmount(amount: number | undefined): number {
|
||||
if (typeof amount !== 'number') return 500
|
||||
if (amount <= 0) return 100
|
||||
const clamped = Math.min(amount, 1000)
|
||||
return Math.max(100, Math.round((clamped / 1000) * 900))
|
||||
}
|
||||
68
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/clado-client.ts
vendored
Normal file
68
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/clado-client.ts
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
import { CLADO_REQUEST_TIMEOUT_MS } from '../../../../constants'
|
||||
import { formatCladoHistory } from './clado-actions'
|
||||
import type { CladoAction, CladoActionResponse } from './types'
|
||||
|
||||
export interface CladoActionClientOptions {
|
||||
baseUrl?: string
|
||||
apiKey?: string
|
||||
}
|
||||
|
||||
export interface CladoActionPredictionInput {
|
||||
instruction: string
|
||||
imageBase64: string
|
||||
actionHistory: CladoAction[]
|
||||
signal?: AbortSignal
|
||||
}
|
||||
|
||||
/** Calls the Clado action model without exposing credentials in process arguments or artifacts. */
|
||||
export class CladoActionClient {
|
||||
constructor(private readonly options: CladoActionClientOptions) {}
|
||||
|
||||
async requestActionPrediction(
|
||||
input: CladoActionPredictionInput,
|
||||
): Promise<CladoActionResponse> {
|
||||
if (!this.options.baseUrl) {
|
||||
throw new Error('executor.baseUrl must be set for clado-action provider')
|
||||
}
|
||||
|
||||
const requestController = new AbortController()
|
||||
const onAbort = () => requestController.abort()
|
||||
input.signal?.addEventListener('abort', onAbort, { once: true })
|
||||
|
||||
const timeoutHandle = setTimeout(() => {
|
||||
requestController.abort()
|
||||
}, CLADO_REQUEST_TIMEOUT_MS)
|
||||
|
||||
try {
|
||||
const headers: Record<string, string> = {
|
||||
'Content-Type': 'application/json',
|
||||
}
|
||||
if (this.options.apiKey) {
|
||||
headers.Authorization = `Bearer ${this.options.apiKey}`
|
||||
}
|
||||
|
||||
const response = await fetch(this.options.baseUrl, {
|
||||
method: 'POST',
|
||||
headers,
|
||||
body: JSON.stringify({
|
||||
instruction: input.instruction,
|
||||
image_base64: input.imageBase64,
|
||||
history: formatCladoHistory(input.actionHistory),
|
||||
}),
|
||||
signal: requestController.signal,
|
||||
})
|
||||
|
||||
if (!response.ok) {
|
||||
const body = await response.text()
|
||||
throw new Error(
|
||||
`HTTP ${response.status} ${response.statusText}: ${body.slice(0, 400)}`,
|
||||
)
|
||||
}
|
||||
|
||||
return (await response.json()) as CladoActionResponse
|
||||
} finally {
|
||||
clearTimeout(timeoutHandle)
|
||||
input.signal?.removeEventListener('abort', onAbort)
|
||||
}
|
||||
}
|
||||
}
|
||||
78
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/types.ts
vendored
Normal file
78
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/types.ts
vendored
Normal file
@@ -0,0 +1,78 @@
|
||||
export const CLADO_ACTION_PROVIDER = 'clado-action'
|
||||
|
||||
export const CLADO_PAGE_SCOPED_TOOLS = new Set<string>([
|
||||
'take_screenshot',
|
||||
'evaluate_script',
|
||||
'click',
|
||||
'click_at',
|
||||
'hover',
|
||||
'hover_at',
|
||||
'clear',
|
||||
'fill',
|
||||
'press_key',
|
||||
'type_at',
|
||||
'drag',
|
||||
'drag_at',
|
||||
'scroll',
|
||||
'handle_dialog',
|
||||
'select_option',
|
||||
'navigate_page',
|
||||
'close_page',
|
||||
'wait_for',
|
||||
])
|
||||
|
||||
export interface CladoActionResponse {
|
||||
action?: string | null
|
||||
x?: number
|
||||
y?: number
|
||||
text?: string
|
||||
key?: string
|
||||
direction?: string
|
||||
startX?: number
|
||||
startY?: number
|
||||
endX?: number
|
||||
endY?: number
|
||||
amount?: number
|
||||
time?: number
|
||||
final_answer?: string | null
|
||||
inference_time_seconds?: number
|
||||
raw_response?: string
|
||||
thinking?: string | null
|
||||
parse_error?: string | null
|
||||
}
|
||||
|
||||
export interface CladoViewport {
|
||||
width: number
|
||||
height: number
|
||||
}
|
||||
|
||||
export interface CladoAction {
|
||||
action: string
|
||||
x?: number
|
||||
y?: number
|
||||
text?: string
|
||||
key?: string
|
||||
direction?: string
|
||||
startX?: number
|
||||
startY?: number
|
||||
endX?: number
|
||||
endY?: number
|
||||
amount?: number
|
||||
time?: number
|
||||
final_answer?: string
|
||||
}
|
||||
|
||||
export type RawCladoActionPayload = Partial<
|
||||
Omit<CladoAction, 'final_answer'>
|
||||
> & {
|
||||
final_answer?: string | null
|
||||
}
|
||||
|
||||
export interface CladoActionPoint {
|
||||
x: number
|
||||
y: number
|
||||
}
|
||||
|
||||
export function isCladoActionProvider(provider: string): boolean {
|
||||
return provider === CLADO_ACTION_PROVIDER
|
||||
}
|
||||
45
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/create-executor-backend.ts
vendored
Normal file
45
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/create-executor-backend.ts
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
|
||||
import type { Browser } from '@browseros/server/browser'
|
||||
import type { ExecutorCallbacks } from '../../orchestrator-executor/executor'
|
||||
import type { ExecutorBackend, ExecutorBackendKind } from '../executor-backend'
|
||||
import { ExecutorAdapterBackend } from './tool-loop-backend'
|
||||
|
||||
export interface CreateExecutorBackendOptions {
|
||||
backendKind?: ExecutorBackendKind
|
||||
provider?: string
|
||||
configTemplate?: ResolvedAgentConfig
|
||||
browser?: Browser | null
|
||||
serverUrl?: string
|
||||
windowId?: number
|
||||
tabId?: number
|
||||
initialPageId?: number
|
||||
callbacks?: ExecutorCallbacks
|
||||
executor?: ExecutorBackend
|
||||
}
|
||||
|
||||
export function backendKindForProvider(provider: string): ExecutorBackendKind {
|
||||
return provider === 'clado-action' ? 'clado' : 'tool-loop'
|
||||
}
|
||||
|
||||
/** Creates the backend used for one orchestrator delegation. */
|
||||
export function createExecutorBackend(
|
||||
options: CreateExecutorBackendOptions,
|
||||
): ExecutorBackend {
|
||||
const kind =
|
||||
options.backendKind ??
|
||||
backendKindForProvider(
|
||||
options.provider ?? options.configTemplate?.provider ?? '',
|
||||
)
|
||||
|
||||
return new ExecutorAdapterBackend({
|
||||
kind,
|
||||
configTemplate: options.configTemplate,
|
||||
browser: options.browser,
|
||||
serverUrl: options.serverUrl,
|
||||
windowId: options.windowId,
|
||||
tabId: options.tabId,
|
||||
initialPageId: options.initialPageId,
|
||||
callbacks: options.callbacks,
|
||||
executor: options.executor,
|
||||
})
|
||||
}
|
||||
72
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/tool-loop-backend.ts
vendored
Normal file
72
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/tool-loop-backend.ts
vendored
Normal file
@@ -0,0 +1,72 @@
|
||||
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
|
||||
import type { Browser } from '@browseros/server/browser'
|
||||
import {
|
||||
Executor,
|
||||
type ExecutorCallbacks,
|
||||
} from '../../orchestrator-executor/executor'
|
||||
import type {
|
||||
DelegationResult,
|
||||
ExecutorBackend,
|
||||
ExecutorBackendKind,
|
||||
} from '../executor-backend'
|
||||
|
||||
interface ExecutorRunner {
|
||||
execute(instruction: string, signal?: AbortSignal): Promise<DelegationResult>
|
||||
close(): Promise<void>
|
||||
getTotalSteps(): number
|
||||
}
|
||||
|
||||
export interface ExecutorAdapterBackendOptions {
|
||||
kind: ExecutorBackendKind
|
||||
configTemplate?: ResolvedAgentConfig
|
||||
browser?: Browser | null
|
||||
serverUrl?: string
|
||||
windowId?: number
|
||||
tabId?: number
|
||||
initialPageId?: number
|
||||
callbacks?: ExecutorCallbacks
|
||||
executor?: ExecutorRunner
|
||||
}
|
||||
|
||||
export class ExecutorAdapterBackend implements ExecutorBackend {
|
||||
readonly kind: ExecutorBackendKind
|
||||
private readonly executor: ExecutorRunner
|
||||
|
||||
constructor(options: ExecutorAdapterBackendOptions) {
|
||||
this.kind = options.kind
|
||||
this.executor =
|
||||
options.executor ??
|
||||
new Executor(
|
||||
required(options.configTemplate, 'configTemplate'),
|
||||
options.browser ?? null,
|
||||
required(options.serverUrl, 'serverUrl'),
|
||||
{
|
||||
isCladoAction: options.kind === 'clado',
|
||||
windowId: options.windowId,
|
||||
tabId: options.tabId,
|
||||
initialPageId: options.initialPageId,
|
||||
callbacks: options.callbacks,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
execute(
|
||||
instruction: string,
|
||||
signal?: AbortSignal,
|
||||
): Promise<DelegationResult> {
|
||||
return this.executor.execute(instruction, signal)
|
||||
}
|
||||
|
||||
close(): Promise<void> {
|
||||
return this.executor.close()
|
||||
}
|
||||
|
||||
getTotalSteps(): number {
|
||||
return this.executor.getTotalSteps()
|
||||
}
|
||||
}
|
||||
|
||||
function required<T>(value: T | undefined, name: string): T {
|
||||
if (value === undefined) throw new Error(`${name} is required`)
|
||||
return value
|
||||
}
|
||||
11
packages/browseros-agent/apps/eval/src/agents/orchestrated/executor-backend.ts
vendored
Normal file
11
packages/browseros-agent/apps/eval/src/agents/orchestrated/executor-backend.ts
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
import type { ExecutorResult } from '../orchestrator-executor/types'
|
||||
|
||||
export type ExecutorBackendKind = 'tool-loop' | 'clado'
|
||||
export type DelegationResult = ExecutorResult
|
||||
|
||||
export interface ExecutorBackend {
|
||||
readonly kind: ExecutorBackendKind
|
||||
execute(instruction: string, signal?: AbortSignal): Promise<DelegationResult>
|
||||
close(): Promise<void>
|
||||
getTotalSteps(): number
|
||||
}
|
||||
@@ -1,106 +1,47 @@
|
||||
import { randomUUID } from 'node:crypto'
|
||||
import {
|
||||
CLADO_REQUEST_TIMEOUT_MS,
|
||||
MAX_ACTIONS_PER_DELEGATION,
|
||||
} from '../../constants'
|
||||
import { MAX_ACTIONS_PER_DELEGATION } from '../../constants'
|
||||
import { McpClient, type McpToolResult } from '../../utils/mcp-client'
|
||||
import { sleep } from '../../utils/sleep'
|
||||
import {
|
||||
extractCladoThinking,
|
||||
formatCladoHistory,
|
||||
getCladoActionSignature,
|
||||
parseCladoActions,
|
||||
summarizeCladoPrediction,
|
||||
} from '../orchestrated/backends/clado/clado-actions'
|
||||
import {
|
||||
normalizeCladoDirection,
|
||||
normalizeCladoPressKey,
|
||||
normalizeCladoScrollAmount,
|
||||
prepareCladoToolArgs,
|
||||
resolveCladoPoint,
|
||||
} from '../orchestrated/backends/clado/clado-browser-driver'
|
||||
import { CladoActionClient } from '../orchestrated/backends/clado/clado-client'
|
||||
import {
|
||||
CLADO_ACTION_PROVIDER,
|
||||
type CladoAction,
|
||||
type CladoActionPoint,
|
||||
type CladoActionResponse,
|
||||
type CladoViewport,
|
||||
isCladoActionProvider,
|
||||
} from '../orchestrated/backends/clado/types'
|
||||
import type { ExecutorCallbacks } from './executor'
|
||||
import type { ExecutorConfig, ExecutorResult } from './types'
|
||||
|
||||
const CLADO_ACTION_PROVIDER = 'clado-action'
|
||||
const PAGE_SCOPED_TOOLS = new Set<string>([
|
||||
'take_screenshot',
|
||||
'evaluate_script',
|
||||
'click',
|
||||
'click_at',
|
||||
'hover',
|
||||
'hover_at',
|
||||
'clear',
|
||||
'fill',
|
||||
'press_key',
|
||||
'type_at',
|
||||
'drag',
|
||||
'drag_at',
|
||||
'scroll',
|
||||
'handle_dialog',
|
||||
'select_option',
|
||||
'navigate_page',
|
||||
'close_page',
|
||||
'wait_for',
|
||||
])
|
||||
|
||||
interface CladoActionResponse {
|
||||
action?: string | null
|
||||
x?: number
|
||||
y?: number
|
||||
text?: string
|
||||
key?: string
|
||||
direction?: string
|
||||
startX?: number
|
||||
startY?: number
|
||||
endX?: number
|
||||
endY?: number
|
||||
amount?: number
|
||||
time?: number
|
||||
final_answer?: string | null
|
||||
inference_time_seconds?: number
|
||||
raw_response?: string
|
||||
thinking?: string | null
|
||||
parse_error?: string | null
|
||||
}
|
||||
|
||||
interface Viewport {
|
||||
width: number
|
||||
height: number
|
||||
}
|
||||
|
||||
interface CladoAction {
|
||||
action: string
|
||||
x?: number
|
||||
y?: number
|
||||
text?: string
|
||||
key?: string
|
||||
direction?: string
|
||||
startX?: number
|
||||
startY?: number
|
||||
endX?: number
|
||||
endY?: number
|
||||
amount?: number
|
||||
time?: number
|
||||
final_answer?: string
|
||||
}
|
||||
|
||||
type RawActionPayload = Partial<Omit<CladoAction, 'final_answer'>> & {
|
||||
final_answer?: string | null
|
||||
}
|
||||
|
||||
const MAX_CONSECUTIVE_PARSE_FAILURES = 3
|
||||
|
||||
interface ActionPoint {
|
||||
x: number
|
||||
y: number
|
||||
}
|
||||
|
||||
function asErrorMessage(error: unknown): string {
|
||||
return error instanceof Error ? error.message : String(error)
|
||||
}
|
||||
|
||||
function clampNormalized(value: number): number {
|
||||
return Math.min(999, Math.max(0, Math.round(value)))
|
||||
}
|
||||
|
||||
function isCladoProvider(provider: string): boolean {
|
||||
return provider === CLADO_ACTION_PROVIDER
|
||||
}
|
||||
|
||||
export class CladoActionExecutor {
|
||||
private readonly mcpClient: McpClient
|
||||
private readonly cladoClient: CladoActionClient
|
||||
private readonly pageId: number
|
||||
private callbacks: ExecutorCallbacks = {}
|
||||
private stepsUsed = 0
|
||||
private viewport: Viewport | null = null
|
||||
private lastPoint: ActionPoint | null = null
|
||||
private viewport: CladoViewport | null = null
|
||||
private lastPoint: CladoActionPoint | null = null
|
||||
private currentUrl = ''
|
||||
|
||||
constructor(
|
||||
@@ -110,12 +51,16 @@ export class CladoActionExecutor {
|
||||
readonly _tabId?: number,
|
||||
initialPageId?: number,
|
||||
) {
|
||||
if (!isCladoProvider(config.provider)) {
|
||||
if (!isCladoActionProvider(config.provider)) {
|
||||
throw new Error(
|
||||
`CladoActionExecutor requires provider="${CLADO_ACTION_PROVIDER}"`,
|
||||
)
|
||||
}
|
||||
this.mcpClient = new McpClient(`${serverUrl}/mcp`)
|
||||
this.cladoClient = new CladoActionClient({
|
||||
baseUrl: config.baseUrl,
|
||||
apiKey: config.apiKey,
|
||||
})
|
||||
this.pageId = initialPageId ?? 1
|
||||
}
|
||||
|
||||
@@ -165,7 +110,7 @@ export class CladoActionExecutor {
|
||||
break
|
||||
}
|
||||
|
||||
const historyForPrediction = this.formatHistory(actionHistory)
|
||||
const historyForPrediction = formatCladoHistory(actionHistory)
|
||||
const actionToolCallId = randomUUID()
|
||||
const predictionInput = {
|
||||
instruction,
|
||||
@@ -187,7 +132,7 @@ export class CladoActionExecutor {
|
||||
signal,
|
||||
)
|
||||
predictionCalls++
|
||||
const thinking = this.extractThinking(prediction.raw_response)
|
||||
const thinking = extractCladoThinking(prediction.raw_response)
|
||||
if (thinking) {
|
||||
const previous = thinkingTrace[thinkingTrace.length - 1]
|
||||
if (previous !== thinking) {
|
||||
@@ -217,7 +162,7 @@ export class CladoActionExecutor {
|
||||
break
|
||||
}
|
||||
|
||||
const predictedActions = this.parseActions(prediction)
|
||||
const predictedActions = parseCladoActions(prediction)
|
||||
if (predictedActions.length === 0) {
|
||||
// Per Clado contract: HTTP 200 with action=null on parse failure.
|
||||
// Count as an invalid step so the model can self-correct on the
|
||||
@@ -243,7 +188,7 @@ export class CladoActionExecutor {
|
||||
toolCallId: actionToolCallId,
|
||||
toolName: 'clado_action_predict',
|
||||
output: {
|
||||
prediction: this.summarizePrediction(prediction),
|
||||
prediction: summarizeCladoPrediction(prediction),
|
||||
parsedActions: [],
|
||||
parseError,
|
||||
consecutiveParseFailures,
|
||||
@@ -285,7 +230,7 @@ export class CladoActionExecutor {
|
||||
toolCallId: actionToolCallId,
|
||||
toolName: 'clado_action_predict',
|
||||
output: {
|
||||
prediction: this.summarizePrediction(prediction),
|
||||
prediction: summarizeCladoPrediction(prediction),
|
||||
parsedActions: predictedActions,
|
||||
executed: executionNotes,
|
||||
},
|
||||
@@ -326,7 +271,7 @@ export class CladoActionExecutor {
|
||||
toolCallId: actionToolCallId,
|
||||
toolName: 'clado_action_predict',
|
||||
output: {
|
||||
prediction: this.summarizePrediction(prediction),
|
||||
prediction: summarizeCladoPrediction(prediction),
|
||||
parsedActions: predictedActions,
|
||||
executed: executionNotes,
|
||||
},
|
||||
@@ -378,125 +323,12 @@ export class CladoActionExecutor {
|
||||
actionHistory: CladoAction[],
|
||||
signal?: AbortSignal,
|
||||
): Promise<CladoActionResponse> {
|
||||
if (!this.config.baseUrl) {
|
||||
throw new Error('executor.baseUrl must be set for clado-action provider')
|
||||
}
|
||||
|
||||
const requestController = new AbortController()
|
||||
const onAbort = () => requestController.abort()
|
||||
signal?.addEventListener('abort', onAbort, { once: true })
|
||||
|
||||
const timeoutHandle = setTimeout(() => {
|
||||
requestController.abort()
|
||||
}, CLADO_REQUEST_TIMEOUT_MS)
|
||||
|
||||
try {
|
||||
const headers: Record<string, string> = {
|
||||
'Content-Type': 'application/json',
|
||||
}
|
||||
if (this.config.apiKey) {
|
||||
headers.Authorization = `Bearer ${this.config.apiKey}`
|
||||
}
|
||||
|
||||
const response = await fetch(this.config.baseUrl, {
|
||||
method: 'POST',
|
||||
headers,
|
||||
body: JSON.stringify({
|
||||
instruction,
|
||||
image_base64: imageBase64,
|
||||
history: this.formatHistory(actionHistory),
|
||||
}),
|
||||
signal: requestController.signal,
|
||||
})
|
||||
|
||||
if (!response.ok) {
|
||||
const body = await response.text()
|
||||
throw new Error(
|
||||
`HTTP ${response.status} ${response.statusText}: ${body.slice(0, 400)}`,
|
||||
)
|
||||
}
|
||||
|
||||
return (await response.json()) as CladoActionResponse
|
||||
} finally {
|
||||
clearTimeout(timeoutHandle)
|
||||
signal?.removeEventListener('abort', onAbort)
|
||||
}
|
||||
}
|
||||
|
||||
private parseActions(prediction: CladoActionResponse): CladoAction[] {
|
||||
const actionFromField =
|
||||
typeof prediction.action === 'string' ? prediction.action : null
|
||||
|
||||
const rawActions = this.parseActionsFromRawResponse(prediction.raw_response)
|
||||
const primaryFromRaw = rawActions[0] ?? null
|
||||
const mergedPrimary = {
|
||||
...primaryFromRaw,
|
||||
...prediction,
|
||||
action: actionFromField ?? primaryFromRaw?.action,
|
||||
}
|
||||
|
||||
const normalized: CladoAction[] = []
|
||||
const primary = this.normalizeActionPayload(mergedPrimary)
|
||||
if (primary) normalized.push(primary)
|
||||
|
||||
for (const candidate of rawActions.slice(1)) {
|
||||
const parsed = this.normalizeActionPayload(candidate)
|
||||
if (!parsed) continue
|
||||
const prev = normalized[normalized.length - 1]
|
||||
if (
|
||||
!prev ||
|
||||
this.getActionSignature(prev) !== this.getActionSignature(parsed)
|
||||
) {
|
||||
normalized.push(parsed)
|
||||
}
|
||||
}
|
||||
|
||||
return normalized
|
||||
}
|
||||
|
||||
private normalizeActionPayload(
|
||||
payload: RawActionPayload,
|
||||
): CladoAction | null {
|
||||
if (!payload.action || typeof payload.action !== 'string') {
|
||||
return null
|
||||
}
|
||||
return {
|
||||
action: payload.action,
|
||||
x: typeof payload.x === 'number' ? payload.x : undefined,
|
||||
y: typeof payload.y === 'number' ? payload.y : undefined,
|
||||
text: typeof payload.text === 'string' ? payload.text : undefined,
|
||||
key: typeof payload.key === 'string' ? payload.key : undefined,
|
||||
direction:
|
||||
typeof payload.direction === 'string' ? payload.direction : undefined,
|
||||
startX: typeof payload.startX === 'number' ? payload.startX : undefined,
|
||||
startY: typeof payload.startY === 'number' ? payload.startY : undefined,
|
||||
endX: typeof payload.endX === 'number' ? payload.endX : undefined,
|
||||
endY: typeof payload.endY === 'number' ? payload.endY : undefined,
|
||||
amount: typeof payload.amount === 'number' ? payload.amount : undefined,
|
||||
time: typeof payload.time === 'number' ? payload.time : undefined,
|
||||
final_answer:
|
||||
typeof payload.final_answer === 'string'
|
||||
? payload.final_answer
|
||||
: undefined,
|
||||
}
|
||||
}
|
||||
|
||||
private parseActionsFromRawResponse(
|
||||
rawResponse: string | undefined,
|
||||
): RawActionPayload[] {
|
||||
if (!rawResponse) return []
|
||||
const matches = [
|
||||
...rawResponse.matchAll(/<answer>\s*([\s\S]*?)\s*<\/answer>/gi),
|
||||
]
|
||||
const parsed: RawActionPayload[] = []
|
||||
for (const match of matches) {
|
||||
try {
|
||||
parsed.push(JSON.parse(match[1]) as RawActionPayload)
|
||||
} catch {
|
||||
// ignore malformed answer blocks
|
||||
}
|
||||
}
|
||||
return parsed
|
||||
return this.cladoClient.requestActionPrediction({
|
||||
instruction,
|
||||
imageBase64,
|
||||
actionHistory,
|
||||
signal,
|
||||
})
|
||||
}
|
||||
|
||||
private async executeAction(
|
||||
@@ -567,14 +399,14 @@ export class CladoActionExecutor {
|
||||
}
|
||||
|
||||
case 'press_key': {
|
||||
const key = this.normalizePressKey(action.key)
|
||||
const key = normalizeCladoPressKey(action.key)
|
||||
await this.runTool('press_key', { key }, signal)
|
||||
return `Pressed key "${key}".`
|
||||
}
|
||||
|
||||
case 'scroll': {
|
||||
const direction = this.normalizeDirection(action.direction)
|
||||
const amountPx = this.normalizeScrollAmount(action.amount)
|
||||
const direction = normalizeCladoDirection(action.direction)
|
||||
const amountPx = normalizeCladoScrollAmount(action.amount)
|
||||
const ticks = Math.max(1, Math.round(amountPx / 120))
|
||||
|
||||
await this.runTool('scroll', { direction, amount: ticks }, signal)
|
||||
@@ -645,7 +477,7 @@ export class CladoActionExecutor {
|
||||
return image.data
|
||||
}
|
||||
|
||||
private async getViewport(signal?: AbortSignal): Promise<Viewport> {
|
||||
private async getViewport(signal?: AbortSignal): Promise<CladoViewport> {
|
||||
if (this.viewport) return this.viewport
|
||||
|
||||
try {
|
||||
@@ -676,15 +508,9 @@ export class CladoActionExecutor {
|
||||
normalizedX: number | undefined,
|
||||
normalizedY: number | undefined,
|
||||
signal?: AbortSignal,
|
||||
): Promise<ActionPoint> {
|
||||
): Promise<CladoActionPoint> {
|
||||
const viewport = await this.getViewport(signal)
|
||||
const nx = clampNormalized(normalizedX ?? 500)
|
||||
const ny = clampNormalized(normalizedY ?? 500)
|
||||
|
||||
return {
|
||||
x: Math.round((nx / 1000) * viewport.width),
|
||||
y: Math.round((ny / 1000) * viewport.height),
|
||||
}
|
||||
return resolveCladoPoint(viewport, normalizedX, normalizedY)
|
||||
}
|
||||
|
||||
private async getCurrentUrl(signal?: AbortSignal): Promise<string> {
|
||||
@@ -711,7 +537,7 @@ export class CladoActionExecutor {
|
||||
throw new Error('aborted')
|
||||
}
|
||||
|
||||
const toolArgs = this.prepareToolArgs(toolName, args)
|
||||
const toolArgs = prepareCladoToolArgs(toolName, args, this.pageId)
|
||||
|
||||
try {
|
||||
const raw = await this.mcpClient.callTool(toolName, toolArgs)
|
||||
@@ -730,207 +556,6 @@ export class CladoActionExecutor {
|
||||
}
|
||||
}
|
||||
|
||||
private prepareToolArgs(
|
||||
toolName: string,
|
||||
args: Record<string, unknown>,
|
||||
): Record<string, unknown> {
|
||||
const prepared: Record<string, unknown> = { ...args }
|
||||
|
||||
if (
|
||||
toolName === 'evaluate_script' &&
|
||||
typeof prepared.function === 'string' &&
|
||||
prepared.expression === undefined
|
||||
) {
|
||||
prepared.expression = this.toEvaluateExpression(prepared.function)
|
||||
delete prepared.function
|
||||
}
|
||||
|
||||
if (
|
||||
toolName === 'click_at' &&
|
||||
typeof prepared.dblClick === 'boolean' &&
|
||||
prepared.clickCount === undefined
|
||||
) {
|
||||
prepared.clickCount = prepared.dblClick ? 2 : 1
|
||||
delete prepared.dblClick
|
||||
}
|
||||
|
||||
// Use fixed page ID for all page-scoped tools (single-page operation)
|
||||
if (PAGE_SCOPED_TOOLS.has(toolName) && typeof prepared.page !== 'number') {
|
||||
prepared.page = this.pageId
|
||||
}
|
||||
|
||||
return prepared
|
||||
}
|
||||
|
||||
private toEvaluateExpression(rawFunction: unknown): string {
|
||||
const source = String(rawFunction).trim()
|
||||
if (source.startsWith('() =>') || source.startsWith('async () =>')) {
|
||||
return `(${source})()`
|
||||
}
|
||||
if (source.startsWith('function')) {
|
||||
return `(${source})()`
|
||||
}
|
||||
return source
|
||||
}
|
||||
|
||||
private normalizePressKey(key: string | undefined): string {
|
||||
const raw = (key ?? '').trim()
|
||||
if (!raw) throw new Error('press_key action missing key field')
|
||||
|
||||
const map: Record<string, string> = {
|
||||
'C-a': 'Control+A',
|
||||
'C-c': 'Control+C',
|
||||
'C-v': 'Control+V',
|
||||
'C-x': 'Control+X',
|
||||
'C-z': 'Control+Z',
|
||||
'C-y': 'Control+Y',
|
||||
'C-s': 'Control+S',
|
||||
'C-t': 'Control+T',
|
||||
'C-w': 'Control+W',
|
||||
'C-h': 'Control+H',
|
||||
'C-f': 'Control+F',
|
||||
'C-+': 'Control++',
|
||||
'C--': 'Control+-',
|
||||
'C-tab': 'Control+Tab',
|
||||
'C-S-tab': 'Control+Shift+Tab',
|
||||
'C-S-n': 'Control+Shift+N',
|
||||
'C-down': 'Control+ArrowDown',
|
||||
// macOS Cmd shortcuts (Meta in CDP).
|
||||
'M-a': 'Meta+A',
|
||||
'M-c': 'Meta+C',
|
||||
'M-v': 'Meta+V',
|
||||
'M-x': 'Meta+X',
|
||||
'M-f4': 'Alt+F4',
|
||||
}
|
||||
return map[raw] ?? raw
|
||||
}
|
||||
|
||||
private normalizeDirection(
|
||||
direction: string | undefined,
|
||||
): 'up' | 'down' | 'left' | 'right' {
|
||||
if (
|
||||
direction === 'up' ||
|
||||
direction === 'down' ||
|
||||
direction === 'left' ||
|
||||
direction === 'right'
|
||||
) {
|
||||
return direction
|
||||
}
|
||||
return 'down'
|
||||
}
|
||||
|
||||
private normalizeScrollAmount(amount: number | undefined): number {
|
||||
if (typeof amount !== 'number') return 500
|
||||
if (amount <= 0) return 100
|
||||
const clamped = Math.min(amount, 1000)
|
||||
return Math.max(100, Math.round((clamped / 1000) * 900))
|
||||
}
|
||||
|
||||
private summarizePrediction(
|
||||
prediction: CladoActionResponse,
|
||||
): Record<string, unknown> {
|
||||
const preview =
|
||||
typeof prediction.raw_response === 'string' &&
|
||||
prediction.raw_response.length > 0
|
||||
? prediction.raw_response.slice(0, 240)
|
||||
: undefined
|
||||
|
||||
return {
|
||||
action: prediction.action,
|
||||
x: prediction.x,
|
||||
y: prediction.y,
|
||||
text: prediction.text,
|
||||
key: prediction.key,
|
||||
direction: prediction.direction,
|
||||
startX: prediction.startX,
|
||||
startY: prediction.startY,
|
||||
endX: prediction.endX,
|
||||
endY: prediction.endY,
|
||||
amount: prediction.amount,
|
||||
time: prediction.time,
|
||||
inference_time_seconds: prediction.inference_time_seconds,
|
||||
raw_response_preview: preview,
|
||||
}
|
||||
}
|
||||
|
||||
private extractThinking(rawResponse: string | undefined): string | undefined {
|
||||
if (!rawResponse) return undefined
|
||||
const matches = [
|
||||
...rawResponse.matchAll(/<thinking>\s*([\s\S]*?)\s*<\/thinking>/gi),
|
||||
]
|
||||
if (matches.length === 0) return undefined
|
||||
|
||||
const merged = matches
|
||||
.map((match) => match[1]?.replace(/\s+/g, ' ').trim() ?? '')
|
||||
.filter((value) => value.length > 0)
|
||||
.join(' ')
|
||||
|
||||
if (!merged) return undefined
|
||||
return merged
|
||||
}
|
||||
|
||||
private getActionSignature(action: CladoAction): string {
|
||||
switch (action.action) {
|
||||
case 'click':
|
||||
case 'double_click':
|
||||
case 'right_click':
|
||||
case 'hover':
|
||||
return `${action.action}:${action.x ?? 'x'}:${action.y ?? 'y'}`
|
||||
case 'type':
|
||||
return `${action.action}:${(action.text ?? '').slice(0, 16)}`
|
||||
case 'press_key':
|
||||
return `${action.action}:${action.key ?? 'key'}`
|
||||
case 'scroll':
|
||||
return `${action.action}:${action.direction ?? 'down'}:${action.amount ?? 500}`
|
||||
case 'drag':
|
||||
return `${action.action}:${action.startX}:${action.startY}:${action.endX}:${action.endY}`
|
||||
case 'wait':
|
||||
return `${action.action}:${action.time ?? 1}`
|
||||
case 'end':
|
||||
return action.final_answer
|
||||
? `end(${action.final_answer.slice(0, 32)})`
|
||||
: 'end()'
|
||||
case 'invalid':
|
||||
return `invalid(${(action.text ?? '').slice(0, 40)})`
|
||||
default:
|
||||
return action.action
|
||||
}
|
||||
}
|
||||
|
||||
private formatHistory(actions: CladoAction[]): string {
|
||||
if (actions.length === 0) return 'None'
|
||||
|
||||
const parts = actions.map((action) => {
|
||||
switch (action.action) {
|
||||
case 'click':
|
||||
case 'double_click':
|
||||
case 'right_click':
|
||||
case 'hover':
|
||||
return `${action.action}(${Math.round(action.x ?? 500)}, ${Math.round(action.y ?? 500)})`
|
||||
case 'type': {
|
||||
const text = (action.text ?? '').replace(/'/g, "\\'")
|
||||
return `type('${text}')`
|
||||
}
|
||||
case 'press_key':
|
||||
return `press_key('${action.key ?? 'Enter'}')`
|
||||
case 'scroll':
|
||||
return `scroll(${action.direction ?? 'down'})`
|
||||
case 'drag':
|
||||
return `drag(${Math.round(action.startX ?? 500)},${Math.round(action.startY ?? 500)} -> ${Math.round(action.endX ?? 500)},${Math.round(action.endY ?? 500)})`
|
||||
case 'wait':
|
||||
return `wait(${Math.round(action.time ?? 1)}s)`
|
||||
case 'end':
|
||||
return 'end()'
|
||||
case 'invalid':
|
||||
return 'invalid()'
|
||||
default:
|
||||
return action.action
|
||||
}
|
||||
})
|
||||
|
||||
return parts.join(' -> ')
|
||||
}
|
||||
|
||||
private buildObservation(params: {
|
||||
status: ExecutorResult['status']
|
||||
reason: string
|
||||
@@ -946,7 +571,7 @@ export class CladoActionExecutor {
|
||||
: actions
|
||||
.slice(-5)
|
||||
.map(
|
||||
(action, idx) => `${idx + 1}. ${this.getActionSignature(action)}`,
|
||||
(action, idx) => `${idx + 1}. ${getCladoActionSignature(action)}`,
|
||||
)
|
||||
.join('\n')
|
||||
const thinkingSummary =
|
||||
|
||||
@@ -24,8 +24,9 @@ import {
|
||||
resolveProviderConfig,
|
||||
} from '../../utils/resolve-provider-config'
|
||||
import { withEvalTimeout } from '../../utils/with-eval-timeout'
|
||||
import { createExecutorBackend } from '../orchestrated/backends/create-executor-backend'
|
||||
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
|
||||
import { Executor, type ExecutorCallbacks } from './executor'
|
||||
import type { ExecutorCallbacks } from './executor'
|
||||
import { OrchestratorAgent } from './orchestrator-agent'
|
||||
import type { ExecutorFactory, ExecutorResult } from './types'
|
||||
|
||||
@@ -235,12 +236,13 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
|
||||
await capture.messageLogger.logStreamEvent(delegateInputEvent)
|
||||
capture.emitEvent(task.query_id, delegateInputEvent)
|
||||
|
||||
const executor = new Executor(
|
||||
executorConfig,
|
||||
const executor = createExecutorBackend({
|
||||
backendKind: isCladoAction ? 'clado' : 'tool-loop',
|
||||
configTemplate: executorConfig,
|
||||
browser,
|
||||
config.browseros.server_url,
|
||||
{ isCladoAction, callbacks },
|
||||
)
|
||||
serverUrl: config.browseros.server_url,
|
||||
callbacks,
|
||||
})
|
||||
let result: ExecutorResult
|
||||
try {
|
||||
result = await executor.execute(instruction, signal)
|
||||
|
||||
@@ -57,6 +57,20 @@ export class TrajectorySaver {
|
||||
)
|
||||
}
|
||||
|
||||
async saveAttempt(attempt: Record<string, unknown>): Promise<void> {
|
||||
await writeFile(
|
||||
join(this.outputDir, 'attempt.json'),
|
||||
JSON.stringify(attempt, null, 2),
|
||||
)
|
||||
}
|
||||
|
||||
async saveGrades(graderResults: Record<string, GraderResult>): Promise<void> {
|
||||
await writeFile(
|
||||
join(this.outputDir, 'grades.json'),
|
||||
JSON.stringify(graderResults, null, 2),
|
||||
)
|
||||
}
|
||||
|
||||
async loadMetadata(): Promise<TaskMetadata> {
|
||||
const content = await readFile(
|
||||
join(this.outputDir, 'metadata.json'),
|
||||
@@ -70,6 +84,7 @@ export class TrajectorySaver {
|
||||
): Promise<void> {
|
||||
const metadata = await this.loadMetadata()
|
||||
metadata.grader_results = graderResults
|
||||
await this.saveGrades(graderResults)
|
||||
await this.saveMetadata(metadata)
|
||||
}
|
||||
|
||||
|
||||
170
packages/browseros-agent/apps/eval/src/cli/args.ts
vendored
Normal file
170
packages/browseros-agent/apps/eval/src/cli/args.ts
vendored
Normal file
@@ -0,0 +1,170 @@
|
||||
import { parseArgs } from 'node:util'
|
||||
|
||||
export type PublishTarget = 'r2'
|
||||
|
||||
export interface LegacyCliArgs {
|
||||
command: 'legacy'
|
||||
configPath?: string
|
||||
help?: boolean
|
||||
}
|
||||
|
||||
export interface SuiteCliArgs {
|
||||
command: 'suite'
|
||||
configPath?: string
|
||||
suitePath?: string
|
||||
variantId?: string
|
||||
provider?: string
|
||||
model?: string
|
||||
apiKey?: string
|
||||
baseUrl?: string
|
||||
publishTarget?: PublishTarget
|
||||
}
|
||||
|
||||
export interface RunCliArgs
|
||||
extends Omit<SuiteCliArgs, 'command' | 'publishTarget'> {
|
||||
command: 'run'
|
||||
}
|
||||
|
||||
export interface GradeCliArgs {
|
||||
command: 'grade'
|
||||
runDir: string
|
||||
}
|
||||
|
||||
export interface PublishCliArgs {
|
||||
command: 'publish'
|
||||
runDir: string
|
||||
target: PublishTarget
|
||||
}
|
||||
|
||||
export type EvalCliArgs =
|
||||
| LegacyCliArgs
|
||||
| SuiteCliArgs
|
||||
| RunCliArgs
|
||||
| GradeCliArgs
|
||||
| PublishCliArgs
|
||||
|
||||
const COMMANDS = new Set(['suite', 'run', 'grade', 'publish'])
|
||||
|
||||
function stringValue(value: string | boolean | undefined): string | undefined {
|
||||
return typeof value === 'string' && value.length > 0 ? value : undefined
|
||||
}
|
||||
|
||||
function publishTarget(value: string | undefined): PublishTarget | undefined {
|
||||
if (value === undefined) return undefined
|
||||
if (value === 'r2') return 'r2'
|
||||
throw new Error(`Unsupported publish target: ${value}`)
|
||||
}
|
||||
|
||||
function requireOne(
|
||||
command: string,
|
||||
configPath: string | undefined,
|
||||
suitePath: string | undefined,
|
||||
): void {
|
||||
if (!configPath && !suitePath) {
|
||||
throw new Error(`${command} requires --config or --suite`)
|
||||
}
|
||||
if (configPath && suitePath) {
|
||||
throw new Error(`${command} accepts either --config or --suite, not both`)
|
||||
}
|
||||
}
|
||||
|
||||
function parseSuiteLikeArgs(
|
||||
command: 'suite' | 'run',
|
||||
argv: string[],
|
||||
): SuiteCliArgs | RunCliArgs {
|
||||
const { values } = parseArgs({
|
||||
args: argv,
|
||||
options: {
|
||||
config: { type: 'string' },
|
||||
suite: { type: 'string' },
|
||||
variant: { type: 'string' },
|
||||
provider: { type: 'string' },
|
||||
model: { type: 'string' },
|
||||
'api-key': { type: 'string' },
|
||||
'base-url': { type: 'string' },
|
||||
publish: { type: 'string' },
|
||||
},
|
||||
})
|
||||
|
||||
const configPath = stringValue(values.config)
|
||||
const suitePath = stringValue(values.suite)
|
||||
requireOne(command, configPath, suitePath)
|
||||
|
||||
const parsed: SuiteCliArgs | RunCliArgs =
|
||||
command === 'suite' ? { command: 'suite' } : { command: 'run' }
|
||||
if (configPath) parsed.configPath = configPath
|
||||
if (suitePath) parsed.suitePath = suitePath
|
||||
const variantId = stringValue(values.variant)
|
||||
if (variantId) parsed.variantId = variantId
|
||||
const provider = stringValue(values.provider)
|
||||
if (provider) parsed.provider = provider
|
||||
const model = stringValue(values.model)
|
||||
if (model) parsed.model = model
|
||||
const apiKey = stringValue(values['api-key'])
|
||||
if (apiKey) parsed.apiKey = apiKey
|
||||
const baseUrl = stringValue(values['base-url'])
|
||||
if (baseUrl) parsed.baseUrl = baseUrl
|
||||
|
||||
if (command === 'suite') {
|
||||
const target = publishTarget(stringValue(values.publish))
|
||||
if (target) {
|
||||
const suiteArgs = parsed as SuiteCliArgs
|
||||
suiteArgs.publishTarget = target
|
||||
}
|
||||
}
|
||||
|
||||
return parsed
|
||||
}
|
||||
|
||||
function parseLegacyArgs(argv: string[]): LegacyCliArgs {
|
||||
const { values } = parseArgs({
|
||||
args: argv,
|
||||
options: {
|
||||
config: { type: 'string', short: 'c' },
|
||||
help: { type: 'boolean', short: 'h' },
|
||||
},
|
||||
})
|
||||
|
||||
const parsed: LegacyCliArgs = { command: 'legacy' }
|
||||
const configPath = stringValue(values.config)
|
||||
if (configPath) parsed.configPath = configPath
|
||||
if (values.help) parsed.help = true
|
||||
return parsed
|
||||
}
|
||||
|
||||
/** Parses the eval CLI command without running browser or publishing side effects. */
|
||||
export function parseEvalCliArgs(argv: string[]): EvalCliArgs {
|
||||
const [command, ...rest] = argv
|
||||
if (!COMMANDS.has(command ?? '')) {
|
||||
return parseLegacyArgs(argv)
|
||||
}
|
||||
|
||||
switch (command) {
|
||||
case 'suite':
|
||||
return parseSuiteLikeArgs('suite', rest)
|
||||
case 'run':
|
||||
return parseSuiteLikeArgs('run', rest)
|
||||
case 'grade': {
|
||||
const { values } = parseArgs({
|
||||
args: rest,
|
||||
options: { run: { type: 'string' } },
|
||||
})
|
||||
const runDir = stringValue(values.run)
|
||||
if (!runDir) throw new Error('grade requires --run')
|
||||
return { command: 'grade', runDir }
|
||||
}
|
||||
case 'publish': {
|
||||
const { values } = parseArgs({
|
||||
args: rest,
|
||||
options: { run: { type: 'string' }, target: { type: 'string' } },
|
||||
})
|
||||
const runDir = stringValue(values.run)
|
||||
if (!runDir) throw new Error('publish requires --run')
|
||||
const target = publishTarget(stringValue(values.target))
|
||||
if (!target) throw new Error('publish requires --target')
|
||||
return { command: 'publish', runDir, target }
|
||||
}
|
||||
default:
|
||||
return parseLegacyArgs(argv)
|
||||
}
|
||||
}
|
||||
84
packages/browseros-agent/apps/eval/src/cli/commands/grade.ts
vendored
Normal file
84
packages/browseros-agent/apps/eval/src/cli/commands/grade.ts
vendored
Normal file
@@ -0,0 +1,84 @@
|
||||
import { readdir, readFile, stat } from 'node:fs/promises'
|
||||
import { join } from 'node:path'
|
||||
import { TrajectorySaver } from '../../capture/trajectory-saver'
|
||||
import { runGraders } from '../../grading/grader-runner'
|
||||
import { type Message, MessageSchema, TaskMetadataSchema } from '../../types'
|
||||
import type { GradeCliArgs } from '../args'
|
||||
|
||||
async function loadMessages(taskDir: string): Promise<Message[]> {
|
||||
const content = await readFile(
|
||||
join(taskDir, 'messages.jsonl'),
|
||||
'utf-8',
|
||||
).catch(() => '')
|
||||
return content
|
||||
.split('\n')
|
||||
.filter((line) => line.trim().length > 0)
|
||||
.map((line) => MessageSchema.parse(JSON.parse(line)))
|
||||
}
|
||||
|
||||
async function findTaskDirs(runDir: string): Promise<string[]> {
|
||||
const entries = await readdir(runDir, { withFileTypes: true })
|
||||
const taskDirs: string[] = []
|
||||
for (const entry of entries) {
|
||||
if (!entry.isDirectory()) continue
|
||||
const taskDir = join(runDir, entry.name)
|
||||
const metadata = await stat(join(taskDir, 'metadata.json')).catch(
|
||||
() => null,
|
||||
)
|
||||
if (metadata?.isFile()) taskDirs.push(taskDir)
|
||||
}
|
||||
return taskDirs
|
||||
}
|
||||
|
||||
/** Re-runs graders for task artifacts that already contain metadata and messages. */
|
||||
export async function runGradeCommand(args: GradeCliArgs): Promise<void> {
|
||||
const runStat = await stat(args.runDir).catch(() => null)
|
||||
if (!runStat?.isDirectory()) {
|
||||
throw new Error(`Not a run directory: ${args.runDir}`)
|
||||
}
|
||||
|
||||
const taskDirs = await findTaskDirs(args.runDir)
|
||||
if (taskDirs.length === 0) {
|
||||
throw new Error(`No task metadata found under ${args.runDir}`)
|
||||
}
|
||||
|
||||
let graded = 0
|
||||
for (const taskDir of taskDirs) {
|
||||
const metadata = TaskMetadataSchema.parse(
|
||||
JSON.parse(await readFile(join(taskDir, 'metadata.json'), 'utf-8')),
|
||||
)
|
||||
const graderNames = Object.keys(metadata.grader_results ?? {})
|
||||
if (graderNames.length === 0) {
|
||||
console.warn(`Skipping ${metadata.query_id}: no existing grader names`)
|
||||
continue
|
||||
}
|
||||
|
||||
const messages = await loadMessages(taskDir)
|
||||
const graderResults = await runGraders(graderNames, {
|
||||
task: {
|
||||
query_id: metadata.query_id,
|
||||
query: metadata.query,
|
||||
dataset: metadata.dataset,
|
||||
},
|
||||
messages,
|
||||
screenshotCount: metadata.screenshot_count ?? metadata.total_steps,
|
||||
finalAnswer: metadata.final_answer,
|
||||
taskArtifactDir: taskDir,
|
||||
outputDir: taskDir,
|
||||
mcpUrl: `${process.env.BROWSEROS_SERVER_URL || 'http://127.0.0.1:9110'}/mcp`,
|
||||
})
|
||||
|
||||
await new TrajectorySaver(
|
||||
args.runDir,
|
||||
metadata.query_id,
|
||||
).updateGraderResults(graderResults)
|
||||
graded++
|
||||
}
|
||||
|
||||
if (graded === 0) {
|
||||
throw new Error(
|
||||
`No tasks with existing grader names found under ${args.runDir}`,
|
||||
)
|
||||
}
|
||||
console.log(`Re-graded ${graded} task(s) in ${args.runDir}`)
|
||||
}
|
||||
25
packages/browseros-agent/apps/eval/src/cli/commands/publish.ts
vendored
Normal file
25
packages/browseros-agent/apps/eval/src/cli/commands/publish.ts
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
import { publishPathToR2 } from '../../publishing/r2-publisher'
|
||||
import type { PublishCliArgs, PublishTarget } from '../args'
|
||||
|
||||
export interface PublishRunOptions {
|
||||
runDir: string
|
||||
target: PublishTarget
|
||||
}
|
||||
|
||||
/** Publishes run artifacts through the R2 viewer upload path. */
|
||||
export async function publishRun(options: PublishRunOptions): Promise<void> {
|
||||
if (options.target !== 'r2') {
|
||||
throw new Error(`Unsupported publish target: ${options.target}`)
|
||||
}
|
||||
const result = await publishPathToR2(options.runDir)
|
||||
for (const run of result.uploadedRuns) {
|
||||
console.log(run.viewerUrl)
|
||||
}
|
||||
for (const runId of result.skippedRuns) {
|
||||
console.log(`${runId}: already uploaded, skipping`)
|
||||
}
|
||||
}
|
||||
|
||||
export async function runPublishCommand(args: PublishCliArgs): Promise<void> {
|
||||
await publishRun({ runDir: args.runDir, target: args.target })
|
||||
}
|
||||
21
packages/browseros-agent/apps/eval/src/cli/commands/run.ts
vendored
Normal file
21
packages/browseros-agent/apps/eval/src/cli/commands/run.ts
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
import type { RunCliArgs } from '../args'
|
||||
import { runSuiteCommand, type SuiteCommandDeps } from './suite'
|
||||
|
||||
/** Executes tasks from a config or suite without publishing artifacts. */
|
||||
export async function runRunCommand(
|
||||
args: RunCliArgs,
|
||||
deps: SuiteCommandDeps = {},
|
||||
): Promise<void> {
|
||||
await runSuiteCommand(
|
||||
{
|
||||
configPath: args.configPath,
|
||||
suitePath: args.suitePath,
|
||||
variantId: args.variantId,
|
||||
provider: args.provider,
|
||||
model: args.model,
|
||||
apiKey: args.apiKey,
|
||||
baseUrl: args.baseUrl,
|
||||
},
|
||||
deps,
|
||||
)
|
||||
}
|
||||
187
packages/browseros-agent/apps/eval/src/cli/commands/suite.ts
vendored
Normal file
187
packages/browseros-agent/apps/eval/src/cli/commands/suite.ts
vendored
Normal file
@@ -0,0 +1,187 @@
|
||||
import type { RunEvalOptions, RunEvalResult } from '../../runner/types'
|
||||
import { runEval as defaultRunEval } from '../../runs/eval-runner'
|
||||
import {
|
||||
type AdaptedEvalConfig,
|
||||
adaptEvalConfigFile,
|
||||
} from '../../suites/config-adapter'
|
||||
import { loadSuite } from '../../suites/load-suite'
|
||||
import { type EvalVariant, resolveVariant } from '../../suites/resolve-variant'
|
||||
import type { EvalSuite } from '../../suites/schema'
|
||||
import { type EvalConfig, EvalConfigSchema } from '../../types'
|
||||
import type { PublishTarget } from '../args'
|
||||
|
||||
type Env = Record<string, string | undefined>
|
||||
|
||||
export interface SuiteCommandOptions {
|
||||
configPath?: string
|
||||
suitePath?: string
|
||||
variantId?: string
|
||||
provider?: string
|
||||
model?: string
|
||||
apiKey?: string
|
||||
baseUrl?: string
|
||||
publishTarget?: PublishTarget
|
||||
env?: Env
|
||||
}
|
||||
|
||||
export type ResolvedSuiteCommand =
|
||||
| (AdaptedEvalConfig & { kind: 'config'; datasetPath?: undefined })
|
||||
| {
|
||||
kind: 'suite'
|
||||
suitePath: string
|
||||
suite: EvalSuite
|
||||
variant: EvalVariant
|
||||
datasetPath: string
|
||||
evalConfig: EvalConfig
|
||||
}
|
||||
|
||||
export interface SuiteCommandDeps {
|
||||
runEval?: (options: RunEvalOptions) => Promise<RunEvalResult | undefined>
|
||||
publishRun?: (options: {
|
||||
runDir: string
|
||||
target: PublishTarget
|
||||
}) => Promise<void>
|
||||
}
|
||||
|
||||
function ensureRunnableSuite(suite: EvalSuite): void {
|
||||
if (!suite.browseros) {
|
||||
throw new Error('suite browseros config is required to run suite commands')
|
||||
}
|
||||
}
|
||||
|
||||
function suiteToEvalConfig(
|
||||
suite: EvalSuite,
|
||||
datasetPath: string,
|
||||
variant: EvalVariant,
|
||||
env: Env,
|
||||
): EvalConfig {
|
||||
ensureRunnableSuite(suite)
|
||||
|
||||
const base = {
|
||||
dataset: datasetPath,
|
||||
num_workers: suite.workers,
|
||||
restart_server_per_task: suite.restartBrowserPerTask,
|
||||
browseros: suite.browseros,
|
||||
graders: suite.graders,
|
||||
timeout_ms: suite.timeoutMs,
|
||||
captcha: suite.captcha,
|
||||
}
|
||||
|
||||
if (suite.agent.type === 'single' || suite.agent.type === 'tool-loop') {
|
||||
// The legacy runner names the BrowserOS tool-loop agent "single".
|
||||
return EvalConfigSchema.parse({
|
||||
...base,
|
||||
agent: {
|
||||
type: 'single',
|
||||
provider: variant.agent.provider,
|
||||
model: variant.agent.model,
|
||||
apiKey: variant.agent.apiKey,
|
||||
baseUrl: variant.agent.baseUrl,
|
||||
supportsImages: variant.agent.supportsImages,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
const executorBackend = suite.agent.executorBackend ?? 'tool-loop'
|
||||
const executor =
|
||||
executorBackend === 'clado'
|
||||
? {
|
||||
provider: 'clado-action' as const,
|
||||
model:
|
||||
env.EVAL_EXECUTOR_MODEL ?? env.CLADO_ACTION_MODEL ?? 'clado-action',
|
||||
apiKey: env.EVAL_EXECUTOR_API_KEY ?? env.CLADO_ACTION_API_KEY ?? '',
|
||||
baseUrl:
|
||||
env.EVAL_EXECUTOR_BASE_URL ??
|
||||
env.CLADO_ACTION_BASE_URL ??
|
||||
env.CLADO_ACTION_URL,
|
||||
}
|
||||
: {
|
||||
provider: variant.agent.provider,
|
||||
model: variant.agent.model,
|
||||
apiKey: variant.agent.apiKey,
|
||||
baseUrl: variant.agent.baseUrl,
|
||||
}
|
||||
|
||||
return EvalConfigSchema.parse({
|
||||
...base,
|
||||
agent: {
|
||||
type: 'orchestrator-executor',
|
||||
orchestrator: {
|
||||
provider: variant.agent.provider,
|
||||
model: variant.agent.model,
|
||||
apiKey: variant.agent.apiKey,
|
||||
baseUrl: variant.agent.baseUrl,
|
||||
},
|
||||
executor,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
/** Resolves config-backed or suite-backed CLI input into the run shape used by the runner. */
|
||||
export async function resolveSuiteCommand(
|
||||
options: SuiteCommandOptions,
|
||||
): Promise<ResolvedSuiteCommand> {
|
||||
const env = options.env ?? process.env
|
||||
if (options.configPath) {
|
||||
return {
|
||||
kind: 'config',
|
||||
...(await adaptEvalConfigFile(options.configPath, { env })),
|
||||
}
|
||||
}
|
||||
if (!options.suitePath) {
|
||||
throw new Error('suite requires --config or --suite')
|
||||
}
|
||||
|
||||
const loaded = await loadSuite(options.suitePath)
|
||||
const variant = resolveVariant({
|
||||
variantId: options.variantId,
|
||||
provider: options.provider,
|
||||
model: options.model,
|
||||
apiKey: options.apiKey,
|
||||
baseUrl: options.baseUrl,
|
||||
env,
|
||||
})
|
||||
|
||||
return {
|
||||
kind: 'suite',
|
||||
suitePath: loaded.suitePath,
|
||||
suite: loaded.suite,
|
||||
variant,
|
||||
datasetPath: loaded.datasetPath,
|
||||
evalConfig: suiteToEvalConfig(
|
||||
loaded.suite,
|
||||
loaded.datasetPath,
|
||||
variant,
|
||||
env,
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
/** Runs the full suite loop: resolve input, execute tasks, then optionally publish the run. */
|
||||
export async function runSuiteCommand(
|
||||
options: SuiteCommandOptions,
|
||||
deps: SuiteCommandDeps = {},
|
||||
): Promise<void> {
|
||||
const runEval = deps.runEval ?? defaultRunEval
|
||||
const resolved = await resolveSuiteCommand(options)
|
||||
const runOptions: RunEvalOptions =
|
||||
resolved.kind === 'config'
|
||||
? { configPath: resolved.configPath }
|
||||
: {
|
||||
configPath: resolved.suitePath,
|
||||
dataPath: resolved.datasetPath,
|
||||
config: resolved.evalConfig,
|
||||
}
|
||||
|
||||
const result = await runEval(runOptions)
|
||||
if (!options.publishTarget) return
|
||||
|
||||
const outputDir = result?.outputDir
|
||||
if (!outputDir) {
|
||||
throw new Error('publish requested but runner did not return an outputDir')
|
||||
}
|
||||
if (!deps.publishRun) {
|
||||
throw new Error('publish requested before the publisher is configured')
|
||||
}
|
||||
await deps.publishRun({ runDir: outputDir, target: options.publishTarget })
|
||||
}
|
||||
70
packages/browseros-agent/apps/eval/src/cli/index.ts
vendored
Normal file
70
packages/browseros-agent/apps/eval/src/cli/index.ts
vendored
Normal file
@@ -0,0 +1,70 @@
|
||||
import { startDashboard } from '../dashboard/server'
|
||||
import { runEval } from '../runs/eval-runner'
|
||||
import { type EvalCliArgs, parseEvalCliArgs } from './args'
|
||||
import { runGradeCommand } from './commands/grade'
|
||||
import { publishRun, runPublishCommand } from './commands/publish'
|
||||
import { runRunCommand } from './commands/run'
|
||||
import { runSuiteCommand } from './commands/suite'
|
||||
|
||||
export function usage(): string {
|
||||
return `
|
||||
BrowserOS Eval
|
||||
|
||||
Usage:
|
||||
bun run eval suite --config <config.json> [--publish r2]
|
||||
bun run eval suite --suite <suite.json> --variant <id> [--publish r2]
|
||||
bun run eval run --config <config.json>
|
||||
bun run eval run --suite <suite.json> --variant <id>
|
||||
bun run eval grade --run <results/run-dir>
|
||||
bun run eval publish --run <results/run-dir> --target r2
|
||||
bun run eval -c <config.json>
|
||||
`
|
||||
}
|
||||
|
||||
async function runLegacyCommand(args: EvalCliArgs): Promise<void> {
|
||||
if (args.command !== 'legacy') return
|
||||
if (args.help) {
|
||||
console.log(usage())
|
||||
return
|
||||
}
|
||||
if (args.configPath) {
|
||||
await runEval({ configPath: args.configPath })
|
||||
return
|
||||
}
|
||||
|
||||
startDashboard({
|
||||
tasks: [],
|
||||
configName: '',
|
||||
agentType: '',
|
||||
outputDir: '',
|
||||
configMode: true,
|
||||
})
|
||||
console.log(
|
||||
'Dashboard running at http://localhost:9900 — configure and run from the UI',
|
||||
)
|
||||
await new Promise(() => {})
|
||||
}
|
||||
|
||||
/** Dispatches the eval CLI while preserving the old config/dashboard entry points. */
|
||||
export async function runCli(
|
||||
argv: string[] = Bun.argv.slice(2),
|
||||
): Promise<void> {
|
||||
const args = parseEvalCliArgs(argv)
|
||||
switch (args.command) {
|
||||
case 'legacy':
|
||||
await runLegacyCommand(args)
|
||||
break
|
||||
case 'suite':
|
||||
await runSuiteCommand(args, { publishRun })
|
||||
break
|
||||
case 'run':
|
||||
await runRunCommand(args)
|
||||
break
|
||||
case 'grade':
|
||||
await runGradeCommand(args)
|
||||
break
|
||||
case 'publish':
|
||||
await runPublishCommand(args)
|
||||
break
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
import { mkdir, readdir, readFile, stat } from 'node:fs/promises'
|
||||
import { join, resolve } from 'node:path'
|
||||
import { dirname, join, resolve, sep } from 'node:path'
|
||||
import { Hono } from 'hono'
|
||||
import { streamSSE } from 'hono/streaming'
|
||||
import { ParallelExecutor } from '../runner/parallel-executor'
|
||||
@@ -128,6 +128,35 @@ let dashboardConfigMode = false
|
||||
const configsDir = join(import.meta.dir, '..', '..', 'configs')
|
||||
const projectRoot = resolve(import.meta.dir, '..', '..', '..', '..')
|
||||
|
||||
async function listConfigFiles(dir: string, prefix = ''): Promise<string[]> {
|
||||
const entries = await readdir(join(dir, prefix), { withFileTypes: true })
|
||||
const files: string[] = []
|
||||
for (const entry of entries) {
|
||||
const relativePath = prefix ? join(prefix, entry.name) : entry.name
|
||||
if (entry.isDirectory()) {
|
||||
files.push(...(await listConfigFiles(dir, relativePath)))
|
||||
} else if (entry.isFile() && entry.name.endsWith('.json')) {
|
||||
files.push(relativePath.split(sep).join('/'))
|
||||
}
|
||||
}
|
||||
return files.sort()
|
||||
}
|
||||
|
||||
/**
 * Maps a user-supplied config name to an absolute path inside configsDir.
 * Returns null for anything that is not a plain .json path safely contained
 * in the configs directory — this is the path-traversal guard shared by the
 * /api/config and /api/run routes.
 */
function resolveConfigPath(name: string): string | null {
  if (!name.endsWith('.json')) return null
  // Reject empty segments and '.'/'..' components before resolving.
  if (name.split('/').some((part) => !part || part === '.' || part === '..')) {
    return null
  }

  const resolvedPath = resolve(configsDir, name)
  const resolvedConfigsDir = resolve(configsDir)
  // Compare against a separator-terminated prefix so a sibling directory such
  // as 'configs-old' cannot pass the startsWith check.
  const configRootPrefix = resolvedConfigsDir.endsWith(sep)
    ? resolvedConfigsDir
    : `${resolvedConfigsDir}${sep}`
  if (!resolvedPath.startsWith(configRootPrefix)) return null
  return resolvedPath
}
|
||||
|
||||
// ============================================================================
|
||||
// Hono App
|
||||
// ============================================================================
|
||||
@@ -339,21 +368,21 @@ app.get('/api/mode', (c) => {
|
||||
// List saved config files
|
||||
app.get('/api/configs', async (c) => {
|
||||
try {
|
||||
const files = await readdir(configsDir)
|
||||
return c.json(files.filter((f) => f.endsWith('.json')))
|
||||
return c.json(await listConfigFiles(configsDir))
|
||||
} catch {
|
||||
return c.json([])
|
||||
}
|
||||
})
|
||||
|
||||
// Read a specific config file
|
||||
app.get('/api/config/:name', async (c) => {
|
||||
const name = c.req.param('name')
|
||||
if (name.includes('/') || name.includes('..')) {
|
||||
app.get('/api/config/*', async (c) => {
|
||||
const name = decodeURIComponent(c.req.path.slice('/api/config/'.length))
|
||||
const configPath = resolveConfigPath(name)
|
||||
if (!configPath) {
|
||||
return c.json({ error: 'Invalid config name' }, 400)
|
||||
}
|
||||
try {
|
||||
const content = await readFile(join(configsDir, name), 'utf-8')
|
||||
const content = await readFile(configPath, 'utf-8')
|
||||
return c.json(JSON.parse(content))
|
||||
} catch {
|
||||
return c.notFound()
|
||||
@@ -382,8 +411,17 @@ app.post('/api/run', async (c) => {
|
||||
|
||||
const config = parseResult.data
|
||||
|
||||
// Resolve relative paths from configs/ dir (dataset dropdown values are relative to it)
|
||||
const baseDir = configsDir
|
||||
let baseDir = configsDir
|
||||
if (body.configName) {
|
||||
const configPath = resolveConfigPath(body.configName)
|
||||
if (!configPath) {
|
||||
return c.json({ error: 'Invalid config name' }, 400)
|
||||
}
|
||||
baseDir = dirname(configPath)
|
||||
}
|
||||
|
||||
// Resolve relative paths from the loaded config location. Unsaved dashboard
|
||||
// configs keep using apps/eval/configs as their base for dropdown values.
|
||||
const datasetPath = resolve(
|
||||
config.dataset.startsWith('/')
|
||||
? config.dataset
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
import { spawn } from 'node:child_process'
|
||||
import { join } from 'node:path'
|
||||
import {
|
||||
writeGraderJsonArtifact,
|
||||
writeGraderTextArtifact,
|
||||
} from '../../grading/artifacts'
|
||||
import {
|
||||
type PythonEvaluatorResult,
|
||||
runPythonJsonEvaluator,
|
||||
} from '../../grading/python-evaluator'
|
||||
import type { GraderResult } from '../../types'
|
||||
import { callMcpTool } from '../../utils/mcp-client'
|
||||
import type { Grader, GraderInput } from '../types'
|
||||
@@ -7,12 +14,23 @@ import type { Grader, GraderInput } from '../types'
|
||||
const EVAL_SCRIPT = join(
|
||||
import.meta.dirname,
|
||||
'..',
|
||||
'..',
|
||||
'..',
|
||||
'scripts',
|
||||
'python',
|
||||
'agisdk-evaluate.py',
|
||||
)
|
||||
|
||||
interface AgisdkEvaluatorInput {
|
||||
task_id: string
|
||||
env_state: Record<string, unknown>
|
||||
model_response: string
|
||||
}
|
||||
|
||||
interface AgisdkEvaluatorOutput {
|
||||
reward: number
|
||||
pass: boolean
|
||||
message: string
|
||||
per_criterion: unknown[]
|
||||
}
|
||||
|
||||
export class AgisdkStateDiffGrader implements Grader {
|
||||
name = 'agisdk_state_diff'
|
||||
|
||||
@@ -36,6 +54,16 @@ export class AgisdkStateDiffGrader implements Grader {
|
||||
let envState: Record<string, unknown>
|
||||
try {
|
||||
envState = await this.fetchFinishState(origin, mcpEndpoint)
|
||||
await writeGraderJsonArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'finish-state.json',
|
||||
envState,
|
||||
)
|
||||
await writeGraderJsonArtifact(input, this.name, 'context.json', {
|
||||
origin,
|
||||
agisdk_task_id: taskId,
|
||||
})
|
||||
} catch (error) {
|
||||
return {
|
||||
score: 0,
|
||||
@@ -46,10 +74,30 @@ export class AgisdkStateDiffGrader implements Grader {
|
||||
}
|
||||
|
||||
try {
|
||||
const result = await this.runPythonEvaluator(
|
||||
taskId,
|
||||
envState,
|
||||
input.finalAnswer || '',
|
||||
const evaluatorInput: AgisdkEvaluatorInput = {
|
||||
task_id: taskId,
|
||||
env_state: envState,
|
||||
model_response: input.finalAnswer || '',
|
||||
}
|
||||
await writeGraderJsonArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'evaluator-input.json',
|
||||
evaluatorInput,
|
||||
)
|
||||
const evaluation = await this.runPythonEvaluator(evaluatorInput)
|
||||
const result = evaluation.output
|
||||
await writeGraderJsonArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'evaluator-output.json',
|
||||
result,
|
||||
)
|
||||
await writeGraderTextArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'stderr.txt',
|
||||
evaluation.stderr,
|
||||
)
|
||||
return {
|
||||
score: result.reward,
|
||||
@@ -144,59 +192,12 @@ export class AgisdkStateDiffGrader implements Grader {
|
||||
}
|
||||
|
||||
private runPythonEvaluator(
|
||||
taskId: string,
|
||||
envState: Record<string, unknown>,
|
||||
modelResponse: string,
|
||||
): Promise<{
|
||||
reward: number
|
||||
pass: boolean
|
||||
message: string
|
||||
per_criterion: unknown[]
|
||||
}> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const proc = spawn('python3', [EVAL_SCRIPT], {
|
||||
stdio: ['pipe', 'pipe', 'pipe'],
|
||||
})
|
||||
|
||||
const inputData = JSON.stringify({
|
||||
task_id: taskId,
|
||||
env_state: envState,
|
||||
model_response: modelResponse,
|
||||
})
|
||||
|
||||
let stdout = ''
|
||||
let stderr = ''
|
||||
|
||||
proc.stdout.on('data', (data: Buffer) => {
|
||||
stdout += data.toString()
|
||||
})
|
||||
|
||||
proc.stderr.on('data', (data: Buffer) => {
|
||||
stderr += data.toString()
|
||||
})
|
||||
|
||||
proc.on('close', (code) => {
|
||||
if (code !== 0) {
|
||||
reject(
|
||||
new Error(`Python evaluator exited with code ${code}: ${stderr}`),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
try {
|
||||
const result = JSON.parse(stdout.trim())
|
||||
resolve(result)
|
||||
} catch {
|
||||
reject(new Error(`Failed to parse evaluator output: ${stdout}`))
|
||||
}
|
||||
})
|
||||
|
||||
proc.on('error', (err) => {
|
||||
reject(new Error(`Failed to spawn Python evaluator: ${err.message}`))
|
||||
})
|
||||
|
||||
proc.stdin.write(inputData)
|
||||
proc.stdin.end()
|
||||
evalInput: AgisdkEvaluatorInput,
|
||||
): Promise<PythonEvaluatorResult<AgisdkEvaluatorOutput>> {
|
||||
return runPythonJsonEvaluator<AgisdkEvaluatorOutput>({
|
||||
scriptPath: EVAL_SCRIPT,
|
||||
input: evalInput,
|
||||
timeoutMs: 300_000,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,12 @@
|
||||
import { join, resolve } from 'node:path'
|
||||
import {
|
||||
writeGraderJsonArtifact,
|
||||
writeGraderTextArtifact,
|
||||
} from '../../grading/artifacts'
|
||||
import {
|
||||
type PythonEvaluatorResult,
|
||||
runPythonJsonEvaluator,
|
||||
} from '../../grading/python-evaluator'
|
||||
import type { GraderResult } from '../../types'
|
||||
import type { Grader, GraderInput } from '../types'
|
||||
|
||||
@@ -14,10 +22,7 @@ interface InfinityEvalOutput {
|
||||
message: string
|
||||
}
|
||||
|
||||
const EVAL_SCRIPT = resolve(
|
||||
import.meta.dir,
|
||||
'../../../scripts/infinity-evaluate.py',
|
||||
)
|
||||
const EVAL_SCRIPT = resolve(import.meta.dir, '../python/infinity-evaluate.py')
|
||||
|
||||
export class InfinityStateGrader implements Grader {
|
||||
name = 'infinity_state'
|
||||
@@ -66,7 +71,32 @@ export class InfinityStateGrader implements Grader {
|
||||
}
|
||||
|
||||
try {
|
||||
const result = await this.runPythonEvaluator(evalInput)
|
||||
await writeGraderJsonArtifact(input, this.name, 'verifier.json', {
|
||||
appName: parsed.appName,
|
||||
taskId: parsed.taskId,
|
||||
verifierPath,
|
||||
appServerUrl,
|
||||
})
|
||||
await writeGraderJsonArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'evaluator-input.json',
|
||||
evalInput,
|
||||
)
|
||||
const evaluation = await this.runPythonEvaluator(evalInput)
|
||||
const result = evaluation.output
|
||||
await writeGraderJsonArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'evaluator-output.json',
|
||||
result,
|
||||
)
|
||||
await writeGraderTextArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'stderr.txt',
|
||||
evaluation.stderr,
|
||||
)
|
||||
return {
|
||||
score: result.pass ? 1 : 0,
|
||||
pass: result.pass,
|
||||
@@ -108,27 +138,11 @@ export class InfinityStateGrader implements Grader {
|
||||
|
||||
private async runPythonEvaluator(
|
||||
evalInput: InfinityEvalInput,
|
||||
): Promise<InfinityEvalOutput> {
|
||||
const proc = Bun.spawn(['python3', EVAL_SCRIPT], {
|
||||
stdin: 'pipe',
|
||||
stdout: 'pipe',
|
||||
stderr: 'pipe',
|
||||
): Promise<PythonEvaluatorResult<InfinityEvalOutput>> {
|
||||
return runPythonJsonEvaluator<InfinityEvalOutput>({
|
||||
scriptPath: EVAL_SCRIPT,
|
||||
input: evalInput,
|
||||
timeoutMs: 300_000,
|
||||
})
|
||||
|
||||
const inputJson = JSON.stringify(evalInput)
|
||||
proc.stdin.write(inputJson)
|
||||
proc.stdin.end()
|
||||
|
||||
const stdout = await new Response(proc.stdout).text()
|
||||
const stderr = await new Response(proc.stderr).text()
|
||||
const exitCode = await proc.exited
|
||||
|
||||
if (exitCode !== 0) {
|
||||
throw new Error(
|
||||
`Python evaluator exited with code ${exitCode}: ${stderr || stdout}`,
|
||||
)
|
||||
}
|
||||
|
||||
return JSON.parse(stdout.trim()) as InfinityEvalOutput
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { readFile } from 'node:fs/promises'
|
||||
import { join } from 'node:path'
|
||||
import { query } from '@anthropic-ai/claude-agent-sdk'
|
||||
import { writeGraderJsonArtifact } from '../../grading/artifacts'
|
||||
import type { GraderResult } from '../../types'
|
||||
import type { Grader, GraderInput } from '../types'
|
||||
import {
|
||||
@@ -63,6 +64,7 @@ export class PerformanceGrader implements Grader {
|
||||
input.screenshotCount,
|
||||
terminationReason,
|
||||
)
|
||||
await writeGraderJsonArtifact(input, this.name, 'metrics.json', metrics)
|
||||
|
||||
const systemPrompt = PERFORMANCE_SYSTEM_PROMPT.replace(
|
||||
/\{screenshot_count\}/g,
|
||||
@@ -82,6 +84,14 @@ export class PerformanceGrader implements Grader {
|
||||
userPrompt,
|
||||
input.outputDir,
|
||||
)
|
||||
if (response) {
|
||||
await writeGraderJsonArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'agent-output.json',
|
||||
response,
|
||||
)
|
||||
}
|
||||
|
||||
if (!response) {
|
||||
return {
|
||||
@@ -140,6 +150,7 @@ export class PerformanceGrader implements Grader {
|
||||
`Perf grader: LLM returned ${returnedAxes.size}/${expectedAxes.size} axes, missing: ${missingAxes.join(', ')}`,
|
||||
)
|
||||
}
|
||||
await writeGraderJsonArtifact(input, this.name, 'axes.json', axisResults)
|
||||
|
||||
return {
|
||||
score: compositeScore / 100,
|
||||
|
||||
@@ -1,51 +1,2 @@
|
||||
import type { GraderResult } from '../types'
|
||||
import { AgisdkStateDiffGrader } from './benchmark/agisdk-state-diff'
|
||||
import { InfinityStateGrader } from './benchmark/infinity-state'
|
||||
import { PerformanceGrader } from './performance/performance-grader'
|
||||
import type { Grader, GraderInput } from './types'
|
||||
|
||||
export const PASS_FAIL_GRADER_ORDER = [
|
||||
'agisdk_state_diff',
|
||||
'infinity_state',
|
||||
'performance_grader',
|
||||
] as const
|
||||
|
||||
export function createGrader(name: string): Grader | null {
|
||||
switch (name) {
|
||||
case 'agisdk_state_diff':
|
||||
return new AgisdkStateDiffGrader()
|
||||
case 'infinity_state':
|
||||
return new InfinityStateGrader()
|
||||
case 'performance_grader':
|
||||
return new PerformanceGrader()
|
||||
default:
|
||||
console.warn(`Unknown grader: ${name}`)
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
export async function runGraders(
|
||||
graderNames: string[],
|
||||
input: GraderInput,
|
||||
): Promise<Record<string, GraderResult>> {
|
||||
const results: Record<string, GraderResult> = {}
|
||||
|
||||
for (const name of graderNames) {
|
||||
const grader = createGrader(name)
|
||||
if (!grader) continue
|
||||
try {
|
||||
console.log(` Running grader: ${name}`)
|
||||
results[name] = await grader.grade(input)
|
||||
} catch (error) {
|
||||
results[name] = {
|
||||
score: 0,
|
||||
pass: false,
|
||||
reasoning: `Error running grader: ${error}`,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
export { AgisdkStateDiffGrader, InfinityStateGrader, PerformanceGrader }
|
||||
export * from '../grading/grader-registry'
|
||||
export { runConfiguredGraders, runGraders } from '../grading/grader-runner'
|
||||
|
||||
@@ -1,21 +1 @@
|
||||
import type { GraderResult, Message } from '../types'
|
||||
|
||||
export interface GraderInput {
|
||||
task: {
|
||||
query_id: string
|
||||
query: string
|
||||
dataset: string
|
||||
}
|
||||
messages: Message[]
|
||||
screenshotCount: number
|
||||
finalAnswer: string | null
|
||||
expectedAnswer?: string | null
|
||||
outputDir: string
|
||||
mcpUrl?: string
|
||||
infinityAppUrl?: string
|
||||
}
|
||||
|
||||
export interface Grader {
|
||||
name: string
|
||||
grade(input: GraderInput): Promise<GraderResult>
|
||||
}
|
||||
export type { Grader, GraderInput } from '../grading/types'
|
||||
|
||||
34
packages/browseros-agent/apps/eval/src/grading/artifacts.ts
vendored
Normal file
34
packages/browseros-agent/apps/eval/src/grading/artifacts.ts
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
import { mkdir, writeFile } from 'node:fs/promises'
|
||||
import { join } from 'node:path'
|
||||
import type { GraderInput } from './types'
|
||||
|
||||
function artifactDir(input: GraderInput, graderName: string): string {
|
||||
return join(
|
||||
input.taskArtifactDir || input.outputDir,
|
||||
'grader-artifacts',
|
||||
graderName,
|
||||
)
|
||||
}
|
||||
|
||||
/** Writes a JSON artifact for a grader under the task artifact directory. */
|
||||
export async function writeGraderJsonArtifact(
|
||||
input: GraderInput,
|
||||
graderName: string,
|
||||
filename: string,
|
||||
value: unknown,
|
||||
): Promise<void> {
|
||||
const dir = artifactDir(input, graderName)
|
||||
await mkdir(dir, { recursive: true })
|
||||
await writeFile(join(dir, filename), JSON.stringify(value, null, 2))
|
||||
}
|
||||
|
||||
export async function writeGraderTextArtifact(
|
||||
input: GraderInput,
|
||||
graderName: string,
|
||||
filename: string,
|
||||
value: string,
|
||||
): Promise<void> {
|
||||
const dir = artifactDir(input, graderName)
|
||||
await mkdir(dir, { recursive: true })
|
||||
await writeFile(join(dir, filename), value)
|
||||
}
|
||||
26
packages/browseros-agent/apps/eval/src/grading/grader-registry.ts
vendored
Normal file
26
packages/browseros-agent/apps/eval/src/grading/grader-registry.ts
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
import { AgisdkStateDiffGrader } from '../graders/benchmark/agisdk-state-diff'
|
||||
import { InfinityStateGrader } from '../graders/benchmark/infinity-state'
|
||||
import { PerformanceGrader } from '../graders/performance/performance-grader'
|
||||
import type { Grader } from './types'
|
||||
|
||||
export const PASS_FAIL_GRADER_ORDER = [
|
||||
'agisdk_state_diff',
|
||||
'infinity_state',
|
||||
'performance_grader',
|
||||
] as const
|
||||
|
||||
export function createGrader(name: string): Grader | null {
|
||||
switch (name) {
|
||||
case 'agisdk_state_diff':
|
||||
return new AgisdkStateDiffGrader()
|
||||
case 'infinity_state':
|
||||
return new InfinityStateGrader()
|
||||
case 'performance_grader':
|
||||
return new PerformanceGrader()
|
||||
default:
|
||||
console.warn(`Unknown grader: ${name}`)
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
export { AgisdkStateDiffGrader, InfinityStateGrader, PerformanceGrader }
|
||||
36
packages/browseros-agent/apps/eval/src/grading/grader-runner.ts
vendored
Normal file
36
packages/browseros-agent/apps/eval/src/grading/grader-runner.ts
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
import type { GraderResult } from '../types'
|
||||
import { createGrader as defaultCreateGrader } from './grader-registry'
|
||||
import type { Grader, GraderInput } from './types'
|
||||
|
||||
export interface GraderRunnerDeps {
|
||||
createGrader?: (name: string) => Grader | null
|
||||
}
|
||||
|
||||
/** Runs configured graders independently so one failure does not hide others. */
|
||||
export async function runConfiguredGraders(
|
||||
graderNames: string[],
|
||||
input: GraderInput,
|
||||
deps: GraderRunnerDeps = {},
|
||||
): Promise<Record<string, GraderResult>> {
|
||||
const create = deps.createGrader ?? defaultCreateGrader
|
||||
const results: Record<string, GraderResult> = {}
|
||||
|
||||
for (const name of graderNames) {
|
||||
const grader = create(name)
|
||||
if (!grader) continue
|
||||
try {
|
||||
console.log(` Running grader: ${name}`)
|
||||
results[name] = await grader.grade(input)
|
||||
} catch (error) {
|
||||
results[name] = {
|
||||
score: 0,
|
||||
pass: false,
|
||||
reasoning: `Error running grader: ${error instanceof Error ? error.message : String(error)}`,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
export const runGraders = runConfiguredGraders
|
||||
65
packages/browseros-agent/apps/eval/src/grading/python-evaluator.ts
vendored
Normal file
65
packages/browseros-agent/apps/eval/src/grading/python-evaluator.ts
vendored
Normal file
@@ -0,0 +1,65 @@
|
||||
/** Options for a one-shot stdin-JSON → stdout-JSON Python subprocess run. */
export interface PythonEvaluatorOptions {
  scriptPath: string // absolute path of the Python script to execute
  input: unknown // payload serialized to JSON and piped to the script's stdin
  timeoutMs: number // wall-clock limit; past it the process is SIGKILLed
}

/** Parsed evaluator output plus the raw streams for artifact logging. */
export interface PythonEvaluatorResult<T> {
  output: T // JSON.parse of trimmed stdout
  stdout: string
  stderr: string
  exitCode: number
}

/** Runs a Python evaluator that accepts stdin JSON and emits stdout JSON. */
export async function runPythonJsonEvaluator<T>(
  options: PythonEvaluatorOptions,
): Promise<PythonEvaluatorResult<T>> {
  // NOTE: uses the Bun runtime (Bun.spawn); assumes `python3` is on PATH.
  const proc = Bun.spawn(['python3', options.scriptPath], {
    stdin: 'pipe',
    stdout: 'pipe',
    stderr: 'pipe',
  })

  // Send the full payload up front and close stdin so the script sees EOF.
  proc.stdin.write(JSON.stringify(options.input))
  proc.stdin.end()

  let timeoutHandle: ReturnType<typeof setTimeout> | undefined
  const timeout = new Promise<never>((_, reject) => {
    timeoutHandle = setTimeout(() => {
      // Kill hard — the evaluator may be stuck inside native code.
      proc.kill('SIGKILL')
      reject(
        new Error(`Python evaluator timed out after ${options.timeoutMs}ms`),
      )
    }, options.timeoutMs)
  })

  const completed = (async (): Promise<PythonEvaluatorResult<T>> => {
    // Drain both streams fully before inspecting the exit code.
    const stdout = await new Response(proc.stdout).text()
    const stderr = await new Response(proc.stderr).text()
    const exitCode = await proc.exited

    if (exitCode !== 0) {
      // Prefer stderr for diagnostics; fall back to stdout if stderr is empty.
      throw new Error(
        `Python evaluator exited with code ${exitCode}: ${stderr || stdout}`,
      )
    }

    try {
      return {
        output: JSON.parse(stdout.trim()) as T,
        stdout,
        stderr,
        exitCode,
      }
    } catch {
      throw new Error(`Failed to parse Python evaluator output: ${stdout}`)
    }
  })()

  try {
    // Whichever settles first wins: normal completion or the kill timer.
    return await Promise.race([completed, timeout])
  } finally {
    // Always clear the timer so a fast run does not leave a live timeout.
    clearTimeout(timeoutHandle)
  }
}
|
||||
22
packages/browseros-agent/apps/eval/src/grading/types.ts
vendored
Normal file
22
packages/browseros-agent/apps/eval/src/grading/types.ts
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
import type { GraderResult, Message } from '../types'
|
||||
|
||||
/** Everything a grader can inspect about one completed task run. */
export interface GraderInput {
  // Minimal task descriptor taken from the dataset entry.
  task: {
    query_id: string
    query: string
    dataset: string
  }
  messages: Message[] // transcript of the agent run for this task
  screenshotCount: number // number of screenshots captured during the run
  finalAnswer: string | null // agent's final answer, if it produced one
  expectedAnswer?: string | null // gold answer when the dataset provides one
  taskArtifactDir: string // per-task directory where graders write artifacts
  outputDir: string // run-level output dir (artifact fallback root)
  mcpUrl?: string // MCP endpoint for graders that query live browser state
  infinityAppUrl?: string // WebArena-Infinity app URL, when applicable
}

/** A named grading strategy that scores one task's results. */
export interface Grader {
  name: string
  grade(input: GraderInput): Promise<GraderResult>
}
|
||||
75
packages/browseros-agent/apps/eval/src/index.ts
vendored
75
packages/browseros-agent/apps/eval/src/index.ts
vendored
@@ -1,73 +1,10 @@
|
||||
#!/usr/bin/env bun
|
||||
|
||||
import { parseArgs } from 'node:util'
|
||||
import { runEval } from './runner/eval-runner'
|
||||
import { runCli } from './cli'
|
||||
|
||||
const { values } = parseArgs({
|
||||
args: Bun.argv.slice(2),
|
||||
options: {
|
||||
config: { type: 'string', short: 'c' },
|
||||
help: { type: 'boolean', short: 'h' },
|
||||
},
|
||||
})
|
||||
|
||||
if (values.help) {
|
||||
console.log(`
|
||||
BrowserOS Eval
|
||||
|
||||
Usage:
|
||||
bun run eval # Opens dashboard in config mode
|
||||
bun run eval --config <config.json> # Runs eval with config file
|
||||
|
||||
Available agent types:
|
||||
- single Single LLM agent driven by the BrowserOS tool loop
|
||||
- orchestrator-executor High-level planner + visual/text executor
|
||||
|
||||
Available graders:
|
||||
- performance_grader Multi-axis grader using Claude Agent SDK
|
||||
- agisdk_state_diff AGI SDK / REAL Bench state-diff grader
|
||||
- infinity_state WebArena-Infinity verifier-script grader
|
||||
|
||||
Preset configs in configs/:
|
||||
- browseros-agent-weekly.json Weekly eval (single agent)
|
||||
- browseros-oe-agent-weekly.json Weekly eval (orchestrator + LLM executor)
|
||||
- browseros-oe-clado-weekly.json Weekly eval (orchestrator + Clado executor)
|
||||
- agisdk-real-smoke.json AGI SDK smoke run (1 task)
|
||||
- agisdk-real.json AGI SDK full run (36 tasks)
|
||||
- infinity-hard-50.json WebArena-Infinity hard-50 set
|
||||
- test-webvoyager.json WebVoyager test
|
||||
- test-mind2web.json Mind2Web test
|
||||
|
||||
Examples:
|
||||
bun run eval # Dashboard config mode
|
||||
bun run eval -c configs/browseros-agent-weekly.json
|
||||
bun run eval -c configs/test-webvoyager.json
|
||||
`)
|
||||
process.exit(0)
|
||||
}
|
||||
|
||||
if (values.config) {
|
||||
try {
|
||||
await runEval({ configPath: values.config })
|
||||
} catch (error) {
|
||||
console.error(error instanceof Error ? error.message : String(error))
|
||||
process.exit(1)
|
||||
}
|
||||
process.exit(0)
|
||||
} else {
|
||||
// No config — start dashboard in config mode, wait for user to configure and run
|
||||
const { startDashboard } = await import('./dashboard/server')
|
||||
startDashboard({
|
||||
tasks: [],
|
||||
configName: '',
|
||||
agentType: '',
|
||||
outputDir: '',
|
||||
configMode: true,
|
||||
})
|
||||
console.log(
|
||||
'Dashboard running at http://localhost:9900 — configure and run from the UI',
|
||||
)
|
||||
|
||||
// Keep process alive until SIGINT
|
||||
await new Promise(() => {})
|
||||
try {
|
||||
await runCli(Bun.argv.slice(2))
|
||||
} catch (error) {
|
||||
console.error(error instanceof Error ? error.message : String(error))
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
41
packages/browseros-agent/apps/eval/src/publishing/r2-manifest.ts
vendored
Normal file
41
packages/browseros-agent/apps/eval/src/publishing/r2-manifest.ts
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
/** Credentials and addressing for the Cloudflare R2 upload target. */
export interface R2UploadConfig {
  accountId: string // Cloudflare account id (forms the S3 endpoint host)
  accessKeyId: string
  secretAccessKey: string
  bucket: string // destination bucket name
  cdnBaseUrl: string // public base URL for viewer links, no trailing slash
}

/** One task's summary row inside a published run manifest. */
export interface R2ManifestTask {
  queryId: string
  query: string
  startUrl: string
  status: string // e.g. 'completed' or a termination reason
  durationMs: number
  screenshotCount: number
  graderResults: Record<string, unknown>
}

/** Top-level manifest.json written alongside a published run. */
export interface R2RunManifest {
  runId: string
  uploadedAt: string // ISO timestamp of the upload
  agentConfig?: Record<string, unknown>
  dataset?: string
  // Aggregate stats; shapes come from the run summary and are passed through.
  summary?: {
    passRate?: unknown
    avgDurationMs?: unknown
  }
  tasks: R2ManifestTask[]
}

/** Result of publishing a single run directory. */
export interface R2PublishRunResult {
  runId: string
  uploadedFiles: number
  viewerUrl: string
  manifest: R2RunManifest
}

/** Result of publishing a path that may contain several runs. */
export interface R2PublishPathResult {
  uploadedRuns: R2PublishRunResult[]
  skippedRuns: string[] // runs left alone (e.g. already uploaded)
}
|
||||
425
packages/browseros-agent/apps/eval/src/publishing/r2-publisher.ts
vendored
Normal file
425
packages/browseros-agent/apps/eval/src/publishing/r2-publisher.ts
vendored
Normal file
@@ -0,0 +1,425 @@
|
||||
import { readdir, readFile, stat } from 'node:fs/promises'
|
||||
import { basename, dirname, extname, join } from 'node:path'
|
||||
import {
|
||||
GetObjectCommand,
|
||||
PutObjectCommand,
|
||||
S3Client,
|
||||
} from '@aws-sdk/client-s3'
|
||||
import type {
|
||||
R2ManifestTask,
|
||||
R2PublishPathResult,
|
||||
R2PublishRunResult,
|
||||
R2RunManifest,
|
||||
R2UploadConfig,
|
||||
} from './r2-manifest'
|
||||
|
||||
// Default number of concurrent uploads per run.
const DEFAULT_CONCURRENCY = 20

// Extension → MIME type for uploaded artifacts; everything else falls back to
// application/octet-stream (see contentTypeForPath).
const CONTENT_TYPES: Record<string, string> = {
  '.json': 'application/json',
  '.jsonl': 'application/x-ndjson',
  '.png': 'image/png',
  '.html': 'text/html',
}

/** Minimal S3-compatible client surface, so a custom client can be injected. */
export interface R2Client {
  send(command: unknown): Promise<unknown>
}

/** Constructor options for R2Publisher. */
export interface R2PublisherOptions {
  config: R2UploadConfig
  client?: R2Client // override the default S3 client (useful in tests)
  viewerPath?: string // path to the viewer.html uploaded with each run
  concurrency?: number // upload parallelism; defaults to DEFAULT_CONCURRENCY
  now?: () => Date // clock source override for deterministic timestamps
}

// One file queued for upload: bucket key + local path + resolved MIME type.
interface UploadJob {
  key: string
  filePath: string
  contentType: string
}

// A discovered task directory inside a run, tagged with which disk layout
// (legacy flat vs canonical tasks/) it came from.
interface TaskDirEntry {
  taskId: string
  taskPath: string
  canonicalLayout: boolean
}
||||
|
||||
export function contentTypeForPath(filePath: string): string {
|
||||
return CONTENT_TYPES[extname(filePath)] || 'application/octet-stream'
|
||||
}
|
||||
|
||||
export function loadR2ConfigFromEnv(
|
||||
env: Record<string, string | undefined> = process.env,
|
||||
): R2UploadConfig {
|
||||
const accountId = env.EVAL_R2_ACCOUNT_ID
|
||||
const accessKeyId = env.EVAL_R2_ACCESS_KEY_ID
|
||||
const secretAccessKey = env.EVAL_R2_SECRET_ACCESS_KEY
|
||||
|
||||
if (!accountId || !accessKeyId || !secretAccessKey) {
|
||||
throw new Error(
|
||||
'Missing required env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY',
|
||||
)
|
||||
}
|
||||
|
||||
return {
|
||||
accountId,
|
||||
accessKeyId,
|
||||
secretAccessKey,
|
||||
bucket: env.EVAL_R2_BUCKET || 'browseros-eval',
|
||||
cdnBaseUrl: (
|
||||
env.EVAL_R2_CDN_BASE_URL || 'https://eval.browseros.com'
|
||||
).replace(/\/+$/, ''),
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds an S3 client pointed at the account's Cloudflare R2 endpoint.
 * R2 uses the S3 API with region 'auto' and an account-scoped hostname.
 */
export function createR2Client(config: R2UploadConfig): S3Client {
  return new S3Client({
    region: 'auto',
    endpoint: `https://${config.accountId}.r2.cloudflarestorage.com`,
    credentials: {
      accessKeyId: config.accessKeyId,
      secretAccessKey: config.secretAccessKey,
    },
  })
}
|
||||
|
||||
async function collectFiles(dir: string): Promise<string[]> {
|
||||
const files: string[] = []
|
||||
const entries = await readdir(dir, { withFileTypes: true })
|
||||
for (const entry of entries) {
|
||||
const full = join(dir, entry.name)
|
||||
if (entry.isDirectory()) {
|
||||
files.push(...(await collectFiles(full)))
|
||||
} else {
|
||||
files.push(full)
|
||||
}
|
||||
}
|
||||
return files
|
||||
}
|
||||
|
||||
async function runPool<T>(
|
||||
items: T[],
|
||||
concurrency: number,
|
||||
fn: (item: T) => Promise<void>,
|
||||
): Promise<void> {
|
||||
let i = 0
|
||||
const workers = Array.from({ length: concurrency }, async () => {
|
||||
while (i < items.length) {
|
||||
const idx = i++
|
||||
await fn(items[idx])
|
||||
}
|
||||
})
|
||||
await Promise.all(workers)
|
||||
}
|
||||
|
||||
async function hasMetadata(dir: string): Promise<boolean> {
|
||||
const metaStat = await stat(join(dir, 'metadata.json')).catch(() => null)
|
||||
return !!metaStat?.isFile()
|
||||
}
|
||||
|
||||
/**
 * Discovers task directories inside a run directory.
 *
 * Two on-disk layouts are supported:
 *  - legacy: task dirs (each containing metadata.json) sit directly under
 *    runDir;
 *  - canonical: task dirs live under runDir/tasks/.
 * If any legacy task dirs exist they take precedence over the canonical set
 * (see the final return).
 */
async function findTaskDirs(runDir: string): Promise<TaskDirEntry[]> {
  const entries = await readdir(runDir, { withFileTypes: true })
  const legacyTasks: TaskDirEntry[] = []
  for (const entry of entries) {
    // 'tasks' is the canonical container itself, never a legacy task dir.
    if (!entry.isDirectory() || entry.name === 'tasks') continue
    const taskPath = join(runDir, entry.name)
    if (await hasMetadata(taskPath)) {
      legacyTasks.push({
        taskId: entry.name,
        taskPath,
        canonicalLayout: false,
      })
    }
  }

  const tasksRoot = join(runDir, 'tasks')
  // A missing tasks/ directory is normal for legacy runs; treat it as empty.
  const canonicalEntries = await readdir(tasksRoot, {
    withFileTypes: true,
  }).catch(() => [])
  const canonicalTasks: TaskDirEntry[] = []
  for (const entry of canonicalEntries) {
    if (!entry.isDirectory()) continue
    const taskPath = join(tasksRoot, entry.name)
    if (await hasMetadata(taskPath)) {
      canonicalTasks.push({
        taskId: entry.name,
        taskPath,
        canonicalLayout: true,
      })
    }
  }

  return legacyTasks.length > 0 ? legacyTasks : canonicalTasks
}
|
||||
|
||||
async function isRunDir(dir: string): Promise<boolean> {
|
||||
return (await findTaskDirs(dir)).length > 0
|
||||
}
|
||||
|
||||
async function collectRunRootFiles(runDir: string): Promise<UploadJob[]> {
|
||||
const entries = await readdir(runDir, { withFileTypes: true })
|
||||
return entries
|
||||
.filter((entry) => entry.isFile())
|
||||
.map((entry) => {
|
||||
const filePath = join(runDir, entry.name)
|
||||
return {
|
||||
key: entry.name,
|
||||
filePath,
|
||||
contentType: contentTypeForPath(filePath),
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
function statusFromMetadata(meta: Record<string, unknown>): string {
|
||||
return meta.termination_reason === 'completed'
|
||||
? 'completed'
|
||||
: ((meta.termination_reason as string | undefined) ?? 'unknown')
|
||||
}
|
||||
|
||||
function runIdForDir(runDir: string): string {
|
||||
const timestamp = basename(runDir)
|
||||
const configName = basename(dirname(runDir))
|
||||
return `${configName}-${timestamp}`
|
||||
}
|
||||
|
||||
/** Publishes eval artifacts in the viewer-compatible R2 layout. */
export class R2Publisher {
  private readonly client: R2Client
  private readonly config: R2UploadConfig
  // Local path of the dashboard viewer page uploaded next to the runs.
  private readonly viewerPath: string
  // Maximum number of concurrent object uploads.
  private readonly concurrency: number
  // Injectable clock so tests can pin `uploadedAt` timestamps.
  private readonly now: () => Date

  constructor(options: R2PublisherOptions) {
    this.config = options.config
    this.client = options.client ?? createR2Client(options.config)
    this.viewerPath =
      options.viewerPath ??
      join(import.meta.dirname, '..', 'dashboard', 'viewer.html')
    this.concurrency = options.concurrency ?? DEFAULT_CONCURRENCY
    this.now = options.now ?? (() => new Date())
  }

  /**
   * Whether `runId` is already published. The manifest is uploaded last
   * (see publishRun), so its presence marks a complete upload. Any GET
   * failure — missing object, auth, network — reads as "not uploaded".
   */
  async isUploaded(runId: string): Promise<boolean> {
    try {
      await this.client.send(
        new GetObjectCommand({
          Bucket: this.config.bucket,
          Key: `runs/${runId}/manifest.json`,
        }),
      )
      return true
    } catch {
      return false
    }
  }

  /**
   * Publishes `inputDir`, which may be either a single run directory or a
   * config directory containing timestamped run subdirectories. Runs whose
   * manifest already exists in the bucket are skipped.
   *
   * @throws if `inputDir` is not a directory, or is a config directory
   *   with no run subdirectories.
   */
  async publishPath(inputDir: string): Promise<R2PublishPathResult> {
    const dirStat = await stat(inputDir).catch(() => null)
    if (!dirStat?.isDirectory()) {
      throw new Error(`Not a directory: ${inputDir}`)
    }

    // A directory that itself contains task dirs is a single run.
    if (await isRunDir(inputDir)) {
      const result = await this.publishRun(inputDir, runIdForDir(inputDir))
      return { uploadedRuns: [result], skippedRuns: [] }
    }

    // Otherwise treat it as a config dir: each subdirectory is one run.
    const configName = basename(inputDir)
    const entries = await readdir(inputDir, { withFileTypes: true })
    const runDirs = entries
      .filter((entry) => entry.isDirectory())
      .map((entry) => entry.name)
      .sort()

    if (runDirs.length === 0) {
      throw new Error('No run subdirectories found')
    }

    const uploadedRuns: R2PublishRunResult[] = []
    const skippedRuns: string[] = []
    for (const dir of runDirs) {
      const runId = `${configName}-${dir}`
      // Idempotency: never re-upload a run that already has a manifest.
      if (await this.isUploaded(runId)) {
        skippedRuns.push(runId)
        continue
      }
      uploadedRuns.push(await this.publishRun(join(inputDir, dir), runId))
    }

    return { uploadedRuns, skippedRuns }
  }

  /**
   * Uploads one run directory: run-root files plus every task file, then
   * the run manifest, then the viewer page. Tasks without a readable
   * metadata.json are skipped. Canonical-layout tasks are uploaded under
   * both the legacy `runs/<id>/<task>/` and the canonical
   * `runs/<id>/tasks/<task>/` prefixes.
   *
   * @throws if the run has no task directories or no task yields metadata.
   */
  async publishRun(
    runDir: string,
    runId: string = runIdForDir(runDir),
  ): Promise<R2PublishRunResult> {
    const taskEntries = await findTaskDirs(runDir)

    if (taskEntries.length === 0) {
      throw new Error(`No task subdirectories in ${runId}`)
    }

    const manifestTasks: R2ManifestTask[] = []
    // Seed the job list with run-root files (e.g. summary.json), re-keyed
    // under the run prefix.
    const jobs: UploadJob[] = (await collectRunRootFiles(runDir)).map(
      (job) => ({
        ...job,
        key: `runs/${runId}/${job.key}`,
      }),
    )
    // First task to carry these fields wins; they describe the whole run.
    let agentConfig: Record<string, unknown> | undefined
    let dataset: string | undefined

    for (const taskDirEntry of taskEntries) {
      const { taskId, taskPath } = taskDirEntry
      const meta = await this.readMetadata(taskPath)
      if (!meta) continue

      if (!agentConfig && meta.agent_config) {
        agentConfig = meta.agent_config as Record<string, unknown>
      }
      if (!dataset && meta.dataset) dataset = meta.dataset as string

      const files = await collectFiles(taskPath)
      let screenshotCount = 0
      for (const file of files) {
        // Path relative to the task dir (strip "<taskPath>/").
        const relative = file.slice(taskPath.length + 1)
        if (relative.startsWith('screenshots/') && extname(file) === '.png') {
          screenshotCount++
        }
        jobs.push({
          key: `runs/${runId}/${taskId}/${relative}`,
          filePath: file,
          contentType: contentTypeForPath(file),
        })
        // Mirror canonical-layout files under tasks/ so both viewer URL
        // schemes resolve.
        if (taskDirEntry.canonicalLayout) {
          jobs.push({
            key: `runs/${runId}/tasks/${taskId}/${relative}`,
            filePath: file,
            contentType: contentTypeForPath(file),
          })
        }
      }

      manifestTasks.push({
        queryId: (meta.query_id as string | undefined) || taskId,
        query: (meta.query as string | undefined) || '',
        startUrl: (meta.start_url as string | undefined) || '',
        status: statusFromMetadata(meta),
        durationMs: (meta.total_duration_ms as number | undefined) || 0,
        screenshotCount:
          (meta.screenshot_count as number | undefined) || screenshotCount,
        graderResults:
          (meta.grader_results as Record<string, unknown> | undefined) || {},
      })
    }

    if (manifestTasks.length === 0) {
      throw new Error(`No completed tasks in ${runId}`)
    }

    let uploaded = 0
    await runPool(jobs, this.concurrency, async (job) => {
      await this.uploadFile(job)
      uploaded++
    })

    // The manifest is written only after all artifacts succeed, so
    // isUploaded() never reports a half-finished run.
    const manifest = await this.buildManifest(
      runDir,
      runId,
      agentConfig,
      dataset,
      manifestTasks,
    )
    await this.uploadBuffer(
      `runs/${runId}/manifest.json`,
      Buffer.from(JSON.stringify(manifest, null, 2)),
      'application/json',
    )
    await this.uploadBuffer(
      'viewer.html',
      await readFile(this.viewerPath),
      'text/html',
    )

    return {
      runId,
      // +2 accounts for manifest.json and viewer.html.
      uploadedFiles: uploaded + 2,
      viewerUrl: `${this.config.cdnBaseUrl}/viewer.html?run=${runId}`,
      manifest,
    }
  }

  /** Reads and parses a task's metadata.json; null when missing/invalid. */
  private async readMetadata(
    taskPath: string,
  ): Promise<Record<string, unknown> | null> {
    try {
      return JSON.parse(
        await readFile(join(taskPath, 'metadata.json'), 'utf-8'),
      ) as Record<string, unknown>
    } catch {
      return null
    }
  }

  /**
   * Assembles the run manifest. summary.json is optional: when present its
   * passRate/avgDurationMs are surfaced, otherwise `summary` is undefined.
   */
  private async buildManifest(
    runDir: string,
    runId: string,
    agentConfig: Record<string, unknown> | undefined,
    dataset: string | undefined,
    tasks: R2ManifestTask[],
  ): Promise<R2RunManifest> {
    let summaryData: Record<string, unknown> | undefined
    try {
      summaryData = JSON.parse(
        await readFile(join(runDir, 'summary.json'), 'utf-8'),
      ) as Record<string, unknown>
    } catch {}

    return {
      runId,
      uploadedAt: this.now().toISOString(),
      agentConfig,
      dataset,
      summary: summaryData
        ? {
            passRate: summaryData.passRate,
            avgDurationMs: summaryData.avgDurationMs,
          }
        : undefined,
      tasks,
    }
  }

  /** Uploads one file from disk under its precomputed key. */
  private async uploadFile(job: UploadJob): Promise<void> {
    await this.uploadBuffer(
      job.key,
      await readFile(job.filePath),
      job.contentType,
    )
  }

  /** Puts a buffer into the bucket with the given key and content type. */
  private async uploadBuffer(
    key: string,
    body: Buffer,
    contentType: string,
  ): Promise<void> {
    await this.client.send(
      new PutObjectCommand({
        Bucket: this.config.bucket,
        Key: key,
        Body: body,
        ContentType: contentType,
      }),
    )
  }
}
|
||||
|
||||
export async function publishPathToR2(
|
||||
inputDir: string,
|
||||
): Promise<R2PublishPathResult> {
|
||||
const config = loadR2ConfigFromEnv()
|
||||
return new R2Publisher({ config }).publishPath(inputDir)
|
||||
}
|
||||
@@ -1,362 +1 @@
|
||||
import { mkdir, writeFile } from 'node:fs/promises'
|
||||
import { basename, dirname, join, resolve } from 'node:path'
|
||||
import {
|
||||
dashboardState,
|
||||
setActiveExecutor,
|
||||
startDashboard,
|
||||
stopDashboard,
|
||||
} from '../dashboard/server'
|
||||
import type { ErrorSource, EvalConfig, Task } from '../types'
|
||||
import {
|
||||
printValidationResult,
|
||||
validateConfig,
|
||||
} from '../utils/config-validator'
|
||||
import { ParallelExecutor } from './parallel-executor'
|
||||
import {
|
||||
getTaskSourceDescription,
|
||||
loadTasks,
|
||||
TaskLoadError,
|
||||
} from './task-loader'
|
||||
import type {
|
||||
BatchSummary,
|
||||
RunEvalOptions,
|
||||
TaskResult,
|
||||
TaskResultSummary,
|
||||
TaskSource,
|
||||
} from './types'
|
||||
import { getPrimaryGraderResult, isSuccessfulResult } from './types'
|
||||
|
||||
// ============================================================================
|
||||
// Main Entry Point
|
||||
// ============================================================================
|
||||
|
||||
export async function runEval(options: RunEvalOptions): Promise<void> {
|
||||
// Step 1: Validate configuration
|
||||
const config = await loadAndValidateConfig(options.configPath)
|
||||
|
||||
// Step 2: Resolve paths relative to config location
|
||||
const configDir = dirname(resolve(options.configPath))
|
||||
const resolvedPaths = resolvePaths(options, config, configDir)
|
||||
|
||||
// Log configuration
|
||||
console.log('Eval Configuration:')
|
||||
console.log(` Config: ${options.configPath}`)
|
||||
console.log(` Dataset: ${resolvedPaths.dataPath}`)
|
||||
console.log(` Output: ${resolvedPaths.outputDir}`)
|
||||
console.log(` Workers: ${config.num_workers}`)
|
||||
console.log(` Agent: ${config.agent.type}`)
|
||||
console.log()
|
||||
|
||||
// Step 3: Load tasks
|
||||
const taskSource = resolveTaskSource(options, resolvedPaths.dataPath)
|
||||
const { tasks } = await loadTasksWithLogging(taskSource)
|
||||
|
||||
// Step 4: Setup
|
||||
await mkdir(resolvedPaths.outputDir, { recursive: true })
|
||||
|
||||
// Step 5: Start dashboard
|
||||
startDashboard({
|
||||
tasks,
|
||||
configName: options.configPath,
|
||||
agentType: config.agent.type,
|
||||
outputDir: resolvedPaths.outputDir,
|
||||
})
|
||||
|
||||
// Step 6: Execute tasks (parallel or sequential based on num_workers)
|
||||
const results = await executeTasks(tasks, config, resolvedPaths.outputDir)
|
||||
|
||||
// Step 7: Summary
|
||||
const summary = buildSummary(results)
|
||||
await saveSummary(summary, resolvedPaths.outputDir)
|
||||
printSummary(summary)
|
||||
console.log(`\nResults saved to: ${resolvedPaths.outputDir}`)
|
||||
|
||||
stopDashboard()
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Configuration
|
||||
// ============================================================================
|
||||
|
||||
async function loadAndValidateConfig(configPath: string) {
|
||||
console.log('Validating configuration...')
|
||||
const validationResult = await validateConfig(configPath)
|
||||
printValidationResult(validationResult)
|
||||
|
||||
if (!validationResult.valid || !validationResult.config) {
|
||||
throw new Error(
|
||||
'Configuration validation failed. Fix the above errors and try again.',
|
||||
)
|
||||
}
|
||||
|
||||
return validationResult.config
|
||||
}
|
||||
|
||||
// Absolute locations derived from CLI options plus the config file.
interface ResolvedPaths {
  dataPath: string // dataset file tasks are loaded from
  outputDir: string // results/<config-name>/<timestamp>/ destination
}
|
||||
|
||||
function resolvePaths(
|
||||
options: RunEvalOptions,
|
||||
config: EvalConfig,
|
||||
configDir: string,
|
||||
): ResolvedPaths {
|
||||
// Resolve dataset path: use options.dataPath if provided, otherwise resolve from config
|
||||
const dataPath = options.dataPath
|
||||
? options.dataPath
|
||||
: config.dataset.startsWith('/')
|
||||
? config.dataset
|
||||
: resolve(configDir, config.dataset)
|
||||
|
||||
// Resolve output directory: results/{config-name}/{timestamp}/
|
||||
// Config name derived from config filename (e.g., "browseros-agent-weekly.json" → "browseros-agent-weekly")
|
||||
const configName = options.configPath
|
||||
? basename(resolve(options.configPath), '.json')
|
||||
: 'eval'
|
||||
const timestamp = formatTimestamp(new Date())
|
||||
const resultsBase = config.output_dir
|
||||
? config.output_dir.startsWith('/')
|
||||
? config.output_dir
|
||||
: resolve(configDir, config.output_dir)
|
||||
: resolve(configDir, '..', 'results')
|
||||
const outputDir = join(resultsBase, configName, timestamp)
|
||||
|
||||
return { dataPath, outputDir }
|
||||
}
|
||||
|
||||
function formatTimestamp(date: Date): string {
|
||||
const y = date.getFullYear()
|
||||
const m = String(date.getMonth() + 1).padStart(2, '0')
|
||||
const d = String(date.getDate()).padStart(2, '0')
|
||||
const h = String(date.getHours()).padStart(2, '0')
|
||||
const min = String(date.getMinutes()).padStart(2, '0')
|
||||
return `${y}-${m}-${d}-${h}${min}`
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Task Loading
|
||||
// ============================================================================
|
||||
|
||||
function resolveTaskSource(
|
||||
options: RunEvalOptions,
|
||||
dataPath: string,
|
||||
): TaskSource {
|
||||
// If query is provided, use single task mode
|
||||
if (options.query) {
|
||||
return { type: 'single', query: options.query, startUrl: options.startUrl }
|
||||
}
|
||||
|
||||
// Otherwise use file mode with the resolved dataPath
|
||||
return { type: 'file', path: dataPath }
|
||||
}
|
||||
|
||||
async function loadTasksWithLogging(
|
||||
source: TaskSource,
|
||||
): Promise<{ tasks: Awaited<ReturnType<typeof loadTasks>>['tasks'] }> {
|
||||
console.log(`Loading tasks from ${getTaskSourceDescription(source)}...`)
|
||||
|
||||
try {
|
||||
const result = await loadTasks(source)
|
||||
console.log(`Loaded ${result.tasks.length} task(s)`)
|
||||
return { tasks: result.tasks }
|
||||
} catch (error) {
|
||||
if (error instanceof TaskLoadError) {
|
||||
throw new Error(`Failed to load tasks: ${error.message}`)
|
||||
}
|
||||
throw new Error(`Failed to load tasks: ${error}`)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Task Execution
|
||||
// ============================================================================
|
||||
|
||||
async function executeTasks(
|
||||
tasks: Task[],
|
||||
config: EvalConfig,
|
||||
outputDir: string,
|
||||
): Promise<TaskResult[]> {
|
||||
console.log(`\n${'='.repeat(60)}`)
|
||||
console.log('STARTING EVALUATION')
|
||||
console.log(`${'='.repeat(60)}\n`)
|
||||
|
||||
const numWorkers = config.num_workers || 1
|
||||
console.log(`Running with ${numWorkers} worker(s)`)
|
||||
if (config.restart_server_per_task) {
|
||||
console.log(`Server restart per task: enabled`)
|
||||
}
|
||||
console.log()
|
||||
|
||||
const executor = new ParallelExecutor({
|
||||
numWorkers,
|
||||
config,
|
||||
outputDir,
|
||||
restartServerPerTask: config.restart_server_per_task,
|
||||
onEvent: (taskId, event) =>
|
||||
dashboardState.broadcastStreamEvent(taskId, event),
|
||||
})
|
||||
|
||||
// Register so dashboard stop button works for CLI runs too
|
||||
setActiveExecutor(executor)
|
||||
try {
|
||||
return await executor.execute(tasks, (completed, total, task, result) => {
|
||||
printTaskProgress(completed, total, task, result)
|
||||
})
|
||||
} finally {
|
||||
setActiveExecutor(null)
|
||||
}
|
||||
}
|
||||
|
||||
function printTaskProgress(
|
||||
completed: number,
|
||||
total: number,
|
||||
task: Task,
|
||||
result: TaskResult,
|
||||
): void {
|
||||
const status =
|
||||
result.status === 'completed'
|
||||
? 'DONE'
|
||||
: result.status === 'timeout'
|
||||
? 'TIMEOUT'
|
||||
: 'FAILED'
|
||||
|
||||
const duration =
|
||||
result.durationMs > 0 ? ` (${(result.durationMs / 1000).toFixed(1)}s)` : ''
|
||||
|
||||
console.log(`[${completed}/${total}] ${task.query_id}: ${status}${duration}`)
|
||||
|
||||
if (result.status === 'failed') {
|
||||
console.log(` ERROR: ${result.error.message}`)
|
||||
} else if (isSuccessfulResult(result)) {
|
||||
// Log agent errors (e.g., LLM API failures) even if task "completed"
|
||||
if (result.agentResult.metadata.errors?.length) {
|
||||
for (const err of result.agentResult.metadata.errors) {
|
||||
console.log(` ERROR [${err.source}]: ${err.message}`)
|
||||
}
|
||||
}
|
||||
for (const [name, gr] of Object.entries(result.graderResults)) {
|
||||
const icon = gr.pass ? 'PASS' : 'FAIL'
|
||||
console.log(` ${name}: ${icon}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Summary
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Aggregates per-task results into a BatchSummary: status counts, pass
 * rate from each task's primary grader, average duration over non-failed
 * tasks, and error/warning tallies grouped by source.
 */
function buildSummary(results: TaskResult[]): BatchSummary {
  // Track errors by source across all tasks.
  const errorsBySource: Partial<Record<ErrorSource, number>> = {}
  let totalWarnings = 0

  const taskSummaries: TaskResultSummary[] = results.map((r) => {
    let errorCount = 0
    let warningCount = 0
    let errorSources: ErrorSource[] | undefined
    let failureReason: string | undefined

    if (isSuccessfulResult(r)) {
      // Count errors and warnings from agent metadata.
      errorCount = r.agentResult.metadata.errors?.length ?? 0
      warningCount = r.agentResult.metadata.warnings?.length ?? 0
      totalWarnings += warningCount

      // Track error sources.
      if (r.agentResult.metadata.errors?.length) {
        errorSources = r.agentResult.metadata.errors.map((e) => e.source)
        for (const err of r.agentResult.metadata.errors) {
          errorsBySource[err.source] = (errorsBySource[err.source] ?? 0) + 1
        }
      }
    } else {
      // Failed task — counts as one error attributed to its source.
      errorCount = 1
      errorSources = [r.errorSource]
      failureReason = r.error.message
      errorsBySource[r.errorSource] = (errorsBySource[r.errorSource] ?? 0) + 1
    }

    return {
      queryId: r.task.query_id,
      status: r.status,
      durationMs: r.durationMs,
      // Grader results are trimmed to pass/score for the summary file.
      graderResults: isSuccessfulResult(r)
        ? Object.fromEntries(
            Object.entries(r.graderResults).map(([name, gr]) => [
              name,
              { pass: gr.pass, score: gr.score },
            ]),
          )
        : undefined,
      errorCount,
      warningCount,
      errorSources: errorSources?.length ? errorSources : undefined,
      failureReason,
    }
  })

  const completed = results.filter((r) => r.status === 'completed').length
  const timeout = results.filter((r) => r.status === 'timeout').length
  const failed = results.filter((r) => r.status === 'failed').length

  // Calculate pass rate using the primary grader (fallback order).
  let totalGraded = 0
  let totalPasses = 0

  for (const result of results) {
    if (isSuccessfulResult(result)) {
      const primary = getPrimaryGraderResult(result.graderResults)
      if (primary) {
        totalGraded++
        if (primary.pass) totalPasses++
      }
    }
  }

  // Tasks with no primary grader are excluded from the denominator.
  const passRate = totalGraded > 0 ? totalPasses / totalGraded : 0

  // Calculate average duration for non-failed tasks only.
  const durations = results
    .filter((r) => r.status !== 'failed')
    .map((r) => r.durationMs)
  const avgDurationMs =
    durations.length > 0
      ? durations.reduce((a, b) => a + b, 0) / durations.length
      : 0

  return {
    total: results.length,
    completed,
    failed,
    timeout,
    passRate,
    avgDurationMs,
    errorsBySource,
    totalWarnings,
    results: taskSummaries,
  }
}
|
||||
|
||||
async function saveSummary(
|
||||
summary: BatchSummary,
|
||||
outputDir: string,
|
||||
): Promise<void> {
|
||||
await writeFile(
|
||||
join(outputDir, 'summary.json'),
|
||||
JSON.stringify(summary, null, 2),
|
||||
)
|
||||
}
|
||||
|
||||
function printSummary(summary: BatchSummary): void {
|
||||
console.log('='.repeat(60))
|
||||
console.log('EVALUATION COMPLETE')
|
||||
console.log('='.repeat(60))
|
||||
console.log(`Total: ${summary.total} tasks`)
|
||||
console.log(` Completed: ${summary.completed}`)
|
||||
console.log(` Timeout: ${summary.timeout}`)
|
||||
console.log(` Failed: ${summary.failed}`)
|
||||
console.log(` Pass Rate: ${(summary.passRate * 100).toFixed(1)}%`)
|
||||
console.log(` Avg Duration: ${(summary.avgDurationMs / 1000).toFixed(1)}s`)
|
||||
}
|
||||
// NOTE(review): re-export shim — runEval now lives in ../runs/eval-runner.
// This looks like it would conflict with a local `runEval` declaration if
// both remain in this file; confirm during the migration cleanup.
export { runEval } from '../runs/eval-runner'
|
||||
|
||||
@@ -1,266 +1,5 @@
|
||||
/**
|
||||
* Parallel Executor
|
||||
*
|
||||
* Each worker gets its own isolated BrowserOS stack:
|
||||
* - BrowserOSAppManager (Chrome + Server on unique ports)
|
||||
* - TaskExecutor (uses that worker's server URL)
|
||||
*
|
||||
* Port allocation: Worker N → CDP=base+N, Server=base+N, Extension=base+N
|
||||
*/
|
||||
|
||||
import type { EvalConfig, Task } from '../types'
|
||||
import { BrowserOSAppManager, type EvalPorts } from './browseros-app-manager'
|
||||
import { createTaskExecutor } from './task-executor'
|
||||
import type { TaskResult } from './types'
|
||||
|
||||
// ============================================================================
|
||||
// Types
|
||||
// ============================================================================
|
||||
|
||||
/** Construction options for ParallelExecutor. */
export interface ParallelExecutorConfig {
  numWorkers: number // number of isolated BrowserOS stacks to launch
  config: EvalConfig
  outputDir: string
  restartServerPerTask?: boolean // restart Chrome+Server between tasks
  onEvent?: (taskId: string, event: Record<string, unknown>) => void // dashboard stream hook
}
|
||||
|
||||
/** Invoked after each task finishes, with running completion counts. */
export type ProgressCallback = (
  completed: number,
  total: number,
  task: Task,
  result: TaskResult,
) => void
|
||||
|
||||
// ============================================================================
|
||||
// Task Queue (thread-safe for single-threaded async — index is atomic)
|
||||
// ============================================================================
|
||||
|
||||
class TaskQueue {
|
||||
private tasks: Task[]
|
||||
private index: number = 0
|
||||
private stopped: boolean = false
|
||||
|
||||
constructor(tasks: Task[]) {
|
||||
this.tasks = [...tasks]
|
||||
}
|
||||
|
||||
next(): Task | null {
|
||||
if (this.stopped || this.index >= this.tasks.length) return null
|
||||
return this.tasks[this.index++]
|
||||
}
|
||||
|
||||
stop(): void {
|
||||
this.stopped = true
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Parallel Executor
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Executes tasks across N workers, each with its own isolated BrowserOS
 * stack. Results are keyed by query_id and reordered to match the input.
 */
export class ParallelExecutor {
  private readonly numWorkers: number
  // Worker index → its BrowserOS stack, so stop/signal paths can kill all.
  private readonly appManagers = new Map<number, BrowserOSAppManager>()
  private completedCount: number = 0
  // query_id → result; workers finish out of order, execute() reorders.
  private readonly resultLock = new Map<string, TaskResult>()
  private queue: TaskQueue | null = null

  constructor(private readonly config: ParallelExecutorConfig) {
    // Guard against zero/negative worker counts.
    this.numWorkers = Math.max(1, config.numWorkers)
  }

  /** Stops handing out tasks and kills every worker's BrowserOS stack. */
  async stop(): Promise<void> {
    console.log('\nStopping eval run...')
    this.queue?.stop()
    const kills = [...this.appManagers.values()].map((m) => m.killApp())
    await Promise.allSettled(kills)
  }

  /**
   * Runs all tasks and returns results in the original task order. Tasks
   * that produced no result (e.g. the run was stopped early) come back as
   * failed placeholders rather than holes.
   */
  async execute(
    tasks: Task[],
    onProgress?: ProgressCallback,
  ): Promise<TaskResult[]> {
    if (tasks.length === 0) return []

    const cleanup = this.setupSignalHandlers()

    const loadExtensions = this.config.config.browseros.load_extensions ?? false

    // Patch NopeCHA API key before launching any workers
    const captchaConfig = this.config.config.captcha
    if (captchaConfig) {
      const apiKey = process.env[captchaConfig.api_key_env]
      if (apiKey) {
        BrowserOSAppManager.patchNopechaApiKey(apiKey)
      }
    }

    this.queue = new TaskQueue(tasks)
    const totalTasks = tasks.length

    try {
      const queue = this.queue
      // Launch N workers in parallel — each gets its own Chrome + Server
      const workers = Array.from({ length: this.numWorkers }, (_, i) =>
        this.runWorker(i, queue, totalTasks, loadExtensions, onProgress),
      )
      await Promise.all(workers)

      // Return results in original task order
      return tasks.map((task) => {
        const result = this.resultLock.get(task.query_id)
        if (!result) {
          return {
            status: 'failed' as const,
            task,
            error: new Error('Task result not found'),
            errorSource: 'unknown' as const,
            durationMs: 0,
          }
        }
        return result
      })
    } finally {
      cleanup()
    }
  }

  /**
   * One worker loop: boots an isolated BrowserOS stack, then pulls tasks
   * from the shared queue until it is empty or stopped. The stack is
   * always torn down in the finally block.
   */
  private async runWorker(
    workerIndex: number,
    queue: TaskQueue,
    totalTasks: number,
    loadExtensions: boolean,
    onProgress?: ProgressCallback,
  ): Promise<void> {
    // Per-worker isolated ports
    const basePorts: EvalPorts = {
      cdp: this.config.config.browseros.base_cdp_port,
      server: this.config.config.browseros.base_server_port,
      extension: this.config.config.browseros.base_extension_port,
    }
    const headless = this.config.config.browseros.headless ?? false
    const appManager = new BrowserOSAppManager(
      workerIndex,
      basePorts,
      loadExtensions,
      headless,
    )
    this.appManagers.set(workerIndex, appManager)

    // Per-worker executor pointing to this worker's server
    const workerConfig: typeof this.config.config = {
      ...this.config.config,
      browseros: {
        ...this.config.config.browseros,
        server_url: appManager.getServerUrl(),
      },
    }
    const executor = createTaskExecutor(
      workerConfig,
      workerIndex,
      this.config.outputDir,
      this.config.onEvent,
    )

    try {
      // Always start Chrome+Server once for this worker
      console.log(`\n Worker ${workerIndex}: Starting BrowserOS stack...`)
      await appManager.restart()

      while (true) {
        const task = queue.next()
        if (!task) break

        const taskStartTime = Date.now()
        let result: TaskResult

        try {
          // Restart between tasks if configured
          if (this.config.restartServerPerTask) {
            console.log(`\n${'─'.repeat(60)}`)
            console.log(` Worker ${workerIndex}: Task: ${task.query_id}`)
            console.log(`${'─'.repeat(60)}`)
            await appManager.restart()
          }

          this.config.onEvent?.(task.query_id, {
            type: 'task-state',
            taskId: task.query_id,
            status: 'running',
          })
          result = await executor.execute(task)
          console.log(
            ` Worker ${workerIndex}: ${task.query_id}: ${result.status}`,
          )
        } catch (error) {
          // A throwing executor must not kill the worker loop — record a
          // failed result and keep pulling tasks.
          console.error(
            ` Worker ${workerIndex}: ${task.query_id}: FAILED - ${error instanceof Error ? error.message : String(error)}`,
          )
          result = {
            status: 'failed',
            task,
            error: error instanceof Error ? error : new Error(String(error)),
            errorSource: 'unknown',
            durationMs: Date.now() - taskStartTime,
          }
        }

        this.resultLock.set(task.query_id, result)
        this.completedCount++

        // Emit task completion to dashboard
        const stateEvent: Record<string, unknown> = {
          type: 'task-state',
          taskId: task.query_id,
          status: result.status,
          durationMs: result.durationMs,
        }
        if (result.status !== 'failed' && 'graderResults' in result) {
          stateEvent.graderResults = Object.fromEntries(
            Object.entries(result.graderResults).map(([name, gr]) => [
              name,
              {
                pass: gr.pass,
                score: gr.score,
                reasoning: gr.reasoning,
                details: gr.details,
              },
            ]),
          )
          // NOTE(review): total_steps is used as the screenshot count —
          // looks like a proxy; confirm the two actually correspond.
          stateEvent.screenshotCount =
            result.agentResult?.metadata?.total_steps ?? 0
        }
        this.config.onEvent?.(task.query_id, stateEvent)

        onProgress?.(this.completedCount, totalTasks, task, result)

        // Brief pause before the next task's stack restart.
        if (this.config.restartServerPerTask) {
          await new Promise((resolve) => setTimeout(resolve, 2000))
        }
      }
    } finally {
      await appManager.killApp()
    }
  }

  /**
   * SIGINT/SIGTERM kills all Chrome + Server instances across all workers.
   * Returns a cleanup function that removes the listeners after execute() completes.
   */
  private setupSignalHandlers(): () => void {
    const onSignal = async () => {
      console.log('\nShutting down all workers...')
      this.queue?.stop()
      const kills = [...this.appManagers.values()].map((m) => m.killApp())
      await Promise.allSettled(kills)
      process.exit(0)
    }
    process.on('SIGINT', onSignal)
    process.on('SIGTERM', onSignal)
    return () => {
      process.off('SIGINT', onSignal)
      process.off('SIGTERM', onSignal)
    }
  }
}
|
||||
// NOTE(review): compatibility aliases — the implementation moved to
// ../runs/task-worker-pool under the TaskWorkerPool name; existing
// importers keep working through these re-exports.
export {
  type ProgressCallback,
  TaskWorkerPool as ParallelExecutor,
  type TaskWorkerPoolConfig as ParallelExecutorConfig,
} from '../runs/task-worker-pool'
|
||||
|
||||
@@ -1,316 +1,6 @@
|
||||
import { join } from 'node:path'
|
||||
import { createAgent } from '../agents'
|
||||
import type { AgentContext, AgentResult } from '../agents/types'
|
||||
import { CaptureContext } from '../capture/context'
|
||||
import {
|
||||
hasExistingGraderResults,
|
||||
TrajectorySaver,
|
||||
} from '../capture/trajectory-saver'
|
||||
import { runGraders } from '../graders/registry'
|
||||
import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
|
||||
import { callMcpTool } from '../utils/mcp-client'
|
||||
import { InfinityAppManager } from './infinity-app-manager'
|
||||
import type { TaskResult } from './types'
|
||||
|
||||
// ============================================================================
|
||||
// Errors
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Raised when a task fails during execution. `phase` records where the
 * failure happened and doubles as the ErrorSource used in summaries.
 */
export class TaskExecutionError extends Error {
  public readonly errorSource: ErrorSource

  constructor(
    message: string,
    public readonly task: Task,
    public readonly phase:
      | 'navigation'
      | 'agent_execution'
      | 'grading'
      | 'cleanup',
    public readonly cause?: Error,
  ) {
    super(message)
    this.name = 'TaskExecutionError'
    // Assumes every phase literal is a valid ErrorSource — the cast is
    // unchecked; revisit if ErrorSource ever changes.
    this.errorSource = phase as ErrorSource
  }
}
||||
|
||||
// ============================================================================
|
||||
// Task Executor
|
||||
// ============================================================================
|
||||
|
||||
/** Optional hooks consumed by TaskExecutor. */
export interface TaskExecutorDeps {
  onEvent?: (taskId: string, event: Record<string, unknown>) => void // dashboard stream hook
}
|
||||
|
||||
/**
 * Executes a single eval task end-to-end against a BrowserOS MCP server:
 * resume check → initial page resolution → optional Infinity app boot →
 * viewport/navigation setup → agent execution → grading → cleanup.
 */
export class TaskExecutor {
  constructor(
    private readonly config: EvalConfig,
    // Worker index; used to derive a distinct port for per-worker Infinity apps.
    private readonly workerIndex: number,
    // Run-level output root; per-task artifacts are written beneath it.
    private readonly outputDir: string,
    private readonly deps: TaskExecutorDeps,
  ) {}

  /**
   * Resolve the initial page ID via list_pages MCP call.
   * Called once per task on a fresh browser — there's exactly one page.
   */
  private async resolveInitialPageId(mcpUrl: string): Promise<number> {
    try {
      const result = await callMcpTool(mcpUrl, 'list_pages', {})
      if (!result.isError) {
        const textContent = result.content?.find(
          (c: { type: string }) => c.type === 'text',
        )
        // The page list is rendered as numbered lines ("1. …"); take the
        // first number as the page id.
        const match = textContent?.text?.match(/^\s*(\d+)\./m)
        if (match) return Number.parseInt(match[1], 10)
      }
    } catch {
      // Fall through to default
    }
    // Fresh browser always has page 1
    return 1
  }

  /**
   * Run one task to completion. Never throws: any failure is folded into a
   * `status: 'failed'` TaskResult carrying the error and its source phase.
   */
  async execute(task: Task): Promise<TaskResult> {
    const startTime = Date.now()
    const mcpUrl = `${this.config.browseros.server_url}/mcp`

    // Check if task already has grader results (resume capability)
    const existing = await hasExistingGraderResults(
      this.outputDir,
      task.query_id,
    )
    if (existing.exists && existing.metadata) {
      console.log(` Skipping: already has grader results`)
      return {
        status:
          existing.metadata.termination_reason === 'timeout'
            ? 'timeout'
            : 'completed',
        task,
        agentResult: {
          metadata: existing.metadata,
          // Messages are not reloaded on resume; only metadata is restored.
          messages: [],
          finalAnswer: existing.metadata.final_answer,
        },
        graderResults: existing.metadata.grader_results,
        durationMs: existing.metadata.total_duration_ms,
      }
    }

    // Resolve page ID once — fresh browser has exactly one page
    const pageId = await this.resolveInitialPageId(mcpUrl)

    // For Infinity tasks, start a fresh app server per task
    let infinityManager: InfinityAppManager | null = null
    let actualStartUrl = task.start_url

    if (task.dataset === 'webarena-infinity') {
      const appName = (task.metadata?.additional as Record<string, unknown>)
        ?.app_name as string
      const appBasePort =
        ((task.metadata?.additional as Record<string, unknown>)
          ?.app_base_port as number) || 8000

      // Only boot the app when both the app name and the checkout dir are set.
      if (appName && process.env.WEBARENA_INFINITY_DIR) {
        infinityManager = new InfinityAppManager(this.workerIndex, appBasePort)
        try {
          actualStartUrl = await infinityManager.startApp(appName)
          console.log(
            ` Infinity app "${appName}" started on port ${infinityManager.getPort()}`,
          )
        } catch (error) {
          throw new TaskExecutionError(
            `Failed to start Infinity app: ${error instanceof Error ? error.message : String(error)}`,
            task,
            'navigation',
            error instanceof Error ? error : undefined,
          )
        }
      }
    }

    try {
      // Phase 1: Set viewport + navigate to start URL
      try {
        await callMcpTool(mcpUrl, 'evaluate_script', {
          page: pageId,
          expression: 'window.resizeTo(1440, 900)',
        })
      } catch (vpError) {
        // Viewport sizing is best-effort; a failure must not kill the task.
        console.warn(
          ` Viewport resize failed: ${vpError instanceof Error ? vpError.message : String(vpError)}`,
        )
      }

      if (actualStartUrl && actualStartUrl !== 'about:blank') {
        try {
          await callMcpTool(mcpUrl, 'navigate_page', {
            url: actualStartUrl,
            page: pageId,
          })
        } catch (error) {
          throw new TaskExecutionError(
            `Failed to navigate to start URL: ${error instanceof Error ? error.message : String(error)}`,
            task,
            'navigation',
            error instanceof Error ? error : undefined,
          )
        }
      }

      // Phase 2: Execute agent
      const agentResult = await this.executeAgent(task, pageId)

      // Phase 3: Run graders
      const graderResults = await this.runGraders(
        task,
        agentResult,
        infinityManager?.getUrl(),
      )

      const status =
        agentResult.metadata.termination_reason === 'timeout'
          ? 'timeout'
          : 'completed'

      return {
        status,
        task,
        agentResult,
        graderResults,
        durationMs: Date.now() - startTime,
      }
    } catch (error) {
      const errorSource: ErrorSource =
        error instanceof TaskExecutionError ? error.errorSource : 'unknown'

      return {
        status: 'failed',
        task,
        error: error instanceof Error ? error : new Error(String(error)),
        errorSource,
        durationMs: Date.now() - startTime,
      }
    } finally {
      // Navigate to about:blank to clean up
      try {
        await callMcpTool(mcpUrl, 'navigate_page', {
          url: 'about:blank',
          page: pageId,
        })
      } catch {
        // Ignore cleanup errors
      }

      // Stop Infinity app server if running
      if (infinityManager) {
        await infinityManager.stop().catch(() => {})
      }
    }
  }

  /**
   * Create the capture context and run the configured agent.
   * Wraps any non-TaskExecutionError failure as 'agent_execution'.
   */
  private async executeAgent(task: Task, pageId: number): Promise<AgentResult> {
    try {
      const { capture, taskOutputDir } = await CaptureContext.create({
        serverUrl: this.config.browseros.server_url,
        outputDir: this.outputDir,
        taskId: task.query_id,
        initialPageId: pageId,
        onEvent: this.deps.onEvent,
      })

      const context: AgentContext = {
        config: this.config,
        task,
        workerIndex: this.workerIndex,
        initialPageId: pageId,
        outputDir: this.outputDir,
        taskOutputDir,
        capture,
      }

      const agent = createAgent(context)
      return await agent.execute()
    } catch (error) {
      if (error instanceof TaskExecutionError) {
        throw error
      }
      throw new TaskExecutionError(
        `Agent execution failed: ${error instanceof Error ? error.message : String(error)}`,
        task,
        'agent_execution',
        error instanceof Error ? error : undefined,
      )
    }
  }

  /**
   * Run graders (config-level list takes precedence over task-level),
   * persist results next to the trajectory, and never throw: grading
   * failure is reported as a synthetic `_error` grader result.
   */
  private async runGraders(
    task: Task,
    agentResult: AgentResult,
    infinityAppUrl?: string,
  ): Promise<Record<string, GraderResult>> {
    const configGraders = this.config.graders ?? []
    const taskGraders = task.graders ?? []
    // Config-level graders override per-task graders when present.
    const graderNames = configGraders.length > 0 ? configGraders : taskGraders
    if (graderNames.length === 0) {
      return {}
    }

    try {
      const graderResults = await runGraders(graderNames, {
        task: {
          query_id: task.query_id,
          query: task.query,
          dataset: task.dataset,
        },
        messages: agentResult.messages,
        // Prefer the explicit screenshot count; fall back to step count.
        screenshotCount:
          agentResult.metadata.screenshot_count ??
          agentResult.metadata.total_steps,
        finalAnswer: agentResult.finalAnswer,
        expectedAnswer: (task.metadata?.additional as Record<string, unknown>)
          ?.answer as string | undefined,
        outputDir: join(this.outputDir, task.query_id),
        mcpUrl: `${this.config.browseros.server_url}/mcp`,
        infinityAppUrl,
      })

      try {
        const saver = new TrajectorySaver(this.outputDir, task.query_id)
        await saver.updateGraderResults(graderResults)
      } catch (saveError) {
        // Persistence is best-effort; results are still returned to the caller.
        console.warn(
          ` Failed to persist grader results: ${saveError instanceof Error ? saveError.message : String(saveError)}`,
        )
      }

      return graderResults
    } catch (error) {
      console.warn(
        ` Grading failed: ${error instanceof Error ? error.message : String(error)}`,
      )
      return {
        _error: {
          score: 0,
          pass: false,
          reasoning: `Grading failed: ${error instanceof Error ? error.message : String(error)}`,
        },
      }
    }
  }
}
|
||||
|
||||
// ============================================================================
|
||||
// Factory
|
||||
// ============================================================================
|
||||
|
||||
export function createTaskExecutor(
|
||||
config: EvalConfig,
|
||||
workerIndex: number,
|
||||
outputDir: string,
|
||||
onEvent?: (taskId: string, event: Record<string, unknown>) => void,
|
||||
): TaskExecutor {
|
||||
return new TaskExecutor(config, workerIndex, outputDir, { onEvent })
|
||||
}
|
||||
// NOTE(review): this file now re-exports the task-run-pipeline implementation
// under the historical public names; callers importing TaskExecutor et al.
// keep working unchanged.
export {
  createTaskRunPipeline as createTaskExecutor,
  TaskExecutionError,
  TaskRunPipeline as TaskExecutor,
  type TaskRunPipelineDeps as TaskExecutorDeps,
} from '../runs/task-run-pipeline'
|
||||
|
||||
@@ -8,12 +8,18 @@ import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
|
||||
|
||||
/** Inputs accepted by runEval; only configPath is required. */
export interface RunEvalOptions {
  configPath: string
  // Pre-validated config; when set, configPath is used only for path resolution.
  config?: EvalConfig
  // Overrides the dataset path derived from the config.
  dataPath?: string
  // Single-task mode: run this ad-hoc query instead of loading a dataset.
  query?: string
  // Optional start URL for single-task mode.
  startUrl?: string
  // Overrides the derived results directory.
  outputDir?: string
}
|
||||
|
||||
/** Outcome of a full eval run. */
export interface RunEvalResult {
  // Directory where all run artifacts (summary.json, per-task dirs) were written.
  outputDir: string
  summary: BatchSummary
}
|
||||
|
||||
// ============================================================================
|
||||
// Task Loading
|
||||
// ============================================================================
|
||||
|
||||
46
packages/browseros-agent/apps/eval/src/runs/artifact-paths.ts
vendored
Normal file
46
packages/browseros-agent/apps/eval/src/runs/artifact-paths.ts
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
import { join } from 'node:path'
|
||||
|
||||
function timestamp(date: Date): string {
|
||||
const y = date.getUTCFullYear()
|
||||
const m = String(date.getUTCMonth() + 1).padStart(2, '0')
|
||||
const d = String(date.getUTCDate()).padStart(2, '0')
|
||||
const h = String(date.getUTCHours()).padStart(2, '0')
|
||||
const min = String(date.getUTCMinutes()).padStart(2, '0')
|
||||
return `${y}-${m}-${d}-${h}${min}`
|
||||
}
|
||||
|
||||
function safeSegment(value: string): string {
|
||||
return value
|
||||
.toLowerCase()
|
||||
.replace(/[^a-z0-9._-]+/g, '-')
|
||||
.replace(/^-+|-+$/g, '')
|
||||
}
|
||||
|
||||
/** Creates a path-safe run id from suite/config, variant, and time. */
|
||||
export function createRunId(
|
||||
suiteId: string,
|
||||
variantId: string,
|
||||
date = new Date(),
|
||||
): string {
|
||||
return `${safeSegment(suiteId)}__${safeSegment(variantId)}__${timestamp(date)}`
|
||||
}
|
||||
|
||||
export function getRunPaths(baseDir: string, runId: string, taskId?: string) {
|
||||
const runDir = join(baseDir, 'runs', runId)
|
||||
const taskDir = taskId ? join(runDir, 'tasks', taskId) : undefined
|
||||
|
||||
return {
|
||||
runDir,
|
||||
runManifest: join(runDir, 'run.json'),
|
||||
summary: join(runDir, 'summary.json'),
|
||||
viewerManifest: join(runDir, 'viewer-manifest.json'),
|
||||
uploadManifest: join(runDir, 'upload-manifest.json'),
|
||||
taskDir,
|
||||
attempt: taskDir ? join(taskDir, 'attempt.json') : undefined,
|
||||
trace: taskDir ? join(taskDir, 'trace.jsonl') : undefined,
|
||||
messages: taskDir ? join(taskDir, 'messages.jsonl') : undefined,
|
||||
grades: taskDir ? join(taskDir, 'grades.json') : undefined,
|
||||
graderArtifacts: taskDir ? join(taskDir, 'grader-artifacts') : undefined,
|
||||
screenshots: taskDir ? join(taskDir, 'screenshots') : undefined,
|
||||
}
|
||||
}
|
||||
380
packages/browseros-agent/apps/eval/src/runs/eval-runner.ts
vendored
Normal file
380
packages/browseros-agent/apps/eval/src/runs/eval-runner.ts
vendored
Normal file
@@ -0,0 +1,380 @@
|
||||
import { mkdir, writeFile } from 'node:fs/promises'
|
||||
import { basename, dirname, join, resolve } from 'node:path'
|
||||
import {
|
||||
dashboardState,
|
||||
setActiveExecutor,
|
||||
startDashboard,
|
||||
stopDashboard,
|
||||
} from '../dashboard/server'
|
||||
import {
|
||||
getTaskSourceDescription,
|
||||
loadTasks,
|
||||
TaskLoadError,
|
||||
} from '../runner/task-loader'
|
||||
import type {
|
||||
BatchSummary,
|
||||
RunEvalOptions,
|
||||
RunEvalResult,
|
||||
TaskResult,
|
||||
TaskResultSummary,
|
||||
TaskSource,
|
||||
} from '../runner/types'
|
||||
import { getPrimaryGraderResult, isSuccessfulResult } from '../runner/types'
|
||||
import type { ErrorSource, EvalConfig, Task } from '../types'
|
||||
import {
|
||||
printValidationResult,
|
||||
validateConfig,
|
||||
} from '../utils/config-validator'
|
||||
import { TaskWorkerPool } from './task-worker-pool'
|
||||
|
||||
// ============================================================================
|
||||
// Main Entry Point
|
||||
// ============================================================================
|
||||
|
||||
export async function runEval(options: RunEvalOptions): Promise<RunEvalResult> {
|
||||
// Step 1: Validate configuration
|
||||
const config =
|
||||
options.config ?? (await loadAndValidateConfig(options.configPath))
|
||||
|
||||
// Step 2: Resolve paths relative to config location
|
||||
const configDir = options.configPath
|
||||
? dirname(resolve(options.configPath))
|
||||
: process.cwd()
|
||||
const resolvedPaths = resolvePaths(options, config, configDir)
|
||||
|
||||
// Log configuration
|
||||
console.log('Eval Configuration:')
|
||||
console.log(` Config: ${options.configPath}`)
|
||||
console.log(` Dataset: ${resolvedPaths.dataPath}`)
|
||||
console.log(` Output: ${resolvedPaths.outputDir}`)
|
||||
console.log(` Workers: ${config.num_workers}`)
|
||||
console.log(` Agent: ${config.agent.type}`)
|
||||
console.log()
|
||||
|
||||
// Step 3: Load tasks
|
||||
const taskSource = resolveTaskSource(options, resolvedPaths.dataPath)
|
||||
const { tasks } = await loadTasksWithLogging(taskSource)
|
||||
|
||||
// Step 4: Setup
|
||||
await mkdir(resolvedPaths.outputDir, { recursive: true })
|
||||
|
||||
// Step 5: Start dashboard
|
||||
startDashboard({
|
||||
tasks,
|
||||
configName: options.configPath,
|
||||
agentType: config.agent.type,
|
||||
outputDir: resolvedPaths.outputDir,
|
||||
})
|
||||
|
||||
// Step 6: Execute tasks (parallel or sequential based on num_workers)
|
||||
const results = await executeTasks(tasks, config, resolvedPaths.outputDir)
|
||||
|
||||
// Step 7: Summary
|
||||
const summary = buildSummary(results)
|
||||
await saveSummary(summary, resolvedPaths.outputDir)
|
||||
printSummary(summary)
|
||||
console.log(`\nResults saved to: ${resolvedPaths.outputDir}`)
|
||||
|
||||
stopDashboard()
|
||||
return { outputDir: resolvedPaths.outputDir, summary }
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Configuration
|
||||
// ============================================================================
|
||||
|
||||
async function loadAndValidateConfig(configPath: string) {
|
||||
console.log('Validating configuration...')
|
||||
const validationResult = await validateConfig(configPath)
|
||||
printValidationResult(validationResult)
|
||||
|
||||
if (!validationResult.valid || !validationResult.config) {
|
||||
throw new Error(
|
||||
'Configuration validation failed. Fix the above errors and try again.',
|
||||
)
|
||||
}
|
||||
|
||||
return validationResult.config
|
||||
}
|
||||
|
||||
/** Fully-resolved filesystem locations for one eval run. */
interface ResolvedPaths {
  // Absolute path to the dataset file.
  dataPath: string
  // Absolute directory where all run artifacts are written.
  outputDir: string
}
|
||||
|
||||
/** Returns the eval results directory for both flat and nested config layouts. */
|
||||
function defaultResultsBase(configDir: string): string {
|
||||
const resolvedConfigDir = resolve(configDir)
|
||||
if (basename(resolvedConfigDir) === 'configs') {
|
||||
return resolve(resolvedConfigDir, '..', 'results')
|
||||
}
|
||||
if (basename(dirname(resolvedConfigDir)) === 'configs') {
|
||||
return resolve(resolvedConfigDir, '..', '..', 'results')
|
||||
}
|
||||
return resolve(resolvedConfigDir, '..', 'results')
|
||||
}
|
||||
|
||||
function resolvePaths(
|
||||
options: RunEvalOptions,
|
||||
config: EvalConfig,
|
||||
configDir: string,
|
||||
): ResolvedPaths {
|
||||
// Resolve dataset path: use options.dataPath if provided, otherwise resolve from config
|
||||
const dataPath = options.dataPath
|
||||
? options.dataPath
|
||||
: config.dataset.startsWith('/')
|
||||
? config.dataset
|
||||
: resolve(configDir, config.dataset)
|
||||
|
||||
// Resolve output directory: results/{config-name}/{timestamp}/
|
||||
// Config name derived from config filename (e.g., "browseros-agent-weekly.json" → "browseros-agent-weekly")
|
||||
const configName = options.configPath
|
||||
? basename(resolve(options.configPath), '.json')
|
||||
: 'eval'
|
||||
const timestamp = formatTimestamp(new Date())
|
||||
const resultsBase = config.output_dir
|
||||
? config.output_dir.startsWith('/')
|
||||
? config.output_dir
|
||||
: resolve(configDir, config.output_dir)
|
||||
: defaultResultsBase(configDir)
|
||||
const outputDir =
|
||||
options.outputDir ?? join(resultsBase, configName, timestamp)
|
||||
|
||||
return { dataPath, outputDir }
|
||||
}
|
||||
|
||||
function formatTimestamp(date: Date): string {
|
||||
const y = date.getUTCFullYear()
|
||||
const m = String(date.getUTCMonth() + 1).padStart(2, '0')
|
||||
const d = String(date.getUTCDate()).padStart(2, '0')
|
||||
const h = String(date.getUTCHours()).padStart(2, '0')
|
||||
const min = String(date.getUTCMinutes()).padStart(2, '0')
|
||||
return `${y}-${m}-${d}-${h}${min}`
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Task Loading
|
||||
// ============================================================================
|
||||
|
||||
function resolveTaskSource(
|
||||
options: RunEvalOptions,
|
||||
dataPath: string,
|
||||
): TaskSource {
|
||||
// If query is provided, use single task mode
|
||||
if (options.query) {
|
||||
return { type: 'single', query: options.query, startUrl: options.startUrl }
|
||||
}
|
||||
|
||||
// Otherwise use file mode with the resolved dataPath
|
||||
return { type: 'file', path: dataPath }
|
||||
}
|
||||
|
||||
async function loadTasksWithLogging(
|
||||
source: TaskSource,
|
||||
): Promise<{ tasks: Awaited<ReturnType<typeof loadTasks>>['tasks'] }> {
|
||||
console.log(`Loading tasks from ${getTaskSourceDescription(source)}...`)
|
||||
|
||||
try {
|
||||
const result = await loadTasks(source)
|
||||
console.log(`Loaded ${result.tasks.length} task(s)`)
|
||||
return { tasks: result.tasks }
|
||||
} catch (error) {
|
||||
if (error instanceof TaskLoadError) {
|
||||
throw new Error(`Failed to load tasks: ${error.message}`)
|
||||
}
|
||||
throw new Error(`Failed to load tasks: ${error}`)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Task Execution
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Runs all tasks through a TaskWorkerPool, streaming capture events to the
 * dashboard and printing per-task progress as results complete.
 */
async function executeTasks(
  tasks: Task[],
  config: EvalConfig,
  outputDir: string,
): Promise<TaskResult[]> {
  console.log(`\n${'='.repeat(60)}`)
  console.log('STARTING EVALUATION')
  console.log(`${'='.repeat(60)}\n`)

  // Default to a single worker when num_workers is unset or 0.
  const numWorkers = config.num_workers || 1
  console.log(`Running with ${numWorkers} worker(s)`)
  if (config.restart_server_per_task) {
    console.log(`Server restart per task: enabled`)
  }
  console.log()

  const executor = new TaskWorkerPool({
    numWorkers,
    config,
    outputDir,
    restartServerPerTask: config.restart_server_per_task,
    onEvent: (taskId, event) =>
      dashboardState.broadcastStreamEvent(taskId, event),
  })

  // Register so dashboard stop button works for CLI runs too
  setActiveExecutor(executor)
  try {
    return await executor.execute(tasks, (completed, total, task, result) => {
      printTaskProgress(completed, total, task, result)
    })
  } finally {
    // Always deregister so a finished pool can't receive stop requests.
    setActiveExecutor(null)
  }
}
|
||||
|
||||
/**
 * Prints a one-line status for a finished task, plus any agent-level errors
 * and per-grader pass/fail lines.
 */
function printTaskProgress(
  completed: number,
  total: number,
  task: Task,
  result: TaskResult,
): void {
  const status =
    result.status === 'completed'
      ? 'DONE'
      : result.status === 'timeout'
        ? 'TIMEOUT'
        : 'FAILED'

  // Omit the timing suffix when no positive duration was recorded.
  const duration =
    result.durationMs > 0 ? ` (${(result.durationMs / 1000).toFixed(1)}s)` : ''

  console.log(`[${completed}/${total}] ${task.query_id}: ${status}${duration}`)

  if (result.status === 'failed') {
    console.log(` ERROR: ${result.error.message}`)
  } else if (isSuccessfulResult(result)) {
    // Log agent errors (e.g., LLM API failures) even if task "completed"
    if (result.agentResult.metadata.errors?.length) {
      for (const err of result.agentResult.metadata.errors) {
        console.log(` ERROR [${err.source}]: ${err.message}`)
      }
    }
    for (const [name, gr] of Object.entries(result.graderResults)) {
      const icon = gr.pass ? 'PASS' : 'FAIL'
      console.log(` ${name}: ${icon}`)
    }
  }
}
|
||||
|
||||
// ============================================================================
|
||||
// Summary
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Aggregates per-task results into the batch summary: status counts,
 * primary-grader pass rate, average duration, and error/warning tallies
 * bucketed by error source.
 */
function buildSummary(results: TaskResult[]): BatchSummary {
  // Track errors by source
  const errorsBySource: Partial<Record<ErrorSource, number>> = {}
  let totalWarnings = 0

  const taskSummaries: TaskResultSummary[] = results.map((r) => {
    let errorCount = 0
    let warningCount = 0
    let errorSources: ErrorSource[] | undefined
    let failureReason: string | undefined

    if (isSuccessfulResult(r)) {
      // Count errors and warnings from agent metadata
      errorCount = r.agentResult.metadata.errors?.length ?? 0
      warningCount = r.agentResult.metadata.warnings?.length ?? 0
      totalWarnings += warningCount

      // Track error sources
      if (r.agentResult.metadata.errors?.length) {
        errorSources = r.agentResult.metadata.errors.map((e) => e.source)
        for (const err of r.agentResult.metadata.errors) {
          errorsBySource[err.source] = (errorsBySource[err.source] ?? 0) + 1
        }
      }
    } else {
      // Failed task: a single error whose source came from the executor.
      errorCount = 1
      errorSources = [r.errorSource]
      failureReason = r.error.message
      errorsBySource[r.errorSource] = (errorsBySource[r.errorSource] ?? 0) + 1
    }

    return {
      queryId: r.task.query_id,
      status: r.status,
      durationMs: r.durationMs,
      // Keep only pass/score per grader so the summary stays compact.
      graderResults: isSuccessfulResult(r)
        ? Object.fromEntries(
            Object.entries(r.graderResults).map(([name, gr]) => [
              name,
              { pass: gr.pass, score: gr.score },
            ]),
          )
        : undefined,
      errorCount,
      warningCount,
      errorSources: errorSources?.length ? errorSources : undefined,
      failureReason,
    }
  })

  const completed = results.filter((r) => r.status === 'completed').length
  const timeout = results.filter((r) => r.status === 'timeout').length
  const failed = results.filter((r) => r.status === 'failed').length

  // Calculate pass rate using primary grader (fallback order)
  let totalGraded = 0
  let totalPasses = 0

  for (const result of results) {
    if (isSuccessfulResult(result)) {
      const primary = getPrimaryGraderResult(result.graderResults)
      if (primary) {
        totalGraded++
        if (primary.pass) totalPasses++
      }
    }
  }

  // Tasks without any primary grader result are excluded from the pass rate.
  const passRate = totalGraded > 0 ? totalPasses / totalGraded : 0

  // Calculate average duration for non-failed tasks
  const durations = results
    .filter((r) => r.status !== 'failed')
    .map((r) => r.durationMs)
  const avgDurationMs =
    durations.length > 0
      ? durations.reduce((a, b) => a + b, 0) / durations.length
      : 0

  return {
    total: results.length,
    completed,
    failed,
    timeout,
    passRate,
    avgDurationMs,
    errorsBySource,
    totalWarnings,
    results: taskSummaries,
  }
}
|
||||
|
||||
async function saveSummary(
|
||||
summary: BatchSummary,
|
||||
outputDir: string,
|
||||
): Promise<void> {
|
||||
await writeFile(
|
||||
join(outputDir, 'summary.json'),
|
||||
JSON.stringify(summary, null, 2),
|
||||
)
|
||||
}
|
||||
|
||||
function printSummary(summary: BatchSummary): void {
|
||||
console.log('='.repeat(60))
|
||||
console.log('EVALUATION COMPLETE')
|
||||
console.log('='.repeat(60))
|
||||
console.log(`Total: ${summary.total} tasks`)
|
||||
console.log(` Completed: ${summary.completed}`)
|
||||
console.log(` Timeout: ${summary.timeout}`)
|
||||
console.log(` Failed: ${summary.failed}`)
|
||||
console.log(` Pass Rate: ${(summary.passRate * 100).toFixed(1)}%`)
|
||||
console.log(` Avg Duration: ${(summary.avgDurationMs / 1000).toFixed(1)}s`)
|
||||
}
|
||||
44
packages/browseros-agent/apps/eval/src/runs/run-manifest.ts
vendored
Normal file
44
packages/browseros-agent/apps/eval/src/runs/run-manifest.ts
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
import type { EvalVariant } from '../suites/resolve-variant'
|
||||
|
||||
/** Inputs needed to assemble a run manifest. */
export interface BuildRunManifestInput {
  runId: string
  suiteId: string
  variant: EvalVariant
  datasetPath: string
  // Optional content hash of the dataset, for reproducibility checks.
  datasetHash?: string
  graders: string[]
  gitSha?: string
  browserosVersion?: string
  // ISO timestamp; defaults to "now" when omitted.
  startedAt?: string
}
|
||||
|
||||
/** Sanitized, serializable record of how a run was produced. */
export interface RunManifest {
  runId: string
  suiteId: string
  // Only the variant's public metadata is persisted (no secrets/internals).
  variant: EvalVariant['publicMetadata']
  dataset: {
    path: string
    hash?: string
  }
  graders: string[]
  gitSha?: string
  browserosVersion?: string
  startedAt: string
}
|
||||
|
||||
/** Builds the sanitized run manifest used for reproducibility. */
|
||||
export function buildRunManifest(input: BuildRunManifestInput): RunManifest {
|
||||
return {
|
||||
runId: input.runId,
|
||||
suiteId: input.suiteId,
|
||||
variant: input.variant.publicMetadata,
|
||||
dataset: {
|
||||
path: input.datasetPath,
|
||||
hash: input.datasetHash,
|
||||
},
|
||||
graders: input.graders,
|
||||
gitSha: input.gitSha,
|
||||
browserosVersion: input.browserosVersion,
|
||||
startedAt: input.startedAt ?? new Date().toISOString(),
|
||||
}
|
||||
}
|
||||
317
packages/browseros-agent/apps/eval/src/runs/task-run-pipeline.ts
vendored
Normal file
317
packages/browseros-agent/apps/eval/src/runs/task-run-pipeline.ts
vendored
Normal file
@@ -0,0 +1,317 @@
|
||||
import { join } from 'node:path'
|
||||
import { createAgent } from '../agents'
|
||||
import type { AgentContext, AgentResult } from '../agents/types'
|
||||
import { CaptureContext } from '../capture/context'
|
||||
import {
|
||||
hasExistingGraderResults,
|
||||
TrajectorySaver,
|
||||
} from '../capture/trajectory-saver'
|
||||
import { runGraders } from '../graders/registry'
|
||||
import { InfinityAppManager } from '../runner/infinity-app-manager'
|
||||
import type { TaskResult } from '../runner/types'
|
||||
import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
|
||||
import { callMcpTool } from '../utils/mcp-client'
|
||||
|
||||
// ============================================================================
|
||||
// Errors
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Error thrown when a task fails during one of the pipeline's phases.
 * The failing phase is mirrored into `errorSource` so batch summaries can
 * bucket failures by where they occurred.
 */
export class TaskExecutionError extends Error {
  // Mirror of `phase`, exposed under the name the summary code expects.
  public readonly errorSource: ErrorSource

  constructor(
    message: string,
    public readonly task: Task,
    public readonly phase:
      | 'navigation'
      | 'agent_execution'
      | 'grading'
      | 'cleanup',
    public readonly cause?: Error,
  ) {
    super(message)
    this.name = 'TaskExecutionError'
    // NOTE(review): assumes every phase literal is a member of the
    // ErrorSource union — confirm against the ErrorSource declaration.
    this.errorSource = phase as ErrorSource
  }
}
|
||||
|
||||
// ============================================================================
|
||||
// Task Executor
|
||||
// ============================================================================
|
||||
|
||||
/** Optional hooks injected into the pipeline. */
export interface TaskRunPipelineDeps {
  // Streaming callback: receives per-task capture events (e.g. for a dashboard).
  onEvent?: (taskId: string, event: Record<string, unknown>) => void
}
|
||||
|
||||
export class TaskRunPipeline {
|
||||
constructor(
|
||||
private readonly config: EvalConfig,
|
||||
private readonly workerIndex: number,
|
||||
private readonly outputDir: string,
|
||||
private readonly deps: TaskRunPipelineDeps,
|
||||
) {}
|
||||
|
||||
/**
|
||||
* Resolve the initial page ID via list_pages MCP call.
|
||||
* Called once per task on a fresh browser — there's exactly one page.
|
||||
*/
|
||||
private async resolveInitialPageId(mcpUrl: string): Promise<number> {
|
||||
try {
|
||||
const result = await callMcpTool(mcpUrl, 'list_pages', {})
|
||||
if (!result.isError) {
|
||||
const textContent = result.content?.find(
|
||||
(c: { type: string }) => c.type === 'text',
|
||||
)
|
||||
const match = textContent?.text?.match(/^\s*(\d+)\./m)
|
||||
if (match) return Number.parseInt(match[1], 10)
|
||||
}
|
||||
} catch {
|
||||
// Fall through to default
|
||||
}
|
||||
// Fresh browser always has page 1
|
||||
return 1
|
||||
}
|
||||
|
||||
async execute(task: Task): Promise<TaskResult> {
|
||||
const startTime = Date.now()
|
||||
const mcpUrl = `${this.config.browseros.server_url}/mcp`
|
||||
|
||||
// Check if task already has grader results (resume capability)
|
||||
const existing = await hasExistingGraderResults(
|
||||
this.outputDir,
|
||||
task.query_id,
|
||||
)
|
||||
if (existing.exists && existing.metadata) {
|
||||
console.log(` Skipping: already has grader results`)
|
||||
return {
|
||||
status:
|
||||
existing.metadata.termination_reason === 'timeout'
|
||||
? 'timeout'
|
||||
: 'completed',
|
||||
task,
|
||||
agentResult: {
|
||||
metadata: existing.metadata,
|
||||
messages: [],
|
||||
finalAnswer: existing.metadata.final_answer,
|
||||
},
|
||||
graderResults: existing.metadata.grader_results,
|
||||
durationMs: existing.metadata.total_duration_ms,
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve page ID once — fresh browser has exactly one page
|
||||
const pageId = await this.resolveInitialPageId(mcpUrl)
|
||||
|
||||
// For Infinity tasks, start a fresh app server per task
|
||||
let infinityManager: InfinityAppManager | null = null
|
||||
let actualStartUrl = task.start_url
|
||||
|
||||
if (task.dataset === 'webarena-infinity') {
|
||||
const appName = (task.metadata?.additional as Record<string, unknown>)
|
||||
?.app_name as string
|
||||
const appBasePort =
|
||||
((task.metadata?.additional as Record<string, unknown>)
|
||||
?.app_base_port as number) || 8000
|
||||
|
||||
if (appName && process.env.WEBARENA_INFINITY_DIR) {
|
||||
infinityManager = new InfinityAppManager(this.workerIndex, appBasePort)
|
||||
try {
|
||||
actualStartUrl = await infinityManager.startApp(appName)
|
||||
console.log(
|
||||
` Infinity app "${appName}" started on port ${infinityManager.getPort()}`,
|
||||
)
|
||||
} catch (error) {
|
||||
throw new TaskExecutionError(
|
||||
`Failed to start Infinity app: ${error instanceof Error ? error.message : String(error)}`,
|
||||
task,
|
||||
'navigation',
|
||||
error instanceof Error ? error : undefined,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
// Phase 1: Set viewport + navigate to start URL
|
||||
try {
|
||||
await callMcpTool(mcpUrl, 'evaluate_script', {
|
||||
page: pageId,
|
||||
expression: 'window.resizeTo(1440, 900)',
|
||||
})
|
||||
} catch (vpError) {
|
||||
console.warn(
|
||||
` Viewport resize failed: ${vpError instanceof Error ? vpError.message : String(vpError)}`,
|
||||
)
|
||||
}
|
||||
|
||||
if (actualStartUrl && actualStartUrl !== 'about:blank') {
|
||||
try {
|
||||
await callMcpTool(mcpUrl, 'navigate_page', {
|
||||
url: actualStartUrl,
|
||||
page: pageId,
|
||||
})
|
||||
} catch (error) {
|
||||
throw new TaskExecutionError(
|
||||
`Failed to navigate to start URL: ${error instanceof Error ? error.message : String(error)}`,
|
||||
task,
|
||||
'navigation',
|
||||
error instanceof Error ? error : undefined,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: Execute agent
|
||||
const agentResult = await this.executeAgent(task, pageId)
|
||||
|
||||
// Phase 3: Run graders
|
||||
const graderResults = await this.runGraders(
|
||||
task,
|
||||
agentResult,
|
||||
infinityManager?.getUrl(),
|
||||
)
|
||||
|
||||
const status =
|
||||
agentResult.metadata.termination_reason === 'timeout'
|
||||
? 'timeout'
|
||||
: 'completed'
|
||||
|
||||
return {
|
||||
status,
|
||||
task,
|
||||
agentResult,
|
||||
graderResults,
|
||||
durationMs: Date.now() - startTime,
|
||||
}
|
||||
} catch (error) {
|
||||
const errorSource: ErrorSource =
|
||||
error instanceof TaskExecutionError ? error.errorSource : 'unknown'
|
||||
|
||||
return {
|
||||
status: 'failed',
|
||||
task,
|
||||
error: error instanceof Error ? error : new Error(String(error)),
|
||||
errorSource,
|
||||
durationMs: Date.now() - startTime,
|
||||
}
|
||||
} finally {
|
||||
// Navigate to about:blank to clean up
|
||||
try {
|
||||
await callMcpTool(mcpUrl, 'navigate_page', {
|
||||
url: 'about:blank',
|
||||
page: pageId,
|
||||
})
|
||||
} catch {
|
||||
// Ignore cleanup errors
|
||||
}
|
||||
|
||||
// Stop Infinity app server if running
|
||||
if (infinityManager) {
|
||||
await infinityManager.stop().catch(() => {})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async executeAgent(task: Task, pageId: number): Promise<AgentResult> {
|
||||
try {
|
||||
const { capture, taskOutputDir } = await CaptureContext.create({
|
||||
serverUrl: this.config.browseros.server_url,
|
||||
outputDir: this.outputDir,
|
||||
taskId: task.query_id,
|
||||
initialPageId: pageId,
|
||||
onEvent: this.deps.onEvent,
|
||||
})
|
||||
|
||||
const context: AgentContext = {
|
||||
config: this.config,
|
||||
task,
|
||||
workerIndex: this.workerIndex,
|
||||
initialPageId: pageId,
|
||||
outputDir: this.outputDir,
|
||||
taskOutputDir,
|
||||
capture,
|
||||
}
|
||||
|
||||
const agent = createAgent(context)
|
||||
return await agent.execute()
|
||||
} catch (error) {
|
||||
if (error instanceof TaskExecutionError) {
|
||||
throw error
|
||||
}
|
||||
throw new TaskExecutionError(
|
||||
`Agent execution failed: ${error instanceof Error ? error.message : String(error)}`,
|
||||
task,
|
||||
'agent_execution',
|
||||
error instanceof Error ? error : undefined,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
  /**
   * Runs graders over an agent result and returns results keyed by grader name.
   *
   * Grader precedence: config-level graders override task-level graders; with
   * neither, grading is skipped entirely. Grader results are best-effort
   * persisted via TrajectorySaver. A grading failure never fails the task —
   * it is reported as a synthetic `_error` grader with score 0.
   *
   * NOTE(review): this method shadows a module-level `runGraders` helper
   * (imported above this view), which it delegates to inside the try block.
   */
  private async runGraders(
    task: Task,
    agentResult: AgentResult,
    infinityAppUrl?: string,
  ): Promise<Record<string, GraderResult>> {
    // Config-level graders take precedence over task-declared graders.
    const configGraders = this.config.graders ?? []
    const taskGraders = task.graders ?? []
    const graderNames = configGraders.length > 0 ? configGraders : taskGraders
    if (graderNames.length === 0) {
      return {}
    }

    try {
      const graderResults = await runGraders(graderNames, {
        task: {
          query_id: task.query_id,
          query: task.query,
          dataset: task.dataset,
        },
        messages: agentResult.messages,
        // Prefer an explicit screenshot count; fall back to total step count.
        screenshotCount:
          agentResult.metadata.screenshot_count ??
          agentResult.metadata.total_steps,
        finalAnswer: agentResult.finalAnswer,
        // Datasets may embed the expected answer in free-form metadata.
        expectedAnswer: (task.metadata?.additional as Record<string, unknown>)
          ?.answer as string | undefined,
        taskArtifactDir: join(this.outputDir, task.query_id),
        outputDir: join(this.outputDir, task.query_id),
        mcpUrl: `${this.config.browseros.server_url}/mcp`,
        infinityAppUrl,
      })

      // Best-effort persistence of grades next to the trajectory; a save
      // failure is logged but does not discard the in-memory results.
      try {
        const saver = new TrajectorySaver(this.outputDir, task.query_id)
        await saver.updateGraderResults(graderResults)
      } catch (saveError) {
        console.warn(
          ` Failed to persist grader results: ${saveError instanceof Error ? saveError.message : String(saveError)}`,
        )
      }

      return graderResults
    } catch (error) {
      // Grading is advisory: surface the failure as a synthetic grader entry.
      console.warn(
        ` Grading failed: ${error instanceof Error ? error.message : String(error)}`,
      )
      return {
        _error: {
          score: 0,
          pass: false,
          reasoning: `Grading failed: ${error instanceof Error ? error.message : String(error)}`,
        },
      }
    }
  }
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Factory
|
||||
// ============================================================================
|
||||
|
||||
export function createTaskRunPipeline(
|
||||
config: EvalConfig,
|
||||
workerIndex: number,
|
||||
outputDir: string,
|
||||
onEvent?: (taskId: string, event: Record<string, unknown>) => void,
|
||||
): TaskRunPipeline {
|
||||
return new TaskRunPipeline(config, workerIndex, outputDir, { onEvent })
|
||||
}
|
||||
269
packages/browseros-agent/apps/eval/src/runs/task-worker-pool.ts
vendored
Normal file
269
packages/browseros-agent/apps/eval/src/runs/task-worker-pool.ts
vendored
Normal file
@@ -0,0 +1,269 @@
|
||||
/**
|
||||
* Task Worker Pool
|
||||
*
|
||||
* Each worker gets its own isolated BrowserOS stack:
|
||||
* - BrowserOSAppManager (Chrome + Server on unique ports)
|
||||
* - TaskRunPipeline (uses that worker's server URL)
|
||||
*
|
||||
* Port allocation: Worker N → CDP=base+N, Server=base+N, Extension=base+N
|
||||
*/
|
||||
|
||||
import {
|
||||
BrowserOSAppManager,
|
||||
type EvalPorts,
|
||||
} from '../runner/browseros-app-manager'
|
||||
import type { TaskResult } from '../runner/types'
|
||||
import type { EvalConfig, Task } from '../types'
|
||||
import { createTaskRunPipeline } from './task-run-pipeline'
|
||||
|
||||
// ============================================================================
|
||||
// Types
|
||||
// ============================================================================
|
||||
|
||||
// Configuration for a TaskWorkerPool run.
export interface TaskWorkerPoolConfig {
  numWorkers: number // desired parallelism; clamped to >= 1 by the pool
  config: EvalConfig // base eval config; server_url is overridden per worker
  outputDir: string // root directory for per-task artifacts
  restartServerPerTask?: boolean // restart Chrome+Server between tasks
  onEvent?: (taskId: string, event: Record<string, unknown>) => void // dashboard sink
}

// Invoked after each task completes (in completion order, not input order).
export type ProgressCallback = (
  completed: number,
  total: number,
  task: Task,
  result: TaskResult,
) => void
|
||||
|
||||
// ============================================================================
|
||||
// Task Queue (thread-safe for single-threaded async — index is atomic)
|
||||
// ============================================================================
|
||||
|
||||
class TaskQueue {
|
||||
private tasks: Task[]
|
||||
private index: number = 0
|
||||
private stopped: boolean = false
|
||||
|
||||
constructor(tasks: Task[]) {
|
||||
this.tasks = [...tasks]
|
||||
}
|
||||
|
||||
next(): Task | null {
|
||||
if (this.stopped || this.index >= this.tasks.length) return null
|
||||
return this.tasks[this.index++]
|
||||
}
|
||||
|
||||
stop(): void {
|
||||
this.stopped = true
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Task Worker Pool
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Pool of N parallel eval workers.
 *
 * Each worker owns a fully isolated BrowserOS stack (Chrome + server on
 * worker-unique ports via BrowserOSAppManager) and pulls tasks from a shared
 * in-process queue until it drains. Results are collected by query_id and
 * returned to the caller in the original task order.
 */
export class TaskWorkerPool {
  private readonly numWorkers: number
  // One app manager per worker index, so stop()/signals can kill all stacks.
  private readonly appManagers = new Map<number, BrowserOSAppManager>()
  // Total tasks finished across ALL workers (drives the progress callback).
  private completedCount: number = 0
  private readonly resultsByTaskId = new Map<string, TaskResult>()
  private queue: TaskQueue | null = null

  constructor(private readonly config: TaskWorkerPoolConfig) {
    // Guard against zero/negative worker counts.
    this.numWorkers = Math.max(1, config.numWorkers)
  }

  /** Stops dispatching new tasks and kills every worker's browser stack. */
  async stop(): Promise<void> {
    console.log('\nStopping eval run...')
    this.queue?.stop()
    const kills = [...this.appManagers.values()].map((m) => m.killApp())
    // allSettled: one failed kill must not prevent the others.
    await Promise.allSettled(kills)
  }

  /**
   * Runs all tasks across the worker pool.
   *
   * @param tasks - tasks to execute; results come back in this order
   * @param onProgress - invoked after each task completes
   * @returns one TaskResult per input task (synthetic failure if missing)
   */
  async execute(
    tasks: Task[],
    onProgress?: ProgressCallback,
  ): Promise<TaskResult[]> {
    if (tasks.length === 0) return []

    const cleanup = this.setupSignalHandlers()

    const loadExtensions = this.config.config.browseros.load_extensions ?? false

    // Patch NopeCHA API key before launching any workers
    const captchaConfig = this.config.config.captcha
    if (captchaConfig) {
      const apiKey = process.env[captchaConfig.api_key_env]
      if (apiKey) {
        BrowserOSAppManager.patchNopechaApiKey(apiKey)
      }
    }

    this.queue = new TaskQueue(tasks)
    const totalTasks = tasks.length

    try {
      const queue = this.queue
      // Launch N workers in parallel — each gets its own Chrome + Server
      const workers = Array.from({ length: this.numWorkers }, (_, i) =>
        this.runWorker(i, queue, totalTasks, loadExtensions, onProgress),
      )
      await Promise.all(workers)

      // Return results in original task order
      return tasks.map((task) => {
        const result = this.resultsByTaskId.get(task.query_id)
        if (!result) {
          // Can happen when the queue was stopped before this task ran.
          return {
            status: 'failed' as const,
            task,
            error: new Error('Task result not found'),
            errorSource: 'unknown' as const,
            durationMs: 0,
          }
        }
        return result
      })
    } finally {
      cleanup()
    }
  }

  /**
   * Worker loop: boot this worker's browser stack once, then repeatedly pull
   * and execute tasks until the shared queue drains. The stack is always
   * killed on the way out, even on unexpected errors.
   */
  private async runWorker(
    workerIndex: number,
    queue: TaskQueue,
    totalTasks: number,
    loadExtensions: boolean,
    onProgress?: ProgressCallback,
  ): Promise<void> {
    // Per-worker isolated ports
    const basePorts: EvalPorts = {
      cdp: this.config.config.browseros.base_cdp_port,
      server: this.config.config.browseros.base_server_port,
      extension: this.config.config.browseros.base_extension_port,
    }
    const headless = this.config.config.browseros.headless ?? false
    const appManager = new BrowserOSAppManager(
      workerIndex,
      basePorts,
      loadExtensions,
      headless,
    )
    this.appManagers.set(workerIndex, appManager)

    // Per-worker executor pointing to this worker's server
    const workerConfig: typeof this.config.config = {
      ...this.config.config,
      browseros: {
        ...this.config.config.browseros,
        server_url: appManager.getServerUrl(),
      },
    }
    const executor = createTaskRunPipeline(
      workerConfig,
      workerIndex,
      this.config.outputDir,
      this.config.onEvent,
    )

    try {
      // Always start Chrome+Server once for this worker
      console.log(`\n Worker ${workerIndex}: Starting BrowserOS stack...`)
      await appManager.restart()

      while (true) {
        const task = queue.next()
        if (!task) break

        const taskStartTime = Date.now()
        let result: TaskResult

        try {
          // Restart between tasks if configured
          if (this.config.restartServerPerTask) {
            console.log(`\n${'─'.repeat(60)}`)
            console.log(` Worker ${workerIndex}: Task: ${task.query_id}`)
            console.log(`${'─'.repeat(60)}`)
            await appManager.restart()
          }

          this.config.onEvent?.(task.query_id, {
            type: 'task-state',
            taskId: task.query_id,
            status: 'running',
          })
          result = await executor.execute(task)
          console.log(
            ` Worker ${workerIndex}: ${task.query_id}: ${result.status}`,
          )
        } catch (error) {
          // A throwing executor still yields a synthetic failed result so the
          // worker keeps consuming the queue.
          console.error(
            ` Worker ${workerIndex}: ${task.query_id}: FAILED - ${error instanceof Error ? error.message : String(error)}`,
          )
          result = {
            status: 'failed',
            task,
            error: error instanceof Error ? error : new Error(String(error)),
            errorSource: 'unknown',
            durationMs: Date.now() - taskStartTime,
          }
        }

        this.resultsByTaskId.set(task.query_id, result)
        this.completedCount++

        // Emit task completion to dashboard
        const stateEvent: Record<string, unknown> = {
          type: 'task-state',
          taskId: task.query_id,
          status: result.status,
          durationMs: result.durationMs,
        }
        if (result.status !== 'failed' && 'graderResults' in result) {
          stateEvent.graderResults = Object.fromEntries(
            Object.entries(result.graderResults).map(([name, gr]) => [
              name,
              {
                pass: gr.pass,
                score: gr.score,
                reasoning: gr.reasoning,
                details: gr.details,
              },
            ]),
          )
          // NOTE(review): uses total_steps as a screenshot-count proxy here,
          // unlike runGraders which prefers metadata.screenshot_count — confirm
          // this asymmetry is intended.
          stateEvent.screenshotCount =
            result.agentResult?.metadata?.total_steps ?? 0
        }
        this.config.onEvent?.(task.query_id, stateEvent)

        onProgress?.(this.completedCount, totalTasks, task, result)

        if (this.config.restartServerPerTask) {
          // Brief settle delay before the next task's restart.
          await new Promise((resolve) => setTimeout(resolve, 2000))
        }
      }
    } finally {
      await appManager.killApp()
    }
  }

  /**
   * SIGINT/SIGTERM kills all Chrome + Server instances across all workers.
   * Returns a cleanup function that removes the listeners after execute() completes.
   *
   * NOTE(review): onSignal is async and its promise is not awaited by the
   * process; process.exit(0) runs only after the kill attempts settle.
   */
  private setupSignalHandlers(): () => void {
    const onSignal = async () => {
      console.log('\nShutting down all workers...')
      this.queue?.stop()
      const kills = [...this.appManagers.values()].map((m) => m.killApp())
      await Promise.allSettled(kills)
      process.exit(0)
    }
    process.on('SIGINT', onSignal)
    process.on('SIGTERM', onSignal)
    return () => {
      process.off('SIGINT', onSignal)
      process.off('SIGTERM', onSignal)
    }
  }
}
|
||||
101
packages/browseros-agent/apps/eval/src/suites/config-adapter.ts
vendored
Normal file
101
packages/browseros-agent/apps/eval/src/suites/config-adapter.ts
vendored
Normal file
@@ -0,0 +1,101 @@
|
||||
import { basename, resolve } from 'node:path'
|
||||
import { type EvalConfig, EvalConfigSchema } from '../types'
|
||||
import { type EvalVariant, resolveVariant } from './resolve-variant'
|
||||
import type { EvalSuite } from './schema'
|
||||
|
||||
// Loose view of a process environment (values may be unset).
type Env = Record<string, string | undefined>

export interface AdaptEvalConfigOptions {
  // Environment used for API-key indirection lookups; defaults to process.env.
  env?: Env
}

// Result of adapting a legacy eval config into the suite/variant model.
export interface AdaptedEvalConfig {
  configPath: string // absolute path of the parsed config file
  evalConfig: EvalConfig // the validated legacy config
  suite: EvalSuite // derived suite definition
  variant: EvalVariant // derived model/provider/key variant
}
|
||||
|
||||
function executorBackend(
|
||||
config: EvalConfig,
|
||||
): 'tool-loop' | 'clado' | undefined {
|
||||
if (config.agent.type !== 'orchestrator-executor') return undefined
|
||||
return config.agent.executor.provider === 'clado-action'
|
||||
? 'clado'
|
||||
: 'tool-loop'
|
||||
}
|
||||
|
||||
function variantSource(config: EvalConfig): {
|
||||
provider: string
|
||||
model: string
|
||||
apiKey?: string
|
||||
apiKeyEnv?: string
|
||||
baseUrl?: string
|
||||
supportsImages?: boolean
|
||||
} {
|
||||
const agent =
|
||||
config.agent.type === 'single' ? config.agent : config.agent.orchestrator
|
||||
if (!agent.model) {
|
||||
throw new Error('Config agent model is required')
|
||||
}
|
||||
const apiKeyEnv = /^[A-Z][A-Z0-9_]*$/.test(agent.apiKey ?? '')
|
||||
? agent.apiKey
|
||||
: undefined
|
||||
|
||||
return {
|
||||
provider: agent.provider,
|
||||
model: agent.model,
|
||||
apiKey: agent.apiKey,
|
||||
apiKeyEnv,
|
||||
baseUrl: agent.baseUrl,
|
||||
supportsImages:
|
||||
config.agent.type === 'single' ? config.agent.supportsImages : undefined,
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Adapts an existing eval config into the suite/variant model.
 *
 * Reads and validates the JSON config at configPath, derives the suite id
 * from the file name (without .json), and resolves the variant
 * (provider/model/key) from the config — preferring an env-var indirection
 * for the API key when one is named and set.
 *
 * @throws on unreadable/invalid JSON, or when EvalConfigSchema validation fails.
 */
export async function adaptEvalConfigFile(
  configPath: string,
  options: AdaptEvalConfigOptions = {},
): Promise<AdaptedEvalConfig> {
  const absolute = resolve(configPath)
  const raw = JSON.parse(await Bun.file(absolute).text())
  const evalConfig = EvalConfigSchema.parse(raw)
  // Suite id is the config file name without its .json extension.
  const id = basename(absolute, '.json')
  const backend = executorBackend(evalConfig)
  const source = variantSource(evalConfig)
  const env = options.env ?? process.env
  // If apiKey names an env var and that var is set, use the env value;
  // otherwise fall back to the literal value from the config.
  const apiKey =
    source.apiKeyEnv && env[source.apiKeyEnv]
      ? env[source.apiKeyEnv]
      : source.apiKey

  return {
    configPath: absolute,
    evalConfig,
    suite: {
      id,
      dataset: evalConfig.dataset,
      // Single-model configs map to the tool-loop agent; everything else is
      // treated as orchestrated with the detected executor backend.
      agent:
        evalConfig.agent.type === 'single'
          ? { type: 'tool-loop' }
          : { type: 'orchestrated', executorBackend: backend ?? 'tool-loop' },
      graders: evalConfig.graders ?? [],
      workers: evalConfig.num_workers,
      restartBrowserPerTask: evalConfig.restart_server_per_task,
      timeoutMs: evalConfig.timeout_ms,
      browseros: evalConfig.browseros,
      captcha: evalConfig.captcha,
    },
    variant: resolveVariant({
      variantId: id,
      provider: source.provider,
      model: source.model,
      apiKey,
      apiKeyEnv: source.apiKeyEnv,
      baseUrl: source.baseUrl,
      supportsImages: source.supportsImages,
      env,
    }),
  }
}
|
||||
22
packages/browseros-agent/apps/eval/src/suites/load-suite.ts
vendored
Normal file
22
packages/browseros-agent/apps/eval/src/suites/load-suite.ts
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
import { dirname, resolve } from 'node:path'
|
||||
import { type EvalSuite, EvalSuiteSchema } from './schema'
|
||||
|
||||
export interface LoadedSuite {
|
||||
suite: EvalSuite
|
||||
suitePath: string
|
||||
suiteDir: string
|
||||
datasetPath: string
|
||||
}
|
||||
|
||||
/** Loads a suite file and resolves its dataset relative to the suite. */
|
||||
export async function loadSuite(suitePath: string): Promise<LoadedSuite> {
|
||||
const absolute = resolve(suitePath)
|
||||
const raw = JSON.parse(await Bun.file(absolute).text())
|
||||
const suite = EvalSuiteSchema.parse(raw)
|
||||
const suiteDir = dirname(absolute)
|
||||
const datasetPath = suite.dataset.startsWith('/')
|
||||
? suite.dataset
|
||||
: resolve(suiteDir, suite.dataset)
|
||||
|
||||
return { suite, suitePath: absolute, suiteDir, datasetPath }
|
||||
}
|
||||
102
packages/browseros-agent/apps/eval/src/suites/resolve-variant.ts
vendored
Normal file
102
packages/browseros-agent/apps/eval/src/suites/resolve-variant.ts
vendored
Normal file
@@ -0,0 +1,102 @@
|
||||
// Loose view of a process environment (values may be unset).
type Env = Record<string, string | undefined>

// Inputs for resolveVariant(); explicit fields take precedence over env vars.
export interface ResolveVariantOptions {
  variantId?: string // falls back to EVAL_VARIANT, then 'default'
  provider?: string // falls back to EVAL_AGENT_PROVIDER, then 'openai-compatible'
  model?: string // falls back to EVAL_AGENT_MODEL (required overall)
  apiKey?: string // falls back to EVAL_AGENT_API_KEY
  apiKeyEnv?: string // name of the env var the key came from, if any
  baseUrl?: string // falls back to EVAL_AGENT_BASE_URL
  supportsImages?: boolean // falls back to EVAL_AGENT_SUPPORTS_IMAGES flag
  env?: Env // environment to consult; defaults to process.env
  requireApiKey?: boolean // when true, a missing key is an error
}

// A resolved model/backend variant plus a secret-free public view of it.
export interface EvalVariant {
  id: string
  agent: {
    provider: string
    model: string
    apiKey?: string
    baseUrl?: string
    supportsImages?: boolean
  }
  // Safe to publish: carries where the key came from, never the key itself.
  publicMetadata: {
    id: string
    agent: {
      provider: string
      model: string
      baseUrlHost?: string // host portion of baseUrl only
      supportsImages?: boolean
      apiKeyConfigured: boolean
      apiKeyEnv?: string
    }
  }
}
|
||||
|
||||
function boolFromEnv(value: string | undefined): boolean | undefined {
|
||||
if (value === undefined) return undefined
|
||||
return ['1', 'true', 'yes'].includes(value.toLowerCase())
|
||||
}
|
||||
|
||||
function hostFromUrl(value: string | undefined): string | undefined {
|
||||
if (!value) return undefined
|
||||
try {
|
||||
return new URL(value).host
|
||||
} catch {
|
||||
return undefined
|
||||
}
|
||||
}
|
||||
|
||||
function isEnvName(value: string | undefined): boolean {
|
||||
return !!value && /^[A-Z][A-Z0-9_]*$/.test(value)
|
||||
}
|
||||
|
||||
/** Resolves one model/backend variant from CLI values first, then env. */
|
||||
export function resolveVariant(
|
||||
options: ResolveVariantOptions = {},
|
||||
): EvalVariant {
|
||||
const env = options.env ?? process.env
|
||||
const id = options.variantId ?? env.EVAL_VARIANT ?? 'default'
|
||||
const provider =
|
||||
options.provider ?? env.EVAL_AGENT_PROVIDER ?? 'openai-compatible'
|
||||
const model = options.model ?? env.EVAL_AGENT_MODEL
|
||||
const apiKey = options.apiKey ?? env.EVAL_AGENT_API_KEY
|
||||
const apiKeyEnv =
|
||||
options.apiKeyEnv ?? (options.apiKey ? undefined : 'EVAL_AGENT_API_KEY')
|
||||
const baseUrl = options.baseUrl ?? env.EVAL_AGENT_BASE_URL
|
||||
const supportsImages =
|
||||
options.supportsImages ?? boolFromEnv(env.EVAL_AGENT_SUPPORTS_IMAGES)
|
||||
|
||||
if (!model) {
|
||||
throw new Error('EVAL_AGENT_MODEL is required')
|
||||
}
|
||||
if (options.requireApiKey && !apiKey) {
|
||||
throw new Error('EVAL_AGENT_API_KEY is required')
|
||||
}
|
||||
|
||||
const publicApiKeyEnv =
|
||||
options.apiKeyEnv ?? (isEnvName(apiKey) ? apiKey : apiKeyEnv)
|
||||
|
||||
return {
|
||||
id,
|
||||
agent: {
|
||||
provider,
|
||||
model,
|
||||
apiKey,
|
||||
baseUrl,
|
||||
supportsImages,
|
||||
},
|
||||
publicMetadata: {
|
||||
id,
|
||||
agent: {
|
||||
provider,
|
||||
model,
|
||||
baseUrlHost: hostFromUrl(baseUrl),
|
||||
supportsImages,
|
||||
apiKeyConfigured: !!apiKey,
|
||||
apiKeyEnv: publicApiKeyEnv,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
41
packages/browseros-agent/apps/eval/src/suites/schema.ts
vendored
Normal file
41
packages/browseros-agent/apps/eval/src/suites/schema.ts
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
import { z } from 'zod'
|
||||
import { EvalConfigSchema } from '../types'
|
||||
|
||||
// Agent selection for a suite. Orchestrated agent types must also name the
// executor backend that performs low-level browser actions.
export const SuiteAgentSchema = z
  .object({
    type: z.enum([
      'tool-loop',
      'single',
      'orchestrated',
      'orchestrator-executor',
    ]),
    // Required when type is 'orchestrated' or 'orchestrator-executor'.
    executorBackend: z.enum(['tool-loop', 'clado']).optional(),
  })
  .superRefine((agent, ctx) => {
    if (
      (agent.type === 'orchestrated' ||
        agent.type === 'orchestrator-executor') &&
      !agent.executorBackend
    ) {
      ctx.addIssue({
        code: z.ZodIssueCode.custom,
        path: ['executorBackend'],
        message: 'executorBackend is required for orchestrated suites',
      })
    }
  })

// Declarative description of one eval suite: which dataset to run, with what
// agent, graded how, and under what execution limits.
export const EvalSuiteSchema = z.object({
  id: z.string().min(1),
  // Dataset file path; relative paths are resolved against the suite file.
  dataset: z.string().min(1),
  agent: SuiteAgentSchema,
  graders: z.array(z.string()).default([]),
  // Parallel worker count; each worker runs an isolated browser stack.
  workers: z.number().int().min(1).max(20).default(1),
  restartBrowserPerTask: z.boolean().default(false),
  // Per-task timeout bounded to [30 s, 1 h] when present.
  timeoutMs: z.number().int().min(30_000).max(3_600_000).optional(),
  // Optional overrides reusing the legacy eval config's sub-schemas.
  browseros: EvalConfigSchema.shape.browseros.optional(),
  captcha: EvalConfigSchema.shape.captcha.optional(),
})

export type EvalSuite = z.infer<typeof EvalSuiteSchema>
export type SuiteAgent = z.infer<typeof SuiteAgentSchema>
|
||||
66
packages/browseros-agent/apps/eval/src/viewer/viewer-manifest.ts
vendored
Normal file
66
packages/browseros-agent/apps/eval/src/viewer/viewer-manifest.ts
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
import type { GraderResult } from '../types'
|
||||
|
||||
// Per-task data supplied by the publisher when building a manifest.
export interface ViewerManifestTaskInput {
  queryId: string
  query: string
  startUrl?: string
  status: string
  durationMs: number
  screenshotCount: number
  graderResults: Record<string, GraderResult>
}

// Task entry as written into the manifest: the input fields plus the
// relative artifact paths the viewer fetches, all rooted at tasks/<queryId>/.
export interface ViewerManifestTask extends ViewerManifestTaskInput {
  paths: {
    attempt: string
    metadata: string
    messages: string
    trace: string
    grades: string
    screenshots: string // directory prefix, not a single file
    graderArtifacts: string // directory prefix, not a single file
  }
}

// Top-level index consumed by the static R2 viewer.
export interface ViewerManifest {
  runId: string
  suiteId: string
  variantId: string
  uploadedAt?: string // presumably an ISO timestamp set at publish time — confirm
  summary: Record<string, unknown>
  tasks: ViewerManifestTask[]
}

// Arguments accepted by buildViewerManifest().
export interface BuildViewerManifestInput {
  runId: string
  suiteId: string
  variantId: string
  uploadedAt?: string
  summary: Record<string, unknown>
  tasks: ViewerManifestTaskInput[]
}
|
||||
|
||||
/** Builds the compact JSON index consumed by the static R2 viewer. */
|
||||
export function buildViewerManifest(
|
||||
input: BuildViewerManifestInput,
|
||||
): ViewerManifest {
|
||||
return {
|
||||
runId: input.runId,
|
||||
suiteId: input.suiteId,
|
||||
variantId: input.variantId,
|
||||
uploadedAt: input.uploadedAt,
|
||||
summary: input.summary,
|
||||
tasks: input.tasks.map((task) => ({
|
||||
...task,
|
||||
paths: {
|
||||
attempt: `tasks/${task.queryId}/attempt.json`,
|
||||
metadata: `tasks/${task.queryId}/metadata.json`,
|
||||
messages: `tasks/${task.queryId}/messages.jsonl`,
|
||||
trace: `tasks/${task.queryId}/trace.jsonl`,
|
||||
grades: `tasks/${task.queryId}/grades.json`,
|
||||
screenshots: `tasks/${task.queryId}/screenshots`,
|
||||
graderArtifacts: `tasks/${task.queryId}/grader-artifacts`,
|
||||
},
|
||||
})),
|
||||
}
|
||||
}
|
||||
69
packages/browseros-agent/apps/eval/tests/agents/clado-actions.test.ts
vendored
Normal file
69
packages/browseros-agent/apps/eval/tests/agents/clado-actions.test.ts
vendored
Normal file
@@ -0,0 +1,69 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import {
|
||||
extractCladoThinking,
|
||||
formatCladoHistory,
|
||||
getCladoActionSignature,
|
||||
parseCladoActions,
|
||||
} from '../../src/agents/orchestrated/backends/clado/clado-actions'
|
||||
import type { CladoActionResponse } from '../../src/agents/orchestrated/backends/clado/types'
|
||||
|
||||
// Unit tests for the Clado action-parsing helpers.
// NOTE(review): the expected literals below define the parser contract
// (answer-tag merging, dedup, thinking extraction, history formatting) —
// keep them byte-exact.
describe('Clado action parsing', () => {
  it('merges the structured response with the first raw answer block', () => {
    const prediction: CladoActionResponse = {
      action: 'click',
      x: 800,
      raw_response:
        '<answer>{"action":"click","x":100,"y":200}</answer><answer>{"action":"press_key","key":"Enter"}</answer>',
    }

    // Structured fields (x: 800) override the first raw <answer> block;
    // subsequent blocks are appended as-is.
    expect(parseCladoActions(prediction)).toEqual([
      { action: 'click', x: 800, y: 200 },
      { action: 'press_key', key: 'Enter' },
    ])
  })

  it('returns no action for malformed or missing action payloads', () => {
    expect(
      parseCladoActions({
        action: null,
        raw_response: '<answer>{"x":100}</answer><answer>bad json</answer>',
      }),
    ).toEqual([])
  })

  it('deduplicates repeated raw actions after the primary action', () => {
    const prediction: CladoActionResponse = {
      raw_response: [
        '<answer>{"action":"click","x":100,"y":200}</answer>',
        '<answer>{"action":"click","x":100,"y":200}</answer>',
        '<answer>{"action":"type","text":"hello"}</answer>',
      ].join(''),
    }

    expect(parseCladoActions(prediction)).toEqual([
      { action: 'click', x: 100, y: 200 },
      { action: 'type', text: 'hello' },
    ])
  })

  it('extracts compact thinking text from raw model output', () => {
    // Multiple <thinking> blocks are trimmed and joined with a single space.
    expect(
      extractCladoThinking(
        '<thinking> first\\n thought </thinking><thinking>second thought</thinking>',
      ),
    ).toBe('first\\n thought second thought')
  })

  it('formats history and signatures using the existing trajectory shape', () => {
    const actions = [
      { action: 'click', x: 100, y: 200 },
      { action: 'type', text: "can't" },
      { action: 'scroll', direction: 'down', amount: 500 },
    ]

    expect(formatCladoHistory(actions)).toBe(
      "click(100, 200) -> type('can\\'t') -> scroll(down)",
    )
    expect(getCladoActionSignature(actions[0])).toBe('click:100:200')
  })
})
|
||||
45
packages/browseros-agent/apps/eval/tests/agents/clado-browser-driver.test.ts
vendored
Normal file
45
packages/browseros-agent/apps/eval/tests/agents/clado-browser-driver.test.ts
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import {
|
||||
prepareCladoToolArgs,
|
||||
resolveCladoPoint,
|
||||
} from '../../src/agents/orchestrated/backends/clado/clado-browser-driver'
|
||||
|
||||
// Unit tests for Clado coordinate mapping and MCP tool-argument shaping.
describe('Clado browser driver helpers', () => {
  it('maps normalized coordinates into the current viewport', () => {
    // Clado emits 0–1000 normalized coordinates; (500, 500) is the center.
    expect(resolveCladoPoint({ width: 1440, height: 900 }, 500, 500)).toEqual({
      x: 720,
      y: 450,
    })
  })

  it('clamps normalized coordinates before mapping to pixels', () => {
    // Out-of-range inputs clamp to [0, width-1] / [0, height-1].
    expect(resolveCladoPoint({ width: 1000, height: 800 }, -10, 1200)).toEqual({
      x: 0,
      y: 799,
    })
  })

  it('keeps the current evaluate_script argument conversion', () => {
    // A `function` source is wrapped into an immediately-invoked expression.
    expect(
      prepareCladoToolArgs(
        'evaluate_script',
        { function: '() => window.location.href' },
        7,
      ),
    ).toEqual({
      expression: '(() => window.location.href)()',
      page: 7,
    })
  })

  it('normalizes click_at and adds page for page-scoped tools', () => {
    // dblClick: true becomes clickCount: 2; the page id is injected.
    expect(
      prepareCladoToolArgs('click_at', { x: 10, y: 20, dblClick: true }, 3),
    ).toEqual({
      x: 10,
      y: 20,
      clickCount: 2,
      page: 3,
    })
  })
})
|
||||
45
packages/browseros-agent/apps/eval/tests/agents/executor-backend.test.ts
vendored
Normal file
45
packages/browseros-agent/apps/eval/tests/agents/executor-backend.test.ts
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import {
|
||||
backendKindForProvider,
|
||||
createExecutorBackend,
|
||||
} from '../../src/agents/orchestrated/backends/create-executor-backend'
|
||||
import type { ExecutorBackend } from '../../src/agents/orchestrated/executor-backend'
|
||||
|
||||
// Tests for the executor-backend seam: provider -> backend-kind mapping and
// pass-through of the ExecutorBackend interface.
describe('executor backend boundary', () => {
  it('selects Clado only for the Clado action provider', () => {
    expect(backendKindForProvider('clado-action')).toBe('clado')
    expect(backendKindForProvider('openai-compatible')).toBe('tool-loop')
  })

  it('forwards execution and step state through the backend interface', async () => {
    const signal = new AbortController().signal
    // Fake backend verifies the instruction and abort signal arrive intact.
    const fakeBackend: ExecutorBackend = {
      kind: 'tool-loop',
      async execute(instruction, receivedSignal) {
        expect(instruction).toBe('Click checkout')
        expect(receivedSignal).toBe(signal)
        return {
          observation: 'Clicked checkout',
          status: 'done',
          url: 'https://example.test/checkout',
          actionsPerformed: 2,
          toolsUsed: ['browser_click_element'],
        }
      },
      async close() {},
      getTotalSteps() {
        return 2
      },
    }

    const backend = createExecutorBackend({
      backendKind: 'tool-loop',
      executor: fakeBackend,
    })
    const result = await backend.execute('Click checkout', signal)

    expect(result.observation).toBe('Clicked checkout')
    expect(result.actionsPerformed).toBe(2)
    expect(backend.getTotalSteps()).toBe(2)
  })
})
|
||||
64
packages/browseros-agent/apps/eval/tests/cli/args.test.ts
vendored
Normal file
64
packages/browseros-agent/apps/eval/tests/cli/args.test.ts
vendored
Normal file
@@ -0,0 +1,64 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { parseEvalCliArgs } from '../../src/cli/args'
|
||||
|
||||
describe('parseEvalCliArgs', () => {
|
||||
it('parses the workflow-compatible suite config command', () => {
|
||||
expect(
|
||||
parseEvalCliArgs([
|
||||
'suite',
|
||||
'--config',
|
||||
'configs/legacy/browseros-agent-weekly.json',
|
||||
'--publish',
|
||||
'r2',
|
||||
]),
|
||||
).toEqual({
|
||||
command: 'suite',
|
||||
configPath: 'configs/legacy/browseros-agent-weekly.json',
|
||||
publishTarget: 'r2',
|
||||
})
|
||||
})
|
||||
|
||||
it('parses suite variant and model options', () => {
|
||||
expect(
|
||||
parseEvalCliArgs([
|
||||
'suite',
|
||||
'--suite',
|
||||
'configs/suites/agisdk-daily-10.json',
|
||||
'--variant',
|
||||
'kimi-fireworks',
|
||||
'--provider',
|
||||
'openai-compatible',
|
||||
'--model',
|
||||
'accounts/fireworks/models/kimi-k2p5',
|
||||
'--base-url',
|
||||
'https://api.fireworks.ai/inference/v1',
|
||||
]),
|
||||
).toEqual({
|
||||
command: 'suite',
|
||||
suitePath: 'configs/suites/agisdk-daily-10.json',
|
||||
variantId: 'kimi-fireworks',
|
||||
provider: 'openai-compatible',
|
||||
model: 'accounts/fireworks/models/kimi-k2p5',
|
||||
baseUrl: 'https://api.fireworks.ai/inference/v1',
|
||||
})
|
||||
})
|
||||
|
||||
it('keeps the old config shorthand as legacy config mode', () => {
|
||||
expect(
|
||||
parseEvalCliArgs(['-c', 'configs/legacy/browseros-agent-weekly.json']),
|
||||
).toEqual({
|
||||
command: 'legacy',
|
||||
configPath: 'configs/legacy/browseros-agent-weekly.json',
|
||||
})
|
||||
})
|
||||
|
||||
it('rejects missing required command options with targeted errors', () => {
|
||||
expect(() => parseEvalCliArgs(['run'])).toThrow(
|
||||
'run requires --config or --suite',
|
||||
)
|
||||
expect(() => parseEvalCliArgs(['grade'])).toThrow('grade requires --run')
|
||||
expect(() =>
|
||||
parseEvalCliArgs(['publish', '--run', 'results/run-1']),
|
||||
).toThrow('publish requires --target')
|
||||
})
|
||||
})
|
||||
115
packages/browseros-agent/apps/eval/tests/cli/suite-command.test.ts
vendored
Normal file
115
packages/browseros-agent/apps/eval/tests/cli/suite-command.test.ts
vendored
Normal file
@@ -0,0 +1,115 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { mkdtemp, writeFile } from 'node:fs/promises'
|
||||
import { tmpdir } from 'node:os'
|
||||
import { basename, join } from 'node:path'
|
||||
import {
|
||||
resolveSuiteCommand,
|
||||
runSuiteCommand,
|
||||
} from '../../src/cli/commands/suite'
|
||||
import type { RunEvalOptions } from '../../src/runner/types'
|
||||
|
||||
async function writeTempSuite(): Promise<{ dir: string; suitePath: string }> {
|
||||
const dir = await mkdtemp(join(tmpdir(), 'eval-suite-cli-'))
|
||||
const suitePath = join(dir, 'agisdk-daily-10.json')
|
||||
await writeFile(
|
||||
suitePath,
|
||||
JSON.stringify(
|
||||
{
|
||||
id: 'agisdk-daily-10',
|
||||
dataset: 'tasks.jsonl',
|
||||
agent: { type: 'single' },
|
||||
graders: ['agisdk_state_diff'],
|
||||
workers: 2,
|
||||
restartBrowserPerTask: true,
|
||||
browseros: {
|
||||
server_url: 'http://127.0.0.1:9110',
|
||||
headless: true,
|
||||
},
|
||||
},
|
||||
null,
|
||||
2,
|
||||
),
|
||||
)
|
||||
await writeFile(join(dir, 'tasks.jsonl'), '')
|
||||
return { dir, suitePath }
|
||||
}
|
||||
|
||||
describe('suite command', () => {
|
||||
it('resolves an existing config through the config adapter', async () => {
|
||||
const resolved = await resolveSuiteCommand({
|
||||
configPath: 'apps/eval/configs/legacy/browseros-agent-weekly.json',
|
||||
env: {},
|
||||
})
|
||||
|
||||
expect(resolved.kind).toBe('config')
|
||||
expect(resolved.suite.id).toBe('browseros-agent-weekly')
|
||||
expect(resolved.evalConfig.dataset).toBe(
|
||||
'../../data/webbench-2of4-50.jsonl',
|
||||
)
|
||||
expect(resolved.variant.publicMetadata.agent.apiKeyConfigured).toBe(true)
|
||||
})
|
||||
|
||||
it('resolves a suite file and variant into a runnable eval config', async () => {
|
||||
const { dir, suitePath } = await writeTempSuite()
|
||||
|
||||
const resolved = await resolveSuiteCommand({
|
||||
suitePath,
|
||||
variantId: 'kimi-fireworks',
|
||||
provider: 'openai-compatible',
|
||||
model: 'accounts/fireworks/models/kimi-k2p5',
|
||||
apiKey: 'test-key',
|
||||
baseUrl: 'https://api.fireworks.ai/inference/v1',
|
||||
env: {},
|
||||
})
|
||||
|
||||
expect(resolved.kind).toBe('suite')
|
||||
expect(resolved.suite.id).toBe('agisdk-daily-10')
|
||||
expect(resolved.datasetPath).toBe(join(dir, 'tasks.jsonl'))
|
||||
expect(resolved.evalConfig.agent).toMatchObject({
|
||||
type: 'single',
|
||||
provider: 'openai-compatible',
|
||||
model: 'accounts/fireworks/models/kimi-k2p5',
|
||||
apiKey: 'test-key',
|
||||
baseUrl: 'https://api.fireworks.ai/inference/v1',
|
||||
})
|
||||
expect(resolved.evalConfig.num_workers).toBe(2)
|
||||
})
|
||||
|
||||
it('runs config and suite commands through the runner dependency', async () => {
|
||||
const calls: RunEvalOptions[] = []
|
||||
await runSuiteCommand(
|
||||
{
|
||||
configPath: 'apps/eval/configs/legacy/browseros-agent-weekly.json',
|
||||
env: {},
|
||||
},
|
||||
{
|
||||
runEval: async (options) => {
|
||||
calls.push(options)
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
const { suitePath } = await writeTempSuite()
|
||||
await runSuiteCommand(
|
||||
{
|
||||
suitePath,
|
||||
model: 'moonshotai/kimi-k2.5',
|
||||
provider: 'openai-compatible',
|
||||
env: {},
|
||||
},
|
||||
{
|
||||
runEval: async (options) => {
|
||||
calls.push(options)
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
expect(calls).toHaveLength(2)
|
||||
expect(calls[0].configPath.endsWith('browseros-agent-weekly.json')).toBe(
|
||||
true,
|
||||
)
|
||||
expect(basename(calls[1].configPath)).toBe('agisdk-daily-10.json')
|
||||
expect(calls[1].config).toBeDefined()
|
||||
expect(calls[1].dataPath?.endsWith('tasks.jsonl')).toBe(true)
|
||||
})
|
||||
})
|
||||
79
packages/browseros-agent/apps/eval/tests/grading/agisdk-artifacts.test.ts
vendored
Normal file
79
packages/browseros-agent/apps/eval/tests/grading/agisdk-artifacts.test.ts
vendored
Normal file
@@ -0,0 +1,79 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { mkdtemp, readFile } from 'node:fs/promises'
|
||||
import { tmpdir } from 'node:os'
|
||||
import { join } from 'node:path'
|
||||
import { AgisdkStateDiffGrader } from '../../src/graders/benchmark/agisdk-state-diff'
|
||||
import type { GraderInput } from '../../src/grading/types'
|
||||
|
||||
describe('AgisdkStateDiffGrader artifacts', () => {
|
||||
it('writes finish state and evaluator artifacts', async () => {
|
||||
const dir = await mkdtemp(join(tmpdir(), 'agisdk-artifacts-'))
|
||||
const grader = new AgisdkStateDiffGrader()
|
||||
const internals = grader as unknown as {
|
||||
fetchFinishState(
|
||||
origin: string,
|
||||
endpoint: string,
|
||||
): Promise<Record<string, unknown>>
|
||||
runPythonEvaluator(input: unknown): Promise<{
|
||||
output: {
|
||||
reward: number
|
||||
pass: boolean
|
||||
message: string
|
||||
per_criterion: unknown[]
|
||||
}
|
||||
stderr: string
|
||||
}>
|
||||
}
|
||||
|
||||
internals.fetchFinishState = async () => ({ cart: [{ name: 'Soup' }] })
|
||||
internals.runPythonEvaluator = async () => ({
|
||||
output: {
|
||||
reward: 0,
|
||||
pass: false,
|
||||
message: 'Missing entree',
|
||||
per_criterion: [{ passed: false, detail: 'entree missing' }],
|
||||
},
|
||||
stderr: 'criterion log',
|
||||
})
|
||||
|
||||
const input: GraderInput = {
|
||||
task: {
|
||||
query_id: 'agisdk-dashdish-10',
|
||||
query: 'Order dinner',
|
||||
dataset: 'agisdk',
|
||||
},
|
||||
messages: [],
|
||||
screenshotCount: 0,
|
||||
finalAnswer: 'done',
|
||||
taskArtifactDir: dir,
|
||||
outputDir: dir,
|
||||
mcpUrl: 'http://127.0.0.1:9110/mcp',
|
||||
}
|
||||
|
||||
const result = await grader.grade(input)
|
||||
|
||||
expect(result.pass).toBe(false)
|
||||
expect(
|
||||
JSON.parse(
|
||||
await readFile(
|
||||
join(dir, 'grader-artifacts/agisdk_state_diff/finish-state.json'),
|
||||
'utf-8',
|
||||
),
|
||||
),
|
||||
).toEqual({ cart: [{ name: 'Soup' }] })
|
||||
expect(
|
||||
JSON.parse(
|
||||
await readFile(
|
||||
join(dir, 'grader-artifacts/agisdk_state_diff/evaluator-output.json'),
|
||||
'utf-8',
|
||||
),
|
||||
),
|
||||
).toMatchObject({ message: 'Missing entree' })
|
||||
expect(
|
||||
await readFile(
|
||||
join(dir, 'grader-artifacts/agisdk_state_diff/stderr.txt'),
|
||||
'utf-8',
|
||||
),
|
||||
).toContain('criterion log')
|
||||
})
|
||||
})
|
||||
56
packages/browseros-agent/apps/eval/tests/grading/grader-registry.test.ts
vendored
Normal file
56
packages/browseros-agent/apps/eval/tests/grading/grader-registry.test.ts
vendored
Normal file
@@ -0,0 +1,56 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { createGrader } from '../../src/grading/grader-registry'
|
||||
import { runConfiguredGraders } from '../../src/grading/grader-runner'
|
||||
import type { Grader, GraderInput } from '../../src/grading/types'
|
||||
|
||||
const fixtureInput: GraderInput = {
|
||||
task: {
|
||||
query_id: 'task-1',
|
||||
query: 'Do the thing',
|
||||
dataset: 'fixture',
|
||||
},
|
||||
messages: [],
|
||||
screenshotCount: 0,
|
||||
finalAnswer: null,
|
||||
taskArtifactDir: '/tmp/task-1',
|
||||
outputDir: '/tmp/task-1',
|
||||
}
|
||||
|
||||
describe('grader registry', () => {
|
||||
it('creates all current graders behind the shared interface', () => {
|
||||
expect(createGrader('agisdk_state_diff')?.name).toBe('agisdk_state_diff')
|
||||
expect(createGrader('infinity_state')?.name).toBe('infinity_state')
|
||||
expect(createGrader('performance_grader')?.name).toBe('performance_grader')
|
||||
})
|
||||
})
|
||||
|
||||
describe('runConfiguredGraders', () => {
|
||||
it('records one grader failure without aborting other graders', async () => {
|
||||
const passing: Grader = {
|
||||
name: 'passing',
|
||||
async grade() {
|
||||
return { score: 1, pass: true, reasoning: 'ok' }
|
||||
},
|
||||
}
|
||||
const failing: Grader = {
|
||||
name: 'failing',
|
||||
async grade() {
|
||||
throw new Error('grader exploded')
|
||||
},
|
||||
}
|
||||
|
||||
const results = await runConfiguredGraders(
|
||||
['failing', 'passing'],
|
||||
fixtureInput,
|
||||
{
|
||||
createGrader(name) {
|
||||
return name === 'failing' ? failing : passing
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
expect(results.failing.pass).toBe(false)
|
||||
expect(results.failing.reasoning).toContain('grader exploded')
|
||||
expect(results.passing.pass).toBe(true)
|
||||
})
|
||||
})
|
||||
67
packages/browseros-agent/apps/eval/tests/grading/infinity-artifacts.test.ts
vendored
Normal file
67
packages/browseros-agent/apps/eval/tests/grading/infinity-artifacts.test.ts
vendored
Normal file
@@ -0,0 +1,67 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { mkdtemp, readFile } from 'node:fs/promises'
|
||||
import { tmpdir } from 'node:os'
|
||||
import { join } from 'node:path'
|
||||
import { InfinityStateGrader } from '../../src/graders/benchmark/infinity-state'
|
||||
import type { GraderInput } from '../../src/grading/types'
|
||||
|
||||
describe('InfinityStateGrader artifacts', () => {
|
||||
it('writes verifier and evaluator artifacts', async () => {
|
||||
const dir = await mkdtemp(join(tmpdir(), 'infinity-artifacts-'))
|
||||
const oldInfinityDir = process.env.WEBARENA_INFINITY_DIR
|
||||
process.env.WEBARENA_INFINITY_DIR = '/tmp/webarena-infinity'
|
||||
|
||||
try {
|
||||
const grader = new InfinityStateGrader()
|
||||
const internals = grader as unknown as {
|
||||
runPythonEvaluator(input: unknown): Promise<{
|
||||
output: { pass: boolean; reward: number; message: string }
|
||||
stderr: string
|
||||
}>
|
||||
}
|
||||
internals.runPythonEvaluator = async () => ({
|
||||
output: { pass: true, reward: 1, message: 'verified' },
|
||||
stderr: 'verifier log',
|
||||
})
|
||||
|
||||
const input: GraderInput = {
|
||||
task: {
|
||||
query_id: 'infinity-elation-prescriptions-task_h69',
|
||||
query: 'Verify the app state',
|
||||
dataset: 'webarena-infinity',
|
||||
},
|
||||
messages: [],
|
||||
screenshotCount: 0,
|
||||
finalAnswer: null,
|
||||
taskArtifactDir: dir,
|
||||
outputDir: dir,
|
||||
infinityAppUrl: 'http://127.0.0.1:8123',
|
||||
}
|
||||
|
||||
const result = await grader.grade(input)
|
||||
|
||||
expect(result.pass).toBe(true)
|
||||
expect(
|
||||
JSON.parse(
|
||||
await readFile(
|
||||
join(dir, 'grader-artifacts/infinity_state/verifier.json'),
|
||||
'utf-8',
|
||||
),
|
||||
),
|
||||
).toMatchObject({
|
||||
appName: 'elation-prescriptions',
|
||||
appServerUrl: 'http://127.0.0.1:8123',
|
||||
})
|
||||
expect(
|
||||
JSON.parse(
|
||||
await readFile(
|
||||
join(dir, 'grader-artifacts/infinity_state/evaluator-output.json'),
|
||||
'utf-8',
|
||||
),
|
||||
),
|
||||
).toMatchObject({ message: 'verified' })
|
||||
} finally {
|
||||
process.env.WEBARENA_INFINITY_DIR = oldInfinityDir
|
||||
}
|
||||
})
|
||||
})
|
||||
92
packages/browseros-agent/apps/eval/tests/grading/performance-artifacts.test.ts
vendored
Normal file
92
packages/browseros-agent/apps/eval/tests/grading/performance-artifacts.test.ts
vendored
Normal file
@@ -0,0 +1,92 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises'
|
||||
import { tmpdir } from 'node:os'
|
||||
import { join } from 'node:path'
|
||||
import { PerformanceGrader } from '../../src/graders/performance/performance-grader'
|
||||
import type { GraderInput } from '../../src/grading/types'
|
||||
|
||||
describe('PerformanceGrader artifacts', () => {
|
||||
it('writes metrics, agent output, and axes artifacts', async () => {
|
||||
const dir = await mkdtemp(join(tmpdir(), 'performance-artifacts-'))
|
||||
await mkdir(join(dir, 'screenshots'))
|
||||
await writeFile(
|
||||
join(dir, 'metadata.json'),
|
||||
JSON.stringify({ termination_reason: 'completed' }),
|
||||
)
|
||||
|
||||
const grader = new PerformanceGrader(undefined, undefined, 'claude-test')
|
||||
const internals = grader as unknown as {
|
||||
runAgent(
|
||||
systemPrompt: string,
|
||||
userPrompt: string,
|
||||
outputDir: string,
|
||||
): Promise<{
|
||||
type: 'result'
|
||||
subtype: string
|
||||
result: string
|
||||
total_cost_usd: number
|
||||
num_turns: number
|
||||
structured_output: unknown
|
||||
}>
|
||||
}
|
||||
internals.runAgent = async () => ({
|
||||
type: 'result',
|
||||
subtype: 'success',
|
||||
result: 'ok',
|
||||
total_cost_usd: 0.01,
|
||||
num_turns: 2,
|
||||
structured_output: {
|
||||
axes: [{ axis: 'task_completion', score: 90, reasoning: 'completed' }],
|
||||
},
|
||||
})
|
||||
|
||||
const input: GraderInput = {
|
||||
task: {
|
||||
query_id: 'task-1',
|
||||
query: 'Find the answer',
|
||||
dataset: 'fixture',
|
||||
},
|
||||
messages: [
|
||||
{
|
||||
type: 'tool-input-available',
|
||||
timestamp: '2026-04-29T00:00:00.000Z',
|
||||
toolCallId: 'call-1',
|
||||
toolName: 'browser_get_page_content',
|
||||
input: {},
|
||||
},
|
||||
],
|
||||
screenshotCount: 1,
|
||||
finalAnswer: 'answer',
|
||||
taskArtifactDir: dir,
|
||||
outputDir: dir,
|
||||
}
|
||||
|
||||
const result = await grader.grade(input)
|
||||
|
||||
expect(result.details?.model).toBe('claude-test')
|
||||
expect(
|
||||
JSON.parse(
|
||||
await readFile(
|
||||
join(dir, 'grader-artifacts/performance_grader/metrics.json'),
|
||||
'utf-8',
|
||||
),
|
||||
),
|
||||
).toMatchObject({ totalToolCalls: 1 })
|
||||
expect(
|
||||
JSON.parse(
|
||||
await readFile(
|
||||
join(dir, 'grader-artifacts/performance_grader/axes.json'),
|
||||
'utf-8',
|
||||
),
|
||||
),
|
||||
).toMatchObject({ task_completion: { score: 90 } })
|
||||
expect(
|
||||
JSON.parse(
|
||||
await readFile(
|
||||
join(dir, 'grader-artifacts/performance_grader/agent-output.json'),
|
||||
'utf-8',
|
||||
),
|
||||
),
|
||||
).toMatchObject({ subtype: 'success' })
|
||||
})
|
||||
})
|
||||
66
packages/browseros-agent/apps/eval/tests/grading/python-evaluator.test.ts
vendored
Normal file
66
packages/browseros-agent/apps/eval/tests/grading/python-evaluator.test.ts
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { mkdtemp, writeFile } from 'node:fs/promises'
|
||||
import { tmpdir } from 'node:os'
|
||||
import { join } from 'node:path'
|
||||
import { runPythonJsonEvaluator } from '../../src/grading/python-evaluator'
|
||||
|
||||
async function writeScript(source: string): Promise<string> {
|
||||
const dir = await mkdtemp(join(tmpdir(), 'eval-python-'))
|
||||
const script = join(dir, 'script.py')
|
||||
await writeFile(script, source)
|
||||
return script
|
||||
}
|
||||
|
||||
describe('runPythonJsonEvaluator', () => {
|
||||
it('sends JSON on stdin, captures stderr, and parses stdout JSON', async () => {
|
||||
const script = await writeScript(`
|
||||
import json, sys
|
||||
data = json.loads(sys.stdin.read())
|
||||
print("warning", file=sys.stderr)
|
||||
print(json.dumps({"ok": True, "value": data["value"]}))
|
||||
`)
|
||||
|
||||
const result = await runPythonJsonEvaluator<{ ok: boolean; value: number }>(
|
||||
{
|
||||
scriptPath: script,
|
||||
input: { value: 42 },
|
||||
timeoutMs: 5_000,
|
||||
},
|
||||
)
|
||||
|
||||
expect(result.output).toEqual({ ok: true, value: 42 })
|
||||
expect(result.stderr).toContain('warning')
|
||||
expect(result.exitCode).toBe(0)
|
||||
})
|
||||
|
||||
it('reports non-zero exits with stderr', async () => {
|
||||
const script = await writeScript(`
|
||||
import sys
|
||||
print("bad verifier", file=sys.stderr)
|
||||
sys.exit(3)
|
||||
`)
|
||||
|
||||
await expect(
|
||||
runPythonJsonEvaluator({
|
||||
scriptPath: script,
|
||||
input: {},
|
||||
timeoutMs: 5_000,
|
||||
}),
|
||||
).rejects.toThrow('bad verifier')
|
||||
})
|
||||
|
||||
it('enforces timeouts', async () => {
|
||||
const script = await writeScript(`
|
||||
import time
|
||||
time.sleep(5)
|
||||
`)
|
||||
|
||||
await expect(
|
||||
runPythonJsonEvaluator({
|
||||
scriptPath: script,
|
||||
input: {},
|
||||
timeoutMs: 50,
|
||||
}),
|
||||
).rejects.toThrow('timed out')
|
||||
})
|
||||
})
|
||||
21
packages/browseros-agent/apps/eval/tests/grading/python-script-layout.test.ts
vendored
Normal file
21
packages/browseros-agent/apps/eval/tests/grading/python-script-layout.test.ts
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { stat } from 'node:fs/promises'
|
||||
import { resolve } from 'node:path'
|
||||
|
||||
async function exists(path: string): Promise<boolean> {
|
||||
return !!(await stat(path).catch(() => null))
|
||||
}
|
||||
|
||||
describe('grader python script layout', () => {
|
||||
it('keeps runtime evaluator scripts next to the grader implementation', async () => {
|
||||
const pythonDir = resolve(import.meta.dir, '../../src/graders/python')
|
||||
const scriptsDir = resolve(import.meta.dir, '../../scripts')
|
||||
|
||||
expect(await exists(resolve(pythonDir, 'agisdk-evaluate.py'))).toBe(true)
|
||||
expect(await exists(resolve(pythonDir, 'infinity-evaluate.py'))).toBe(true)
|
||||
expect(await exists(resolve(scriptsDir, 'agisdk-evaluate.py'))).toBe(false)
|
||||
expect(await exists(resolve(scriptsDir, 'infinity-evaluate.py'))).toBe(
|
||||
false,
|
||||
)
|
||||
})
|
||||
})
|
||||
193
packages/browseros-agent/apps/eval/tests/publishing/r2-publisher.test.ts
vendored
Normal file
193
packages/browseros-agent/apps/eval/tests/publishing/r2-publisher.test.ts
vendored
Normal file
@@ -0,0 +1,193 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { mkdir, mkdtemp, readFile, rename, writeFile } from 'node:fs/promises'
|
||||
import { tmpdir } from 'node:os'
|
||||
import { join } from 'node:path'
|
||||
import {
|
||||
contentTypeForPath,
|
||||
R2Publisher,
|
||||
} from '../../src/publishing/r2-publisher'
|
||||
|
||||
class FakeR2Client {
|
||||
readonly puts: Record<string, unknown>[] = []
|
||||
readonly existing = new Set<string>()
|
||||
|
||||
async send(command: { input: Record<string, unknown> }): Promise<unknown> {
|
||||
const key = command.input.Key as string
|
||||
if ('Body' in command.input) {
|
||||
this.puts.push(command.input)
|
||||
return {}
|
||||
}
|
||||
if (this.existing.has(key)) return {}
|
||||
throw new Error('not found')
|
||||
}
|
||||
}
|
||||
|
||||
async function writeRunFixture(
|
||||
root: string,
|
||||
configName = 'browseros-agent-weekly',
|
||||
timestamp = '2026-04-29-1200',
|
||||
): Promise<{ runDir: string; runId: string }> {
|
||||
const runDir = join(root, configName, timestamp)
|
||||
const taskDir = join(runDir, 'task-1')
|
||||
await mkdir(join(taskDir, 'screenshots'), { recursive: true })
|
||||
await writeFile(
|
||||
join(taskDir, 'metadata.json'),
|
||||
JSON.stringify({
|
||||
query_id: 'task-1',
|
||||
dataset: 'webbench',
|
||||
query: 'Find pricing',
|
||||
start_url: 'https://example.test',
|
||||
termination_reason: 'completed',
|
||||
total_duration_ms: 1200,
|
||||
screenshot_count: 1,
|
||||
agent_config: { type: 'single', model: 'kimi' },
|
||||
grader_results: {
|
||||
performance_grader: { score: 1, pass: true, reasoning: 'ok' },
|
||||
},
|
||||
}),
|
||||
)
|
||||
await writeFile(join(taskDir, 'messages.jsonl'), '{"type":"user"}\n')
|
||||
await writeFile(join(taskDir, 'grades.json'), '{"ok":true}')
|
||||
await writeFile(join(taskDir, 'screenshots', '1.png'), 'png')
|
||||
await writeFile(
|
||||
join(runDir, 'summary.json'),
|
||||
JSON.stringify({ passRate: 1, avgDurationMs: 1200 }),
|
||||
)
|
||||
return { runDir, runId: `${configName}-${timestamp}` }
|
||||
}
|
||||
|
||||
describe('R2Publisher', () => {
|
||||
it('maps artifact file extensions to viewer-compatible content types', () => {
|
||||
expect(contentTypeForPath('metadata.json')).toBe('application/json')
|
||||
expect(contentTypeForPath('messages.jsonl')).toBe('application/x-ndjson')
|
||||
expect(contentTypeForPath('screenshots/1.png')).toBe('image/png')
|
||||
expect(contentTypeForPath('viewer.html')).toBe('text/html')
|
||||
})
|
||||
|
||||
it('uploads run artifacts, manifest, and viewer html', async () => {
|
||||
const dir = await mkdtemp(join(tmpdir(), 'eval-r2-'))
|
||||
const { runDir, runId } = await writeRunFixture(dir)
|
||||
const viewerPath = join(dir, 'viewer.html')
|
||||
await writeFile(viewerPath, '<html>viewer</html>')
|
||||
const client = new FakeR2Client()
|
||||
|
||||
const result = await new R2Publisher({
|
||||
client,
|
||||
viewerPath,
|
||||
config: {
|
||||
accountId: 'acct',
|
||||
accessKeyId: 'key',
|
||||
secretAccessKey: 'secret',
|
||||
bucket: 'bucket',
|
||||
cdnBaseUrl: 'https://eval.example.test',
|
||||
},
|
||||
now: () => new Date('2026-04-29T12:00:00.000Z'),
|
||||
}).publishRun(runDir, runId)
|
||||
|
||||
const byKey = new Map(client.puts.map((put) => [put.Key, put]))
|
||||
expect(byKey.get(`runs/${runId}/task-1/metadata.json`)?.ContentType).toBe(
|
||||
'application/json',
|
||||
)
|
||||
expect(byKey.get(`runs/${runId}/task-1/messages.jsonl`)?.ContentType).toBe(
|
||||
'application/x-ndjson',
|
||||
)
|
||||
expect(
|
||||
byKey.get(`runs/${runId}/task-1/screenshots/1.png`)?.ContentType,
|
||||
).toBe('image/png')
|
||||
expect(byKey.get(`runs/${runId}/manifest.json`)?.ContentType).toBe(
|
||||
'application/json',
|
||||
)
|
||||
expect(byKey.get(`runs/${runId}/summary.json`)?.ContentType).toBe(
|
||||
'application/json',
|
||||
)
|
||||
expect(byKey.get('viewer.html')?.ContentType).toBe('text/html')
|
||||
expect(result.viewerUrl).toBe(
|
||||
`https://eval.example.test/viewer.html?run=${runId}`,
|
||||
)
|
||||
|
||||
const manifest = JSON.parse(
|
||||
Buffer.from(
|
||||
byKey.get(`runs/${runId}/manifest.json`)?.Body as Buffer,
|
||||
).toString('utf-8'),
|
||||
)
|
||||
expect(manifest).toMatchObject({
|
||||
runId,
|
||||
uploadedAt: '2026-04-29T12:00:00.000Z',
|
||||
dataset: 'webbench',
|
||||
summary: { passRate: 1, avgDurationMs: 1200 },
|
||||
tasks: [
|
||||
{
|
||||
queryId: 'task-1',
|
||||
status: 'completed',
|
||||
screenshotCount: 1,
|
||||
},
|
||||
],
|
||||
})
|
||||
})
|
||||
|
||||
it('publishes unuploaded runs from a config results directory', async () => {
|
||||
const dir = await mkdtemp(join(tmpdir(), 'eval-r2-config-'))
|
||||
const first = await writeRunFixture(dir, 'weekly', '2026-04-29-1200')
|
||||
const second = await writeRunFixture(dir, 'weekly', '2026-04-30-1200')
|
||||
const viewerPath = join(dir, 'viewer.html')
|
||||
await writeFile(viewerPath, '<html>viewer</html>')
|
||||
const client = new FakeR2Client()
|
||||
client.existing.add(`runs/${first.runId}/manifest.json`)
|
||||
|
||||
const result = await new R2Publisher({
|
||||
client,
|
||||
viewerPath,
|
||||
config: {
|
||||
accountId: 'acct',
|
||||
accessKeyId: 'key',
|
||||
secretAccessKey: 'secret',
|
||||
bucket: 'bucket',
|
||||
cdnBaseUrl: 'https://eval.example.test',
|
||||
},
|
||||
now: () => new Date('2026-04-29T12:00:00.000Z'),
|
||||
}).publishPath(join(dir, 'weekly'))
|
||||
|
||||
expect(result.uploadedRuns.map((run) => run.runId)).toEqual([second.runId])
|
||||
expect(
|
||||
client.puts.some(
|
||||
(put) => put.Key === `runs/${first.runId}/manifest.json`,
|
||||
),
|
||||
).toBe(false)
|
||||
expect(
|
||||
client.puts.some(
|
||||
(put) => put.Key === `runs/${second.runId}/manifest.json`,
|
||||
),
|
||||
).toBe(true)
|
||||
|
||||
await expect(
|
||||
readFile(join(second.runDir, 'summary.json'), 'utf-8'),
|
||||
).resolves.toContain('passRate')
|
||||
})
|
||||
|
||||
it('recognizes and publishes canonical tasks directory runs', async () => {
|
||||
const dir = await mkdtemp(join(tmpdir(), 'eval-r2-tasks-'))
|
||||
const { runDir, runId } = await writeRunFixture(dir)
|
||||
await mkdir(join(runDir, 'tasks'), { recursive: true })
|
||||
await rename(join(runDir, 'task-1'), join(runDir, 'tasks', 'task-1'))
|
||||
const viewerPath = join(dir, 'viewer.html')
|
||||
await writeFile(viewerPath, '<html>viewer</html>')
|
||||
const client = new FakeR2Client()
|
||||
|
||||
const result = await new R2Publisher({
|
||||
client,
|
||||
viewerPath,
|
||||
config: {
|
||||
accountId: 'acct',
|
||||
accessKeyId: 'key',
|
||||
secretAccessKey: 'secret',
|
||||
bucket: 'bucket',
|
||||
cdnBaseUrl: 'https://eval.example.test',
|
||||
},
|
||||
}).publishPath(runDir)
|
||||
|
||||
const keys = client.puts.map((put) => put.Key)
|
||||
expect(result.uploadedRuns.map((run) => run.runId)).toEqual([runId])
|
||||
expect(keys).toContain(`runs/${runId}/task-1/metadata.json`)
|
||||
expect(keys).toContain(`runs/${runId}/tasks/task-1/metadata.json`)
|
||||
})
|
||||
})
|
||||
82
packages/browseros-agent/apps/eval/tests/runs/artifact-paths.test.ts
vendored
Normal file
82
packages/browseros-agent/apps/eval/tests/runs/artifact-paths.test.ts
vendored
Normal file
@@ -0,0 +1,82 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { mkdtemp, readFile } from 'node:fs/promises'
|
||||
import { tmpdir } from 'node:os'
|
||||
import { join } from 'node:path'
|
||||
import { TrajectorySaver } from '../../src/capture/trajectory-saver'
|
||||
import { createRunId, getRunPaths } from '../../src/runs/artifact-paths'
|
||||
import type { TaskMetadata } from '../../src/types'
|
||||
|
||||
describe('artifact paths', () => {
|
||||
it('creates stable safe run ids', () => {
|
||||
const runId = createRunId(
|
||||
'agisdk/daily 10',
|
||||
'kimi fire?',
|
||||
new Date('2026-04-29T06:00:00Z'),
|
||||
)
|
||||
|
||||
expect(runId).toBe('agisdk-daily-10__kimi-fire__2026-04-29-0600')
|
||||
})
|
||||
|
||||
it('returns run and task artifact paths', () => {
|
||||
const paths = getRunPaths('results', 'run-1', 'task-1')
|
||||
|
||||
expect(paths.runDir).toBe(join('results', 'runs', 'run-1'))
|
||||
expect(paths.runManifest).toBe(join('results', 'runs', 'run-1', 'run.json'))
|
||||
expect(paths.viewerManifest).toBe(
|
||||
join('results', 'runs', 'run-1', 'viewer-manifest.json'),
|
||||
)
|
||||
expect(paths.messages).toBe(
|
||||
join('results', 'runs', 'run-1', 'tasks', 'task-1', 'messages.jsonl'),
|
||||
)
|
||||
expect(paths.graderArtifacts).toBe(
|
||||
join('results', 'runs', 'run-1', 'tasks', 'task-1', 'grader-artifacts'),
|
||||
)
|
||||
})
|
||||
})
|
||||
|
||||
describe('TrajectorySaver artifact compatibility', () => {
|
||||
it('keeps metadata.json and writes attempt and grades artifacts', async () => {
|
||||
const dir = await mkdtemp(join(tmpdir(), 'eval-artifacts-'))
|
||||
const saver = new TrajectorySaver(dir, 'task-1')
|
||||
const taskDir = await saver.init()
|
||||
const metadata: TaskMetadata = {
|
||||
query_id: 'task-1',
|
||||
dataset: 'fixture',
|
||||
query: 'Do the task',
|
||||
started_at: '2026-04-29T00:00:00.000Z',
|
||||
completed_at: '2026-04-29T00:00:01.000Z',
|
||||
total_duration_ms: 1000,
|
||||
total_steps: 1,
|
||||
screenshot_count: 1,
|
||||
termination_reason: 'completed',
|
||||
final_answer: 'done',
|
||||
errors: [],
|
||||
warnings: [],
|
||||
agent_config: { type: 'single', model: 'model' },
|
||||
grader_results: {},
|
||||
}
|
||||
|
||||
await saver.saveMetadata(metadata)
|
||||
await saver.saveAttempt({ status: 'completed', taskId: 'task-1' })
|
||||
await saver.saveGrades({
|
||||
performance_grader: { score: 1, pass: true, reasoning: 'ok' },
|
||||
})
|
||||
|
||||
expect(
|
||||
JSON.parse(await readFile(join(taskDir, 'metadata.json'), 'utf-8')),
|
||||
).toMatchObject({
|
||||
query_id: 'task-1',
|
||||
})
|
||||
expect(
|
||||
JSON.parse(await readFile(join(taskDir, 'attempt.json'), 'utf-8')),
|
||||
).toEqual({
|
||||
status: 'completed',
|
||||
taskId: 'task-1',
|
||||
})
|
||||
expect(
|
||||
JSON.parse(await readFile(join(taskDir, 'grades.json'), 'utf-8')),
|
||||
).toMatchObject({
|
||||
performance_grader: { pass: true },
|
||||
})
|
||||
})
|
||||
})
|
||||
21
packages/browseros-agent/apps/eval/tests/runs/pipeline-compat.test.ts
vendored
Normal file
21
packages/browseros-agent/apps/eval/tests/runs/pipeline-compat.test.ts
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { runEval as oldRunEval } from '../../src/runner/eval-runner'
|
||||
import { ParallelExecutor } from '../../src/runner/parallel-executor'
|
||||
import { TaskExecutor } from '../../src/runner/task-executor'
|
||||
import { runEval } from '../../src/runs/eval-runner'
|
||||
import { TaskRunPipeline } from '../../src/runs/task-run-pipeline'
|
||||
import { TaskWorkerPool } from '../../src/runs/task-worker-pool'
|
||||
|
||||
describe('runner naming compatibility', () => {
|
||||
it('exports new runner-layer names', () => {
|
||||
expect(TaskWorkerPool.name).toBe('TaskWorkerPool')
|
||||
expect(TaskRunPipeline.name).toBe('TaskRunPipeline')
|
||||
expect(typeof runEval).toBe('function')
|
||||
})
|
||||
|
||||
it('keeps old runner imports working', () => {
|
||||
expect(ParallelExecutor).toBe(TaskWorkerPool)
|
||||
expect(TaskExecutor).toBe(TaskRunPipeline)
|
||||
expect(oldRunEval).toBe(runEval)
|
||||
})
|
||||
})
|
||||
40
packages/browseros-agent/apps/eval/tests/runs/run-manifest.test.ts
vendored
Normal file
40
packages/browseros-agent/apps/eval/tests/runs/run-manifest.test.ts
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { buildRunManifest } from '../../src/runs/run-manifest'
|
||||
|
||||
describe('buildRunManifest', () => {
|
||||
it('captures reproducibility fields without raw secrets', () => {
|
||||
const manifest = buildRunManifest({
|
||||
runId: 'agisdk-daily-10__kimi__2026-04-29-0600',
|
||||
suiteId: 'agisdk-daily-10',
|
||||
variant: {
|
||||
id: 'kimi',
|
||||
agent: {
|
||||
provider: 'openai-compatible',
|
||||
model: 'moonshotai/kimi-k2.5',
|
||||
apiKey: 'secret-value',
|
||||
baseUrl: 'https://api.example.com/v1',
|
||||
},
|
||||
publicMetadata: {
|
||||
id: 'kimi',
|
||||
agent: {
|
||||
provider: 'openai-compatible',
|
||||
model: 'moonshotai/kimi-k2.5',
|
||||
baseUrlHost: 'api.example.com',
|
||||
apiKeyConfigured: true,
|
||||
apiKeyEnv: 'EVAL_AGENT_API_KEY',
|
||||
},
|
||||
},
|
||||
},
|
||||
datasetPath: 'apps/eval/data/agisdk-real.jsonl',
|
||||
datasetHash: 'sha256:abc',
|
||||
graders: ['agisdk_state_diff'],
|
||||
gitSha: 'abc123',
|
||||
browserosVersion: 'BrowserOS 1.0.0',
|
||||
startedAt: '2026-04-29T06:00:00.000Z',
|
||||
})
|
||||
|
||||
expect(manifest.variant.agent.baseUrlHost).toBe('api.example.com')
|
||||
expect(manifest.dataset.hash).toBe('sha256:abc')
|
||||
expect(JSON.stringify(manifest)).not.toContain('secret-value')
|
||||
})
|
||||
})
|
||||
37
packages/browseros-agent/apps/eval/tests/suites/config-adapter.test.ts
vendored
Normal file
37
packages/browseros-agent/apps/eval/tests/suites/config-adapter.test.ts
vendored
Normal file
@@ -0,0 +1,37 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { adaptEvalConfigFile } from '../../src/suites/config-adapter'
|
||||
|
||||
describe('adaptEvalConfigFile', () => {
|
||||
it('preserves browseros-agent-weekly config semantics', async () => {
|
||||
const adapted = await adaptEvalConfigFile(
|
||||
'apps/eval/configs/legacy/browseros-agent-weekly.json',
|
||||
)
|
||||
|
||||
expect(adapted.suite.id).toBe('browseros-agent-weekly')
|
||||
expect(adapted.suite.dataset).toBe('../../data/webbench-2of4-50.jsonl')
|
||||
expect(adapted.suite.graders).toEqual(['performance_grader'])
|
||||
expect(adapted.suite.workers).toBe(10)
|
||||
expect(adapted.suite.restartBrowserPerTask).toBe(true)
|
||||
expect(adapted.suite.timeoutMs).toBe(1_800_000)
|
||||
expect(adapted.evalConfig.num_workers).toBe(10)
|
||||
expect(adapted.evalConfig.browseros.server_url).toBe(
|
||||
'http://127.0.0.1:9110',
|
||||
)
|
||||
})
|
||||
|
||||
it('keeps API key env names public while omitting secret values', async () => {
|
||||
const adapted = await adaptEvalConfigFile(
|
||||
'apps/eval/configs/legacy/browseros-agent-weekly.json',
|
||||
{
|
||||
env: { OPENROUTER_API_KEY: 'secret-openrouter-value' },
|
||||
},
|
||||
)
|
||||
|
||||
expect(adapted.variant.publicMetadata.agent.apiKeyEnv).toBe(
|
||||
'OPENROUTER_API_KEY',
|
||||
)
|
||||
expect(JSON.stringify(adapted.variant.publicMetadata)).not.toContain(
|
||||
'secret-openrouter-value',
|
||||
)
|
||||
})
|
||||
})
|
||||
92
packages/browseros-agent/apps/eval/tests/suites/schema.test.ts
vendored
Normal file
92
packages/browseros-agent/apps/eval/tests/suites/schema.test.ts
vendored
Normal file
@@ -0,0 +1,92 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { readFile } from 'node:fs/promises'
|
||||
import { loadSuite } from '../../src/suites/load-suite'
|
||||
import { resolveVariant } from '../../src/suites/resolve-variant'
|
||||
import { EvalSuiteSchema } from '../../src/suites/schema'
|
||||
|
||||
describe('EvalSuiteSchema', () => {
|
||||
it('validates suite settings used by the eval pipeline', () => {
|
||||
const suite = EvalSuiteSchema.parse({
|
||||
id: 'agisdk-daily-10',
|
||||
dataset: 'data/agisdk-daily-10.jsonl',
|
||||
agent: {
|
||||
type: 'orchestrated',
|
||||
executorBackend: 'tool-loop',
|
||||
},
|
||||
graders: ['agisdk_state_diff'],
|
||||
workers: 4,
|
||||
restartBrowserPerTask: true,
|
||||
timeoutMs: 1_800_000,
|
||||
})
|
||||
|
||||
expect(suite.id).toBe('agisdk-daily-10')
|
||||
expect(suite.agent.type).toBe('orchestrated')
|
||||
expect(suite.agent.executorBackend).toBe('tool-loop')
|
||||
expect(suite.workers).toBe(4)
|
||||
})
|
||||
|
||||
it('rejects suites without a dataset', () => {
|
||||
const parsed = EvalSuiteSchema.safeParse({
|
||||
id: 'bad-suite',
|
||||
agent: { type: 'tool-loop' },
|
||||
graders: ['performance_grader'],
|
||||
})
|
||||
|
||||
expect(parsed.success).toBe(false)
|
||||
})
|
||||
|
||||
it('validates the daily AGISDK 10-task suite', async () => {
|
||||
const loaded = await loadSuite(
|
||||
'apps/eval/configs/suites/agisdk-daily-10.json',
|
||||
)
|
||||
const lines = (await readFile(loaded.datasetPath, 'utf-8'))
|
||||
.trim()
|
||||
.split('\n')
|
||||
|
||||
expect(loaded.suite.id).toBe('agisdk-daily-10')
|
||||
expect(loaded.suite.graders).toEqual(['agisdk_state_diff'])
|
||||
expect(loaded.suite.workers).toBe(1)
|
||||
expect(lines).toHaveLength(10)
|
||||
expect(JSON.parse(lines[0]).query_id).toBe('agisdk-dashdish-10')
|
||||
expect(JSON.parse(lines[9]).query_id).toBe('agisdk-zilloft-6')
|
||||
})
|
||||
})
|
||||
|
||||
describe('resolveVariant', () => {
|
||||
it('prefers CLI values over env values and does not expose raw API keys', () => {
|
||||
const variant = resolveVariant({
|
||||
variantId: 'cli-variant',
|
||||
provider: 'anthropic',
|
||||
model: 'claude-test',
|
||||
apiKey: 'cli-secret',
|
||||
baseUrl: 'https://cli.example/v1',
|
||||
env: {
|
||||
EVAL_VARIANT: 'env-variant',
|
||||
EVAL_AGENT_PROVIDER: 'openai-compatible',
|
||||
EVAL_AGENT_MODEL: 'env-model',
|
||||
EVAL_AGENT_API_KEY: 'env-secret',
|
||||
EVAL_AGENT_BASE_URL: 'https://env.example/v1',
|
||||
},
|
||||
})
|
||||
|
||||
expect(variant.id).toBe('cli-variant')
|
||||
expect(variant.agent.provider).toBe('anthropic')
|
||||
expect(variant.agent.model).toBe('claude-test')
|
||||
expect(variant.agent.apiKey).toBe('cli-secret')
|
||||
expect(variant.publicMetadata.agent.apiKeyConfigured).toBe(true)
|
||||
expect(JSON.stringify(variant.publicMetadata)).not.toContain('cli-secret')
|
||||
expect(JSON.stringify(variant.publicMetadata)).not.toContain('env-secret')
|
||||
})
|
||||
|
||||
it('fails clearly when credentials are required but missing', () => {
|
||||
expect(() =>
|
||||
resolveVariant({
|
||||
variantId: 'missing-key',
|
||||
provider: 'openai-compatible',
|
||||
model: 'kimi',
|
||||
env: {},
|
||||
requireApiKey: true,
|
||||
}),
|
||||
).toThrow('EVAL_AGENT_API_KEY')
|
||||
})
|
||||
})
|
||||
41
packages/browseros-agent/apps/eval/tests/viewer/viewer-manifest.test.ts
vendored
Normal file
41
packages/browseros-agent/apps/eval/tests/viewer/viewer-manifest.test.ts
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { buildViewerManifest } from '../../src/viewer/viewer-manifest'
|
||||
|
||||
describe('buildViewerManifest', () => {
|
||||
it('indexes task artifacts for the R2 viewer', () => {
|
||||
const manifest = buildViewerManifest({
|
||||
runId: 'run-1',
|
||||
suiteId: 'agisdk-daily-10',
|
||||
variantId: 'kimi',
|
||||
uploadedAt: '2026-04-29T06:00:00.000Z',
|
||||
summary: { total: 1, passRate: 0 },
|
||||
tasks: [
|
||||
{
|
||||
queryId: 'agisdk-dashdish-4',
|
||||
query: 'Schedule a delivery order',
|
||||
startUrl: 'https://evals-dashdish.vercel.app',
|
||||
status: 'completed',
|
||||
durationMs: 353_000,
|
||||
screenshotCount: 42,
|
||||
graderResults: {
|
||||
agisdk_state_diff: {
|
||||
score: 0,
|
||||
pass: false,
|
||||
reasoning: 'Missing checkout item',
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
})
|
||||
|
||||
expect(manifest.tasks[0].paths.messages).toBe(
|
||||
'tasks/agisdk-dashdish-4/messages.jsonl',
|
||||
)
|
||||
expect(manifest.tasks[0].paths.screenshots).toBe(
|
||||
'tasks/agisdk-dashdish-4/screenshots',
|
||||
)
|
||||
expect(manifest.tasks[0].paths.graderArtifacts).toBe(
|
||||
'tasks/agisdk-dashdish-4/grader-artifacts',
|
||||
)
|
||||
})
|
||||
})
|
||||
Reference in New Issue
Block a user