Compare commits

...

17 Commits

Author SHA1 Message Date
Nikhil Sonti
6ee306236e chore(eval): colocate grader python evaluators 2026-04-29 17:16:58 -07:00
Nikhil Sonti
0afc59cda1 chore(eval): organize config layouts 2026-04-29 17:01:25 -07:00
Nikhil Sonti
eb8faa931a docs(eval): explain suites and variants 2026-04-29 16:38:54 -07:00
Nikhil Sonti
be70170313 docs(eval): add env example 2026-04-29 16:10:27 -07:00
Nikhil Sonti
0661197f5b fix: address review feedback for PR #875 2026-04-29 16:00:56 -07:00
Nikhil Sonti
c4e7824266 chore(eval): verify pipeline refactor 2026-04-29 15:47:09 -07:00
Nikhil Sonti
22f71a36c5 docs(eval): document suite pipeline 2026-04-29 15:45:27 -07:00
Nikhil Sonti
d49986d0b3 ci(eval): migrate weekly workflow to eval cli 2026-04-29 15:43:56 -07:00
Nikhil Sonti
acdd394585 feat(eval): add r2 publisher module 2026-04-29 15:42:58 -07:00
Nikhil Sonti
219fdf1e28 feat(eval): add workflow compatible cli 2026-04-29 15:40:05 -07:00
Nikhil Sonti
014f71d227 refactor(eval): split clado backend 2026-04-29 15:34:09 -07:00
Nikhil Sonti
876dea4d56 refactor(eval): add executor backend boundary 2026-04-29 15:28:56 -07:00
Nikhil Sonti
fca7d4cbcb refactor(eval): rename runner layers 2026-04-29 15:27:12 -07:00
Nikhil Sonti
e1bfadb075 feat(eval): persist grader artifacts 2026-04-29 15:25:42 -07:00
Nikhil Sonti
aa0d9b96ef refactor(eval): add shared grader contract 2026-04-29 15:23:41 -07:00
Nikhil Sonti
1c9604b5fa feat(eval): add stable run artifacts 2026-04-29 15:22:10 -07:00
Nikhil Sonti
685266a1d8 feat(eval): add suite variant config bridge 2026-04-29 15:20:45 -07:00
80 changed files with 4920 additions and 1974 deletions

View File

@@ -14,7 +14,7 @@ on:
config:
description: 'Eval config file (relative to apps/eval/)'
required: false
default: 'configs/browseros-agent-weekly.json'
default: 'configs/legacy/browseros-agent-weekly.json'
permissions:
contents: read
@@ -62,36 +62,27 @@ jobs:
curl -sL -o /tmp/nopecha.zip https://github.com/NopeCHALLC/nopecha-extension/releases/latest/download/chromium_automation.zip
unzip -qo /tmp/nopecha.zip -d extensions/nopecha
- name: Run eval
- name: Run eval and publish to R2
working-directory: packages/browseros-agent/apps/eval
env:
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
BROWSEROS_BINARY: /usr/bin/browseros
WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
# OpenClaw container runtime is macOS-only; opt the Linux runner
# into the no-op stub so the server can boot and the eval can run.
BROWSEROS_SKIP_OPENCLAW: '1'
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
run: |
echo "Running eval with config: $EVAL_CONFIG"
xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts -c "$EVAL_CONFIG"
- name: Upload runs to R2
if: success()
working-directory: packages/browseros-agent/apps/eval
env:
EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
BROWSEROS_BINARY: /usr/bin/browseros
WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
# OpenClaw container runtime is macOS-only; opt the Linux runner
# into the no-op stub so the server can boot and the eval can run.
BROWSEROS_SKIP_OPENCLAW: '1'
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
run: |
CONFIG_NAME=$(basename "$EVAL_CONFIG" .json)
bun scripts/upload-run.ts "results/$CONFIG_NAME"
echo "Running eval with config: $EVAL_CONFIG"
xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts suite --config "$EVAL_CONFIG" --publish r2
- name: Generate trend report
if: success()

View File

@@ -0,0 +1,51 @@
# Copy to .env.development for local eval runs.
# Provider keys used by existing config files.
OPENROUTER_API_KEY=
FIREWORKS_API_KEY=
ANTHROPIC_API_KEY=
OPENAI_API_KEY=
GOOGLE_GENERATIVE_AI_API_KEY=
# Claude Agent SDK token used by performance_grader.
CLAUDE_CODE_OAUTH_TOKEN=
# Suite-mode model selection.
EVAL_VARIANT=local
EVAL_AGENT_PROVIDER=openai-compatible
EVAL_AGENT_MODEL=
EVAL_AGENT_API_KEY=
EVAL_AGENT_BASE_URL=
EVAL_AGENT_SUPPORTS_IMAGES=true
# Optional suite-mode executor override for orchestrator suites.
EVAL_EXECUTOR_MODEL=
EVAL_EXECUTOR_API_KEY=
EVAL_EXECUTOR_BASE_URL=
# Clado visual action executor.
CLADO_ACTION_MODEL=
CLADO_ACTION_API_KEY=
CLADO_ACTION_BASE_URL=
# Backward-compatible alias used by older local scripts.
CLADO_ACTION_URL=
# BrowserOS runner.
BROWSEROS_BINARY=/Applications/BrowserOS.app/Contents/MacOS/BrowserOS
BROWSEROS_SERVER_URL=http://127.0.0.1:9110
BROWSEROS_SERVER_LOG_DIR=/tmp/browseros-server-logs
BROWSEROS_CONFIG_URL=
# Captcha solver extension.
NOPECHA_API_KEY=
# WebArena-Infinity.
WEBARENA_INFINITY_DIR=
INFINITY_APP_URL=
# R2 publishing and weekly report.
EVAL_R2_ACCOUNT_ID=
EVAL_R2_ACCESS_KEY_ID=
EVAL_R2_SECRET_ACCESS_KEY=
EVAL_R2_BUCKET=browseros-eval
EVAL_R2_CDN_BASE_URL=https://eval.browseros.com

View File

@@ -14,6 +14,7 @@ Evaluation framework for BrowserOS browser automation agents. Runs tasks from st
```bash
cd apps/eval
cp .env.example .env.development
# Edit .env.development with your keys, then:
bun run eval
```
@@ -23,11 +24,55 @@ Opens the eval dashboard at `http://localhost:9900` in config mode. From there:
### CLI mode
```bash
bun run eval -c configs/browseros-agent-weekly.json
bun run eval -c configs/legacy/browseros-agent-weekly.json
bun run eval suite --config configs/legacy/browseros-agent-weekly.json --publish r2
```
Runs immediately. Dashboard still available at `http://localhost:9900` for live progress.
The `suite` command is the workflow-compatible full loop: execute tasks, run graders, write artifacts, and optionally publish to R2. The old `-c` form remains supported during migration.
```bash
bun run eval run --config configs/legacy/browseros-agent-weekly.json
bun run eval suite --suite configs/suites/agisdk-daily-10.json --variant kimi-fireworks --publish r2
bun run eval grade --run results/browseros-agent-weekly/2026-04-29-1430
bun run eval publish --run results/browseros-agent-weekly/2026-04-29-1430 --target r2
```
Config files live in two groups:
```txt
configs/legacy/ # Complete EvalConfig files used by older workflows and the dashboard
configs/suites/ # Suite definitions; model/provider comes from CLI flags or env
```
Suite mode takes model settings from CLI flags first, then env:
```bash
EVAL_VARIANT=kimi-fireworks \
EVAL_AGENT_PROVIDER=openai-compatible \
EVAL_AGENT_MODEL=accounts/fireworks/models/kimi-k2p5 \
EVAL_AGENT_API_KEY=$FIREWORKS_API_KEY \
EVAL_AGENT_BASE_URL=https://api.fireworks.ai/inference/v1 \
bun run eval suite --suite configs/suites/agisdk-daily-10.json --publish r2
```
### Suites and variants
A **suite** is what we run: the task dataset, graders, worker count, timeout, and browser settings. For example, `agisdk-daily-10` means "run these 10 AGI SDK tasks and grade them with `agisdk_state_diff`."
A **variant** is the model setup we are testing on that suite. `EVAL_VARIANT` is just the human-readable name for that setup. The actual model connection still comes from `EVAL_AGENT_PROVIDER`, `EVAL_AGENT_MODEL`, `EVAL_AGENT_API_KEY`, and `EVAL_AGENT_BASE_URL`.
This lets us run the same suite against multiple model setups without copying the benchmark config:
```txt
agisdk-daily-10 + kimi-fireworks
agisdk-daily-10 + claude-sonnet
agisdk-daily-10 + clado-action-000159
```
For `orchestrator-executor` suites, there can also be an executor model/backend. The `EVAL_AGENT_*` vars describe the main agent or orchestrator. The optional `EVAL_EXECUTOR_*` or `CLADO_ACTION_*` vars describe the delegated executor.
## Agent types
| Type | Description |
@@ -96,6 +141,20 @@ The `apiKey` field supports two formats:
- **Env var name**: `"OPENAI_API_KEY"` — resolved from `.env.development` at runtime
- **Direct value**: `"sk-xxxxx"` — used as-is (not recommended)
### Environment variables
| Variable | Used for |
|----------|----------|
| `EVAL_AGENT_PROVIDER`, `EVAL_AGENT_MODEL`, `EVAL_AGENT_API_KEY`, `EVAL_AGENT_BASE_URL`, `EVAL_AGENT_SUPPORTS_IMAGES` | Suite variant model selection |
| `FIREWORKS_API_KEY`, `OPENROUTER_API_KEY`, `ANTHROPIC_API_KEY`, provider-specific keys | Config-file or provider-backed model calls |
| `EVAL_EXECUTOR_MODEL`, `EVAL_EXECUTOR_API_KEY`, `EVAL_EXECUTOR_BASE_URL` | Suite-mode orchestrator executor override |
| `CLADO_ACTION_MODEL`, `CLADO_ACTION_API_KEY`, `CLADO_ACTION_BASE_URL` | Clado executor defaults |
| `BROWSEROS_BINARY` | BrowserOS binary path in CI/local smoke runs |
| `BROWSEROS_SERVER_URL` | Optional grader MCP URL override |
| `WEBARENA_INFINITY_DIR` | Local WebArena-Infinity checkout for Infinity tasks |
| `NOPECHA_API_KEY` | CAPTCHA solver extension |
| `EVAL_R2_ACCOUNT_ID`, `EVAL_R2_ACCESS_KEY_ID`, `EVAL_R2_SECRET_ACCESS_KEY`, `EVAL_R2_BUCKET`, `EVAL_R2_CDN_BASE_URL` | R2 upload and viewer URL |
### Supported providers
| Provider | `provider` value | Requires `baseUrl` |
@@ -110,6 +169,20 @@ The `apiKey` field supports two formats:
| Ollama | `ollama` | No |
| Clado Action (executor only) | `clado-action` | Yes |
### R2 publishing
`suite --config ... --publish r2` and `publish --target r2` upload the run artifacts plus `viewer.html` to the viewer-compatible R2 layout:
```bash
export EVAL_R2_ACCOUNT_ID=...
export EVAL_R2_ACCESS_KEY_ID=...
export EVAL_R2_SECRET_ACCESS_KEY=...
export EVAL_R2_BUCKET=browseros-eval
export EVAL_R2_CDN_BASE_URL=https://eval.browseros.com
```
Published runs are available at `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
### BrowserOS infrastructure
```json
@@ -137,6 +210,7 @@ Each worker gets its own Chrome instance. Worker N uses `base_port + N` for CDP
| File | Tasks | Description |
|------|-------|-------------|
| `agisdk-daily-10.jsonl` | 10 | Daily AGI SDK / REAL Bench subset |
| `webvoyager.jsonl` | 643 | Full WebVoyager benchmark |
| `mind2web.jsonl` | 300 | Online-Mind2Web |
| `webbench-{0,1,2}of4-50.jsonl` | 50 each | WebBench shards (50-task subsets) |
@@ -168,14 +242,19 @@ results/
browseros-agent-weekly/
2026-04-29-1430/
Amazon--0/
attempt.json # Stable attempt summary for viewer/reporting
metadata.json # Task result, timing, grader scores
grades.json # Compact grader results
messages.jsonl # Full message log
grader-artifacts/ # Grader-specific inputs/outputs/stderr
screenshots/
001.png # Step-by-step screenshots
002.png
summary.json # Aggregate pass rates
```
R2 publishing preserves the same task files under `runs/<run-id>/...`, writes `runs/<run-id>/manifest.json`, and uploads `viewer.html` at the bucket root. The viewer URL is `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
## Troubleshooting
**BrowserOS not found**: Expects `/Applications/BrowserOS.app/Contents/MacOS/BrowserOS`. Set `BROWSEROS_BINARY` to override.

View File

@@ -7,7 +7,7 @@
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
},
"dataset": "../data/agisdk-real-smoke.jsonl",
"dataset": "../../data/agisdk-real-smoke.jsonl",
"num_workers": 1,
"restart_server_per_task": true,
"browseros": {

View File

@@ -7,7 +7,7 @@
"baseUrl": "https://api.fireworks.ai/inference/v1",
"supportsImages": true
},
"dataset": "../data/agisdk-real.jsonl",
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 4,
"restart_server_per_task": true,
"browseros": {

View File

@@ -7,7 +7,7 @@
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
},
"dataset": "../data/webbench-2of4-50.jsonl",
"dataset": "../../data/webbench-2of4-50.jsonl",
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {

View File

@@ -14,7 +14,7 @@
"baseUrl": "https://api.fireworks.ai/inference/v1"
}
},
"dataset": "../data/webbench-2of4-50.jsonl",
"dataset": "../../data/webbench-2of4-50.jsonl",
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {

View File

@@ -14,7 +14,7 @@
"baseUrl": "https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run"
}
},
"dataset": "../data/agisdk-real.jsonl",
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {

View File

@@ -7,7 +7,7 @@
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
},
"dataset": "../data/webarena-infinity-hard-50.jsonl",
"dataset": "../../data/webarena-infinity-hard-50.jsonl",
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {

View File

@@ -5,7 +5,7 @@
"model": "openai/gpt-4.1",
"apiKey": "OPENROUTER_API_KEY"
},
"dataset": "../data/mind2web.jsonl",
"dataset": "../../data/mind2web.jsonl",
"num_workers": 5,
"restart_server_per_task": true,
"browseros": {

View File

@@ -7,7 +7,7 @@
"baseUrl": "https://api.fireworks.ai/inference/v1",
"supportsImages": true
},
"dataset": "../data/webvoyager.jsonl",
"dataset": "../../data/webvoyager.jsonl",
"num_workers": 3,
"restart_server_per_task": true,
"browseros": {

View File

@@ -0,0 +1,22 @@
{
"id": "agisdk-daily-10",
"dataset": "../../data/agisdk-daily-10.jsonl",
"agent": {
"type": "single"
},
"graders": ["agisdk_state_diff"],
"workers": 1,
"restartBrowserPerTask": true,
"timeoutMs": 1800000,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": true
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
}
}

View File

@@ -0,0 +1,22 @@
{
"id": "agisdk-real-smoke",
"dataset": "../../data/agisdk-real-smoke.jsonl",
"agent": {
"type": "single"
},
"graders": ["agisdk_state_diff"],
"workers": 1,
"restartBrowserPerTask": true,
"timeoutMs": 1800000,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
}
}

View File

@@ -0,0 +1,22 @@
{
"id": "agisdk-real",
"dataset": "../../data/agisdk-real.jsonl",
"agent": {
"type": "single"
},
"graders": ["agisdk_state_diff"],
"workers": 1,
"restartBrowserPerTask": true,
"timeoutMs": 1800000,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
}
}

View File

@@ -0,0 +1,10 @@
{"query_id": "agisdk-dashdish-10", "dataset": "agisdk-real", "query": "Place an order from \"Souvla\" for a \"Medium Classic Cheeseburger\" and a \"Small Bacon Double Cheeseburger\" with \"Standard Delivery\" as the method with the default charged options.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-10", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
{"query_id": "agisdk-fly-unified-5", "dataset": "agisdk-real", "query": "Find me the cheapest fare for a flight from Orlando to Milwaukee on December 5th, 2024 and book it.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/30, Security Code: 420 Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-5", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "United Airlines"}}}
{"query_id": "agisdk-udriver-10", "dataset": "agisdk-real", "query": "Order me a ride for 4pm, I'll be at the de Young muesum headed to the Waterbar, fanciest option possible please.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-10", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
{"query_id": "agisdk-udriver-9", "dataset": "agisdk-real", "query": "Book me a ride from the thai restaurant I last took a ride to for later today at 2pm, I'll be at 333 Apartments on Fremont", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-9", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-9", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Uber"}}}
{"query_id": "agisdk-topwork-4", "dataset": "agisdk-real", "query": "Create a job post for a UI/UX Designer with expertise in Figma, Sketch, and Adobe Creative Suite, including project details, timeline, and required skills (Wireframing, Prototyping, Responsive Design).", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-4", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
{"query_id": "agisdk-gocalendar-4", "dataset": "agisdk-real", "query": "Change the \"Team Check-In\" event on July 18, 2024, name to \"Project Kickoff\" and update the location to \"Zoom\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-4", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
{"query_id": "agisdk-staynb-6", "dataset": "agisdk-real", "query": "Find and book the stay with the best value for money (cheapest stay with the best reviews) for 1 day. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-6", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-6", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
{"query_id": "agisdk-udriver-11", "dataset": "agisdk-real", "query": "I need to go from Pacific Catch on Chestnut back home to 333 Fremont now. If the fancy version is within ten dollars of the regular one, book that.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-11", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-11", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
{"query_id": "agisdk-networkin-5", "dataset": "agisdk-real", "query": "Send a connection request to John Smith.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-5", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-5", "challenge_type": "action", "difficulty": "easy", "similar_to": "LinkedIn"}}}
{"query_id": "agisdk-zilloft-6", "dataset": "agisdk-real", "query": "Select a property listed in San Francisco as \"Condos\" within a price range under $300,000 and request a tour for tomorrow at 4:00 PM. Use these contact details: Name: Sarah Brown, Email: sarahbrown@example.com, Phone: 555-987-6543.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-6", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "Zillow"}}}

View File

@@ -1,349 +1,43 @@
#!/usr/bin/env bun
/**
* Upload eval runs to R2.
*
* Two modes:
* bun scripts/upload-run.ts results/browseros-agent-weekly/2026-03-21-1730
* → uploads that specific run
*
* bun scripts/upload-run.ts results/browseros-agent-weekly
* → finds all timestamped subfolders, uploads any not yet in R2
*
* Env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY
* EVAL_R2_BUCKET (default: browseros-eval)
* EVAL_R2_CDN_BASE_URL (default: https://eval.browseros.com)
*/
import { readdir, readFile, stat } from 'node:fs/promises'
import { basename, dirname, extname, join } from 'node:path'
import {
GetObjectCommand,
PutObjectCommand,
S3Client,
} from '@aws-sdk/client-s3'
loadR2ConfigFromEnv,
R2Publisher,
} from '../src/publishing/r2-publisher'
const CONCURRENCY = 20
const CONTENT_TYPES: Record<string, string> = {
'.json': 'application/json',
'.jsonl': 'application/x-ndjson',
'.png': 'image/png',
}
interface R2Config {
accountId: string
accessKeyId: string
secretAccessKey: string
bucket: string
cdnBaseUrl: string
}
function loadConfig(): R2Config {
const accountId = process.env.EVAL_R2_ACCOUNT_ID
const accessKeyId = process.env.EVAL_R2_ACCESS_KEY_ID
const secretAccessKey = process.env.EVAL_R2_SECRET_ACCESS_KEY
if (!accountId || !accessKeyId || !secretAccessKey) {
console.error(
'Missing required env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY',
)
process.exit(1)
}
return {
accountId,
accessKeyId,
secretAccessKey,
bucket: process.env.EVAL_R2_BUCKET || 'browseros-eval',
cdnBaseUrl: (
process.env.EVAL_R2_CDN_BASE_URL || 'https://eval.browseros.com'
).replace(/\/+$/, ''),
}
}
function createClient(config: R2Config): S3Client {
return new S3Client({
region: 'auto',
endpoint: `https://${config.accountId}.r2.cloudflarestorage.com`,
credentials: {
accessKeyId: config.accessKeyId,
secretAccessKey: config.secretAccessKey,
},
})
}
async function upload(
client: S3Client,
bucket: string,
key: string,
body: Buffer,
contentType: string,
) {
await client.send(
new PutObjectCommand({
Bucket: bucket,
Key: key,
Body: body,
ContentType: contentType,
}),
)
}
async function collectFiles(dir: string): Promise<string[]> {
const files: string[] = []
const entries = await readdir(dir, { withFileTypes: true })
for (const entry of entries) {
const full = join(dir, entry.name)
if (entry.isDirectory()) {
files.push(...(await collectFiles(full)))
} else {
files.push(full)
}
}
return files
}
async function runPool<T>(
items: T[],
concurrency: number,
fn: (item: T) => Promise<void>,
) {
let i = 0
const workers = Array.from({ length: concurrency }, async () => {
while (i < items.length) {
const idx = i++
await fn(items[idx])
}
})
await Promise.all(workers)
}
// Check if a run has already been uploaded to R2
async function isUploaded(
client: S3Client,
bucket: string,
runId: string,
): Promise<boolean> {
try {
await client.send(
new GetObjectCommand({
Bucket: bucket,
Key: `runs/${runId}/manifest.json`,
}),
)
return true
} catch {
return false
}
}
// Detect if a directory is a run dir (has task subdirs with metadata.json)
// vs a config dir (has timestamped subdirs like 2026-03-21-1730/)
async function isRunDir(dir: string): Promise<boolean> {
const entries = await readdir(dir, { withFileTypes: true })
const subdirs = entries.filter((e) => e.isDirectory())
for (const subdir of subdirs) {
const metaPath = join(dir, subdir.name, 'metadata.json')
const metaStat = await stat(metaPath).catch(() => null)
if (metaStat?.isFile()) return true
}
return false
}
async function uploadSingleRun(
runDir: string,
runId: string,
r2Config: R2Config,
client: S3Client,
): Promise<void> {
const taskDirs = await readdir(runDir, { withFileTypes: true })
const taskEntries = taskDirs.filter((d) => d.isDirectory())
if (taskEntries.length === 0) {
console.warn(` No task subdirectories in ${runId}, skipping`)
return
}
const manifestTasks: Record<string, unknown>[] = []
const jobs: { key: string; filePath: string; contentType: string }[] = []
// Extract agent config from first task
let agentConfig: Record<string, unknown> | undefined
let dataset: string | undefined
for (const taskDir of taskEntries) {
const taskId = taskDir.name
const taskPath = join(runDir, taskId)
const metaPath = join(taskPath, 'metadata.json')
let meta: Record<string, unknown> = {}
try {
meta = JSON.parse(await readFile(metaPath, 'utf-8'))
} catch {
continue
}
if (!agentConfig && meta.agent_config)
agentConfig = meta.agent_config as Record<string, unknown>
if (!dataset && meta.dataset) dataset = meta.dataset as string
const files = await collectFiles(taskPath)
let screenshotCount = 0
for (const file of files) {
const relative = file.slice(taskPath.length + 1)
const ext = extname(file)
if (relative.startsWith('screenshots/') && ext === '.png')
screenshotCount++
jobs.push({
key: `runs/${runId}/${taskId}/${relative}`,
filePath: file,
contentType: CONTENT_TYPES[ext] || 'application/octet-stream',
})
}
manifestTasks.push({
queryId: meta.query_id || taskId,
query: meta.query || '',
startUrl: meta.start_url || '',
status:
meta.termination_reason === 'completed'
? 'completed'
: meta.termination_reason || 'unknown',
durationMs: meta.total_duration_ms || 0,
screenshotCount: (meta.screenshot_count as number) || screenshotCount,
graderResults: meta.grader_results || {},
})
}
if (manifestTasks.length === 0) {
console.warn(` No completed tasks in ${runId}, skipping`)
return
}
console.log(
` Uploading ${jobs.length} files across ${manifestTasks.length} tasks...`,
)
let uploaded = 0
await runPool(jobs, CONCURRENCY, async (job) => {
const body = await readFile(job.filePath)
await upload(client, r2Config.bucket, job.key, body, job.contentType)
uploaded++
if (uploaded % 50 === 0 || uploaded === jobs.length) {
console.log(` ${uploaded}/${jobs.length}`)
}
})
// Read summary.json if it exists
let summaryData: Record<string, unknown> | undefined
try {
summaryData = JSON.parse(
await readFile(join(runDir, 'summary.json'), 'utf-8'),
)
} catch {}
// Upload manifest
const manifest = {
runId,
uploadedAt: new Date().toISOString(),
agentConfig,
dataset,
summary: summaryData
? {
passRate: summaryData.passRate,
avgDurationMs: summaryData.avgDurationMs,
}
: undefined,
tasks: manifestTasks,
}
const manifestBody = Buffer.from(JSON.stringify(manifest, null, 2))
await upload(
client,
r2Config.bucket,
`runs/${runId}/manifest.json`,
manifestBody,
'application/json',
)
// Upload viewer.html to bucket root
const viewerPath = join(
import.meta.dir,
'..',
'src',
'dashboard',
'viewer.html',
)
const viewerBody = await readFile(viewerPath)
await upload(client, r2Config.bucket, 'viewer.html', viewerBody, 'text/html')
console.log(` Uploaded ${uploaded + 2} files`)
console.log(` ${r2Config.cdnBaseUrl}/viewer.html?run=${runId}`)
}
async function main() {
async function main(): Promise<void> {
const inputDir = process.argv[2]
if (!inputDir) {
console.error(
throw new Error(
'Usage:\n' +
' bun scripts/upload-run.ts results/config-name/2026-03-21-1730 (specific run)\n' +
' bun scripts/upload-run.ts results/config-name (all un-uploaded runs)',
)
process.exit(1)
}
const dirStat = await stat(inputDir).catch(() => null)
if (!dirStat?.isDirectory()) {
console.error(`Not a directory: ${inputDir}`)
process.exit(1)
}
const r2Config = loadConfig()
const client = createClient(r2Config)
if (await isRunDir(inputDir)) {
// Single run: results/config-name/2026-03-21-1730
const timestamp = basename(inputDir)
const configName = basename(dirname(inputDir))
const runId = `${configName}-${timestamp}`
console.log(`Uploading run: ${runId}`)
await uploadSingleRun(inputDir, runId, r2Config, client)
} else {
// Config dir: results/config-name/ — upload all un-uploaded runs
const configName = basename(inputDir)
const entries = await readdir(inputDir, { withFileTypes: true })
const runDirs = entries
.filter((e) => e.isDirectory())
.map((e) => e.name)
.sort()
if (runDirs.length === 0) {
console.error('No run subdirectories found')
process.exit(1)
}
console.log(
`Found ${runDirs.length} runs for config "${configName}", checking R2...`,
)
let uploadedCount = 0
for (const dir of runDirs) {
const runId = `${configName}-${dir}`
const alreadyUploaded = await isUploaded(client, r2Config.bucket, runId)
if (alreadyUploaded) {
console.log(` ${runId}: already uploaded, skipping`)
continue
}
console.log(` ${runId}: uploading...`)
await uploadSingleRun(join(inputDir, dir), runId, r2Config, client)
uploadedCount++
}
console.log(
`\nDone. Uploaded ${uploadedCount} new run(s), ${runDirs.length - uploadedCount} already in R2.`,
' bun scripts/upload-run.ts results/config-name/2026-03-21-1730\n' +
' bun scripts/upload-run.ts results/config-name',
)
}
const publisher = new R2Publisher({ config: loadR2ConfigFromEnv() })
const result = await publisher.publishPath(inputDir)
for (const run of result.uploadedRuns) {
console.log(`Uploaded ${run.uploadedFiles} files for ${run.runId}`)
console.log(run.viewerUrl)
}
for (const runId of result.skippedRuns) {
console.log(`${runId}: already uploaded, skipping`)
}
console.log(
`Done. Uploaded ${result.uploadedRuns.length} run(s), skipped ${result.skippedRuns.length}.`,
)
}
main()
main().catch((error) => {
console.error(error instanceof Error ? error.message : String(error))
process.exit(1)
})

View File

@@ -0,0 +1,191 @@
import type {
CladoAction,
CladoActionResponse,
RawCladoActionPayload,
} from './types'
/** Parses Clado's structured response plus any raw `<answer>` blocks into executable actions. */
export function parseCladoActions(
  prediction: CladoActionResponse,
): CladoAction[] {
  const rawActions = parseCladoActionsFromRawResponse(prediction.raw_response)
  const firstRaw = rawActions.length > 0 ? rawActions[0] : null
  const explicitAction =
    typeof prediction.action === 'string' ? prediction.action : null

  // Structured fields take precedence over the first raw <answer> block;
  // a missing structured `action` falls back to the raw block's action.
  const combinedPrimary = {
    ...firstRaw,
    ...prediction,
    action: explicitAction ?? firstRaw?.action,
  }

  const actions: CladoAction[] = []
  const head = normalizeCladoActionPayload(combinedPrimary)
  if (head) actions.push(head)

  // Append remaining raw actions, collapsing immediate duplicates
  // (same signature as the action just pushed).
  for (const raw of rawActions.slice(1)) {
    const next = normalizeCladoActionPayload(raw)
    if (!next) continue
    const last = actions[actions.length - 1]
    const isDuplicate =
      last !== undefined &&
      getCladoActionSignature(last) === getCladoActionSignature(next)
    if (!isDuplicate) actions.push(next)
  }
  return actions
}
/**
 * Converts a loosely-typed raw payload into a well-typed CladoAction,
 * keeping only fields of the expected primitive type.
 * Returns null when the payload has no usable `action` string.
 */
export function normalizeCladoActionPayload(
  payload: RawCladoActionPayload,
): CladoAction | null {
  // A non-string or empty action means the payload is not executable.
  if (typeof payload.action !== 'string' || payload.action === '') {
    return null
  }
  const num = (value: unknown): number | undefined =>
    typeof value === 'number' ? value : undefined
  const str = (value: unknown): string | undefined =>
    typeof value === 'string' ? value : undefined
  return {
    action: payload.action,
    x: num(payload.x),
    y: num(payload.y),
    text: str(payload.text),
    key: str(payload.key),
    direction: str(payload.direction),
    startX: num(payload.startX),
    startY: num(payload.startY),
    endX: num(payload.endX),
    endY: num(payload.endY),
    amount: num(payload.amount),
    time: num(payload.time),
    final_answer: str(payload.final_answer),
  }
}
/**
 * Extracts every `<answer>…</answer>` block from the raw model response and
 * JSON-parses each one into a raw action payload.
 * Blocks containing malformed JSON are skipped so one bad block does not
 * drop the whole prediction.
 */
export function parseCladoActionsFromRawResponse(
  rawResponse: string | undefined,
): RawCladoActionPayload[] {
  if (!rawResponse) return []
  const payloads: RawCladoActionPayload[] = []
  const answerBlock = /<answer>\s*([\s\S]*?)\s*<\/answer>/gi
  let match: RegExpExecArray | null
  while ((match = answerBlock.exec(rawResponse)) !== null) {
    try {
      payloads.push(JSON.parse(match[1]) as RawCladoActionPayload)
    } catch {
      // Ignore this block; continue scanning the rest of the response.
    }
  }
  return payloads
}
/**
 * Collects the text of every <thinking>...</thinking> block in the raw
 * response, collapses internal whitespace, and joins the non-empty
 * pieces with single spaces.
 *
 * @returns The merged thinking text, or undefined when none is present.
 */
export function extractCladoThinking(
  rawResponse: string | undefined,
): string | undefined {
  if (!rawResponse) return undefined
  const pattern = /<thinking>\s*([\s\S]*?)\s*<\/thinking>/gi
  const pieces: string[] = []
  let match: RegExpExecArray | null
  while ((match = pattern.exec(rawResponse)) !== null) {
    const cleaned = (match[1] ?? '').replace(/\s+/g, ' ').trim()
    if (cleaned.length > 0) pieces.push(cleaned)
  }
  if (pieces.length === 0) return undefined
  return pieces.join(' ')
}
/**
 * Builds a log-safe summary of a prediction: the scalar fields are
 * copied as-is and raw_response is replaced by a 240-character preview.
 */
export function summarizeCladoPrediction(
  prediction: CladoActionResponse,
): Record<string, unknown> {
  const scalarFields = [
    'action',
    'x',
    'y',
    'text',
    'key',
    'direction',
    'startX',
    'startY',
    'endX',
    'endY',
    'amount',
    'time',
    'inference_time_seconds',
  ] as const
  const summary: Record<string, unknown> = {}
  for (const field of scalarFields) {
    summary[field] = prediction[field]
  }
  const raw = prediction.raw_response
  summary.raw_response_preview =
    typeof raw === 'string' && raw.length > 0 ? raw.slice(0, 240) : undefined
  return summary
}
/**
 * Produces a compact, stable signature for an action so consecutive
 * duplicate predictions can be detected and collapsed. Text-bearing
 * actions truncate their payload to keep signatures short.
 */
export function getCladoActionSignature(action: CladoAction): string {
  const kind = action.action
  if (
    kind === 'click' ||
    kind === 'double_click' ||
    kind === 'right_click' ||
    kind === 'hover'
  ) {
    return `${kind}:${action.x ?? 'x'}:${action.y ?? 'y'}`
  }
  if (kind === 'type') {
    return `${kind}:${(action.text ?? '').slice(0, 16)}`
  }
  if (kind === 'press_key') {
    return `${kind}:${action.key ?? 'key'}`
  }
  if (kind === 'scroll') {
    return `${kind}:${action.direction ?? 'down'}:${action.amount ?? 500}`
  }
  if (kind === 'drag') {
    return `${kind}:${action.startX}:${action.startY}:${action.endX}:${action.endY}`
  }
  if (kind === 'wait') {
    return `${kind}:${action.time ?? 1}`
  }
  if (kind === 'end') {
    return action.final_answer
      ? `end(${action.final_answer.slice(0, 32)})`
      : 'end()'
  }
  if (kind === 'invalid') {
    return `invalid(${(action.text ?? '').slice(0, 40)})`
  }
  return kind
}
/**
 * Renders the executed action history as a compact "a -> b -> c" string
 * for the Clado prompt. Returns the literal string 'None' when empty.
 */
export function formatCladoHistory(actions: CladoAction[]): string {
  if (actions.length === 0) return 'None'
  const describe = (action: CladoAction): string => {
    const kind = action.action
    if (
      kind === 'click' ||
      kind === 'double_click' ||
      kind === 'right_click' ||
      kind === 'hover'
    ) {
      return `${kind}(${Math.round(action.x ?? 500)}, ${Math.round(action.y ?? 500)})`
    }
    if (kind === 'type') {
      // Escape single quotes so the rendered history stays well-quoted.
      const text = (action.text ?? '').replace(/'/g, "\\'")
      return `type('${text}')`
    }
    if (kind === 'press_key') {
      return `press_key('${action.key ?? 'Enter'}')`
    }
    if (kind === 'scroll') {
      return `scroll(${action.direction ?? 'down'})`
    }
    if (kind === 'drag') {
      return `drag(${Math.round(action.startX ?? 500)},${Math.round(action.startY ?? 500)} -> ${Math.round(action.endX ?? 500)},${Math.round(action.endY ?? 500)})`
    }
    if (kind === 'wait') {
      return `wait(${Math.round(action.time ?? 1)}s)`
    }
    if (kind === 'end') return 'end()'
    if (kind === 'invalid') return 'invalid()'
    return kind
  }
  return actions.map(describe).join(' -> ')
}

View File

@@ -0,0 +1,123 @@
import {
CLADO_PAGE_SCOPED_TOOLS,
type CladoActionPoint,
type CladoViewport,
} from './types'
/**
 * Clamps a Clado normalized coordinate to the inclusive [0, 999] grid,
 * rounding to the nearest integer first.
 */
export function clampCladoNormalizedCoordinate(value: number): number {
  const rounded = Math.round(value)
  if (rounded < 0) return 0
  if (rounded > 999) return 999
  return rounded
}
/** Converts Clado's 0-1000 normalized coordinate space into BrowserOS viewport pixels. */
export function resolveCladoPoint(
  viewport: CladoViewport,
  normalizedX: number | undefined,
  normalizedY: number | undefined,
): CladoActionPoint {
  // A missing coordinate defaults to the center of the 0-1000 grid.
  const gridX = clampCladoNormalizedCoordinate(normalizedX ?? 500)
  const gridY = clampCladoNormalizedCoordinate(normalizedY ?? 500)
  const x = Math.round((gridX / 1000) * viewport.width)
  const y = Math.round((gridY / 1000) * viewport.height)
  return { x, y }
}
/** Adapts Clado action tool arguments to the BrowserOS MCP tool argument contract. */
export function prepareCladoToolArgs(
  toolName: string,
  args: Record<string, unknown>,
  pageId: number,
): Record<string, unknown> {
  // Work on a shallow copy so the caller's args object is never mutated.
  const prepared: Record<string, unknown> = { ...args }
  // evaluate_script: the model emits `function`, MCP expects `expression`.
  if (
    toolName === 'evaluate_script' &&
    typeof prepared.function === 'string' &&
    prepared.expression === undefined
  ) {
    prepared.expression = toCladoEvaluateExpression(prepared.function)
    delete prepared.function
  }
  // click_at: translate the boolean dblClick flag into a clickCount.
  if (
    toolName === 'click_at' &&
    typeof prepared.dblClick === 'boolean' &&
    prepared.clickCount === undefined
  ) {
    prepared.clickCount = prepared.dblClick ? 2 : 1
    delete prepared.dblClick
  }
  // Pin page-scoped tools to the executor's fixed page (single-page operation).
  if (
    CLADO_PAGE_SCOPED_TOOLS.has(toolName) &&
    typeof prepared.page !== 'number'
  ) {
    prepared.page = pageId
  }
  return prepared
}
/**
 * Wraps a function-shaped source string into an immediately-invoked
 * expression so it can run under evaluate_script. Arrow functions
 * (sync/async, zero-arg) and `function` declarations are wrapped;
 * anything else is assumed to already be an expression.
 */
export function toCladoEvaluateExpression(rawFunction: unknown): string {
  const source = String(rawFunction).trim()
  const looksLikeFunction =
    source.startsWith('() =>') ||
    source.startsWith('async () =>') ||
    source.startsWith('function')
  return looksLikeFunction ? `(${source})()` : source
}
/**
 * Translates Clado's emacs-style key chords (e.g. "C-a", "M-c") into the
 * "Modifier+Key" combination format used by the press_key tool. Chords
 * without a mapping pass through unchanged (after trimming).
 *
 * @param key - Raw key field from the predicted action.
 * @returns The mapped key combination, or the trimmed input when unmapped.
 * @throws Error when the key field is missing or blank.
 */
export function normalizeCladoPressKey(key: string | undefined): string {
  const raw = (key ?? '').trim()
  if (!raw) throw new Error('press_key action missing key field')
  // Use a Map rather than a plain object: a record lookup like map["toString"]
  // or map["__proto__"] resolves to inherited Object.prototype members, which
  // are not nullish and would be returned instead of the key string.
  const map = new Map<string, string>([
    ['C-a', 'Control+A'],
    ['C-c', 'Control+C'],
    ['C-v', 'Control+V'],
    ['C-x', 'Control+X'],
    ['C-z', 'Control+Z'],
    ['C-y', 'Control+Y'],
    ['C-s', 'Control+S'],
    ['C-t', 'Control+T'],
    ['C-w', 'Control+W'],
    ['C-h', 'Control+H'],
    ['C-f', 'Control+F'],
    ['C-+', 'Control++'],
    ['C--', 'Control+-'],
    ['C-tab', 'Control+Tab'],
    ['C-S-tab', 'Control+Shift+Tab'],
    ['C-S-n', 'Control+Shift+N'],
    ['C-down', 'Control+ArrowDown'],
    // macOS Cmd shortcuts (Meta in CDP).
    ['M-a', 'Meta+A'],
    ['M-c', 'Meta+C'],
    ['M-v', 'Meta+V'],
    ['M-x', 'Meta+X'],
    ['M-f4', 'Alt+F4'],
  ])
  return map.get(raw) ?? raw
}
/**
 * Narrows an arbitrary direction string to one of the four scroll
 * directions, defaulting to 'down' for anything unrecognized.
 */
export function normalizeCladoDirection(
  direction: string | undefined,
): 'up' | 'down' | 'left' | 'right' {
  switch (direction) {
    case 'up':
    case 'down':
    case 'left':
    case 'right':
      return direction
    default:
      return 'down'
  }
}
/**
 * Converts the model's 0-1000 scroll amount into a pixel delta within
 * [100, 900]. Non-numeric input falls back to 500; non-positive input
 * becomes the 100-pixel minimum.
 */
export function normalizeCladoScrollAmount(amount: number | undefined): number {
  if (typeof amount !== 'number') return 500
  if (amount <= 0) return 100
  const capped = Math.min(amount, 1000)
  const pixels = Math.round((capped / 1000) * 900)
  return Math.max(100, pixels)
}

View File

@@ -0,0 +1,68 @@
import { CLADO_REQUEST_TIMEOUT_MS } from '../../../../constants'
import { formatCladoHistory } from './clado-actions'
import type { CladoAction, CladoActionResponse } from './types'
/** Connection settings for the Clado action-prediction endpoint. */
export interface CladoActionClientOptions {
  // POST endpoint of the Clado action model; must be set before a request is made.
  baseUrl?: string
  // Optional credential sent as an `Authorization: Bearer ...` header.
  apiKey?: string
}
/** One screenshot-plus-instruction request for the next action prediction. */
export interface CladoActionPredictionInput {
  // Natural-language task instruction forwarded to the model.
  instruction: string
  // Current page screenshot, base64-encoded.
  imageBase64: string
  // Actions already executed; serialized with formatCladoHistory for the request body.
  actionHistory: CladoAction[]
  // Optional caller-side cancellation signal.
  signal?: AbortSignal
}
/** Calls the Clado action model without exposing credentials in process arguments or artifacts. */
export class CladoActionClient {
  constructor(private readonly options: CladoActionClientOptions) {}
  /**
   * Requests the next predicted action from the Clado endpoint.
   *
   * The request is bounded by CLADO_REQUEST_TIMEOUT_MS, and the caller's
   * signal (when provided) aborts the same request-scoped controller.
   *
   * @throws Error when baseUrl is not configured, or when the endpoint
   *   returns a non-2xx status (message includes a truncated body).
   */
  async requestActionPrediction(
    input: CladoActionPredictionInput,
  ): Promise<CladoActionResponse> {
    if (!this.options.baseUrl) {
      throw new Error('executor.baseUrl must be set for clado-action provider')
    }
    const requestController = new AbortController()
    const onAbort = () => requestController.abort()
    // Fix: honor a signal that was aborted before this call. The 'abort'
    // event has already fired for such a signal, so addEventListener alone
    // would never trigger and the request would run to completion.
    if (input.signal?.aborted) {
      requestController.abort()
    } else {
      input.signal?.addEventListener('abort', onAbort, { once: true })
    }
    const timeoutHandle = setTimeout(() => {
      requestController.abort()
    }, CLADO_REQUEST_TIMEOUT_MS)
    try {
      const headers: Record<string, string> = {
        'Content-Type': 'application/json',
      }
      if (this.options.apiKey) {
        headers.Authorization = `Bearer ${this.options.apiKey}`
      }
      const response = await fetch(this.options.baseUrl, {
        method: 'POST',
        headers,
        body: JSON.stringify({
          instruction: input.instruction,
          image_base64: input.imageBase64,
          history: formatCladoHistory(input.actionHistory),
        }),
        signal: requestController.signal,
      })
      if (!response.ok) {
        const body = await response.text()
        throw new Error(
          `HTTP ${response.status} ${response.statusText}: ${body.slice(0, 400)}`,
        )
      }
      return (await response.json()) as CladoActionResponse
    } finally {
      // Always release the timer and the listener, even on failure.
      clearTimeout(timeoutHandle)
      input.signal?.removeEventListener('abort', onAbort)
    }
  }
}

View File

@@ -0,0 +1,78 @@
// Provider identifier that routes an executor config to the Clado action backend.
export const CLADO_ACTION_PROVIDER = 'clado-action'
// MCP tools that operate on a specific page and therefore need a numeric
// `page` argument (injected by prepareCladoToolArgs when the model omits it).
export const CLADO_PAGE_SCOPED_TOOLS = new Set<string>([
  'take_screenshot',
  'evaluate_script',
  'click',
  'click_at',
  'hover',
  'hover_at',
  'clear',
  'fill',
  'press_key',
  'type_at',
  'drag',
  'drag_at',
  'scroll',
  'handle_dialog',
  'select_option',
  'navigate_page',
  'close_page',
  'wait_for',
])
/**
 * Raw prediction returned by the Clado action endpoint. Every field is
 * optional; per the Clado contract the endpoint may answer HTTP 200 with
 * action=null and a parse_error instead of failing the request.
 */
export interface CladoActionResponse {
  action?: string | null
  x?: number
  y?: number
  text?: string
  key?: string
  direction?: string
  startX?: number
  startY?: number
  endX?: number
  endY?: number
  amount?: number
  time?: number
  final_answer?: string | null
  inference_time_seconds?: number
  raw_response?: string
  thinking?: string | null
  parse_error?: string | null
}
// Browser viewport dimensions used to convert Clado's 0-1000 normalized
// coordinates into pixels (see resolveCladoPoint).
export interface CladoViewport {
  width: number
  height: number
}
// A validated action as produced by normalizeCladoActionPayload; fields
// beyond `action` are populated only when the raw payload had the right types.
export interface CladoAction {
  action: string
  x?: number
  y?: number
  text?: string
  key?: string
  direction?: string
  startX?: number
  startY?: number
  endX?: number
  endY?: number
  amount?: number
  time?: number
  final_answer?: string
}
// Loosely-typed action payload as parsed straight from model output,
// before field-level validation (final_answer may also be null here).
export type RawCladoActionPayload = Partial<
  Omit<CladoAction, 'final_answer'>
> & {
  final_answer?: string | null
}
// A concrete point in viewport pixel coordinates.
export interface CladoActionPoint {
  x: number
  y: number
}
/** True when the provider string selects the Clado action backend. */
export function isCladoActionProvider(provider: string): boolean {
  return provider === CLADO_ACTION_PROVIDER
}

View File

@@ -0,0 +1,45 @@
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import type { Browser } from '@browseros/server/browser'
import type { ExecutorCallbacks } from '../../orchestrator-executor/executor'
import type { ExecutorBackend, ExecutorBackendKind } from '../executor-backend'
import { ExecutorAdapterBackend } from './tool-loop-backend'
/** Inputs for building the executor backend of one orchestrator delegation. */
export interface CreateExecutorBackendOptions {
  // Explicit backend kind; when omitted it is derived from the provider string.
  backendKind?: ExecutorBackendKind
  // Provider name; takes precedence over configTemplate.provider for kind inference.
  provider?: string
  // Agent configuration forwarded to the underlying Executor.
  configTemplate?: ResolvedAgentConfig
  browser?: Browser | null
  serverUrl?: string
  windowId?: number
  tabId?: number
  initialPageId?: number
  callbacks?: ExecutorCallbacks
  // Pre-built backend; when provided, no Executor is constructed.
  executor?: ExecutorBackend
}
/** Maps an executor provider string onto the backend kind that serves it. */
export function backendKindForProvider(provider: string): ExecutorBackendKind {
  if (provider === 'clado-action') {
    return 'clado'
  }
  return 'tool-loop'
}
/** Creates the backend used for one orchestrator delegation. */
export function createExecutorBackend(
  options: CreateExecutorBackendOptions,
): ExecutorBackend {
  // An explicit kind wins; otherwise infer it from the provider string,
  // falling back to the config template's provider (or '' when absent).
  let kind = options.backendKind
  if (kind === undefined) {
    const provider = options.provider ?? options.configTemplate?.provider ?? ''
    kind = backendKindForProvider(provider)
  }
  return new ExecutorAdapterBackend({
    kind,
    configTemplate: options.configTemplate,
    browser: options.browser,
    serverUrl: options.serverUrl,
    windowId: options.windowId,
    tabId: options.tabId,
    initialPageId: options.initialPageId,
    callbacks: options.callbacks,
    executor: options.executor,
  })
}

View File

@@ -0,0 +1,72 @@
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import type { Browser } from '@browseros/server/browser'
import {
Executor,
type ExecutorCallbacks,
} from '../../orchestrator-executor/executor'
import type {
DelegationResult,
ExecutorBackend,
ExecutorBackendKind,
} from '../executor-backend'
// Minimal surface of the Executor class that the adapter depends on;
// also what callers may inject via `executor` for testing.
interface ExecutorRunner {
  execute(instruction: string, signal?: AbortSignal): Promise<DelegationResult>
  close(): Promise<void>
  getTotalSteps(): number
}
/** Construction options for ExecutorAdapterBackend. */
export interface ExecutorAdapterBackendOptions {
  // Backend flavor; 'clado' puts the Executor into Clado action mode.
  kind: ExecutorBackendKind
  // Required when no `executor` is injected (see constructor).
  configTemplate?: ResolvedAgentConfig
  browser?: Browser | null
  // Required when no `executor` is injected (see constructor).
  serverUrl?: string
  windowId?: number
  tabId?: number
  initialPageId?: number
  callbacks?: ExecutorCallbacks
  // Pre-built runner; bypasses Executor construction entirely.
  executor?: ExecutorRunner
}
/**
 * ExecutorBackend implementation that adapts the Executor class to the
 * backend interface. When no pre-built runner is injected, one is
 * constructed from the options — configTemplate and serverUrl are then
 * mandatory and their absence throws via required().
 */
export class ExecutorAdapterBackend implements ExecutorBackend {
  readonly kind: ExecutorBackendKind
  private readonly executor: ExecutorRunner
  constructor(options: ExecutorAdapterBackendOptions) {
    this.kind = options.kind
    this.executor =
      options.executor ??
      new Executor(
        required(options.configTemplate, 'configTemplate'),
        options.browser ?? null,
        required(options.serverUrl, 'serverUrl'),
        {
          // The 'clado' kind switches the Executor into Clado action mode.
          isCladoAction: options.kind === 'clado',
          windowId: options.windowId,
          tabId: options.tabId,
          initialPageId: options.initialPageId,
          callbacks: options.callbacks,
        },
      )
  }
  /** Runs one delegated instruction to completion (optionally abortable). */
  execute(
    instruction: string,
    signal?: AbortSignal,
  ): Promise<DelegationResult> {
    return this.executor.execute(instruction, signal)
  }
  /** Releases the underlying executor's resources. */
  close(): Promise<void> {
    return this.executor.close()
  }
  /** Total steps the underlying executor has consumed so far. */
  getTotalSteps(): number {
    return this.executor.getTotalSteps()
  }
}
/** Returns the value unchanged, or throws a descriptive error when undefined. */
function required<T>(value: T | undefined, name: string): T {
  if (value !== undefined) {
    return value
  }
  throw new Error(`${name} is required`)
}

View File

@@ -0,0 +1,11 @@
import type { ExecutorResult } from '../orchestrator-executor/types'
// The two backend flavors: the LLM tool-calling loop and the Clado action model.
export type ExecutorBackendKind = 'tool-loop' | 'clado'
// Delegation results reuse the executor result shape unchanged.
export type DelegationResult = ExecutorResult
/** Contract every executor backend fulfills for one orchestrator delegation. */
export interface ExecutorBackend {
  readonly kind: ExecutorBackendKind
  execute(instruction: string, signal?: AbortSignal): Promise<DelegationResult>
  close(): Promise<void>
  getTotalSteps(): number
}

View File

@@ -1,106 +1,47 @@
import { randomUUID } from 'node:crypto'
import {
CLADO_REQUEST_TIMEOUT_MS,
MAX_ACTIONS_PER_DELEGATION,
} from '../../constants'
import { MAX_ACTIONS_PER_DELEGATION } from '../../constants'
import { McpClient, type McpToolResult } from '../../utils/mcp-client'
import { sleep } from '../../utils/sleep'
import {
extractCladoThinking,
formatCladoHistory,
getCladoActionSignature,
parseCladoActions,
summarizeCladoPrediction,
} from '../orchestrated/backends/clado/clado-actions'
import {
normalizeCladoDirection,
normalizeCladoPressKey,
normalizeCladoScrollAmount,
prepareCladoToolArgs,
resolveCladoPoint,
} from '../orchestrated/backends/clado/clado-browser-driver'
import { CladoActionClient } from '../orchestrated/backends/clado/clado-client'
import {
CLADO_ACTION_PROVIDER,
type CladoAction,
type CladoActionPoint,
type CladoActionResponse,
type CladoViewport,
isCladoActionProvider,
} from '../orchestrated/backends/clado/types'
import type { ExecutorCallbacks } from './executor'
import type { ExecutorConfig, ExecutorResult } from './types'
const CLADO_ACTION_PROVIDER = 'clado-action'
const PAGE_SCOPED_TOOLS = new Set<string>([
'take_screenshot',
'evaluate_script',
'click',
'click_at',
'hover',
'hover_at',
'clear',
'fill',
'press_key',
'type_at',
'drag',
'drag_at',
'scroll',
'handle_dialog',
'select_option',
'navigate_page',
'close_page',
'wait_for',
])
interface CladoActionResponse {
action?: string | null
x?: number
y?: number
text?: string
key?: string
direction?: string
startX?: number
startY?: number
endX?: number
endY?: number
amount?: number
time?: number
final_answer?: string | null
inference_time_seconds?: number
raw_response?: string
thinking?: string | null
parse_error?: string | null
}
interface Viewport {
width: number
height: number
}
interface CladoAction {
action: string
x?: number
y?: number
text?: string
key?: string
direction?: string
startX?: number
startY?: number
endX?: number
endY?: number
amount?: number
time?: number
final_answer?: string
}
type RawActionPayload = Partial<Omit<CladoAction, 'final_answer'>> & {
final_answer?: string | null
}
const MAX_CONSECUTIVE_PARSE_FAILURES = 3
interface ActionPoint {
x: number
y: number
}
function asErrorMessage(error: unknown): string {
return error instanceof Error ? error.message : String(error)
}
function clampNormalized(value: number): number {
return Math.min(999, Math.max(0, Math.round(value)))
}
function isCladoProvider(provider: string): boolean {
return provider === CLADO_ACTION_PROVIDER
}
export class CladoActionExecutor {
private readonly mcpClient: McpClient
private readonly cladoClient: CladoActionClient
private readonly pageId: number
private callbacks: ExecutorCallbacks = {}
private stepsUsed = 0
private viewport: Viewport | null = null
private lastPoint: ActionPoint | null = null
private viewport: CladoViewport | null = null
private lastPoint: CladoActionPoint | null = null
private currentUrl = ''
constructor(
@@ -110,12 +51,16 @@ export class CladoActionExecutor {
readonly _tabId?: number,
initialPageId?: number,
) {
if (!isCladoProvider(config.provider)) {
if (!isCladoActionProvider(config.provider)) {
throw new Error(
`CladoActionExecutor requires provider="${CLADO_ACTION_PROVIDER}"`,
)
}
this.mcpClient = new McpClient(`${serverUrl}/mcp`)
this.cladoClient = new CladoActionClient({
baseUrl: config.baseUrl,
apiKey: config.apiKey,
})
this.pageId = initialPageId ?? 1
}
@@ -165,7 +110,7 @@ export class CladoActionExecutor {
break
}
const historyForPrediction = this.formatHistory(actionHistory)
const historyForPrediction = formatCladoHistory(actionHistory)
const actionToolCallId = randomUUID()
const predictionInput = {
instruction,
@@ -187,7 +132,7 @@ export class CladoActionExecutor {
signal,
)
predictionCalls++
const thinking = this.extractThinking(prediction.raw_response)
const thinking = extractCladoThinking(prediction.raw_response)
if (thinking) {
const previous = thinkingTrace[thinkingTrace.length - 1]
if (previous !== thinking) {
@@ -217,7 +162,7 @@ export class CladoActionExecutor {
break
}
const predictedActions = this.parseActions(prediction)
const predictedActions = parseCladoActions(prediction)
if (predictedActions.length === 0) {
// Per Clado contract: HTTP 200 with action=null on parse failure.
// Count as an invalid step so the model can self-correct on the
@@ -243,7 +188,7 @@ export class CladoActionExecutor {
toolCallId: actionToolCallId,
toolName: 'clado_action_predict',
output: {
prediction: this.summarizePrediction(prediction),
prediction: summarizeCladoPrediction(prediction),
parsedActions: [],
parseError,
consecutiveParseFailures,
@@ -285,7 +230,7 @@ export class CladoActionExecutor {
toolCallId: actionToolCallId,
toolName: 'clado_action_predict',
output: {
prediction: this.summarizePrediction(prediction),
prediction: summarizeCladoPrediction(prediction),
parsedActions: predictedActions,
executed: executionNotes,
},
@@ -326,7 +271,7 @@ export class CladoActionExecutor {
toolCallId: actionToolCallId,
toolName: 'clado_action_predict',
output: {
prediction: this.summarizePrediction(prediction),
prediction: summarizeCladoPrediction(prediction),
parsedActions: predictedActions,
executed: executionNotes,
},
@@ -378,125 +323,12 @@ export class CladoActionExecutor {
actionHistory: CladoAction[],
signal?: AbortSignal,
): Promise<CladoActionResponse> {
if (!this.config.baseUrl) {
throw new Error('executor.baseUrl must be set for clado-action provider')
}
const requestController = new AbortController()
const onAbort = () => requestController.abort()
signal?.addEventListener('abort', onAbort, { once: true })
const timeoutHandle = setTimeout(() => {
requestController.abort()
}, CLADO_REQUEST_TIMEOUT_MS)
try {
const headers: Record<string, string> = {
'Content-Type': 'application/json',
}
if (this.config.apiKey) {
headers.Authorization = `Bearer ${this.config.apiKey}`
}
const response = await fetch(this.config.baseUrl, {
method: 'POST',
headers,
body: JSON.stringify({
instruction,
image_base64: imageBase64,
history: this.formatHistory(actionHistory),
}),
signal: requestController.signal,
})
if (!response.ok) {
const body = await response.text()
throw new Error(
`HTTP ${response.status} ${response.statusText}: ${body.slice(0, 400)}`,
)
}
return (await response.json()) as CladoActionResponse
} finally {
clearTimeout(timeoutHandle)
signal?.removeEventListener('abort', onAbort)
}
}
private parseActions(prediction: CladoActionResponse): CladoAction[] {
const actionFromField =
typeof prediction.action === 'string' ? prediction.action : null
const rawActions = this.parseActionsFromRawResponse(prediction.raw_response)
const primaryFromRaw = rawActions[0] ?? null
const mergedPrimary = {
...primaryFromRaw,
...prediction,
action: actionFromField ?? primaryFromRaw?.action,
}
const normalized: CladoAction[] = []
const primary = this.normalizeActionPayload(mergedPrimary)
if (primary) normalized.push(primary)
for (const candidate of rawActions.slice(1)) {
const parsed = this.normalizeActionPayload(candidate)
if (!parsed) continue
const prev = normalized[normalized.length - 1]
if (
!prev ||
this.getActionSignature(prev) !== this.getActionSignature(parsed)
) {
normalized.push(parsed)
}
}
return normalized
}
private normalizeActionPayload(
payload: RawActionPayload,
): CladoAction | null {
if (!payload.action || typeof payload.action !== 'string') {
return null
}
return {
action: payload.action,
x: typeof payload.x === 'number' ? payload.x : undefined,
y: typeof payload.y === 'number' ? payload.y : undefined,
text: typeof payload.text === 'string' ? payload.text : undefined,
key: typeof payload.key === 'string' ? payload.key : undefined,
direction:
typeof payload.direction === 'string' ? payload.direction : undefined,
startX: typeof payload.startX === 'number' ? payload.startX : undefined,
startY: typeof payload.startY === 'number' ? payload.startY : undefined,
endX: typeof payload.endX === 'number' ? payload.endX : undefined,
endY: typeof payload.endY === 'number' ? payload.endY : undefined,
amount: typeof payload.amount === 'number' ? payload.amount : undefined,
time: typeof payload.time === 'number' ? payload.time : undefined,
final_answer:
typeof payload.final_answer === 'string'
? payload.final_answer
: undefined,
}
}
private parseActionsFromRawResponse(
rawResponse: string | undefined,
): RawActionPayload[] {
if (!rawResponse) return []
const matches = [
...rawResponse.matchAll(/<answer>\s*([\s\S]*?)\s*<\/answer>/gi),
]
const parsed: RawActionPayload[] = []
for (const match of matches) {
try {
parsed.push(JSON.parse(match[1]) as RawActionPayload)
} catch {
// ignore malformed answer blocks
}
}
return parsed
return this.cladoClient.requestActionPrediction({
instruction,
imageBase64,
actionHistory,
signal,
})
}
private async executeAction(
@@ -567,14 +399,14 @@ export class CladoActionExecutor {
}
case 'press_key': {
const key = this.normalizePressKey(action.key)
const key = normalizeCladoPressKey(action.key)
await this.runTool('press_key', { key }, signal)
return `Pressed key "${key}".`
}
case 'scroll': {
const direction = this.normalizeDirection(action.direction)
const amountPx = this.normalizeScrollAmount(action.amount)
const direction = normalizeCladoDirection(action.direction)
const amountPx = normalizeCladoScrollAmount(action.amount)
const ticks = Math.max(1, Math.round(amountPx / 120))
await this.runTool('scroll', { direction, amount: ticks }, signal)
@@ -645,7 +477,7 @@ export class CladoActionExecutor {
return image.data
}
private async getViewport(signal?: AbortSignal): Promise<Viewport> {
private async getViewport(signal?: AbortSignal): Promise<CladoViewport> {
if (this.viewport) return this.viewport
try {
@@ -676,15 +508,9 @@ export class CladoActionExecutor {
normalizedX: number | undefined,
normalizedY: number | undefined,
signal?: AbortSignal,
): Promise<ActionPoint> {
): Promise<CladoActionPoint> {
const viewport = await this.getViewport(signal)
const nx = clampNormalized(normalizedX ?? 500)
const ny = clampNormalized(normalizedY ?? 500)
return {
x: Math.round((nx / 1000) * viewport.width),
y: Math.round((ny / 1000) * viewport.height),
}
return resolveCladoPoint(viewport, normalizedX, normalizedY)
}
private async getCurrentUrl(signal?: AbortSignal): Promise<string> {
@@ -711,7 +537,7 @@ export class CladoActionExecutor {
throw new Error('aborted')
}
const toolArgs = this.prepareToolArgs(toolName, args)
const toolArgs = prepareCladoToolArgs(toolName, args, this.pageId)
try {
const raw = await this.mcpClient.callTool(toolName, toolArgs)
@@ -730,207 +556,6 @@ export class CladoActionExecutor {
}
}
private prepareToolArgs(
toolName: string,
args: Record<string, unknown>,
): Record<string, unknown> {
const prepared: Record<string, unknown> = { ...args }
if (
toolName === 'evaluate_script' &&
typeof prepared.function === 'string' &&
prepared.expression === undefined
) {
prepared.expression = this.toEvaluateExpression(prepared.function)
delete prepared.function
}
if (
toolName === 'click_at' &&
typeof prepared.dblClick === 'boolean' &&
prepared.clickCount === undefined
) {
prepared.clickCount = prepared.dblClick ? 2 : 1
delete prepared.dblClick
}
// Use fixed page ID for all page-scoped tools (single-page operation)
if (PAGE_SCOPED_TOOLS.has(toolName) && typeof prepared.page !== 'number') {
prepared.page = this.pageId
}
return prepared
}
private toEvaluateExpression(rawFunction: unknown): string {
const source = String(rawFunction).trim()
if (source.startsWith('() =>') || source.startsWith('async () =>')) {
return `(${source})()`
}
if (source.startsWith('function')) {
return `(${source})()`
}
return source
}
private normalizePressKey(key: string | undefined): string {
const raw = (key ?? '').trim()
if (!raw) throw new Error('press_key action missing key field')
const map: Record<string, string> = {
'C-a': 'Control+A',
'C-c': 'Control+C',
'C-v': 'Control+V',
'C-x': 'Control+X',
'C-z': 'Control+Z',
'C-y': 'Control+Y',
'C-s': 'Control+S',
'C-t': 'Control+T',
'C-w': 'Control+W',
'C-h': 'Control+H',
'C-f': 'Control+F',
'C-+': 'Control++',
'C--': 'Control+-',
'C-tab': 'Control+Tab',
'C-S-tab': 'Control+Shift+Tab',
'C-S-n': 'Control+Shift+N',
'C-down': 'Control+ArrowDown',
// macOS Cmd shortcuts (Meta in CDP).
'M-a': 'Meta+A',
'M-c': 'Meta+C',
'M-v': 'Meta+V',
'M-x': 'Meta+X',
'M-f4': 'Alt+F4',
}
return map[raw] ?? raw
}
private normalizeDirection(
direction: string | undefined,
): 'up' | 'down' | 'left' | 'right' {
if (
direction === 'up' ||
direction === 'down' ||
direction === 'left' ||
direction === 'right'
) {
return direction
}
return 'down'
}
private normalizeScrollAmount(amount: number | undefined): number {
if (typeof amount !== 'number') return 500
if (amount <= 0) return 100
const clamped = Math.min(amount, 1000)
return Math.max(100, Math.round((clamped / 1000) * 900))
}
private summarizePrediction(
prediction: CladoActionResponse,
): Record<string, unknown> {
const preview =
typeof prediction.raw_response === 'string' &&
prediction.raw_response.length > 0
? prediction.raw_response.slice(0, 240)
: undefined
return {
action: prediction.action,
x: prediction.x,
y: prediction.y,
text: prediction.text,
key: prediction.key,
direction: prediction.direction,
startX: prediction.startX,
startY: prediction.startY,
endX: prediction.endX,
endY: prediction.endY,
amount: prediction.amount,
time: prediction.time,
inference_time_seconds: prediction.inference_time_seconds,
raw_response_preview: preview,
}
}
private extractThinking(rawResponse: string | undefined): string | undefined {
if (!rawResponse) return undefined
const matches = [
...rawResponse.matchAll(/<thinking>\s*([\s\S]*?)\s*<\/thinking>/gi),
]
if (matches.length === 0) return undefined
const merged = matches
.map((match) => match[1]?.replace(/\s+/g, ' ').trim() ?? '')
.filter((value) => value.length > 0)
.join(' ')
if (!merged) return undefined
return merged
}
private getActionSignature(action: CladoAction): string {
switch (action.action) {
case 'click':
case 'double_click':
case 'right_click':
case 'hover':
return `${action.action}:${action.x ?? 'x'}:${action.y ?? 'y'}`
case 'type':
return `${action.action}:${(action.text ?? '').slice(0, 16)}`
case 'press_key':
return `${action.action}:${action.key ?? 'key'}`
case 'scroll':
return `${action.action}:${action.direction ?? 'down'}:${action.amount ?? 500}`
case 'drag':
return `${action.action}:${action.startX}:${action.startY}:${action.endX}:${action.endY}`
case 'wait':
return `${action.action}:${action.time ?? 1}`
case 'end':
return action.final_answer
? `end(${action.final_answer.slice(0, 32)})`
: 'end()'
case 'invalid':
return `invalid(${(action.text ?? '').slice(0, 40)})`
default:
return action.action
}
}
private formatHistory(actions: CladoAction[]): string {
if (actions.length === 0) return 'None'
const parts = actions.map((action) => {
switch (action.action) {
case 'click':
case 'double_click':
case 'right_click':
case 'hover':
return `${action.action}(${Math.round(action.x ?? 500)}, ${Math.round(action.y ?? 500)})`
case 'type': {
const text = (action.text ?? '').replace(/'/g, "\\'")
return `type('${text}')`
}
case 'press_key':
return `press_key('${action.key ?? 'Enter'}')`
case 'scroll':
return `scroll(${action.direction ?? 'down'})`
case 'drag':
return `drag(${Math.round(action.startX ?? 500)},${Math.round(action.startY ?? 500)} -> ${Math.round(action.endX ?? 500)},${Math.round(action.endY ?? 500)})`
case 'wait':
return `wait(${Math.round(action.time ?? 1)}s)`
case 'end':
return 'end()'
case 'invalid':
return 'invalid()'
default:
return action.action
}
})
return parts.join(' -> ')
}
private buildObservation(params: {
status: ExecutorResult['status']
reason: string
@@ -946,7 +571,7 @@ export class CladoActionExecutor {
: actions
.slice(-5)
.map(
(action, idx) => `${idx + 1}. ${this.getActionSignature(action)}`,
(action, idx) => `${idx + 1}. ${getCladoActionSignature(action)}`,
)
.join('\n')
const thinkingSummary =

View File

@@ -24,8 +24,9 @@ import {
resolveProviderConfig,
} from '../../utils/resolve-provider-config'
import { withEvalTimeout } from '../../utils/with-eval-timeout'
import { createExecutorBackend } from '../orchestrated/backends/create-executor-backend'
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
import { Executor, type ExecutorCallbacks } from './executor'
import type { ExecutorCallbacks } from './executor'
import { OrchestratorAgent } from './orchestrator-agent'
import type { ExecutorFactory, ExecutorResult } from './types'
@@ -235,12 +236,13 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
await capture.messageLogger.logStreamEvent(delegateInputEvent)
capture.emitEvent(task.query_id, delegateInputEvent)
const executor = new Executor(
executorConfig,
const executor = createExecutorBackend({
backendKind: isCladoAction ? 'clado' : 'tool-loop',
configTemplate: executorConfig,
browser,
config.browseros.server_url,
{ isCladoAction, callbacks },
)
serverUrl: config.browseros.server_url,
callbacks,
})
let result: ExecutorResult
try {
result = await executor.execute(instruction, signal)

View File

@@ -57,6 +57,20 @@ export class TrajectorySaver {
)
}
  /** Persists the raw attempt record as pretty-printed attempt.json in the output directory. */
  async saveAttempt(attempt: Record<string, unknown>): Promise<void> {
    await writeFile(
      join(this.outputDir, 'attempt.json'),
      JSON.stringify(attempt, null, 2),
    )
  }
  /** Persists grader results as pretty-printed grades.json in the output directory. */
  async saveGrades(graderResults: Record<string, GraderResult>): Promise<void> {
    await writeFile(
      join(this.outputDir, 'grades.json'),
      JSON.stringify(graderResults, null, 2),
    )
  }
async loadMetadata(): Promise<TaskMetadata> {
const content = await readFile(
join(this.outputDir, 'metadata.json'),
@@ -70,6 +84,7 @@ export class TrajectorySaver {
): Promise<void> {
const metadata = await this.loadMetadata()
metadata.grader_results = graderResults
await this.saveGrades(graderResults)
await this.saveMetadata(metadata)
}

View File

@@ -0,0 +1,170 @@
import { parseArgs } from 'node:util'
// Destinations the `publish` step supports; currently only Cloudflare R2.
export type PublishTarget = 'r2'
/** Args for the legacy (command-less) invocation: `eval [--config path]`. */
export interface LegacyCliArgs {
  command: 'legacy'
  configPath?: string
  help?: boolean
}
/** Args for `eval suite`: run, grade, and optionally publish in one go. */
export interface SuiteCliArgs {
  command: 'suite'
  // Exactly one of configPath or suitePath is set (enforced by requireOne).
  configPath?: string
  suitePath?: string
  variantId?: string
  provider?: string
  model?: string
  apiKey?: string
  baseUrl?: string
  publishTarget?: PublishTarget
}
/** Args for `eval run`: same flags as suite, minus publishing. */
export interface RunCliArgs
  extends Omit<SuiteCliArgs, 'command' | 'publishTarget'> {
  command: 'run'
}
/** Args for `eval grade <runDir>`: grade an existing run directory. */
export interface GradeCliArgs {
  command: 'grade'
  runDir: string
}
/** Args for `eval publish <runDir> --target ...`. */
export interface PublishCliArgs {
  command: 'publish'
  runDir: string
  target: PublishTarget
}
// Union of every parsed CLI invocation shape.
export type EvalCliArgs =
  | LegacyCliArgs
  | SuiteCliArgs
  | RunCliArgs
  | GradeCliArgs
  | PublishCliArgs
// Recognized subcommands; anything else falls back to legacy parsing.
const COMMANDS = new Set(['suite', 'run', 'grade', 'publish'])
/** Narrows a parseArgs value to a non-empty string; anything else yields undefined. */
function stringValue(value: string | boolean | undefined): string | undefined {
  if (typeof value !== 'string') return undefined
  if (value.length === 0) return undefined
  return value
}
/**
 * Validates the --publish flag value. Absent input is passed through;
 * anything other than 'r2' is rejected loudly.
 *
 * @throws Error for an unsupported target string.
 */
function publishTarget(value: string | undefined): PublishTarget | undefined {
  switch (value) {
    case undefined:
      return undefined
    case 'r2':
      return 'r2'
    default:
      throw new Error(`Unsupported publish target: ${value}`)
  }
}
/**
 * Enforces that exactly one of --config / --suite was provided.
 *
 * @throws Error when neither or both paths are present.
 */
function requireOne(
  command: string,
  configPath: string | undefined,
  suitePath: string | undefined,
): void {
  const provided = [configPath, suitePath].filter(Boolean).length
  if (provided === 0) {
    throw new Error(`${command} requires --config or --suite`)
  }
  if (provided === 2) {
    throw new Error(`${command} accepts either --config or --suite, not both`)
  }
}
/**
 * Parses argv for the `suite` and `run` commands, which share flags.
 * Exactly one of --config or --suite must be provided (see requireOne);
 * --publish is honored only for `suite`.
 */
function parseSuiteLikeArgs(
  command: 'suite' | 'run',
  argv: string[],
): SuiteCliArgs | RunCliArgs {
  const { values } = parseArgs({
    args: argv,
    options: {
      config: { type: 'string' },
      suite: { type: 'string' },
      variant: { type: 'string' },
      provider: { type: 'string' },
      model: { type: 'string' },
      'api-key': { type: 'string' },
      'base-url': { type: 'string' },
      publish: { type: 'string' },
    },
  })
  const configPath = stringValue(values.config)
  const suitePath = stringValue(values.suite)
  requireOne(command, configPath, suitePath)
  const parsed: SuiteCliArgs | RunCliArgs =
    command === 'suite' ? { command: 'suite' } : { command: 'run' }
  // Optional flags are copied only when present, so the parsed object
  // carries no explicit `undefined` entries.
  if (configPath) parsed.configPath = configPath
  if (suitePath) parsed.suitePath = suitePath
  const variantId = stringValue(values.variant)
  if (variantId) parsed.variantId = variantId
  const provider = stringValue(values.provider)
  if (provider) parsed.provider = provider
  const model = stringValue(values.model)
  if (model) parsed.model = model
  const apiKey = stringValue(values['api-key'])
  if (apiKey) parsed.apiKey = apiKey
  const baseUrl = stringValue(values['base-url'])
  if (baseUrl) parsed.baseUrl = baseUrl
  // --publish only applies to `suite`; `run` has no publish step.
  if (command === 'suite') {
    const target = publishTarget(stringValue(values.publish))
    if (target) {
      const suiteArgs = parsed as SuiteCliArgs
      suiteArgs.publishTarget = target
    }
  }
  return parsed
}
/** Parses the pre-subcommand flags: -c/--config and -h/--help. */
function parseLegacyArgs(argv: string[]): LegacyCliArgs {
  const { values } = parseArgs({
    args: argv,
    options: {
      config: { type: 'string', short: 'c' },
      help: { type: 'boolean', short: 'h' },
    },
  })
  const result: LegacyCliArgs = { command: 'legacy' }
  // An empty --config value counts as absent, matching stringValue().
  if (typeof values.config === 'string' && values.config.length > 0) {
    result.configPath = values.config
  }
  if (values.help) result.help = true
  return result
}
/** Parses the eval CLI command without running browser or publishing side effects. */
export function parseEvalCliArgs(argv: string[]): EvalCliArgs {
  const [first, ...rest] = argv
  if (first === 'suite' || first === 'run') {
    return parseSuiteLikeArgs(first, rest)
  }
  if (first === 'grade') {
    const { values } = parseArgs({
      args: rest,
      options: { run: { type: 'string' } },
    })
    const runDir = stringValue(values.run)
    if (!runDir) throw new Error('grade requires --run')
    return { command: 'grade', runDir }
  }
  if (first === 'publish') {
    const { values } = parseArgs({
      args: rest,
      options: { run: { type: 'string' }, target: { type: 'string' } },
    })
    const runDir = stringValue(values.run)
    if (!runDir) throw new Error('publish requires --run')
    const target = publishTarget(stringValue(values.target))
    if (!target) throw new Error('publish requires --target')
    return { command: 'publish', runDir, target }
  }
  // Anything that is not a known subcommand goes through the legacy parser.
  return parseLegacyArgs(argv)
}

View File

@@ -0,0 +1,84 @@
import { readdir, readFile, stat } from 'node:fs/promises'
import { join } from 'node:path'
import { TrajectorySaver } from '../../capture/trajectory-saver'
import { runGraders } from '../../grading/grader-runner'
import { type Message, MessageSchema, TaskMetadataSchema } from '../../types'
import type { GradeCliArgs } from '../args'
/** Reads and validates messages.jsonl from a task directory (missing file → []). */
async function loadMessages(taskDir: string): Promise<Message[]> {
  let raw = ''
  try {
    raw = await readFile(join(taskDir, 'messages.jsonl'), 'utf-8')
  } catch {
    // Best effort: a task without a transcript grades against an empty one.
  }
  const nonEmptyLines = raw
    .split('\n')
    .filter((line) => line.trim().length > 0)
  return nonEmptyLines.map((line) => MessageSchema.parse(JSON.parse(line)))
}
/** Lists immediate subdirectories of runDir that contain a metadata.json file. */
async function findTaskDirs(runDir: string): Promise<string[]> {
  const entries = await readdir(runDir, { withFileTypes: true })
  const found: string[] = []
  for (const entry of entries) {
    if (!entry.isDirectory()) continue
    const candidate = join(runDir, entry.name)
    // Missing or unreadable metadata.json means "not a task directory".
    const metadataStat = await stat(join(candidate, 'metadata.json')).catch(
      () => null,
    )
    if (metadataStat?.isFile()) found.push(candidate)
  }
  return found
}
/** Re-runs graders for task artifacts that already contain metadata and messages. */
export async function runGradeCommand(args: GradeCliArgs): Promise<void> {
  const runDirStat = await stat(args.runDir).catch(() => null)
  if (!runDirStat?.isDirectory()) {
    throw new Error(`Not a run directory: ${args.runDir}`)
  }
  const taskDirs = await findTaskDirs(args.runDir)
  if (taskDirs.length === 0) {
    throw new Error(`No task metadata found under ${args.runDir}`)
  }
  let regradedCount = 0
  for (const taskDir of taskDirs) {
    const metadataRaw = await readFile(join(taskDir, 'metadata.json'), 'utf-8')
    const metadata = TaskMetadataSchema.parse(JSON.parse(metadataRaw))
    // Only re-run graders that were configured on the original run.
    const graderNames = Object.keys(metadata.grader_results ?? {})
    if (graderNames.length === 0) {
      console.warn(`Skipping ${metadata.query_id}: no existing grader names`)
      continue
    }
    const messages = await loadMessages(taskDir)
    const graderResults = await runGraders(graderNames, {
      task: {
        query_id: metadata.query_id,
        query: metadata.query,
        dataset: metadata.dataset,
      },
      messages,
      screenshotCount: metadata.screenshot_count ?? metadata.total_steps,
      finalAnswer: metadata.final_answer,
      taskArtifactDir: taskDir,
      outputDir: taskDir,
      mcpUrl: `${process.env.BROWSEROS_SERVER_URL || 'http://127.0.0.1:9110'}/mcp`,
    })
    const saver = new TrajectorySaver(args.runDir, metadata.query_id)
    await saver.updateGraderResults(graderResults)
    regradedCount += 1
  }
  if (regradedCount === 0) {
    throw new Error(
      `No tasks with existing grader names found under ${args.runDir}`,
    )
  }
  console.log(`Re-graded ${regradedCount} task(s) in ${args.runDir}`)
}

View File

@@ -0,0 +1,25 @@
import { publishPathToR2 } from '../../publishing/r2-publisher'
import type { PublishCliArgs, PublishTarget } from '../args'
/** Options for publishing a finished run directory. */
export interface PublishRunOptions {
  runDir: string
  target: PublishTarget
}

/** Publishes run artifacts through the R2 viewer upload path. */
export async function publishRun(options: PublishRunOptions): Promise<void> {
  const { runDir, target } = options
  if (target !== 'r2') {
    throw new Error(`Unsupported publish target: ${target}`)
  }
  const { uploadedRuns, skippedRuns } = await publishPathToR2(runDir)
  // One viewer URL per freshly uploaded run; skipped runs are noted by ID.
  uploadedRuns.forEach((run) => console.log(run.viewerUrl))
  skippedRuns.forEach((runId) =>
    console.log(`${runId}: already uploaded, skipping`),
  )
}

/** CLI adapter for `eval publish`. */
export async function runPublishCommand(args: PublishCliArgs): Promise<void> {
  await publishRun({ runDir: args.runDir, target: args.target })
}

View File

@@ -0,0 +1,21 @@
import type { RunCliArgs } from '../args'
import { runSuiteCommand, type SuiteCommandDeps } from './suite'
/** Executes tasks from a config or suite without publishing artifacts. */
export async function runRunCommand(
  args: RunCliArgs,
  deps: SuiteCommandDeps = {},
): Promise<void> {
  const { configPath, suitePath, variantId, provider, model, apiKey, baseUrl } =
    args
  // Reuse the suite pipeline but never forward a publish target.
  await runSuiteCommand(
    { configPath, suitePath, variantId, provider, model, apiKey, baseUrl },
    deps,
  )
}

View File

@@ -0,0 +1,187 @@
import type { RunEvalOptions, RunEvalResult } from '../../runner/types'
import { runEval as defaultRunEval } from '../../runs/eval-runner'
import {
type AdaptedEvalConfig,
adaptEvalConfigFile,
} from '../../suites/config-adapter'
import { loadSuite } from '../../suites/load-suite'
import { type EvalVariant, resolveVariant } from '../../suites/resolve-variant'
import type { EvalSuite } from '../../suites/schema'
import { type EvalConfig, EvalConfigSchema } from '../../types'
import type { PublishTarget } from '../args'
// Environment-variable lookup shape; callers default it to process.env.
type Env = Record<string, string | undefined>

/** CLI-level inputs accepted by the suite/run commands. */
export interface SuiteCommandOptions {
  configPath?: string  // legacy eval config JSON (mutually exclusive with suitePath)
  suitePath?: string  // suite definition JSON
  variantId?: string  // named variant inside the suite
  provider?: string  // agent overrides applied on top of the variant
  model?: string
  apiKey?: string
  baseUrl?: string
  publishTarget?: PublishTarget  // upload destination after the run, if any
  env?: Env  // injectable env for tests; falls back to process.env
}

/** Resolved CLI input: either an adapted legacy config or a suite + variant. */
export type ResolvedSuiteCommand =
  | (AdaptedEvalConfig & { kind: 'config'; datasetPath?: undefined })
  | {
      kind: 'suite'
      suitePath: string
      suite: EvalSuite
      variant: EvalVariant
      datasetPath: string
      evalConfig: EvalConfig
    }

/** Injectable collaborators so the command can run without real side effects. */
export interface SuiteCommandDeps {
  runEval?: (options: RunEvalOptions) => Promise<RunEvalResult | undefined>
  publishRun?: (options: {
    runDir: string
    target: PublishTarget
  }) => Promise<void>
}
/** Throws when a suite lacks the browseros section needed to execute tasks. */
function ensureRunnableSuite(suite: EvalSuite): void {
  if (suite.browseros) return
  throw new Error('suite browseros config is required to run suite commands')
}
/**
 * Converts a suite + resolved variant into the legacy EvalConfig shape.
 *
 * Shared run settings come from the suite; agent credentials come from the
 * variant. Orchestrator-executor suites may swap the executor to the Clado
 * backend, whose credentials are read from EVAL_EXECUTOR_* env vars with
 * CLADO_ACTION_* values as fallbacks.
 */
function suiteToEvalConfig(
  suite: EvalSuite,
  datasetPath: string,
  variant: EvalVariant,
  env: Env,
): EvalConfig {
  ensureRunnableSuite(suite)
  // Settings shared by every agent type.
  const base = {
    dataset: datasetPath,
    num_workers: suite.workers,
    restart_server_per_task: suite.restartBrowserPerTask,
    browseros: suite.browseros,
    graders: suite.graders,
    timeout_ms: suite.timeoutMs,
    captcha: suite.captcha,
  }
  if (suite.agent.type === 'single' || suite.agent.type === 'tool-loop') {
    // The legacy runner names the BrowserOS tool-loop agent "single".
    return EvalConfigSchema.parse({
      ...base,
      agent: {
        type: 'single',
        provider: variant.agent.provider,
        model: variant.agent.model,
        apiKey: variant.agent.apiKey,
        baseUrl: variant.agent.baseUrl,
        supportsImages: variant.agent.supportsImages,
      },
    })
  }
  // Orchestrator-executor: the executor backend defaults to the tool loop.
  const executorBackend = suite.agent.executorBackend ?? 'tool-loop'
  const executor =
    executorBackend === 'clado'
      ? {
          provider: 'clado-action' as const,
          model:
            env.EVAL_EXECUTOR_MODEL ?? env.CLADO_ACTION_MODEL ?? 'clado-action',
          apiKey: env.EVAL_EXECUTOR_API_KEY ?? env.CLADO_ACTION_API_KEY ?? '',
          baseUrl:
            env.EVAL_EXECUTOR_BASE_URL ??
            env.CLADO_ACTION_BASE_URL ??
            env.CLADO_ACTION_URL,
        }
      : {
          // Non-clado executors reuse the variant's agent credentials.
          provider: variant.agent.provider,
          model: variant.agent.model,
          apiKey: variant.agent.apiKey,
          baseUrl: variant.agent.baseUrl,
        }
  return EvalConfigSchema.parse({
    ...base,
    agent: {
      type: 'orchestrator-executor',
      orchestrator: {
        provider: variant.agent.provider,
        model: variant.agent.model,
        apiKey: variant.agent.apiKey,
        baseUrl: variant.agent.baseUrl,
      },
      executor,
    },
  })
}
/** Resolves config-backed or suite-backed CLI input into the run shape used by the runner. */
export async function resolveSuiteCommand(
  options: SuiteCommandOptions,
): Promise<ResolvedSuiteCommand> {
  const env = options.env ?? process.env
  // Legacy config path: adapt the JSON file directly.
  if (options.configPath) {
    const adapted = await adaptEvalConfigFile(options.configPath, { env })
    return { kind: 'config', ...adapted }
  }
  if (!options.suitePath) {
    throw new Error('suite requires --config or --suite')
  }
  const loaded = await loadSuite(options.suitePath)
  const variant = resolveVariant({
    variantId: options.variantId,
    provider: options.provider,
    model: options.model,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl,
    env,
  })
  const evalConfig = suiteToEvalConfig(
    loaded.suite,
    loaded.datasetPath,
    variant,
    env,
  )
  return {
    kind: 'suite',
    suitePath: loaded.suitePath,
    suite: loaded.suite,
    variant,
    datasetPath: loaded.datasetPath,
    evalConfig,
  }
}
/**
 * Runs the full suite loop: resolve input, execute tasks, then optionally
 * publish the run.
 *
 * Fix: the publisher dependency is now validated BEFORE the (potentially
 * hours-long) eval run starts, so a misconfigured publish fails immediately
 * instead of after the whole run has completed.
 */
export async function runSuiteCommand(
  options: SuiteCommandOptions,
  deps: SuiteCommandDeps = {},
): Promise<void> {
  const publish = deps.publishRun
  // Fail fast: do not burn a full run if publishing cannot happen.
  if (options.publishTarget && !publish) {
    throw new Error('publish requested before the publisher is configured')
  }
  const runEval = deps.runEval ?? defaultRunEval
  const resolved = await resolveSuiteCommand(options)
  const runOptions: RunEvalOptions =
    resolved.kind === 'config'
      ? { configPath: resolved.configPath }
      : {
          configPath: resolved.suitePath,
          dataPath: resolved.datasetPath,
          config: resolved.evalConfig,
        }
  const result = await runEval(runOptions)
  if (!options.publishTarget || !publish) return
  const outputDir = result?.outputDir
  if (!outputDir) {
    throw new Error('publish requested but runner did not return an outputDir')
  }
  await publish({ runDir: outputDir, target: options.publishTarget })
}

View File

@@ -0,0 +1,70 @@
import { startDashboard } from '../dashboard/server'
import { runEval } from '../runs/eval-runner'
import { type EvalCliArgs, parseEvalCliArgs } from './args'
import { runGradeCommand } from './commands/grade'
import { publishRun, runPublishCommand } from './commands/publish'
import { runRunCommand } from './commands/run'
import { runSuiteCommand } from './commands/suite'
/** Returns the multi-line help text covering all subcommands plus the legacy -c flag. */
export function usage(): string {
  return `
BrowserOS Eval
Usage:
bun run eval suite --config <config.json> [--publish r2]
bun run eval suite --suite <suite.json> --variant <id> [--publish r2]
bun run eval run --config <config.json>
bun run eval run --suite <suite.json> --variant <id>
bun run eval grade --run <results/run-dir>
bun run eval publish --run <results/run-dir> --target r2
bun run eval -c <config.json>
`
}
/** Handles the legacy entry points: --help, --config, or dashboard config mode. */
async function runLegacyCommand(args: EvalCliArgs): Promise<void> {
  if (args.command !== 'legacy') return
  if (args.help) {
    console.log(usage())
  } else if (args.configPath) {
    await runEval({ configPath: args.configPath })
  } else {
    // No config: serve the dashboard and block until the process is killed.
    startDashboard({
      tasks: [],
      configName: '',
      agentType: '',
      outputDir: '',
      configMode: true,
    })
    console.log(
      'Dashboard running at http://localhost:9900 — configure and run from the UI',
    )
    await new Promise(() => {})
  }
}
/** Dispatches the eval CLI while preserving the old config/dashboard entry points. */
export async function runCli(
  argv: string[] = Bun.argv.slice(2),
): Promise<void> {
  const args = parseEvalCliArgs(argv)
  if (args.command === 'legacy') {
    await runLegacyCommand(args)
    return
  }
  if (args.command === 'suite') {
    // Only the suite command wires in the real publisher.
    await runSuiteCommand(args, { publishRun })
    return
  }
  if (args.command === 'run') {
    await runRunCommand(args)
    return
  }
  if (args.command === 'grade') {
    await runGradeCommand(args)
    return
  }
  await runPublishCommand(args)
}

View File

@@ -1,5 +1,5 @@
import { mkdir, readdir, readFile, stat } from 'node:fs/promises'
import { join, resolve } from 'node:path'
import { dirname, join, resolve, sep } from 'node:path'
import { Hono } from 'hono'
import { streamSSE } from 'hono/streaming'
import { ParallelExecutor } from '../runner/parallel-executor'
@@ -128,6 +128,35 @@ let dashboardConfigMode = false
const configsDir = join(import.meta.dir, '..', '..', 'configs')
const projectRoot = resolve(import.meta.dir, '..', '..', '..', '..')
/** Recursively collects .json files under dir as sorted, POSIX-style relative paths. */
async function listConfigFiles(dir: string, prefix = ''): Promise<string[]> {
  const entries = await readdir(join(dir, prefix), { withFileTypes: true })
  const collected: string[] = []
  for (const entry of entries) {
    const relativePath = prefix ? join(prefix, entry.name) : entry.name
    if (entry.isFile() && entry.name.endsWith('.json')) {
      // Normalize to forward slashes so clients see stable paths on Windows.
      collected.push(relativePath.split(sep).join('/'))
    } else if (entry.isDirectory()) {
      const nested = await listConfigFiles(dir, relativePath)
      collected.push(...nested)
    }
  }
  return collected.sort()
}
/** Maps a client-supplied config name to an absolute path, rejecting traversal. */
function resolveConfigPath(name: string): string | null {
  if (!name.endsWith('.json')) return null
  // Reject empty, '.', and '..' segments before touching the filesystem.
  const segments = name.split('/')
  const hasBadSegment = segments.some(
    (part) => part.length === 0 || part === '.' || part === '..',
  )
  if (hasBadSegment) return null
  const resolvedConfigsDir = resolve(configsDir)
  const rootPrefix = resolvedConfigsDir.endsWith(sep)
    ? resolvedConfigsDir
    : `${resolvedConfigsDir}${sep}`
  const resolvedPath = resolve(configsDir, name)
  // Belt-and-braces: the resolved path must stay inside the configs root.
  return resolvedPath.startsWith(rootPrefix) ? resolvedPath : null
}
// ============================================================================
// Hono App
// ============================================================================
@@ -339,21 +368,21 @@ app.get('/api/mode', (c) => {
// List saved config files
app.get('/api/configs', async (c) => {
try {
const files = await readdir(configsDir)
return c.json(files.filter((f) => f.endsWith('.json')))
return c.json(await listConfigFiles(configsDir))
} catch {
return c.json([])
}
})
// Read a specific config file
app.get('/api/config/:name', async (c) => {
const name = c.req.param('name')
if (name.includes('/') || name.includes('..')) {
app.get('/api/config/*', async (c) => {
const name = decodeURIComponent(c.req.path.slice('/api/config/'.length))
const configPath = resolveConfigPath(name)
if (!configPath) {
return c.json({ error: 'Invalid config name' }, 400)
}
try {
const content = await readFile(join(configsDir, name), 'utf-8')
const content = await readFile(configPath, 'utf-8')
return c.json(JSON.parse(content))
} catch {
return c.notFound()
@@ -382,8 +411,17 @@ app.post('/api/run', async (c) => {
const config = parseResult.data
// Resolve relative paths from configs/ dir (dataset dropdown values are relative to it)
const baseDir = configsDir
let baseDir = configsDir
if (body.configName) {
const configPath = resolveConfigPath(body.configName)
if (!configPath) {
return c.json({ error: 'Invalid config name' }, 400)
}
baseDir = dirname(configPath)
}
// Resolve relative paths from the loaded config location. Unsaved dashboard
// configs keep using apps/eval/configs as their base for dropdown values.
const datasetPath = resolve(
config.dataset.startsWith('/')
? config.dataset

View File

@@ -1,5 +1,12 @@
import { spawn } from 'node:child_process'
import { join } from 'node:path'
import {
writeGraderJsonArtifact,
writeGraderTextArtifact,
} from '../../grading/artifacts'
import {
type PythonEvaluatorResult,
runPythonJsonEvaluator,
} from '../../grading/python-evaluator'
import type { GraderResult } from '../../types'
import { callMcpTool } from '../../utils/mcp-client'
import type { Grader, GraderInput } from '../types'
@@ -7,12 +14,23 @@ import type { Grader, GraderInput } from '../types'
const EVAL_SCRIPT = join(
import.meta.dirname,
'..',
'..',
'..',
'scripts',
'python',
'agisdk-evaluate.py',
)
interface AgisdkEvaluatorInput {
task_id: string
env_state: Record<string, unknown>
model_response: string
}
interface AgisdkEvaluatorOutput {
reward: number
pass: boolean
message: string
per_criterion: unknown[]
}
export class AgisdkStateDiffGrader implements Grader {
name = 'agisdk_state_diff'
@@ -36,6 +54,16 @@ export class AgisdkStateDiffGrader implements Grader {
let envState: Record<string, unknown>
try {
envState = await this.fetchFinishState(origin, mcpEndpoint)
await writeGraderJsonArtifact(
input,
this.name,
'finish-state.json',
envState,
)
await writeGraderJsonArtifact(input, this.name, 'context.json', {
origin,
agisdk_task_id: taskId,
})
} catch (error) {
return {
score: 0,
@@ -46,10 +74,30 @@ export class AgisdkStateDiffGrader implements Grader {
}
try {
const result = await this.runPythonEvaluator(
taskId,
envState,
input.finalAnswer || '',
const evaluatorInput: AgisdkEvaluatorInput = {
task_id: taskId,
env_state: envState,
model_response: input.finalAnswer || '',
}
await writeGraderJsonArtifact(
input,
this.name,
'evaluator-input.json',
evaluatorInput,
)
const evaluation = await this.runPythonEvaluator(evaluatorInput)
const result = evaluation.output
await writeGraderJsonArtifact(
input,
this.name,
'evaluator-output.json',
result,
)
await writeGraderTextArtifact(
input,
this.name,
'stderr.txt',
evaluation.stderr,
)
return {
score: result.reward,
@@ -144,59 +192,12 @@ export class AgisdkStateDiffGrader implements Grader {
}
private runPythonEvaluator(
taskId: string,
envState: Record<string, unknown>,
modelResponse: string,
): Promise<{
reward: number
pass: boolean
message: string
per_criterion: unknown[]
}> {
return new Promise((resolve, reject) => {
const proc = spawn('python3', [EVAL_SCRIPT], {
stdio: ['pipe', 'pipe', 'pipe'],
})
const inputData = JSON.stringify({
task_id: taskId,
env_state: envState,
model_response: modelResponse,
})
let stdout = ''
let stderr = ''
proc.stdout.on('data', (data: Buffer) => {
stdout += data.toString()
})
proc.stderr.on('data', (data: Buffer) => {
stderr += data.toString()
})
proc.on('close', (code) => {
if (code !== 0) {
reject(
new Error(`Python evaluator exited with code ${code}: ${stderr}`),
)
return
}
try {
const result = JSON.parse(stdout.trim())
resolve(result)
} catch {
reject(new Error(`Failed to parse evaluator output: ${stdout}`))
}
})
proc.on('error', (err) => {
reject(new Error(`Failed to spawn Python evaluator: ${err.message}`))
})
proc.stdin.write(inputData)
proc.stdin.end()
evalInput: AgisdkEvaluatorInput,
): Promise<PythonEvaluatorResult<AgisdkEvaluatorOutput>> {
return runPythonJsonEvaluator<AgisdkEvaluatorOutput>({
scriptPath: EVAL_SCRIPT,
input: evalInput,
timeoutMs: 300_000,
})
}
}

View File

@@ -1,4 +1,12 @@
import { join, resolve } from 'node:path'
import {
writeGraderJsonArtifact,
writeGraderTextArtifact,
} from '../../grading/artifacts'
import {
type PythonEvaluatorResult,
runPythonJsonEvaluator,
} from '../../grading/python-evaluator'
import type { GraderResult } from '../../types'
import type { Grader, GraderInput } from '../types'
@@ -14,10 +22,7 @@ interface InfinityEvalOutput {
message: string
}
const EVAL_SCRIPT = resolve(
import.meta.dir,
'../../../scripts/infinity-evaluate.py',
)
const EVAL_SCRIPT = resolve(import.meta.dir, '../python/infinity-evaluate.py')
export class InfinityStateGrader implements Grader {
name = 'infinity_state'
@@ -66,7 +71,32 @@ export class InfinityStateGrader implements Grader {
}
try {
const result = await this.runPythonEvaluator(evalInput)
await writeGraderJsonArtifact(input, this.name, 'verifier.json', {
appName: parsed.appName,
taskId: parsed.taskId,
verifierPath,
appServerUrl,
})
await writeGraderJsonArtifact(
input,
this.name,
'evaluator-input.json',
evalInput,
)
const evaluation = await this.runPythonEvaluator(evalInput)
const result = evaluation.output
await writeGraderJsonArtifact(
input,
this.name,
'evaluator-output.json',
result,
)
await writeGraderTextArtifact(
input,
this.name,
'stderr.txt',
evaluation.stderr,
)
return {
score: result.pass ? 1 : 0,
pass: result.pass,
@@ -108,27 +138,11 @@ export class InfinityStateGrader implements Grader {
private async runPythonEvaluator(
evalInput: InfinityEvalInput,
): Promise<InfinityEvalOutput> {
const proc = Bun.spawn(['python3', EVAL_SCRIPT], {
stdin: 'pipe',
stdout: 'pipe',
stderr: 'pipe',
): Promise<PythonEvaluatorResult<InfinityEvalOutput>> {
return runPythonJsonEvaluator<InfinityEvalOutput>({
scriptPath: EVAL_SCRIPT,
input: evalInput,
timeoutMs: 300_000,
})
const inputJson = JSON.stringify(evalInput)
proc.stdin.write(inputJson)
proc.stdin.end()
const stdout = await new Response(proc.stdout).text()
const stderr = await new Response(proc.stderr).text()
const exitCode = await proc.exited
if (exitCode !== 0) {
throw new Error(
`Python evaluator exited with code ${exitCode}: ${stderr || stdout}`,
)
}
return JSON.parse(stdout.trim()) as InfinityEvalOutput
}
}

View File

@@ -1,6 +1,7 @@
import { readFile } from 'node:fs/promises'
import { join } from 'node:path'
import { query } from '@anthropic-ai/claude-agent-sdk'
import { writeGraderJsonArtifact } from '../../grading/artifacts'
import type { GraderResult } from '../../types'
import type { Grader, GraderInput } from '../types'
import {
@@ -63,6 +64,7 @@ export class PerformanceGrader implements Grader {
input.screenshotCount,
terminationReason,
)
await writeGraderJsonArtifact(input, this.name, 'metrics.json', metrics)
const systemPrompt = PERFORMANCE_SYSTEM_PROMPT.replace(
/\{screenshot_count\}/g,
@@ -82,6 +84,14 @@ export class PerformanceGrader implements Grader {
userPrompt,
input.outputDir,
)
if (response) {
await writeGraderJsonArtifact(
input,
this.name,
'agent-output.json',
response,
)
}
if (!response) {
return {
@@ -140,6 +150,7 @@ export class PerformanceGrader implements Grader {
`Perf grader: LLM returned ${returnedAxes.size}/${expectedAxes.size} axes, missing: ${missingAxes.join(', ')}`,
)
}
await writeGraderJsonArtifact(input, this.name, 'axes.json', axisResults)
return {
score: compositeScore / 100,

View File

@@ -1,51 +1,2 @@
import type { GraderResult } from '../types'
import { AgisdkStateDiffGrader } from './benchmark/agisdk-state-diff'
import { InfinityStateGrader } from './benchmark/infinity-state'
import { PerformanceGrader } from './performance/performance-grader'
import type { Grader, GraderInput } from './types'
export const PASS_FAIL_GRADER_ORDER = [
'agisdk_state_diff',
'infinity_state',
'performance_grader',
] as const
export function createGrader(name: string): Grader | null {
switch (name) {
case 'agisdk_state_diff':
return new AgisdkStateDiffGrader()
case 'infinity_state':
return new InfinityStateGrader()
case 'performance_grader':
return new PerformanceGrader()
default:
console.warn(`Unknown grader: ${name}`)
return null
}
}
export async function runGraders(
graderNames: string[],
input: GraderInput,
): Promise<Record<string, GraderResult>> {
const results: Record<string, GraderResult> = {}
for (const name of graderNames) {
const grader = createGrader(name)
if (!grader) continue
try {
console.log(` Running grader: ${name}`)
results[name] = await grader.grade(input)
} catch (error) {
results[name] = {
score: 0,
pass: false,
reasoning: `Error running grader: ${error}`,
}
}
}
return results
}
export { AgisdkStateDiffGrader, InfinityStateGrader, PerformanceGrader }
export * from '../grading/grader-registry'
export { runConfiguredGraders, runGraders } from '../grading/grader-runner'

View File

@@ -1,21 +1 @@
import type { GraderResult, Message } from '../types'
export interface GraderInput {
task: {
query_id: string
query: string
dataset: string
}
messages: Message[]
screenshotCount: number
finalAnswer: string | null
expectedAnswer?: string | null
outputDir: string
mcpUrl?: string
infinityAppUrl?: string
}
export interface Grader {
name: string
grade(input: GraderInput): Promise<GraderResult>
}
export type { Grader, GraderInput } from '../grading/types'

View File

@@ -0,0 +1,34 @@
import { mkdir, writeFile } from 'node:fs/promises'
import { join } from 'node:path'
import type { GraderInput } from './types'
/** Computes <task-artifact-dir>/grader-artifacts/<grader> for a grader's files. */
function artifactDir(input: GraderInput, graderName: string): string {
  // Fall back to outputDir when taskArtifactDir is empty or unset.
  const base = input.taskArtifactDir || input.outputDir
  return join(base, 'grader-artifacts', graderName)
}
/** Writes a pretty-printed JSON artifact for a grader under the task artifact directory. */
export async function writeGraderJsonArtifact(
  input: GraderInput,
  graderName: string,
  filename: string,
  value: unknown,
): Promise<void> {
  const targetDir = artifactDir(input, graderName)
  await mkdir(targetDir, { recursive: true })
  const serialized = JSON.stringify(value, null, 2)
  await writeFile(join(targetDir, filename), serialized)
}
/** Writes a plain-text artifact (e.g. evaluator stderr) for a grader. */
export async function writeGraderTextArtifact(
  input: GraderInput,
  graderName: string,
  filename: string,
  value: string,
): Promise<void> {
  const targetDir = artifactDir(input, graderName)
  await mkdir(targetDir, { recursive: true })
  await writeFile(join(targetDir, filename), value)
}

View File

@@ -0,0 +1,26 @@
import { AgisdkStateDiffGrader } from '../graders/benchmark/agisdk-state-diff'
import { InfinityStateGrader } from '../graders/benchmark/infinity-state'
import { PerformanceGrader } from '../graders/performance/performance-grader'
import type { Grader } from './types'
/** Canonical order for reporting pass/fail graders in summaries. */
export const PASS_FAIL_GRADER_ORDER = [
  'agisdk_state_diff',
  'infinity_state',
  'performance_grader',
] as const

/** Instantiates a grader by name; unknown names warn and return null. */
export function createGrader(name: string): Grader | null {
  const factories: Record<string, () => Grader> = {
    agisdk_state_diff: () => new AgisdkStateDiffGrader(),
    infinity_state: () => new InfinityStateGrader(),
    performance_grader: () => new PerformanceGrader(),
  }
  const factory = factories[name]
  if (!factory) {
    console.warn(`Unknown grader: ${name}`)
    return null
  }
  return factory()
}
export { AgisdkStateDiffGrader, InfinityStateGrader, PerformanceGrader }

View File

@@ -0,0 +1,36 @@
import type { GraderResult } from '../types'
import { createGrader as defaultCreateGrader } from './grader-registry'
import type { Grader, GraderInput } from './types'
/** Injection point so tests can supply fake graders. */
export interface GraderRunnerDeps {
  createGrader?: (name: string) => Grader | null
}

/** Runs configured graders independently so one failure does not hide others. */
export async function runConfiguredGraders(
  graderNames: string[],
  input: GraderInput,
  deps: GraderRunnerDeps = {},
): Promise<Record<string, GraderResult>> {
  const create = deps.createGrader ?? defaultCreateGrader
  const results: Record<string, GraderResult> = {}
  for (const name of graderNames) {
    const grader = create(name)
    // Unknown graders are skipped (createGrader already warned about them).
    if (grader === null) continue
    console.log(` Running grader: ${name}`)
    try {
      results[name] = await grader.grade(input)
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error)
      // A throwing grader records a failing result instead of aborting the run.
      results[name] = {
        score: 0,
        pass: false,
        reasoning: `Error running grader: ${message}`,
      }
    }
  }
  return results
}

/** Back-compat alias for the old graders/index export name. */
export const runGraders = runConfiguredGraders

View File

@@ -0,0 +1,65 @@
/** Input contract for a stdin-JSON / stdout-JSON Python evaluator script. */
export interface PythonEvaluatorOptions {
  scriptPath: string  // path to the python3 script to spawn
  input: unknown  // serialized to JSON and written to the child's stdin
  timeoutMs: number  // hard SIGKILL deadline for the child process
}

/** Parsed evaluator result plus raw process streams for artifact capture. */
export interface PythonEvaluatorResult<T> {
  output: T  // stdout parsed as a single JSON document
  stdout: string
  stderr: string
  exitCode: number  // always 0 here; non-zero exits throw instead
}
/** Runs a Python evaluator that accepts stdin JSON and emits stdout JSON. */
export async function runPythonJsonEvaluator<T>(
  options: PythonEvaluatorOptions,
): Promise<PythonEvaluatorResult<T>> {
  // Spawn python3 with all three stdio streams piped.
  const proc = Bun.spawn(['python3', options.scriptPath], {
    stdin: 'pipe',
    stdout: 'pipe',
    stderr: 'pipe',
  })
  // Feed the JSON payload and close stdin so the script can start reading.
  proc.stdin.write(JSON.stringify(options.input))
  proc.stdin.end()
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined
  // Hard deadline: SIGKILL the child and reject when it runs too long.
  const timeout = new Promise<never>((_, reject) => {
    timeoutHandle = setTimeout(() => {
      proc.kill('SIGKILL')
      reject(
        new Error(`Python evaluator timed out after ${options.timeoutMs}ms`),
      )
    }, options.timeoutMs)
  })
  // Drain stdout/stderr fully before checking the exit status.
  const completed = (async (): Promise<PythonEvaluatorResult<T>> => {
    const stdout = await new Response(proc.stdout).text()
    const stderr = await new Response(proc.stderr).text()
    const exitCode = await proc.exited
    if (exitCode !== 0) {
      throw new Error(
        `Python evaluator exited with code ${exitCode}: ${stderr || stdout}`,
      )
    }
    try {
      // The script's entire stdout is expected to be one JSON document.
      return {
        output: JSON.parse(stdout.trim()) as T,
        stdout,
        stderr,
        exitCode,
      }
    } catch {
      throw new Error(`Failed to parse Python evaluator output: ${stdout}`)
    }
  })()
  try {
    // First settled promise wins; Promise.race observes both rejections.
    return await Promise.race([completed, timeout])
  } finally {
    // Always cancel the timer once the race settles.
    clearTimeout(timeoutHandle)
  }
}

View File

@@ -0,0 +1,22 @@
import type { GraderResult, Message } from '../types'
/** Everything a grader needs to score one finished task. */
export interface GraderInput {
  task: {
    query_id: string
    query: string
    dataset: string
  }
  messages: Message[]  // full agent transcript for the task
  screenshotCount: number
  finalAnswer: string | null  // agent's final answer, if it produced one
  expectedAnswer?: string | null  // gold answer when the dataset provides one
  taskArtifactDir: string  // where grader artifacts are written
  outputDir: string  // fallback artifact location when taskArtifactDir is empty
  mcpUrl?: string  // BrowserOS MCP endpoint for state-inspection graders
  infinityAppUrl?: string  // NOTE(review): presumably the WebArena-Infinity app base URL — confirm
}

/** Contract implemented by every grader. */
export interface Grader {
  name: string
  grade(input: GraderInput): Promise<GraderResult>
}

View File

@@ -1,73 +1,10 @@
#!/usr/bin/env bun
import { parseArgs } from 'node:util'
import { runEval } from './runner/eval-runner'
import { runCli } from './cli'
const { values } = parseArgs({
args: Bun.argv.slice(2),
options: {
config: { type: 'string', short: 'c' },
help: { type: 'boolean', short: 'h' },
},
})
if (values.help) {
console.log(`
BrowserOS Eval
Usage:
bun run eval # Opens dashboard in config mode
bun run eval --config <config.json> # Runs eval with config file
Available agent types:
- single Single LLM agent driven by the BrowserOS tool loop
- orchestrator-executor High-level planner + visual/text executor
Available graders:
- performance_grader Multi-axis grader using Claude Agent SDK
- agisdk_state_diff AGI SDK / REAL Bench state-diff grader
- infinity_state WebArena-Infinity verifier-script grader
Preset configs in configs/:
- browseros-agent-weekly.json Weekly eval (single agent)
- browseros-oe-agent-weekly.json Weekly eval (orchestrator + LLM executor)
- browseros-oe-clado-weekly.json Weekly eval (orchestrator + Clado executor)
- agisdk-real-smoke.json AGI SDK smoke run (1 task)
- agisdk-real.json AGI SDK full run (36 tasks)
- infinity-hard-50.json WebArena-Infinity hard-50 set
- test-webvoyager.json WebVoyager test
- test-mind2web.json Mind2Web test
Examples:
bun run eval # Dashboard config mode
bun run eval -c configs/browseros-agent-weekly.json
bun run eval -c configs/test-webvoyager.json
`)
process.exit(0)
}
if (values.config) {
try {
await runEval({ configPath: values.config })
} catch (error) {
console.error(error instanceof Error ? error.message : String(error))
process.exit(1)
}
process.exit(0)
} else {
// No config — start dashboard in config mode, wait for user to configure and run
const { startDashboard } = await import('./dashboard/server')
startDashboard({
tasks: [],
configName: '',
agentType: '',
outputDir: '',
configMode: true,
})
console.log(
'Dashboard running at http://localhost:9900 — configure and run from the UI',
)
// Keep process alive until SIGINT
await new Promise(() => {})
try {
await runCli(Bun.argv.slice(2))
} catch (error) {
console.error(error instanceof Error ? error.message : String(error))
process.exit(1)
}

View File

@@ -0,0 +1,41 @@
/** Credentials and addressing for the Cloudflare R2 upload target. */
export interface R2UploadConfig {
  accountId: string
  accessKeyId: string
  secretAccessKey: string
  bucket: string
  cdnBaseUrl: string  // public base URL used to build viewer links
}

/** Per-task summary row stored in a run's manifest. */
export interface R2ManifestTask {
  queryId: string
  query: string
  startUrl: string
  status: string
  durationMs: number
  screenshotCount: number
  graderResults: Record<string, unknown>
}

/** Manifest describing one uploaded run. */
export interface R2RunManifest {
  runId: string
  uploadedAt: string  // presumably an ISO timestamp — confirm against the publisher
  agentConfig?: Record<string, unknown>
  dataset?: string
  summary?: {
    passRate?: unknown
    avgDurationMs?: unknown
  }
  tasks: R2ManifestTask[]
}

/** Result of uploading a single run. */
export interface R2PublishRunResult {
  runId: string
  uploadedFiles: number
  viewerUrl: string
  manifest: R2RunManifest
}

/** Aggregate result over a path that may contain several runs. */
export interface R2PublishPathResult {
  uploadedRuns: R2PublishRunResult[]
  skippedRuns: string[]  // IDs of runs that were already uploaded and skipped
}

View File

@@ -0,0 +1,425 @@
import { readdir, readFile, stat } from 'node:fs/promises'
import { basename, dirname, extname, join } from 'node:path'
import {
GetObjectCommand,
PutObjectCommand,
S3Client,
} from '@aws-sdk/client-s3'
import type {
R2ManifestTask,
R2PublishPathResult,
R2PublishRunResult,
R2RunManifest,
R2UploadConfig,
} from './r2-manifest'
// Default number of concurrent uploads per run.
const DEFAULT_CONCURRENCY = 20
// Known artifact extensions → MIME types; anything else falls back to
// application/octet-stream. Keys are lowercase.
const CONTENT_TYPES: Record<string, string> = {
  '.json': 'application/json',
  '.jsonl': 'application/x-ndjson',
  '.png': 'image/png',
  '.html': 'text/html',
}
/** Minimal S3-compatible client surface, so a fake can be injected in tests. */
export interface R2Client {
  send(command: unknown): Promise<unknown>
}
/** Construction options for R2Publisher. */
export interface R2PublisherOptions {
  config: R2UploadConfig
  // Injectable client; defaults to a real S3 client for the account.
  client?: R2Client
  // Local path of the dashboard viewer page; defaults to ../dashboard/viewer.html.
  viewerPath?: string
  // Max parallel uploads; defaults to DEFAULT_CONCURRENCY.
  concurrency?: number
  // Clock override so manifests can have deterministic timestamps in tests.
  now?: () => Date
}
/** One file queued for upload. */
interface UploadJob {
  key: string
  filePath: string
  contentType: string
}
/** A discovered task directory within a run directory. */
interface TaskDirEntry {
  taskId: string
  taskPath: string
  canonicalLayout: boolean
}
/**
 * Maps a file path to the Content-Type it should be uploaded with.
 * The extension is lowercased so `.PNG` resolves the same as `.png`
 * (previously uppercase extensions fell through to octet-stream).
 */
export function contentTypeForPath(filePath: string): string {
  return (
    CONTENT_TYPES[extname(filePath).toLowerCase()] || 'application/octet-stream'
  )
}
/**
 * Builds the R2 upload config from environment variables.
 * Requires EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID and
 * EVAL_R2_SECRET_ACCESS_KEY; bucket and CDN base URL have defaults.
 * Trailing slashes on the CDN base URL are stripped.
 */
export function loadR2ConfigFromEnv(
  env: Record<string, string | undefined> = process.env,
): R2UploadConfig {
  const accountId = env.EVAL_R2_ACCOUNT_ID
  const accessKeyId = env.EVAL_R2_ACCESS_KEY_ID
  const secretAccessKey = env.EVAL_R2_SECRET_ACCESS_KEY
  const allPresent = Boolean(accountId && accessKeyId && secretAccessKey)
  if (!allPresent) {
    throw new Error(
      'Missing required env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY',
    )
  }
  const rawCdnBase = env.EVAL_R2_CDN_BASE_URL || 'https://eval.browseros.com'
  return {
    accountId: accountId as string,
    accessKeyId: accessKeyId as string,
    secretAccessKey: secretAccessKey as string,
    bucket: env.EVAL_R2_BUCKET || 'browseros-eval',
    cdnBaseUrl: rawCdnBase.replace(/\/+$/, ''),
  }
}
/**
 * Builds an S3 client pointed at the account's Cloudflare R2 endpoint.
 * R2 speaks the S3 API; region is always 'auto' and the endpoint is
 * derived from the account ID.
 */
export function createR2Client(config: R2UploadConfig): S3Client {
  return new S3Client({
    region: 'auto',
    endpoint: `https://${config.accountId}.r2.cloudflarestorage.com`,
    credentials: {
      accessKeyId: config.accessKeyId,
      secretAccessKey: config.secretAccessKey,
    },
  })
}
/**
 * Recursively lists every file under `dir`, depth-first, returning
 * absolute paths. Directories themselves are not included.
 */
async function collectFiles(dir: string): Promise<string[]> {
  const collected: string[] = []
  for (const entry of await readdir(dir, { withFileTypes: true })) {
    const entryPath = join(dir, entry.name)
    if (entry.isDirectory()) {
      const nested = await collectFiles(entryPath)
      collected.push(...nested)
    } else {
      collected.push(entryPath)
    }
  }
  return collected
}
/**
 * Runs `fn` over `items` with at most `concurrency` calls in flight.
 * A non-positive or non-numeric `concurrency` is clamped to 1 so the pool
 * always makes progress — previously a value of 0 spawned zero workers and
 * resolved without processing any item. Rejections from `fn` propagate
 * through Promise.all.
 */
async function runPool<T>(
  items: T[],
  concurrency: number,
  fn: (item: T) => Promise<void>,
): Promise<void> {
  // `|| 1` also rescues NaN from a non-numeric input before clamping.
  const workerCount = Math.max(1, Math.floor(concurrency) || 1)
  let i = 0
  const workers = Array.from({ length: workerCount }, async () => {
    // Workers pull the next index cooperatively; single-threaded async makes
    // the post-increment race-free.
    while (i < items.length) {
      const idx = i++
      await fn(items[idx])
    }
  })
  await Promise.all(workers)
}
/** True when `dir` contains a regular file named metadata.json. */
async function hasMetadata(dir: string): Promise<boolean> {
  try {
    const info = await stat(join(dir, 'metadata.json'))
    return info.isFile()
  } catch {
    // Missing file or unreadable directory counts as "no metadata".
    return false
  }
}
/**
 * Discovers task directories within a run directory.
 * Two layouts exist: legacy (task dirs directly under the run dir,
 * excluding `tasks/`) and canonical (task dirs under `tasks/`). A task dir
 * is any subdirectory holding a metadata.json. Legacy entries win when
 * both layouts are present.
 */
async function findTaskDirs(runDir: string): Promise<TaskDirEntry[]> {
  const legacy: TaskDirEntry[] = []
  for (const dirent of await readdir(runDir, { withFileTypes: true })) {
    if (!dirent.isDirectory() || dirent.name === 'tasks') continue
    const candidate = join(runDir, dirent.name)
    if (await hasMetadata(candidate)) {
      legacy.push({
        taskId: dirent.name,
        taskPath: candidate,
        canonicalLayout: false,
      })
    }
  }
  const canonical: TaskDirEntry[] = []
  const tasksRoot = join(runDir, 'tasks')
  // The tasks/ folder may not exist; treat that as an empty listing.
  const canonicalDirents = await readdir(tasksRoot, {
    withFileTypes: true,
  }).catch(() => [])
  for (const dirent of canonicalDirents) {
    if (!dirent.isDirectory()) continue
    const candidate = join(tasksRoot, dirent.name)
    if (await hasMetadata(candidate)) {
      canonical.push({
        taskId: dirent.name,
        taskPath: candidate,
        canonicalLayout: true,
      })
    }
  }
  return legacy.length > 0 ? legacy : canonical
}
async function isRunDir(dir: string): Promise<boolean> {
return (await findTaskDirs(dir)).length > 0
}
/**
 * Lists regular files sitting directly in the run directory (not in task
 * subdirectories) as upload jobs keyed by bare filename.
 */
async function collectRunRootFiles(runDir: string): Promise<UploadJob[]> {
  const jobs: UploadJob[] = []
  for (const entry of await readdir(runDir, { withFileTypes: true })) {
    if (!entry.isFile()) continue
    const filePath = join(runDir, entry.name)
    jobs.push({
      key: entry.name,
      filePath,
      contentType: contentTypeForPath(filePath),
    })
  }
  return jobs
}
/**
 * Derives a task's manifest status from its metadata: the termination
 * reason as-is ('completed', 'timeout', …), or 'unknown' when absent.
 */
function statusFromMetadata(meta: Record<string, unknown>): string {
  const reason = meta.termination_reason as string | undefined
  if (reason === 'completed') {
    return 'completed'
  }
  return reason ?? 'unknown'
}
/**
 * Derives a run ID from a run directory path as `<configName>-<timestamp>`,
 * where the timestamp is the directory's own name and the config name is
 * its parent directory's name.
 */
function runIdForDir(runDir: string): string {
  const configName = basename(dirname(runDir))
  const timestamp = basename(runDir)
  return [configName, timestamp].join('-')
}
/**
 * Publishes eval artifacts in the viewer-compatible R2 layout.
 *
 * Keys written per run:
 *   runs/<runId>/<file>                  run-root files (e.g. summary.json)
 *   runs/<runId>/<taskId>/...            every task file
 *   runs/<runId>/tasks/<taskId>/...      duplicate keys for canonical-layout runs
 *   runs/<runId>/manifest.json           generated manifest
 *   viewer.html                          re-uploaded on every publishRun
 */
export class R2Publisher {
  private readonly client: R2Client
  private readonly config: R2UploadConfig
  // Local path of the dashboard viewer page uploaded alongside each run.
  private readonly viewerPath: string
  // Max parallel uploads through runPool.
  private readonly concurrency: number
  // Injectable clock used for the manifest's uploadedAt timestamp.
  private readonly now: () => Date
  constructor(options: R2PublisherOptions) {
    this.config = options.config
    // A fake client may be injected; otherwise build a real R2/S3 client.
    this.client = options.client ?? createR2Client(options.config)
    this.viewerPath =
      options.viewerPath ??
      join(import.meta.dirname, '..', 'dashboard', 'viewer.html')
    this.concurrency = options.concurrency ?? DEFAULT_CONCURRENCY
    this.now = options.now ?? (() => new Date())
  }
  /**
   * True when the run's manifest object already exists in the bucket.
   * NOTE(review): every failure (including network errors) maps to false,
   * which would trigger a re-upload — confirm that is the intended behavior.
   */
  async isUploaded(runId: string): Promise<boolean> {
    try {
      await this.client.send(
        new GetObjectCommand({
          Bucket: this.config.bucket,
          Key: `runs/${runId}/manifest.json`,
        }),
      )
      return true
    } catch {
      return false
    }
  }
  /**
   * Publishes either a single run directory, or a config directory that
   * holds timestamped run subdirectories. Already-uploaded runs (manifest
   * present) are skipped. Throws when the path is not a directory or
   * contains no run subdirectories.
   */
  async publishPath(inputDir: string): Promise<R2PublishPathResult> {
    const dirStat = await stat(inputDir).catch(() => null)
    if (!dirStat?.isDirectory()) {
      throw new Error(`Not a directory: ${inputDir}`)
    }
    // Case 1: inputDir is itself a run dir (contains task dirs).
    if (await isRunDir(inputDir)) {
      const result = await this.publishRun(inputDir, runIdForDir(inputDir))
      return { uploadedRuns: [result], skippedRuns: [] }
    }
    // Case 2: inputDir is a config dir; each subdirectory is one run.
    const configName = basename(inputDir)
    const entries = await readdir(inputDir, { withFileTypes: true })
    const runDirs = entries
      .filter((entry) => entry.isDirectory())
      .map((entry) => entry.name)
      .sort()
    if (runDirs.length === 0) {
      throw new Error('No run subdirectories found')
    }
    const uploadedRuns: R2PublishRunResult[] = []
    const skippedRuns: string[] = []
    for (const dir of runDirs) {
      const runId = `${configName}-${dir}`
      if (await this.isUploaded(runId)) {
        skippedRuns.push(runId)
        continue
      }
      uploadedRuns.push(await this.publishRun(join(inputDir, dir), runId))
    }
    return { uploadedRuns, skippedRuns }
  }
  /**
   * Uploads one run: all task files, run-root files, a generated
   * manifest.json, and the viewer page. Returns the manifest plus the
   * public viewer URL. Throws when the run has no task directories or no
   * task with readable metadata.
   */
  async publishRun(
    runDir: string,
    runId: string = runIdForDir(runDir),
  ): Promise<R2PublishRunResult> {
    const taskEntries = await findTaskDirs(runDir)
    if (taskEntries.length === 0) {
      throw new Error(`No task subdirectories in ${runId}`)
    }
    const manifestTasks: R2ManifestTask[] = []
    // Seed the job list with run-root files, re-keyed under runs/<runId>/.
    const jobs: UploadJob[] = (await collectRunRootFiles(runDir)).map(
      (job) => ({
        ...job,
        key: `runs/${runId}/${job.key}`,
      }),
    )
    let agentConfig: Record<string, unknown> | undefined
    let dataset: string | undefined
    for (const taskDirEntry of taskEntries) {
      const { taskId, taskPath } = taskDirEntry
      const meta = await this.readMetadata(taskPath)
      // Tasks with missing/unparsable metadata are silently excluded.
      if (!meta) continue
      // The first task carrying these fields supplies the run-level values.
      if (!agentConfig && meta.agent_config) {
        agentConfig = meta.agent_config as Record<string, unknown>
      }
      if (!dataset && meta.dataset) dataset = meta.dataset as string
      const files = await collectFiles(taskPath)
      let screenshotCount = 0
      for (const file of files) {
        const relative = file.slice(taskPath.length + 1)
        if (relative.startsWith('screenshots/') && extname(file) === '.png') {
          screenshotCount++
        }
        jobs.push({
          key: `runs/${runId}/${taskId}/${relative}`,
          filePath: file,
          contentType: contentTypeForPath(file),
        })
        // Canonical-layout runs are mirrored under tasks/ as well, so
        // consumers of either key layout can read them.
        if (taskDirEntry.canonicalLayout) {
          jobs.push({
            key: `runs/${runId}/tasks/${taskId}/${relative}`,
            filePath: file,
            contentType: contentTypeForPath(file),
          })
        }
      }
      manifestTasks.push({
        queryId: (meta.query_id as string | undefined) || taskId,
        query: (meta.query as string | undefined) || '',
        startUrl: (meta.start_url as string | undefined) || '',
        status: statusFromMetadata(meta),
        durationMs: (meta.total_duration_ms as number | undefined) || 0,
        // Metadata's count wins; fall back to counting screenshot PNGs.
        screenshotCount:
          (meta.screenshot_count as number | undefined) || screenshotCount,
        graderResults:
          (meta.grader_results as Record<string, unknown> | undefined) || {},
      })
    }
    if (manifestTasks.length === 0) {
      throw new Error(`No completed tasks in ${runId}`)
    }
    let uploaded = 0
    await runPool(jobs, this.concurrency, async (job) => {
      await this.uploadFile(job)
      uploaded++
    })
    const manifest = await this.buildManifest(
      runDir,
      runId,
      agentConfig,
      dataset,
      manifestTasks,
    )
    await this.uploadBuffer(
      `runs/${runId}/manifest.json`,
      Buffer.from(JSON.stringify(manifest, null, 2)),
      'application/json',
    )
    await this.uploadBuffer(
      'viewer.html',
      await readFile(this.viewerPath),
      'text/html',
    )
    return {
      runId,
      // +2 accounts for manifest.json and viewer.html uploaded above.
      uploadedFiles: uploaded + 2,
      viewerUrl: `${this.config.cdnBaseUrl}/viewer.html?run=${runId}`,
      manifest,
    }
  }
  /** Reads <taskPath>/metadata.json; null when missing or unparsable. */
  private async readMetadata(
    taskPath: string,
  ): Promise<Record<string, unknown> | null> {
    try {
      return JSON.parse(
        await readFile(join(taskPath, 'metadata.json'), 'utf-8'),
      ) as Record<string, unknown>
    } catch {
      return null
    }
  }
  /**
   * Assembles the run manifest, folding in passRate/avgDurationMs from the
   * run's summary.json when it exists and parses.
   */
  private async buildManifest(
    runDir: string,
    runId: string,
    agentConfig: Record<string, unknown> | undefined,
    dataset: string | undefined,
    tasks: R2ManifestTask[],
  ): Promise<R2RunManifest> {
    let summaryData: Record<string, unknown> | undefined
    try {
      summaryData = JSON.parse(
        await readFile(join(runDir, 'summary.json'), 'utf-8'),
      ) as Record<string, unknown>
    } catch {}
    return {
      runId,
      uploadedAt: this.now().toISOString(),
      agentConfig,
      dataset,
      summary: summaryData
        ? {
            passRate: summaryData.passRate,
            avgDurationMs: summaryData.avgDurationMs,
          }
        : undefined,
      tasks,
    }
  }
  /** Uploads a single file from disk under the job's key. */
  private async uploadFile(job: UploadJob): Promise<void> {
    await this.uploadBuffer(
      job.key,
      await readFile(job.filePath),
      job.contentType,
    )
  }
  /** Low-level PutObject wrapper around the injected client. */
  private async uploadBuffer(
    key: string,
    body: Buffer,
    contentType: string,
  ): Promise<void> {
    await this.client.send(
      new PutObjectCommand({
        Bucket: this.config.bucket,
        Key: key,
        Body: body,
        ContentType: contentType,
      }),
    )
  }
}
export async function publishPathToR2(
inputDir: string,
): Promise<R2PublishPathResult> {
const config = loadR2ConfigFromEnv()
return new R2Publisher({ config }).publishPath(inputDir)
}

View File

@@ -1,362 +1 @@
import { mkdir, writeFile } from 'node:fs/promises'
import { basename, dirname, join, resolve } from 'node:path'
import {
dashboardState,
setActiveExecutor,
startDashboard,
stopDashboard,
} from '../dashboard/server'
import type { ErrorSource, EvalConfig, Task } from '../types'
import {
printValidationResult,
validateConfig,
} from '../utils/config-validator'
import { ParallelExecutor } from './parallel-executor'
import {
getTaskSourceDescription,
loadTasks,
TaskLoadError,
} from './task-loader'
import type {
BatchSummary,
RunEvalOptions,
TaskResult,
TaskResultSummary,
TaskSource,
} from './types'
import { getPrimaryGraderResult, isSuccessfulResult } from './types'
// ============================================================================
// Main Entry Point
// ============================================================================
/**
 * Runs a full evaluation from a config file: validate the config, resolve
 * paths, load tasks, start the dashboard, execute, then write and print a
 * summary. Throws on invalid config or task-loading failure.
 */
export async function runEval(options: RunEvalOptions): Promise<void> {
  // Step 1: Validate configuration
  const config = await loadAndValidateConfig(options.configPath)
  // Step 2: Resolve paths relative to config location
  const configDir = dirname(resolve(options.configPath))
  const resolvedPaths = resolvePaths(options, config, configDir)
  // Log configuration
  console.log('Eval Configuration:')
  console.log(` Config: ${options.configPath}`)
  console.log(` Dataset: ${resolvedPaths.dataPath}`)
  console.log(` Output: ${resolvedPaths.outputDir}`)
  console.log(` Workers: ${config.num_workers}`)
  console.log(` Agent: ${config.agent.type}`)
  console.log()
  // Step 3: Load tasks
  const taskSource = resolveTaskSource(options, resolvedPaths.dataPath)
  const { tasks } = await loadTasksWithLogging(taskSource)
  // Step 4: Setup — ensure the output directory exists before any writes.
  await mkdir(resolvedPaths.outputDir, { recursive: true })
  // Step 5: Start dashboard
  startDashboard({
    tasks,
    configName: options.configPath,
    agentType: config.agent.type,
    outputDir: resolvedPaths.outputDir,
  })
  // Step 6: Execute tasks (parallel or sequential based on num_workers)
  const results = await executeTasks(tasks, config, resolvedPaths.outputDir)
  // Step 7: Summary — persist to summary.json, then echo to the console.
  const summary = buildSummary(results)
  await saveSummary(summary, resolvedPaths.outputDir)
  printSummary(summary)
  console.log(`\nResults saved to: ${resolvedPaths.outputDir}`)
  stopDashboard()
}
// ============================================================================
// Configuration
// ============================================================================
async function loadAndValidateConfig(configPath: string) {
console.log('Validating configuration...')
const validationResult = await validateConfig(configPath)
printValidationResult(validationResult)
if (!validationResult.valid || !validationResult.config) {
throw new Error(
'Configuration validation failed. Fix the above errors and try again.',
)
}
return validationResult.config
}
interface ResolvedPaths {
dataPath: string
outputDir: string
}
/**
 * Resolves the dataset path and per-run output directory.
 * Relative dataset/output paths are resolved against the config file's
 * directory; the output directory layout is
 * `<resultsBase>/<configName>/<timestamp>/`, where the config name comes
 * from the config filename without its .json extension.
 */
function resolvePaths(
  options: RunEvalOptions,
  config: EvalConfig,
  configDir: string,
): ResolvedPaths {
  const fromConfigDir = (p: string): string =>
    p.startsWith('/') ? p : resolve(configDir, p)
  // An explicit --data path wins over the config's dataset.
  const dataPath = options.dataPath
    ? options.dataPath
    : fromConfigDir(config.dataset)
  const configName = options.configPath
    ? basename(resolve(options.configPath), '.json')
    : 'eval'
  // Default results base sits next to the configs directory.
  const resultsBase = config.output_dir
    ? fromConfigDir(config.output_dir)
    : resolve(configDir, '..', 'results')
  const outputDir = join(resultsBase, configName, formatTimestamp(new Date()))
  return { dataPath, outputDir }
}
/** Formats a Date as `YYYY-MM-DD-HHMM` in local time. */
function formatTimestamp(date: Date): string {
  const pad = (n: number): string => String(n).padStart(2, '0')
  const datePart = [
    date.getFullYear(),
    pad(date.getMonth() + 1),
    pad(date.getDate()),
  ].join('-')
  return `${datePart}-${pad(date.getHours())}${pad(date.getMinutes())}`
}
// ============================================================================
// Task Loading
// ============================================================================
/**
 * Chooses the task source: single-task mode when an ad-hoc query was
 * provided on the command line, otherwise the resolved dataset file.
 */
function resolveTaskSource(
  options: RunEvalOptions,
  dataPath: string,
): TaskSource {
  if (!options.query) {
    return { type: 'file', path: dataPath }
  }
  return { type: 'single', query: options.query, startUrl: options.startUrl }
}
/**
 * Loads tasks from the given source with console logging. Task-loading
 * failures are re-thrown as plain Errors prefixed with
 * "Failed to load tasks:".
 */
async function loadTasksWithLogging(
  source: TaskSource,
): Promise<{ tasks: Awaited<ReturnType<typeof loadTasks>>['tasks'] }> {
  console.log(`Loading tasks from ${getTaskSourceDescription(source)}...`)
  let result: Awaited<ReturnType<typeof loadTasks>>
  try {
    result = await loadTasks(source)
  } catch (error) {
    // TaskLoadError carries a clean message; other errors are stringified.
    const detail =
      error instanceof TaskLoadError ? error.message : String(error)
    throw new Error(`Failed to load tasks: ${detail}`)
  }
  console.log(`Loaded ${result.tasks.length} task(s)`)
  return { tasks: result.tasks }
}
// ============================================================================
// Task Execution
// ============================================================================
/**
 * Runs all tasks through a ParallelExecutor sized from the config, wiring
 * stream events to the dashboard and progress lines to the console.
 * The executor is registered globally so the dashboard's stop button works
 * for CLI runs, and deregistered when execution finishes or throws.
 */
async function executeTasks(
  tasks: Task[],
  config: EvalConfig,
  outputDir: string,
): Promise<TaskResult[]> {
  console.log(`\n${'='.repeat(60)}`)
  console.log('STARTING EVALUATION')
  console.log(`${'='.repeat(60)}\n`)
  // Missing/zero num_workers falls back to a single worker.
  const numWorkers = config.num_workers || 1
  console.log(`Running with ${numWorkers} worker(s)`)
  if (config.restart_server_per_task) {
    console.log(`Server restart per task: enabled`)
  }
  console.log()
  const executor = new ParallelExecutor({
    numWorkers,
    config,
    outputDir,
    restartServerPerTask: config.restart_server_per_task,
    onEvent: (taskId, event) =>
      dashboardState.broadcastStreamEvent(taskId, event),
  })
  // Register so dashboard stop button works for CLI runs too
  setActiveExecutor(executor)
  try {
    return await executor.execute(tasks, (completed, total, task, result) => {
      printTaskProgress(completed, total, task, result)
    })
  } finally {
    // Always deregister, even when execute() throws.
    setActiveExecutor(null)
  }
}
/**
 * Logs one progress line for a finished task, followed by any agent
 * errors and per-grader pass/fail outcomes.
 */
function printTaskProgress(
  completed: number,
  total: number,
  task: Task,
  result: TaskResult,
): void {
  let status: string
  if (result.status === 'completed') {
    status = 'DONE'
  } else if (result.status === 'timeout') {
    status = 'TIMEOUT'
  } else {
    status = 'FAILED'
  }
  const seconds = (result.durationMs / 1000).toFixed(1)
  const duration = result.durationMs > 0 ? ` (${seconds}s)` : ''
  console.log(`[${completed}/${total}] ${task.query_id}: ${status}${duration}`)
  if (result.status === 'failed') {
    console.log(` ERROR: ${result.error.message}`)
    return
  }
  if (!isSuccessfulResult(result)) {
    return
  }
  // Agent errors (e.g. LLM API failures) are surfaced even when the task
  // itself reports completed.
  for (const err of result.agentResult.metadata.errors ?? []) {
    console.log(` ERROR [${err.source}]: ${err.message}`)
  }
  for (const [name, gr] of Object.entries(result.graderResults)) {
    const icon = gr.pass ? 'PASS' : 'FAIL'
    console.log(` ${name}: ${icon}`)
  }
}
// ============================================================================
// Summary
// ============================================================================
/**
 * Aggregates per-task results into a batch summary: per-task rows,
 * status counts, a pass rate computed from each successful task's primary
 * grader, the average duration over non-failed tasks, and error/warning
 * tallies bucketed by error source.
 */
function buildSummary(results: TaskResult[]): BatchSummary {
  // Track errors by source
  const errorsBySource: Partial<Record<ErrorSource, number>> = {}
  let totalWarnings = 0
  const taskSummaries: TaskResultSummary[] = results.map((r) => {
    let errorCount = 0
    let warningCount = 0
    let errorSources: ErrorSource[] | undefined
    let failureReason: string | undefined
    if (isSuccessfulResult(r)) {
      // Count errors and warnings from agent metadata
      errorCount = r.agentResult.metadata.errors?.length ?? 0
      warningCount = r.agentResult.metadata.warnings?.length ?? 0
      totalWarnings += warningCount
      // Track error sources
      if (r.agentResult.metadata.errors?.length) {
        errorSources = r.agentResult.metadata.errors.map((e) => e.source)
        for (const err of r.agentResult.metadata.errors) {
          errorsBySource[err.source] = (errorsBySource[err.source] ?? 0) + 1
        }
      }
    } else {
      // Failed task — counts as exactly one error from its errorSource.
      errorCount = 1
      errorSources = [r.errorSource]
      failureReason = r.error.message
      errorsBySource[r.errorSource] = (errorsBySource[r.errorSource] ?? 0) + 1
    }
    return {
      queryId: r.task.query_id,
      status: r.status,
      durationMs: r.durationMs,
      // Grader results are trimmed to pass/score for the summary row.
      graderResults: isSuccessfulResult(r)
        ? Object.fromEntries(
            Object.entries(r.graderResults).map(([name, gr]) => [
              name,
              { pass: gr.pass, score: gr.score },
            ]),
          )
        : undefined,
      errorCount,
      warningCount,
      errorSources: errorSources?.length ? errorSources : undefined,
      failureReason,
    }
  })
  const completed = results.filter((r) => r.status === 'completed').length
  const timeout = results.filter((r) => r.status === 'timeout').length
  const failed = results.filter((r) => r.status === 'failed').length
  // Calculate pass rate using primary grader (fallback order); tasks with
  // no primary grader result are excluded from the denominator.
  let totalGraded = 0
  let totalPasses = 0
  for (const result of results) {
    if (isSuccessfulResult(result)) {
      const primary = getPrimaryGraderResult(result.graderResults)
      if (primary) {
        totalGraded++
        if (primary.pass) totalPasses++
      }
    }
  }
  const passRate = totalGraded > 0 ? totalPasses / totalGraded : 0
  // Calculate average duration for non-failed tasks
  const durations = results
    .filter((r) => r.status !== 'failed')
    .map((r) => r.durationMs)
  const avgDurationMs =
    durations.length > 0
      ? durations.reduce((a, b) => a + b, 0) / durations.length
      : 0
  return {
    total: results.length,
    completed,
    failed,
    timeout,
    passRate,
    avgDurationMs,
    errorsBySource,
    totalWarnings,
    results: taskSummaries,
  }
}
/** Writes the batch summary as pretty-printed JSON to <outputDir>/summary.json. */
async function saveSummary(
  summary: BatchSummary,
  outputDir: string,
): Promise<void> {
  const summaryPath = join(outputDir, 'summary.json')
  const payload = JSON.stringify(summary, null, 2)
  await writeFile(summaryPath, payload)
}
/** Prints the final evaluation summary banner to stdout. */
function printSummary(summary: BatchSummary): void {
  const rule = '='.repeat(60)
  const passPct = (summary.passRate * 100).toFixed(1)
  const avgSeconds = (summary.avgDurationMs / 1000).toFixed(1)
  console.log(rule)
  console.log('EVALUATION COMPLETE')
  console.log(rule)
  console.log(`Total: ${summary.total} tasks`)
  console.log(` Completed: ${summary.completed}`)
  console.log(` Timeout: ${summary.timeout}`)
  console.log(` Failed: ${summary.failed}`)
  console.log(` Pass Rate: ${passPct}%`)
  console.log(` Avg Duration: ${avgSeconds}s`)
}
export { runEval } from '../runs/eval-runner'

View File

@@ -1,266 +1,5 @@
/**
* Parallel Executor
*
* Each worker gets its own isolated BrowserOS stack:
* - BrowserOSAppManager (Chrome + Server on unique ports)
* - TaskExecutor (uses that worker's server URL)
*
* Port allocation: Worker N → CDP=base+N, Server=base+N, Extension=base+N
*/
import type { EvalConfig, Task } from '../types'
import { BrowserOSAppManager, type EvalPorts } from './browseros-app-manager'
import { createTaskExecutor } from './task-executor'
import type { TaskResult } from './types'
// ============================================================================
// Types
// ============================================================================
/** Configuration for the parallel task executor. */
export interface ParallelExecutorConfig {
  // Desired worker count; the executor clamps this to at least 1.
  numWorkers: number
  config: EvalConfig
  outputDir: string
  // When true, each worker restarts its Chrome+Server stack between tasks.
  restartServerPerTask?: boolean
  // Optional sink for per-task stream/state events (e.g. the dashboard).
  onEvent?: (taskId: string, event: Record<string, unknown>) => void
}
/** Invoked after each task finishes, with running completion counts. */
export type ProgressCallback = (
  completed: number,
  total: number,
  task: Task,
  result: TaskResult,
) => void
// ============================================================================
// Task Queue (thread-safe for single-threaded async — index is atomic)
// ============================================================================
/**
 * FIFO queue that hands tasks to workers. Single-threaded async execution
 * makes the index increment atomic, so no explicit locking is needed.
 */
class TaskQueue {
  private tasks: Task[]
  private index: number = 0
  private stopped: boolean = false
  constructor(tasks: Task[]) {
    // Defensive copy so later mutation of the caller's array has no effect.
    this.tasks = [...tasks]
  }
  /** Returns the next task, or null when exhausted or stopped. */
  next(): Task | null {
    const exhausted = this.index >= this.tasks.length
    if (this.stopped || exhausted) {
      return null
    }
    const task = this.tasks[this.index]
    this.index += 1
    return task
  }
  /** Prevents any further tasks from being handed out. */
  stop(): void {
    this.stopped = true
  }
}
// ============================================================================
// Parallel Executor
// ============================================================================
/**
 * Runs tasks across N workers, each with its own isolated BrowserOS stack
 * (Chrome + server). Workers pull from a shared TaskQueue until it is
 * exhausted or stopped; results are collected by query_id and returned in
 * the original task order.
 */
export class ParallelExecutor {
  private readonly numWorkers: number
  private readonly appManagers = new Map<number, BrowserOSAppManager>()
  private completedCount: number = 0
  // Results keyed by query_id; "lock" is nominal — single-threaded async.
  private readonly resultLock = new Map<string, TaskResult>()
  private queue: TaskQueue | null = null
  constructor(private readonly config: ParallelExecutorConfig) {
    // Guard against zero/negative worker counts.
    this.numWorkers = Math.max(1, config.numWorkers)
  }
  /** Stops the queue and kills every worker's app stack. */
  async stop(): Promise<void> {
    console.log('\nStopping eval run...')
    this.queue?.stop()
    const kills = [...this.appManagers.values()].map((m) => m.killApp())
    await Promise.allSettled(kills)
  }
  /**
   * Executes all tasks and resolves with results in the original task
   * order. Tasks with no recorded result (e.g. after an early stop) are
   * reported as failed with errorSource 'unknown'.
   */
  async execute(
    tasks: Task[],
    onProgress?: ProgressCallback,
  ): Promise<TaskResult[]> {
    if (tasks.length === 0) return []
    const cleanup = this.setupSignalHandlers()
    const loadExtensions = this.config.config.browseros.load_extensions ?? false
    // Patch NopeCHA API key before launching any workers
    const captchaConfig = this.config.config.captcha
    if (captchaConfig) {
      const apiKey = process.env[captchaConfig.api_key_env]
      if (apiKey) {
        BrowserOSAppManager.patchNopechaApiKey(apiKey)
      }
    }
    this.queue = new TaskQueue(tasks)
    const totalTasks = tasks.length
    try {
      const queue = this.queue
      // Launch N workers in parallel — each gets its own Chrome + Server
      const workers = Array.from({ length: this.numWorkers }, (_, i) =>
        this.runWorker(i, queue, totalTasks, loadExtensions, onProgress),
      )
      await Promise.all(workers)
      // Return results in original task order
      return tasks.map((task) => {
        const result = this.resultLock.get(task.query_id)
        if (!result) {
          return {
            status: 'failed' as const,
            task,
            error: new Error('Task result not found'),
            errorSource: 'unknown' as const,
            durationMs: 0,
          }
        }
        return result
      })
    } finally {
      // Always detach signal handlers, even when a worker throws.
      cleanup()
    }
  }
  /**
   * Worker loop: starts this worker's BrowserOS stack, then pulls tasks
   * from the queue until it is empty. The stack is always killed on exit.
   */
  private async runWorker(
    workerIndex: number,
    queue: TaskQueue,
    totalTasks: number,
    loadExtensions: boolean,
    onProgress?: ProgressCallback,
  ): Promise<void> {
    // Per-worker isolated ports
    const basePorts: EvalPorts = {
      cdp: this.config.config.browseros.base_cdp_port,
      server: this.config.config.browseros.base_server_port,
      extension: this.config.config.browseros.base_extension_port,
    }
    const headless = this.config.config.browseros.headless ?? false
    const appManager = new BrowserOSAppManager(
      workerIndex,
      basePorts,
      loadExtensions,
      headless,
    )
    this.appManagers.set(workerIndex, appManager)
    // Per-worker executor pointing to this worker's server
    const workerConfig: typeof this.config.config = {
      ...this.config.config,
      browseros: {
        ...this.config.config.browseros,
        server_url: appManager.getServerUrl(),
      },
    }
    const executor = createTaskExecutor(
      workerConfig,
      workerIndex,
      this.config.outputDir,
      this.config.onEvent,
    )
    try {
      // Always start Chrome+Server once for this worker
      console.log(`\n Worker ${workerIndex}: Starting BrowserOS stack...`)
      await appManager.restart()
      while (true) {
        const task = queue.next()
        if (!task) break
        const taskStartTime = Date.now()
        let result: TaskResult
        try {
          // Restart between tasks if configured
          if (this.config.restartServerPerTask) {
            console.log(`\n${'─'.repeat(60)}`)
            console.log(` Worker ${workerIndex}: Task: ${task.query_id}`)
            console.log(`${'─'.repeat(60)}`)
            await appManager.restart()
          }
          this.config.onEvent?.(task.query_id, {
            type: 'task-state',
            taskId: task.query_id,
            status: 'running',
          })
          result = await executor.execute(task)
          console.log(
            ` Worker ${workerIndex}: ${task.query_id}: ${result.status}`,
          )
        } catch (error) {
          // Executor threw: synthesize a failed result so the run continues.
          console.error(
            ` Worker ${workerIndex}: ${task.query_id}: FAILED - ${error instanceof Error ? error.message : String(error)}`,
          )
          result = {
            status: 'failed',
            task,
            error: error instanceof Error ? error : new Error(String(error)),
            errorSource: 'unknown',
            durationMs: Date.now() - taskStartTime,
          }
        }
        this.resultLock.set(task.query_id, result)
        this.completedCount++
        // Emit task completion to dashboard
        const stateEvent: Record<string, unknown> = {
          type: 'task-state',
          taskId: task.query_id,
          status: result.status,
          durationMs: result.durationMs,
        }
        if (result.status !== 'failed' && 'graderResults' in result) {
          stateEvent.graderResults = Object.fromEntries(
            Object.entries(result.graderResults).map(([name, gr]) => [
              name,
              {
                pass: gr.pass,
                score: gr.score,
                reasoning: gr.reasoning,
                details: gr.details,
              },
            ]),
          )
          // NOTE(review): total_steps is reported as "screenshotCount" —
          // confirm the dashboard actually expects step count here.
          stateEvent.screenshotCount =
            result.agentResult?.metadata?.total_steps ?? 0
        }
        this.config.onEvent?.(task.query_id, stateEvent)
        onProgress?.(this.completedCount, totalTasks, task, result)
        if (this.config.restartServerPerTask) {
          // Brief cool-down before the next restart cycle.
          await new Promise((resolve) => setTimeout(resolve, 2000))
        }
      }
    } finally {
      await appManager.killApp()
    }
  }
  /**
   * SIGINT/SIGTERM kills all Chrome + Server instances across all workers.
   * Returns a cleanup function that removes the listeners after execute() completes.
   */
  private setupSignalHandlers(): () => void {
    const onSignal = async () => {
      console.log('\nShutting down all workers...')
      this.queue?.stop()
      const kills = [...this.appManagers.values()].map((m) => m.killApp())
      await Promise.allSettled(kills)
      process.exit(0)
    }
    process.on('SIGINT', onSignal)
    process.on('SIGTERM', onSignal)
    return () => {
      process.off('SIGINT', onSignal)
      process.off('SIGTERM', onSignal)
    }
  }
}
export {
type ProgressCallback,
TaskWorkerPool as ParallelExecutor,
type TaskWorkerPoolConfig as ParallelExecutorConfig,
} from '../runs/task-worker-pool'

View File

@@ -1,316 +1,6 @@
import { join } from 'node:path'
import { createAgent } from '../agents'
import type { AgentContext, AgentResult } from '../agents/types'
import { CaptureContext } from '../capture/context'
import {
hasExistingGraderResults,
TrajectorySaver,
} from '../capture/trajectory-saver'
import { runGraders } from '../graders/registry'
import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
import { callMcpTool } from '../utils/mcp-client'
import { InfinityAppManager } from './infinity-app-manager'
import type { TaskResult } from './types'
// ============================================================================
// Errors
// ============================================================================
/**
 * Error raised while executing a single eval task. Carries the task, the
 * pipeline phase in which the failure occurred, and an optional underlying
 * cause; `errorSource` mirrors the phase so summaries can bucket failures.
 */
export class TaskExecutionError extends Error {
  public readonly errorSource: ErrorSource
  public readonly task: Task
  public readonly phase:
    | 'navigation'
    | 'agent_execution'
    | 'grading'
    | 'cleanup'
  public readonly cause?: Error
  constructor(
    message: string,
    task: Task,
    phase: 'navigation' | 'agent_execution' | 'grading' | 'cleanup',
    cause?: Error,
  ) {
    super(message)
    this.name = 'TaskExecutionError'
    this.task = task
    this.phase = phase
    this.cause = cause
    this.errorSource = phase as ErrorSource
  }
}
// ============================================================================
// Task Executor
// ============================================================================
/** Injected dependencies for TaskExecutor. */
export interface TaskExecutorDeps {
  // Optional sink for streaming per-task events (e.g. to the dashboard).
  onEvent?: (taskId: string, event: Record<string, unknown>) => void
}
export class TaskExecutor {
constructor(
private readonly config: EvalConfig,
private readonly workerIndex: number,
private readonly outputDir: string,
private readonly deps: TaskExecutorDeps,
) {}
/**
* Resolve the initial page ID via list_pages MCP call.
* Called once per task on a fresh browser — there's exactly one page.
*/
private async resolveInitialPageId(mcpUrl: string): Promise<number> {
try {
const result = await callMcpTool(mcpUrl, 'list_pages', {})
if (!result.isError) {
const textContent = result.content?.find(
(c: { type: string }) => c.type === 'text',
)
const match = textContent?.text?.match(/^\s*(\d+)\./m)
if (match) return Number.parseInt(match[1], 10)
}
} catch {
// Fall through to default
}
// Fresh browser always has page 1
return 1
}
async execute(task: Task): Promise<TaskResult> {
const startTime = Date.now()
const mcpUrl = `${this.config.browseros.server_url}/mcp`
// Check if task already has grader results (resume capability)
const existing = await hasExistingGraderResults(
this.outputDir,
task.query_id,
)
if (existing.exists && existing.metadata) {
console.log(` Skipping: already has grader results`)
return {
status:
existing.metadata.termination_reason === 'timeout'
? 'timeout'
: 'completed',
task,
agentResult: {
metadata: existing.metadata,
messages: [],
finalAnswer: existing.metadata.final_answer,
},
graderResults: existing.metadata.grader_results,
durationMs: existing.metadata.total_duration_ms,
}
}
// Resolve page ID once — fresh browser has exactly one page
const pageId = await this.resolveInitialPageId(mcpUrl)
// For Infinity tasks, start a fresh app server per task
let infinityManager: InfinityAppManager | null = null
let actualStartUrl = task.start_url
if (task.dataset === 'webarena-infinity') {
const appName = (task.metadata?.additional as Record<string, unknown>)
?.app_name as string
const appBasePort =
((task.metadata?.additional as Record<string, unknown>)
?.app_base_port as number) || 8000
if (appName && process.env.WEBARENA_INFINITY_DIR) {
infinityManager = new InfinityAppManager(this.workerIndex, appBasePort)
try {
actualStartUrl = await infinityManager.startApp(appName)
console.log(
` Infinity app "${appName}" started on port ${infinityManager.getPort()}`,
)
} catch (error) {
throw new TaskExecutionError(
`Failed to start Infinity app: ${error instanceof Error ? error.message : String(error)}`,
task,
'navigation',
error instanceof Error ? error : undefined,
)
}
}
}
try {
// Phase 1: Set viewport + navigate to start URL
try {
await callMcpTool(mcpUrl, 'evaluate_script', {
page: pageId,
expression: 'window.resizeTo(1440, 900)',
})
} catch (vpError) {
console.warn(
` Viewport resize failed: ${vpError instanceof Error ? vpError.message : String(vpError)}`,
)
}
if (actualStartUrl && actualStartUrl !== 'about:blank') {
try {
await callMcpTool(mcpUrl, 'navigate_page', {
url: actualStartUrl,
page: pageId,
})
} catch (error) {
throw new TaskExecutionError(
`Failed to navigate to start URL: ${error instanceof Error ? error.message : String(error)}`,
task,
'navigation',
error instanceof Error ? error : undefined,
)
}
}
// Phase 2: Execute agent
const agentResult = await this.executeAgent(task, pageId)
// Phase 3: Run graders
const graderResults = await this.runGraders(
task,
agentResult,
infinityManager?.getUrl(),
)
const status =
agentResult.metadata.termination_reason === 'timeout'
? 'timeout'
: 'completed'
return {
status,
task,
agentResult,
graderResults,
durationMs: Date.now() - startTime,
}
} catch (error) {
const errorSource: ErrorSource =
error instanceof TaskExecutionError ? error.errorSource : 'unknown'
return {
status: 'failed',
task,
error: error instanceof Error ? error : new Error(String(error)),
errorSource,
durationMs: Date.now() - startTime,
}
} finally {
// Navigate to about:blank to clean up
try {
await callMcpTool(mcpUrl, 'navigate_page', {
url: 'about:blank',
page: pageId,
})
} catch {
// Ignore cleanup errors
}
// Stop Infinity app server if running
if (infinityManager) {
await infinityManager.stop().catch(() => {})
}
}
}
/**
 * Runs the configured agent against one task inside a fresh capture context.
 *
 * @param task - Task to execute.
 * @param pageId - Browser page the agent starts on (resolved once per task).
 * @returns The agent's result (messages, metadata, final answer).
 * @throws TaskExecutionError tagged 'agent_execution' for any failure that is
 *   not already a TaskExecutionError.
 */
private async executeAgent(task: Task, pageId: number): Promise<AgentResult> {
  try {
    // Capture context streams events/screenshots into a per-task directory.
    const { capture, taskOutputDir } = await CaptureContext.create({
      serverUrl: this.config.browseros.server_url,
      outputDir: this.outputDir,
      taskId: task.query_id,
      initialPageId: pageId,
      onEvent: this.deps.onEvent,
    })
    const context: AgentContext = {
      config: this.config,
      task,
      workerIndex: this.workerIndex,
      initialPageId: pageId,
      outputDir: this.outputDir,
      taskOutputDir,
      capture,
    }
    const agent = createAgent(context)
    return await agent.execute()
  } catch (error) {
    // Preserve errors that are already phase-classified; wrap everything else.
    if (error instanceof TaskExecutionError) {
      throw error
    }
    throw new TaskExecutionError(
      `Agent execution failed: ${error instanceof Error ? error.message : String(error)}`,
      task,
      'agent_execution',
      error instanceof Error ? error : undefined,
    )
  }
}
/**
 * Runs the configured graders over the agent's output.
 *
 * Grader selection: config-level graders win; task-level graders are the
 * fallback. Returns an empty map when neither specifies any.
 *
 * Never throws: grading failures are logged and reported as a synthetic
 * `_error` grader result so the task itself still counts as completed.
 *
 * @param task - Task that was executed.
 * @param agentResult - Output of executeAgent().
 * @param infinityAppUrl - Base URL of the per-task Infinity app, if one ran.
 */
private async runGraders(
  task: Task,
  agentResult: AgentResult,
  infinityAppUrl?: string,
): Promise<Record<string, GraderResult>> {
  const configGraders = this.config.graders ?? []
  const taskGraders = task.graders ?? []
  const graderNames = configGraders.length > 0 ? configGraders : taskGraders
  if (graderNames.length === 0) {
    return {}
  }
  try {
    const graderResults = await runGraders(graderNames, {
      task: {
        query_id: task.query_id,
        query: task.query,
        dataset: task.dataset,
      },
      messages: agentResult.messages,
      // Fall back to step count when no explicit screenshot count is recorded.
      screenshotCount:
        agentResult.metadata.screenshot_count ??
        agentResult.metadata.total_steps,
      finalAnswer: agentResult.finalAnswer,
      expectedAnswer: (task.metadata?.additional as Record<string, unknown>)
        ?.answer as string | undefined,
      outputDir: join(this.outputDir, task.query_id),
      mcpUrl: `${this.config.browseros.server_url}/mcp`,
      infinityAppUrl,
    })
    // Best-effort persistence of grader results into the saved trajectory;
    // a save failure must not discard the in-memory results.
    try {
      const saver = new TrajectorySaver(this.outputDir, task.query_id)
      await saver.updateGraderResults(graderResults)
    } catch (saveError) {
      console.warn(
        `  Failed to persist grader results: ${saveError instanceof Error ? saveError.message : String(saveError)}`,
      )
    }
    return graderResults
  } catch (error) {
    console.warn(
      `  Grading failed: ${error instanceof Error ? error.message : String(error)}`,
    )
    return {
      _error: {
        score: 0,
        pass: false,
        reasoning: `Grading failed: ${error instanceof Error ? error.message : String(error)}`,
      },
    }
  }
}
}
// ============================================================================
// Factory
// ============================================================================
/** Convenience factory wrapping the TaskExecutor constructor's deps object. */
export function createTaskExecutor(
  config: EvalConfig,
  workerIndex: number,
  outputDir: string,
  onEvent?: (taskId: string, event: Record<string, unknown>) => void,
): TaskExecutor {
  const deps = { onEvent }
  return new TaskExecutor(config, workerIndex, outputDir, deps)
}
export {
createTaskRunPipeline as createTaskExecutor,
TaskExecutionError,
TaskRunPipeline as TaskExecutor,
type TaskRunPipelineDeps as TaskExecutorDeps,
} from '../runs/task-run-pipeline'

View File

@@ -8,12 +8,18 @@ import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
/** Options accepted by runEval(); mirrors the CLI surface. */
export interface RunEvalOptions {
  configPath: string // path to the eval config JSON (also anchors relative paths)
  config?: EvalConfig // pre-validated config; skips loading configPath when set
  dataPath?: string // explicit dataset path, overrides config.dataset
  query?: string // single-task mode: run this one query instead of a dataset
  startUrl?: string // optional start URL for single-task mode
  outputDir?: string // explicit output directory, overrides the derived layout
}
/** What runEval() returns: where artifacts landed plus the aggregate summary. */
export interface RunEvalResult {
  outputDir: string // directory containing summary.json and per-task artifacts
  summary: BatchSummary // aggregate counts, pass rate, and per-task summaries
}
// ============================================================================
// Task Loading
// ============================================================================

View File

@@ -0,0 +1,46 @@
import { join } from 'node:path'
/** Formats a Date as a UTC `YYYY-MM-DD-HHMM` string. */
function timestamp(date: Date): string {
  const pad = (n: number) => String(n).padStart(2, '0')
  const datePart = [
    date.getUTCFullYear(),
    pad(date.getUTCMonth() + 1),
    pad(date.getUTCDate()),
  ].join('-')
  const timePart = `${pad(date.getUTCHours())}${pad(date.getUTCMinutes())}`
  return `${datePart}-${timePart}`
}
/** Lowercases and collapses unsafe runs to '-', then trims edge dashes. */
function safeSegment(value: string): string {
  const lowered = value.toLowerCase()
  const dashed = lowered.replace(/[^a-z0-9._-]+/g, '-')
  return dashed.replace(/^-+/, '').replace(/-+$/, '')
}
/** Creates a path-safe run id from suite/config, variant, and time. */
export function createRunId(
  suiteId: string,
  variantId: string,
  date = new Date(),
): string {
  const segments = [safeSegment(suiteId), safeSegment(variantId), timestamp(date)]
  return segments.join('__')
}
/**
 * Computes the canonical on-disk layout for a run and, when a taskId is
 * given, for that task's artifact files. Task-scoped entries are undefined
 * when no taskId is provided.
 */
export function getRunPaths(baseDir: string, runId: string, taskId?: string) {
  const runDir = join(baseDir, 'runs', runId)
  const taskDir = taskId ? join(runDir, 'tasks', taskId) : undefined
  const inTask = (name: string) =>
    taskDir ? join(taskDir, name) : undefined
  return {
    runDir,
    runManifest: join(runDir, 'run.json'),
    summary: join(runDir, 'summary.json'),
    viewerManifest: join(runDir, 'viewer-manifest.json'),
    uploadManifest: join(runDir, 'upload-manifest.json'),
    taskDir,
    attempt: inTask('attempt.json'),
    trace: inTask('trace.jsonl'),
    messages: inTask('messages.jsonl'),
    grades: inTask('grades.json'),
    graderArtifacts: inTask('grader-artifacts'),
    screenshots: inTask('screenshots'),
  }
}

View File

@@ -0,0 +1,380 @@
import { mkdir, writeFile } from 'node:fs/promises'
import { basename, dirname, join, resolve } from 'node:path'
import {
dashboardState,
setActiveExecutor,
startDashboard,
stopDashboard,
} from '../dashboard/server'
import {
getTaskSourceDescription,
loadTasks,
TaskLoadError,
} from '../runner/task-loader'
import type {
BatchSummary,
RunEvalOptions,
RunEvalResult,
TaskResult,
TaskResultSummary,
TaskSource,
} from '../runner/types'
import { getPrimaryGraderResult, isSuccessfulResult } from '../runner/types'
import type { ErrorSource, EvalConfig, Task } from '../types'
import {
printValidationResult,
validateConfig,
} from '../utils/config-validator'
import { TaskWorkerPool } from './task-worker-pool'
// ============================================================================
// Main Entry Point
// ============================================================================
/**
 * Runs a full evaluation end-to-end: validates the config, resolves paths,
 * loads tasks, executes them through the worker pool, then persists and
 * prints the summary.
 *
 * Fix: the dashboard is now torn down in a `finally` block, so the dashboard
 * HTTP server cannot keep the process alive when task execution or summary
 * writing throws (previously stopDashboard() was skipped on any error).
 *
 * @param options - CLI-level options; see RunEvalOptions.
 * @returns The output directory and aggregate batch summary.
 * @throws When config validation or task loading fails.
 */
export async function runEval(options: RunEvalOptions): Promise<RunEvalResult> {
  // Step 1: Validate configuration
  const config =
    options.config ?? (await loadAndValidateConfig(options.configPath))
  // Step 2: Resolve paths relative to config location
  const configDir = options.configPath
    ? dirname(resolve(options.configPath))
    : process.cwd()
  const resolvedPaths = resolvePaths(options, config, configDir)
  // Log configuration
  console.log('Eval Configuration:')
  console.log(`  Config: ${options.configPath}`)
  console.log(`  Dataset: ${resolvedPaths.dataPath}`)
  console.log(`  Output: ${resolvedPaths.outputDir}`)
  console.log(`  Workers: ${config.num_workers}`)
  console.log(`  Agent: ${config.agent.type}`)
  console.log()
  // Step 3: Load tasks
  const taskSource = resolveTaskSource(options, resolvedPaths.dataPath)
  const { tasks } = await loadTasksWithLogging(taskSource)
  // Step 4: Setup
  await mkdir(resolvedPaths.outputDir, { recursive: true })
  // Step 5: Start dashboard
  startDashboard({
    tasks,
    configName: options.configPath,
    agentType: config.agent.type,
    outputDir: resolvedPaths.outputDir,
  })
  try {
    // Step 6: Execute tasks (parallel or sequential based on num_workers)
    const results = await executeTasks(tasks, config, resolvedPaths.outputDir)
    // Step 7: Summary
    const summary = buildSummary(results)
    await saveSummary(summary, resolvedPaths.outputDir)
    printSummary(summary)
    console.log(`\nResults saved to: ${resolvedPaths.outputDir}`)
    return { outputDir: resolvedPaths.outputDir, summary }
  } finally {
    // Always stop the dashboard, even when execution throws.
    stopDashboard()
  }
}
// ============================================================================
// Configuration
// ============================================================================
async function loadAndValidateConfig(configPath: string) {
console.log('Validating configuration...')
const validationResult = await validateConfig(configPath)
printValidationResult(validationResult)
if (!validationResult.valid || !validationResult.config) {
throw new Error(
'Configuration validation failed. Fix the above errors and try again.',
)
}
return validationResult.config
}
/** Filesystem locations derived from CLI options and the loaded config. */
interface ResolvedPaths {
  dataPath: string // dataset file tasks are loaded from
  outputDir: string // directory all run artifacts are written into
}
/** Returns the eval results directory for both flat and nested config layouts. */
function defaultResultsBase(configDir: string): string {
  const dir = resolve(configDir)
  // Flat layout (.../configs/) climbs one level; the nested layout
  // (.../configs/<group>/) climbs two so results always sit beside `configs`.
  // Any other directory falls back to a sibling `results` directory.
  const levelsUp =
    basename(dir) !== 'configs' && basename(dirname(dir)) === 'configs'
      ? ['..', '..']
      : ['..']
  return resolve(dir, ...levelsUp, 'results')
}
/**
 * Resolves the dataset path and output directory for a run.
 *
 * Relative paths from the config are resolved against the config file's own
 * directory. Fix: the previous `startsWith('/')` absolute-path check was
 * POSIX-only (it misses Windows drive paths) and redundant — `path.resolve`
 * already returns absolute inputs unchanged on every platform.
 *
 * Output layout: {resultsBase}/{config-name}/{timestamp}/ where config-name
 * is the config filename without ".json".
 */
function resolvePaths(
  options: RunEvalOptions,
  config: EvalConfig,
  configDir: string,
): ResolvedPaths {
  // Explicit CLI dataset path wins; otherwise resolve config.dataset against
  // the config directory (absolute paths pass through resolve untouched).
  const dataPath = options.dataPath
    ? options.dataPath
    : resolve(configDir, config.dataset)
  // Config name derived from config filename (e.g., "browseros-agent-weekly.json" → "browseros-agent-weekly")
  const configName = options.configPath
    ? basename(resolve(options.configPath), '.json')
    : 'eval'
  const timestamp = formatTimestamp(new Date())
  const resultsBase = config.output_dir
    ? resolve(configDir, config.output_dir)
    : defaultResultsBase(configDir)
  const outputDir =
    options.outputDir ?? join(resultsBase, configName, timestamp)
  return { dataPath, outputDir }
}
/** UTC timestamp in `YYYY-MM-DD-HHMM` form, used for output directory names. */
function formatTimestamp(date: Date): string {
  const two = (value: number) => String(value).padStart(2, '0')
  const ymd = `${date.getUTCFullYear()}-${two(date.getUTCMonth() + 1)}-${two(date.getUTCDate())}`
  return `${ymd}-${two(date.getUTCHours())}${two(date.getUTCMinutes())}`
}
// ============================================================================
// Task Loading
// ============================================================================
/** Chooses between single-query mode and file-backed task loading. */
function resolveTaskSource(
  options: RunEvalOptions,
  dataPath: string,
): TaskSource {
  const { query, startUrl } = options
  // A non-empty --query switches the run into single-task mode; otherwise
  // tasks come from the resolved dataset file.
  return query
    ? { type: 'single', query, startUrl }
    : { type: 'file', path: dataPath }
}
/** Loads tasks from the source with progress logging; normalizes failures. */
async function loadTasksWithLogging(
  source: TaskSource,
): Promise<{ tasks: Awaited<ReturnType<typeof loadTasks>>['tasks'] }> {
  console.log(`Loading tasks from ${getTaskSourceDescription(source)}...`)
  let result: Awaited<ReturnType<typeof loadTasks>>
  try {
    result = await loadTasks(source)
  } catch (error) {
    // TaskLoadError carries a human-readable message; anything else is
    // stringified as-is.
    const detail = error instanceof TaskLoadError ? error.message : error
    throw new Error(`Failed to load tasks: ${detail}`)
  }
  console.log(`Loaded ${result.tasks.length} task(s)`)
  return { tasks: result.tasks }
}
// ============================================================================
// Task Execution
// ============================================================================
/**
 * Executes all tasks through a TaskWorkerPool and streams events to the
 * dashboard. Registers the pool as the "active executor" for the duration so
 * the dashboard's stop button can cancel CLI runs, and always unregisters it.
 *
 * @returns One TaskResult per task, in the original task order.
 */
async function executeTasks(
  tasks: Task[],
  config: EvalConfig,
  outputDir: string,
): Promise<TaskResult[]> {
  console.log(`\n${'='.repeat(60)}`)
  console.log('STARTING EVALUATION')
  console.log(`${'='.repeat(60)}\n`)
  // num_workers of 0/undefined degrades to a single worker.
  const numWorkers = config.num_workers || 1
  console.log(`Running with ${numWorkers} worker(s)`)
  if (config.restart_server_per_task) {
    console.log(`Server restart per task: enabled`)
  }
  console.log()
  const executor = new TaskWorkerPool({
    numWorkers,
    config,
    outputDir,
    restartServerPerTask: config.restart_server_per_task,
    onEvent: (taskId, event) =>
      dashboardState.broadcastStreamEvent(taskId, event),
  })
  // Register so dashboard stop button works for CLI runs too
  setActiveExecutor(executor)
  try {
    return await executor.execute(tasks, (completed, total, task, result) => {
      printTaskProgress(completed, total, task, result)
    })
  } finally {
    setActiveExecutor(null)
  }
}
/** Logs one progress line per finished task plus any errors/grader outcomes. */
function printTaskProgress(
  completed: number,
  total: number,
  task: Task,
  result: TaskResult,
): void {
  const labels: Record<string, string> = {
    completed: 'DONE',
    timeout: 'TIMEOUT',
  }
  const status = labels[result.status] ?? 'FAILED'
  const duration =
    result.durationMs > 0 ? ` (${(result.durationMs / 1000).toFixed(1)}s)` : ''
  console.log(`[${completed}/${total}] ${task.query_id}: ${status}${duration}`)
  if (result.status === 'failed') {
    console.log(`  ERROR: ${result.error.message}`)
    return
  }
  if (!isSuccessfulResult(result)) {
    return
  }
  // Log agent errors (e.g., LLM API failures) even if task "completed"
  for (const err of result.agentResult.metadata.errors ?? []) {
    console.log(`  ERROR [${err.source}]: ${err.message}`)
  }
  for (const [name, gr] of Object.entries(result.graderResults)) {
    const icon = gr.pass ? 'PASS' : 'FAIL'
    console.log(`  ${name}: ${icon}`)
  }
}
// ============================================================================
// Summary
// ============================================================================
/**
 * Aggregates per-task results into a BatchSummary.
 *
 * Computes: per-task summaries (status, duration, grader pass/score, error
 * and warning counts), file-level error counts bucketed by source, the pass
 * rate over tasks that produced a primary grader result, and the average
 * duration of non-failed tasks.
 */
function buildSummary(results: TaskResult[]): BatchSummary {
  // Track errors by source
  const errorsBySource: Partial<Record<ErrorSource, number>> = {}
  let totalWarnings = 0
  const taskSummaries: TaskResultSummary[] = results.map((r) => {
    let errorCount = 0
    let warningCount = 0
    let errorSources: ErrorSource[] | undefined
    let failureReason: string | undefined
    if (isSuccessfulResult(r)) {
      // Count errors and warnings from agent metadata
      errorCount = r.agentResult.metadata.errors?.length ?? 0
      warningCount = r.agentResult.metadata.warnings?.length ?? 0
      totalWarnings += warningCount
      // Track error sources
      if (r.agentResult.metadata.errors?.length) {
        errorSources = r.agentResult.metadata.errors.map((e) => e.source)
        for (const err of r.agentResult.metadata.errors) {
          errorsBySource[err.source] = (errorsBySource[err.source] ?? 0) + 1
        }
      }
    } else {
      // Failed task
      errorCount = 1
      errorSources = [r.errorSource]
      failureReason = r.error.message
      errorsBySource[r.errorSource] = (errorsBySource[r.errorSource] ?? 0) + 1
    }
    return {
      queryId: r.task.query_id,
      status: r.status,
      durationMs: r.durationMs,
      // Grader results are slimmed to pass/score for the summary file.
      graderResults: isSuccessfulResult(r)
        ? Object.fromEntries(
            Object.entries(r.graderResults).map(([name, gr]) => [
              name,
              { pass: gr.pass, score: gr.score },
            ]),
          )
        : undefined,
      errorCount,
      warningCount,
      errorSources: errorSources?.length ? errorSources : undefined,
      failureReason,
    }
  })
  const completed = results.filter((r) => r.status === 'completed').length
  const timeout = results.filter((r) => r.status === 'timeout').length
  const failed = results.filter((r) => r.status === 'failed').length
  // Calculate pass rate using primary grader (fallback order)
  let totalGraded = 0
  let totalPasses = 0
  for (const result of results) {
    if (isSuccessfulResult(result)) {
      const primary = getPrimaryGraderResult(result.graderResults)
      if (primary) {
        totalGraded++
        if (primary.pass) totalPasses++
      }
    }
  }
  const passRate = totalGraded > 0 ? totalPasses / totalGraded : 0
  // Calculate average duration for non-failed tasks
  const durations = results
    .filter((r) => r.status !== 'failed')
    .map((r) => r.durationMs)
  const avgDurationMs =
    durations.length > 0
      ? durations.reduce((a, b) => a + b, 0) / durations.length
      : 0
  return {
    total: results.length,
    completed,
    failed,
    timeout,
    passRate,
    avgDurationMs,
    errorsBySource,
    totalWarnings,
    results: taskSummaries,
  }
}
/** Writes the pretty-printed summary.json into the run output directory. */
async function saveSummary(
  summary: BatchSummary,
  outputDir: string,
): Promise<void> {
  const target = join(outputDir, 'summary.json')
  const body = JSON.stringify(summary, null, 2)
  await writeFile(target, body)
}
/** Prints the closing human-readable summary banner to stdout. */
function printSummary(summary: BatchSummary): void {
  const divider = '='.repeat(60)
  const lines = [
    divider,
    'EVALUATION COMPLETE',
    divider,
    `Total: ${summary.total} tasks`,
    `  Completed: ${summary.completed}`,
    `  Timeout: ${summary.timeout}`,
    `  Failed: ${summary.failed}`,
    `  Pass Rate: ${(summary.passRate * 100).toFixed(1)}%`,
    `  Avg Duration: ${(summary.avgDurationMs / 1000).toFixed(1)}s`,
  ]
  for (const line of lines) {
    console.log(line)
  }
}

View File

@@ -0,0 +1,44 @@
import type { EvalVariant } from '../suites/resolve-variant'
/** Inputs needed to assemble a RunManifest for one eval run. */
export interface BuildRunManifestInput {
  runId: string // path-safe id (see createRunId)
  suiteId: string
  variant: EvalVariant // only its publicMetadata ends up in the manifest
  datasetPath: string
  datasetHash?: string // content hash for reproducibility, if computed
  graders: string[] // grader names that will be run
  gitSha?: string // commit of the eval harness, if known
  browserosVersion?: string
  startedAt?: string // ISO timestamp; defaults to "now" when omitted
}
/** Sanitized, serializable record of how a run was produced. */
export interface RunManifest {
  runId: string
  suiteId: string
  // Only the variant's public metadata is persisted (no secrets).
  variant: EvalVariant['publicMetadata']
  dataset: {
    path: string
    hash?: string // present only when a dataset hash was supplied
  }
  graders: string[]
  gitSha?: string
  browserosVersion?: string
  startedAt: string // ISO timestamp the run began
}
/** Builds the sanitized run manifest used for reproducibility. */
export function buildRunManifest(input: BuildRunManifestInput): RunManifest {
  const { runId, suiteId, variant, graders, gitSha, browserosVersion } = input
  const dataset = {
    path: input.datasetPath,
    hash: input.datasetHash,
  }
  return {
    runId,
    suiteId,
    // Only the variant's public metadata is recorded — never raw secrets.
    variant: variant.publicMetadata,
    dataset,
    graders,
    gitSha,
    browserosVersion,
    startedAt: input.startedAt ?? new Date().toISOString(),
  }
}

View File

@@ -0,0 +1,317 @@
import { join } from 'node:path'
import { createAgent } from '../agents'
import type { AgentContext, AgentResult } from '../agents/types'
import { CaptureContext } from '../capture/context'
import {
hasExistingGraderResults,
TrajectorySaver,
} from '../capture/trajectory-saver'
import { runGraders } from '../graders/registry'
import { InfinityAppManager } from '../runner/infinity-app-manager'
import type { TaskResult } from '../runner/types'
import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
import { callMcpTool } from '../utils/mcp-client'
// ============================================================================
// Errors
// ============================================================================
/**
 * Error raised while running a task through the pipeline. Records the task,
 * the pipeline phase that failed, and the underlying cause; `errorSource`
 * mirrors the phase using the shared ErrorSource vocabulary.
 */
export class TaskExecutionError extends Error {
  public readonly errorSource: ErrorSource
  public readonly task: Task
  public readonly phase:
    | 'navigation'
    | 'agent_execution'
    | 'grading'
    | 'cleanup'
  public readonly cause?: Error
  constructor(
    message: string,
    task: Task,
    phase: 'navigation' | 'agent_execution' | 'grading' | 'cleanup',
    cause?: Error,
  ) {
    super(message)
    this.name = 'TaskExecutionError'
    this.task = task
    this.phase = phase
    this.cause = cause
    this.errorSource = phase as ErrorSource
  }
}
// ============================================================================
// Task Executor
// ============================================================================
export interface TaskRunPipelineDeps {
onEvent?: (taskId: string, event: Record<string, unknown>) => void
}
/**
 * Runs a single task end-to-end against one worker's BrowserOS stack:
 * resume check → page resolution → (optional) Infinity app start →
 * viewport + navigation → agent execution → grading → cleanup.
 *
 * execute() never throws: failures are returned as a 'failed' TaskResult
 * with an attributed error source.
 */
export class TaskRunPipeline {
  constructor(
    private readonly config: EvalConfig,
    private readonly workerIndex: number,
    private readonly outputDir: string,
    private readonly deps: TaskRunPipelineDeps,
  ) {}
  /**
   * Resolve the initial page ID via list_pages MCP call.
   * Called once per task on a fresh browser — there's exactly one page.
   */
  private async resolveInitialPageId(mcpUrl: string): Promise<number> {
    try {
      const result = await callMcpTool(mcpUrl, 'list_pages', {})
      if (!result.isError) {
        const textContent = result.content?.find(
          (c: { type: string }) => c.type === 'text',
        )
        // list_pages returns a numbered text listing; grab the first index.
        const match = textContent?.text?.match(/^\s*(\d+)\./m)
        if (match) return Number.parseInt(match[1], 10)
      }
    } catch {
      // Fall through to default
    }
    // Fresh browser always has page 1
    return 1
  }
  /**
   * Runs the full pipeline for one task and returns its TaskResult.
   * Cleanup (blank navigation, Infinity shutdown) always runs via `finally`.
   */
  async execute(task: Task): Promise<TaskResult> {
    const startTime = Date.now()
    const mcpUrl = `${this.config.browseros.server_url}/mcp`
    // Check if task already has grader results (resume capability)
    const existing = await hasExistingGraderResults(
      this.outputDir,
      task.query_id,
    )
    if (existing.exists && existing.metadata) {
      console.log(`  Skipping: already has grader results`)
      return {
        status:
          existing.metadata.termination_reason === 'timeout'
            ? 'timeout'
            : 'completed',
        task,
        agentResult: {
          metadata: existing.metadata,
          // Messages are not reloaded on resume; only metadata is restored.
          messages: [],
          finalAnswer: existing.metadata.final_answer,
        },
        graderResults: existing.metadata.grader_results,
        durationMs: existing.metadata.total_duration_ms,
      }
    }
    // Resolve page ID once — fresh browser has exactly one page
    const pageId = await this.resolveInitialPageId(mcpUrl)
    // For Infinity tasks, start a fresh app server per task
    let infinityManager: InfinityAppManager | null = null
    let actualStartUrl = task.start_url
    if (task.dataset === 'webarena-infinity') {
      const appName = (task.metadata?.additional as Record<string, unknown>)
        ?.app_name as string
      const appBasePort =
        ((task.metadata?.additional as Record<string, unknown>)
          ?.app_base_port as number) || 8000
      // Only attempt app startup when both the app name and the Infinity
      // checkout directory are available.
      if (appName && process.env.WEBARENA_INFINITY_DIR) {
        infinityManager = new InfinityAppManager(this.workerIndex, appBasePort)
        try {
          actualStartUrl = await infinityManager.startApp(appName)
          console.log(
            `  Infinity app "${appName}" started on port ${infinityManager.getPort()}`,
          )
        } catch (error) {
          throw new TaskExecutionError(
            `Failed to start Infinity app: ${error instanceof Error ? error.message : String(error)}`,
            task,
            'navigation',
            error instanceof Error ? error : undefined,
          )
        }
      }
    }
    try {
      // Phase 1: Set viewport + navigate to start URL
      try {
        await callMcpTool(mcpUrl, 'evaluate_script', {
          page: pageId,
          expression: 'window.resizeTo(1440, 900)',
        })
      } catch (vpError) {
        // Viewport sizing is best-effort; a failure is not fatal.
        console.warn(
          `  Viewport resize failed: ${vpError instanceof Error ? vpError.message : String(vpError)}`,
        )
      }
      if (actualStartUrl && actualStartUrl !== 'about:blank') {
        try {
          await callMcpTool(mcpUrl, 'navigate_page', {
            url: actualStartUrl,
            page: pageId,
          })
        } catch (error) {
          throw new TaskExecutionError(
            `Failed to navigate to start URL: ${error instanceof Error ? error.message : String(error)}`,
            task,
            'navigation',
            error instanceof Error ? error : undefined,
          )
        }
      }
      // Phase 2: Execute agent
      const agentResult = await this.executeAgent(task, pageId)
      // Phase 3: Run graders
      const graderResults = await this.runGraders(
        task,
        agentResult,
        infinityManager?.getUrl(),
      )
      const status =
        agentResult.metadata.termination_reason === 'timeout'
          ? 'timeout'
          : 'completed'
      return {
        status,
        task,
        agentResult,
        graderResults,
        durationMs: Date.now() - startTime,
      }
    } catch (error) {
      const errorSource: ErrorSource =
        error instanceof TaskExecutionError ? error.errorSource : 'unknown'
      return {
        status: 'failed',
        task,
        error: error instanceof Error ? error : new Error(String(error)),
        errorSource,
        durationMs: Date.now() - startTime,
      }
    } finally {
      // Navigate to about:blank to clean up
      try {
        await callMcpTool(mcpUrl, 'navigate_page', {
          url: 'about:blank',
          page: pageId,
        })
      } catch {
        // Ignore cleanup errors
      }
      // Stop Infinity app server if running
      if (infinityManager) {
        await infinityManager.stop().catch(() => {})
      }
    }
  }
  /**
   * Runs the configured agent for one task inside a fresh capture context.
   * Wraps non-TaskExecutionError failures as 'agent_execution' errors.
   */
  private async executeAgent(task: Task, pageId: number): Promise<AgentResult> {
    try {
      const { capture, taskOutputDir } = await CaptureContext.create({
        serverUrl: this.config.browseros.server_url,
        outputDir: this.outputDir,
        taskId: task.query_id,
        initialPageId: pageId,
        onEvent: this.deps.onEvent,
      })
      const context: AgentContext = {
        config: this.config,
        task,
        workerIndex: this.workerIndex,
        initialPageId: pageId,
        outputDir: this.outputDir,
        taskOutputDir,
        capture,
      }
      const agent = createAgent(context)
      return await agent.execute()
    } catch (error) {
      if (error instanceof TaskExecutionError) {
        throw error
      }
      throw new TaskExecutionError(
        `Agent execution failed: ${error instanceof Error ? error.message : String(error)}`,
        task,
        'agent_execution',
        error instanceof Error ? error : undefined,
      )
    }
  }
  /**
   * Runs graders (config-level names win over task-level) and persists the
   * results best-effort. Never throws: grading failures become a synthetic
   * `_error` result so the task still counts as completed.
   */
  private async runGraders(
    task: Task,
    agentResult: AgentResult,
    infinityAppUrl?: string,
  ): Promise<Record<string, GraderResult>> {
    const configGraders = this.config.graders ?? []
    const taskGraders = task.graders ?? []
    const graderNames = configGraders.length > 0 ? configGraders : taskGraders
    if (graderNames.length === 0) {
      return {}
    }
    try {
      const graderResults = await runGraders(graderNames, {
        task: {
          query_id: task.query_id,
          query: task.query,
          dataset: task.dataset,
        },
        messages: agentResult.messages,
        // Fall back to step count when no explicit screenshot count exists.
        screenshotCount:
          agentResult.metadata.screenshot_count ??
          agentResult.metadata.total_steps,
        finalAnswer: agentResult.finalAnswer,
        expectedAnswer: (task.metadata?.additional as Record<string, unknown>)
          ?.answer as string | undefined,
        taskArtifactDir: join(this.outputDir, task.query_id),
        outputDir: join(this.outputDir, task.query_id),
        mcpUrl: `${this.config.browseros.server_url}/mcp`,
        infinityAppUrl,
      })
      // Best-effort persistence; a save failure must not lose results.
      try {
        const saver = new TrajectorySaver(this.outputDir, task.query_id)
        await saver.updateGraderResults(graderResults)
      } catch (saveError) {
        console.warn(
          `  Failed to persist grader results: ${saveError instanceof Error ? saveError.message : String(saveError)}`,
        )
      }
      return graderResults
    } catch (error) {
      console.warn(
        `  Grading failed: ${error instanceof Error ? error.message : String(error)}`,
      )
      return {
        _error: {
          score: 0,
          pass: false,
          reasoning: `Grading failed: ${error instanceof Error ? error.message : String(error)}`,
        },
      }
    }
  }
}
// ============================================================================
// Factory
// ============================================================================
export function createTaskRunPipeline(
config: EvalConfig,
workerIndex: number,
outputDir: string,
onEvent?: (taskId: string, event: Record<string, unknown>) => void,
): TaskRunPipeline {
return new TaskRunPipeline(config, workerIndex, outputDir, { onEvent })
}

View File

@@ -0,0 +1,269 @@
/**
* Task Worker Pool
*
* Each worker gets its own isolated BrowserOS stack:
* - BrowserOSAppManager (Chrome + Server on unique ports)
* - TaskRunPipeline (uses that worker's server URL)
*
* Port allocation: Worker N → CDP=base+N, Server=base+N, Extension=base+N
*/
import {
BrowserOSAppManager,
type EvalPorts,
} from '../runner/browseros-app-manager'
import type { TaskResult } from '../runner/types'
import type { EvalConfig, Task } from '../types'
import { createTaskRunPipeline } from './task-run-pipeline'
// ============================================================================
// Types
// ============================================================================
export interface TaskWorkerPoolConfig {
numWorkers: number
config: EvalConfig
outputDir: string
restartServerPerTask?: boolean
onEvent?: (taskId: string, event: Record<string, unknown>) => void
}
export type ProgressCallback = (
completed: number,
total: number,
task: Task,
result: TaskResult,
) => void
// ============================================================================
// Task Queue (thread-safe for single-threaded async — index is atomic)
// ============================================================================
/**
 * FIFO dispenser of pending tasks shared by all workers. Safe under
 * single-threaded async interleaving: next() reads and advances the cursor
 * synchronously, so no two workers ever receive the same task.
 */
class TaskQueue {
  private readonly pending: Task[]
  private cursor = 0
  private halted = false
  constructor(tasks: Task[]) {
    // Defensive copy so later mutation of the caller's array has no effect.
    this.pending = [...tasks]
  }
  /** Returns the next task, or null once drained or stopped. */
  next(): Task | null {
    if (this.halted) return null
    if (this.cursor >= this.pending.length) return null
    const task = this.pending[this.cursor]
    this.cursor += 1
    return task
  }
  /** Stops dispensing; every subsequent next() returns null. */
  stop(): void {
    this.halted = true
  }
}
// ============================================================================
// Task Worker Pool
// ============================================================================
/**
 * Pool of N workers, each owning an isolated BrowserOS stack (Chrome +
 * server on per-worker ports) and its own TaskRunPipeline. Workers pull
 * tasks from a shared queue; results are collected by task id and returned
 * in the original task order.
 */
export class TaskWorkerPool {
  private readonly numWorkers: number
  // One app manager per worker index, used for shutdown fan-out.
  private readonly appManagers = new Map<number, BrowserOSAppManager>()
  private completedCount: number = 0
  private readonly resultsByTaskId = new Map<string, TaskResult>()
  private queue: TaskQueue | null = null
  constructor(private readonly config: TaskWorkerPoolConfig) {
    // Guard against zero/negative worker counts.
    this.numWorkers = Math.max(1, config.numWorkers)
  }
  /** Stops the queue and kills every worker's Chrome+server stack. */
  async stop(): Promise<void> {
    console.log('\nStopping eval run...')
    this.queue?.stop()
    const kills = [...this.appManagers.values()].map((m) => m.killApp())
    await Promise.allSettled(kills)
  }
  /**
   * Executes all tasks across the pool. Optionally reports per-task progress.
   * Returns one TaskResult per input task, preserving input order; tasks with
   * no recorded result (e.g. after stop()) are reported as failed.
   */
  async execute(
    tasks: Task[],
    onProgress?: ProgressCallback,
  ): Promise<TaskResult[]> {
    if (tasks.length === 0) return []
    const cleanup = this.setupSignalHandlers()
    const loadExtensions = this.config.config.browseros.load_extensions ?? false
    // Patch NopeCHA API key before launching any workers
    const captchaConfig = this.config.config.captcha
    if (captchaConfig) {
      const apiKey = process.env[captchaConfig.api_key_env]
      if (apiKey) {
        BrowserOSAppManager.patchNopechaApiKey(apiKey)
      }
    }
    this.queue = new TaskQueue(tasks)
    const totalTasks = tasks.length
    try {
      const queue = this.queue
      // Launch N workers in parallel — each gets its own Chrome + Server
      const workers = Array.from({ length: this.numWorkers }, (_, i) =>
        this.runWorker(i, queue, totalTasks, loadExtensions, onProgress),
      )
      await Promise.all(workers)
      // Return results in original task order
      return tasks.map((task) => {
        const result = this.resultsByTaskId.get(task.query_id)
        if (!result) {
          return {
            status: 'failed' as const,
            task,
            error: new Error('Task result not found'),
            errorSource: 'unknown' as const,
            durationMs: 0,
          }
        }
        return result
      })
    } finally {
      cleanup()
    }
  }
  /**
   * One worker loop: start this worker's BrowserOS stack, then pull tasks
   * from the shared queue until drained, recording results and emitting
   * dashboard events. Always kills the stack on exit.
   */
  private async runWorker(
    workerIndex: number,
    queue: TaskQueue,
    totalTasks: number,
    loadExtensions: boolean,
    onProgress?: ProgressCallback,
  ): Promise<void> {
    // Per-worker isolated ports
    const basePorts: EvalPorts = {
      cdp: this.config.config.browseros.base_cdp_port,
      server: this.config.config.browseros.base_server_port,
      extension: this.config.config.browseros.base_extension_port,
    }
    const headless = this.config.config.browseros.headless ?? false
    const appManager = new BrowserOSAppManager(
      workerIndex,
      basePorts,
      loadExtensions,
      headless,
    )
    this.appManagers.set(workerIndex, appManager)
    // Per-worker executor pointing to this worker's server
    const workerConfig: typeof this.config.config = {
      ...this.config.config,
      browseros: {
        ...this.config.config.browseros,
        server_url: appManager.getServerUrl(),
      },
    }
    const executor = createTaskRunPipeline(
      workerConfig,
      workerIndex,
      this.config.outputDir,
      this.config.onEvent,
    )
    try {
      // Always start Chrome+Server once for this worker
      console.log(`\n  Worker ${workerIndex}: Starting BrowserOS stack...`)
      await appManager.restart()
      while (true) {
        const task = queue.next()
        if (!task) break
        const taskStartTime = Date.now()
        let result: TaskResult
        try {
          // Restart between tasks if configured
          if (this.config.restartServerPerTask) {
            console.log(`\n${'─'.repeat(60)}`)
            console.log(`  Worker ${workerIndex}: Task: ${task.query_id}`)
            console.log(`${'─'.repeat(60)}`)
            await appManager.restart()
          }
          this.config.onEvent?.(task.query_id, {
            type: 'task-state',
            taskId: task.query_id,
            status: 'running',
          })
          result = await executor.execute(task)
          console.log(
            `  Worker ${workerIndex}: ${task.query_id}: ${result.status}`,
          )
        } catch (error) {
          // executor.execute should not throw, but guard anyway so one task
          // cannot take down the whole worker loop.
          console.error(
            `  Worker ${workerIndex}: ${task.query_id}: FAILED - ${error instanceof Error ? error.message : String(error)}`,
          )
          result = {
            status: 'failed',
            task,
            error: error instanceof Error ? error : new Error(String(error)),
            errorSource: 'unknown',
            durationMs: Date.now() - taskStartTime,
          }
        }
        this.resultsByTaskId.set(task.query_id, result)
        this.completedCount++
        // Emit task completion to dashboard
        const stateEvent: Record<string, unknown> = {
          type: 'task-state',
          taskId: task.query_id,
          status: result.status,
          durationMs: result.durationMs,
        }
        if (result.status !== 'failed' && 'graderResults' in result) {
          stateEvent.graderResults = Object.fromEntries(
            Object.entries(result.graderResults).map(([name, gr]) => [
              name,
              {
                pass: gr.pass,
                score: gr.score,
                reasoning: gr.reasoning,
                details: gr.details,
              },
            ]),
          )
          stateEvent.screenshotCount =
            result.agentResult?.metadata?.total_steps ?? 0
        }
        this.config.onEvent?.(task.query_id, stateEvent)
        onProgress?.(this.completedCount, totalTasks, task, result)
        if (this.config.restartServerPerTask) {
          // Breather between restarts so ports settle before relaunch.
          await new Promise((resolve) => setTimeout(resolve, 2000))
        }
      }
    } finally {
      await appManager.killApp()
    }
  }
  /**
   * SIGINT/SIGTERM kills all Chrome + Server instances across all workers.
   * Returns a cleanup function that removes the listeners after execute() completes.
   */
  private setupSignalHandlers(): () => void {
    const onSignal = async () => {
      console.log('\nShutting down all workers...')
      this.queue?.stop()
      const kills = [...this.appManagers.values()].map((m) => m.killApp())
      await Promise.allSettled(kills)
      process.exit(0)
    }
    process.on('SIGINT', onSignal)
    process.on('SIGTERM', onSignal)
    return () => {
      process.off('SIGINT', onSignal)
      process.off('SIGTERM', onSignal)
    }
  }
}

View File

@@ -0,0 +1,101 @@
import { basename, resolve } from 'node:path'
import { type EvalConfig, EvalConfigSchema } from '../types'
import { type EvalVariant, resolveVariant } from './resolve-variant'
import type { EvalSuite } from './schema'
type Env = Record<string, string | undefined>
export interface AdaptEvalConfigOptions {
env?: Env
}
export interface AdaptedEvalConfig {
configPath: string
evalConfig: EvalConfig
suite: EvalSuite
variant: EvalVariant
}
/**
 * Picks the executor backend kind for an orchestrator-executor config.
 * Single-agent configs have no separate executor, so they yield undefined.
 */
function executorBackend(
  config: EvalConfig,
): 'tool-loop' | 'clado' | undefined {
  if (config.agent.type !== 'orchestrator-executor') return undefined
  if (config.agent.executor.provider === 'clado-action') return 'clado'
  return 'tool-loop'
}
/**
 * Extracts the model/provider fields from a legacy eval config.
 * Single-agent configs supply the model directly; orchestrated configs take
 * it from the orchestrator agent. When the configured apiKey is shaped like
 * an environment variable name (ALL_CAPS), it is also surfaced as apiKeyEnv
 * so the caller can resolve the real key from the environment.
 */
function variantSource(config: EvalConfig): {
  provider: string
  model: string
  apiKey?: string
  apiKeyEnv?: string
  baseUrl?: string
  supportsImages?: boolean
} {
  const isSingle = config.agent.type === 'single'
  const agent = isSingle ? config.agent : config.agent.orchestrator
  if (!agent.model) {
    throw new Error('Config agent model is required')
  }
  const looksLikeEnvName = /^[A-Z][A-Z0-9_]*$/.test(agent.apiKey ?? '')
  return {
    provider: agent.provider,
    model: agent.model,
    apiKey: agent.apiKey,
    apiKeyEnv: looksLikeEnvName ? agent.apiKey : undefined,
    baseUrl: agent.baseUrl,
    supportsImages: isSingle ? config.agent.supportsImages : undefined,
  }
}
/**
 * Adapts an existing eval config into the suite/variant model.
 *
 * Reads and schema-validates the config file, derives the suite id from the
 * file name, and resolves the agent API key: when the config's key names an
 * environment variable and that variable is set, the env value wins over the
 * literal config value.
 *
 * @param configPath - Path to the legacy eval config JSON file.
 * @param options - Optional env map override (defaults to process.env).
 * @throws If the file is unreadable, not JSON, or fails EvalConfigSchema.
 */
export async function adaptEvalConfigFile(
  configPath: string,
  options: AdaptEvalConfigOptions = {},
): Promise<AdaptedEvalConfig> {
  const absolute = resolve(configPath)
  const raw = JSON.parse(await Bun.file(absolute).text())
  const evalConfig = EvalConfigSchema.parse(raw)
  // Suite id is the config file name without its .json extension.
  const id = basename(absolute, '.json')
  const backend = executorBackend(evalConfig)
  const source = variantSource(evalConfig)
  const env = options.env ?? process.env
  // Env-var value takes precedence only when apiKeyEnv resolves to a value.
  const apiKey =
    source.apiKeyEnv && env[source.apiKeyEnv]
      ? env[source.apiKeyEnv]
      : source.apiKey
  return {
    configPath: absolute,
    evalConfig,
    suite: {
      id,
      dataset: evalConfig.dataset,
      // Single-agent configs map to the tool-loop suite agent; orchestrated
      // ones carry the detected executor backend (tool-loop by default).
      agent:
        evalConfig.agent.type === 'single'
          ? { type: 'tool-loop' }
          : { type: 'orchestrated', executorBackend: backend ?? 'tool-loop' },
      graders: evalConfig.graders ?? [],
      workers: evalConfig.num_workers,
      restartBrowserPerTask: evalConfig.restart_server_per_task,
      timeoutMs: evalConfig.timeout_ms,
      browseros: evalConfig.browseros,
      captcha: evalConfig.captcha,
    },
    variant: resolveVariant({
      variantId: id,
      provider: source.provider,
      model: source.model,
      apiKey,
      apiKeyEnv: source.apiKeyEnv,
      baseUrl: source.baseUrl,
      supportsImages: source.supportsImages,
      env,
    }),
  }
}

View File

@@ -0,0 +1,22 @@
import { dirname, isAbsolute, resolve } from 'node:path'
import { type EvalSuite, EvalSuiteSchema } from './schema'
export interface LoadedSuite {
suite: EvalSuite
suitePath: string
suiteDir: string
datasetPath: string
}
/**
 * Loads a suite file and resolves its dataset relative to the suite.
 *
 * @param suitePath - Path to the suite JSON file.
 * @returns The parsed suite plus absolute suite/dataset paths.
 * @throws If the file is unreadable, not JSON, or fails EvalSuiteSchema.
 */
export async function loadSuite(suitePath: string): Promise<LoadedSuite> {
  const absolute = resolve(suitePath)
  const raw = JSON.parse(await Bun.file(absolute).text())
  const suite = EvalSuiteSchema.parse(raw)
  const suiteDir = dirname(absolute)
  // isAbsolute() instead of startsWith('/') so Windows-style absolute
  // dataset paths (e.g. "C:\\data\\tasks.jsonl") are also respected.
  const datasetPath = isAbsolute(suite.dataset)
    ? suite.dataset
    : resolve(suiteDir, suite.dataset)
  return { suite, suitePath: absolute, suiteDir, datasetPath }
}

View File

@@ -0,0 +1,102 @@
type Env = Record<string, string | undefined>
export interface ResolveVariantOptions {
variantId?: string
provider?: string
model?: string
apiKey?: string
apiKeyEnv?: string
baseUrl?: string
supportsImages?: boolean
env?: Env
requireApiKey?: boolean
}
export interface EvalVariant {
id: string
agent: {
provider: string
model: string
apiKey?: string
baseUrl?: string
supportsImages?: boolean
}
publicMetadata: {
id: string
agent: {
provider: string
model: string
baseUrlHost?: string
supportsImages?: boolean
apiKeyConfigured: boolean
apiKeyEnv?: string
}
}
}
/** Parses a boolean-ish env value ('1'/'true'/'yes', case-insensitive); undefined stays undefined. */
function boolFromEnv(value: string | undefined): boolean | undefined {
  if (value === undefined) return undefined
  const normalized = value.toLowerCase()
  return normalized === '1' || normalized === 'true' || normalized === 'yes'
}
/** Extracts the host (hostname[:port]) from a URL string; undefined for empty or unparseable input. */
function hostFromUrl(value: string | undefined): string | undefined {
  if (!value) return undefined
  try {
    const parsed = new URL(value)
    return parsed.host
  } catch {
    return undefined
  }
}
/** True when the value is shaped like an env-var name: starts with A-Z, then A-Z/0-9/_. */
function isEnvName(value: string | undefined): boolean {
  if (!value) return false
  return /^[A-Z][A-Z0-9_]*$/.test(value)
}
/**
 * Resolves one model/backend variant from CLI values first, then env.
 *
 * Precedence per field: explicit option > EVAL_* environment variable >
 * hard-coded default (where one exists).
 *
 * @throws If no model is supplied via options or EVAL_AGENT_MODEL.
 * @throws If requireApiKey is set and no API key could be resolved.
 */
export function resolveVariant(
  options: ResolveVariantOptions = {},
): EvalVariant {
  const env = options.env ?? process.env
  const id = options.variantId ?? env.EVAL_VARIANT ?? 'default'
  const provider =
    options.provider ?? env.EVAL_AGENT_PROVIDER ?? 'openai-compatible'
  const model = options.model ?? env.EVAL_AGENT_MODEL
  const apiKey = options.apiKey ?? env.EVAL_AGENT_API_KEY
  // If the caller passed an explicit apiKey, the key did not come from the
  // environment, so apiKeyEnv stays unset (unless explicitly provided).
  const apiKeyEnv =
    options.apiKeyEnv ?? (options.apiKey ? undefined : 'EVAL_AGENT_API_KEY')
  const baseUrl = options.baseUrl ?? env.EVAL_AGENT_BASE_URL
  const supportsImages =
    options.supportsImages ?? boolFromEnv(env.EVAL_AGENT_SUPPORTS_IMAGES)
  if (!model) {
    throw new Error('EVAL_AGENT_MODEL is required')
  }
  if (options.requireApiKey && !apiKey) {
    throw new Error('EVAL_AGENT_API_KEY is required')
  }
  // Public metadata never carries the key itself; when the resolved key is
  // shaped like an env-var name, surface that name instead.
  const publicApiKeyEnv =
    options.apiKeyEnv ?? (isEnvName(apiKey) ? apiKey : apiKeyEnv)
  return {
    id,
    agent: {
      provider,
      model,
      apiKey,
      baseUrl,
      supportsImages,
    },
    publicMetadata: {
      id,
      agent: {
        provider,
        model,
        // Only the URL host is exposed publicly, never the full base URL.
        baseUrlHost: hostFromUrl(baseUrl),
        supportsImages,
        apiKeyConfigured: !!apiKey,
        apiKeyEnv: publicApiKeyEnv,
      },
    },
  }
}

View File

@@ -0,0 +1,41 @@
import { z } from 'zod'
import { EvalConfigSchema } from '../types'
// Agent selection for a suite. Orchestrated agent types additionally need an
// executor backend ('tool-loop' or 'clado'); superRefine enforces that.
export const SuiteAgentSchema = z
  .object({
    type: z.enum([
      'tool-loop',
      'single',
      'orchestrated',
      'orchestrator-executor',
    ]),
    executorBackend: z.enum(['tool-loop', 'clado']).optional(),
  })
  .superRefine((agent, ctx) => {
    if (
      (agent.type === 'orchestrated' ||
        agent.type === 'orchestrator-executor') &&
      !agent.executorBackend
    ) {
      ctx.addIssue({
        code: z.ZodIssueCode.custom,
        path: ['executorBackend'],
        message: 'executorBackend is required for orchestrated suites',
      })
    }
  })
// Top-level suite definition: dataset location (resolved relative to the
// suite file by loadSuite), agent selection, graders, and run limits.
export const EvalSuiteSchema = z.object({
  id: z.string().min(1),
  dataset: z.string().min(1),
  agent: SuiteAgentSchema,
  graders: z.array(z.string()).default([]),
  workers: z.number().int().min(1).max(20).default(1),
  restartBrowserPerTask: z.boolean().default(false),
  // Timeout, when provided, must be between 30s and 1h.
  timeoutMs: z.number().int().min(30_000).max(3_600_000).optional(),
  // Reuse shapes from the legacy eval config so both formats stay in sync.
  browseros: EvalConfigSchema.shape.browseros.optional(),
  captcha: EvalConfigSchema.shape.captcha.optional(),
})

export type EvalSuite = z.infer<typeof EvalSuiteSchema>
export type SuiteAgent = z.infer<typeof SuiteAgentSchema>

View File

@@ -0,0 +1,66 @@
import type { GraderResult } from '../types'
export interface ViewerManifestTaskInput {
queryId: string
query: string
startUrl?: string
status: string
durationMs: number
screenshotCount: number
graderResults: Record<string, GraderResult>
}
export interface ViewerManifestTask extends ViewerManifestTaskInput {
paths: {
attempt: string
metadata: string
messages: string
trace: string
grades: string
screenshots: string
graderArtifacts: string
}
}
export interface ViewerManifest {
runId: string
suiteId: string
variantId: string
uploadedAt?: string
summary: Record<string, unknown>
tasks: ViewerManifestTask[]
}
export interface BuildViewerManifestInput {
runId: string
suiteId: string
variantId: string
uploadedAt?: string
summary: Record<string, unknown>
tasks: ViewerManifestTaskInput[]
}
/** Builds the compact JSON index consumed by the static R2 viewer. */
export function buildViewerManifest(
input: BuildViewerManifestInput,
): ViewerManifest {
return {
runId: input.runId,
suiteId: input.suiteId,
variantId: input.variantId,
uploadedAt: input.uploadedAt,
summary: input.summary,
tasks: input.tasks.map((task) => ({
...task,
paths: {
attempt: `tasks/${task.queryId}/attempt.json`,
metadata: `tasks/${task.queryId}/metadata.json`,
messages: `tasks/${task.queryId}/messages.jsonl`,
trace: `tasks/${task.queryId}/trace.jsonl`,
grades: `tasks/${task.queryId}/grades.json`,
screenshots: `tasks/${task.queryId}/screenshots`,
graderArtifacts: `tasks/${task.queryId}/grader-artifacts`,
},
})),
}
}

View File

@@ -0,0 +1,69 @@
import { describe, expect, it } from 'bun:test'
import {
extractCladoThinking,
formatCladoHistory,
getCladoActionSignature,
parseCladoActions,
} from '../../src/agents/orchestrated/backends/clado/clado-actions'
import type { CladoActionResponse } from '../../src/agents/orchestrated/backends/clado/types'
// Unit tests for Clado action parsing: merging the structured response with
// raw <answer> blocks, rejecting malformed payloads, deduping repeats, and
// history/signature formatting.
describe('Clado action parsing', () => {
  it('merges the structured response with the first raw answer block', () => {
    const prediction: CladoActionResponse = {
      action: 'click',
      x: 800,
      raw_response:
        '<answer>{"action":"click","x":100,"y":200}</answer><answer>{"action":"press_key","key":"Enter"}</answer>',
    }
    // Structured fields (x: 800) take precedence over the first raw answer.
    expect(parseCladoActions(prediction)).toEqual([
      { action: 'click', x: 800, y: 200 },
      { action: 'press_key', key: 'Enter' },
    ])
  })

  it('returns no action for malformed or missing action payloads', () => {
    expect(
      parseCladoActions({
        action: null,
        raw_response: '<answer>{"x":100}</answer><answer>bad json</answer>',
      }),
    ).toEqual([])
  })

  it('deduplicates repeated raw actions after the primary action', () => {
    const prediction: CladoActionResponse = {
      raw_response: [
        '<answer>{"action":"click","x":100,"y":200}</answer>',
        '<answer>{"action":"click","x":100,"y":200}</answer>',
        '<answer>{"action":"type","text":"hello"}</answer>',
      ].join(''),
    }
    expect(parseCladoActions(prediction)).toEqual([
      { action: 'click', x: 100, y: 200 },
      { action: 'type', text: 'hello' },
    ])
  })

  it('extracts compact thinking text from raw model output', () => {
    expect(
      extractCladoThinking(
        '<thinking> first\\n thought </thinking><thinking>second thought</thinking>',
      ),
    ).toBe('first\\n thought second thought')
  })

  it('formats history and signatures using the existing trajectory shape', () => {
    const actions = [
      { action: 'click', x: 100, y: 200 },
      { action: 'type', text: "can't" },
      { action: 'scroll', direction: 'down', amount: 500 },
    ]
    expect(formatCladoHistory(actions)).toBe(
      "click(100, 200) -> type('can\\'t') -> scroll(down)",
    )
    expect(getCladoActionSignature(actions[0])).toBe('click:100:200')
  })
})

View File

@@ -0,0 +1,45 @@
import { describe, expect, it } from 'bun:test'
import {
prepareCladoToolArgs,
resolveCladoPoint,
} from '../../src/agents/orchestrated/backends/clado/clado-browser-driver'
// Unit tests for Clado browser driver helpers: normalized-coordinate mapping
// (0-1000 space into viewport pixels, with clamping) and tool-argument
// preparation for page-scoped tools.
describe('Clado browser driver helpers', () => {
  it('maps normalized coordinates into the current viewport', () => {
    expect(resolveCladoPoint({ width: 1440, height: 900 }, 500, 500)).toEqual({
      x: 720,
      y: 450,
    })
  })

  it('clamps normalized coordinates before mapping to pixels', () => {
    // Out-of-range inputs clamp to the viewport edges (max is height - 1).
    expect(resolveCladoPoint({ width: 1000, height: 800 }, -10, 1200)).toEqual({
      x: 0,
      y: 799,
    })
  })

  it('keeps the current evaluate_script argument conversion', () => {
    expect(
      prepareCladoToolArgs(
        'evaluate_script',
        { function: '() => window.location.href' },
        7,
      ),
    ).toEqual({
      expression: '(() => window.location.href)()',
      page: 7,
    })
  })

  it('normalizes click_at and adds page for page-scoped tools', () => {
    // dblClick: true becomes clickCount: 2 in the prepared args.
    expect(
      prepareCladoToolArgs('click_at', { x: 10, y: 20, dblClick: true }, 3),
    ).toEqual({
      x: 10,
      y: 20,
      clickCount: 2,
      page: 3,
    })
  })
})

View File

@@ -0,0 +1,45 @@
import { describe, expect, it } from 'bun:test'
import {
backendKindForProvider,
createExecutorBackend,
} from '../../src/agents/orchestrated/backends/create-executor-backend'
import type { ExecutorBackend } from '../../src/agents/orchestrated/executor-backend'
// Tests for the executor backend boundary: provider-to-backend-kind mapping
// and pass-through of execution results/step state through ExecutorBackend.
describe('executor backend boundary', () => {
  it('selects Clado only for the Clado action provider', () => {
    expect(backendKindForProvider('clado-action')).toBe('clado')
    expect(backendKindForProvider('openai-compatible')).toBe('tool-loop')
  })

  it('forwards execution and step state through the backend interface', async () => {
    const signal = new AbortController().signal
    // Fake backend asserts the instruction and abort signal arrive unchanged.
    const fakeBackend: ExecutorBackend = {
      kind: 'tool-loop',
      async execute(instruction, receivedSignal) {
        expect(instruction).toBe('Click checkout')
        expect(receivedSignal).toBe(signal)
        return {
          observation: 'Clicked checkout',
          status: 'done',
          url: 'https://example.test/checkout',
          actionsPerformed: 2,
          toolsUsed: ['browser_click_element'],
        }
      },
      async close() {},
      getTotalSteps() {
        return 2
      },
    }
    const backend = createExecutorBackend({
      backendKind: 'tool-loop',
      executor: fakeBackend,
    })
    const result = await backend.execute('Click checkout', signal)
    expect(result.observation).toBe('Clicked checkout')
    expect(result.actionsPerformed).toBe(2)
    expect(backend.getTotalSteps()).toBe(2)
  })
})

View File

@@ -0,0 +1,64 @@
import { describe, expect, it } from 'bun:test'
import { parseEvalCliArgs } from '../../src/cli/args'
// CLI argument parsing tests: the workflow-compatible suite command, suite
// variant/model options, the legacy -c shorthand, and required-option errors.
describe('parseEvalCliArgs', () => {
  it('parses the workflow-compatible suite config command', () => {
    expect(
      parseEvalCliArgs([
        'suite',
        '--config',
        'configs/legacy/browseros-agent-weekly.json',
        '--publish',
        'r2',
      ]),
    ).toEqual({
      command: 'suite',
      configPath: 'configs/legacy/browseros-agent-weekly.json',
      publishTarget: 'r2',
    })
  })

  it('parses suite variant and model options', () => {
    expect(
      parseEvalCliArgs([
        'suite',
        '--suite',
        'configs/suites/agisdk-daily-10.json',
        '--variant',
        'kimi-fireworks',
        '--provider',
        'openai-compatible',
        '--model',
        'accounts/fireworks/models/kimi-k2p5',
        '--base-url',
        'https://api.fireworks.ai/inference/v1',
      ]),
    ).toEqual({
      command: 'suite',
      suitePath: 'configs/suites/agisdk-daily-10.json',
      variantId: 'kimi-fireworks',
      provider: 'openai-compatible',
      model: 'accounts/fireworks/models/kimi-k2p5',
      baseUrl: 'https://api.fireworks.ai/inference/v1',
    })
  })

  it('keeps the old config shorthand as legacy config mode', () => {
    // Bare -c (no subcommand) maps to the 'legacy' command.
    expect(
      parseEvalCliArgs(['-c', 'configs/legacy/browseros-agent-weekly.json']),
    ).toEqual({
      command: 'legacy',
      configPath: 'configs/legacy/browseros-agent-weekly.json',
    })
  })

  it('rejects missing required command options with targeted errors', () => {
    expect(() => parseEvalCliArgs(['run'])).toThrow(
      'run requires --config or --suite',
    )
    expect(() => parseEvalCliArgs(['grade'])).toThrow('grade requires --run')
    expect(() =>
      parseEvalCliArgs(['publish', '--run', 'results/run-1']),
    ).toThrow('publish requires --target')
  })
})

View File

@@ -0,0 +1,115 @@
import { describe, expect, it } from 'bun:test'
import { mkdtemp, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { basename, join } from 'node:path'
import {
resolveSuiteCommand,
runSuiteCommand,
} from '../../src/cli/commands/suite'
import type { RunEvalOptions } from '../../src/runner/types'
// Writes a minimal suite JSON plus an empty dataset file into a fresh temp
// dir. The exact field values here are what the suite tests assert against.
async function writeTempSuite(): Promise<{ dir: string; suitePath: string }> {
  const dir = await mkdtemp(join(tmpdir(), 'eval-suite-cli-'))
  const suitePath = join(dir, 'agisdk-daily-10.json')
  await writeFile(
    suitePath,
    JSON.stringify(
      {
        id: 'agisdk-daily-10',
        dataset: 'tasks.jsonl',
        agent: { type: 'single' },
        graders: ['agisdk_state_diff'],
        workers: 2,
        restartBrowserPerTask: true,
        browseros: {
          server_url: 'http://127.0.0.1:9110',
          headless: true,
        },
      },
      null,
      2,
    ),
  )
  await writeFile(join(dir, 'tasks.jsonl'), '')
  return { dir, suitePath }
}
// Suite command tests: legacy-config resolution through the adapter, suite
// file + variant resolution into a runnable config, and dispatch through the
// injected runEval dependency (no real evals are executed).
describe('suite command', () => {
  it('resolves an existing config through the config adapter', async () => {
    const resolved = await resolveSuiteCommand({
      configPath: 'apps/eval/configs/legacy/browseros-agent-weekly.json',
      env: {},
    })
    expect(resolved.kind).toBe('config')
    expect(resolved.suite.id).toBe('browseros-agent-weekly')
    expect(resolved.evalConfig.dataset).toBe(
      '../../data/webbench-2of4-50.jsonl',
    )
    expect(resolved.variant.publicMetadata.agent.apiKeyConfigured).toBe(true)
  })

  it('resolves a suite file and variant into a runnable eval config', async () => {
    const { dir, suitePath } = await writeTempSuite()
    const resolved = await resolveSuiteCommand({
      suitePath,
      variantId: 'kimi-fireworks',
      provider: 'openai-compatible',
      model: 'accounts/fireworks/models/kimi-k2p5',
      apiKey: 'test-key',
      baseUrl: 'https://api.fireworks.ai/inference/v1',
      env: {},
    })
    expect(resolved.kind).toBe('suite')
    expect(resolved.suite.id).toBe('agisdk-daily-10')
    expect(resolved.datasetPath).toBe(join(dir, 'tasks.jsonl'))
    expect(resolved.evalConfig.agent).toMatchObject({
      type: 'single',
      provider: 'openai-compatible',
      model: 'accounts/fireworks/models/kimi-k2p5',
      apiKey: 'test-key',
      baseUrl: 'https://api.fireworks.ai/inference/v1',
    })
    expect(resolved.evalConfig.num_workers).toBe(2)
  })

  it('runs config and suite commands through the runner dependency', async () => {
    // Capture every runEval invocation instead of actually running evals.
    const calls: RunEvalOptions[] = []
    await runSuiteCommand(
      {
        configPath: 'apps/eval/configs/legacy/browseros-agent-weekly.json',
        env: {},
      },
      {
        runEval: async (options) => {
          calls.push(options)
        },
      },
    )
    const { suitePath } = await writeTempSuite()
    await runSuiteCommand(
      {
        suitePath,
        model: 'moonshotai/kimi-k2.5',
        provider: 'openai-compatible',
        env: {},
      },
      {
        runEval: async (options) => {
          calls.push(options)
        },
      },
    )
    expect(calls).toHaveLength(2)
    expect(calls[0].configPath.endsWith('browseros-agent-weekly.json')).toBe(
      true,
    )
    expect(basename(calls[1].configPath)).toBe('agisdk-daily-10.json')
    expect(calls[1].config).toBeDefined()
    expect(calls[1].dataPath?.endsWith('tasks.jsonl')).toBe(true)
  })
})

View File

@@ -0,0 +1,79 @@
import { describe, expect, it } from 'bun:test'
import { mkdtemp, readFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { AgisdkStateDiffGrader } from '../../src/graders/benchmark/agisdk-state-diff'
import type { GraderInput } from '../../src/grading/types'
// Verifies AgisdkStateDiffGrader persists its intermediate artifacts
// (finish state, evaluator output, stderr) under grader-artifacts/.
// Network and python steps are stubbed out on the instance.
describe('AgisdkStateDiffGrader artifacts', () => {
  it('writes finish state and evaluator artifacts', async () => {
    const dir = await mkdtemp(join(tmpdir(), 'agisdk-artifacts-'))
    const grader = new AgisdkStateDiffGrader()
    // Structural cast exposes the grader's private methods for stubbing.
    const internals = grader as unknown as {
      fetchFinishState(
        origin: string,
        endpoint: string,
      ): Promise<Record<string, unknown>>
      runPythonEvaluator(input: unknown): Promise<{
        output: {
          reward: number
          pass: boolean
          message: string
          per_criterion: unknown[]
        }
        stderr: string
      }>
    }
    internals.fetchFinishState = async () => ({ cart: [{ name: 'Soup' }] })
    internals.runPythonEvaluator = async () => ({
      output: {
        reward: 0,
        pass: false,
        message: 'Missing entree',
        per_criterion: [{ passed: false, detail: 'entree missing' }],
      },
      stderr: 'criterion log',
    })
    const input: GraderInput = {
      task: {
        query_id: 'agisdk-dashdish-10',
        query: 'Order dinner',
        dataset: 'agisdk',
      },
      messages: [],
      screenshotCount: 0,
      finalAnswer: 'done',
      taskArtifactDir: dir,
      outputDir: dir,
      mcpUrl: 'http://127.0.0.1:9110/mcp',
    }
    const result = await grader.grade(input)
    expect(result.pass).toBe(false)
    expect(
      JSON.parse(
        await readFile(
          join(dir, 'grader-artifacts/agisdk_state_diff/finish-state.json'),
          'utf-8',
        ),
      ),
    ).toEqual({ cart: [{ name: 'Soup' }] })
    expect(
      JSON.parse(
        await readFile(
          join(dir, 'grader-artifacts/agisdk_state_diff/evaluator-output.json'),
          'utf-8',
        ),
      ),
    ).toMatchObject({ message: 'Missing entree' })
    expect(
      await readFile(
        join(dir, 'grader-artifacts/agisdk_state_diff/stderr.txt'),
        'utf-8',
      ),
    ).toContain('criterion log')
  })
})

View File

@@ -0,0 +1,56 @@
import { describe, expect, it } from 'bun:test'
import { createGrader } from '../../src/grading/grader-registry'
import { runConfiguredGraders } from '../../src/grading/grader-runner'
import type { Grader, GraderInput } from '../../src/grading/types'
// Shared minimal grader input used by both suites below.
const fixtureInput: GraderInput = {
  task: {
    query_id: 'task-1',
    query: 'Do the thing',
    dataset: 'fixture',
  },
  messages: [],
  screenshotCount: 0,
  finalAnswer: null,
  taskArtifactDir: '/tmp/task-1',
  outputDir: '/tmp/task-1',
}

// Registry tests: every known grader name resolves to an instance whose
// .name round-trips.
describe('grader registry', () => {
  it('creates all current graders behind the shared interface', () => {
    expect(createGrader('agisdk_state_diff')?.name).toBe('agisdk_state_diff')
    expect(createGrader('infinity_state')?.name).toBe('infinity_state')
    expect(createGrader('performance_grader')?.name).toBe('performance_grader')
  })
})

describe('runConfiguredGraders', () => {
  it('records one grader failure without aborting other graders', async () => {
    const passing: Grader = {
      name: 'passing',
      async grade() {
        return { score: 1, pass: true, reasoning: 'ok' }
      },
    }
    // A throwing grader must be recorded as a failed result, not crash the run.
    const failing: Grader = {
      name: 'failing',
      async grade() {
        throw new Error('grader exploded')
      },
    }
    const results = await runConfiguredGraders(
      ['failing', 'passing'],
      fixtureInput,
      {
        createGrader(name) {
          return name === 'failing' ? failing : passing
        },
      },
    )
    expect(results.failing.pass).toBe(false)
    expect(results.failing.reasoning).toContain('grader exploded')
    expect(results.passing.pass).toBe(true)
  })
})

View File

@@ -0,0 +1,67 @@
import { describe, expect, it } from 'bun:test'
import { mkdtemp, readFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { InfinityStateGrader } from '../../src/graders/benchmark/infinity-state'
import type { GraderInput } from '../../src/grading/types'
// Verifies InfinityStateGrader persists verifier config and evaluator output
// artifacts. WEBARENA_INFINITY_DIR is temporarily overridden and restored.
describe('InfinityStateGrader artifacts', () => {
  it('writes verifier and evaluator artifacts', async () => {
    const dir = await mkdtemp(join(tmpdir(), 'infinity-artifacts-'))
    const oldInfinityDir = process.env.WEBARENA_INFINITY_DIR
    process.env.WEBARENA_INFINITY_DIR = '/tmp/webarena-infinity'
    try {
      const grader = new InfinityStateGrader()
      // Structural cast exposes the private python step for stubbing.
      const internals = grader as unknown as {
        runPythonEvaluator(input: unknown): Promise<{
          output: { pass: boolean; reward: number; message: string }
          stderr: string
        }>
      }
      internals.runPythonEvaluator = async () => ({
        output: { pass: true, reward: 1, message: 'verified' },
        stderr: 'verifier log',
      })
      const input: GraderInput = {
        task: {
          query_id: 'infinity-elation-prescriptions-task_h69',
          query: 'Verify the app state',
          dataset: 'webarena-infinity',
        },
        messages: [],
        screenshotCount: 0,
        finalAnswer: null,
        taskArtifactDir: dir,
        outputDir: dir,
        infinityAppUrl: 'http://127.0.0.1:8123',
      }
      const result = await grader.grade(input)
      expect(result.pass).toBe(true)
      expect(
        JSON.parse(
          await readFile(
            join(dir, 'grader-artifacts/infinity_state/verifier.json'),
            'utf-8',
          ),
        ),
      ).toMatchObject({
        appName: 'elation-prescriptions',
        appServerUrl: 'http://127.0.0.1:8123',
      })
      expect(
        JSON.parse(
          await readFile(
            join(dir, 'grader-artifacts/infinity_state/evaluator-output.json'),
            'utf-8',
          ),
        ),
      ).toMatchObject({ message: 'verified' })
    } finally {
      // Assigning undefined to process.env coerces it to the string
      // "undefined", leaking state into later tests — delete instead.
      if (oldInfinityDir === undefined) {
        delete process.env.WEBARENA_INFINITY_DIR
      } else {
        process.env.WEBARENA_INFINITY_DIR = oldInfinityDir
      }
    }
  })
})

View File

@@ -0,0 +1,92 @@
import { describe, expect, it } from 'bun:test'
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { PerformanceGrader } from '../../src/graders/performance/performance-grader'
import type { GraderInput } from '../../src/grading/types'
// Verifies PerformanceGrader writes metrics, axes, and raw agent output
// artifacts under grader-artifacts/. The LLM call is stubbed on the instance.
describe('PerformanceGrader artifacts', () => {
  it('writes metrics, agent output, and axes artifacts', async () => {
    const dir = await mkdtemp(join(tmpdir(), 'performance-artifacts-'))
    await mkdir(join(dir, 'screenshots'))
    await writeFile(
      join(dir, 'metadata.json'),
      JSON.stringify({ termination_reason: 'completed' }),
    )
    const grader = new PerformanceGrader(undefined, undefined, 'claude-test')
    // Structural cast exposes the private runAgent method for stubbing.
    const internals = grader as unknown as {
      runAgent(
        systemPrompt: string,
        userPrompt: string,
        outputDir: string,
      ): Promise<{
        type: 'result'
        subtype: string
        result: string
        total_cost_usd: number
        num_turns: number
        structured_output: unknown
      }>
    }
    internals.runAgent = async () => ({
      type: 'result',
      subtype: 'success',
      result: 'ok',
      total_cost_usd: 0.01,
      num_turns: 2,
      structured_output: {
        axes: [{ axis: 'task_completion', score: 90, reasoning: 'completed' }],
      },
    })
    const input: GraderInput = {
      task: {
        query_id: 'task-1',
        query: 'Find the answer',
        dataset: 'fixture',
      },
      messages: [
        {
          type: 'tool-input-available',
          timestamp: '2026-04-29T00:00:00.000Z',
          toolCallId: 'call-1',
          toolName: 'browser_get_page_content',
          input: {},
        },
      ],
      screenshotCount: 1,
      finalAnswer: 'answer',
      taskArtifactDir: dir,
      outputDir: dir,
    }
    const result = await grader.grade(input)
    expect(result.details?.model).toBe('claude-test')
    expect(
      JSON.parse(
        await readFile(
          join(dir, 'grader-artifacts/performance_grader/metrics.json'),
          'utf-8',
        ),
      ),
    ).toMatchObject({ totalToolCalls: 1 })
    expect(
      JSON.parse(
        await readFile(
          join(dir, 'grader-artifacts/performance_grader/axes.json'),
          'utf-8',
        ),
      ),
    ).toMatchObject({ task_completion: { score: 90 } })
    expect(
      JSON.parse(
        await readFile(
          join(dir, 'grader-artifacts/performance_grader/agent-output.json'),
          'utf-8',
        ),
      ),
    ).toMatchObject({ subtype: 'success' })
  })
})

View File

@@ -0,0 +1,66 @@
import { describe, expect, it } from 'bun:test'
import { mkdtemp, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { runPythonJsonEvaluator } from '../../src/grading/python-evaluator'
/** Writes a throwaway python script into a fresh temp dir and returns its path. */
async function writeScript(source: string): Promise<string> {
  const dir = await mkdtemp(join(tmpdir(), 'eval-python-'))
  const scriptPath = join(dir, 'script.py')
  await writeFile(scriptPath, source)
  return scriptPath
}
// Tests the shared python-evaluator bridge: JSON in on stdin, JSON out on
// stdout, stderr capture, non-zero exit propagation, and timeout enforcement.
describe('runPythonJsonEvaluator', () => {
  it('sends JSON on stdin, captures stderr, and parses stdout JSON', async () => {
    const script = await writeScript(`
import json, sys
data = json.loads(sys.stdin.read())
print("warning", file=sys.stderr)
print(json.dumps({"ok": True, "value": data["value"]}))
`)
    const result = await runPythonJsonEvaluator<{ ok: boolean; value: number }>(
      {
        scriptPath: script,
        input: { value: 42 },
        timeoutMs: 5_000,
      },
    )
    expect(result.output).toEqual({ ok: true, value: 42 })
    expect(result.stderr).toContain('warning')
    expect(result.exitCode).toBe(0)
  })

  it('reports non-zero exits with stderr', async () => {
    const script = await writeScript(`
import sys
print("bad verifier", file=sys.stderr)
sys.exit(3)
`)
    await expect(
      runPythonJsonEvaluator({
        scriptPath: script,
        input: {},
        timeoutMs: 5_000,
      }),
    ).rejects.toThrow('bad verifier')
  })

  it('enforces timeouts', async () => {
    // The 5s sleep must be cut off by the 50ms timeout.
    const script = await writeScript(`
import time
time.sleep(5)
`)
    await expect(
      runPythonJsonEvaluator({
        scriptPath: script,
        input: {},
        timeoutMs: 50,
      }),
    ).rejects.toThrow('timed out')
  })
})

View File

@@ -0,0 +1,21 @@
import { describe, expect, it } from 'bun:test'
import { stat } from 'node:fs/promises'
import { resolve } from 'node:path'
/** Returns true when the given path exists on disk (stat succeeds). */
async function exists(path: string): Promise<boolean> {
  const stats = await stat(path).catch(() => null)
  return stats !== null
}
// Guards the repo layout: runtime python evaluator scripts live next to the
// grader implementations (src/graders/python), not in the old scripts/ dir.
describe('grader python script layout', () => {
  it('keeps runtime evaluator scripts next to the grader implementation', async () => {
    const pythonDir = resolve(import.meta.dir, '../../src/graders/python')
    const scriptsDir = resolve(import.meta.dir, '../../scripts')
    expect(await exists(resolve(pythonDir, 'agisdk-evaluate.py'))).toBe(true)
    expect(await exists(resolve(pythonDir, 'infinity-evaluate.py'))).toBe(true)
    expect(await exists(resolve(scriptsDir, 'agisdk-evaluate.py'))).toBe(false)
    expect(await exists(resolve(scriptsDir, 'infinity-evaluate.py'))).toBe(
      false,
    )
  })
})

View File

@@ -0,0 +1,193 @@
import { describe, expect, it } from 'bun:test'
import { mkdir, mkdtemp, readFile, rename, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import {
contentTypeForPath,
R2Publisher,
} from '../../src/publishing/r2-publisher'
/**
 * In-memory stand-in for the S3/R2 client: records every put-style command
 * (those carrying a Body) and answers existence lookups from a preset key
 * set, throwing 'not found' for unknown keys.
 */
class FakeR2Client {
  readonly puts: Record<string, unknown>[] = []
  readonly existing = new Set<string>()

  async send(command: { input: Record<string, unknown> }): Promise<unknown> {
    const { input } = command
    // Put commands carry a Body: record and acknowledge them.
    if ('Body' in input) {
      this.puts.push(input)
      return {}
    }
    // Otherwise treat it as an existence check against `existing`.
    const key = input.Key as string
    if (!this.existing.has(key)) {
      throw new Error('not found')
    }
    return {}
  }
}
// Writes a complete fake run directory (one task with metadata, messages,
// grades, a screenshot, plus a run-level summary.json) and returns its path
// and the derived "<configName>-<timestamp>" run id.
async function writeRunFixture(
  root: string,
  configName = 'browseros-agent-weekly',
  timestamp = '2026-04-29-1200',
): Promise<{ runDir: string; runId: string }> {
  const runDir = join(root, configName, timestamp)
  const taskDir = join(runDir, 'task-1')
  await mkdir(join(taskDir, 'screenshots'), { recursive: true })
  await writeFile(
    join(taskDir, 'metadata.json'),
    JSON.stringify({
      query_id: 'task-1',
      dataset: 'webbench',
      query: 'Find pricing',
      start_url: 'https://example.test',
      termination_reason: 'completed',
      total_duration_ms: 1200,
      screenshot_count: 1,
      agent_config: { type: 'single', model: 'kimi' },
      grader_results: {
        performance_grader: { score: 1, pass: true, reasoning: 'ok' },
      },
    }),
  )
  await writeFile(join(taskDir, 'messages.jsonl'), '{"type":"user"}\n')
  await writeFile(join(taskDir, 'grades.json'), '{"ok":true}')
  await writeFile(join(taskDir, 'screenshots', '1.png'), 'png')
  await writeFile(
    join(runDir, 'summary.json'),
    JSON.stringify({ passRate: 1, avgDurationMs: 1200 }),
  )
  return { runDir, runId: `${configName}-${timestamp}` }
}
// R2Publisher tests: content-type mapping, full run uploads (artifacts +
// manifest + viewer html), skipping already-uploaded runs when publishing a
// config results directory, and the canonical tasks/ directory layout.
describe('R2Publisher', () => {
  it('maps artifact file extensions to viewer-compatible content types', () => {
    expect(contentTypeForPath('metadata.json')).toBe('application/json')
    expect(contentTypeForPath('messages.jsonl')).toBe('application/x-ndjson')
    expect(contentTypeForPath('screenshots/1.png')).toBe('image/png')
    expect(contentTypeForPath('viewer.html')).toBe('text/html')
  })

  it('uploads run artifacts, manifest, and viewer html', async () => {
    const dir = await mkdtemp(join(tmpdir(), 'eval-r2-'))
    const { runDir, runId } = await writeRunFixture(dir)
    const viewerPath = join(dir, 'viewer.html')
    await writeFile(viewerPath, '<html>viewer</html>')
    const client = new FakeR2Client()
    const result = await new R2Publisher({
      client,
      viewerPath,
      config: {
        accountId: 'acct',
        accessKeyId: 'key',
        secretAccessKey: 'secret',
        bucket: 'bucket',
        cdnBaseUrl: 'https://eval.example.test',
      },
      // Injected clock keeps uploadedAt deterministic for assertions below.
      now: () => new Date('2026-04-29T12:00:00.000Z'),
    }).publishRun(runDir, runId)
    const byKey = new Map(client.puts.map((put) => [put.Key, put]))
    expect(byKey.get(`runs/${runId}/task-1/metadata.json`)?.ContentType).toBe(
      'application/json',
    )
    expect(byKey.get(`runs/${runId}/task-1/messages.jsonl`)?.ContentType).toBe(
      'application/x-ndjson',
    )
    expect(
      byKey.get(`runs/${runId}/task-1/screenshots/1.png`)?.ContentType,
    ).toBe('image/png')
    expect(byKey.get(`runs/${runId}/manifest.json`)?.ContentType).toBe(
      'application/json',
    )
    expect(byKey.get(`runs/${runId}/summary.json`)?.ContentType).toBe(
      'application/json',
    )
    expect(byKey.get('viewer.html')?.ContentType).toBe('text/html')
    expect(result.viewerUrl).toBe(
      `https://eval.example.test/viewer.html?run=${runId}`,
    )
    const manifest = JSON.parse(
      Buffer.from(
        byKey.get(`runs/${runId}/manifest.json`)?.Body as Buffer,
      ).toString('utf-8'),
    )
    expect(manifest).toMatchObject({
      runId,
      uploadedAt: '2026-04-29T12:00:00.000Z',
      dataset: 'webbench',
      summary: { passRate: 1, avgDurationMs: 1200 },
      tasks: [
        {
          queryId: 'task-1',
          status: 'completed',
          screenshotCount: 1,
        },
      ],
    })
  })

  it('publishes unuploaded runs from a config results directory', async () => {
    const dir = await mkdtemp(join(tmpdir(), 'eval-r2-config-'))
    const first = await writeRunFixture(dir, 'weekly', '2026-04-29-1200')
    const second = await writeRunFixture(dir, 'weekly', '2026-04-30-1200')
    const viewerPath = join(dir, 'viewer.html')
    await writeFile(viewerPath, '<html>viewer</html>')
    const client = new FakeR2Client()
    // Marking the first run's manifest as existing makes it count as uploaded.
    client.existing.add(`runs/${first.runId}/manifest.json`)
    const result = await new R2Publisher({
      client,
      viewerPath,
      config: {
        accountId: 'acct',
        accessKeyId: 'key',
        secretAccessKey: 'secret',
        bucket: 'bucket',
        cdnBaseUrl: 'https://eval.example.test',
      },
      now: () => new Date('2026-04-29T12:00:00.000Z'),
    }).publishPath(join(dir, 'weekly'))
    expect(result.uploadedRuns.map((run) => run.runId)).toEqual([second.runId])
    expect(
      client.puts.some(
        (put) => put.Key === `runs/${first.runId}/manifest.json`,
      ),
    ).toBe(false)
    expect(
      client.puts.some(
        (put) => put.Key === `runs/${second.runId}/manifest.json`,
      ),
    ).toBe(true)
    // Local artifacts must be left intact after publishing.
    await expect(
      readFile(join(second.runDir, 'summary.json'), 'utf-8'),
    ).resolves.toContain('passRate')
  })

  it('recognizes and publishes canonical tasks directory runs', async () => {
    const dir = await mkdtemp(join(tmpdir(), 'eval-r2-tasks-'))
    const { runDir, runId } = await writeRunFixture(dir)
    // Move the task into the canonical tasks/ subdirectory layout.
    await mkdir(join(runDir, 'tasks'), { recursive: true })
    await rename(join(runDir, 'task-1'), join(runDir, 'tasks', 'task-1'))
    const viewerPath = join(dir, 'viewer.html')
    await writeFile(viewerPath, '<html>viewer</html>')
    const client = new FakeR2Client()
    const result = await new R2Publisher({
      client,
      viewerPath,
      config: {
        accountId: 'acct',
        accessKeyId: 'key',
        secretAccessKey: 'secret',
        bucket: 'bucket',
        cdnBaseUrl: 'https://eval.example.test',
      },
    }).publishPath(runDir)
    const keys = client.puts.map((put) => put.Key)
    expect(result.uploadedRuns.map((run) => run.runId)).toEqual([runId])
    // Both the legacy flat key and the canonical tasks/ key are uploaded.
    expect(keys).toContain(`runs/${runId}/task-1/metadata.json`)
    expect(keys).toContain(`runs/${runId}/tasks/task-1/metadata.json`)
  })
})

View File

@@ -0,0 +1,82 @@
import { describe, expect, it } from 'bun:test'
import { mkdtemp, readFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { TrajectorySaver } from '../../src/capture/trajectory-saver'
import { createRunId, getRunPaths } from '../../src/runs/artifact-paths'
import type { TaskMetadata } from '../../src/types'
describe('artifact paths', () => {
  it('creates stable safe run ids', () => {
    // Slashes, spaces, and punctuation are slugified; the timestamp is
    // folded in so reruns of the same suite/variant stay distinct.
    const startedAt = new Date('2026-04-29T06:00:00Z')
    const runId = createRunId('agisdk/daily 10', 'kimi fire?', startedAt)
    expect(runId).toBe('agisdk-daily-10__kimi-fire__2026-04-29-0600')
  })

  it('returns run and task artifact paths', () => {
    const paths = getRunPaths('results', 'run-1', 'task-1')
    const runDir = join('results', 'runs', 'run-1')
    const taskDir = join(runDir, 'tasks', 'task-1')
    expect(paths.runDir).toBe(runDir)
    expect(paths.runManifest).toBe(join(runDir, 'run.json'))
    expect(paths.viewerManifest).toBe(join(runDir, 'viewer-manifest.json'))
    expect(paths.messages).toBe(join(taskDir, 'messages.jsonl'))
    expect(paths.graderArtifacts).toBe(join(taskDir, 'grader-artifacts'))
  })
})
describe('TrajectorySaver artifact compatibility', () => {
  it('keeps metadata.json and writes attempt and grades artifacts', async () => {
    const workDir = await mkdtemp(join(tmpdir(), 'eval-artifacts-'))
    const saver = new TrajectorySaver(workDir, 'task-1')
    const taskDir = await saver.init()

    const metadata: TaskMetadata = {
      query_id: 'task-1',
      dataset: 'fixture',
      query: 'Do the task',
      started_at: '2026-04-29T00:00:00.000Z',
      completed_at: '2026-04-29T00:00:01.000Z',
      total_duration_ms: 1000,
      total_steps: 1,
      screenshot_count: 1,
      termination_reason: 'completed',
      final_answer: 'done',
      errors: [],
      warnings: [],
      agent_config: { type: 'single', model: 'model' },
      grader_results: {},
    }
    await saver.saveMetadata(metadata)
    await saver.saveAttempt({ status: 'completed', taskId: 'task-1' })
    await saver.saveGrades({
      performance_grader: { score: 1, pass: true, reasoning: 'ok' },
    })

    // Reads a JSON artifact back out of the task directory.
    const readJson = async (name: string) =>
      JSON.parse(await readFile(join(taskDir, name), 'utf-8'))

    expect(await readJson('metadata.json')).toMatchObject({
      query_id: 'task-1',
    })
    expect(await readJson('attempt.json')).toEqual({
      status: 'completed',
      taskId: 'task-1',
    })
    expect(await readJson('grades.json')).toMatchObject({
      performance_grader: { pass: true },
    })
  })
})

View File

@@ -0,0 +1,21 @@
import { describe, expect, it } from 'bun:test'
import { runEval as oldRunEval } from '../../src/runner/eval-runner'
import { ParallelExecutor } from '../../src/runner/parallel-executor'
import { TaskExecutor } from '../../src/runner/task-executor'
import { runEval } from '../../src/runs/eval-runner'
import { TaskRunPipeline } from '../../src/runs/task-run-pipeline'
import { TaskWorkerPool } from '../../src/runs/task-worker-pool'
describe('runner naming compatibility', () => {
  it('exports new runner-layer names', () => {
    // The renamed runner layers exist under their new names.
    expect(TaskWorkerPool.name).toBe('TaskWorkerPool')
    expect(TaskRunPipeline.name).toBe('TaskRunPipeline')
    expect(typeof runEval).toBe('function')
  })

  it('keeps old runner imports working', () => {
    // Legacy import paths must re-export the identical objects, not copies.
    expect(ParallelExecutor).toBe(TaskWorkerPool)
    expect(TaskExecutor).toBe(TaskRunPipeline)
    expect(oldRunEval).toBe(runEval)
  })
})

View File

@@ -0,0 +1,40 @@
import { describe, expect, it } from 'bun:test'
import { buildRunManifest } from '../../src/runs/run-manifest'
describe('buildRunManifest', () => {
  it('captures reproducibility fields without raw secrets', () => {
    const rawApiKey = 'secret-value'
    const manifest = buildRunManifest({
      runId: 'agisdk-daily-10__kimi__2026-04-29-0600',
      suiteId: 'agisdk-daily-10',
      variant: {
        id: 'kimi',
        agent: {
          provider: 'openai-compatible',
          model: 'moonshotai/kimi-k2.5',
          apiKey: rawApiKey,
          baseUrl: 'https://api.example.com/v1',
        },
        publicMetadata: {
          id: 'kimi',
          agent: {
            provider: 'openai-compatible',
            model: 'moonshotai/kimi-k2.5',
            baseUrlHost: 'api.example.com',
            apiKeyConfigured: true,
            apiKeyEnv: 'EVAL_AGENT_API_KEY',
          },
        },
      },
      datasetPath: 'apps/eval/data/agisdk-real.jsonl',
      datasetHash: 'sha256:abc',
      graders: ['agisdk_state_diff'],
      gitSha: 'abc123',
      browserosVersion: 'BrowserOS 1.0.0',
      startedAt: '2026-04-29T06:00:00.000Z',
    })

    // Reproducibility metadata survives into the manifest.
    expect(manifest.variant.agent.baseUrlHost).toBe('api.example.com')
    expect(manifest.dataset.hash).toBe('sha256:abc')
    // The raw API key must never appear anywhere in the serialized output.
    expect(JSON.stringify(manifest)).not.toContain(rawApiKey)
  })
})

View File

@@ -0,0 +1,37 @@
import { describe, expect, it } from 'bun:test'
import { adaptEvalConfigFile } from '../../src/suites/config-adapter'
describe('adaptEvalConfigFile', () => {
  // Both cases adapt the same legacy weekly config fixture.
  const legacyConfigPath = 'apps/eval/configs/legacy/browseros-agent-weekly.json'

  it('preserves browseros-agent-weekly config semantics', async () => {
    const adapted = await adaptEvalConfigFile(legacyConfigPath)
    const { suite, evalConfig } = adapted
    expect(suite.id).toBe('browseros-agent-weekly')
    expect(suite.dataset).toBe('../../data/webbench-2of4-50.jsonl')
    expect(suite.graders).toEqual(['performance_grader'])
    expect(suite.workers).toBe(10)
    expect(suite.restartBrowserPerTask).toBe(true)
    expect(suite.timeoutMs).toBe(1_800_000)
    expect(evalConfig.num_workers).toBe(10)
    expect(evalConfig.browseros.server_url).toBe('http://127.0.0.1:9110')
  })

  it('keeps API key env names public while omitting secret values', async () => {
    const secret = 'secret-openrouter-value'
    const adapted = await adaptEvalConfigFile(legacyConfigPath, {
      env: { OPENROUTER_API_KEY: secret },
    })
    // The env var NAME is public metadata; its VALUE must never leak.
    expect(adapted.variant.publicMetadata.agent.apiKeyEnv).toBe(
      'OPENROUTER_API_KEY',
    )
    expect(JSON.stringify(adapted.variant.publicMetadata)).not.toContain(secret)
  })
})

View File

@@ -0,0 +1,92 @@
import { describe, expect, it } from 'bun:test'
import { readFile } from 'node:fs/promises'
import { loadSuite } from '../../src/suites/load-suite'
import { resolveVariant } from '../../src/suites/resolve-variant'
import { EvalSuiteSchema } from '../../src/suites/schema'
describe('EvalSuiteSchema', () => {
  it('validates suite settings used by the eval pipeline', () => {
    const suite = EvalSuiteSchema.parse({
      id: 'agisdk-daily-10',
      dataset: 'data/agisdk-daily-10.jsonl',
      agent: { type: 'orchestrated', executorBackend: 'tool-loop' },
      graders: ['agisdk_state_diff'],
      workers: 4,
      restartBrowserPerTask: true,
      timeoutMs: 1_800_000,
    })
    expect(suite.id).toBe('agisdk-daily-10')
    expect(suite.agent.type).toBe('orchestrated')
    expect(suite.agent.executorBackend).toBe('tool-loop')
    expect(suite.workers).toBe(4)
  })

  it('rejects suites without a dataset', () => {
    const parsed = EvalSuiteSchema.safeParse({
      id: 'bad-suite',
      agent: { type: 'tool-loop' },
      graders: ['performance_grader'],
    })
    expect(parsed.success).toBe(false)
  })

  it('validates the daily AGISDK 10-task suite', async () => {
    const loaded = await loadSuite(
      'apps/eval/configs/suites/agisdk-daily-10.json',
    )
    const raw = await readFile(loaded.datasetPath, 'utf-8')
    const lines = raw.trim().split('\n')
    expect(loaded.suite.id).toBe('agisdk-daily-10')
    expect(loaded.suite.graders).toEqual(['agisdk_state_diff'])
    expect(loaded.suite.workers).toBe(1)
    expect(lines).toHaveLength(10)
    // First and last dataset rows anchor the fixture's expected ordering.
    expect(JSON.parse(lines[0]).query_id).toBe('agisdk-dashdish-10')
    expect(JSON.parse(lines[9]).query_id).toBe('agisdk-zilloft-6')
  })
})
describe('resolveVariant', () => {
  it('prefers CLI values over env values and does not expose raw API keys', () => {
    const cliSecret = 'cli-secret'
    const envSecret = 'env-secret'
    const variant = resolveVariant({
      variantId: 'cli-variant',
      provider: 'anthropic',
      model: 'claude-test',
      apiKey: cliSecret,
      baseUrl: 'https://cli.example/v1',
      env: {
        EVAL_VARIANT: 'env-variant',
        EVAL_AGENT_PROVIDER: 'openai-compatible',
        EVAL_AGENT_MODEL: 'env-model',
        EVAL_AGENT_API_KEY: envSecret,
        EVAL_AGENT_BASE_URL: 'https://env.example/v1',
      },
    })

    // Every CLI-provided field wins over its env counterpart.
    expect(variant.id).toBe('cli-variant')
    expect(variant.agent.provider).toBe('anthropic')
    expect(variant.agent.model).toBe('claude-test')
    expect(variant.agent.apiKey).toBe(cliSecret)
    // Public metadata records that a key is configured without leaking
    // either the CLI or the env secret.
    expect(variant.publicMetadata.agent.apiKeyConfigured).toBe(true)
    const serialized = JSON.stringify(variant.publicMetadata)
    expect(serialized).not.toContain(cliSecret)
    expect(serialized).not.toContain(envSecret)
  })

  it('fails clearly when credentials are required but missing', () => {
    const resolveWithoutKey = () =>
      resolveVariant({
        variantId: 'missing-key',
        provider: 'openai-compatible',
        model: 'kimi',
        env: {},
        requireApiKey: true,
      })
    // The error message should name the env var the operator must set.
    expect(resolveWithoutKey).toThrow('EVAL_AGENT_API_KEY')
  })
})

View File

@@ -0,0 +1,41 @@
import { describe, expect, it } from 'bun:test'
import { buildViewerManifest } from '../../src/viewer/viewer-manifest'
describe('buildViewerManifest', () => {
  it('indexes task artifacts for the R2 viewer', () => {
    const manifest = buildViewerManifest({
      runId: 'run-1',
      suiteId: 'agisdk-daily-10',
      variantId: 'kimi',
      uploadedAt: '2026-04-29T06:00:00.000Z',
      summary: { total: 1, passRate: 0 },
      tasks: [
        {
          queryId: 'agisdk-dashdish-4',
          query: 'Schedule a delivery order',
          startUrl: 'https://evals-dashdish.vercel.app',
          status: 'completed',
          durationMs: 353_000,
          screenshotCount: 42,
          graderResults: {
            agisdk_state_diff: {
              score: 0,
              pass: false,
              reasoning: 'Missing checkout item',
            },
          },
        },
      ],
    })

    // Each task entry carries viewer-relative artifact paths rooted at
    // tasks/<queryId>/.
    const { paths } = manifest.tasks[0]
    expect(paths.messages).toBe('tasks/agisdk-dashdish-4/messages.jsonl')
    expect(paths.screenshots).toBe('tasks/agisdk-dashdish-4/screenshots')
    expect(paths.graderArtifacts).toBe('tasks/agisdk-dashdish-4/grader-artifacts')
  })
})