Compare commits

...

4 Commits

Author SHA1 Message Date
shivammittal274 1b44de2ee8 Eval metrics configs (#932) 2026-05-04 21:00:10 +05:30
* feat(eval): add agisdk comparison metrics configs
* fix(eval): keep cdp crashes from aborting run
shivammittal274 10822ff8de fix(eval): bypass claude code tool permissions 2026-05-01 21:14:44 +05:30
shivammittal274 b7892515d7 fix(eval): install claude code cli for CI evals 2026-05-01 21:05:48 +05:30
shivammittal274 2a57dd6ab8 feat(eval): add claude-generated run report artifact 2026-05-01 20:26:30 +05:30
23 changed files with 1182 additions and 43 deletions

View File

@@ -44,6 +44,19 @@ jobs:
working-directory: packages/browseros-agent
run: bun install --ignore-scripts
- name: Install Claude Code CLI
working-directory: packages/browseros-agent/apps/eval
env:
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
run: |
if bun -e "const config = await Bun.file(process.env.EVAL_CONFIG).json(); process.exit(config.agent?.type === 'claude-code' ? 0 : 1)"; then
npm install -g @anthropic-ai/claude-code@2.1.119
echo "Claude Code CLI installed at $(command -v claude)"
claude --version
else
echo "Eval config does not use Claude Code; skipping Claude Code CLI install"
fi
- name: Install Python eval dependencies
# agisdk pinned so silent upstream releases can't shift task definitions
# or grader behavior. Bump intentionally with a documented re-baseline.
@@ -67,13 +80,11 @@ jobs:
env:
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
AWS_REGION: ${{ secrets.AWS_REGION || 'us-west-2' }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
BROWSEROS_BINARY: /usr/bin/browseros
WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
# OpenClaw container runtime is macOS-only; opt the Linux runner
@@ -82,7 +93,35 @@ jobs:
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
run: |
echo "Running eval with config: $EVAL_CONFIG"
- xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts suite --config "$EVAL_CONFIG" --publish r2
+ xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts suite --config "$EVAL_CONFIG"
# Capture the run directory so report.html can be generated before the R2 publish step.
SUMMARY_PATH="$(find results -name summary.json -type f -print | sort | tail -n 1)"
if [ -z "$SUMMARY_PATH" ]; then
echo "No eval run summary found"
exit 1
fi
RUN_DIR="$(dirname "$SUMMARY_PATH")"
echo "EVAL_RUN_DIR=$RUN_DIR" >> "$GITHUB_ENV"
- name: Generate run analysis report
if: success()
working-directory: packages/browseros-agent/apps/eval
env:
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
run: |
echo "Generating run report for $EVAL_RUN_DIR"
bun scripts/generate-report.ts --input "$EVAL_RUN_DIR" --output "$EVAL_RUN_DIR/report.html"
- name: Publish eval run to R2
if: success()
working-directory: packages/browseros-agent/apps/eval
env:
EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
run: bun run src/index.ts publish --run "$EVAL_RUN_DIR" --target r2
- name: Generate trend report
if: success()
@@ -97,7 +136,7 @@ jobs:
EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
run: bun apps/eval/scripts/weekly-report.ts /tmp/eval-report.html
- - name: Upload report as artifact
+ - name: Upload trend report as artifact
if: success()
uses: actions/upload-artifact@v4
with:
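Note on the wiring above: the eval step exports EVAL_RUN_DIR so the new report and publish steps can find the run. A minimal Bun/TypeScript sketch of the same discovery the bash block performs (the lexical sort is only chronological if run directories are timestamp-named, which the workflow's `sort | tail -n 1` already assumes):

```ts
import { dirname } from 'node:path'

// Mirror the workflow's `find results -name summary.json | sort | tail -n 1`.
const summaries: string[] = []
for await (const path of new Bun.Glob('results/**/summary.json').scan('.')) {
  summaries.push(path)
}
summaries.sort() // lexical; chronological when run dirs are timestamped
const latest = summaries.at(-1)
if (!latest) {
  console.error('No eval run summary found')
  process.exit(1)
}
// The workflow appends this assignment to $GITHUB_ENV for later steps.
console.log(`EVAL_RUN_DIR=${dirname(latest)}`)
```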

View File

@@ -0,0 +1,26 @@
{
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "moonshotai/kimi-k2.5",
"apiKey": "OPENROUTER_API_KEY",
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
},
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 3,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["agisdk_state_diff"],
"timeout_ms": 1800000
}

View File

@@ -0,0 +1,27 @@
{
"agent": {
"type": "single",
"provider": "bedrock",
"model": "global.anthropic.claude-opus-4-6-v1",
"region": "AWS_REGION",
"accessKeyId": "AWS_ACCESS_KEY_ID",
"secretAccessKey": "AWS_SECRET_ACCESS_KEY",
"supportsImages": true
},
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 2,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["agisdk_state_diff"],
"timeout_ms": 1800000
}
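In both comparison configs, credential-shaped fields such as `apiKey`, `region`, `accessKeyId`, and `secretAccessKey` hold environment-variable names rather than secrets; `resolveProviderConfig` dereferences them at runtime. A hedged sketch of that resolution, matching what the `resolveProviderConfig` test later in this diff asserts (treating unset variables as `undefined` is an assumption):

```ts
type Env = Record<string, string | undefined>

// Dereference a config value that names an env var, e.g. 'AWS_REGION'.
function resolveEnvValue(
  name: string | undefined,
  env: Env = process.env,
): string | undefined {
  if (!name) return undefined
  return env[name] // assumption: unset variables resolve to undefined
}

// With AWS_REGION exported (the workflow defaults it to 'us-west-2'):
// resolveEnvValue('AWS_REGION') === 'us-west-2'
```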

View File

@@ -8,7 +8,7 @@
"supportsImages": true
},
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 10,
"num_workers": 3,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",

View File

@@ -1,7 +1,8 @@
{
"agent": {
"type": "claude-code",
"model": "opus"
"model": "opus",
"extraArgs": ["--permission-mode", "bypassPermissions"]
},
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 1,

View File

@@ -0,0 +1,191 @@
#!/usr/bin/env bun
import { mkdir, stat } from 'node:fs/promises'
import { dirname, resolve } from 'node:path'
import { query as claudeQuery } from '@anthropic-ai/claude-agent-sdk'
import { readRunMetricSummary } from '../src/reporting/task-metrics'
export const DEFAULT_REPORT_MODEL = 'claude-opus-4-6'
export const DEFAULT_REPORT_MAX_TURNS = 300
type Env = Record<string, string | undefined>
type ClaudeQuery = (input: unknown) => AsyncIterable<Record<string, unknown>>
export interface ReportAgentInvocation {
inputDir: string
outputPath: string
prompt: string
}
export interface GenerateEvalReportOptions {
inputDir: string
outputPath: string
runAgent?: (invocation: ReportAgentInvocation) => Promise<void>
}
interface ClaudeReportAgentDeps {
query?: ClaudeQuery
env?: Env
}
function usage(): string {
return `Usage: bun scripts/generate-report.ts --input <run-dir> --output <report.html>`
}
function parseArgs(
argv: string[],
): Pick<GenerateEvalReportOptions, 'inputDir' | 'outputPath'> {
let inputDir = ''
let outputPath = ''
for (let i = 0; i < argv.length; i++) {
const arg = argv[i]
if (arg === '--input' || arg === '--run') {
inputDir = argv[++i] ?? ''
} else if (arg === '--output' || arg === '--out') {
outputPath = argv[++i] ?? ''
} else if (arg === '--help' || arg === '-h') {
console.log(usage())
process.exit(0)
}
}
if (!inputDir || !outputPath) {
throw new Error(usage())
}
return { inputDir, outputPath }
}
function claudeCodeEnv(env: Env): Env {
return {
CLAUDE_CODE_OAUTH_TOKEN: env.CLAUDE_CODE_OAUTH_TOKEN,
ANTHROPIC_API_KEY: env.ANTHROPIC_API_KEY,
HOME: env.HOME,
PATH: env.PATH,
SHELL: env.SHELL,
TMPDIR: env.TMPDIR,
TMP: env.TMP,
TEMP: env.TEMP,
USER: env.USER,
CLAUDECODE: '',
}
}
async function buildReportPrompt(
inputDir: string,
outputPath: string,
): Promise<string> {
const metrics = await readRunMetricSummary(inputDir)
return `Analyze this BrowserOS eval run and write a shareable HTML report.
Run directory: ${inputDir}
Output file to write: ${outputPath}
You are running with the run directory as cwd. Inspect the local artifacts:
- summary.json for run totals and pass rate
- each task directory's metadata.json for query, final answer, timing, screenshots, and grader results
- each task directory's messages.jsonl for tool calls, tool errors, and recent trajectory
- screenshots/ for visual evidence
- grader-artifacts/ when present for grader-specific context
Write the final report directly to the output file path above. Do not print the
report instead of writing it. Do not modify any input artifacts. The only file
you should create or overwrite is the requested report.html.
The report should follow the style and density of the Shadowfax AGI SDK report:
- Title like "AGI SDK Random-10 Failure Report" or a run-specific equivalent
- Run directory and note that screenshots are embedded as data URIs
- Summary cards for total tasks, passed, failed, pass rate, average duration, average steps, and average tool calls
- A Metrics section with compact charts for Duration by task, Steps by task, Tool calls by task, and Tool errors by task
- Task Summary table with task id, status, score, duration, steps, and prompt
- Include tool calls and tool errors in the Task Summary table
- Failure sections with stable anchors using each task id, for example <section id="agisdk-networkin-10">
- For each failed task: Diagnosis, Evidence, Next Check, final screenshot, AGI SDK / grader criteria, final answer, and recent trajectory events
- Make failure links in the summary table point to the task anchors
- Keep the HTML self-contained: inline CSS and embedded final screenshots as data:image/png;base64 URIs
- Escape user/model text correctly so task outputs cannot break the page
Analysis guidance:
- Focus on why the model failed: task understanding, browser/tool usage, missing verification, tool errors, max-step/timeout, bad final answer, or grader ambiguity
- Use messages.jsonl strategically. Do not paste huge DOM outputs into the report. Summarize only the relevant recent trajectory and evidence.
- Limit trajectory analysis to the most relevant 200-300 events/calls across the run. Prefer failed tasks and the final/key actions for each failure.
- If a grader criterion is boolean-only or ambiguous, say so and identify what additional artifact would make it debuggable.
Deterministic run metrics computed from metadata.json and messages.jsonl:
\`\`\`json
${JSON.stringify(metrics, null, 2)}
\`\`\`
After writing the file, verify that ${outputPath} exists and is non-empty.`
}
async function assertRunDir(inputDir: string): Promise<void> {
const inputStat = await stat(inputDir).catch(() => null)
if (!inputStat?.isDirectory()) {
throw new Error(`Not a run directory: ${inputDir}`)
}
}
async function assertReportWritten(outputPath: string): Promise<void> {
const outputStat = await stat(outputPath).catch(() => null)
if (!outputStat?.isFile() || outputStat.size === 0) {
throw new Error(`Report was not written: ${outputPath}`)
}
}
export async function runClaudeCodeReportAgent(
invocation: ReportAgentInvocation,
deps: ClaudeReportAgentDeps = {},
): Promise<void> {
const query = deps.query ?? (claudeQuery as unknown as ClaudeQuery)
let resultSubtype: string | undefined
for await (const message of query({
prompt: invocation.prompt,
options: {
cwd: invocation.inputDir,
model: DEFAULT_REPORT_MODEL,
systemPrompt:
'You are an eval failure analyst. Produce a concise, evidence-backed, self-contained HTML report from local run artifacts.',
permissionMode: 'bypassPermissions',
allowDangerouslySkipPermissions: true,
maxTurns: DEFAULT_REPORT_MAX_TURNS,
env: claudeCodeEnv(deps.env ?? process.env),
},
})) {
if (message.type === 'result') {
resultSubtype =
typeof message.subtype === 'string' ? message.subtype : undefined
}
}
if (resultSubtype && resultSubtype !== 'success') {
throw new Error(`Claude Code report agent failed: ${resultSubtype}`)
}
}
export async function generateEvalReport(
options: GenerateEvalReportOptions,
): Promise<void> {
const inputDir = resolve(options.inputDir)
const outputPath = resolve(options.outputPath)
await assertRunDir(inputDir)
await mkdir(dirname(outputPath), { recursive: true })
const invocation = {
inputDir,
outputPath,
prompt: await buildReportPrompt(inputDir, outputPath),
}
await (options.runAgent ?? runClaudeCodeReportAgent)(invocation)
await assertReportWritten(outputPath)
}
if (import.meta.main) {
try {
await generateEvalReport(parseArgs(Bun.argv.slice(2)))
} catch (error) {
console.error(error instanceof Error ? error.message : String(error))
process.exit(1)
}
}
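The script is also importable: its tests inject `runAgent` instead of making a live Claude Code call, and omitting `runAgent` falls back to `runClaudeCodeReportAgent`. A minimal usage sketch along those lines (the run-directory path is hypothetical):

```ts
import { writeFile } from 'node:fs/promises'
import { generateEvalReport } from './generate-report'

await generateEvalReport({
  inputDir: 'results/2026-05-04-demo', // hypothetical run directory
  outputPath: 'results/2026-05-04-demo/report.html',
  // Stubbed agent: write a placeholder so assertReportWritten passes.
  runAgent: async ({ outputPath, prompt }) => {
    console.log(`report prompt: ${prompt.length} chars`)
    await writeFile(outputPath, '<!doctype html><h1>stub report</h1>')
  },
})
```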

View File

@@ -134,7 +134,10 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
// Connect to Chrome via CDP — same per-worker offset used by app-manager.
const cdpPort = config.browseros.base_cdp_port + workerIndex
- const cdp = new CdpBackend({ port: cdpPort })
+ const cdp = new CdpBackend({
+ port: cdpPort,
+ exitOnReconnectFailure: false,
+ })
await cdp.connect()
const browser = new Browser(cdp)
capture.screenshot.setBrowser(browser)

View File

@@ -43,7 +43,10 @@ export class SingleAgentEvaluator implements AgentEvaluator {
// Connect to Chrome via CDP — same per-worker offset used by app-manager.
const cdpPort = config.browseros.base_cdp_port + workerIndex
- const cdp = new CdpBackend({ port: cdpPort })
+ const cdp = new CdpBackend({
+ port: cdpPort,
+ exitOnReconnectFailure: false,
+ })
await cdp.connect()
const browser = new Browser(cdp)

View File

@@ -536,6 +536,12 @@ export interface DashboardConfig {
configMode?: boolean
}
export function shouldAutoOpenDashboard(
env: Record<string, string | undefined> = process.env,
): boolean {
return env.CI !== 'true'
}
export function startDashboard(config: DashboardConfig) {
const port = config.port ?? 9900
dashboardConfigMode = config.configMode ?? false
@@ -558,10 +564,12 @@ export function startDashboard(config: DashboardConfig) {
console.log(` Dashboard: ${url}`)
// Auto-open browser
- try {
- Bun.spawn(['open', url], { stdout: 'ignore', stderr: 'ignore' })
- } catch {
- /* ignore if open command fails */
+ if (shouldAutoOpenDashboard()) {
+ try {
+ Bun.spawn(['open', url], { stdout: 'ignore', stderr: 'ignore' })
+ } catch {
+ /* ignore if open command fails */
+ }
+ }
return { url, port }
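Because `shouldAutoOpenDashboard` takes an env record that defaults to `process.env`, the CI guard is testable without mutating globals. A short sketch of the contract (import path as used by the new test later in this diff):

```ts
import { shouldAutoOpenDashboard } from '../../src/dashboard/server'

shouldAutoOpenDashboard({ CI: 'true' })  // false: CI runs stay headless
shouldAutoOpenDashboard({ CI: 'false' }) // true
shouldAutoOpenDashboard({})              // true: default local behavior
```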

View File

@@ -61,6 +61,17 @@
.header-stats .stat-pass { color: #3fb950; }
.header-stats .stat-fail { color: #f85149; }
.header-stats .stat-score { color: #f0883e; }
.header-report {
color: #58a6ff;
text-decoration: none;
font-size: 12px;
font-weight: 600;
border: 1px solid #30363d;
border-radius: 6px;
padding: 5px 9px;
white-space: nowrap;
}
.header-report:hover { border-color: #58a6ff; background: #1c2333; }
/* ── 3-column layout ─────────────────────────────────────────── */
.layout {
@@ -84,6 +95,7 @@
background: #161b22;
border-bottom: 1px solid #30363d;
display: flex;
flex-wrap: wrap;
gap: 12px;
font-size: 11px;
font-weight: 600;
@@ -93,6 +105,80 @@
}
.sidebar-stats .s-pass { color: #3fb950; }
.sidebar-stats .s-fail { color: #f85149; }
.sidebar-metrics {
padding: 12px 16px;
background: #0d1117;
border-bottom: 1px solid #21262d;
}
.metric-grid {
display: grid;
grid-template-columns: repeat(3, minmax(0, 1fr));
gap: 8px;
margin-bottom: 12px;
}
.metric-cell {
min-width: 0;
}
.metric-label {
display: block;
font-size: 9px;
font-weight: 600;
color: #6e7681;
text-transform: uppercase;
letter-spacing: 0.04em;
white-space: nowrap;
}
.metric-value {
display: block;
font-size: 13px;
font-weight: 700;
color: #e6edf3;
margin-top: 2px;
overflow: hidden;
text-overflow: ellipsis;
}
.mini-chart {
display: flex;
flex-direction: column;
gap: 6px;
}
.mini-chart-title {
font-size: 10px;
font-weight: 700;
color: #8b949e;
text-transform: uppercase;
letter-spacing: 0.04em;
}
.mini-bar-row {
display: grid;
grid-template-columns: minmax(60px, 1fr) 70px 28px;
gap: 8px;
align-items: center;
font-size: 10px;
color: #8b949e;
}
.mini-bar-name {
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace;
}
.mini-bar-track {
height: 6px;
background: #21262d;
border-radius: 999px;
overflow: hidden;
}
.mini-bar-fill {
height: 100%;
background: #58a6ff;
border-radius: 999px;
}
.mini-bar-value {
color: #e6edf3;
font-variant-numeric: tabular-nums;
text-align: right;
}
.sidebar-filter {
padding: 8px 12px;
border-bottom: 1px solid #21262d;
@@ -526,6 +612,7 @@
<div class="header-sep"></div>
<span class="header-run" id="header-run"></span>
<span class="header-date" id="header-date"></span>
<a class="header-report" id="header-report" target="_blank" rel="noopener" style="display: none;">Run Report</a>
<div class="header-stats" id="header-stats"></div>
</div>
@@ -533,6 +620,7 @@
<!-- Left sidebar -->
<div class="sidebar" id="sidebar">
<div class="sidebar-stats" id="sidebar-stats"></div>
<div class="sidebar-metrics" id="sidebar-metrics"></div>
<div class="sidebar-filter">
<input type="text" id="filter-input" placeholder="Search tasks..." autocomplete="off" spellcheck="false" />
</div>
@@ -627,7 +715,23 @@
if (stats.avgScore !== null) {
parts.push(`<span class="stat-score">avg ${stats.avgScore}%</span>`);
}
if (stats.avgDurationMs !== null) {
parts.push(`<span>${fmtDuration(stats.avgDurationMs)} avg</span>`);
}
if (stats.avgToolCalls !== null) {
parts.push(`<span>${fmtCompact(stats.avgToolCalls)} tools/task</span>`);
}
el.innerHTML = parts.join('');
const reportLink = document.getElementById('header-report');
const url = reportUrl(manifest);
if (url) {
reportLink.href = url;
reportLink.style.display = '';
} else {
reportLink.removeAttribute('href');
reportLink.style.display = 'none';
}
}
// ── Sidebar rendering ─────────────────────────────────────────
@@ -639,11 +743,49 @@
statsEl.innerHTML =
'<span>' + stats.total + ' total</span>' +
'<span class="s-pass">' + stats.passed + ' pass</span>' +
'<span class="s-fail">' + stats.failed + ' fail</span>';
'<span class="s-fail">' + stats.failed + ' fail</span>' +
(stats.avgSteps !== null ? '<span>' + fmtCompact(stats.avgSteps) + ' steps/task</span>' : '') +
(stats.avgToolCalls !== null ? '<span>' + fmtCompact(stats.avgToolCalls) + ' tools/task</span>' : '');
renderSidebarMetrics(tasks, stats);
renderTaskList('');
}
function renderSidebarMetrics(tasks, stats) {
const el = document.getElementById('sidebar-metrics');
if (!el) return;
const chartTasks = tasks
.slice()
.sort((a, b) => taskMetrics(b).toolCalls - taskMetrics(a).toolCalls)
.slice(0, 5);
const maxCalls = Math.max(1, ...chartTasks.map((task) => taskMetrics(task).toolCalls));
const bars = chartTasks.map((task) => {
const calls = taskMetrics(task).toolCalls;
const width = Math.max(4, Math.round((calls / maxCalls) * 100));
return (
'<div class="mini-bar-row">' +
'<span class="mini-bar-name" title="' + escAttr(task.queryId || task.id || 'Untitled') + '">' + esc(task.queryId || task.id || 'Untitled') + '</span>' +
'<span class="mini-bar-track"><span class="mini-bar-fill" style="width: ' + width + '%"></span></span>' +
'<span class="mini-bar-value">' + fmtCompact(calls) + '</span>' +
'</div>'
);
}).join('');
el.innerHTML =
'<div class="metric-grid">' +
'<div class="metric-cell"><span class="metric-label">Avg Time</span><span class="metric-value">' + (stats.avgDurationMs !== null ? fmtDuration(stats.avgDurationMs) : '-') + '</span></div>' +
'<div class="metric-cell"><span class="metric-label">Avg Steps</span><span class="metric-value">' + (stats.avgSteps !== null ? fmtCompact(stats.avgSteps) : '-') + '</span></div>' +
'<div class="metric-cell"><span class="metric-label">Avg Tools</span><span class="metric-value">' + (stats.avgToolCalls !== null ? fmtCompact(stats.avgToolCalls) : '-') + '</span></div>' +
'</div>' +
'<div class="mini-chart">' +
'<div class="mini-chart-title">Tool Calls by Task</div>' +
(bars || '<div class="task-meta-line"><span>No tool calls recorded</span></div>') +
'</div>';
}
function renderTaskList(filter) {
const list = document.getElementById('task-list');
list.innerHTML = '';
@@ -668,8 +810,11 @@
}
const metaParts = [];
- if (task.durationMs) metaParts.push(fmtDuration(task.durationMs));
- if (task.screenshotCount) metaParts.push(`${task.screenshotCount} steps`);
+ const metrics = taskMetrics(task);
+ if (metrics.durationMs) metaParts.push(fmtDuration(metrics.durationMs));
+ if (metrics.steps) metaParts.push(`${fmtCompact(metrics.steps)} steps`);
+ if (metrics.toolCalls) metaParts.push(`${fmtCompact(metrics.toolCalls)} tools`);
+ if (metrics.toolErrors) metaParts.push(`${fmtCompact(metrics.toolErrors)} errors`);
item.innerHTML =
'<div class="task-row">' +
@@ -714,7 +859,7 @@
}
function artifactPath(task, artifact) {
- const manifestPath = task.paths && task.paths[artifact];
+ const manifestPath = task.paths?.[artifact];
if (typeof manifestPath === 'string' && manifestPath.length > 0) {
return manifestPath.replace(/^\/+/, '');
}
@@ -725,6 +870,17 @@
return `${basePath}/${artifactPath(task, artifact)}`;
}
function runArtifactUrl(path) {
if (typeof path !== 'string' || path.length === 0) return null;
return `${basePath}/${path.replace(/^\/+/, '')}`;
}
function reportUrl(manifest, task) {
const url = runArtifactUrl(manifest?.reportPath);
if (!url || !task) return url;
return `${url}#${encodeURIComponent(task.queryId || task.id || '')}`;
}
function metadataUrl(task) {
return artifactUrl(task, 'metadata');
}
@@ -905,10 +1061,38 @@
}
// Duration
- if (task.durationMs) {
+ const metrics = taskMetrics(task);
+ if (metrics.durationMs) {
html += '<div class="db-section">';
html += '<span class="db-label">Duration</span>';
html += `<span class="db-value">${fmtDuration(task.durationMs)}</span>`;
html += `<span class="db-value">${fmtDuration(metrics.durationMs)}</span>`;
html += '</div>';
}
if (metrics.steps) {
html += '<div class="db-section">';
html += '<span class="db-label">Steps</span>';
html += `<span class="db-value">${fmtCompact(metrics.steps)}</span>`;
html += '</div>';
}
html += '<div class="db-section">';
html += '<span class="db-label">Tool Calls</span>';
html += `<span class="db-value">${fmtCompact(metrics.toolCalls)}</span>`;
html += '</div>';
if (metrics.toolErrors) {
html += '<div class="db-section">';
html += '<span class="db-label">Tool Errors</span>';
html += `<span class="db-value">${fmtCompact(metrics.toolErrors)}</span>`;
html += '</div>';
}
const reportLink = reportUrl(manifest, task);
if (reportLink) {
html += '<div class="db-section">';
html += '<span class="db-label">Report</span>';
html += `<span class="db-value"><a href="${escAttr(reportLink)}" target="_blank" rel="noopener">Open task analysis</a></span>`;
html += '</div>';
}
@@ -1234,8 +1418,25 @@
function computeStats(tasks) {
const total = tasks.length;
let passed = 0, failed = 0, totalScore = 0, scoredCount = 0;
let totalDurationMs = 0, durationCount = 0;
let totalSteps = 0, stepsCount = 0;
let totalToolCalls = 0, toolCount = 0;
let totalToolErrors = 0;
tasks.forEach((t) => {
const metrics = taskMetrics(t);
if (metrics.durationMs > 0) {
totalDurationMs += metrics.durationMs;
durationCount++;
}
if (metrics.steps > 0) {
totalSteps += metrics.steps;
stepsCount++;
}
totalToolCalls += metrics.toolCalls;
totalToolErrors += metrics.toolErrors;
toolCount++;
const graders = t.graderResults || {};
const keys = Object.keys(graders);
if (keys.length > 0) {
@@ -1254,7 +1455,34 @@
total: total,
passed: passed,
failed: failed,
- avgScore: scoredCount > 0 ? Math.round((totalScore / scoredCount) * 100) : null
+ avgScore: scoredCount > 0 ? Math.round((totalScore / scoredCount) * 100) : null,
+ avgDurationMs: durationCount > 0 ? totalDurationMs / durationCount : null,
+ avgSteps: stepsCount > 0 ? totalSteps / stepsCount : null,
+ avgToolCalls: toolCount > 0 ? totalToolCalls / toolCount : null,
+ totalToolCalls: totalToolCalls,
+ totalToolErrors: totalToolErrors
};
}
function taskMetrics(task) {
const metrics = task.metrics || {};
const screenshots = Number.isFinite(Number(metrics.screenshots))
? Number(metrics.screenshots)
: Number(task.screenshotCount || 0);
return {
durationMs: Number.isFinite(Number(metrics.durationMs))
? Number(metrics.durationMs)
: Number(task.durationMs || 0),
steps: Number.isFinite(Number(metrics.steps))
? Number(metrics.steps)
: screenshots,
screenshots: screenshots,
toolCalls: Number.isFinite(Number(metrics.toolCalls))
? Number(metrics.toolCalls)
: 0,
toolErrors: Number.isFinite(Number(metrics.toolErrors))
? Number(metrics.toolErrors)
: 0
};
}
@@ -1310,6 +1538,13 @@
return `${h}h ${remM}m`;
}
function fmtCompact(value) {
const num = Number(value);
if (!Number.isFinite(num)) return '0';
if (Number.isInteger(num)) return String(num);
return num.toFixed(1);
}
function showFatalError(msgHtml) {
document.getElementById('center-panel').innerHTML =
'<div class="placeholder error">' +

View File

@@ -5,6 +5,7 @@ import {
PutObjectCommand,
S3Client,
} from '@aws-sdk/client-s3'
import { readTaskMetrics } from '../reporting/task-metrics'
import {
buildViewerManifest,
type ViewerManifestTaskInput,
@@ -315,6 +316,7 @@ export class R2Publisher {
graderResults:
(meta.grader_results as ViewerManifestTaskInput['graderResults']) ||
{},
metrics: await readTaskMetrics(taskPath, meta, screenshotCount),
})
}
@@ -379,10 +381,12 @@ export class R2Publisher {
await readFile(join(runDir, 'summary.json'), 'utf-8'),
) as Record<string, unknown>
} catch {}
const reportStat = await stat(join(runDir, 'report.html')).catch(() => null)
return buildViewerManifest({
runId,
uploadedAt: this.now().toISOString(),
reportPath: reportStat?.isFile() ? 'report.html' : undefined,
agentConfig,
dataset,
summary: summaryData

View File

@@ -0,0 +1,188 @@
import { readdir, readFile, stat } from 'node:fs/promises'
import { join } from 'node:path'
export interface EvalTaskMetrics {
durationMs: number
steps: number
screenshots: number
toolCalls: number
toolErrors: number
}
export interface EvalRunMetrics {
taskCount: number
totalDurationMs: number
avgDurationMs: number
totalSteps: number
avgSteps: number
totalToolCalls: number
avgToolCalls: number
totalToolErrors: number
avgToolErrors: number
}
export interface EvalTaskMetricSummary {
queryId: string
status: string
score?: number
pass?: boolean
metrics: EvalTaskMetrics
}
export interface EvalRunMetricSummary {
run: EvalRunMetrics
tasks: EvalTaskMetricSummary[]
}
interface TaskDirEntry {
taskId: string
taskPath: string
}
function numberValue(value: unknown): number {
return typeof value === 'number' && Number.isFinite(value) ? value : 0
}
export function countMessageMetrics(messagesJsonl: string): {
toolCalls: number
toolErrors: number
} {
let toolCalls = 0
let toolErrors = 0
for (const line of messagesJsonl.split('\n')) {
const trimmed = line.trim()
if (!trimmed) continue
try {
const event = JSON.parse(trimmed) as { type?: unknown }
if (event.type === 'tool-input-available') toolCalls++
if (event.type === 'tool-output-error') toolErrors++
} catch {
// Ignore malformed telemetry lines; the raw artifact is still uploaded.
}
}
return { toolCalls, toolErrors }
}
export function buildTaskMetrics(
metadata: Record<string, unknown>,
messageMetrics: { toolCalls: number; toolErrors: number },
screenshotCount = 0,
): EvalTaskMetrics {
const screenshots = numberValue(metadata.screenshot_count) || screenshotCount
return {
durationMs: numberValue(metadata.total_duration_ms),
steps: numberValue(metadata.total_steps) || screenshots,
screenshots,
toolCalls: messageMetrics.toolCalls,
toolErrors: messageMetrics.toolErrors,
}
}
export function buildRunMetrics(metrics: EvalTaskMetrics[]): EvalRunMetrics {
const taskCount = metrics.length
const totalDurationMs = metrics.reduce((sum, metric) => {
return sum + metric.durationMs
}, 0)
const totalSteps = metrics.reduce((sum, metric) => sum + metric.steps, 0)
const totalToolCalls = metrics.reduce((sum, metric) => {
return sum + metric.toolCalls
}, 0)
const totalToolErrors = metrics.reduce((sum, metric) => {
return sum + metric.toolErrors
}, 0)
return {
taskCount,
totalDurationMs,
avgDurationMs: taskCount > 0 ? totalDurationMs / taskCount : 0,
totalSteps,
avgSteps: taskCount > 0 ? totalSteps / taskCount : 0,
totalToolCalls,
avgToolCalls: taskCount > 0 ? totalToolCalls / taskCount : 0,
totalToolErrors,
avgToolErrors: taskCount > 0 ? totalToolErrors / taskCount : 0,
}
}
export async function readTaskMetrics(
taskPath: string,
metadata: Record<string, unknown>,
screenshotCount = 0,
): Promise<EvalTaskMetrics> {
const messages = await readFile(join(taskPath, 'messages.jsonl'), 'utf-8')
.then(countMessageMetrics)
.catch(() => ({ toolCalls: 0, toolErrors: 0 }))
return buildTaskMetrics(metadata, messages, screenshotCount)
}
function statusFromMetadata(metadata: Record<string, unknown>): string {
const termination = metadata.termination_reason
if (termination === 'timeout') return 'timeout'
if (Array.isArray(metadata.errors) && metadata.errors.length > 0) {
return 'failed'
}
return 'completed'
}
function primaryGrade(metadata: Record<string, unknown>): {
score?: number
pass?: boolean
} {
const graders = metadata.grader_results as
| Record<string, { score?: unknown; pass?: unknown }>
| undefined
const first = graders ? Object.values(graders)[0] : undefined
return {
...(typeof first?.score === 'number' ? { score: first.score } : {}),
...(typeof first?.pass === 'boolean' ? { pass: first.pass } : {}),
}
}
async function readTaskDirs(runDir: string): Promise<TaskDirEntry[]> {
const canonicalTasksDir = join(runDir, 'tasks')
const canonicalStat = await stat(canonicalTasksDir).catch(() => null)
const baseDir = canonicalStat?.isDirectory() ? canonicalTasksDir : runDir
const entries = await readdir(baseDir, { withFileTypes: true }).catch(
() => [],
)
return entries
.filter((entry) => entry.isDirectory())
.filter((entry) => entry.name !== 'screenshots')
.filter((entry) => entry.name !== 'tasks')
.map((entry) => ({
taskId: entry.name,
taskPath: join(baseDir, entry.name),
}))
}
export async function readRunMetricSummary(
runDir: string,
): Promise<EvalRunMetricSummary> {
const tasks: EvalTaskMetricSummary[] = []
for (const entry of await readTaskDirs(runDir)) {
const metadata = await readFile(
join(entry.taskPath, 'metadata.json'),
'utf-8',
)
.then((text) => JSON.parse(text) as Record<string, unknown>)
.catch(() => null)
if (!metadata) continue
const metrics = await readTaskMetrics(entry.taskPath, metadata)
tasks.push({
queryId: (metadata.query_id as string | undefined) || entry.taskId,
status: statusFromMetadata(metadata),
...primaryGrade(metadata),
metrics,
})
}
return {
run: buildRunMetrics(tasks.map((task) => task.metrics)),
tasks,
}
}
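A hedged usage sketch of the exported helpers; the run-directory path is hypothetical:

```ts
import { readRunMetricSummary } from './task-metrics'

const { run, tasks } = await readRunMetricSummary('results/2026-05-04-demo')
console.log(
  `${run.taskCount} tasks, ` +
    `${run.avgToolCalls.toFixed(1)} tool calls/task, ` +
    `${run.totalToolErrors} tool errors`,
)
for (const task of tasks) {
  console.log(task.queryId, task.status, `${task.metrics.toolCalls} tool calls`)
}
```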

View File

@@ -36,5 +36,6 @@ export async function resolveProviderConfig(
accessKeyId: resolveEnvValue(agent.accessKeyId),
secretAccessKey: resolveEnvValue(agent.secretAccessKey),
sessionToken: resolveEnvValue(agent.sessionToken),
region: resolveEnvValue(agent.region),
}
}

View File

@@ -1,3 +1,8 @@
import {
buildRunMetrics,
type EvalRunMetrics,
type EvalTaskMetrics,
} from '../reporting/task-metrics'
import type { GraderResult } from '../types'
export const VIEWER_MANIFEST_SCHEMA_VERSION = 2
@@ -20,6 +25,7 @@ export interface ViewerManifestTaskInput {
status: string
durationMs: number
screenshotCount: number
metrics?: EvalTaskMetrics
graderResults: Record<string, GraderResult>
}
@@ -35,9 +41,11 @@ export interface ViewerManifest {
suiteId?: string
variantId?: string
uploadedAt?: string
reportPath?: string
agentConfig?: Record<string, unknown>
dataset?: string
summary?: Record<string, unknown>
metrics?: EvalRunMetrics
tasks: ViewerManifestTask[]
}
@@ -46,6 +54,7 @@ export interface BuildViewerManifestInput {
suiteId?: string
variantId?: string
uploadedAt?: string
reportPath?: string
agentConfig?: Record<string, unknown>
dataset?: string
summary?: Record<string, unknown>
@@ -68,22 +77,37 @@ function taskPaths(queryId: string): ViewerManifestTaskPaths {
export function buildViewerManifest(
input: BuildViewerManifestInput,
): ViewerManifest {
const tasks = input.tasks.map((task) => {
const { artifactId, ...publicTask } = task
const metrics =
publicTask.metrics ??
({
durationMs: publicTask.durationMs,
steps: publicTask.screenshotCount,
screenshots: publicTask.screenshotCount,
toolCalls: 0,
toolErrors: 0,
} satisfies EvalTaskMetrics)
return {
...publicTask,
metrics,
startUrl: publicTask.startUrl ?? '',
paths: taskPaths(artifactId ?? publicTask.queryId),
}
})
return {
schemaVersion: VIEWER_MANIFEST_SCHEMA_VERSION,
runId: input.runId,
...(input.suiteId ? { suiteId: input.suiteId } : {}),
...(input.variantId ? { variantId: input.variantId } : {}),
...(input.uploadedAt ? { uploadedAt: input.uploadedAt } : {}),
...(input.reportPath ? { reportPath: input.reportPath } : {}),
...(input.agentConfig ? { agentConfig: input.agentConfig } : {}),
...(input.dataset ? { dataset: input.dataset } : {}),
...(input.summary ? { summary: input.summary } : {}),
- tasks: input.tasks.map((task) => {
- const { artifactId, ...publicTask } = task
- return {
- ...publicTask,
- startUrl: publicTask.startUrl ?? '',
- paths: taskPaths(artifactId ?? publicTask.queryId),
- }
- }),
+ metrics: buildRunMetrics(tasks.map((task) => task.metrics)),
+ tasks,
}
}
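The run-level rollup the manifest now carries comes from the same `buildRunMetrics` helper; a small sketch with the numbers the viewer test below also uses:

```ts
import { buildRunMetrics } from '../reporting/task-metrics' // path as imported above

const runMetrics = buildRunMetrics([
  { durationMs: 1000, steps: 4, screenshots: 4, toolCalls: 3, toolErrors: 0 },
  { durationMs: 3000, steps: 8, screenshots: 8, toolCalls: 5, toolErrors: 2 },
])
// => avgDurationMs: 2000, avgSteps: 6, avgToolCalls: 4, totalToolErrors: 2
console.log(runMetrics)
```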

View File

@@ -0,0 +1,12 @@
import { describe, expect, it } from 'bun:test'
import { shouldAutoOpenDashboard } from '../../src/dashboard/server'
describe('dashboard server', () => {
it('does not auto-open the dashboard in CI', () => {
expect(shouldAutoOpenDashboard({ CI: 'true' })).toBe(false)
})
it('auto-opens the dashboard outside CI by default', () => {
expect(shouldAutoOpenDashboard({})).toBe(true)
})
})

View File

@@ -40,6 +40,7 @@ async function writeRunFixture(
start_url: 'https://example.test',
termination_reason: 'completed',
total_duration_ms: 1200,
total_steps: 4,
screenshot_count: 1,
agent_config: { type: 'single', model: 'kimi' },
grader_results: {
@@ -47,13 +48,22 @@ async function writeRunFixture(
},
}),
)
- await writeFile(join(taskDir, 'messages.jsonl'), '{"type":"user"}\n')
+ await writeFile(
+ join(taskDir, 'messages.jsonl'),
+ [
+ '{"type":"user"}',
+ '{"type":"tool-input-available","toolName":"click"}',
+ '{"type":"tool-input-available","toolName":"take_snapshot"}',
+ '{"type":"tool-output-error","toolName":"click"}',
+ ].join('\n'),
+ )
await writeFile(join(taskDir, 'grades.json'), '{"ok":true}')
await writeFile(join(taskDir, 'screenshots', '1.png'), 'png')
await writeFile(
join(runDir, 'summary.json'),
JSON.stringify({ passRate: 1, avgDurationMs: 1200 }),
)
await writeFile(join(runDir, 'report.html'), '<html>report</html>')
return { runDir, runId: `${configName}-${timestamp}` }
}
@@ -110,6 +120,9 @@ describe('R2Publisher', () => {
expect(byKey.get(`runs/${runId}/summary.json`)?.ContentType).toBe(
'application/json',
)
expect(byKey.get(`runs/${runId}/report.html`)?.ContentType).toBe(
'text/html',
)
expect(byKey.get('viewer.html')?.ContentType).toBe('text/html')
expect(result.viewerUrl).toBe(
`https://eval.example.test/viewer.html?run=${runId}`,
@@ -126,12 +139,28 @@ describe('R2Publisher', () => {
uploadedAt: '2026-04-29T12:00:00.000Z',
agentConfig: { type: 'single', model: 'kimi' },
dataset: 'webbench',
reportPath: 'report.html',
summary: { passRate: 1, avgDurationMs: 1200 },
metrics: {
taskCount: 1,
avgDurationMs: 1200,
avgSteps: 4,
avgToolCalls: 2,
totalToolCalls: 2,
totalToolErrors: 1,
},
tasks: [
{
queryId: 'task-1',
status: 'completed',
screenshotCount: 1,
metrics: {
durationMs: 1200,
steps: 4,
screenshots: 1,
toolCalls: 2,
toolErrors: 1,
},
paths: {
attempt: 'tasks/task-1/attempt.json',
metadata: 'tasks/task-1/metadata.json',

View File

@@ -6,6 +6,7 @@ interface ViewerPathResolvers {
artifactUrl(task: Record<string, unknown>, artifact: string): string
metadataUrl(task: Record<string, unknown>): string
messagesUrl(task: Record<string, unknown>): string
reportUrl(manifest: Record<string, unknown>): string | null
screenshotUrl(task: Record<string, unknown>, step: number): string
}
@@ -24,7 +25,7 @@ async function loadViewerPathResolvers(): Promise<ViewerPathResolvers> {
`
const basePath = 'runs/run-1';
${block}
- return { artifactUrl, metadataUrl, messagesUrl, screenshotUrl };
+ return { artifactUrl, metadataUrl, messagesUrl, reportUrl, screenshotUrl };
`,
) as () => ViewerPathResolvers
return createResolvers()
@@ -60,6 +61,35 @@ async function runAutoSelectFromHash(hash: string): Promise<unknown> {
return runAutoSelect()
}
async function runComputeStats(): Promise<unknown> {
const html = await readFile(
join(import.meta.dir, '..', '..', 'src', 'dashboard', 'viewer.html'),
'utf-8',
)
const start = html.indexOf('function computeStats(tasks)')
const end = html.indexOf('function resolveStatus(task)', start)
expect(start).toBeGreaterThan(-1)
expect(end).toBeGreaterThan(start)
const block = html.slice(start, end)
const compute = new Function(
`
${block}
return computeStats([
{
graderResults: { agisdk_state_diff: { pass: true, score: 1 } },
metrics: { durationMs: 1000, steps: 4, toolCalls: 3, toolErrors: 0 }
},
{
graderResults: { agisdk_state_diff: { pass: false, score: 0 } },
metrics: { durationMs: 3000, steps: 8, toolCalls: 5, toolErrors: 2 }
}
]);
`,
) as () => unknown
return compute()
}
describe('R2 viewer artifact path compatibility', () => {
it('uses explicit manifest paths for new uploaded runs', async () => {
const resolvers = await loadViewerPathResolvers()
@@ -95,6 +125,15 @@ describe('R2 viewer artifact path compatibility', () => {
)
})
it('resolves manifest-level run report links', async () => {
const resolvers = await loadViewerPathResolvers()
expect(resolvers.reportUrl({ reportPath: 'report.html' })).toBe(
'runs/run-1/report.html',
)
expect(resolvers.reportUrl({})).toBe(null)
})
it('falls back to legacy inferred paths for old uploaded runs', async () => {
const resolvers = await loadViewerPathResolvers()
const task = { queryId: 'legacy-task' }
@@ -127,4 +166,17 @@ describe('R2 viewer artifact path compatibility', () => {
queryId: 'legacy-task',
})
})
it('computes run-level timing and tool metrics for the viewer', async () => {
expect(await runComputeStats()).toMatchObject({
total: 2,
passed: 1,
failed: 1,
avgDurationMs: 2000,
avgSteps: 6,
avgToolCalls: 4,
totalToolCalls: 8,
totalToolErrors: 2,
})
})
})

View File

@@ -0,0 +1,159 @@
import { describe, expect, it } from 'bun:test'
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import {
DEFAULT_REPORT_MAX_TURNS,
DEFAULT_REPORT_MODEL,
generateEvalReport,
runClaudeCodeReportAgent,
} from '../../scripts/generate-report'
async function writeRunFixture(): Promise<string> {
const runDir = await mkdtemp(join(tmpdir(), 'eval-report-script-'))
const taskDir = join(runDir, 'agisdk-networkin-10')
await mkdir(join(taskDir, 'screenshots'), { recursive: true })
await writeFile(
join(runDir, 'summary.json'),
JSON.stringify({
total: 1,
completed: 1,
passRate: 0,
avgDurationMs: 1234,
}),
)
await writeFile(
join(taskDir, 'metadata.json'),
JSON.stringify({
query_id: 'agisdk-networkin-10',
dataset: 'agisdk-real',
query: 'Send a follow-up message starting with "Following up on".',
termination_reason: 'completed',
total_duration_ms: 1234,
total_steps: 2,
screenshot_count: 1,
final_answer: 'No app action was taken.',
errors: [],
warnings: [],
agent_config: { type: 'single', model: 'kimi' },
grader_results: {
agisdk_state_diff: {
score: 0,
pass: false,
reasoning: 'Some criteria failed',
details: {
per_criterion: [
{ passed: true, detail: 'message starts correctly' },
{ passed: false, detail: 'message was not sent' },
],
},
},
},
}),
)
await writeFile(
join(taskDir, 'messages.jsonl'),
[
JSON.stringify({
type: 'tool-input-available',
timestamp: '2026-04-30T00:00:00.000Z',
toolCallId: 'call-1',
toolName: 'memory_search',
input: { q: 'chat' },
}),
JSON.stringify({
type: 'tool-output-error',
timestamp: '2026-04-30T00:00:01.000Z',
toolCallId: 'call-1',
errorText: 'memory unavailable',
}),
].join('\n'),
)
await writeFile(join(taskDir, 'screenshots', '1.png'), 'png')
return runDir
}
describe('generate-report script', () => {
it('delegates report.html creation to Claude Code', async () => {
const runDir = await writeRunFixture()
const outputPath = join(runDir, 'report.html')
let prompt = ''
await generateEvalReport({
inputDir: runDir,
outputPath,
runAgent: async (invocation) => {
prompt = invocation.prompt
await writeFile(
invocation.outputPath,
'<!doctype html><h1>Claude-written report</h1>',
)
},
})
expect(await readFile(outputPath, 'utf-8')).toContain(
'Claude-written report',
)
expect(prompt).toContain('AGI SDK Random-10 Failure Report')
expect(prompt).toContain('summary.json')
expect(prompt).toContain('messages.jsonl')
expect(prompt).toContain('screenshots')
expect(prompt).toContain('Deterministic run metrics')
expect(prompt).toContain('"queryId": "agisdk-networkin-10"')
expect(prompt).toContain('"toolCalls": 1')
expect(prompt).toContain('"toolErrors": 1')
expect(prompt).toContain('Duration by task')
expect(prompt).toContain('Tool calls by task')
expect(prompt).toContain(outputPath)
})
it('fails when the Claude Code agent does not write the report', async () => {
const runDir = await writeRunFixture()
await expect(
generateEvalReport({
inputDir: runDir,
outputPath: join(runDir, 'missing-report.html'),
runAgent: async () => {},
}),
).rejects.toThrow('Report was not written')
})
it('runs Claude Code with Opus 4.6, full bypass, and bounded turns', async () => {
const runDir = await writeRunFixture()
const calls: unknown[] = []
await runClaudeCodeReportAgent(
{
inputDir: runDir,
outputPath: join(runDir, 'report.html'),
prompt: 'write the report',
},
{
query: async function* (call: unknown) {
calls.push(call)
yield { type: 'result', subtype: 'success', result: 'done' }
},
env: {
CLAUDE_CODE_OAUTH_TOKEN: 'token',
EVAL_R2_SECRET_ACCESS_KEY: 'secret',
HOME: '/tmp/home',
PATH: '/bin',
},
},
)
expect(calls).toHaveLength(1)
expect(calls[0]).toMatchObject({
prompt: 'write the report',
options: {
cwd: runDir,
model: DEFAULT_REPORT_MODEL,
maxTurns: DEFAULT_REPORT_MAX_TURNS,
permissionMode: 'bypassPermissions',
allowDangerouslySkipPermissions: true,
},
})
expect(JSON.stringify(calls[0])).not.toContain('secret')
})
})

View File

@@ -13,10 +13,10 @@ describe('adaptEvalConfigFile', () => {
expect(adapted.suite.id).toBe('browseros-agent-weekly')
expect(adapted.suite.dataset).toBe('../../data/agisdk-real.jsonl')
expect(adapted.suite.graders).toEqual(['agisdk_state_diff'])
- expect(adapted.suite.workers).toBe(10)
+ expect(adapted.suite.workers).toBe(3)
expect(adapted.suite.restartBrowserPerTask).toBe(true)
expect(adapted.suite.timeoutMs).toBe(1_800_000)
- expect(adapted.evalConfig.num_workers).toBe(10)
+ expect(adapted.evalConfig.num_workers).toBe(3)
expect(adapted.evalConfig.browseros.server_url).toBe(
'http://127.0.0.1:9110',
)
@@ -38,6 +38,34 @@ describe('adaptEvalConfigFile', () => {
)
})
it('adapts BrowserOS AGI SDK comparison configs', async () => {
const kimi = await adaptEvalConfigFile(
'apps/eval/configs/legacy/browseros-agent-kimi-k2-5-agisdk-real.json',
)
const opus = await adaptEvalConfigFile(
'apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json',
)
expect(kimi.suite.id).toBe('browseros-agent-kimi-k2-5-agisdk-real')
expect(kimi.evalConfig.agent).toMatchObject({
type: 'single',
provider: 'openai-compatible',
model: 'moonshotai/kimi-k2.5',
})
expect(kimi.evalConfig.num_workers).toBe(3)
expect(opus.suite.id).toBe('browseros-agent-opus-4-6-agisdk-real')
expect(opus.evalConfig.agent).toMatchObject({
type: 'single',
provider: 'bedrock',
model: 'global.anthropic.claude-opus-4-6-v1',
region: 'AWS_REGION',
accessKeyId: 'AWS_ACCESS_KEY_ID',
secretAccessKey: 'AWS_SECRET_ACCESS_KEY',
})
expect(opus.evalConfig.num_workers).toBe(2)
})
it('adapts claude-code configs without provider credentials', async () => {
const dir = await mkdtemp(join(tmpdir(), 'claude-code-config-'))
const configPath = join(dir, 'claude-code-agisdk.json')

View File

@@ -0,0 +1,38 @@
import { describe, expect, it } from 'bun:test'
import { resolveProviderConfig } from '../../src/utils/resolve-provider-config'
describe('resolveProviderConfig', () => {
it('resolves Bedrock region from environment variables', async () => {
const previous = {
AWS_REGION: process.env.AWS_REGION,
AWS_ACCESS_KEY_ID: process.env.AWS_ACCESS_KEY_ID,
AWS_SECRET_ACCESS_KEY: process.env.AWS_SECRET_ACCESS_KEY,
}
process.env.AWS_REGION = 'us-west-2'
process.env.AWS_ACCESS_KEY_ID = 'test-access-key'
process.env.AWS_SECRET_ACCESS_KEY = 'test-secret-key'
try {
const resolved = await resolveProviderConfig({
provider: 'bedrock',
model: 'global.anthropic.claude-opus-4-6-v1',
region: 'AWS_REGION',
accessKeyId: 'AWS_ACCESS_KEY_ID',
secretAccessKey: 'AWS_SECRET_ACCESS_KEY',
})
expect(resolved).toMatchObject({
provider: 'bedrock',
model: 'global.anthropic.claude-opus-4-6-v1',
region: process.env.AWS_REGION,
accessKeyId: process.env.AWS_ACCESS_KEY_ID,
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
})
} finally {
for (const [key, value] of Object.entries(previous)) {
if (value === undefined) delete process.env[key]
else process.env[key] = value
}
}
})
})

View File

@@ -9,6 +9,7 @@ describe('buildViewerManifest', () => {
suiteId: 'agisdk-daily-10',
variantId: 'kimi',
uploadedAt: '2026-04-29T06:00:00.000Z',
reportPath: 'report.html',
summary: { total: 1, passRate: 0 },
tasks: [
{
@@ -18,6 +19,13 @@ describe('buildViewerManifest', () => {
status: 'completed',
durationMs: 353_000,
screenshotCount: 42,
metrics: {
durationMs: 353_000,
steps: 47,
screenshots: 42,
toolCalls: 19,
toolErrors: 2,
},
graderResults: {
agisdk_state_diff: {
score: 0,
@@ -32,6 +40,7 @@ describe('buildViewerManifest', () => {
const publishManifest: R2RunManifest = manifest
expect(publishManifest.schemaVersion).toBe(2)
expect(manifest.reportPath).toBe('report.html')
expect(manifest.tasks[0].paths.messages).toBe(
'tasks/agisdk-dashdish-4/messages.jsonl',
)
@@ -41,6 +50,21 @@ describe('buildViewerManifest', () => {
expect(manifest.tasks[0].paths.graderArtifacts).toBe(
'tasks/agisdk-dashdish-4/grader-artifacts',
)
expect(manifest.metrics).toMatchObject({
taskCount: 1,
avgDurationMs: 353_000,
avgSteps: 47,
avgToolCalls: 19,
totalToolCalls: 19,
totalToolErrors: 2,
})
expect(manifest.tasks[0].metrics).toEqual({
durationMs: 353_000,
steps: 47,
screenshots: 42,
toolCalls: 19,
toolErrors: 2,
})
expect(manifest.tasks[0].graderResults.agisdk_state_diff.details).toEqual({
missing: ['checkout item'],
})

View File

@@ -23,11 +23,17 @@ interface CdpVersion {
const LOOPBACK_DISCOVERY_HOSTS = ['127.0.0.1', 'localhost', '[::1]'] as const
type LoopbackDiscoveryHost = (typeof LOOPBACK_DISCOVERY_HOSTS)[number]
interface CdpBackendConfig {
port: number
exitOnReconnectFailure?: boolean
}
// biome-ignore lint/correctness/noUnusedVariables: declaration merging adds ProtocolApi properties to the class
interface CdpBackend extends ProtocolApi {}
// biome-ignore lint/suspicious/noUnsafeDeclarationMerging: intentional — Object.assign fills these at runtime
class CdpBackend implements ICdpBackend {
private port: number
private exitOnReconnectFailure: boolean
private ws: WebSocket | null = null
private messageId = 0
private pending = new Map<number, PendingRequest>()
@@ -44,8 +50,9 @@ class CdpBackend implements ICdpBackend {
private keepaliveTimer: ReturnType<typeof setInterval> | null = null
private preferredDiscoveryHost: LoopbackDiscoveryHost | null = null
- constructor(config: { port: number }) {
+ constructor(config: CdpBackendConfig) {
this.port = config.port
this.exitOnReconnectFailure = config.exitOnReconnectFailure ?? true
const rawSend: RawSend = (method, params) => this.rawSend(method, params)
const rawOn: RawOn = (event, handler) => this.rawOn(event, handler)
@@ -293,7 +300,8 @@ class CdpBackend implements ICdpBackend {
private async reconnectLoop(): Promise<void> {
do {
this.reconnectRequested = false
- await this.reconnectWithRetries()
+ const reconnected = await this.reconnectWithRetries()
+ if (!reconnected) return
} while (
!this.disconnecting &&
(this.reconnectRequested || !this.connected)
@@ -309,12 +317,12 @@ class CdpBackend implements ICdpBackend {
this.pending.clear()
}
- private async reconnectWithRetries(): Promise<void> {
+ private async reconnectWithRetries(): Promise<boolean> {
const maxRetries = CDP_LIMITS.RECONNECT_MAX_RETRIES
const delay = TIMEOUTS.CDP_RECONNECT_DELAY
for (let attempt = 1; attempt <= maxRetries; attempt++) {
- if (this.disconnecting) return
+ if (this.disconnecting) return false
try {
logger.info(`CDP reconnection attempt ${attempt}/${maxRetries}...`)
@@ -322,7 +330,7 @@ class CdpBackend implements ICdpBackend {
await this.attemptConnect()
this.startKeepalive()
logger.info('CDP reconnected successfully')
- return
+ return true
} catch (error) {
const msg = error instanceof Error ? error.message : String(error)
logger.warn(
@@ -331,10 +339,14 @@ class CdpBackend implements ICdpBackend {
}
}
- logger.error(
- `CDP reconnection failed after ${maxRetries} attempts, exiting for restart`,
- )
- process.exit(EXIT_CODES.GENERAL_ERROR)
+ if (this.exitOnReconnectFailure) {
+ logger.error(
+ `CDP reconnection failed after ${maxRetries} attempts, exiting for restart`,
+ )
+ process.exit(EXIT_CODES.GENERAL_ERROR)
+ }
+ logger.error(`CDP reconnection failed after ${maxRetries} attempts`)
+ return false
}
async disconnect(): Promise<void> {
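With `exitOnReconnectFailure: false`, exhausted reconnect retries no longer call `process.exit`, so an eval worker can detect the dead connection itself and fail only its own task. A hedged sketch of that caller-side handling (import path assumed):

```ts
import { CdpBackend } from './cdp-backend' // import path assumed

const cdp = new CdpBackend({ port: 9010, exitOnReconnectFailure: false })
await cdp.connect()
try {
  // ... drive the browser for a single eval task ...
  if (!cdp.isConnected()) {
    // The reconnect loop gave up; fail this task, not the whole suite.
    throw new Error('CDP connection lost during task')
  }
} finally {
  await cdp.disconnect()
}
```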

View File

@@ -51,13 +51,17 @@ describe('CdpBackend', () => {
const originalReconnectDelay = TIMEOUTS.CDP_RECONNECT_DELAY
let fetchUrls: string[] = []
let failIpv4Discovery = false
let failAllDiscovery = false
let wsHost = '127.0.0.1'
let originalExit: typeof process.exit
beforeEach(() => {
MockWebSocket.instances = []
fetchUrls = []
failIpv4Discovery = false
failAllDiscovery = false
wsHost = '127.0.0.1'
originalExit = process.exit
;(TIMEOUTS as unknown as { CDP_CONNECT: number }).CDP_CONNECT = 200
;(
@@ -67,6 +71,9 @@ describe('CdpBackend', () => {
globalThis.fetch = (async (input: string | URL | Request) => {
const url = String(input)
fetchUrls.push(url)
if (failAllDiscovery) {
throw new Error('Unable to connect')
}
if (failIpv4Discovery && url.includes('127.0.0.1')) {
throw new Error('Unable to connect')
}
@@ -87,6 +94,7 @@ describe('CdpBackend', () => {
afterEach(() => {
globalThis.fetch = originalFetch
globalThis.WebSocket = originalWebSocket
process.exit = originalExit
;(TIMEOUTS as unknown as { CDP_CONNECT: number }).CDP_CONNECT =
originalConnectTimeout
;(
@@ -160,4 +168,31 @@ describe('CdpBackend', () => {
assert(fetchUrls.length >= 3)
await cdp.disconnect()
})
it('can disable process exit when reconnect retries are exhausted', async () => {
let exitCalled = false
process.exit = (() => {
exitCalled = true
throw new Error('process.exit should not be called')
}) as unknown as typeof process.exit
const cdp = new CdpBackend({ port: 9222, exitOnReconnectFailure: false })
const connectPromise = cdp.connect()
await waitFor(() => MockWebSocket.instances.length === 1)
const ws1 = MockWebSocket.instances[0]
ws1?.open()
await connectPromise
assert.strictEqual(cdp.isConnected(), true)
failAllDiscovery = true
ws1?.close()
await waitFor(() => fetchUrls.length >= 10)
await Bun.sleep(5)
assert.strictEqual(exitCalled, false)
assert.strictEqual(cdp.isConnected(), false)
await cdp.disconnect()
})
})