mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-14 16:14:28 +00:00
Compare commits
4 Commits
dev
...
eval-repor
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1b44de2ee8 | ||
|
|
10822ff8de | ||
|
|
b7892515d7 | ||
|
|
2a57dd6ab8 |
53
.github/workflows/eval-weekly.yml
vendored
53
.github/workflows/eval-weekly.yml
vendored
@@ -44,6 +44,19 @@ jobs:
|
||||
working-directory: packages/browseros-agent
|
||||
run: bun install --ignore-scripts
|
||||
|
||||
- name: Install Claude Code CLI
|
||||
working-directory: packages/browseros-agent/apps/eval
|
||||
env:
|
||||
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
|
||||
run: |
|
||||
if bun -e "const config = await Bun.file(process.env.EVAL_CONFIG).json(); process.exit(config.agent?.type === 'claude-code' ? 0 : 1)"; then
|
||||
npm install -g @anthropic-ai/claude-code@2.1.119
|
||||
echo "Claude Code CLI installed at $(command -v claude)"
|
||||
claude --version
|
||||
else
|
||||
echo "Eval config does not use Claude Code; skipping Claude Code CLI install"
|
||||
fi
|
||||
|
||||
- name: Install Python eval dependencies
|
||||
# agisdk pinned so silent upstream releases can't shift task definitions
|
||||
# or grader behavior. Bump intentionally with a documented re-baseline.
|
||||
@@ -67,13 +80,11 @@ jobs:
|
||||
env:
|
||||
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
|
||||
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
|
||||
AWS_REGION: ${{ secrets.AWS_REGION || 'us-west-2' }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
|
||||
NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
|
||||
EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
|
||||
EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
|
||||
EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
|
||||
EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
|
||||
EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
|
||||
BROWSEROS_BINARY: /usr/bin/browseros
|
||||
WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
|
||||
# OpenClaw container runtime is macOS-only; opt the Linux runner
|
||||
@@ -82,7 +93,35 @@ jobs:
|
||||
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
|
||||
run: |
|
||||
echo "Running eval with config: $EVAL_CONFIG"
|
||||
xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts suite --config "$EVAL_CONFIG" --publish r2
|
||||
xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts suite --config "$EVAL_CONFIG"
|
||||
# Capture the run directory so report.html can be generated before the R2 publish step.
|
||||
SUMMARY_PATH="$(find results -name summary.json -type f -print | sort | tail -n 1)"
|
||||
if [ -z "$SUMMARY_PATH" ]; then
|
||||
echo "No eval run summary found"
|
||||
exit 1
|
||||
fi
|
||||
RUN_DIR="$(dirname "$SUMMARY_PATH")"
|
||||
echo "EVAL_RUN_DIR=$RUN_DIR" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Generate run analysis report
|
||||
if: success()
|
||||
working-directory: packages/browseros-agent/apps/eval
|
||||
env:
|
||||
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
|
||||
run: |
|
||||
echo "Generating run report for $EVAL_RUN_DIR"
|
||||
bun scripts/generate-report.ts --input "$EVAL_RUN_DIR" --output "$EVAL_RUN_DIR/report.html"
|
||||
|
||||
- name: Publish eval run to R2
|
||||
if: success()
|
||||
working-directory: packages/browseros-agent/apps/eval
|
||||
env:
|
||||
EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
|
||||
EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
|
||||
EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
|
||||
EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
|
||||
EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
|
||||
run: bun run src/index.ts publish --run "$EVAL_RUN_DIR" --target r2
|
||||
|
||||
- name: Generate trend report
|
||||
if: success()
|
||||
@@ -97,7 +136,7 @@ jobs:
|
||||
EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
|
||||
run: bun apps/eval/scripts/weekly-report.ts /tmp/eval-report.html
|
||||
|
||||
- name: Upload report as artifact
|
||||
- name: Upload trend report as artifact
|
||||
if: success()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
|
||||
26
packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-kimi-k2-5-agisdk-real.json
vendored
Normal file
26
packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-kimi-k2-5-agisdk-real.json
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "single",
|
||||
"provider": "openai-compatible",
|
||||
"model": "moonshotai/kimi-k2.5",
|
||||
"apiKey": "OPENROUTER_API_KEY",
|
||||
"baseUrl": "https://openrouter.ai/api/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../../data/agisdk-real.jsonl",
|
||||
"num_workers": 3,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["agisdk_state_diff"],
|
||||
"timeout_ms": 1800000
|
||||
}
|
||||
27
packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json
vendored
Normal file
27
packages/browseros-agent/apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "single",
|
||||
"provider": "bedrock",
|
||||
"model": "global.anthropic.claude-opus-4-6-v1",
|
||||
"region": "AWS_REGION",
|
||||
"accessKeyId": "AWS_ACCESS_KEY_ID",
|
||||
"secretAccessKey": "AWS_SECRET_ACCESS_KEY",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../../data/agisdk-real.jsonl",
|
||||
"num_workers": 2,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["agisdk_state_diff"],
|
||||
"timeout_ms": 1800000
|
||||
}
|
||||
@@ -8,7 +8,7 @@
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../../data/agisdk-real.jsonl",
|
||||
"num_workers": 10,
|
||||
"num_workers": 3,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "claude-code",
|
||||
"model": "opus"
|
||||
"model": "opus",
|
||||
"extraArgs": ["--permission-mode", "bypassPermissions"]
|
||||
},
|
||||
"dataset": "../../data/agisdk-real.jsonl",
|
||||
"num_workers": 1,
|
||||
|
||||
191
packages/browseros-agent/apps/eval/scripts/generate-report.ts
vendored
Normal file
191
packages/browseros-agent/apps/eval/scripts/generate-report.ts
vendored
Normal file
@@ -0,0 +1,191 @@
|
||||
#!/usr/bin/env bun
|
||||
|
||||
import { mkdir, stat } from 'node:fs/promises'
|
||||
import { dirname, resolve } from 'node:path'
|
||||
import { query as claudeQuery } from '@anthropic-ai/claude-agent-sdk'
|
||||
import { readRunMetricSummary } from '../src/reporting/task-metrics'
|
||||
|
||||
/** Model identifier passed to the report agent query. */
export const DEFAULT_REPORT_MODEL = 'claude-opus-4-6'

/** Value passed as `maxTurns` to the report agent query. */
export const DEFAULT_REPORT_MAX_TURNS = 300

/** Environment map; a missing variable is represented as `undefined`. */
type Env = Record<string, string | undefined>

/** Minimal shape of the agent SDK `query` function as used by this script. */
type ClaudeQuery = (input: unknown) => AsyncIterable<Record<string, unknown>>

/** Everything a report agent needs to produce one report. */
export interface ReportAgentInvocation {
  /** Absolute run directory; used as the agent's cwd. */
  inputDir: string
  /** Absolute path of the HTML file the agent must write. */
  outputPath: string
  /** Full prompt text handed to the agent. */
  prompt: string
}

/** Options accepted by `generateEvalReport`. */
export interface GenerateEvalReportOptions {
  /** Run directory to analyze (resolved to an absolute path before use). */
  inputDir: string
  /** Destination of the HTML report (resolved to an absolute path before use). */
  outputPath: string
  /** Optional replacement for the default Claude Code agent runner. */
  runAgent?: (invocation: ReportAgentInvocation) => Promise<void>
}

/** Injectable dependencies of `runClaudeCodeReportAgent`. */
interface ClaudeReportAgentDeps {
  /** Replacement for the SDK `query` function. */
  query?: ClaudeQuery
  /** Environment to filter for the agent; defaults to `process.env`. */
  env?: Env
}
|
||||
|
||||
/** Returns the one-line command-line usage text for this script. */
function usage(): string {
  const usageLine =
    'Usage: bun scripts/generate-report.ts --input <run-dir> --output <report.html>'
  return usageLine
}
|
||||
|
||||
/**
 * Extracts the run directory and report path from CLI arguments.
 *
 * Recognized flags: `--input` / `--run` (run directory), `--output` / `--out`
 * (report path) and `--help` / `-h` (prints usage and exits with status 0).
 * Any other argument is ignored. When a flag is repeated, the last value wins.
 *
 * @param argv Arguments after the script name.
 * @returns The raw (unresolved) input directory and output path.
 * @throws Error carrying the usage text when either value is missing or empty.
 */
function parseArgs(
  argv: string[],
): Pick<GenerateEvalReportOptions, 'inputDir' | 'outputPath'> {
  let inputDir = ''
  let outputPath = ''
  let cursor = 0

  while (cursor < argv.length) {
    switch (argv[cursor]) {
      case '--input':
      case '--run':
        // The value is the next argument; a trailing flag yields ''.
        cursor += 1
        inputDir = argv[cursor] ?? ''
        break
      case '--output':
      case '--out':
        cursor += 1
        outputPath = argv[cursor] ?? ''
        break
      case '--help':
      case '-h':
        console.log(usage())
        process.exit(0)
    }
    cursor += 1
  }

  if (inputDir === '' || outputPath === '') {
    throw new Error(usage())
  }
  return { inputDir, outputPath }
}
|
||||
|
||||
/**
 * Builds the environment handed to the report agent from an allow-list.
 *
 * Only the listed variables are copied from `env`; anything else (for example
 * other API keys) is dropped. Variables that are absent in `env` are still
 * present in the result with the value `undefined`.
 *
 * @param env Source environment.
 * @returns A new object with the allow-listed variables plus `CLAUDECODE`.
 */
function claudeCodeEnv(env: Env): Env {
  const forwardedNames = [
    'CLAUDE_CODE_OAUTH_TOKEN',
    'ANTHROPIC_API_KEY',
    'HOME',
    'PATH',
    'SHELL',
    'TMPDIR',
    'TMP',
    'TEMP',
    'USER',
  ]

  const agentEnv: Env = {}
  for (const name of forwardedNames) {
    agentEnv[name] = env[name]
  }
  // Always blank, whatever the caller's value is.
  // NOTE(review): presumably keeps the agent from detecting an enclosing
  // Claude Code session — confirm against the SDK.
  agentEnv.CLAUDECODE = ''
  return agentEnv
}
|
||||
|
||||
/**
 * Assembles the prompt given to the report agent.
 *
 * The prompt names the run directory and output file, lists the artifacts to
 * inspect, describes the expected report layout and analysis focus, and embeds
 * the run metrics returned by `readRunMetricSummary` as pretty-printed JSON.
 *
 * @param inputDir Run directory to analyze.
 * @param outputPath File the agent is told to write.
 * @returns The complete prompt text.
 */
async function buildReportPrompt(
  inputDir: string,
  outputPath: string,
): Promise<string> {
  const runMetrics = await readRunMetricSummary(inputDir)
  const metricsJson = JSON.stringify(runMetrics, null, 2)

  return `Analyze this BrowserOS eval run and write a shareable HTML report.

Run directory: ${inputDir}
Output file to write: ${outputPath}

You are running with the run directory as cwd. Inspect the local artifacts:
- summary.json for run totals and pass rate
- each task directory's metadata.json for query, final answer, timing, screenshots, and grader results
- each task directory's messages.jsonl for tool calls, tool errors, and recent trajectory
- screenshots/ for visual evidence
- grader-artifacts/ when present for grader-specific context

Write the final report directly to the output file path above. Do not print the
report instead of writing it. Do not modify any input artifacts. The only file
you should create or overwrite is the requested report.html.

The report should follow the style and density of the Shadowfax AGI SDK report:
- Title like "AGI SDK Random-10 Failure Report" or a run-specific equivalent
- Run directory and note that screenshots are embedded as data URIs
- Summary cards for total tasks, passed, failed, pass rate, average duration, average steps, and average tool calls
- A Metrics section with compact charts for Duration by task, Steps by task, Tool calls by task, and Tool errors by task
- Task Summary table with task id, status, score, duration, steps, and prompt
- Include tool calls and tool errors in the Task Summary table
- Failure sections with stable anchors using each task id, for example <section id="agisdk-networkin-10">
- For each failed task: Diagnosis, Evidence, Next Check, final screenshot, AGI SDK / grader criteria, final answer, and recent trajectory events
- Make failure links in the summary table point to the task anchors
- Keep the HTML self-contained: inline CSS and embedded final screenshots as data:image/png;base64 URIs
- Escape user/model text correctly so task outputs cannot break the page

Analysis guidance:
- Focus on why the model failed: task understanding, browser/tool usage, missing verification, tool errors, max-step/timeout, bad final answer, or grader ambiguity
- Use messages.jsonl strategically. Do not paste huge DOM outputs into the report. Summarize only the relevant recent trajectory and evidence.
- Limit trajectory analysis to the most relevant 200-300 events/calls across the run. Prefer failed tasks and the final/key actions for each failure.
- If a grader criterion is boolean-only or ambiguous, say so and identify what additional artifact would make it debuggable.

Deterministic run metrics computed from metadata.json and messages.jsonl:
\`\`\`json
${metricsJson}
\`\`\`

After writing the file, verify that ${outputPath} exists and is non-empty.`
}
|
||||
|
||||
/**
 * Verifies that `inputDir` exists and is a directory.
 *
 * Any `stat` failure (missing path, permission error, ...) is treated the same
 * as "not a directory".
 *
 * @param inputDir Path expected to be a run directory.
 * @throws Error `Not a run directory: <path>` when the check fails.
 */
async function assertRunDir(inputDir: string): Promise<void> {
  let isDirectory = false
  try {
    isDirectory = (await stat(inputDir)).isDirectory()
  } catch {
    isDirectory = false
  }

  if (!isDirectory) {
    throw new Error(`Not a run directory: ${inputDir}`)
  }
}
|
||||
|
||||
/**
 * Verifies that the report exists as a regular, non-empty file.
 *
 * A `stat` failure is treated the same as a missing report.
 *
 * @param outputPath Path where the report is expected.
 * @throws Error `Report was not written: <path>` when the path is missing,
 *   is not a regular file, or has size 0.
 */
async function assertReportWritten(outputPath: string): Promise<void> {
  const info = await stat(outputPath).then(
    (found) => found,
    () => null,
  )
  const written = info !== null && info.isFile() && info.size !== 0

  if (!written) {
    throw new Error(`Report was not written: ${outputPath}`)
  }
}
|
||||
|
||||
/**
 * Runs the report agent through the Claude agent SDK and waits for it to end.
 *
 * The agent runs with the run directory as cwd, permission prompts bypassed,
 * and an allow-listed environment. The whole message stream is drained; the
 * subtype of the last `result` message decides the outcome.
 *
 * @param invocation Run directory, output path and prompt for the agent.
 * @param deps Optional replacements for the SDK `query` function and the
 *   source environment (defaults: the SDK `query` and `process.env`).
 * @throws Error when the last `result` message has a non-empty subtype other
 *   than `success`. A stream with no `result` message does not throw here.
 */
export async function runClaudeCodeReportAgent(
  invocation: ReportAgentInvocation,
  deps: ClaudeReportAgentDeps = {},
): Promise<void> {
  const runQuery = deps.query ?? (claudeQuery as unknown as ClaudeQuery)

  const stream = runQuery({
    prompt: invocation.prompt,
    options: {
      cwd: invocation.inputDir,
      model: DEFAULT_REPORT_MODEL,
      systemPrompt:
        'You are an eval failure analyst. Produce a concise, evidence-backed, self-contained HTML report from local run artifacts.',
      permissionMode: 'bypassPermissions',
      allowDangerouslySkipPermissions: true,
      maxTurns: DEFAULT_REPORT_MAX_TURNS,
      env: claudeCodeEnv(deps.env ?? process.env),
    },
  })

  let finalSubtype: string | undefined
  for await (const event of stream) {
    if (event.type !== 'result') continue
    // A non-string subtype resets the recorded outcome.
    finalSubtype = typeof event.subtype === 'string' ? event.subtype : undefined
  }

  if (finalSubtype && finalSubtype !== 'success') {
    throw new Error(`Claude Code report agent failed: ${finalSubtype}`)
  }
}
|
||||
|
||||
/**
 * Generates the HTML analysis report for one eval run.
 *
 * Steps: resolve both paths to absolute form, check that the run directory
 * exists, create the output's parent directory, build the prompt, run the
 * agent, then check that a non-empty report file exists.
 *
 * @param options Run directory, output path and an optional agent runner
 *   (defaults to `runClaudeCodeReportAgent`).
 * @throws Error when the run directory is invalid, the agent fails, or the
 *   report file is missing or empty afterwards.
 */
export async function generateEvalReport(
  options: GenerateEvalReportOptions,
): Promise<void> {
  const inputDir = resolve(options.inputDir)
  const outputPath = resolve(options.outputPath)

  await assertRunDir(inputDir)
  await mkdir(dirname(outputPath), { recursive: true })

  const prompt = await buildReportPrompt(inputDir, outputPath)
  const runAgent = options.runAgent ?? runClaudeCodeReportAgent
  await runAgent({ inputDir, outputPath, prompt })

  await assertReportWritten(outputPath)
}
|
||||
|
||||
// CLI entry point: runs only when this file is executed directly.
if (import.meta.main) {
  try {
    const cliOptions = parseArgs(Bun.argv.slice(2))
    await generateEvalReport(cliOptions)
  } catch (failure) {
    // Print only the message (no stack trace) and exit with a failing status.
    const reason = failure instanceof Error ? failure.message : String(failure)
    console.error(reason)
    process.exit(1)
  }
}
|
||||
@@ -134,7 +134,10 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
|
||||
|
||||
// Connect to Chrome via CDP — same per-worker offset used by app-manager.
|
||||
const cdpPort = config.browseros.base_cdp_port + workerIndex
|
||||
const cdp = new CdpBackend({ port: cdpPort })
|
||||
const cdp = new CdpBackend({
|
||||
port: cdpPort,
|
||||
exitOnReconnectFailure: false,
|
||||
})
|
||||
await cdp.connect()
|
||||
const browser = new Browser(cdp)
|
||||
capture.screenshot.setBrowser(browser)
|
||||
|
||||
@@ -43,7 +43,10 @@ export class SingleAgentEvaluator implements AgentEvaluator {
|
||||
|
||||
// Connect to Chrome via CDP — same per-worker offset used by app-manager.
|
||||
const cdpPort = config.browseros.base_cdp_port + workerIndex
|
||||
const cdp = new CdpBackend({ port: cdpPort })
|
||||
const cdp = new CdpBackend({
|
||||
port: cdpPort,
|
||||
exitOnReconnectFailure: false,
|
||||
})
|
||||
await cdp.connect()
|
||||
|
||||
const browser = new Browser(cdp)
|
||||
|
||||
@@ -536,6 +536,12 @@ export interface DashboardConfig {
|
||||
configMode?: boolean
|
||||
}
|
||||
|
||||
/**
 * Decides whether the dashboard URL should be opened in a browser.
 *
 * @param env Environment to inspect; defaults to `process.env`.
 * @returns `false` only when `CI` is exactly the string `'true'`.
 */
export function shouldAutoOpenDashboard(
  env: Record<string, string | undefined> = process.env,
): boolean {
  const runningInCi = env.CI === 'true'
  return !runningInCi
}
|
||||
|
||||
export function startDashboard(config: DashboardConfig) {
|
||||
const port = config.port ?? 9900
|
||||
dashboardConfigMode = config.configMode ?? false
|
||||
@@ -558,10 +564,12 @@ export function startDashboard(config: DashboardConfig) {
|
||||
console.log(` Dashboard: ${url}`)
|
||||
|
||||
// Auto-open browser
|
||||
try {
|
||||
Bun.spawn(['open', url], { stdout: 'ignore', stderr: 'ignore' })
|
||||
} catch {
|
||||
/* ignore if open command fails */
|
||||
if (shouldAutoOpenDashboard()) {
|
||||
try {
|
||||
Bun.spawn(['open', url], { stdout: 'ignore', stderr: 'ignore' })
|
||||
} catch {
|
||||
/* ignore if open command fails */
|
||||
}
|
||||
}
|
||||
|
||||
return { url, port }
|
||||
|
||||
@@ -61,6 +61,17 @@
|
||||
.header-stats .stat-pass { color: #3fb950; }
|
||||
.header-stats .stat-fail { color: #f85149; }
|
||||
.header-stats .stat-score { color: #f0883e; }
|
||||
.header-report {
|
||||
color: #58a6ff;
|
||||
text-decoration: none;
|
||||
font-size: 12px;
|
||||
font-weight: 600;
|
||||
border: 1px solid #30363d;
|
||||
border-radius: 6px;
|
||||
padding: 5px 9px;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.header-report:hover { border-color: #58a6ff; background: #1c2333; }
|
||||
|
||||
/* ── 3-column layout ─────────────────────────────────────────── */
|
||||
.layout {
|
||||
@@ -84,6 +95,7 @@
|
||||
background: #161b22;
|
||||
border-bottom: 1px solid #30363d;
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 12px;
|
||||
font-size: 11px;
|
||||
font-weight: 600;
|
||||
@@ -93,6 +105,80 @@
|
||||
}
|
||||
.sidebar-stats .s-pass { color: #3fb950; }
|
||||
.sidebar-stats .s-fail { color: #f85149; }
|
||||
.sidebar-metrics {
|
||||
padding: 12px 16px;
|
||||
background: #0d1117;
|
||||
border-bottom: 1px solid #21262d;
|
||||
}
|
||||
.metric-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(3, minmax(0, 1fr));
|
||||
gap: 8px;
|
||||
margin-bottom: 12px;
|
||||
}
|
||||
.metric-cell {
|
||||
min-width: 0;
|
||||
}
|
||||
.metric-label {
|
||||
display: block;
|
||||
font-size: 9px;
|
||||
font-weight: 600;
|
||||
color: #6e7681;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.04em;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.metric-value {
|
||||
display: block;
|
||||
font-size: 13px;
|
||||
font-weight: 700;
|
||||
color: #e6edf3;
|
||||
margin-top: 2px;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
}
|
||||
.mini-chart {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 6px;
|
||||
}
|
||||
.mini-chart-title {
|
||||
font-size: 10px;
|
||||
font-weight: 700;
|
||||
color: #8b949e;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.04em;
|
||||
}
|
||||
.mini-bar-row {
|
||||
display: grid;
|
||||
grid-template-columns: minmax(60px, 1fr) 70px 28px;
|
||||
gap: 8px;
|
||||
align-items: center;
|
||||
font-size: 10px;
|
||||
color: #8b949e;
|
||||
}
|
||||
.mini-bar-name {
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
white-space: nowrap;
|
||||
font-family: 'SF Mono', SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace;
|
||||
}
|
||||
.mini-bar-track {
|
||||
height: 6px;
|
||||
background: #21262d;
|
||||
border-radius: 999px;
|
||||
overflow: hidden;
|
||||
}
|
||||
.mini-bar-fill {
|
||||
height: 100%;
|
||||
background: #58a6ff;
|
||||
border-radius: 999px;
|
||||
}
|
||||
.mini-bar-value {
|
||||
color: #e6edf3;
|
||||
font-variant-numeric: tabular-nums;
|
||||
text-align: right;
|
||||
}
|
||||
.sidebar-filter {
|
||||
padding: 8px 12px;
|
||||
border-bottom: 1px solid #21262d;
|
||||
@@ -526,6 +612,7 @@
|
||||
<div class="header-sep"></div>
|
||||
<span class="header-run" id="header-run"></span>
|
||||
<span class="header-date" id="header-date"></span>
|
||||
<a class="header-report" id="header-report" target="_blank" rel="noopener" style="display: none;">Run Report</a>
|
||||
<div class="header-stats" id="header-stats"></div>
|
||||
</div>
|
||||
|
||||
@@ -533,6 +620,7 @@
|
||||
<!-- Left sidebar -->
|
||||
<div class="sidebar" id="sidebar">
|
||||
<div class="sidebar-stats" id="sidebar-stats"></div>
|
||||
<div class="sidebar-metrics" id="sidebar-metrics"></div>
|
||||
<div class="sidebar-filter">
|
||||
<input type="text" id="filter-input" placeholder="Search tasks..." autocomplete="off" spellcheck="false" />
|
||||
</div>
|
||||
@@ -627,7 +715,23 @@
|
||||
if (stats.avgScore !== null) {
|
||||
parts.push(`<span class="stat-score">avg ${stats.avgScore}%</span>`);
|
||||
}
|
||||
if (stats.avgDurationMs !== null) {
|
||||
parts.push(`<span>${fmtDuration(stats.avgDurationMs)} avg</span>`);
|
||||
}
|
||||
if (stats.avgToolCalls !== null) {
|
||||
parts.push(`<span>${fmtCompact(stats.avgToolCalls)} tools/task</span>`);
|
||||
}
|
||||
el.innerHTML = parts.join('');
|
||||
|
||||
const reportLink = document.getElementById('header-report');
|
||||
const url = reportUrl(manifest);
|
||||
if (url) {
|
||||
reportLink.href = url;
|
||||
reportLink.style.display = '';
|
||||
} else {
|
||||
reportLink.removeAttribute('href');
|
||||
reportLink.style.display = 'none';
|
||||
}
|
||||
}
|
||||
|
||||
// ── Sidebar rendering ─────────────────────────────────────────
|
||||
@@ -639,11 +743,49 @@
|
||||
statsEl.innerHTML =
|
||||
'<span>' + stats.total + ' total</span>' +
|
||||
'<span class="s-pass">' + stats.passed + ' pass</span>' +
|
||||
'<span class="s-fail">' + stats.failed + ' fail</span>';
|
||||
'<span class="s-fail">' + stats.failed + ' fail</span>' +
|
||||
(stats.avgSteps !== null ? '<span>' + fmtCompact(stats.avgSteps) + ' steps/task</span>' : '') +
|
||||
(stats.avgToolCalls !== null ? '<span>' + fmtCompact(stats.avgToolCalls) + ' tools/task</span>' : '');
|
||||
|
||||
renderSidebarMetrics(tasks, stats);
|
||||
|
||||
renderTaskList('');
|
||||
}
|
||||
|
||||
/**
 * Renders the sidebar metrics panel: three average cells (time, steps, tool
 * calls) and a mini bar chart of the five tasks with the most tool calls.
 *
 * Does nothing when the `sidebar-metrics` element is absent.
 *
 * @param tasks Tasks to rank for the chart.
 * @param stats Aggregates providing `avgDurationMs`, `avgSteps` and
 *   `avgToolCalls` (each may be null).
 */
function renderSidebarMetrics(tasks, stats) {
  const el = document.getElementById('sidebar-metrics');
  if (!el) return;

  // Read each task's tool-call count once instead of on every comparison.
  const ranked = tasks
    .map((task) => ({ task: task, calls: taskMetrics(task).toolCalls }))
    .sort((a, b) => b.calls - a.calls)
    .slice(0, 5);
  // Floor of 1 avoids dividing by zero when computing bar widths.
  const maxCalls = Math.max(1, ...ranked.map((entry) => entry.calls));
  // With no recorded call anywhere, show the fallback message rather than a
  // column of "0" bars.
  const hasCalls = ranked.some((entry) => entry.calls > 0);

  const bars = hasCalls
    ? ranked.map((entry) => {
        const label = entry.task.queryId || entry.task.id || 'Untitled';
        // Minimum width of 4% keeps very small bars visible.
        const width = Math.max(4, Math.round((entry.calls / maxCalls) * 100));
        return (
          '<div class="mini-bar-row">' +
            '<span class="mini-bar-name" title="' + escAttr(label) + '">' + esc(label) + '</span>' +
            '<span class="mini-bar-track"><span class="mini-bar-fill" style="width: ' + width + '%"></span></span>' +
            '<span class="mini-bar-value">' + fmtCompact(entry.calls) + '</span>' +
          '</div>'
        );
      }).join('')
    : '';

  el.innerHTML =
    '<div class="metric-grid">' +
      '<div class="metric-cell"><span class="metric-label">Avg Time</span><span class="metric-value">' + (stats.avgDurationMs !== null ? fmtDuration(stats.avgDurationMs) : '-') + '</span></div>' +
      '<div class="metric-cell"><span class="metric-label">Avg Steps</span><span class="metric-value">' + (stats.avgSteps !== null ? fmtCompact(stats.avgSteps) : '-') + '</span></div>' +
      '<div class="metric-cell"><span class="metric-label">Avg Tools</span><span class="metric-value">' + (stats.avgToolCalls !== null ? fmtCompact(stats.avgToolCalls) : '-') + '</span></div>' +
    '</div>' +
    '<div class="mini-chart">' +
      '<div class="mini-chart-title">Tool Calls by Task</div>' +
      (bars || '<div class="task-meta-line"><span>No tool calls recorded</span></div>') +
    '</div>';
}
|
||||
|
||||
function renderTaskList(filter) {
|
||||
const list = document.getElementById('task-list');
|
||||
list.innerHTML = '';
|
||||
@@ -668,8 +810,11 @@
|
||||
}
|
||||
|
||||
const metaParts = [];
|
||||
if (task.durationMs) metaParts.push(fmtDuration(task.durationMs));
|
||||
if (task.screenshotCount) metaParts.push(`${task.screenshotCount} steps`);
|
||||
const metrics = taskMetrics(task);
|
||||
if (metrics.durationMs) metaParts.push(fmtDuration(metrics.durationMs));
|
||||
if (metrics.steps) metaParts.push(`${fmtCompact(metrics.steps)} steps`);
|
||||
if (metrics.toolCalls) metaParts.push(`${fmtCompact(metrics.toolCalls)} tools`);
|
||||
if (metrics.toolErrors) metaParts.push(`${fmtCompact(metrics.toolErrors)} errors`);
|
||||
|
||||
item.innerHTML =
|
||||
'<div class="task-row">' +
|
||||
@@ -714,7 +859,7 @@
|
||||
}
|
||||
|
||||
function artifactPath(task, artifact) {
|
||||
const manifestPath = task.paths && task.paths[artifact];
|
||||
const manifestPath = task.paths?.[artifact];
|
||||
if (typeof manifestPath === 'string' && manifestPath.length > 0) {
|
||||
return manifestPath.replace(/^\/+/, '');
|
||||
}
|
||||
@@ -725,6 +870,17 @@
|
||||
return `${basePath}/${artifactPath(task, artifact)}`;
|
||||
}
|
||||
|
||||
/**
 * Builds the URL of a run-level artifact under `basePath`.
 *
 * @param path Artifact path relative to the run; leading slashes are removed.
 * @returns The URL, or null when `path` is not a non-empty string.
 */
function runArtifactUrl(path) {
  const usable = typeof path === 'string' && path.length > 0;
  return usable ? `${basePath}/${path.replace(/^\/+/, '')}` : null;
}
|
||||
|
||||
/**
 * Returns the URL of the run report, optionally anchored to one task.
 *
 * @param manifest Viewer manifest; its `reportPath` locates the report.
 * @param task Optional task; when given, the URL gets a `#<id>` fragment built
 *   from `task.queryId`, else `task.id`, else an empty string (URI-encoded).
 * @returns The URL, or null when the manifest has no usable report path.
 */
function reportUrl(manifest, task) {
  const base = runArtifactUrl(manifest?.reportPath);
  if (base && task) {
    const anchor = encodeURIComponent(task.queryId || task.id || '');
    return `${base}#${anchor}`;
  }
  return base;
}
|
||||
|
||||
function metadataUrl(task) {
|
||||
return artifactUrl(task, 'metadata');
|
||||
}
|
||||
@@ -905,10 +1061,38 @@
|
||||
}
|
||||
|
||||
// Duration
|
||||
if (task.durationMs) {
|
||||
const metrics = taskMetrics(task);
|
||||
if (metrics.durationMs) {
|
||||
html += '<div class="db-section">';
|
||||
html += '<span class="db-label">Duration</span>';
|
||||
html += `<span class="db-value">${fmtDuration(task.durationMs)}</span>`;
|
||||
html += `<span class="db-value">${fmtDuration(metrics.durationMs)}</span>`;
|
||||
html += '</div>';
|
||||
}
|
||||
|
||||
if (metrics.steps) {
|
||||
html += '<div class="db-section">';
|
||||
html += '<span class="db-label">Steps</span>';
|
||||
html += `<span class="db-value">${fmtCompact(metrics.steps)}</span>`;
|
||||
html += '</div>';
|
||||
}
|
||||
|
||||
html += '<div class="db-section">';
|
||||
html += '<span class="db-label">Tool Calls</span>';
|
||||
html += `<span class="db-value">${fmtCompact(metrics.toolCalls)}</span>`;
|
||||
html += '</div>';
|
||||
|
||||
if (metrics.toolErrors) {
|
||||
html += '<div class="db-section">';
|
||||
html += '<span class="db-label">Tool Errors</span>';
|
||||
html += `<span class="db-value">${fmtCompact(metrics.toolErrors)}</span>`;
|
||||
html += '</div>';
|
||||
}
|
||||
|
||||
const reportLink = reportUrl(manifest, task);
|
||||
if (reportLink) {
|
||||
html += '<div class="db-section">';
|
||||
html += '<span class="db-label">Report</span>';
|
||||
html += `<span class="db-value"><a href="${escAttr(reportLink)}" target="_blank" rel="noopener">Open task analysis</a></span>`;
|
||||
html += '</div>';
|
||||
}
|
||||
|
||||
@@ -1234,8 +1418,25 @@
|
||||
function computeStats(tasks) {
|
||||
const total = tasks.length;
|
||||
let passed = 0, failed = 0, totalScore = 0, scoredCount = 0;
|
||||
let totalDurationMs = 0, durationCount = 0;
|
||||
let totalSteps = 0, stepsCount = 0;
|
||||
let totalToolCalls = 0, toolCount = 0;
|
||||
let totalToolErrors = 0;
|
||||
|
||||
tasks.forEach((t) => {
|
||||
const metrics = taskMetrics(t);
|
||||
if (metrics.durationMs > 0) {
|
||||
totalDurationMs += metrics.durationMs;
|
||||
durationCount++;
|
||||
}
|
||||
if (metrics.steps > 0) {
|
||||
totalSteps += metrics.steps;
|
||||
stepsCount++;
|
||||
}
|
||||
totalToolCalls += metrics.toolCalls;
|
||||
totalToolErrors += metrics.toolErrors;
|
||||
toolCount++;
|
||||
|
||||
const graders = t.graderResults || {};
|
||||
const keys = Object.keys(graders);
|
||||
if (keys.length > 0) {
|
||||
@@ -1254,7 +1455,34 @@
|
||||
total: total,
|
||||
passed: passed,
|
||||
failed: failed,
|
||||
avgScore: scoredCount > 0 ? Math.round((totalScore / scoredCount) * 100) : null
|
||||
avgScore: scoredCount > 0 ? Math.round((totalScore / scoredCount) * 100) : null,
|
||||
avgDurationMs: durationCount > 0 ? totalDurationMs / durationCount : null,
|
||||
avgSteps: stepsCount > 0 ? totalSteps / stepsCount : null,
|
||||
avgToolCalls: toolCount > 0 ? totalToolCalls / toolCount : null,
|
||||
totalToolCalls: totalToolCalls,
|
||||
totalToolErrors: totalToolErrors
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Normalizes one manifest task into numeric metrics.
 *
 * Values come from `task.metrics` when present; duration and screenshots fall
 * back to the legacy `task.durationMs` / `task.screenshotCount` fields, steps
 * fall back to the screenshot count, and tool counters fall back to 0.
 *
 * Only finite numbers and non-blank numeric strings count as "present".
 * `null`, `''` and booleans convert to a finite number under `Number()`, so
 * they are rejected explicitly; otherwise a null metric would report 0 and
 * hide the legacy fallback.
 *
 * @param task Manifest task entry.
 * @returns `{ durationMs, steps, screenshots, toolCalls, toolErrors }`, all
 *   finite numbers.
 */
function taskMetrics(task) {
  const metrics = task.metrics || {};

  // Returns `value` as a finite number, or `fallback` when it is not one.
  const pick = (value, fallback) => {
    const isNumeric =
      typeof value === 'number' ||
      (typeof value === 'string' && value.trim() !== '');
    if (!isNumeric) return fallback;
    const num = Number(value);
    return Number.isFinite(num) ? num : fallback;
  };

  const screenshots = pick(metrics.screenshots, pick(task.screenshotCount, 0));
  return {
    durationMs: pick(metrics.durationMs, pick(task.durationMs, 0)),
    steps: pick(metrics.steps, screenshots),
    screenshots: screenshots,
    toolCalls: pick(metrics.toolCalls, 0),
    toolErrors: pick(metrics.toolErrors, 0)
  };
}
|
||||
|
||||
@@ -1310,6 +1538,13 @@
|
||||
return `${h}h ${remM}m`;
|
||||
}
|
||||
|
||||
/**
 * Formats a number for compact display.
 *
 * @param value Anything convertible with `Number()`.
 * @returns `'0'` for non-finite input, the plain integer for whole numbers,
 *   otherwise the value with one decimal place.
 */
function fmtCompact(value) {
  const parsed = Number(value);
  if (!Number.isFinite(parsed)) return '0';
  return Number.isInteger(parsed) ? String(parsed) : parsed.toFixed(1);
}
|
||||
|
||||
function showFatalError(msgHtml) {
|
||||
document.getElementById('center-panel').innerHTML =
|
||||
'<div class="placeholder error">' +
|
||||
|
||||
@@ -5,6 +5,7 @@ import {
|
||||
PutObjectCommand,
|
||||
S3Client,
|
||||
} from '@aws-sdk/client-s3'
|
||||
import { readTaskMetrics } from '../reporting/task-metrics'
|
||||
import {
|
||||
buildViewerManifest,
|
||||
type ViewerManifestTaskInput,
|
||||
@@ -315,6 +316,7 @@ export class R2Publisher {
|
||||
graderResults:
|
||||
(meta.grader_results as ViewerManifestTaskInput['graderResults']) ||
|
||||
{},
|
||||
metrics: await readTaskMetrics(taskPath, meta, screenshotCount),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -379,10 +381,12 @@ export class R2Publisher {
|
||||
await readFile(join(runDir, 'summary.json'), 'utf-8'),
|
||||
) as Record<string, unknown>
|
||||
} catch {}
|
||||
const reportStat = await stat(join(runDir, 'report.html')).catch(() => null)
|
||||
|
||||
return buildViewerManifest({
|
||||
runId,
|
||||
uploadedAt: this.now().toISOString(),
|
||||
reportPath: reportStat?.isFile() ? 'report.html' : undefined,
|
||||
agentConfig,
|
||||
dataset,
|
||||
summary: summaryData
|
||||
|
||||
188
packages/browseros-agent/apps/eval/src/reporting/task-metrics.ts
vendored
Normal file
188
packages/browseros-agent/apps/eval/src/reporting/task-metrics.ts
vendored
Normal file
@@ -0,0 +1,188 @@
|
||||
import { readdir, readFile, stat } from 'node:fs/promises'
|
||||
import { join } from 'node:path'
|
||||
|
||||
export interface EvalTaskMetrics {
|
||||
durationMs: number
|
||||
steps: number
|
||||
screenshots: number
|
||||
toolCalls: number
|
||||
toolErrors: number
|
||||
}
|
||||
|
||||
export interface EvalRunMetrics {
|
||||
taskCount: number
|
||||
totalDurationMs: number
|
||||
avgDurationMs: number
|
||||
totalSteps: number
|
||||
avgSteps: number
|
||||
totalToolCalls: number
|
||||
avgToolCalls: number
|
||||
totalToolErrors: number
|
||||
avgToolErrors: number
|
||||
}
|
||||
|
||||
/** One task's row in a run metric summary (see `readRunMetricSummary`). */
export interface EvalTaskMetricSummary {
  /** The metadata `query_id`, or the task directory name when it is absent. */
  queryId: string
  /** `'timeout'`, `'failed'` or `'completed'`, derived from task metadata. */
  status: string
  /** Score of the first grader result, when that score is a number. */
  score?: number
  /** Pass flag of the first grader result, when that flag is a boolean. */
  pass?: boolean
  /** Deterministic measurements for the task. */
  metrics: EvalTaskMetrics
}
|
||||
|
||||
/** Run-level aggregate together with the per-task rows it was built from. */
export interface EvalRunMetricSummary {
  /** Totals and averages across all entries in `tasks`. */
  run: EvalRunMetrics
  /** One entry per task directory whose `metadata.json` could be read. */
  tasks: EvalTaskMetricSummary[]
}
|
||||
|
||||
/** A task directory discovered inside a run directory by `readTaskDirs`. */
interface TaskDirEntry {
  /** Directory name, used as the task id. */
  taskId: string
  /** Path to the task directory (the scanned base directory joined with the name). */
  taskPath: string
}
|
||||
|
||||
/**
 * Coerces an untyped metadata field to a usable number.
 *
 * @param value - Arbitrary value read from parsed JSON.
 * @returns `value` itself when it is a finite number; otherwise 0 (this
 *   covers non-numbers as well as `NaN` and the infinities).
 */
function numberValue(value: unknown): number {
  if (typeof value !== 'number') return 0
  return Number.isFinite(value) ? value : 0
}
|
||||
|
||||
/**
 * Tallies tool activity in the contents of a `messages.jsonl` log.
 *
 * Each non-blank line is parsed as JSON and classified by its `type` field:
 * `tool-input-available` counts as a tool call and `tool-output-error` as a
 * tool error. Any other type is ignored.
 *
 * @param messagesJsonl - Raw newline-delimited JSON text.
 * @returns The number of tool calls and tool errors found.
 */
export function countMessageMetrics(messagesJsonl: string): {
  toolCalls: number
  toolErrors: number
} {
  const totals = { toolCalls: 0, toolErrors: 0 }

  const records = messagesJsonl
    .split('\n')
    .map((raw) => raw.trim())
    .filter((raw) => raw.length > 0)

  for (const record of records) {
    let kind: unknown
    try {
      kind = (JSON.parse(record) as { type?: unknown }).type
    } catch {
      // Ignore malformed telemetry lines; the raw artifact is still uploaded.
      continue
    }

    switch (kind) {
      case 'tool-input-available':
        totals.toolCalls += 1
        break
      case 'tool-output-error':
        totals.toolErrors += 1
        break
    }
  }

  return totals
}
|
||||
|
||||
/**
 * Combines task metadata with message-log counts into one metrics record.
 *
 * @param metadata - Parsed `metadata.json`; reads `screenshot_count`,
 *   `total_duration_ms` and `total_steps`.
 * @param messageMetrics - Tool call and error counts for the task.
 * @param screenshotCount - Used when the metadata has no non-zero
 *   `screenshot_count`.
 * @returns The task metrics. `steps` falls back to the screenshot count when
 *   `total_steps` is missing or zero.
 */
export function buildTaskMetrics(
  metadata: Record<string, unknown>,
  messageMetrics: { toolCalls: number; toolErrors: number },
  screenshotCount = 0,
): EvalTaskMetrics {
  const recordedScreenshots = numberValue(metadata.screenshot_count)
  const screenshots = recordedScreenshots ? recordedScreenshots : screenshotCount

  const durationMs = numberValue(metadata.total_duration_ms)
  const recordedSteps = numberValue(metadata.total_steps)
  const steps = recordedSteps ? recordedSteps : screenshots

  const { toolCalls, toolErrors } = messageMetrics
  return { durationMs, steps, screenshots, toolCalls, toolErrors }
}
|
||||
|
||||
/**
 * Aggregates per-task metrics into run-level totals and averages.
 *
 * @param metrics - One metrics record per task.
 * @returns Totals plus per-task means; every mean is 0 for an empty list.
 */
export function buildRunMetrics(metrics: EvalTaskMetrics[]): EvalRunMetrics {
  let totalDurationMs = 0
  let totalSteps = 0
  let totalToolCalls = 0
  let totalToolErrors = 0

  // Accumulate all four sums in one pass over the tasks.
  metrics.forEach((task) => {
    totalDurationMs += task.durationMs
    totalSteps += task.steps
    totalToolCalls += task.toolCalls
    totalToolErrors += task.toolErrors
  })

  const taskCount = metrics.length
  const perTask = (total: number): number =>
    taskCount > 0 ? total / taskCount : 0

  return {
    taskCount,
    totalDurationMs,
    avgDurationMs: perTask(totalDurationMs),
    totalSteps,
    avgSteps: perTask(totalSteps),
    totalToolCalls,
    avgToolCalls: perTask(totalToolCalls),
    totalToolErrors,
    avgToolErrors: perTask(totalToolErrors),
  }
}
|
||||
|
||||
/**
 * Reads a task's `messages.jsonl` and builds its metrics record.
 *
 * @param taskPath - Directory containing the task artifacts.
 * @param metadata - Parsed `metadata.json` for the task.
 * @param screenshotCount - Fallback screenshot count passed through to
 *   `buildTaskMetrics`.
 * @returns The task metrics; tool counts are zero when the message log
 *   cannot be read.
 */
export async function readTaskMetrics(
  taskPath: string,
  metadata: Record<string, unknown>,
  screenshotCount = 0,
): Promise<EvalTaskMetrics> {
  const logPath = join(taskPath, 'messages.jsonl')

  let messageMetrics = { toolCalls: 0, toolErrors: 0 }
  try {
    messageMetrics = countMessageMetrics(await readFile(logPath, 'utf-8'))
  } catch {
    // Best effort: a missing or unreadable log leaves the counts at zero.
  }

  return buildTaskMetrics(metadata, messageMetrics, screenshotCount)
}
|
||||
|
||||
/**
 * Derives a task status label from its metadata.
 *
 * @param metadata - Parsed `metadata.json`; reads `termination_reason` and
 *   `errors`.
 * @returns `'timeout'` when the termination reason is `'timeout'`,
 *   `'failed'` when `errors` is a non-empty array, otherwise `'completed'`.
 */
function statusFromMetadata(metadata: Record<string, unknown>): string {
  // A timeout takes precedence over any recorded errors.
  if (metadata.termination_reason === 'timeout') return 'timeout'

  const { errors } = metadata
  const hasErrors = Array.isArray(errors) && errors.length > 0
  return hasErrors ? 'failed' : 'completed'
}
|
||||
|
||||
/**
 * Extracts the score and pass flag of the first grader in the metadata.
 *
 * @param metadata - Parsed `metadata.json`; reads `grader_results`.
 * @returns An object holding `score` only when the first grader's score is
 *   a number, and `pass` only when its pass flag is a boolean. The object is
 *   empty when there are no grader results.
 */
function primaryGrade(metadata: Record<string, unknown>): {
  score?: number
  pass?: boolean
} {
  const grade: { score?: number; pass?: boolean } = {}

  const graders = metadata.grader_results as
    | Record<string, { score?: unknown; pass?: unknown }>
    | undefined
  if (!graders) return grade

  // "Primary" means the first grader in the object's own key order.
  const first = Object.values(graders)[0]
  if (typeof first?.score === 'number') grade.score = first.score
  if (typeof first?.pass === 'boolean') grade.pass = first.pass
  return grade
}
|
||||
|
||||
/**
 * Lists the task directories of a run.
 *
 * Scans `<runDir>/tasks` when that path is a directory, otherwise `runDir`
 * itself. Non-directories and directories named `screenshots` or `tasks`
 * are skipped. A directory that cannot be listed yields an empty result.
 *
 * @param runDir - Root directory of the eval run.
 * @returns One entry per task directory, in `readdir` order.
 */
async function readTaskDirs(runDir: string): Promise<TaskDirEntry[]> {
  const nestedTasksDir = join(runDir, 'tasks')
  const nestedInfo = await stat(nestedTasksDir).catch(() => null)

  let baseDir = runDir
  if (nestedInfo?.isDirectory()) baseDir = nestedTasksDir

  const listing = await readdir(baseDir, { withFileTypes: true }).catch(
    () => [],
  )

  const found: TaskDirEntry[] = []
  for (const item of listing) {
    if (!item.isDirectory()) continue
    if (item.name === 'screenshots' || item.name === 'tasks') continue
    found.push({ taskId: item.name, taskPath: join(baseDir, item.name) })
  }
  return found
}
|
||||
|
||||
/**
 * Builds the deterministic metric summary for a run directory.
 *
 * Each task directory found by `readTaskDirs` contributes one row, provided
 * its `metadata.json` can be read and parses to a JSON object. Tasks whose
 * metadata is missing, malformed, or not an object (for example `null`, an
 * array, or a bare number) are skipped instead of being reported as empty
 * "completed" tasks.
 *
 * @param runDir - Root directory of the eval run.
 * @returns Per-task rows plus the run-level aggregate computed from them.
 */
export async function readRunMetricSummary(
  runDir: string,
): Promise<EvalRunMetricSummary> {
  const tasks: EvalTaskMetricSummary[] = []

  for (const entry of await readTaskDirs(runDir)) {
    let parsed: unknown = null
    try {
      parsed = JSON.parse(
        await readFile(join(entry.taskPath, 'metadata.json'), 'utf-8'),
      )
    } catch {
      // Missing or malformed metadata: leave `parsed` null so the task is skipped.
    }

    // Validate the shape before treating the parsed value as a record.
    if (
      typeof parsed !== 'object' ||
      parsed === null ||
      Array.isArray(parsed)
    ) {
      continue
    }
    const metadata = parsed as Record<string, unknown>

    // Only a non-empty string query_id is accepted; anything else falls back
    // to the directory name so `queryId` is always a string.
    const queryId =
      typeof metadata.query_id === 'string' && metadata.query_id
        ? metadata.query_id
        : entry.taskId

    const metrics = await readTaskMetrics(entry.taskPath, metadata)
    tasks.push({
      queryId,
      status: statusFromMetadata(metadata),
      ...primaryGrade(metadata),
      metrics,
    })
  }

  return {
    run: buildRunMetrics(tasks.map((task) => task.metrics)),
    tasks,
  }
}
|
||||
@@ -36,5 +36,6 @@ export async function resolveProviderConfig(
|
||||
accessKeyId: resolveEnvValue(agent.accessKeyId),
|
||||
secretAccessKey: resolveEnvValue(agent.secretAccessKey),
|
||||
sessionToken: resolveEnvValue(agent.sessionToken),
|
||||
region: resolveEnvValue(agent.region),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,3 +1,8 @@
|
||||
import {
|
||||
buildRunMetrics,
|
||||
type EvalRunMetrics,
|
||||
type EvalTaskMetrics,
|
||||
} from '../reporting/task-metrics'
|
||||
import type { GraderResult } from '../types'
|
||||
|
||||
export const VIEWER_MANIFEST_SCHEMA_VERSION = 2
|
||||
@@ -20,6 +25,7 @@ export interface ViewerManifestTaskInput {
|
||||
status: string
|
||||
durationMs: number
|
||||
screenshotCount: number
|
||||
metrics?: EvalTaskMetrics
|
||||
graderResults: Record<string, GraderResult>
|
||||
}
|
||||
|
||||
@@ -35,9 +41,11 @@ export interface ViewerManifest {
|
||||
suiteId?: string
|
||||
variantId?: string
|
||||
uploadedAt?: string
|
||||
reportPath?: string
|
||||
agentConfig?: Record<string, unknown>
|
||||
dataset?: string
|
||||
summary?: Record<string, unknown>
|
||||
metrics?: EvalRunMetrics
|
||||
tasks: ViewerManifestTask[]
|
||||
}
|
||||
|
||||
@@ -46,6 +54,7 @@ export interface BuildViewerManifestInput {
|
||||
suiteId?: string
|
||||
variantId?: string
|
||||
uploadedAt?: string
|
||||
reportPath?: string
|
||||
agentConfig?: Record<string, unknown>
|
||||
dataset?: string
|
||||
summary?: Record<string, unknown>
|
||||
@@ -68,22 +77,37 @@ function taskPaths(queryId: string): ViewerManifestTaskPaths {
|
||||
export function buildViewerManifest(
|
||||
input: BuildViewerManifestInput,
|
||||
): ViewerManifest {
|
||||
const tasks = input.tasks.map((task) => {
|
||||
const { artifactId, ...publicTask } = task
|
||||
const metrics =
|
||||
publicTask.metrics ??
|
||||
({
|
||||
durationMs: publicTask.durationMs,
|
||||
steps: publicTask.screenshotCount,
|
||||
screenshots: publicTask.screenshotCount,
|
||||
toolCalls: 0,
|
||||
toolErrors: 0,
|
||||
} satisfies EvalTaskMetrics)
|
||||
|
||||
return {
|
||||
...publicTask,
|
||||
metrics,
|
||||
startUrl: publicTask.startUrl ?? '',
|
||||
paths: taskPaths(artifactId ?? publicTask.queryId),
|
||||
}
|
||||
})
|
||||
|
||||
return {
|
||||
schemaVersion: VIEWER_MANIFEST_SCHEMA_VERSION,
|
||||
runId: input.runId,
|
||||
...(input.suiteId ? { suiteId: input.suiteId } : {}),
|
||||
...(input.variantId ? { variantId: input.variantId } : {}),
|
||||
...(input.uploadedAt ? { uploadedAt: input.uploadedAt } : {}),
|
||||
...(input.reportPath ? { reportPath: input.reportPath } : {}),
|
||||
...(input.agentConfig ? { agentConfig: input.agentConfig } : {}),
|
||||
...(input.dataset ? { dataset: input.dataset } : {}),
|
||||
...(input.summary ? { summary: input.summary } : {}),
|
||||
tasks: input.tasks.map((task) => {
|
||||
const { artifactId, ...publicTask } = task
|
||||
return {
|
||||
...publicTask,
|
||||
startUrl: publicTask.startUrl ?? '',
|
||||
paths: taskPaths(artifactId ?? publicTask.queryId),
|
||||
}
|
||||
}),
|
||||
metrics: buildRunMetrics(tasks.map((task) => task.metrics)),
|
||||
tasks,
|
||||
}
|
||||
}
|
||||
|
||||
12
packages/browseros-agent/apps/eval/tests/dashboard/server.test.ts
vendored
Normal file
12
packages/browseros-agent/apps/eval/tests/dashboard/server.test.ts
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { shouldAutoOpenDashboard } from '../../src/dashboard/server'
|
||||
|
||||
// Covers the environment check that decides whether the dashboard is
// opened automatically.
describe('dashboard server', () => {
  it('does not auto-open the dashboard in CI', () => {
    const shouldOpen = shouldAutoOpenDashboard({ CI: 'true' })

    expect(shouldOpen).toBe(false)
  })

  it('auto-opens the dashboard outside CI by default', () => {
    const shouldOpen = shouldAutoOpenDashboard({})

    expect(shouldOpen).toBe(true)
  })
})
|
||||
@@ -40,6 +40,7 @@ async function writeRunFixture(
|
||||
start_url: 'https://example.test',
|
||||
termination_reason: 'completed',
|
||||
total_duration_ms: 1200,
|
||||
total_steps: 4,
|
||||
screenshot_count: 1,
|
||||
agent_config: { type: 'single', model: 'kimi' },
|
||||
grader_results: {
|
||||
@@ -47,13 +48,22 @@ async function writeRunFixture(
|
||||
},
|
||||
}),
|
||||
)
|
||||
await writeFile(join(taskDir, 'messages.jsonl'), '{"type":"user"}\n')
|
||||
await writeFile(
|
||||
join(taskDir, 'messages.jsonl'),
|
||||
[
|
||||
'{"type":"user"}',
|
||||
'{"type":"tool-input-available","toolName":"click"}',
|
||||
'{"type":"tool-input-available","toolName":"take_snapshot"}',
|
||||
'{"type":"tool-output-error","toolName":"click"}',
|
||||
].join('\n'),
|
||||
)
|
||||
await writeFile(join(taskDir, 'grades.json'), '{"ok":true}')
|
||||
await writeFile(join(taskDir, 'screenshots', '1.png'), 'png')
|
||||
await writeFile(
|
||||
join(runDir, 'summary.json'),
|
||||
JSON.stringify({ passRate: 1, avgDurationMs: 1200 }),
|
||||
)
|
||||
await writeFile(join(runDir, 'report.html'), '<html>report</html>')
|
||||
return { runDir, runId: `${configName}-${timestamp}` }
|
||||
}
|
||||
|
||||
@@ -110,6 +120,9 @@ describe('R2Publisher', () => {
|
||||
expect(byKey.get(`runs/${runId}/summary.json`)?.ContentType).toBe(
|
||||
'application/json',
|
||||
)
|
||||
expect(byKey.get(`runs/${runId}/report.html`)?.ContentType).toBe(
|
||||
'text/html',
|
||||
)
|
||||
expect(byKey.get('viewer.html')?.ContentType).toBe('text/html')
|
||||
expect(result.viewerUrl).toBe(
|
||||
`https://eval.example.test/viewer.html?run=${runId}`,
|
||||
@@ -126,12 +139,28 @@ describe('R2Publisher', () => {
|
||||
uploadedAt: '2026-04-29T12:00:00.000Z',
|
||||
agentConfig: { type: 'single', model: 'kimi' },
|
||||
dataset: 'webbench',
|
||||
reportPath: 'report.html',
|
||||
summary: { passRate: 1, avgDurationMs: 1200 },
|
||||
metrics: {
|
||||
taskCount: 1,
|
||||
avgDurationMs: 1200,
|
||||
avgSteps: 4,
|
||||
avgToolCalls: 2,
|
||||
totalToolCalls: 2,
|
||||
totalToolErrors: 1,
|
||||
},
|
||||
tasks: [
|
||||
{
|
||||
queryId: 'task-1',
|
||||
status: 'completed',
|
||||
screenshotCount: 1,
|
||||
metrics: {
|
||||
durationMs: 1200,
|
||||
steps: 4,
|
||||
screenshots: 1,
|
||||
toolCalls: 2,
|
||||
toolErrors: 1,
|
||||
},
|
||||
paths: {
|
||||
attempt: 'tasks/task-1/attempt.json',
|
||||
metadata: 'tasks/task-1/metadata.json',
|
||||
|
||||
@@ -6,6 +6,7 @@ interface ViewerPathResolvers {
|
||||
artifactUrl(task: Record<string, unknown>, artifact: string): string
|
||||
metadataUrl(task: Record<string, unknown>): string
|
||||
messagesUrl(task: Record<string, unknown>): string
|
||||
reportUrl(manifest: Record<string, unknown>): string | null
|
||||
screenshotUrl(task: Record<string, unknown>, step: number): string
|
||||
}
|
||||
|
||||
@@ -24,7 +25,7 @@ async function loadViewerPathResolvers(): Promise<ViewerPathResolvers> {
|
||||
`
|
||||
const basePath = 'runs/run-1';
|
||||
${block}
|
||||
return { artifactUrl, metadataUrl, messagesUrl, screenshotUrl };
|
||||
return { artifactUrl, metadataUrl, messagesUrl, reportUrl, screenshotUrl };
|
||||
`,
|
||||
) as () => ViewerPathResolvers
|
||||
return createResolvers()
|
||||
@@ -60,6 +61,35 @@ async function runAutoSelectFromHash(hash: string): Promise<unknown> {
|
||||
return runAutoSelect()
|
||||
}
|
||||
|
||||
/**
 * Runs the viewer's `computeStats` against two fixed sample tasks.
 *
 * The source between `function computeStats(tasks)` and
 * `function resolveStatus(task)` is sliced out of `viewer.html` and
 * evaluated with `new Function`, so the test exercises the shipped viewer
 * code rather than a copy.
 *
 * @returns Whatever `computeStats` returns for the two sample tasks.
 */
async function runComputeStats(): Promise<unknown> {
  const viewerPath = join(
    import.meta.dir,
    '..',
    '..',
    'src',
    'dashboard',
    'viewer.html',
  )
  const viewerHtml = await readFile(viewerPath, 'utf-8')

  // Both markers must be present, in this order, for the slice to be valid.
  const from = viewerHtml.indexOf('function computeStats(tasks)')
  const until = viewerHtml.indexOf('function resolveStatus(task)', from)
  expect(from).toBeGreaterThan(-1)
  expect(until).toBeGreaterThan(from)

  const block = viewerHtml.slice(from, until)
  const compute = new Function(
    `
${block}
return computeStats([
{
graderResults: { agisdk_state_diff: { pass: true, score: 1 } },
metrics: { durationMs: 1000, steps: 4, toolCalls: 3, toolErrors: 0 }
},
{
graderResults: { agisdk_state_diff: { pass: false, score: 0 } },
metrics: { durationMs: 3000, steps: 8, toolCalls: 5, toolErrors: 2 }
}
]);
`,
  ) as () => unknown
  return compute()
}
|
||||
|
||||
describe('R2 viewer artifact path compatibility', () => {
|
||||
it('uses explicit manifest paths for new uploaded runs', async () => {
|
||||
const resolvers = await loadViewerPathResolvers()
|
||||
@@ -95,6 +125,15 @@ describe('R2 viewer artifact path compatibility', () => {
|
||||
)
|
||||
})
|
||||
|
||||
it('resolves manifest-level run report links', async () => {
|
||||
const resolvers = await loadViewerPathResolvers()
|
||||
|
||||
expect(resolvers.reportUrl({ reportPath: 'report.html' })).toBe(
|
||||
'runs/run-1/report.html',
|
||||
)
|
||||
expect(resolvers.reportUrl({})).toBe(null)
|
||||
})
|
||||
|
||||
it('falls back to legacy inferred paths for old uploaded runs', async () => {
|
||||
const resolvers = await loadViewerPathResolvers()
|
||||
const task = { queryId: 'legacy-task' }
|
||||
@@ -127,4 +166,17 @@ describe('R2 viewer artifact path compatibility', () => {
|
||||
queryId: 'legacy-task',
|
||||
})
|
||||
})
|
||||
|
||||
it('computes run-level timing and tool metrics for the viewer', async () => {
|
||||
expect(await runComputeStats()).toMatchObject({
|
||||
total: 2,
|
||||
passed: 1,
|
||||
failed: 1,
|
||||
avgDurationMs: 2000,
|
||||
avgSteps: 6,
|
||||
avgToolCalls: 4,
|
||||
totalToolCalls: 8,
|
||||
totalToolErrors: 2,
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
159
packages/browseros-agent/apps/eval/tests/reporting/generate-report-script.test.ts
vendored
Normal file
159
packages/browseros-agent/apps/eval/tests/reporting/generate-report-script.test.ts
vendored
Normal file
@@ -0,0 +1,159 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises'
|
||||
import { tmpdir } from 'node:os'
|
||||
import { join } from 'node:path'
|
||||
import {
|
||||
DEFAULT_REPORT_MAX_TURNS,
|
||||
DEFAULT_REPORT_MODEL,
|
||||
generateEvalReport,
|
||||
runClaudeCodeReportAgent,
|
||||
} from '../../scripts/generate-report'
|
||||
|
||||
/**
 * Creates a temporary run directory holding one failed AGI SDK task.
 *
 * Written files: `summary.json`, the task's `metadata.json`, a two-event
 * `messages.jsonl` (one tool call followed by one tool error), and one
 * placeholder screenshot.
 *
 * @returns Path of the created run directory.
 */
async function writeRunFixture(): Promise<string> {
  const runDir = await mkdtemp(join(tmpdir(), 'eval-report-script-'))
  const taskDir = join(runDir, 'agisdk-networkin-10')
  await mkdir(join(taskDir, 'screenshots'), { recursive: true })

  const summary = {
    total: 1,
    completed: 1,
    passRate: 0,
    avgDurationMs: 1234,
  }
  await writeFile(join(runDir, 'summary.json'), JSON.stringify(summary))

  const metadata = {
    query_id: 'agisdk-networkin-10',
    dataset: 'agisdk-real',
    query: 'Send a follow-up message starting with "Following up on".',
    termination_reason: 'completed',
    total_duration_ms: 1234,
    total_steps: 2,
    screenshot_count: 1,
    final_answer: 'No app action was taken.',
    errors: [],
    warnings: [],
    agent_config: { type: 'single', model: 'kimi' },
    grader_results: {
      agisdk_state_diff: {
        score: 0,
        pass: false,
        reasoning: 'Some criteria failed',
        details: {
          per_criterion: [
            { passed: true, detail: 'message starts correctly' },
            { passed: false, detail: 'message was not sent' },
          ],
        },
      },
    },
  }
  await writeFile(join(taskDir, 'metadata.json'), JSON.stringify(metadata))

  // One tool call and its matching error, one JSON document per line.
  const events = [
    {
      type: 'tool-input-available',
      timestamp: '2026-04-30T00:00:00.000Z',
      toolCallId: 'call-1',
      toolName: 'memory_search',
      input: { q: 'chat' },
    },
    {
      type: 'tool-output-error',
      timestamp: '2026-04-30T00:00:01.000Z',
      toolCallId: 'call-1',
      errorText: 'memory unavailable',
    },
  ]
  await writeFile(
    join(taskDir, 'messages.jsonl'),
    events.map((event) => JSON.stringify(event)).join('\n'),
  )

  await writeFile(join(taskDir, 'screenshots', '1.png'), 'png')
  return runDir
}
|
||||
|
||||
// Exercises the report generator with a stubbed agent, so no real Claude
// Code process is started.
describe('generate-report script', () => {
  it('delegates report.html creation to Claude Code', async () => {
    const runDir = await writeRunFixture()
    const outputPath = join(runDir, 'report.html')
    let prompt = ''

    await generateEvalReport({
      inputDir: runDir,
      outputPath,
      // Stub agent: capture the prompt and write a recognisable report.
      runAgent: async (invocation) => {
        prompt = invocation.prompt
        await writeFile(
          invocation.outputPath,
          '<!doctype html><h1>Claude-written report</h1>',
        )
      },
    })

    const reportHtml = await readFile(outputPath, 'utf-8')
    expect(reportHtml).toContain('Claude-written report')

    // The prompt must name the artifacts, embed the deterministic metrics
    // and tell the agent where to write the report.
    const expectedFragments = [
      'AGI SDK Random-10 Failure Report',
      'summary.json',
      'messages.jsonl',
      'screenshots',
      'Deterministic run metrics',
      '"queryId": "agisdk-networkin-10"',
      '"toolCalls": 1',
      '"toolErrors": 1',
      'Duration by task',
      'Tool calls by task',
      outputPath,
    ]
    for (const fragment of expectedFragments) {
      expect(prompt).toContain(fragment)
    }
  })

  it('fails when the Claude Code agent does not write the report', async () => {
    const runDir = await writeRunFixture()
    const pending = generateEvalReport({
      inputDir: runDir,
      outputPath: join(runDir, 'missing-report.html'),
      runAgent: async () => {},
    })

    await expect(pending).rejects.toThrow('Report was not written')
  })

  it('runs Claude Code with Opus 4.6, full bypass, and bounded turns', async () => {
    const runDir = await writeRunFixture()
    const calls: unknown[] = []

    const invocation = {
      inputDir: runDir,
      outputPath: join(runDir, 'report.html'),
      prompt: 'write the report',
    }
    const env = {
      CLAUDE_CODE_OAUTH_TOKEN: 'token',
      EVAL_R2_SECRET_ACCESS_KEY: 'secret',
      HOME: '/tmp/home',
      PATH: '/bin',
    }

    await runClaudeCodeReportAgent(invocation, {
      // Fake SDK query: record the call and report immediate success.
      query: async function* (call: unknown) {
        calls.push(call)
        yield { type: 'result', subtype: 'success', result: 'done' }
      },
      env,
    })

    expect(calls).toHaveLength(1)
    const [firstCall] = calls
    expect(firstCall).toMatchObject({
      prompt: 'write the report',
      options: {
        cwd: runDir,
        model: DEFAULT_REPORT_MODEL,
        maxTurns: DEFAULT_REPORT_MAX_TURNS,
        permissionMode: 'bypassPermissions',
        allowDangerouslySkipPermissions: true,
      },
    })
    // The R2 secret value must not appear anywhere in the recorded call.
    expect(JSON.stringify(firstCall)).not.toContain('secret')
  })
})
|
||||
@@ -13,10 +13,10 @@ describe('adaptEvalConfigFile', () => {
|
||||
expect(adapted.suite.id).toBe('browseros-agent-weekly')
|
||||
expect(adapted.suite.dataset).toBe('../../data/agisdk-real.jsonl')
|
||||
expect(adapted.suite.graders).toEqual(['agisdk_state_diff'])
|
||||
expect(adapted.suite.workers).toBe(10)
|
||||
expect(adapted.suite.workers).toBe(3)
|
||||
expect(adapted.suite.restartBrowserPerTask).toBe(true)
|
||||
expect(adapted.suite.timeoutMs).toBe(1_800_000)
|
||||
expect(adapted.evalConfig.num_workers).toBe(10)
|
||||
expect(adapted.evalConfig.num_workers).toBe(3)
|
||||
expect(adapted.evalConfig.browseros.server_url).toBe(
|
||||
'http://127.0.0.1:9110',
|
||||
)
|
||||
@@ -38,6 +38,34 @@ describe('adaptEvalConfigFile', () => {
|
||||
)
|
||||
})
|
||||
|
||||
it('adapts BrowserOS AGI SDK comparison configs', async () => {
|
||||
const kimi = await adaptEvalConfigFile(
|
||||
'apps/eval/configs/legacy/browseros-agent-kimi-k2-5-agisdk-real.json',
|
||||
)
|
||||
const opus = await adaptEvalConfigFile(
|
||||
'apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json',
|
||||
)
|
||||
|
||||
expect(kimi.suite.id).toBe('browseros-agent-kimi-k2-5-agisdk-real')
|
||||
expect(kimi.evalConfig.agent).toMatchObject({
|
||||
type: 'single',
|
||||
provider: 'openai-compatible',
|
||||
model: 'moonshotai/kimi-k2.5',
|
||||
})
|
||||
expect(kimi.evalConfig.num_workers).toBe(3)
|
||||
|
||||
expect(opus.suite.id).toBe('browseros-agent-opus-4-6-agisdk-real')
|
||||
expect(opus.evalConfig.agent).toMatchObject({
|
||||
type: 'single',
|
||||
provider: 'bedrock',
|
||||
model: 'global.anthropic.claude-opus-4-6-v1',
|
||||
region: 'AWS_REGION',
|
||||
accessKeyId: 'AWS_ACCESS_KEY_ID',
|
||||
secretAccessKey: 'AWS_SECRET_ACCESS_KEY',
|
||||
})
|
||||
expect(opus.evalConfig.num_workers).toBe(2)
|
||||
})
|
||||
|
||||
it('adapts claude-code configs without provider credentials', async () => {
|
||||
const dir = await mkdtemp(join(tmpdir(), 'claude-code-config-'))
|
||||
const configPath = join(dir, 'claude-code-agisdk.json')
|
||||
|
||||
38
packages/browseros-agent/apps/eval/tests/utils/resolve-provider-config.test.ts
vendored
Normal file
38
packages/browseros-agent/apps/eval/tests/utils/resolve-provider-config.test.ts
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { resolveProviderConfig } from '../../src/utils/resolve-provider-config'
|
||||
|
||||
// Checks that Bedrock settings given as environment variable names are
// resolved to the values of those variables.
describe('resolveProviderConfig', () => {
  it('resolves Bedrock region from environment variables', async () => {
    const envKeys = [
      'AWS_REGION',
      'AWS_ACCESS_KEY_ID',
      'AWS_SECRET_ACCESS_KEY',
    ] as const

    // Snapshot the current values so the test can restore them afterwards.
    const previous: Record<string, string | undefined> = {}
    for (const key of envKeys) previous[key] = process.env[key]

    process.env.AWS_REGION = 'us-west-2'
    process.env.AWS_ACCESS_KEY_ID = 'test-access-key'
    process.env.AWS_SECRET_ACCESS_KEY = 'test-secret-key'

    try {
      const resolved = await resolveProviderConfig({
        provider: 'bedrock',
        model: 'global.anthropic.claude-opus-4-6-v1',
        region: 'AWS_REGION',
        accessKeyId: 'AWS_ACCESS_KEY_ID',
        secretAccessKey: 'AWS_SECRET_ACCESS_KEY',
      })

      expect(resolved).toMatchObject({
        provider: 'bedrock',
        model: 'global.anthropic.claude-opus-4-6-v1',
        region: process.env.AWS_REGION,
        accessKeyId: process.env.AWS_ACCESS_KEY_ID,
        secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
      })
    } finally {
      // Restore the environment exactly: unset what was unset before.
      for (const [key, value] of Object.entries(previous)) {
        if (value === undefined) {
          delete process.env[key]
        } else {
          process.env[key] = value
        }
      }
    }
  })
})
|
||||
@@ -9,6 +9,7 @@ describe('buildViewerManifest', () => {
|
||||
suiteId: 'agisdk-daily-10',
|
||||
variantId: 'kimi',
|
||||
uploadedAt: '2026-04-29T06:00:00.000Z',
|
||||
reportPath: 'report.html',
|
||||
summary: { total: 1, passRate: 0 },
|
||||
tasks: [
|
||||
{
|
||||
@@ -18,6 +19,13 @@ describe('buildViewerManifest', () => {
|
||||
status: 'completed',
|
||||
durationMs: 353_000,
|
||||
screenshotCount: 42,
|
||||
metrics: {
|
||||
durationMs: 353_000,
|
||||
steps: 47,
|
||||
screenshots: 42,
|
||||
toolCalls: 19,
|
||||
toolErrors: 2,
|
||||
},
|
||||
graderResults: {
|
||||
agisdk_state_diff: {
|
||||
score: 0,
|
||||
@@ -32,6 +40,7 @@ describe('buildViewerManifest', () => {
|
||||
|
||||
const publishManifest: R2RunManifest = manifest
|
||||
expect(publishManifest.schemaVersion).toBe(2)
|
||||
expect(manifest.reportPath).toBe('report.html')
|
||||
expect(manifest.tasks[0].paths.messages).toBe(
|
||||
'tasks/agisdk-dashdish-4/messages.jsonl',
|
||||
)
|
||||
@@ -41,6 +50,21 @@ describe('buildViewerManifest', () => {
|
||||
expect(manifest.tasks[0].paths.graderArtifacts).toBe(
|
||||
'tasks/agisdk-dashdish-4/grader-artifacts',
|
||||
)
|
||||
expect(manifest.metrics).toMatchObject({
|
||||
taskCount: 1,
|
||||
avgDurationMs: 353_000,
|
||||
avgSteps: 47,
|
||||
avgToolCalls: 19,
|
||||
totalToolCalls: 19,
|
||||
totalToolErrors: 2,
|
||||
})
|
||||
expect(manifest.tasks[0].metrics).toEqual({
|
||||
durationMs: 353_000,
|
||||
steps: 47,
|
||||
screenshots: 42,
|
||||
toolCalls: 19,
|
||||
toolErrors: 2,
|
||||
})
|
||||
expect(manifest.tasks[0].graderResults.agisdk_state_diff.details).toEqual({
|
||||
missing: ['checkout item'],
|
||||
})
|
||||
|
||||
@@ -23,11 +23,17 @@ interface CdpVersion {
|
||||
const LOOPBACK_DISCOVERY_HOSTS = ['127.0.0.1', 'localhost', '[::1]'] as const
|
||||
type LoopbackDiscoveryHost = (typeof LOOPBACK_DISCOVERY_HOSTS)[number]
|
||||
|
||||
/** Constructor options for `CdpBackend`. */
interface CdpBackendConfig {
  /** Port stored on the backend — presumably the CDP debugging port; confirm against the connect logic. */
  port: number
  /**
   * Whether the process exits once all reconnect retries have failed.
   * Treated as `true` when omitted; when `false` the failure is logged and
   * the reconnect loop stops without exiting.
   */
  exitOnReconnectFailure?: boolean
}
|
||||
|
||||
// biome-ignore lint/correctness/noUnusedVariables: declaration merging adds ProtocolApi properties to the class
|
||||
interface CdpBackend extends ProtocolApi {}
|
||||
// biome-ignore lint/suspicious/noUnsafeDeclarationMerging: intentional — Object.assign fills these at runtime
|
||||
class CdpBackend implements ICdpBackend {
|
||||
private port: number
|
||||
private exitOnReconnectFailure: boolean
|
||||
private ws: WebSocket | null = null
|
||||
private messageId = 0
|
||||
private pending = new Map<number, PendingRequest>()
|
||||
@@ -44,8 +50,9 @@ class CdpBackend implements ICdpBackend {
|
||||
private keepaliveTimer: ReturnType<typeof setInterval> | null = null
|
||||
private preferredDiscoveryHost: LoopbackDiscoveryHost | null = null
|
||||
|
||||
constructor(config: { port: number }) {
|
||||
constructor(config: CdpBackendConfig) {
|
||||
this.port = config.port
|
||||
this.exitOnReconnectFailure = config.exitOnReconnectFailure ?? true
|
||||
|
||||
const rawSend: RawSend = (method, params) => this.rawSend(method, params)
|
||||
const rawOn: RawOn = (event, handler) => this.rawOn(event, handler)
|
||||
@@ -293,7 +300,8 @@ class CdpBackend implements ICdpBackend {
|
||||
private async reconnectLoop(): Promise<void> {
|
||||
do {
|
||||
this.reconnectRequested = false
|
||||
await this.reconnectWithRetries()
|
||||
const reconnected = await this.reconnectWithRetries()
|
||||
if (!reconnected) return
|
||||
} while (
|
||||
!this.disconnecting &&
|
||||
(this.reconnectRequested || !this.connected)
|
||||
@@ -309,12 +317,12 @@ class CdpBackend implements ICdpBackend {
|
||||
this.pending.clear()
|
||||
}
|
||||
|
||||
private async reconnectWithRetries(): Promise<void> {
|
||||
private async reconnectWithRetries(): Promise<boolean> {
|
||||
const maxRetries = CDP_LIMITS.RECONNECT_MAX_RETRIES
|
||||
const delay = TIMEOUTS.CDP_RECONNECT_DELAY
|
||||
|
||||
for (let attempt = 1; attempt <= maxRetries; attempt++) {
|
||||
if (this.disconnecting) return
|
||||
if (this.disconnecting) return false
|
||||
|
||||
try {
|
||||
logger.info(`CDP reconnection attempt ${attempt}/${maxRetries}...`)
|
||||
@@ -322,7 +330,7 @@ class CdpBackend implements ICdpBackend {
|
||||
await this.attemptConnect()
|
||||
this.startKeepalive()
|
||||
logger.info('CDP reconnected successfully')
|
||||
return
|
||||
return true
|
||||
} catch (error) {
|
||||
const msg = error instanceof Error ? error.message : String(error)
|
||||
logger.warn(
|
||||
@@ -331,10 +339,14 @@ class CdpBackend implements ICdpBackend {
|
||||
}
|
||||
}
|
||||
|
||||
logger.error(
|
||||
`CDP reconnection failed after ${maxRetries} attempts, exiting for restart`,
|
||||
)
|
||||
process.exit(EXIT_CODES.GENERAL_ERROR)
|
||||
if (this.exitOnReconnectFailure) {
|
||||
logger.error(
|
||||
`CDP reconnection failed after ${maxRetries} attempts, exiting for restart`,
|
||||
)
|
||||
process.exit(EXIT_CODES.GENERAL_ERROR)
|
||||
}
|
||||
logger.error(`CDP reconnection failed after ${maxRetries} attempts`)
|
||||
return false
|
||||
}
|
||||
|
||||
async disconnect(): Promise<void> {
|
||||
|
||||
@@ -51,13 +51,17 @@ describe('CdpBackend', () => {
|
||||
const originalReconnectDelay = TIMEOUTS.CDP_RECONNECT_DELAY
|
||||
let fetchUrls: string[] = []
|
||||
let failIpv4Discovery = false
|
||||
let failAllDiscovery = false
|
||||
let wsHost = '127.0.0.1'
|
||||
let originalExit: typeof process.exit
|
||||
|
||||
beforeEach(() => {
|
||||
MockWebSocket.instances = []
|
||||
fetchUrls = []
|
||||
failIpv4Discovery = false
|
||||
failAllDiscovery = false
|
||||
wsHost = '127.0.0.1'
|
||||
originalExit = process.exit
|
||||
|
||||
;(TIMEOUTS as unknown as { CDP_CONNECT: number }).CDP_CONNECT = 200
|
||||
;(
|
||||
@@ -67,6 +71,9 @@ describe('CdpBackend', () => {
|
||||
globalThis.fetch = (async (input: string | URL | Request) => {
|
||||
const url = String(input)
|
||||
fetchUrls.push(url)
|
||||
if (failAllDiscovery) {
|
||||
throw new Error('Unable to connect')
|
||||
}
|
||||
if (failIpv4Discovery && url.includes('127.0.0.1')) {
|
||||
throw new Error('Unable to connect')
|
||||
}
|
||||
@@ -87,6 +94,7 @@ describe('CdpBackend', () => {
|
||||
afterEach(() => {
|
||||
globalThis.fetch = originalFetch
|
||||
globalThis.WebSocket = originalWebSocket
|
||||
process.exit = originalExit
|
||||
;(TIMEOUTS as unknown as { CDP_CONNECT: number }).CDP_CONNECT =
|
||||
originalConnectTimeout
|
||||
;(
|
||||
@@ -160,4 +168,31 @@ describe('CdpBackend', () => {
|
||||
assert(fetchUrls.length >= 3)
|
||||
await cdp.disconnect()
|
||||
})
|
||||
|
||||
it('can disable process exit when reconnect retries are exhausted', async () => {
|
||||
let exitCalled = false
|
||||
process.exit = (() => {
|
||||
exitCalled = true
|
||||
throw new Error('process.exit should not be called')
|
||||
}) as unknown as typeof process.exit
|
||||
|
||||
const cdp = new CdpBackend({ port: 9222, exitOnReconnectFailure: false })
|
||||
const connectPromise = cdp.connect()
|
||||
|
||||
await waitFor(() => MockWebSocket.instances.length === 1)
|
||||
const ws1 = MockWebSocket.instances[0]
|
||||
ws1?.open()
|
||||
await connectPromise
|
||||
assert.strictEqual(cdp.isConnected(), true)
|
||||
|
||||
failAllDiscovery = true
|
||||
ws1?.close()
|
||||
|
||||
await waitFor(() => fetchUrls.length >= 10)
|
||||
await Bun.sleep(5)
|
||||
|
||||
assert.strictEqual(exitCalled, false)
|
||||
assert.strictEqual(cdp.isConnected(), false)
|
||||
await cdp.disconnect()
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user