Files
BrowserOS/packages/browseros-agent/apps/eval/tests/reporting/run-summary.test.ts
Nikhil 26afb826c6 feat(eval): add viewer manifest contract (#878)
* refactor(eval): canonicalize viewer manifest contract

* refactor(eval): publish canonical viewer manifests

* feat(eval): make r2 viewer use manifest artifact paths

* fix(eval): keep weekly report compatible with viewer manifests

* docs(eval): document r2 viewer manifest contract

* chore: self-review fixes

* fix: address review feedback for PR #878
2026-04-29 20:50:35 -07:00

106 lines
2.9 KiB
TypeScript
Vendored

import { describe, expect, it } from 'bun:test'
import {
buildRunSummaries,
extractConfigName,
} from '../../src/reporting/run-summary'
describe('report run summaries', () => {
  // A schema v2 manifest: every task carries a `paths` entry, yet the expected
  // summary below contains no path-related fields.
  it('summarizes schema v2 manifests without depending on artifact paths', () => {
    const [runSummary] = buildRunSummaries([
      {
        schemaVersion: 2,
        runId: 'agisdk-real-smoke-2026-04-30-0000',
        uploadedAt: '2026-04-30T01:03:59.663Z',
        agentConfig: { type: 'single', model: 'moonshotai/kimi-k2.5' },
        dataset: 'agisdk-real',
        tasks: [
          {
            queryId: 'task-1',
            query: 'Do task 1',
            status: 'completed',
            durationMs: 1000,
            screenshotCount: 1,
            paths: { metadata: 'tasks/task-1/metadata.json' },
            graderResults: {
              agisdk_state_diff: { score: 1, pass: true },
            },
          },
          {
            queryId: 'task-2',
            query: 'Do task 2',
            status: 'timeout',
            durationMs: 3000,
            screenshotCount: 0,
            paths: { metadata: 'tasks/task-2/metadata.json' },
            graderResults: {
              agisdk_state_diff: { score: 0, pass: false },
            },
          },
        ],
      },
    ])

    // One completed task (score 1, 1000 ms) plus one timed-out task
    // (score 0, 3000 ms) are pinned to avgScore 50 and avgDurationMs 2000.
    const expectedSummary = {
      runId: 'agisdk-real-smoke-2026-04-30-0000',
      configName: 'agisdk-real-smoke',
      date: '2026-04-30 01:03',
      avgScore: 50,
      total: 2,
      completed: 1,
      timeout: 1,
      avgDurationMs: 2000,
      model: 'moonshotai/kimi-k2.5',
      dataset: 'agisdk-real',
      agentType: 'single',
    }
    expect(runSummary).toMatchObject(expectedSummary)
  })

  // A legacy manifest: no `schemaVersion` field and no per-task `paths`.
  it('summarizes legacy manifests without schema version or paths', () => {
    const [runSummary] = buildRunSummaries([
      {
        runId: 'browseros-agent-weekly-2026-04-29-1430',
        uploadedAt: '2026-04-29T14:30:00.000Z',
        agentConfig: { type: 'orchestrator-executor', model: 'kimi' },
        dataset: 'webbench',
        tasks: [
          {
            queryId: 'legacy-task',
            query: 'Do the old task',
            status: 'failed',
            durationMs: 0,
            screenshotCount: 0,
            graderResults: {
              performance_grader: { score: 0.25, pass: false },
            },
          },
        ],
      },
    ])

    // A single failed task with grader score 0.25 is pinned to avgScore 25.
    const expectedSummary = {
      runId: 'browseros-agent-weekly-2026-04-29-1430',
      configName: 'browseros-agent-weekly',
      avgScore: 25,
      total: 1,
      completed: 0,
      failed: 1,
      avgDurationMs: 0,
    }
    expect(runSummary).toMatchObject(expectedSummary)
  })

  // A run id with no trailing timestamp is expected to come back unchanged.
  it('keeps legacy config names when run ids have no timestamp suffix', () => {
    const plainRunId = 'ci-weekly'
    expect(extractConfigName(plainRunId)).toBe(plainRunId)
  })

  // With no `uploadedAt` on the manifest, the date is pinned to 'unknown'.
  it('uses an explicit unknown date when uploadedAt is missing', () => {
    const [runSummary] = buildRunSummaries([
      {
        runId: 'ci-weekly',
        tasks: [],
      },
    ])
    const { date } = runSummary
    expect(date).toBe('unknown')
  })
})