Compare commits

...

7 Commits

Author SHA1 Message Date
Nikhil Sonti
d631c7dd06 fix: address review feedback for PR #878 2026-04-29 20:14:06 -07:00
Nikhil Sonti
3c706ec3fc chore: self-review fixes 2026-04-29 20:01:15 -07:00
Nikhil Sonti
8ec5c147ac docs(eval): document r2 viewer manifest contract 2026-04-29 20:01:15 -07:00
Nikhil Sonti
2ac3fbb3b1 fix(eval): keep weekly report compatible with viewer manifests 2026-04-29 20:01:15 -07:00
Nikhil Sonti
241755b3ec feat(eval): make r2 viewer use manifest artifact paths 2026-04-29 20:01:15 -07:00
Nikhil Sonti
8af80e2fff refactor(eval): publish canonical viewer manifests 2026-04-29 20:01:15 -07:00
Nikhil Sonti
ac654e443a refactor(eval): canonicalize viewer manifest contract 2026-04-29 20:01:14 -07:00
11 changed files with 692 additions and 179 deletions

View File

@@ -181,6 +181,8 @@ export EVAL_R2_BUCKET=browseros-eval
export EVAL_R2_CDN_BASE_URL=https://eval.browseros.com
```
`EVAL_R2_CDN_BASE_URL` must be a public R2 custom domain, `r2.dev` URL, or Worker URL. Do not set it to the private `*.r2.cloudflarestorage.com` S3 API endpoint.
Published runs are available at `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
### BrowserOS infrastructure
@@ -253,7 +255,35 @@ results/
summary.json # Aggregate pass rates
```
R2 publishing preserves the same task files under `runs/<run-id>/...`, writes `runs/<run-id>/manifest.json`, and uploads `viewer.html` at the bucket root. The viewer URL is `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
R2 publishing preserves the task files under `runs/<run-id>/...`, writes `runs/<run-id>/manifest.json`, and uploads `viewer.html` at the bucket root. The viewer URL is `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
### R2 viewer manifest
`runs/<run-id>/manifest.json` is the source of truth for the public viewer. New manifests include `schemaVersion: 2` and each task includes explicit artifact paths:
```json
{
"schemaVersion": 2,
"runId": "agisdk-real-smoke-2026-04-30-0000",
"tasks": [
{
"queryId": "agisdk-dashdish-10",
"paths": {
"attempt": "tasks/agisdk-dashdish-10/attempt.json",
"metadata": "tasks/agisdk-dashdish-10/metadata.json",
"messages": "tasks/agisdk-dashdish-10/messages.jsonl",
"grades": "tasks/agisdk-dashdish-10/grades.json",
"trace": "tasks/agisdk-dashdish-10/trace.jsonl",
"screenshots": "tasks/agisdk-dashdish-10/screenshots",
"graderArtifacts": "tasks/agisdk-dashdish-10/grader-artifacts"
}
}
]
}
```
The static viewer uses `task.paths` when present. Older uploaded runs without `schemaVersion` or `task.paths` still work through the legacy inferred layout: `runs/<run-id>/<task-id>/metadata.json`, `messages.jsonl`, and `screenshots/<n>.png`.
Manifest paths are stable artifact locations, not a guarantee that every optional artifact exists for every task. For example, `attempt.json`, `trace.jsonl`, or grader artifact directories may be absent when that artifact was not produced by the run.
## Troubleshooting

View File

@@ -24,45 +24,11 @@ import {
PutObjectCommand,
S3Client,
} from '@aws-sdk/client-s3'
interface ManifestTask {
queryId: string
query: string
status: string
durationMs: number
screenshotCount: number
graderResults: Record<string, { pass: boolean; score: number }>
}
interface Manifest {
runId: string
uploadedAt: string
agentConfig?: { type?: string; model?: string }
dataset?: string
summary?: { passRate?: number; avgDurationMs?: number }
tasks: ManifestTask[]
}
interface RunSummary {
runId: string
configName: string
date: string
avgScore: number
total: number
completed: number
failed: number
timeout: number
avgDurationMs: number
model: string
dataset: string
agentType: string
}
const PASS_FAIL_GRADER_ORDER = [
'agisdk_state_diff',
'infinity_state',
'performance_grader',
]
import {
buildRunSummaries,
type ReportManifest,
type RunSummary,
} from '../src/reporting/run-summary'
function requireEnv(name: string): string {
const value = process.env[name]
@@ -87,7 +53,7 @@ const client = new S3Client({
// Step 1: List all manifest.json files in runs/
console.log('Scanning R2 for eval runs...')
const manifests: Manifest[] = []
const manifests: ReportManifest[] = []
let continuationToken: string | undefined
do {
@@ -127,64 +93,9 @@ if (manifests.length === 0) {
}
// Step 2: Build run summaries
const runs: RunSummary[] = manifests
.map((m) => {
const total = m.tasks.length
const completed = m.tasks.filter((t) => t.status === 'completed').length
const failed = m.tasks.filter((t) => t.status === 'failed').length
const timeout = m.tasks.filter((t) => t.status === 'timeout').length
let scoredCount = 0
let scoreSum = 0
for (const task of m.tasks) {
if (!task.graderResults) continue
for (const name of PASS_FAIL_GRADER_ORDER) {
if (task.graderResults[name]) {
scoredCount++
scoreSum += task.graderResults[name].score ?? 0
break
}
}
}
const avgScore = scoredCount > 0 ? (scoreSum / scoredCount) * 100 : 0
const durations = m.tasks
.filter((t) => t.durationMs > 0)
.map((t) => t.durationMs)
const avgDurationMs =
durations.length > 0
? durations.reduce((a, b) => a + b, 0) / durations.length
: 0
const date = m.uploadedAt
? `${m.uploadedAt.split('T')[0]} ${m.uploadedAt.split('T')[1]?.slice(0, 5) || ''}`
: m.runId.slice(0, 15)
const model = m.agentConfig?.model || 'unknown'
const dataset = m.dataset || m.runId
const agentType = m.agentConfig?.type || 'unknown'
const configName = extractConfigName(m.runId)
return {
runId: m.runId,
configName,
date,
avgScore,
total,
completed,
failed,
timeout,
avgDurationMs,
model,
dataset,
agentType,
}
})
.sort((a, b) => a.date.localeCompare(b.date))
const runs: RunSummary[] = buildRunSummaries(manifests)
// Step 3: Identify unique config groups
// runId can be "ci-weekly" (old) or "ci-weekly-2026-03-21-1730" (timestamped)
// Extract config name by stripping the date-time suffix pattern
function escHtml(s: string): string {
return s
.replace(/&/g, '&amp;')
@@ -193,12 +104,6 @@ function escHtml(s: string): string {
.replace(/"/g, '&quot;')
}
function extractConfigName(runId: string): string {
// "browseros-agent-weekly-2026-03-21-1730" → "browseros-agent-weekly"
// "ci-weekly" → "ci-weekly" (no timestamp, old format)
return runId.replace(/-\d{4}-\d{2}-\d{2}-\d{4}$/, '')
}
const configGroups = [...new Set(runs.map((r) => r.configName))]
const defaultConfig = configGroups.includes('ci-weekly')
? 'ci-weekly'

View File

@@ -685,6 +685,59 @@
});
}
// Test harness note: these ASCII section markers are used by r2-viewer-compat.test.ts.
// -- Artifact path resolution
// Picks the identifier used for a task's legacy artifact directory:
// `queryId` first, then `id`, otherwise the literal 'unknown-task'.
function taskKey(task) {
  if (task.queryId) return task.queryId;
  if (task.id) return task.id;
  return 'unknown-task';
}
// Builds the run-relative path of an artifact in the legacy layout, where each
// task's files live directly under `<task-key>/`. Known artifact names map to
// fixed file or directory names; any other name is appended to the task key
// unchanged.
function legacyArtifactPath(task, artifact) {
  const id = taskKey(task);
  // A Map (rather than a plain object) keeps names such as 'constructor'
  // from matching inherited properties.
  const knownLeaves = new Map([
    ['attempt', 'attempt.json'],
    ['metadata', 'metadata.json'],
    ['messages', 'messages.jsonl'],
    ['trace', 'trace.jsonl'],
    ['grades', 'grades.json'],
    ['screenshots', 'screenshots'],
    ['graderArtifacts', 'grader-artifacts'],
  ]);
  const leaf = knownLeaves.has(artifact) ? knownLeaves.get(artifact) : artifact;
  return `${id}/${leaf}`;
}
// Resolves the run-relative path of a task artifact. A non-empty string in
// `task.paths[artifact]` wins (with leading slashes removed); otherwise the
// legacy inferred layout is used.
function artifactPath(task, artifact) {
  const explicit = task.paths ? task.paths[artifact] : task.paths;
  const hasExplicitPath = typeof explicit === 'string' && explicit.length > 0;
  if (!hasExplicitPath) {
    return legacyArtifactPath(task, artifact);
  }
  // Strip leading slashes so the path joins cleanly onto the run base path.
  return explicit.replace(/^\/+/, '');
}
// Returns the URL of a task artifact: the resolved artifact path appended to
// the viewer's `basePath`.
function artifactUrl(task, artifact) {
  const relativePath = artifactPath(task, artifact);
  return `${basePath}/${relativePath}`;
}
// URL of the task's 'metadata' artifact.
function metadataUrl(task) {
  const url = artifactUrl(task, 'metadata');
  return url;
}
// URL of the task's 'messages' artifact.
function messagesUrl(task) {
  const url = artifactUrl(task, 'messages');
  return url;
}
// URL of screenshot number `n`, stored as `<n>.png` under the task's
// 'screenshots' artifact location.
function screenshotUrl(task, n) {
  const screenshotsDir = artifactUrl(task, 'screenshots');
  return `${screenshotsDir}/${n}.png`;
}
// -- Task selection
// ── Task selection ─────────────────────────────────────────────
function selectTask(task) {
stopAutoplay();
@@ -716,6 +769,7 @@
}
}
// -- Center panel
// ── Center panel: screenshot viewer ────────────────────────────
function renderCenterPanel(task) {
const panel = document.getElementById('center-panel');
@@ -763,10 +817,6 @@
updateControls();
}
function screenshotUrl(task, n) {
return `${basePath}/${task.queryId || task.id}/screenshots/${n}.png`;
}
function goToStep(n) {
if (!selectedTask || n < 1 || n > totalSteps) return;
currentStep = n;
@@ -914,7 +964,7 @@
body.innerHTML = '<div class="placeholder"><div class="ph-text" style="color: #6e7681;">Loading messages...</div></div>';
countEl.textContent = '';
const msgUrl = `${basePath}/${task.queryId || task.id}/messages.jsonl`;
const msgUrl = messagesUrl(task);
fetch(msgUrl)
.then((res) => {
@@ -1075,7 +1125,7 @@
// ── Load task metadata for rich grader details ──────────────────
function loadTaskMetadata(task) {
const metaUrl = `${basePath}/${task.queryId || task.id}/metadata.json`;
const metaUrl = metadataUrl(task);
fetch(metaUrl)
.then((res) => res.ok ? res.json() : null)
.then((meta) => {

View File

@@ -1,3 +1,8 @@
import type {
ViewerManifest,
ViewerManifestTask,
} from '../viewer/viewer-manifest'
export interface R2UploadConfig {
accountId: string
accessKeyId: string
@@ -6,27 +11,9 @@ export interface R2UploadConfig {
cdnBaseUrl: string
}
export interface R2ManifestTask {
queryId: string
query: string
startUrl: string
status: string
durationMs: number
screenshotCount: number
graderResults: Record<string, unknown>
}
/** Alias of `ViewerManifestTask` from '../viewer/viewer-manifest'. */
export type R2ManifestTask = ViewerManifestTask
export interface R2RunManifest {
runId: string
uploadedAt: string
agentConfig?: Record<string, unknown>
dataset?: string
summary?: {
passRate?: unknown
avgDurationMs?: unknown
}
tasks: R2ManifestTask[]
}
/** Alias of `ViewerManifest` from '../viewer/viewer-manifest'. */
export type R2RunManifest = ViewerManifest
export interface R2PublishRunResult {
runId: string

View File

@@ -5,8 +5,11 @@ import {
PutObjectCommand,
S3Client,
} from '@aws-sdk/client-s3'
import {
buildViewerManifest,
type ViewerManifestTaskInput,
} from '../viewer/viewer-manifest'
import type {
R2ManifestTask,
R2PublishPathResult,
R2PublishRunResult,
R2RunManifest,
@@ -43,7 +46,6 @@ interface UploadJob {
interface TaskDirEntry {
taskId: string
taskPath: string
canonicalLayout: boolean
}
export function contentTypeForPath(filePath: string): string {
@@ -129,7 +131,6 @@ async function findTaskDirs(runDir: string): Promise<TaskDirEntry[]> {
legacyTasks.push({
taskId: entry.name,
taskPath,
canonicalLayout: false,
})
}
}
@@ -146,7 +147,6 @@ async function findTaskDirs(runDir: string): Promise<TaskDirEntry[]> {
canonicalTasks.push({
taskId: entry.name,
taskPath,
canonicalLayout: true,
})
}
}
@@ -262,7 +262,7 @@ export class R2Publisher {
throw new Error(`No task subdirectories in ${runId}`)
}
const manifestTasks: R2ManifestTask[] = []
const manifestTasks: ViewerManifestTaskInput[] = []
const jobs: UploadJob[] = (await collectRunRootFiles(runDir)).map(
(job) => ({
...job,
@@ -289,22 +289,23 @@ export class R2Publisher {
if (relative.startsWith('screenshots/') && extname(file) === '.png') {
screenshotCount++
}
// Keep legacy keys during the manifest v2 rollout so cached viewers and
// old manifests can still resolve task artifacts.
jobs.push({
key: `runs/${runId}/${taskId}/${relative}`,
filePath: file,
contentType: contentTypeForPath(file),
})
if (taskDirEntry.canonicalLayout) {
jobs.push({
key: `runs/${runId}/tasks/${taskId}/${relative}`,
filePath: file,
contentType: contentTypeForPath(file),
})
}
jobs.push({
key: `runs/${runId}/tasks/${taskId}/${relative}`,
filePath: file,
contentType: contentTypeForPath(file),
})
}
manifestTasks.push({
queryId: (meta.query_id as string | undefined) || taskId,
artifactId: taskId,
query: (meta.query as string | undefined) || '',
startUrl: (meta.start_url as string | undefined) || '',
status: statusFromMetadata(meta),
@@ -312,7 +313,8 @@ export class R2Publisher {
screenshotCount:
(meta.screenshot_count as number | undefined) || screenshotCount,
graderResults:
(meta.grader_results as Record<string, unknown> | undefined) || {},
(meta.grader_results as ViewerManifestTaskInput['graderResults']) ||
{},
})
}
@@ -347,7 +349,7 @@ export class R2Publisher {
return {
runId,
uploadedFiles: uploaded + 2,
viewerUrl: `${this.config.cdnBaseUrl}/viewer.html?run=${runId}`,
viewerUrl: `${this.config.cdnBaseUrl}/viewer.html?run=${encodeURIComponent(runId)}`,
manifest,
}
}
@@ -369,7 +371,7 @@ export class R2Publisher {
runId: string,
agentConfig: Record<string, unknown> | undefined,
dataset: string | undefined,
tasks: R2ManifestTask[],
tasks: ViewerManifestTaskInput[],
): Promise<R2RunManifest> {
let summaryData: Record<string, unknown> | undefined
try {
@@ -378,7 +380,7 @@ export class R2Publisher {
) as Record<string, unknown>
} catch {}
return {
return buildViewerManifest({
runId,
uploadedAt: this.now().toISOString(),
agentConfig,
@@ -390,7 +392,7 @@ export class R2Publisher {
}
: undefined,
tasks,
}
})
}
private async uploadFile(job: UploadJob): Promise<void> {

View File

@@ -0,0 +1,104 @@
/** One task entry of a run manifest, as read when building report summaries. */
export interface ReportManifestTask {
queryId: string
query?: string
// Compared against 'completed', 'failed' and 'timeout' when counting outcomes.
status: string
// Only values greater than 0 contribute to a run's average duration.
durationMs: number
screenshotCount?: number
// Explicit artifact locations; not read when building summaries.
paths?: Record<string, string>
// Keyed by grader name; a result without a score is counted as 0.
graderResults?: Record<string, { pass?: boolean; score?: number }>
}
/** Run manifest fields read by the report; only `runId` is required. */
export interface ReportManifest {
schemaVersion?: number
runId: string
// Split on 'T' to derive the report date; a missing value reports 'unknown'.
uploadedAt?: string
// Missing `model` / `type` values are reported as 'unknown'.
agentConfig?: { type?: string; model?: string }
// The summary falls back to `runId` when this is absent.
dataset?: string
summary?: { passRate?: number; avgDurationMs?: number }
// A missing or non-array value is treated as an empty task list.
tasks?: ReportManifestTask[]
}
/** Aggregated per-run figures produced by `buildRunSummaries`. */
export interface RunSummary {
runId: string
// `runId` with a trailing `-YYYY-MM-DD-HHMM` suffix removed, if present.
configName: string
// Date plus the first five characters of the time portion of `uploadedAt`,
// or 'unknown' when the manifest has no `uploadedAt`.
date: string
// Mean primary-grader score multiplied by 100; 0 when no task was scored.
avgScore: number
// Number of tasks in the manifest.
total: number
// Counts of tasks whose status is 'completed', 'failed' and 'timeout'.
completed: number
failed: number
timeout: number
// Mean of task durations greater than 0; 0 when there are none.
avgDurationMs: number
model: string
dataset: string
agentType: string
}
// Report score uses the primary pass/fail grader so mixed-grader runs keep
// the same precedence as the eval summary.
// Order matters: for each task, the first grader in this list that has a
// result supplies the score, and later entries are ignored.
const PASS_FAIL_GRADER_ORDER = [
'agisdk_state_diff',
'infinity_state',
'performance_grader',
]
/**
 * Derives the config name from a run id by removing a trailing
 * `-YYYY-MM-DD-HHMM` timestamp suffix. Run ids without such a suffix are
 * returned unchanged.
 *
 * @param runId Run identifier, with or without the timestamp suffix.
 * @returns The run id without its timestamp suffix.
 */
export function extractConfigName(runId: string): string {
  const timestampSuffix = /-\d{4}-\d{2}-\d{2}-\d{4}$/
  return runId.replace(timestampSuffix, '')
}
/**
 * Formats a manifest's upload timestamp for the report.
 *
 * @param manifest Manifest whose `uploadedAt` is split on 'T'.
 * @returns `<date> <HH:MM>` when a time portion is present, the bare date
 *   when it is not, or 'unknown' when `uploadedAt` is missing or empty.
 */
function reportDate(manifest: ReportManifest): string {
  if (!manifest.uploadedAt) return 'unknown'
  const [date = '', time = ''] = manifest.uploadedAt.split('T')
  const hoursAndMinutes = time.slice(0, 5)
  // Avoid a dangling space when the timestamp carries no time portion.
  return hoursAndMinutes ? `${date} ${hoursAndMinutes}` : date
}
/**
 * Returns the score of the task's primary grader: the first name in
 * `PASS_FAIL_GRADER_ORDER` that has a result on the task.
 *
 * @param task Manifest task whose `graderResults` are inspected.
 * @returns The grader's score (0 when the result has no score), or `null`
 *   when the task has no grader results or none of the listed graders ran.
 */
function primaryScore(task: ReportManifestTask): number | null {
  const results = task.graderResults
  if (!results) return null
  const primaryName = PASS_FAIL_GRADER_ORDER.find((name) =>
    Boolean(results[name]),
  )
  if (primaryName === undefined) return null
  return results[primaryName]?.score ?? 0
}
/**
 * Builds one summary per manifest and returns them sorted by `date`.
 *
 * For each manifest this counts task statuses, averages the primary grader
 * score (scaled by 100) over the tasks that have one, and averages the task
 * durations that are greater than 0. A missing or non-array `tasks` value is
 * treated as an empty list.
 *
 * @param manifests Run manifests to summarize.
 * @returns Summaries ordered by ascending `date` string.
 */
export function buildRunSummaries(manifests: ReportManifest[]): RunSummary[] {
  const summaries: RunSummary[] = []

  for (const manifest of manifests) {
    const tasks = Array.isArray(manifest.tasks) ? manifest.tasks : []

    let completed = 0
    let failed = 0
    let timeout = 0
    let scoredCount = 0
    let scoreSum = 0
    let timedCount = 0
    let durationSum = 0

    for (const task of tasks) {
      if (task.status === 'completed') completed++
      if (task.status === 'failed') failed++
      if (task.status === 'timeout') timeout++

      const score = primaryScore(task)
      if (score !== null) {
        scoredCount++
        scoreSum += score
      }

      if (task.durationMs > 0) {
        timedCount++
        durationSum += task.durationMs
      }
    }

    summaries.push({
      runId: manifest.runId,
      configName: extractConfigName(manifest.runId),
      date: reportDate(manifest),
      avgScore: scoredCount > 0 ? (scoreSum / scoredCount) * 100 : 0,
      total: tasks.length,
      completed,
      failed,
      timeout,
      avgDurationMs: timedCount > 0 ? durationSum / timedCount : 0,
      model: manifest.agentConfig?.model || 'unknown',
      dataset: manifest.dataset || manifest.runId,
      agentType: manifest.agentConfig?.type || 'unknown',
    })
  }

  return summaries.sort((a, b) => a.date.localeCompare(b.date))
}

View File

@@ -1,7 +1,20 @@
import type { GraderResult } from '../types'
export const VIEWER_MANIFEST_SCHEMA_VERSION = 2
/**
 * Explicit artifact locations recorded for each task in the viewer manifest.
 * The builder fills every field with a `tasks/<id>/...` path.
 */
export interface ViewerManifestTaskPaths {
attempt: string
metadata: string
messages: string
trace: string
grades: string
// Directory location (no file extension), unlike the file paths above.
screenshots: string
// Directory location (no file extension).
graderArtifacts: string
}
export interface ViewerManifestTaskInput {
queryId: string
artifactId?: string
query: string
startUrl?: string
status: string
@@ -10,57 +23,67 @@ export interface ViewerManifestTaskInput {
graderResults: Record<string, GraderResult>
}
export interface ViewerManifestTask extends ViewerManifestTaskInput {
paths: {
attempt: string
metadata: string
messages: string
trace: string
grades: string
screenshots: string
graderArtifacts: string
}
/**
 * Task entry as written to the manifest: the builder input without
 * `artifactId`, with `startUrl` always present and explicit artifact paths.
 */
export interface ViewerManifestTask
extends Omit<ViewerManifestTaskInput, 'artifactId'> {
startUrl: string
paths: ViewerManifestTaskPaths
}
export interface ViewerManifest {
schemaVersion: typeof VIEWER_MANIFEST_SCHEMA_VERSION
runId: string
suiteId: string
variantId: string
suiteId?: string
variantId?: string
uploadedAt?: string
summary: Record<string, unknown>
agentConfig?: Record<string, unknown>
dataset?: string
summary?: Record<string, unknown>
tasks: ViewerManifestTask[]
}
export interface BuildViewerManifestInput {
runId: string
suiteId: string
variantId: string
suiteId?: string
variantId?: string
uploadedAt?: string
summary: Record<string, unknown>
agentConfig?: Record<string, unknown>
dataset?: string
summary?: Record<string, unknown>
tasks: ViewerManifestTaskInput[]
}
/**
 * Builds the artifact paths for one task, all rooted at `tasks/<queryId>`.
 *
 * @param queryId Identifier used as the task's directory name.
 * @returns The seven artifact locations for that task.
 */
function taskPaths(queryId: string): ViewerManifestTaskPaths {
  const taskRoot = `tasks/${queryId}`
  return {
    attempt: `${taskRoot}/attempt.json`,
    metadata: `${taskRoot}/metadata.json`,
    messages: `${taskRoot}/messages.jsonl`,
    trace: `${taskRoot}/trace.jsonl`,
    grades: `${taskRoot}/grades.json`,
    screenshots: `${taskRoot}/screenshots`,
    graderArtifacts: `${taskRoot}/grader-artifacts`,
  }
}
/** Builds the compact JSON index consumed by the static R2 viewer. */
export function buildViewerManifest(
input: BuildViewerManifestInput,
): ViewerManifest {
return {
schemaVersion: VIEWER_MANIFEST_SCHEMA_VERSION,
runId: input.runId,
suiteId: input.suiteId,
variantId: input.variantId,
uploadedAt: input.uploadedAt,
summary: input.summary,
tasks: input.tasks.map((task) => ({
...task,
paths: {
attempt: `tasks/${task.queryId}/attempt.json`,
metadata: `tasks/${task.queryId}/metadata.json`,
messages: `tasks/${task.queryId}/messages.jsonl`,
trace: `tasks/${task.queryId}/trace.jsonl`,
grades: `tasks/${task.queryId}/grades.json`,
screenshots: `tasks/${task.queryId}/screenshots`,
graderArtifacts: `tasks/${task.queryId}/grader-artifacts`,
},
})),
...(input.suiteId ? { suiteId: input.suiteId } : {}),
...(input.variantId ? { variantId: input.variantId } : {}),
...(input.uploadedAt ? { uploadedAt: input.uploadedAt } : {}),
...(input.agentConfig ? { agentConfig: input.agentConfig } : {}),
...(input.dataset ? { dataset: input.dataset } : {}),
...(input.summary ? { summary: input.summary } : {}),
tasks: input.tasks.map((task) => {
const { artifactId, ...publicTask } = task
return {
...publicTask,
startUrl: publicTask.startUrl ?? '',
paths: taskPaths(artifactId ?? publicTask.queryId),
}
}),
}
}

View File

@@ -26,6 +26,7 @@ async function writeRunFixture(
root: string,
configName = 'browseros-agent-weekly',
timestamp = '2026-04-29-1200',
options: { queryId?: string } = {},
): Promise<{ runDir: string; runId: string }> {
const runDir = join(root, configName, timestamp)
const taskDir = join(runDir, 'task-1')
@@ -33,7 +34,7 @@ async function writeRunFixture(
await writeFile(
join(taskDir, 'metadata.json'),
JSON.stringify({
query_id: 'task-1',
query_id: options.queryId ?? 'task-1',
dataset: 'webbench',
query: 'Find pricing',
start_url: 'https://example.test',
@@ -94,6 +95,15 @@ describe('R2Publisher', () => {
expect(
byKey.get(`runs/${runId}/task-1/screenshots/1.png`)?.ContentType,
).toBe('image/png')
expect(
byKey.get(`runs/${runId}/tasks/task-1/metadata.json`)?.ContentType,
).toBe('application/json')
expect(
byKey.get(`runs/${runId}/tasks/task-1/messages.jsonl`)?.ContentType,
).toBe('application/x-ndjson')
expect(
byKey.get(`runs/${runId}/tasks/task-1/screenshots/1.png`)?.ContentType,
).toBe('image/png')
expect(byKey.get(`runs/${runId}/manifest.json`)?.ContentType).toBe(
'application/json',
)
@@ -111,8 +121,10 @@ describe('R2Publisher', () => {
).toString('utf-8'),
)
expect(manifest).toMatchObject({
schemaVersion: 2,
runId,
uploadedAt: '2026-04-29T12:00:00.000Z',
agentConfig: { type: 'single', model: 'kimi' },
dataset: 'webbench',
summary: { passRate: 1, avgDurationMs: 1200 },
tasks: [
@@ -120,11 +132,86 @@ describe('R2Publisher', () => {
queryId: 'task-1',
status: 'completed',
screenshotCount: 1,
paths: {
attempt: 'tasks/task-1/attempt.json',
metadata: 'tasks/task-1/metadata.json',
messages: 'tasks/task-1/messages.jsonl',
trace: 'tasks/task-1/trace.jsonl',
grades: 'tasks/task-1/grades.json',
screenshots: 'tasks/task-1/screenshots',
graderArtifacts: 'tasks/task-1/grader-artifacts',
},
},
],
})
})
it('uses task directory ids for canonical paths when metadata query ids differ', async () => {
const dir = await mkdtemp(join(tmpdir(), 'eval-r2-path-id-'))
const { runDir, runId } = await writeRunFixture(
dir,
'weekly',
'2026-04-29-1200',
{ queryId: 'query-id-from-metadata' },
)
const viewerPath = join(dir, 'viewer.html')
await writeFile(viewerPath, '<html>viewer</html>')
const client = new FakeR2Client()
await new R2Publisher({
client,
viewerPath,
config: {
accountId: 'acct',
accessKeyId: 'key',
secretAccessKey: 'secret',
bucket: 'bucket',
cdnBaseUrl: 'https://eval.example.test',
},
now: () => new Date('2026-04-29T12:00:00.000Z'),
}).publishRun(runDir, runId)
const byKey = new Map(client.puts.map((put) => [put.Key, put]))
const manifest = JSON.parse(
Buffer.from(
byKey.get(`runs/${runId}/manifest.json`)?.Body as Buffer,
).toString('utf-8'),
)
expect(byKey.has(`runs/${runId}/tasks/task-1/metadata.json`)).toBe(true)
expect(manifest.tasks[0]).toMatchObject({
queryId: 'query-id-from-metadata',
paths: {
metadata: 'tasks/task-1/metadata.json',
screenshots: 'tasks/task-1/screenshots',
},
})
})
it('encodes run ids in returned viewer urls', async () => {
const dir = await mkdtemp(join(tmpdir(), 'eval-r2-viewer-url-'))
const { runDir } = await writeRunFixture(dir)
const viewerPath = join(dir, 'viewer.html')
await writeFile(viewerPath, '<html>viewer</html>')
const client = new FakeR2Client()
const result = await new R2Publisher({
client,
viewerPath,
config: {
accountId: 'acct',
accessKeyId: 'key',
secretAccessKey: 'secret',
bucket: 'bucket',
cdnBaseUrl: 'https://eval.example.test',
},
}).publishRun(runDir, 'run with spaces')
expect(result.viewerUrl).toBe(
'https://eval.example.test/viewer.html?run=run%20with%20spaces',
)
})
it('publishes unuploaded runs from a config results directory', async () => {
const dir = await mkdtemp(join(tmpdir(), 'eval-r2-config-'))
const first = await writeRunFixture(dir, 'weekly', '2026-04-29-1200')
@@ -186,8 +273,27 @@ describe('R2Publisher', () => {
}).publishPath(runDir)
const keys = client.puts.map((put) => put.Key)
const byKey = new Map(client.puts.map((put) => [put.Key, put]))
const manifest = JSON.parse(
Buffer.from(
byKey.get(`runs/${runId}/manifest.json`)?.Body as Buffer,
).toString('utf-8'),
)
expect(result.uploadedRuns.map((run) => run.runId)).toEqual([runId])
expect(keys).toContain(`runs/${runId}/task-1/metadata.json`)
expect(keys).toContain(`runs/${runId}/tasks/task-1/metadata.json`)
expect(manifest).toMatchObject({
schemaVersion: 2,
tasks: [
{
queryId: 'task-1',
paths: {
metadata: 'tasks/task-1/metadata.json',
screenshots: 'tasks/task-1/screenshots',
},
},
],
})
})
})

View File

@@ -0,0 +1,130 @@
import { describe, expect, it } from 'bun:test'
import { readFile } from 'node:fs/promises'
import { join } from 'node:path'
/** URL helper functions extracted from viewer.html so tests can call them directly. */
interface ViewerPathResolvers {
artifactUrl(task: Record<string, unknown>, artifact: string): string
metadataUrl(task: Record<string, unknown>): string
messagesUrl(task: Record<string, unknown>): string
screenshotUrl(task: Record<string, unknown>, step: number): string
}
/**
 * Extracts the artifact path helpers from viewer.html and returns them as
 * callable functions.
 *
 * The code between the '// -- Artifact path resolution' and
 * '// -- Task selection' markers is evaluated with `basePath` fixed to
 * 'runs/run-1'. Both markers must be present, in that order.
 */
async function loadViewerPathResolvers(): Promise<ViewerPathResolvers> {
  const viewerFile = join(
    import.meta.dir,
    '..',
    '..',
    'src',
    'dashboard',
    'viewer.html',
  )
  const source = await readFile(viewerFile, 'utf-8')

  const sectionStart = source.indexOf('// -- Artifact path resolution')
  const sectionEnd = source.indexOf('// -- Task selection', sectionStart)
  expect(sectionStart).toBeGreaterThan(-1)
  expect(sectionEnd).toBeGreaterThan(sectionStart)

  const factorySource = `
const basePath = 'runs/run-1';
${source.slice(sectionStart, sectionEnd)}
return { artifactUrl, metadataUrl, messagesUrl, screenshotUrl };
`
  const factory = new Function(factorySource) as () => ViewerPathResolvers
  return factory()
}
/**
 * Runs viewer.html's `autoSelectFromHash` against a stub page and reports
 * which task it selected.
 *
 * The code from 'function autoSelectFromHash()' up to the
 * '// -- Center panel' marker is evaluated with a fake `window.location.hash`,
 * a two-task manifest (one legacy task, one with explicit paths) and a
 * `selectTask` stub that records its argument.
 *
 * @param hash Value exposed as `window.location.hash`.
 * @returns The task passed to `selectTask`, or `null` if it was never called.
 */
async function runAutoSelectFromHash(hash: string): Promise<unknown> {
  const viewerFile = join(
    import.meta.dir,
    '..',
    '..',
    'src',
    'dashboard',
    'viewer.html',
  )
  const source = await readFile(viewerFile, 'utf-8')

  const sectionStart = source.indexOf('function autoSelectFromHash()')
  const sectionEnd = source.indexOf('// -- Center panel', sectionStart)
  expect(sectionStart).toBeGreaterThan(-1)
  expect(sectionEnd).toBeGreaterThan(sectionStart)

  const harnessSource = `
const window = { location: { hash: ${JSON.stringify(hash)} } };
const manifest = {
tasks: [
{ queryId: 'legacy-task' },
{ queryId: 'new-task', paths: { metadata: 'tasks/new-task/metadata.json' } },
],
};
let selected = null;
function selectTask(task) { selected = task; }
${source.slice(sectionStart, sectionEnd)}
autoSelectFromHash();
return selected;
`
  const harness = new Function(harnessSource) as () => unknown
  return harness()
}
// Exercises the path helpers extracted from viewer.html with basePath fixed to
// 'runs/run-1', covering both manifests with explicit `paths` and older ones
// without.
describe('R2 viewer artifact path compatibility', () => {
// A task carrying `paths` must resolve every artifact through those paths.
it('uses explicit manifest paths for new uploaded runs', async () => {
const resolvers = await loadViewerPathResolvers()
const task = {
queryId: 'task-1',
paths: {
metadata: 'tasks/task-1/metadata.json',
messages: 'tasks/task-1/messages.jsonl',
grades: 'tasks/task-1/grades.json',
trace: 'tasks/task-1/trace.jsonl',
screenshots: 'tasks/task-1/screenshots',
graderArtifacts: 'tasks/task-1/grader-artifacts',
},
}
expect(resolvers.metadataUrl(task)).toBe(
'runs/run-1/tasks/task-1/metadata.json',
)
expect(resolvers.messagesUrl(task)).toBe(
'runs/run-1/tasks/task-1/messages.jsonl',
)
expect(resolvers.artifactUrl(task, 'grades')).toBe(
'runs/run-1/tasks/task-1/grades.json',
)
expect(resolvers.artifactUrl(task, 'trace')).toBe(
'runs/run-1/tasks/task-1/trace.jsonl',
)
expect(resolvers.artifactUrl(task, 'graderArtifacts')).toBe(
'runs/run-1/tasks/task-1/grader-artifacts',
)
expect(resolvers.screenshotUrl(task, 7)).toBe(
'runs/run-1/tasks/task-1/screenshots/7.png',
)
})
// A task without `paths` must resolve to `<queryId>/...` directly under the run.
it('falls back to legacy inferred paths for old uploaded runs', async () => {
const resolvers = await loadViewerPathResolvers()
const task = { queryId: 'legacy-task' }
expect(resolvers.metadataUrl(task)).toBe(
'runs/run-1/legacy-task/metadata.json',
)
expect(resolvers.messagesUrl(task)).toBe(
'runs/run-1/legacy-task/messages.jsonl',
)
expect(resolvers.artifactUrl(task, 'grades')).toBe(
'runs/run-1/legacy-task/grades.json',
)
expect(resolvers.artifactUrl(task, 'trace')).toBe(
'runs/run-1/legacy-task/trace.jsonl',
)
expect(resolvers.artifactUrl(task, 'graderArtifacts')).toBe(
'runs/run-1/legacy-task/grader-artifacts',
)
expect(resolvers.screenshotUrl(task, 3)).toBe(
'runs/run-1/legacy-task/screenshots/3.png',
)
})
// Selecting a task by URL hash must work whether or not the task has `paths`.
it('keeps hash-based task selection independent of artifact layout', async () => {
expect(await runAutoSelectFromHash('#new-task')).toMatchObject({
queryId: 'new-task',
})
expect(await runAutoSelectFromHash('#legacy-task')).toMatchObject({
queryId: 'legacy-task',
})
})
})

View File

@@ -0,0 +1,105 @@
import { describe, expect, it } from 'bun:test'
import {
buildRunSummaries,
extractConfigName,
} from '../../src/reporting/run-summary'
// Covers buildRunSummaries and extractConfigName for both schema v2 and
// legacy manifest shapes.
describe('report run summaries', () => {
// Two tasks scoring 1 and 0 average to 50; durations 1000 and 3000 average to 2000.
it('summarizes schema v2 manifests without depending on artifact paths', () => {
const [summary] = buildRunSummaries([
{
schemaVersion: 2,
runId: 'agisdk-real-smoke-2026-04-30-0000',
uploadedAt: '2026-04-30T01:03:59.663Z',
agentConfig: { type: 'single', model: 'moonshotai/kimi-k2.5' },
dataset: 'agisdk-real',
tasks: [
{
queryId: 'task-1',
query: 'Do task 1',
status: 'completed',
durationMs: 1000,
screenshotCount: 1,
paths: { metadata: 'tasks/task-1/metadata.json' },
graderResults: {
agisdk_state_diff: { score: 1, pass: true },
},
},
{
queryId: 'task-2',
query: 'Do task 2',
status: 'timeout',
durationMs: 3000,
screenshotCount: 0,
paths: { metadata: 'tasks/task-2/metadata.json' },
graderResults: {
agisdk_state_diff: { score: 0, pass: false },
},
},
],
},
])
expect(summary).toMatchObject({
runId: 'agisdk-real-smoke-2026-04-30-0000',
configName: 'agisdk-real-smoke',
date: '2026-04-30 01:03',
avgScore: 50,
total: 2,
completed: 1,
timeout: 1,
avgDurationMs: 2000,
model: 'moonshotai/kimi-k2.5',
dataset: 'agisdk-real',
agentType: 'single',
})
})
// A duration of 0 is excluded from the average, so avgDurationMs stays 0.
it('summarizes legacy manifests without schema version or paths', () => {
const [summary] = buildRunSummaries([
{
runId: 'browseros-agent-weekly-2026-04-29-1430',
uploadedAt: '2026-04-29T14:30:00.000Z',
agentConfig: { type: 'orchestrator-executor', model: 'kimi' },
dataset: 'webbench',
tasks: [
{
queryId: 'legacy-task',
query: 'Do the old task',
status: 'failed',
durationMs: 0,
screenshotCount: 0,
graderResults: {
performance_grader: { score: 0.25, pass: false },
},
},
],
},
])
expect(summary).toMatchObject({
runId: 'browseros-agent-weekly-2026-04-29-1430',
configName: 'browseros-agent-weekly',
avgScore: 25,
total: 1,
completed: 0,
failed: 1,
avgDurationMs: 0,
})
})
// Run ids without a -YYYY-MM-DD-HHMM suffix pass through unchanged.
it('keeps legacy config names when run ids have no timestamp suffix', () => {
expect(extractConfigName('ci-weekly')).toBe('ci-weekly')
})
// Without uploadedAt the summary date is the literal 'unknown'.
it('uses an explicit unknown date when uploadedAt is missing', () => {
const [summary] = buildRunSummaries([
{
runId: 'ci-weekly',
tasks: [],
},
])
expect(summary.date).toBe('unknown')
})
})

View File

@@ -1,4 +1,5 @@
import { describe, expect, it } from 'bun:test'
import type { R2RunManifest } from '../../src/publishing/r2-manifest'
import { buildViewerManifest } from '../../src/viewer/viewer-manifest'
describe('buildViewerManifest', () => {
@@ -22,12 +23,15 @@ describe('buildViewerManifest', () => {
score: 0,
pass: false,
reasoning: 'Missing checkout item',
details: { missing: ['checkout item'] },
},
},
},
],
})
const publishManifest: R2RunManifest = manifest
expect(publishManifest.schemaVersion).toBe(2)
expect(manifest.tasks[0].paths.messages).toBe(
'tasks/agisdk-dashdish-4/messages.jsonl',
)
@@ -37,5 +41,72 @@ describe('buildViewerManifest', () => {
expect(manifest.tasks[0].paths.graderArtifacts).toBe(
'tasks/agisdk-dashdish-4/grader-artifacts',
)
expect(manifest.tasks[0].graderResults.agisdk_state_diff.details).toEqual({
missing: ['checkout item'],
})
})
it('builds stable paths when optional task fields are missing', () => {
const manifest = buildViewerManifest({
runId: 'run-2',
uploadedAt: '2026-04-29T06:00:00.000Z',
tasks: [
{
queryId: 'task-with-minimal-fields',
query: 'Do the task',
status: 'completed',
durationMs: 10,
screenshotCount: 0,
graderResults: {},
},
],
})
expect(manifest).toMatchObject({
schemaVersion: 2,
runId: 'run-2',
uploadedAt: '2026-04-29T06:00:00.000Z',
tasks: [
{
queryId: 'task-with-minimal-fields',
startUrl: '',
paths: {
attempt: 'tasks/task-with-minimal-fields/attempt.json',
metadata: 'tasks/task-with-minimal-fields/metadata.json',
messages: 'tasks/task-with-minimal-fields/messages.jsonl',
trace: 'tasks/task-with-minimal-fields/trace.jsonl',
grades: 'tasks/task-with-minimal-fields/grades.json',
screenshots: 'tasks/task-with-minimal-fields/screenshots',
graderArtifacts: 'tasks/task-with-minimal-fields/grader-artifacts',
},
},
],
})
})
it('can separate display query ids from artifact path ids', () => {
const manifest = buildViewerManifest({
runId: 'run-3',
tasks: [
{
queryId: 'metadata-query-id',
artifactId: 'task-dir-id',
query: 'Do the task',
status: 'completed',
durationMs: 10,
screenshotCount: 0,
graderResults: {},
},
],
})
expect(manifest.tasks[0]).toMatchObject({
queryId: 'metadata-query-id',
paths: {
metadata: 'tasks/task-dir-id/metadata.json',
screenshots: 'tasks/task-dir-id/screenshots',
},
})
expect('artifactId' in manifest.tasks[0]).toBe(false)
})
})