/**
 * Weekly Report Generator
 *
 * Reads all uploaded eval runs from R2, builds cumulative score history,
 * and generates an HTML dashboard with:
 * - Config selector dropdown (groups runs by config/runId pattern)
 * - Config details card (architecture, model, dataset, grader)
 * - Interactive trend chart (filtered by selected config)
 * - Stat cards (latest, trend, best, duration)
 * - Searchable table of all runs
 *
 * Usage:
 *   bun apps/eval/scripts/weekly-report.ts [local-output-path]
 *
 * Env vars required:
 *   EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY
 *
 * Env vars optional:
 *   EVAL_R2_BUCKET (default: browseros-eval)
 *   EVAL_R2_CDN_BASE_URL (default: https://eval.browseros.com)
 */

import { writeFile } from 'node:fs/promises'
import {
  GetObjectCommand,
  ListObjectsV2Command,
  PutObjectCommand,
  S3Client,
} from '@aws-sdk/client-s3'
import {
  buildRunSummaries,
  type ReportManifest,
  type RunSummary,
} from '../src/reporting/run-summary'

/**
 * Returns the value of a required environment variable.
 *
 * Logs an error and terminates the process with exit code 1 when the
 * variable is unset or empty.
 *
 * @param name - Name of the environment variable to read.
 * @returns The non-empty value of the variable.
 */
function requireEnv(name: string): string {
  const value = process.env[name]
  if (!value) {
    console.error(`Missing required env var: ${name}`)
    process.exit(1)
  }
  return value
}

/** Replacement entity for each character that is unsafe inside HTML text or attributes. */
const HTML_ESCAPES: Record<string, string> = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&#39;',
}

/**
 * Escapes a string for safe interpolation into HTML text or a quoted attribute.
 *
 * Done in a single pass so that the `&` introduced by one replacement is
 * never escaped a second time.
 *
 * @param s - Raw text (for example a run id or model name taken from a manifest).
 * @returns The text with `&`, `<`, `>`, `"` and `'` replaced by HTML entities.
 */
function escHtml(s: string): string {
  return s.replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch] ?? ch)
}

/**
 * Renders a score as a run of full-block characters, one block per 5 points.
 *
 * `String.prototype.repeat` throws a RangeError for negative or infinite
 * counts, so anything that does not round to a positive finite count yields
 * an empty bar instead of aborting the summary.
 *
 * @param score - Average score of a run.
 * @returns The bar string (possibly empty).
 */
function scoreBar(score: number): string {
  const blocks = Math.round(score / 5)
  return Number.isFinite(blocks) && blocks > 0 ? '\u2588'.repeat(blocks) : ''
}

const accountId = requireEnv('EVAL_R2_ACCOUNT_ID')
const accessKeyId = requireEnv('EVAL_R2_ACCESS_KEY_ID')
const secretAccessKey = requireEnv('EVAL_R2_SECRET_ACCESS_KEY')
// `||` (not `??`) on purpose: an empty EVAL_R2_BUCKET also falls back to the default.
const bucket = process.env.EVAL_R2_BUCKET || 'browseros-eval'

const client = new S3Client({
  region: 'auto',
  endpoint: `https://${accountId}.r2.cloudflarestorage.com`,
  credentials: { accessKeyId, secretAccessKey },
})

// Step 1: List all manifest.json files in runs/
console.log('Scanning R2 for eval runs...')

const manifests: ReportManifest[] = []
let continuationToken: string | undefined

// The listing is paginated; keep requesting pages until no continuation token is returned.
do {
  const listRes = await client.send(
    new ListObjectsV2Command({
      Bucket: bucket,
      Prefix: 'runs/',
      ContinuationToken: continuationToken,
    }),
  )

  for (const obj of listRes.Contents ?? []) {
    const key = obj.Key
    if (!key?.endsWith('/manifest.json')) continue

    try {
      const res = await client.send(
        new GetObjectCommand({ Bucket: bucket, Key: key }),
      )
      const body = await res.Body?.transformToString()
      // NOTE(review): the parsed JSON is pushed without schema validation.
      if (body) manifests.push(JSON.parse(body))
    } catch (err: unknown) {
      // Best effort: one unreadable or malformed manifest must not abort the report,
      // but the reason is logged so the failure can be diagnosed.
      const reason = err instanceof Error ? err.message : String(err)
      console.warn(` Failed to read ${key}, skipping: ${reason}`)
    }
  }

  continuationToken = listRes.NextContinuationToken
} while (continuationToken)

console.log(`Found ${manifests.length} runs`)

if (manifests.length === 0) {
  console.log('No runs found. Nothing to report.')
  process.exit(0)
}

// Step 2: Build run summaries
const runs: RunSummary[] = buildRunSummaries(manifests)

// Step 3: Identify unique config groups
const configGroups = [...new Set(runs.map((r) => r.configName))]
const defaultConfig = configGroups.includes('ci-weekly')
  ? 'ci-weekly'
  : configGroups[0]

// Step 4: Generate HTML report
// NOTE(review): the template text below is kept verbatim. As it reads here, the
// row cells after the header line reference `r` and `archLabel` outside the
// `.map()` callback, the callback itself returns an empty string, and
// `viewerUrl` / `defaultConfig` are never used. Presumably the surrounding
// markup was lost; confirm against the real template before relying on it.
const html = ` BrowserOS Eval Dashboard

All Runs

${runs
  .slice()
  .reverse()
  .map((r) => {
    const viewerUrl = `viewer.html?run=${encodeURIComponent(r.runId)}`
    const archLabel =
      r.agentType === 'orchestrator-executor'
        ? 'Orch-Exec'
        : r.agentType === 'single'
          ? 'Tool Loop'
          : r.agentType || '—'
    return ``
  })
  .join('\n')}
Date Config Model Dataset Architecture Score Tasks Timeout Avg Duration View
${escHtml(r.date)} ${escHtml(r.runId)} ${escHtml(r.model)} ${escHtml(r.dataset)} ${escHtml(archLabel)} ${r.avgScore.toFixed(1)}% ${r.total} ${r.timeout} ${(r.avgDurationMs / 1000).toFixed(0)}s View →
`

// Step 5: Save locally and upload to R2
const localPath = process.argv[2] || '/tmp/eval-report.html'
await writeFile(localPath, html)
console.log(`Report saved locally: ${localPath}`)

await client.send(
  new PutObjectCommand({
    Bucket: bucket,
    Key: 'report.html',
    Body: html,
    // The report contains non-ASCII characters and the string body is sent as
    // UTF-8, so declare the charset explicitly instead of leaving it to sniffing.
    ContentType: 'text/html; charset=utf-8',
    CacheControl: 'public, max-age=300',
  }),
)

// Strip trailing slashes so the joined URL never contains `//report.html`.
const cdnBaseUrl = (
  process.env.EVAL_R2_CDN_BASE_URL || 'https://eval.browseros.com'
).replace(/\/+$/, '')
console.log(`Report uploaded to R2: ${bucket}/report.html`)
console.log(` View at: ${cdnBaseUrl}/report.html`)

// Print summary: the last ten entries of `runs`, one bar per run.
console.log('\nScore trend:')
for (const run of runs.slice(-10)) {
  const bar = scoreBar(run.avgScore)
  const pct = run.avgScore.toFixed(0).padStart(3)
  console.log(` ${run.date} ${pct}% ${bar}`)
}