mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 23:53:25 +00:00
* feat(eval): weekly eval pipeline with R2 uploads and trend dashboard
Add infrastructure for running weekly evaluations and tracking score
trends over time:
- Auto-generated output dirs: results/{config-name}/{timestamp}/
Each eval run gets its own timestamped folder, nothing is overwritten.
- upload-run.ts: uploads eval results to Cloudflare R2. Supports
uploading a specific run or all un-uploaded runs for a config.
- weekly-report.ts: generates an interactive HTML dashboard from R2
data. Config dropdown, trend chart with hover tooltips, searchable
runs table. Groups runs by config name.
- viewer.html: client-facing 3-column run viewer (task list,
screenshots with autoplay, agent stream with messages.jsonl).
Shows performance grader axis breakdown with per-axis scores.
- browseros-agent-weekly.json: weekly benchmark config (kimi-k2p5,
webbench-2of4-50, 10 workers, performance grader, headless).
- eval-weekly.yml: GitHub Actions workflow with cron (Saturday 6am)
and manual trigger. Runs on self-hosted Mac Studio runner.
Concurrency group ensures only one eval runs at a time.
- Dashboard updates: load previous runs, messages.jsonl viewer,
grade badges show percentages, async stream loading.
- Grader updates: timeout 30min, max turns 100, DOM content
verification guidance for performance grader.
* fix(eval): address Greptile review — injection, nested dirs, escaping
- Fix script injection in eval-weekly.yml: pass github.event.inputs
through env var instead of interpolating into shell
- Fix /api/runs to enumerate nested results/{config}/{timestamp}/ dirs
- Fix /api/load-run to allow single-slash run names (config/timestamp)
- Add HTML escaping for R2-sourced values in weekly-report.ts
- Escape axis names in viewer.html renderAxesBreakdown
* fix(eval): fix biome lint — non-null assertion, template literals
* fix(eval): fix biome errors — replace var with let, fix inner function declaration
* fix(eval): address Greptile P2 issues
- isRunDir: check all subdirs for metadata.json, not just first 3
- eval-runner: guard configPath for dashboard-driven runs (fallback to 'eval')
- load-run: default unknown termination_reason to 'failed' not 'completed'
* feat(eval): make BROWSEROS_BINARY configurable via env var
350 lines
9.3 KiB
TypeScript
Vendored
350 lines
9.3 KiB
TypeScript
Vendored
/**
 * Upload eval runs to R2.
 *
 * Two modes:
 *   bun scripts/upload-run.ts results/browseros-agent-weekly/2026-03-21-1730
 *     → uploads that specific run
 *
 *   bun scripts/upload-run.ts results/browseros-agent-weekly
 *     → finds all timestamped subfolders, uploads any not yet in R2
 *
 * Env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY
 *           EVAL_R2_BUCKET (default: browseros-eval)
 *           EVAL_R2_CDN_BASE_URL (default: https://eval.browseros.com)
 */
|
|
|
|
import { readdir, readFile, stat } from 'node:fs/promises'
import { basename, dirname, extname, join, resolve } from 'node:path'
import {
  GetObjectCommand,
  PutObjectCommand,
  S3Client,
} from '@aws-sdk/client-s3'
|
|
|
|
/** Number of file uploads that are allowed to be in flight at once. */
const CONCURRENCY = 20

/**
 * Content-Type header value per file extension (leading dot included).
 * Extensions that are not listed here are uploaded by the caller as
 * `application/octet-stream`.
 */
const CONTENT_TYPES: Record<string, string> = {
  '.json': 'application/json',
  '.jsonl': 'application/x-ndjson',
  '.png': 'image/png',
}
|
|
|
|
/** Connection settings for the R2 bucket that stores eval runs. */
interface R2Config {
  /** Cloudflare account id; forms the host name of the S3-compatible endpoint. */
  accountId: string
  /** Access key id of the R2 credentials. */
  accessKeyId: string
  /** Secret access key of the R2 credentials. */
  secretAccessKey: string
  /** Name of the bucket that receives the uploaded objects. */
  bucket: string
  /** Public base URL (no trailing slash) used when printing the viewer link. */
  cdnBaseUrl: string
}
|
|
|
|
/**
 * Builds the R2 settings from the process environment.
 *
 * The three credential variables are mandatory: when any of them is missing
 * or empty an error is printed and the process exits with status 1. The
 * bucket name and CDN base URL fall back to built-in defaults, and trailing
 * slashes are stripped from the CDN base URL.
 *
 * @returns The resolved R2 configuration.
 */
function loadConfig(): R2Config {
  const {
    EVAL_R2_ACCOUNT_ID: accountId,
    EVAL_R2_ACCESS_KEY_ID: accessKeyId,
    EVAL_R2_SECRET_ACCESS_KEY: secretAccessKey,
  } = process.env

  if (!(accountId && accessKeyId && secretAccessKey)) {
    console.error(
      'Missing required env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY',
    )
    process.exit(1)
  }

  const bucket = process.env.EVAL_R2_BUCKET || 'browseros-eval'
  const rawCdnBaseUrl =
    process.env.EVAL_R2_CDN_BASE_URL || 'https://eval.browseros.com'
  const cdnBaseUrl = rawCdnBaseUrl.replace(/\/+$/, '')

  return { accountId, accessKeyId, secretAccessKey, bucket, cdnBaseUrl }
}
|
|
|
|
/**
 * Creates an S3 client that talks to the account's R2 endpoint.
 *
 * @param config - Settings that supply the account id and the credentials.
 * @returns A client configured with region `auto` and the R2 endpoint URL.
 */
function createClient(config: R2Config): S3Client {
  const { accountId, accessKeyId, secretAccessKey } = config
  const endpoint = `https://${accountId}.r2.cloudflarestorage.com`
  const credentials = { accessKeyId, secretAccessKey }

  return new S3Client({ region: 'auto', endpoint, credentials })
}
|
|
|
|
/**
 * Writes a single object to the bucket.
 *
 * @param client - S3 client used to send the request.
 * @param bucket - Destination bucket name.
 * @param key - Object key inside the bucket.
 * @param body - Object contents.
 * @param contentType - Value stored as the object's Content-Type.
 */
async function upload(
  client: S3Client,
  bucket: string,
  key: string,
  body: Buffer,
  contentType: string,
) {
  const putRequest = new PutObjectCommand({
    Bucket: bucket,
    Key: key,
    Body: body,
    ContentType: contentType,
  })
  await client.send(putRequest)
}
|
|
|
|
/**
 * Lists every non-directory entry below `dir`, descending into
 * sub-directories.
 *
 * @param dir - Directory to walk.
 * @returns Paths built by joining `dir` with each entry name, in the order
 *   the directory listing yields them (a directory's contents appear at the
 *   position of that directory).
 */
async function collectFiles(dir: string): Promise<string[]> {
  const found: string[] = []

  // Depth-first walk that awaits each sub-directory before moving on, so the
  // listing stays sequential and ordered.
  const walk = async (current: string): Promise<void> => {
    for (const entry of await readdir(current, { withFileTypes: true })) {
      const entryPath = join(current, entry.name)
      if (entry.isDirectory()) {
        await walk(entryPath)
        continue
      }
      found.push(entryPath)
    }
  }

  await walk(dir)
  return found
}
|
|
|
|
/**
 * Runs `fn` over every item with a bounded number of calls in flight.
 *
 * Items are handed out in array order. After the first failure no further
 * items are started (calls already in flight are left to finish) and the
 * returned promise rejects with that first error.
 *
 * @param items - Work items to process.
 * @param concurrency - Maximum number of simultaneous `fn` calls. Values that
 *   are not a positive number (0, negative, NaN) are treated as 1 so that
 *   work is never silently dropped; fractions are rounded down.
 * @param fn - Async task executed once per item.
 */
async function runPool<T>(
  items: T[],
  concurrency: number,
  fn: (item: T) => Promise<void>,
): Promise<void> {
  if (items.length === 0) return

  // Never spawn more workers than there are items, and always at least one.
  const workerCount = Math.min(
    items.length,
    Math.max(1, Math.floor(concurrency) || 1),
  )

  let next = 0
  let failed = false

  const worker = async (): Promise<void> => {
    while (!failed && next < items.length) {
      // The index was bounds-checked by the loop condition just above.
      const item = items[next++] as T
      try {
        await fn(item)
      } catch (err) {
        // Tell the sibling workers to stop picking up new items.
        failed = true
        throw err
      }
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()))
}
|
|
|
|
// True when `err` looks like the "object does not exist" response of an
// S3-compatible store (error name NoSuchKey/NotFound, or HTTP status 404).
function isMissingObjectError(err: unknown): boolean {
  if (typeof err !== 'object' || err === null) return false
  const { name, $metadata } = err as {
    name?: unknown
    $metadata?: { httpStatusCode?: unknown }
  }
  return (
    name === 'NoSuchKey' ||
    name === 'NotFound' ||
    $metadata?.httpStatusCode === 404
  )
}

/**
 * Checks whether a run has already been uploaded to R2, using the presence
 * of its `runs/{runId}/manifest.json` object as the marker.
 *
 * @param client - S3 client used for the lookup.
 * @param bucket - Bucket to look in.
 * @param runId - Run identifier that forms the object key.
 * @returns `true` when the manifest exists, `false` when the store reports
 *   it as missing.
 * @throws Any other failure (credentials, network, wrong bucket) is rethrown
 *   instead of being reported as "not uploaded", so a misconfiguration does
 *   not trigger a blind re-upload of every run.
 */
async function isUploaded(
  client: S3Client,
  bucket: string,
  runId: string,
): Promise<boolean> {
  try {
    const response = await client.send(
      new GetObjectCommand({
        Bucket: bucket,
        Key: `runs/${runId}/manifest.json`,
      }),
    )
    // Consume the (small) manifest body so the underlying connection is
    // released instead of being held by an unread stream.
    await response.Body?.transformToByteArray()
    return true
  } catch (err) {
    if (isMissingObjectError(err)) return false
    throw err
  }
}
|
|
|
|
/**
 * Tells a run directory apart from a config directory.
 *
 * A run directory has at least one immediate sub-directory that contains a
 * regular file named `metadata.json`; a config directory holds timestamped
 * run folders (e.g. `2026-03-21-1730/`) instead.
 *
 * @param dir - Directory to inspect.
 * @returns `true` as soon as one sub-directory with a `metadata.json` file
 *   is found, otherwise `false`.
 */
async function isRunDir(dir: string): Promise<boolean> {
  for (const entry of await readdir(dir, { withFileTypes: true })) {
    if (!entry.isDirectory()) continue

    try {
      const info = await stat(join(dir, entry.name, 'metadata.json'))
      if (info.isFile()) return true
    } catch {
      // No usable metadata.json in this sub-directory; keep looking.
    }
  }

  return false
}
|
|
|
|
// Reads `path` and returns its content when it is a JSON object. A missing
// file is an expected case and stays silent; an unreadable file, invalid JSON
// or a non-object value is reported with a warning. All of these yield
// undefined.
async function readJsonObject(
  path: string,
): Promise<Record<string, unknown> | undefined> {
  let raw: string
  try {
    raw = await readFile(path, 'utf-8')
  } catch (err) {
    const code = (err as { code?: unknown } | null)?.code
    if (code !== 'ENOENT') {
      console.warn(` Could not read ${path}: ${String(code ?? err)}`)
    }
    return undefined
  }

  let parsed: unknown
  try {
    parsed = JSON.parse(raw)
  } catch {
    console.warn(` Ignoring ${path}: invalid JSON`)
    return undefined
  }

  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    console.warn(` Ignoring ${path}: expected a JSON object`)
    return undefined
  }
  return parsed as Record<string, unknown>
}

/**
 * Uploads one eval run to R2.
 *
 * Every sub-directory of `runDir` that has a usable `metadata.json` is
 * treated as a task: all files below it are uploaded under
 * `runs/{runId}/{taskId}/…` and the task is listed in the manifest. The
 * manifest (`runs/{runId}/manifest.json`) is written only after all task
 * files have been uploaded, followed by `viewer.html` at the bucket root.
 * Nothing is uploaded when the run has no task sub-directories or none with
 * usable metadata.
 *
 * @param runDir - Local directory of the run.
 * @param runId - Identifier used in object keys and in the viewer URL.
 * @param r2Config - Bucket name and CDN base URL.
 * @param client - S3 client used for all uploads.
 */
async function uploadSingleRun(
  runDir: string,
  runId: string,
  r2Config: R2Config,
  client: S3Client,
): Promise<void> {
  const taskDirs = await readdir(runDir, { withFileTypes: true })
  const taskEntries = taskDirs.filter((d) => d.isDirectory())

  if (taskEntries.length === 0) {
    console.warn(` No task subdirectories in ${runId}, skipping`)
    return
  }

  const manifestTasks: Record<string, unknown>[] = []
  const jobs: { key: string; filePath: string; contentType: string }[] = []

  // Agent config and dataset are taken from the first task that provides them.
  let agentConfig: Record<string, unknown> | undefined
  let dataset: string | undefined

  for (const taskDir of taskEntries) {
    const taskId = taskDir.name
    const taskPath = join(runDir, taskId)

    // Sub-directories without usable metadata are not tasks; leave them out.
    const meta = await readJsonObject(join(taskPath, 'metadata.json'))
    if (!meta) continue

    if (!agentConfig && meta.agent_config)
      agentConfig = meta.agent_config as Record<string, unknown>
    if (!dataset && meta.dataset) dataset = meta.dataset as string

    const files = await collectFiles(taskPath)
    let screenshotCount = 0

    for (const file of files) {
      // Path relative to the task directory (strip "<taskPath>/").
      const relative = file.slice(taskPath.length + 1)
      const ext = extname(file)
      if (relative.startsWith('screenshots/') && ext === '.png')
        screenshotCount++

      jobs.push({
        key: `runs/${runId}/${taskId}/${relative}`,
        filePath: file,
        contentType: CONTENT_TYPES[ext] || 'application/octet-stream',
      })
    }

    manifestTasks.push({
      queryId: meta.query_id || taskId,
      query: meta.query || '',
      startUrl: meta.start_url || '',
      status: meta.termination_reason || 'unknown',
      durationMs: meta.total_duration_ms || 0,
      // Prefer the recorded count; fall back to the PNGs found on disk.
      screenshotCount: (meta.screenshot_count as number) || screenshotCount,
      graderResults: meta.grader_results || {},
    })
  }

  if (manifestTasks.length === 0) {
    console.warn(` No completed tasks in ${runId}, skipping`)
    return
  }

  console.log(
    ` Uploading ${jobs.length} files across ${manifestTasks.length} tasks...`,
  )

  let uploaded = 0
  await runPool(jobs, CONCURRENCY, async (job) => {
    const body = await readFile(job.filePath)
    await upload(client, r2Config.bucket, job.key, body, job.contentType)
    uploaded++
    if (uploaded % 50 === 0 || uploaded === jobs.length) {
      console.log(` ${uploaded}/${jobs.length}`)
    }
  })

  // summary.json is optional.
  const summaryData = await readJsonObject(join(runDir, 'summary.json'))

  // The manifest goes last: its presence is what marks the run as uploaded.
  const manifest = {
    runId,
    uploadedAt: new Date().toISOString(),
    agentConfig,
    dataset,
    summary: summaryData
      ? {
          passRate: summaryData.passRate,
          avgDurationMs: summaryData.avgDurationMs,
        }
      : undefined,
    tasks: manifestTasks,
  }
  const manifestBody = Buffer.from(JSON.stringify(manifest, null, 2))
  await upload(
    client,
    r2Config.bucket,
    `runs/${runId}/manifest.json`,
    manifestBody,
    'application/json',
  )

  // Upload viewer.html to bucket root
  const viewerPath = join(
    import.meta.dir,
    '..',
    'src',
    'dashboard',
    'viewer.html',
  )
  const viewerBody = await readFile(viewerPath)
  await upload(client, r2Config.bucket, 'viewer.html', viewerBody, 'text/html')

  // +2 accounts for manifest.json and viewer.html.
  console.log(` Uploaded ${uploaded + 2} files`)
  console.log(` ${r2Config.cdnBaseUrl}/viewer.html?run=${runId}`)
}
|
|
|
|
/**
 * CLI entry point.
 *
 * Takes one argument: either a run directory (uploaded directly) or a config
 * directory, in which case every run sub-directory that is not yet in R2 is
 * uploaded. Prints usage and exits with status 1 when the argument is
 * missing, is not a directory, or is a config directory with no
 * sub-directories.
 */
async function main() {
  const inputArg = process.argv[2]
  if (!inputArg) {
    console.error(
      'Usage:\n' +
        ' bun scripts/upload-run.ts results/config-name/2026-03-21-1730 (specific run)\n' +
        ' bun scripts/upload-run.ts results/config-name (all un-uploaded runs)',
    )
    process.exit(1)
  }

  // Resolve first so that inputs such as "." or ".." still yield the real
  // config and timestamp folder names when the run id is derived below.
  const inputDir = resolve(inputArg)

  const dirStat = await stat(inputDir).catch(() => null)
  if (!dirStat?.isDirectory()) {
    console.error(`Not a directory: ${inputArg}`)
    process.exit(1)
  }

  const r2Config = loadConfig()
  const client = createClient(r2Config)

  if (await isRunDir(inputDir)) {
    // Single run: results/config-name/2026-03-21-1730
    const timestamp = basename(inputDir)
    const configName = basename(dirname(inputDir))
    const runId = `${configName}-${timestamp}`
    console.log(`Uploading run: ${runId}`)
    await uploadSingleRun(inputDir, runId, r2Config, client)
    return
  }

  // Config dir: results/config-name/ — upload all un-uploaded runs
  const configName = basename(inputDir)
  const entries = await readdir(inputDir, { withFileTypes: true })
  const runDirs = entries
    .filter((e) => e.isDirectory())
    .map((e) => e.name)
    .sort()

  if (runDirs.length === 0) {
    console.error('No run subdirectories found')
    process.exit(1)
  }

  console.log(
    `Found ${runDirs.length} runs for config "${configName}", checking R2...`,
  )

  // Each outcome gets its own counter so the summary stays accurate even
  // when some sub-directories hold no results.
  let uploadedCount = 0
  let alreadyUploadedCount = 0
  let skippedCount = 0

  for (const dir of runDirs) {
    const runId = `${configName}-${dir}`
    const runPath = join(inputDir, dir)

    // Local check first: a folder without any task metadata has nothing to
    // upload and must not be counted as a new upload.
    if (!(await isRunDir(runPath))) {
      console.warn(` ${runId}: no task results found, skipping`)
      skippedCount++
      continue
    }

    if (await isUploaded(client, r2Config.bucket, runId)) {
      console.log(` ${runId}: already uploaded, skipping`)
      alreadyUploadedCount++
      continue
    }

    console.log(` ${runId}: uploading...`)
    await uploadSingleRun(runPath, runId, r2Config, client)
    uploadedCount++
  }

  const skippedNote =
    skippedCount > 0 ? `, ${skippedCount} skipped (no task results)` : ''
  console.log(
    `\nDone. Uploaded ${uploadedCount} new run(s), ${alreadyUploadedCount} already in R2${skippedNote}.`,
  )
}
|
|
|
|
// Handle a rejected run explicitly instead of leaving a floating promise:
// report the failure and exit non-zero so the calling workflow sees it.
main().catch((error: unknown) => {
  console.error('Upload failed:', error)
  process.exit(1)
})
|