mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-14 08:03:58 +00:00
Compare commits
7 Commits
fix/browse
...
fix/eval-v
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d631c7dd06 | ||
|
|
3c706ec3fc | ||
|
|
8ec5c147ac | ||
|
|
2ac3fbb3b1 | ||
|
|
241755b3ec | ||
|
|
8af80e2fff | ||
|
|
ac654e443a |
32
packages/browseros-agent/apps/eval/README.md
vendored
32
packages/browseros-agent/apps/eval/README.md
vendored
@@ -181,6 +181,8 @@ export EVAL_R2_BUCKET=browseros-eval
|
||||
export EVAL_R2_CDN_BASE_URL=https://eval.browseros.com
|
||||
```
|
||||
|
||||
`EVAL_R2_CDN_BASE_URL` must be a public R2 custom domain, `r2.dev` URL, or Worker URL. Do not set it to the private `*.r2.cloudflarestorage.com` S3 API endpoint.
|
||||
|
||||
Published runs are available at `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
|
||||
|
||||
### BrowserOS infrastructure
|
||||
@@ -253,7 +255,35 @@ results/
|
||||
summary.json # Aggregate pass rates
|
||||
```
|
||||
|
||||
R2 publishing preserves the same task files under `runs/<run-id>/...`, writes `runs/<run-id>/manifest.json`, and uploads `viewer.html` at the bucket root. The viewer URL is `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
|
||||
R2 publishing preserves the task files under `runs/<run-id>/...`, writes `runs/<run-id>/manifest.json`, and uploads `viewer.html` at the bucket root. The viewer URL is `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
|
||||
|
||||
### R2 viewer manifest
|
||||
|
||||
`runs/<run-id>/manifest.json` is the source of truth for the public viewer. New manifests include `schemaVersion: 2` and each task includes explicit artifact paths:
|
||||
|
||||
```json
|
||||
{
|
||||
"schemaVersion": 2,
|
||||
"runId": "agisdk-real-smoke-2026-04-30-0000",
|
||||
"tasks": [
|
||||
{
|
||||
"queryId": "agisdk-dashdish-10",
|
||||
"paths": {
|
||||
"attempt": "tasks/agisdk-dashdish-10/attempt.json",
|
||||
"metadata": "tasks/agisdk-dashdish-10/metadata.json",
|
||||
"messages": "tasks/agisdk-dashdish-10/messages.jsonl",
|
||||
"grades": "tasks/agisdk-dashdish-10/grades.json",
|
||||
"trace": "tasks/agisdk-dashdish-10/trace.jsonl",
|
||||
"screenshots": "tasks/agisdk-dashdish-10/screenshots",
|
||||
"graderArtifacts": "tasks/agisdk-dashdish-10/grader-artifacts"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The static viewer uses `task.paths` when present. Older uploaded runs without `schemaVersion` or `task.paths` still work through the legacy inferred layout: `runs/<run-id>/<task-id>/metadata.json`, `messages.jsonl`, and `screenshots/<n>.png`.
|
||||
|
||||
Manifest paths are stable artifact locations, not a guarantee that every optional artifact exists for every task. For example, `attempt.json`, `trace.jsonl`, or grader artifact directories may be absent when that artifact was not produced by the run.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
|
||||
@@ -24,45 +24,11 @@ import {
|
||||
PutObjectCommand,
|
||||
S3Client,
|
||||
} from '@aws-sdk/client-s3'
|
||||
|
||||
interface ManifestTask {
|
||||
queryId: string
|
||||
query: string
|
||||
status: string
|
||||
durationMs: number
|
||||
screenshotCount: number
|
||||
graderResults: Record<string, { pass: boolean; score: number }>
|
||||
}
|
||||
|
||||
interface Manifest {
|
||||
runId: string
|
||||
uploadedAt: string
|
||||
agentConfig?: { type?: string; model?: string }
|
||||
dataset?: string
|
||||
summary?: { passRate?: number; avgDurationMs?: number }
|
||||
tasks: ManifestTask[]
|
||||
}
|
||||
|
||||
interface RunSummary {
|
||||
runId: string
|
||||
configName: string
|
||||
date: string
|
||||
avgScore: number
|
||||
total: number
|
||||
completed: number
|
||||
failed: number
|
||||
timeout: number
|
||||
avgDurationMs: number
|
||||
model: string
|
||||
dataset: string
|
||||
agentType: string
|
||||
}
|
||||
|
||||
const PASS_FAIL_GRADER_ORDER = [
|
||||
'agisdk_state_diff',
|
||||
'infinity_state',
|
||||
'performance_grader',
|
||||
]
|
||||
import {
|
||||
buildRunSummaries,
|
||||
type ReportManifest,
|
||||
type RunSummary,
|
||||
} from '../src/reporting/run-summary'
|
||||
|
||||
function requireEnv(name: string): string {
|
||||
const value = process.env[name]
|
||||
@@ -87,7 +53,7 @@ const client = new S3Client({
|
||||
// Step 1: List all manifest.json files in runs/
|
||||
console.log('Scanning R2 for eval runs...')
|
||||
|
||||
const manifests: Manifest[] = []
|
||||
const manifests: ReportManifest[] = []
|
||||
let continuationToken: string | undefined
|
||||
|
||||
do {
|
||||
@@ -127,64 +93,9 @@ if (manifests.length === 0) {
|
||||
}
|
||||
|
||||
// Step 2: Build run summaries
|
||||
const runs: RunSummary[] = manifests
|
||||
.map((m) => {
|
||||
const total = m.tasks.length
|
||||
const completed = m.tasks.filter((t) => t.status === 'completed').length
|
||||
const failed = m.tasks.filter((t) => t.status === 'failed').length
|
||||
const timeout = m.tasks.filter((t) => t.status === 'timeout').length
|
||||
|
||||
let scoredCount = 0
|
||||
let scoreSum = 0
|
||||
for (const task of m.tasks) {
|
||||
if (!task.graderResults) continue
|
||||
for (const name of PASS_FAIL_GRADER_ORDER) {
|
||||
if (task.graderResults[name]) {
|
||||
scoredCount++
|
||||
scoreSum += task.graderResults[name].score ?? 0
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const avgScore = scoredCount > 0 ? (scoreSum / scoredCount) * 100 : 0
|
||||
const durations = m.tasks
|
||||
.filter((t) => t.durationMs > 0)
|
||||
.map((t) => t.durationMs)
|
||||
const avgDurationMs =
|
||||
durations.length > 0
|
||||
? durations.reduce((a, b) => a + b, 0) / durations.length
|
||||
: 0
|
||||
|
||||
const date = m.uploadedAt
|
||||
? `${m.uploadedAt.split('T')[0]} ${m.uploadedAt.split('T')[1]?.slice(0, 5) || ''}`
|
||||
: m.runId.slice(0, 15)
|
||||
|
||||
const model = m.agentConfig?.model || 'unknown'
|
||||
const dataset = m.dataset || m.runId
|
||||
const agentType = m.agentConfig?.type || 'unknown'
|
||||
|
||||
const configName = extractConfigName(m.runId)
|
||||
return {
|
||||
runId: m.runId,
|
||||
configName,
|
||||
date,
|
||||
avgScore,
|
||||
total,
|
||||
completed,
|
||||
failed,
|
||||
timeout,
|
||||
avgDurationMs,
|
||||
model,
|
||||
dataset,
|
||||
agentType,
|
||||
}
|
||||
})
|
||||
.sort((a, b) => a.date.localeCompare(b.date))
|
||||
const runs: RunSummary[] = buildRunSummaries(manifests)
|
||||
|
||||
// Step 3: Identify unique config groups
|
||||
// runId can be "ci-weekly" (old) or "ci-weekly-2026-03-21-1730" (timestamped)
|
||||
// Extract config name by stripping the date-time suffix pattern
|
||||
function escHtml(s: string): string {
|
||||
return s
|
||||
.replace(/&/g, '&')
|
||||
@@ -193,12 +104,6 @@ function escHtml(s: string): string {
|
||||
.replace(/"/g, '"')
|
||||
}
|
||||
|
||||
function extractConfigName(runId: string): string {
|
||||
// "browseros-agent-weekly-2026-03-21-1730" → "browseros-agent-weekly"
|
||||
// "ci-weekly" → "ci-weekly" (no timestamp, old format)
|
||||
return runId.replace(/-\d{4}-\d{2}-\d{2}-\d{4}$/, '')
|
||||
}
|
||||
|
||||
const configGroups = [...new Set(runs.map((r) => r.configName))]
|
||||
const defaultConfig = configGroups.includes('ci-weekly')
|
||||
? 'ci-weekly'
|
||||
|
||||
@@ -685,6 +685,59 @@
|
||||
});
|
||||
}
|
||||
|
||||
// Test harness note: these ASCII section markers are used by r2-viewer-compat.test.ts.
|
||||
// -- Artifact path resolution
|
||||
/**
 * Returns the id used to locate a task's artifacts in the legacy layout:
 * `queryId` when set, otherwise `id`, otherwise the literal 'unknown-task'.
 */
function taskKey(task) {
  return task.queryId || task.id || 'unknown-task';
}

/**
 * Infers an artifact location, relative to the run directory, for manifests
 * that carry no explicit `task.paths`. Known artifact names map to fixed file
 * or directory names under `<task id>/`; any other name is appended verbatim.
 */
function legacyArtifactPath(task, artifact) {
  const id = taskKey(task);
  switch (artifact) {
    case 'attempt':
      return `${id}/attempt.json`;
    case 'metadata':
      return `${id}/metadata.json`;
    case 'messages':
      return `${id}/messages.jsonl`;
    case 'trace':
      return `${id}/trace.jsonl`;
    case 'grades':
      return `${id}/grades.json`;
    case 'screenshots':
      return `${id}/screenshots`;
    case 'graderArtifacts':
      return `${id}/grader-artifacts`;
    default:
      return `${id}/${artifact}`;
  }
}

/**
 * Resolves an artifact location relative to the run directory.
 *
 * Uses `task.paths[artifact]` when it is a usable string, otherwise falls back
 * to the legacy inferred layout.
 */
function artifactPath(task, artifact) {
  const manifestPath = task.paths && task.paths[artifact];
  if (typeof manifestPath === 'string') {
    // Trim slashes at both ends: a leading slash would double up after
    // basePath, and a trailing slash on a directory entry would produce
    // "screenshots//1.png" once a file name is appended.
    const normalized = manifestPath.replace(/^\/+/, '').replace(/\/+$/, '');
    // A path made only of slashes carries no location; use the legacy layout.
    if (normalized.length > 0) {
      return normalized;
    }
  }
  return legacyArtifactPath(task, artifact);
}

/** Returns the artifact location prefixed with the run's `basePath`. */
function artifactUrl(task, artifact) {
  return `${basePath}/${artifactPath(task, artifact)}`;
}

/** URL of the task's metadata artifact. */
function metadataUrl(task) {
  return artifactUrl(task, 'metadata');
}

/** URL of the task's messages artifact. */
function messagesUrl(task) {
  return artifactUrl(task, 'messages');
}

/** URL of screenshot number `n` inside the task's screenshots directory. */
function screenshotUrl(task, n) {
  return `${artifactUrl(task, 'screenshots')}/${n}.png`;
}
|
||||
|
||||
// -- Task selection
|
||||
// ── Task selection ─────────────────────────────────────────────
|
||||
function selectTask(task) {
|
||||
stopAutoplay();
|
||||
@@ -716,6 +769,7 @@
|
||||
}
|
||||
}
|
||||
|
||||
// -- Center panel
|
||||
// ── Center panel: screenshot viewer ────────────────────────────
|
||||
function renderCenterPanel(task) {
|
||||
const panel = document.getElementById('center-panel');
|
||||
@@ -763,10 +817,6 @@
|
||||
updateControls();
|
||||
}
|
||||
|
||||
function screenshotUrl(task, n) {
|
||||
return `${basePath}/${task.queryId || task.id}/screenshots/${n}.png`;
|
||||
}
|
||||
|
||||
function goToStep(n) {
|
||||
if (!selectedTask || n < 1 || n > totalSteps) return;
|
||||
currentStep = n;
|
||||
@@ -914,7 +964,7 @@
|
||||
body.innerHTML = '<div class="placeholder"><div class="ph-text" style="color: #6e7681;">Loading messages...</div></div>';
|
||||
countEl.textContent = '';
|
||||
|
||||
const msgUrl = `${basePath}/${task.queryId || task.id}/messages.jsonl`;
|
||||
const msgUrl = messagesUrl(task);
|
||||
|
||||
fetch(msgUrl)
|
||||
.then((res) => {
|
||||
@@ -1075,7 +1125,7 @@
|
||||
|
||||
// ── Load task metadata for rich grader details ──────────────────
|
||||
function loadTaskMetadata(task) {
|
||||
const metaUrl = `${basePath}/${task.queryId || task.id}/metadata.json`;
|
||||
const metaUrl = metadataUrl(task);
|
||||
fetch(metaUrl)
|
||||
.then((res) => res.ok ? res.json() : null)
|
||||
.then((meta) => {
|
||||
|
||||
@@ -1,3 +1,8 @@
|
||||
import type {
|
||||
ViewerManifest,
|
||||
ViewerManifestTask,
|
||||
} from '../viewer/viewer-manifest'
|
||||
|
||||
export interface R2UploadConfig {
|
||||
accountId: string
|
||||
accessKeyId: string
|
||||
@@ -6,27 +11,9 @@ export interface R2UploadConfig {
|
||||
cdnBaseUrl: string
|
||||
}
|
||||
|
||||
export interface R2ManifestTask {
|
||||
queryId: string
|
||||
query: string
|
||||
startUrl: string
|
||||
status: string
|
||||
durationMs: number
|
||||
screenshotCount: number
|
||||
graderResults: Record<string, unknown>
|
||||
}
|
||||
export type R2ManifestTask = ViewerManifestTask
|
||||
|
||||
export interface R2RunManifest {
|
||||
runId: string
|
||||
uploadedAt: string
|
||||
agentConfig?: Record<string, unknown>
|
||||
dataset?: string
|
||||
summary?: {
|
||||
passRate?: unknown
|
||||
avgDurationMs?: unknown
|
||||
}
|
||||
tasks: R2ManifestTask[]
|
||||
}
|
||||
export type R2RunManifest = ViewerManifest
|
||||
|
||||
export interface R2PublishRunResult {
|
||||
runId: string
|
||||
|
||||
@@ -5,8 +5,11 @@ import {
|
||||
PutObjectCommand,
|
||||
S3Client,
|
||||
} from '@aws-sdk/client-s3'
|
||||
import {
|
||||
buildViewerManifest,
|
||||
type ViewerManifestTaskInput,
|
||||
} from '../viewer/viewer-manifest'
|
||||
import type {
|
||||
R2ManifestTask,
|
||||
R2PublishPathResult,
|
||||
R2PublishRunResult,
|
||||
R2RunManifest,
|
||||
@@ -43,7 +46,6 @@ interface UploadJob {
|
||||
interface TaskDirEntry {
|
||||
taskId: string
|
||||
taskPath: string
|
||||
canonicalLayout: boolean
|
||||
}
|
||||
|
||||
export function contentTypeForPath(filePath: string): string {
|
||||
@@ -129,7 +131,6 @@ async function findTaskDirs(runDir: string): Promise<TaskDirEntry[]> {
|
||||
legacyTasks.push({
|
||||
taskId: entry.name,
|
||||
taskPath,
|
||||
canonicalLayout: false,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -146,7 +147,6 @@ async function findTaskDirs(runDir: string): Promise<TaskDirEntry[]> {
|
||||
canonicalTasks.push({
|
||||
taskId: entry.name,
|
||||
taskPath,
|
||||
canonicalLayout: true,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -262,7 +262,7 @@ export class R2Publisher {
|
||||
throw new Error(`No task subdirectories in ${runId}`)
|
||||
}
|
||||
|
||||
const manifestTasks: R2ManifestTask[] = []
|
||||
const manifestTasks: ViewerManifestTaskInput[] = []
|
||||
const jobs: UploadJob[] = (await collectRunRootFiles(runDir)).map(
|
||||
(job) => ({
|
||||
...job,
|
||||
@@ -289,22 +289,23 @@ export class R2Publisher {
|
||||
if (relative.startsWith('screenshots/') && extname(file) === '.png') {
|
||||
screenshotCount++
|
||||
}
|
||||
// Keep legacy keys during the manifest v2 rollout so cached viewers and
|
||||
// old manifests can still resolve task artifacts.
|
||||
jobs.push({
|
||||
key: `runs/${runId}/${taskId}/${relative}`,
|
||||
filePath: file,
|
||||
contentType: contentTypeForPath(file),
|
||||
})
|
||||
if (taskDirEntry.canonicalLayout) {
|
||||
jobs.push({
|
||||
key: `runs/${runId}/tasks/${taskId}/${relative}`,
|
||||
filePath: file,
|
||||
contentType: contentTypeForPath(file),
|
||||
})
|
||||
}
|
||||
jobs.push({
|
||||
key: `runs/${runId}/tasks/${taskId}/${relative}`,
|
||||
filePath: file,
|
||||
contentType: contentTypeForPath(file),
|
||||
})
|
||||
}
|
||||
|
||||
manifestTasks.push({
|
||||
queryId: (meta.query_id as string | undefined) || taskId,
|
||||
artifactId: taskId,
|
||||
query: (meta.query as string | undefined) || '',
|
||||
startUrl: (meta.start_url as string | undefined) || '',
|
||||
status: statusFromMetadata(meta),
|
||||
@@ -312,7 +313,8 @@ export class R2Publisher {
|
||||
screenshotCount:
|
||||
(meta.screenshot_count as number | undefined) || screenshotCount,
|
||||
graderResults:
|
||||
(meta.grader_results as Record<string, unknown> | undefined) || {},
|
||||
(meta.grader_results as ViewerManifestTaskInput['graderResults']) ||
|
||||
{},
|
||||
})
|
||||
}
|
||||
|
||||
@@ -347,7 +349,7 @@ export class R2Publisher {
|
||||
return {
|
||||
runId,
|
||||
uploadedFiles: uploaded + 2,
|
||||
viewerUrl: `${this.config.cdnBaseUrl}/viewer.html?run=${runId}`,
|
||||
viewerUrl: `${this.config.cdnBaseUrl}/viewer.html?run=${encodeURIComponent(runId)}`,
|
||||
manifest,
|
||||
}
|
||||
}
|
||||
@@ -369,7 +371,7 @@ export class R2Publisher {
|
||||
runId: string,
|
||||
agentConfig: Record<string, unknown> | undefined,
|
||||
dataset: string | undefined,
|
||||
tasks: R2ManifestTask[],
|
||||
tasks: ViewerManifestTaskInput[],
|
||||
): Promise<R2RunManifest> {
|
||||
let summaryData: Record<string, unknown> | undefined
|
||||
try {
|
||||
@@ -378,7 +380,7 @@ export class R2Publisher {
|
||||
) as Record<string, unknown>
|
||||
} catch {}
|
||||
|
||||
return {
|
||||
return buildViewerManifest({
|
||||
runId,
|
||||
uploadedAt: this.now().toISOString(),
|
||||
agentConfig,
|
||||
@@ -390,7 +392,7 @@ export class R2Publisher {
|
||||
}
|
||||
: undefined,
|
||||
tasks,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
private async uploadFile(job: UploadJob): Promise<void> {
|
||||
|
||||
104
packages/browseros-agent/apps/eval/src/reporting/run-summary.ts
vendored
Normal file
104
packages/browseros-agent/apps/eval/src/reporting/run-summary.ts
vendored
Normal file
@@ -0,0 +1,104 @@
|
||||
/** One task entry as read from a published run manifest. */
export interface ReportManifestTask {
  queryId: string
  query?: string
  status: string
  durationMs: number
  screenshotCount?: number
  /** Explicit artifact paths; not read by the summary code in this module. */
  paths?: Record<string, string>
  /** Grader output keyed by grader name. */
  graderResults?: Record<string, { pass?: boolean; score?: number }>
}

/**
 * The subset of a run manifest that reporting reads. Most fields are optional
 * so manifests lacking them can still be summarized.
 */
export interface ReportManifest {
  schemaVersion?: number
  runId: string
  uploadedAt?: string
  agentConfig?: { type?: string; model?: string }
  dataset?: string
  summary?: { passRate?: number; avgDurationMs?: number }
  tasks?: ReportManifestTask[]
}

/** One row of the report: aggregate numbers for a single run. */
export interface RunSummary {
  runId: string
  /** `runId` with any trailing `-YYYY-MM-DD-HHMM` timestamp removed. */
  configName: string
  /** `YYYY-MM-DD HH:MM` taken from `uploadedAt`, or 'unknown'. */
  date: string
  /** Mean primary-grader score across scored tasks, multiplied by 100. */
  avgScore: number
  total: number
  completed: number
  failed: number
  timeout: number
  /** Mean of the positive task durations; 0 when there are none. */
  avgDurationMs: number
  model: string
  dataset: string
  agentType: string
}

// Report score uses the primary pass/fail grader so mixed-grader runs keep
// the same precedence as the eval summary.
const PASS_FAIL_GRADER_ORDER: readonly string[] = [
  'agisdk_state_diff',
  'infinity_state',
  'performance_grader',
]

// Matches the "-YYYY-MM-DD-HHMM" suffix of timestamped run ids.
const RUN_TIMESTAMP_SUFFIX = /-\d{4}-\d{2}-\d{2}-\d{4}$/

/**
 * Strips the trailing timestamp from a run id.
 *
 * "browseros-agent-weekly-2026-03-21-1730" becomes "browseros-agent-weekly";
 * ids without a timestamp suffix (e.g. "ci-weekly") are returned unchanged.
 */
export function extractConfigName(runId: string): string {
  return runId.replace(RUN_TIMESTAMP_SUFFIX, '')
}

/**
 * Formats `uploadedAt` as "YYYY-MM-DD HH:MM". Returns just the date part when
 * the value has no time component, and 'unknown' when it is missing.
 */
function reportDate(manifest: ReportManifest): string {
  if (!manifest.uploadedAt) return 'unknown'
  const [date = '', time] = manifest.uploadedAt.split('T')
  const hoursMinutes = time?.slice(0, 5)
  // Avoid a dangling space when there is no time part to append.
  return hoursMinutes ? `${date} ${hoursMinutes}` : date
}

/**
 * Returns the score of the first grader in PASS_FAIL_GRADER_ORDER that has a
 * result for this task, or null when none of them graded it. A result whose
 * score is missing or not a finite number counts as 0.
 */
function primaryScore(task: ReportManifestTask): number | null {
  if (!task.graderResults) return null
  for (const name of PASS_FAIL_GRADER_ORDER) {
    const result = task.graderResults[name]
    if (!result) continue
    const { score } = result
    // Manifests are parsed JSON; a non-numeric score must not reach the sum,
    // where `+` would concatenate instead of add.
    return typeof score === 'number' && Number.isFinite(score) ? score : 0
  }
  return null
}

/** Arithmetic mean of `values`; 0 for an empty list. */
function mean(values: readonly number[]): number {
  if (values.length === 0) return 0
  return values.reduce((sum, value) => sum + value, 0) / values.length
}

/**
 * Builds one summary row per manifest, sorted ascending by the `date` string.
 *
 * A manifest without a `tasks` array is summarized as having zero tasks. The
 * input array is not modified.
 */
export function buildRunSummaries(
  manifests: readonly ReportManifest[],
): RunSummary[] {
  return manifests
    .map((manifest): RunSummary => {
      const tasks = Array.isArray(manifest.tasks) ? manifest.tasks : []
      const countWithStatus = (status: string): number =>
        tasks.filter((task) => task.status === status).length

      const scores: number[] = []
      for (const task of tasks) {
        const score = primaryScore(task)
        if (score !== null) scores.push(score)
      }

      // Only positive numeric durations contribute to the average.
      const durations = tasks
        .map((task) => task.durationMs)
        .filter(
          (duration) =>
            typeof duration === 'number' &&
            Number.isFinite(duration) &&
            duration > 0,
        )

      return {
        runId: manifest.runId,
        configName: extractConfigName(manifest.runId),
        date: reportDate(manifest),
        avgScore: mean(scores) * 100,
        total: tasks.length,
        completed: countWithStatus('completed'),
        failed: countWithStatus('failed'),
        timeout: countWithStatus('timeout'),
        avgDurationMs: mean(durations),
        model: manifest.agentConfig?.model || 'unknown',
        dataset: manifest.dataset || manifest.runId,
        agentType: manifest.agentConfig?.type || 'unknown',
      }
    })
    .sort((a, b) => a.date.localeCompare(b.date))
}
|
||||
@@ -1,7 +1,20 @@
|
||||
import type { GraderResult } from '../types'
|
||||
|
||||
export const VIEWER_MANIFEST_SCHEMA_VERSION = 2
|
||||
|
||||
/**
 * Artifact locations recorded for one task in the viewer manifest.
 * Each value is a path string; entries name either a single file or a
 * directory (`screenshots`, `graderArtifacts`).
 */
export interface ViewerManifestTaskPaths {
  attempt: string
  metadata: string
  messages: string
  trace: string
  grades: string
  screenshots: string
  graderArtifacts: string
}
|
||||
|
||||
export interface ViewerManifestTaskInput {
|
||||
queryId: string
|
||||
artifactId?: string
|
||||
query: string
|
||||
startUrl?: string
|
||||
status: string
|
||||
@@ -10,57 +23,67 @@ export interface ViewerManifestTaskInput {
|
||||
graderResults: Record<string, GraderResult>
|
||||
}
|
||||
|
||||
export interface ViewerManifestTask extends ViewerManifestTaskInput {
|
||||
paths: {
|
||||
attempt: string
|
||||
metadata: string
|
||||
messages: string
|
||||
trace: string
|
||||
grades: string
|
||||
screenshots: string
|
||||
graderArtifacts: string
|
||||
}
|
||||
export interface ViewerManifestTask
|
||||
extends Omit<ViewerManifestTaskInput, 'artifactId'> {
|
||||
startUrl: string
|
||||
paths: ViewerManifestTaskPaths
|
||||
}
|
||||
|
||||
export interface ViewerManifest {
|
||||
schemaVersion: typeof VIEWER_MANIFEST_SCHEMA_VERSION
|
||||
runId: string
|
||||
suiteId: string
|
||||
variantId: string
|
||||
suiteId?: string
|
||||
variantId?: string
|
||||
uploadedAt?: string
|
||||
summary: Record<string, unknown>
|
||||
agentConfig?: Record<string, unknown>
|
||||
dataset?: string
|
||||
summary?: Record<string, unknown>
|
||||
tasks: ViewerManifestTask[]
|
||||
}
|
||||
|
||||
export interface BuildViewerManifestInput {
|
||||
runId: string
|
||||
suiteId: string
|
||||
variantId: string
|
||||
suiteId?: string
|
||||
variantId?: string
|
||||
uploadedAt?: string
|
||||
summary: Record<string, unknown>
|
||||
agentConfig?: Record<string, unknown>
|
||||
dataset?: string
|
||||
summary?: Record<string, unknown>
|
||||
tasks: ViewerManifestTaskInput[]
|
||||
}
|
||||
|
||||
/**
 * Builds the artifact paths for one task under `tasks/<id>/`.
 *
 * @param queryId Directory name of the task under `tasks/`. Despite the
 *   parameter name, any id may be passed; the value is used verbatim as the
 *   path segment.
 * @returns Paths relative to the run directory, one per artifact kind.
 */
function taskPaths(queryId: string): ViewerManifestTaskPaths {
  const taskDir = `tasks/${queryId}`
  return {
    attempt: `${taskDir}/attempt.json`,
    metadata: `${taskDir}/metadata.json`,
    messages: `${taskDir}/messages.jsonl`,
    trace: `${taskDir}/trace.jsonl`,
    grades: `${taskDir}/grades.json`,
    screenshots: `${taskDir}/screenshots`,
    graderArtifacts: `${taskDir}/grader-artifacts`,
  }
}
|
||||
|
||||
/** Builds the compact JSON index consumed by the static R2 viewer. */
|
||||
export function buildViewerManifest(
|
||||
input: BuildViewerManifestInput,
|
||||
): ViewerManifest {
|
||||
return {
|
||||
schemaVersion: VIEWER_MANIFEST_SCHEMA_VERSION,
|
||||
runId: input.runId,
|
||||
suiteId: input.suiteId,
|
||||
variantId: input.variantId,
|
||||
uploadedAt: input.uploadedAt,
|
||||
summary: input.summary,
|
||||
tasks: input.tasks.map((task) => ({
|
||||
...task,
|
||||
paths: {
|
||||
attempt: `tasks/${task.queryId}/attempt.json`,
|
||||
metadata: `tasks/${task.queryId}/metadata.json`,
|
||||
messages: `tasks/${task.queryId}/messages.jsonl`,
|
||||
trace: `tasks/${task.queryId}/trace.jsonl`,
|
||||
grades: `tasks/${task.queryId}/grades.json`,
|
||||
screenshots: `tasks/${task.queryId}/screenshots`,
|
||||
graderArtifacts: `tasks/${task.queryId}/grader-artifacts`,
|
||||
},
|
||||
})),
|
||||
...(input.suiteId ? { suiteId: input.suiteId } : {}),
|
||||
...(input.variantId ? { variantId: input.variantId } : {}),
|
||||
...(input.uploadedAt ? { uploadedAt: input.uploadedAt } : {}),
|
||||
...(input.agentConfig ? { agentConfig: input.agentConfig } : {}),
|
||||
...(input.dataset ? { dataset: input.dataset } : {}),
|
||||
...(input.summary ? { summary: input.summary } : {}),
|
||||
tasks: input.tasks.map((task) => {
|
||||
const { artifactId, ...publicTask } = task
|
||||
return {
|
||||
...publicTask,
|
||||
startUrl: publicTask.startUrl ?? '',
|
||||
paths: taskPaths(artifactId ?? publicTask.queryId),
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,6 +26,7 @@ async function writeRunFixture(
|
||||
root: string,
|
||||
configName = 'browseros-agent-weekly',
|
||||
timestamp = '2026-04-29-1200',
|
||||
options: { queryId?: string } = {},
|
||||
): Promise<{ runDir: string; runId: string }> {
|
||||
const runDir = join(root, configName, timestamp)
|
||||
const taskDir = join(runDir, 'task-1')
|
||||
@@ -33,7 +34,7 @@ async function writeRunFixture(
|
||||
await writeFile(
|
||||
join(taskDir, 'metadata.json'),
|
||||
JSON.stringify({
|
||||
query_id: 'task-1',
|
||||
query_id: options.queryId ?? 'task-1',
|
||||
dataset: 'webbench',
|
||||
query: 'Find pricing',
|
||||
start_url: 'https://example.test',
|
||||
@@ -94,6 +95,15 @@ describe('R2Publisher', () => {
|
||||
expect(
|
||||
byKey.get(`runs/${runId}/task-1/screenshots/1.png`)?.ContentType,
|
||||
).toBe('image/png')
|
||||
expect(
|
||||
byKey.get(`runs/${runId}/tasks/task-1/metadata.json`)?.ContentType,
|
||||
).toBe('application/json')
|
||||
expect(
|
||||
byKey.get(`runs/${runId}/tasks/task-1/messages.jsonl`)?.ContentType,
|
||||
).toBe('application/x-ndjson')
|
||||
expect(
|
||||
byKey.get(`runs/${runId}/tasks/task-1/screenshots/1.png`)?.ContentType,
|
||||
).toBe('image/png')
|
||||
expect(byKey.get(`runs/${runId}/manifest.json`)?.ContentType).toBe(
|
||||
'application/json',
|
||||
)
|
||||
@@ -111,8 +121,10 @@ describe('R2Publisher', () => {
|
||||
).toString('utf-8'),
|
||||
)
|
||||
expect(manifest).toMatchObject({
|
||||
schemaVersion: 2,
|
||||
runId,
|
||||
uploadedAt: '2026-04-29T12:00:00.000Z',
|
||||
agentConfig: { type: 'single', model: 'kimi' },
|
||||
dataset: 'webbench',
|
||||
summary: { passRate: 1, avgDurationMs: 1200 },
|
||||
tasks: [
|
||||
@@ -120,11 +132,86 @@ describe('R2Publisher', () => {
|
||||
queryId: 'task-1',
|
||||
status: 'completed',
|
||||
screenshotCount: 1,
|
||||
paths: {
|
||||
attempt: 'tasks/task-1/attempt.json',
|
||||
metadata: 'tasks/task-1/metadata.json',
|
||||
messages: 'tasks/task-1/messages.jsonl',
|
||||
trace: 'tasks/task-1/trace.jsonl',
|
||||
grades: 'tasks/task-1/grades.json',
|
||||
screenshots: 'tasks/task-1/screenshots',
|
||||
graderArtifacts: 'tasks/task-1/grader-artifacts',
|
||||
},
|
||||
},
|
||||
],
|
||||
})
|
||||
})
|
||||
|
||||
it('uses task directory ids for canonical paths when metadata query ids differ', async () => {
|
||||
const dir = await mkdtemp(join(tmpdir(), 'eval-r2-path-id-'))
|
||||
const { runDir, runId } = await writeRunFixture(
|
||||
dir,
|
||||
'weekly',
|
||||
'2026-04-29-1200',
|
||||
{ queryId: 'query-id-from-metadata' },
|
||||
)
|
||||
const viewerPath = join(dir, 'viewer.html')
|
||||
await writeFile(viewerPath, '<html>viewer</html>')
|
||||
const client = new FakeR2Client()
|
||||
|
||||
await new R2Publisher({
|
||||
client,
|
||||
viewerPath,
|
||||
config: {
|
||||
accountId: 'acct',
|
||||
accessKeyId: 'key',
|
||||
secretAccessKey: 'secret',
|
||||
bucket: 'bucket',
|
||||
cdnBaseUrl: 'https://eval.example.test',
|
||||
},
|
||||
now: () => new Date('2026-04-29T12:00:00.000Z'),
|
||||
}).publishRun(runDir, runId)
|
||||
|
||||
const byKey = new Map(client.puts.map((put) => [put.Key, put]))
|
||||
const manifest = JSON.parse(
|
||||
Buffer.from(
|
||||
byKey.get(`runs/${runId}/manifest.json`)?.Body as Buffer,
|
||||
).toString('utf-8'),
|
||||
)
|
||||
|
||||
expect(byKey.has(`runs/${runId}/tasks/task-1/metadata.json`)).toBe(true)
|
||||
expect(manifest.tasks[0]).toMatchObject({
|
||||
queryId: 'query-id-from-metadata',
|
||||
paths: {
|
||||
metadata: 'tasks/task-1/metadata.json',
|
||||
screenshots: 'tasks/task-1/screenshots',
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
it('encodes run ids in returned viewer urls', async () => {
|
||||
const dir = await mkdtemp(join(tmpdir(), 'eval-r2-viewer-url-'))
|
||||
const { runDir } = await writeRunFixture(dir)
|
||||
const viewerPath = join(dir, 'viewer.html')
|
||||
await writeFile(viewerPath, '<html>viewer</html>')
|
||||
const client = new FakeR2Client()
|
||||
|
||||
const result = await new R2Publisher({
|
||||
client,
|
||||
viewerPath,
|
||||
config: {
|
||||
accountId: 'acct',
|
||||
accessKeyId: 'key',
|
||||
secretAccessKey: 'secret',
|
||||
bucket: 'bucket',
|
||||
cdnBaseUrl: 'https://eval.example.test',
|
||||
},
|
||||
}).publishRun(runDir, 'run with spaces')
|
||||
|
||||
expect(result.viewerUrl).toBe(
|
||||
'https://eval.example.test/viewer.html?run=run%20with%20spaces',
|
||||
)
|
||||
})
|
||||
|
||||
it('publishes unuploaded runs from a config results directory', async () => {
|
||||
const dir = await mkdtemp(join(tmpdir(), 'eval-r2-config-'))
|
||||
const first = await writeRunFixture(dir, 'weekly', '2026-04-29-1200')
|
||||
@@ -186,8 +273,27 @@ describe('R2Publisher', () => {
|
||||
}).publishPath(runDir)
|
||||
|
||||
const keys = client.puts.map((put) => put.Key)
|
||||
const byKey = new Map(client.puts.map((put) => [put.Key, put]))
|
||||
const manifest = JSON.parse(
|
||||
Buffer.from(
|
||||
byKey.get(`runs/${runId}/manifest.json`)?.Body as Buffer,
|
||||
).toString('utf-8'),
|
||||
)
|
||||
|
||||
expect(result.uploadedRuns.map((run) => run.runId)).toEqual([runId])
|
||||
expect(keys).toContain(`runs/${runId}/task-1/metadata.json`)
|
||||
expect(keys).toContain(`runs/${runId}/tasks/task-1/metadata.json`)
|
||||
expect(manifest).toMatchObject({
|
||||
schemaVersion: 2,
|
||||
tasks: [
|
||||
{
|
||||
queryId: 'task-1',
|
||||
paths: {
|
||||
metadata: 'tasks/task-1/metadata.json',
|
||||
screenshots: 'tasks/task-1/screenshots',
|
||||
},
|
||||
},
|
||||
],
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
130
packages/browseros-agent/apps/eval/tests/publishing/r2-viewer-compat.test.ts
vendored
Normal file
130
packages/browseros-agent/apps/eval/tests/publishing/r2-viewer-compat.test.ts
vendored
Normal file
@@ -0,0 +1,130 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { readFile } from 'node:fs/promises'
|
||||
import { join } from 'node:path'
|
||||
|
||||
/** Path helpers extracted from the static viewer's inline script. */
interface ViewerPathResolvers {
  artifactUrl(task: Record<string, unknown>, artifact: string): string
  metadataUrl(task: Record<string, unknown>): string
  messagesUrl(task: Record<string, unknown>): string
  screenshotUrl(task: Record<string, unknown>, step: number): string
}

/**
 * Loads the viewer's artifact path helpers without a browser.
 *
 * Reads `viewer.html`, slices the script text between the
 * "// -- Artifact path resolution" and "// -- Task selection" markers, and
 * evaluates that slice inside a function with `basePath` fixed to
 * 'runs/run-1'. Fails the calling test if either marker is missing.
 */
async function loadViewerPathResolvers(): Promise<ViewerPathResolvers> {
  const html = await readFile(
    join(import.meta.dir, '..', '..', 'src', 'dashboard', 'viewer.html'),
    'utf-8',
  )
  const start = html.indexOf('// -- Artifact path resolution')
  const end = html.indexOf('// -- Task selection', start)
  expect(start).toBeGreaterThan(-1)
  expect(end).toBeGreaterThan(start)

  const block = html.slice(start, end)
  // The sliced helpers read a free `basePath` variable, so define it first.
  const createResolvers = new Function(
    `
    const basePath = 'runs/run-1';
    ${block}
    return { artifactUrl, metadataUrl, messagesUrl, screenshotUrl };
  `,
  ) as () => ViewerPathResolvers
  return createResolvers()
}
|
||||
|
||||
/**
 * Runs the viewer's `autoSelectFromHash` against a stubbed environment.
 *
 * Reads `viewer.html`, slices the script text from
 * "function autoSelectFromHash()" up to the "// -- Center panel" marker, and
 * evaluates it with a fake `window.location.hash`, a two-task `manifest` (one
 * task without `paths`, one with), and a `selectTask` stub that records its
 * argument.
 *
 * @param hash Value exposed as `window.location.hash`.
 * @returns The task passed to `selectTask`, or null if it was never called.
 */
async function runAutoSelectFromHash(hash: string): Promise<unknown> {
  const html = await readFile(
    join(import.meta.dir, '..', '..', 'src', 'dashboard', 'viewer.html'),
    'utf-8',
  )
  const start = html.indexOf('function autoSelectFromHash()')
  const end = html.indexOf('// -- Center panel', start)
  expect(start).toBeGreaterThan(-1)
  expect(end).toBeGreaterThan(start)

  const block = html.slice(start, end)
  // JSON.stringify embeds the hash as a safely quoted string literal.
  const runAutoSelect = new Function(
    `
    const window = { location: { hash: ${JSON.stringify(hash)} } };
    const manifest = {
      tasks: [
        { queryId: 'legacy-task' },
        { queryId: 'new-task', paths: { metadata: 'tasks/new-task/metadata.json' } },
      ],
    };
    let selected = null;
    function selectTask(task) { selected = task; }
    ${block}
    autoSelectFromHash();
    return selected;
  `,
  ) as () => unknown
  return runAutoSelect()
}
|
||||
|
||||
// Exercises the viewer's URL resolvers against both manifest layouts: tasks
// that carry explicit `paths` entries and tasks that only have a `queryId`.
describe('R2 viewer artifact path compatibility', () => {
  it('uses explicit manifest paths for new uploaded runs', async () => {
    const resolvers = await loadViewerPathResolvers()
    const task = {
      queryId: 'task-1',
      paths: {
        metadata: 'tasks/task-1/metadata.json',
        messages: 'tasks/task-1/messages.jsonl',
        grades: 'tasks/task-1/grades.json',
        trace: 'tasks/task-1/trace.jsonl',
        screenshots: 'tasks/task-1/screenshots',
        graderArtifacts: 'tasks/task-1/grader-artifacts',
      },
    }

    // Each entry pairs a lazily evaluated resolver call with the URL it must
    // produce; thunks keep resolve-then-assert ordering one case at a time.
    const cases: Array<[() => unknown, string]> = [
      [
        () => resolvers.metadataUrl(task),
        'runs/run-1/tasks/task-1/metadata.json',
      ],
      [
        () => resolvers.messagesUrl(task),
        'runs/run-1/tasks/task-1/messages.jsonl',
      ],
      [
        () => resolvers.artifactUrl(task, 'grades'),
        'runs/run-1/tasks/task-1/grades.json',
      ],
      [
        () => resolvers.artifactUrl(task, 'trace'),
        'runs/run-1/tasks/task-1/trace.jsonl',
      ],
      [
        () => resolvers.artifactUrl(task, 'graderArtifacts'),
        'runs/run-1/tasks/task-1/grader-artifacts',
      ],
      [
        () => resolvers.screenshotUrl(task, 7),
        'runs/run-1/tasks/task-1/screenshots/7.png',
      ],
    ]

    for (const [resolve, expectedUrl] of cases) {
      expect(resolve()).toBe(expectedUrl)
    }
  })

  it('falls back to legacy inferred paths for old uploaded runs', async () => {
    const resolvers = await loadViewerPathResolvers()
    // No `paths` entry: the expected URLs sit directly under the task's
    // queryId, without a `tasks/` segment.
    const task = { queryId: 'legacy-task' }

    const cases: Array<[() => unknown, string]> = [
      [
        () => resolvers.metadataUrl(task),
        'runs/run-1/legacy-task/metadata.json',
      ],
      [
        () => resolvers.messagesUrl(task),
        'runs/run-1/legacy-task/messages.jsonl',
      ],
      [
        () => resolvers.artifactUrl(task, 'grades'),
        'runs/run-1/legacy-task/grades.json',
      ],
      [
        () => resolvers.artifactUrl(task, 'trace'),
        'runs/run-1/legacy-task/trace.jsonl',
      ],
      [
        () => resolvers.artifactUrl(task, 'graderArtifacts'),
        'runs/run-1/legacy-task/grader-artifacts',
      ],
      [
        () => resolvers.screenshotUrl(task, 3),
        'runs/run-1/legacy-task/screenshots/3.png',
      ],
    ]

    for (const [resolve, expectedUrl] of cases) {
      expect(resolve()).toBe(expectedUrl)
    }
  })

  it('keeps hash-based task selection independent of artifact layout', async () => {
    // The hash names a queryId; selection must work for a task with `paths`...
    const pickedNew = await runAutoSelectFromHash('#new-task')
    expect(pickedNew).toMatchObject({
      queryId: 'new-task',
    })

    // ...and for a task without them.
    const pickedLegacy = await runAutoSelectFromHash('#legacy-task')
    expect(pickedLegacy).toMatchObject({
      queryId: 'legacy-task',
    })
  })
})
|
||||
105
packages/browseros-agent/apps/eval/tests/reporting/run-summary.test.ts
vendored
Normal file
105
packages/browseros-agent/apps/eval/tests/reporting/run-summary.test.ts
vendored
Normal file
@@ -0,0 +1,105 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import {
|
||||
buildRunSummaries,
|
||||
extractConfigName,
|
||||
} from '../../src/reporting/run-summary'
|
||||
|
||||
// Covers buildRunSummaries / extractConfigName for schema v2 manifests, legacy
// manifests, run ids without a timestamp suffix, and a missing upload time.
describe('report run summaries', () => {
  it('summarizes schema v2 manifests without depending on artifact paths', () => {
    const summaries = buildRunSummaries([
      {
        schemaVersion: 2,
        runId: 'agisdk-real-smoke-2026-04-30-0000',
        uploadedAt: '2026-04-30T01:03:59.663Z',
        agentConfig: { type: 'single', model: 'moonshotai/kimi-k2.5' },
        dataset: 'agisdk-real',
        tasks: [
          {
            queryId: 'task-1',
            query: 'Do task 1',
            status: 'completed',
            durationMs: 1000,
            screenshotCount: 1,
            paths: { metadata: 'tasks/task-1/metadata.json' },
            graderResults: {
              agisdk_state_diff: { score: 1, pass: true },
            },
          },
          {
            queryId: 'task-2',
            query: 'Do task 2',
            status: 'timeout',
            durationMs: 3000,
            screenshotCount: 0,
            paths: { metadata: 'tasks/task-2/metadata.json' },
            graderResults: {
              agisdk_state_diff: { score: 0, pass: false },
            },
          },
        ],
      },
    ])
    const summary = summaries[0]

    // Two tasks with scores 1 and 0 and durations 1000 ms and 3000 ms.
    const expectedSummary = {
      runId: 'agisdk-real-smoke-2026-04-30-0000',
      configName: 'agisdk-real-smoke',
      date: '2026-04-30 01:03',
      avgScore: 50,
      total: 2,
      completed: 1,
      timeout: 1,
      avgDurationMs: 2000,
      model: 'moonshotai/kimi-k2.5',
      dataset: 'agisdk-real',
      agentType: 'single',
    }
    expect(summary).toMatchObject(expectedSummary)
  })

  it('summarizes legacy manifests without schema version or paths', () => {
    const summaries = buildRunSummaries([
      {
        runId: 'browseros-agent-weekly-2026-04-29-1430',
        uploadedAt: '2026-04-29T14:30:00.000Z',
        agentConfig: { type: 'orchestrator-executor', model: 'kimi' },
        dataset: 'webbench',
        tasks: [
          {
            queryId: 'legacy-task',
            query: 'Do the old task',
            status: 'failed',
            durationMs: 0,
            screenshotCount: 0,
            graderResults: {
              performance_grader: { score: 0.25, pass: false },
            },
          },
        ],
      },
    ])
    const summary = summaries[0]

    // A single failed task with score 0.25 and zero duration.
    const expectedSummary = {
      runId: 'browseros-agent-weekly-2026-04-29-1430',
      configName: 'browseros-agent-weekly',
      avgScore: 25,
      total: 1,
      completed: 0,
      failed: 1,
      avgDurationMs: 0,
    }
    expect(summary).toMatchObject(expectedSummary)
  })

  it('keeps legacy config names when run ids have no timestamp suffix', () => {
    const configName = extractConfigName('ci-weekly')
    expect(configName).toBe('ci-weekly')
  })

  it('uses an explicit unknown date when uploadedAt is missing', () => {
    // Manifest deliberately omits `uploadedAt`.
    const summaries = buildRunSummaries([
      {
        runId: 'ci-weekly',
        tasks: [],
      },
    ])
    const summary = summaries[0]

    expect(summary.date).toBe('unknown')
  })
})
|
||||
@@ -1,4 +1,5 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import type { R2RunManifest } from '../../src/publishing/r2-manifest'
|
||||
import { buildViewerManifest } from '../../src/viewer/viewer-manifest'
|
||||
|
||||
describe('buildViewerManifest', () => {
|
||||
@@ -22,12 +23,15 @@ describe('buildViewerManifest', () => {
|
||||
score: 0,
|
||||
pass: false,
|
||||
reasoning: 'Missing checkout item',
|
||||
details: { missing: ['checkout item'] },
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
})
|
||||
|
||||
const publishManifest: R2RunManifest = manifest
|
||||
expect(publishManifest.schemaVersion).toBe(2)
|
||||
expect(manifest.tasks[0].paths.messages).toBe(
|
||||
'tasks/agisdk-dashdish-4/messages.jsonl',
|
||||
)
|
||||
@@ -37,5 +41,72 @@ describe('buildViewerManifest', () => {
|
||||
expect(manifest.tasks[0].paths.graderArtifacts).toBe(
|
||||
'tasks/agisdk-dashdish-4/grader-artifacts',
|
||||
)
|
||||
expect(manifest.tasks[0].graderResults.agisdk_state_diff.details).toEqual({
|
||||
missing: ['checkout item'],
|
||||
})
|
||||
})
|
||||
|
||||
it('builds stable paths when optional task fields are missing', () => {
  // The task supplies no startUrl and no explicit artifact id.
  const manifest = buildViewerManifest({
    runId: 'run-2',
    uploadedAt: '2026-04-29T06:00:00.000Z',
    tasks: [
      {
        queryId: 'task-with-minimal-fields',
        query: 'Do the task',
        status: 'completed',
        durationMs: 10,
        screenshotCount: 0,
        graderResults: {},
      },
    ],
  })

  // Every artifact path is expected under tasks/<queryId>/.
  const expectedPaths = {
    attempt: 'tasks/task-with-minimal-fields/attempt.json',
    metadata: 'tasks/task-with-minimal-fields/metadata.json',
    messages: 'tasks/task-with-minimal-fields/messages.jsonl',
    trace: 'tasks/task-with-minimal-fields/trace.jsonl',
    grades: 'tasks/task-with-minimal-fields/grades.json',
    screenshots: 'tasks/task-with-minimal-fields/screenshots',
    graderArtifacts: 'tasks/task-with-minimal-fields/grader-artifacts',
  }
  const expectedTask = {
    queryId: 'task-with-minimal-fields',
    startUrl: '',
    paths: expectedPaths,
  }

  expect(manifest).toMatchObject({
    schemaVersion: 2,
    runId: 'run-2',
    uploadedAt: '2026-04-29T06:00:00.000Z',
    tasks: [expectedTask],
  })
})
|
||||
|
||||
it('can separate display query ids from artifact path ids', () => {
  const manifest = buildViewerManifest({
    runId: 'run-3',
    tasks: [
      {
        queryId: 'metadata-query-id',
        artifactId: 'task-dir-id',
        query: 'Do the task',
        status: 'completed',
        durationMs: 10,
        screenshotCount: 0,
        graderResults: {},
      },
    ],
  })
  const firstTask = manifest.tasks[0]

  // queryId is kept for display while paths are expected under artifactId.
  const expectedTask = {
    queryId: 'metadata-query-id',
    paths: {
      metadata: 'tasks/task-dir-id/metadata.json',
      screenshots: 'tasks/task-dir-id/screenshots',
    },
  }
  expect(firstTask).toMatchObject(expectedTask)

  // The input-only artifactId must not appear on the emitted task.
  const leaksArtifactId = 'artifactId' in firstTask
  expect(leaksArtifactId).toBe(false)
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user