Files
BrowserOS/packages/browseros-agent/apps/eval/tests/grading/performance-artifacts.test.ts
Nikhil 84a79ba0a1 feat: refactor eval pipeline workflow (#875)
* feat(eval): add suite variant config bridge

* feat(eval): add stable run artifacts

* refactor(eval): add shared grader contract

* feat(eval): persist grader artifacts

* refactor(eval): rename runner layers

* refactor(eval): add executor backend boundary

* refactor(eval): split clado backend

* feat(eval): add workflow compatible cli

* feat(eval): add r2 publisher module

* ci(eval): migrate weekly workflow to eval cli

* docs(eval): document suite pipeline

* chore(eval): verify pipeline refactor

* fix: address review feedback for PR #875

* docs(eval): add env example

* docs(eval): explain suites and variants

* chore(eval): organize config layouts

* chore(eval): colocate grader python evaluators
2026-04-29 17:21:02 -07:00

93 lines
2.6 KiB
TypeScript
Vendored

import { describe, expect, it } from 'bun:test'
import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { PerformanceGrader } from '../../src/graders/performance/performance-grader'
import type { GraderInput } from '../../src/grading/types'
/**
 * Artifact-persistence test for PerformanceGrader.
 *
 * The grader's `runAgent` method is replaced with a stub that returns a canned
 * result, then `grade()` is run against a temporary directory. The test asserts
 * that `metrics.json`, `axes.json`, and `agent-output.json` exist under
 * `grader-artifacts/performance_grader/` and contain the expected fields.
 */
describe('PerformanceGrader artifacts', () => {
  it('writes metrics, agent output, and axes artifacts', async () => {
    const dir = await mkdtemp(join(tmpdir(), 'performance-artifacts-'))

    try {
      // Fixture files the test provides up front: a screenshots directory and
      // a metadata.json carrying a termination reason.
      await mkdir(join(dir, 'screenshots'))
      await writeFile(
        join(dir, 'metadata.json'),
        JSON.stringify({ termination_reason: 'completed' }),
      )

      const grader = new PerformanceGrader(undefined, undefined, 'claude-test')

      // `runAgent` is not reachable through the public type, so the grader is
      // viewed through a structural type that exposes it for stubbing.
      const internals = grader as unknown as {
        runAgent(
          systemPrompt: string,
          userPrompt: string,
          outputDir: string,
        ): Promise<{
          type: 'result'
          subtype: string
          result: string
          total_cost_usd: number
          num_turns: number
          structured_output: unknown
        }>
      }
      // Canned agent result: a single successful turn set with one scored axis.
      internals.runAgent = async () => ({
        type: 'result',
        subtype: 'success',
        result: 'ok',
        total_cost_usd: 0.01,
        num_turns: 2,
        structured_output: {
          axes: [
            { axis: 'task_completion', score: 90, reasoning: 'completed' },
          ],
        },
      })

      const input: GraderInput = {
        task: {
          query_id: 'task-1',
          query: 'Find the answer',
          dataset: 'fixture',
        },
        messages: [
          {
            type: 'tool-input-available',
            timestamp: '2026-04-29T00:00:00.000Z',
            toolCallId: 'call-1',
            toolName: 'browser_get_page_content',
            input: {},
          },
        ],
        screenshotCount: 1,
        finalAnswer: 'answer',
        taskArtifactDir: dir,
        outputDir: dir,
      }

      const result = await grader.grade(input)
      expect(result.details?.model).toBe('claude-test')

      /** Reads and parses one JSON artifact written by the grader. */
      const artifactDir = join(dir, 'grader-artifacts/performance_grader')
      const readArtifact = async (fileName: string): Promise<unknown> =>
        JSON.parse(await readFile(join(artifactDir, fileName), 'utf-8'))

      // One tool-input message in the input should be reported as one tool call.
      expect(await readArtifact('metrics.json')).toMatchObject({
        totalToolCalls: 1,
      })
      // The axes artifact is expected to be keyed by axis name.
      expect(await readArtifact('axes.json')).toMatchObject({
        task_completion: { score: 90 },
      })
      expect(await readArtifact('agent-output.json')).toMatchObject({
        subtype: 'success',
      })
    } finally {
      // Always remove the temp directory so repeated runs do not accumulate
      // leftover fixtures, even when an assertion above fails.
      await rm(dir, { recursive: true, force: true })
    }
  })
})