mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-19 11:31:03 +00:00
* feat(eval): add suite variant config bridge * feat(eval): add stable run artifacts * refactor(eval): add shared grader contract * feat(eval): persist grader artifacts * refactor(eval): rename runner layers * refactor(eval): add executor backend boundary * refactor(eval): split clado backend * feat(eval): add workflow compatible cli * feat(eval): add r2 publisher module * ci(eval): migrate weekly workflow to eval cli * docs(eval): document suite pipeline * chore(eval): verify pipeline refactor * fix: address review feedback for PR #875 * docs(eval): add env example * docs(eval): explain suites and variants * chore(eval): organize config layouts * chore(eval): colocate grader python evaluators
93 lines
2.6 KiB
TypeScript
Vendored
93 lines
2.6 KiB
TypeScript
Vendored
import { describe, expect, it } from 'bun:test'
import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { PerformanceGrader } from '../../src/graders/performance/performance-grader'
import type { GraderInput } from '../../src/grading/types'
|
|
|
|
/**
 * Artifact persistence test for PerformanceGrader.
 *
 * Builds a throwaway task directory, swaps the grader's `runAgent` member for
 * a canned response, runs `grade()`, and then checks that the three JSON
 * artifacts exist under `grader-artifacts/performance_grader/` with the
 * expected contents.
 */
describe('PerformanceGrader artifacts', () => {
  it('writes metrics, agent output, and axes artifacts', async () => {
    const dir = await mkdtemp(join(tmpdir(), 'performance-artifacts-'))

    /** Reads a file relative to the temp task directory and parses it as JSON. */
    const readJsonArtifact = async (relativePath: string): Promise<unknown> =>
      JSON.parse(await readFile(join(dir, relativePath), 'utf-8'))

    try {
      // Minimal on-disk fixture: a screenshots directory and a metadata file.
      // NOTE(review): presumably the grader reads these from taskArtifactDir —
      // confirm against the grader implementation.
      await mkdir(join(dir, 'screenshots'))
      await writeFile(
        join(dir, 'metadata.json'),
        JSON.stringify({ termination_reason: 'completed' }),
      )

      const grader = new PerformanceGrader(undefined, undefined, 'claude-test')

      // `runAgent` is not reachable through the grader's public type, so the
      // instance is viewed through a structural type that exposes it. This
      // lets the test overwrite it with a stub returning a fixed result.
      // NOTE(review): assumes grade() delegates to runAgent — confirm.
      const internals = grader as unknown as {
        runAgent(
          systemPrompt: string,
          userPrompt: string,
          outputDir: string,
        ): Promise<{
          type: 'result'
          subtype: string
          result: string
          total_cost_usd: number
          num_turns: number
          structured_output: unknown
        }>
      }
      internals.runAgent = async () => ({
        type: 'result',
        subtype: 'success',
        result: 'ok',
        total_cost_usd: 0.01,
        num_turns: 2,
        structured_output: {
          axes: [{ axis: 'task_completion', score: 90, reasoning: 'completed' }],
        },
      })

      // One tool-call message; the metrics assertion below expects it to be
      // reported as `totalToolCalls: 1`.
      const input: GraderInput = {
        task: {
          query_id: 'task-1',
          query: 'Find the answer',
          dataset: 'fixture',
        },
        messages: [
          {
            type: 'tool-input-available',
            timestamp: '2026-04-29T00:00:00.000Z',
            toolCallId: 'call-1',
            toolName: 'browser_get_page_content',
            input: {},
          },
        ],
        screenshotCount: 1,
        finalAnswer: 'answer',
        taskArtifactDir: dir,
        outputDir: dir,
      }

      const result = await grader.grade(input)

      expect(result.details?.model).toBe('claude-test')
      expect(
        await readJsonArtifact(
          'grader-artifacts/performance_grader/metrics.json',
        ),
      ).toMatchObject({ totalToolCalls: 1 })
      expect(
        await readJsonArtifact('grader-artifacts/performance_grader/axes.json'),
      ).toMatchObject({ task_completion: { score: 90 } })
      expect(
        await readJsonArtifact(
          'grader-artifacts/performance_grader/agent-output.json',
        ),
      ).toMatchObject({ subtype: 'success' })
    } finally {
      // Always remove the temp directory, even when an assertion above throws,
      // so repeated runs do not accumulate directories under the OS temp dir.
      await rm(dir, { recursive: true, force: true })
    }
  })
})
|