mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-19 11:31:03 +00:00
* feat(eval): add suite variant config bridge * feat(eval): add stable run artifacts * refactor(eval): add shared grader contract * feat(eval): persist grader artifacts * refactor(eval): rename runner layers * refactor(eval): add executor backend boundary * refactor(eval): split clado backend * feat(eval): add workflow compatible cli * feat(eval): add r2 publisher module * ci(eval): migrate weekly workflow to eval cli * docs(eval): document suite pipeline * chore(eval): verify pipeline refactor * fix: address review feedback for PR #875 * docs(eval): add env example * docs(eval): explain suites and variants * chore(eval): organize config layouts * chore(eval): colocate grader python evaluators
68 lines
2.1 KiB
TypeScript
Vendored
68 lines
2.1 KiB
TypeScript
Vendored
import { describe, expect, it } from 'bun:test'
import { mkdtemp, readFile, rm } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { InfinityStateGrader } from '../../src/graders/benchmark/infinity-state'
import type { GraderInput } from '../../src/grading/types'
|
|
|
|
describe('InfinityStateGrader artifacts', () => {
  /**
   * Grades a single WebArena-Infinity task with the Python evaluator stubbed
   * out, then checks that the grader wrote `verifier.json` and
   * `evaluator-output.json` under `grader-artifacts/infinity_state/` inside
   * the task artifact directory.
   */
  it('writes verifier and evaluator artifacts', async () => {
    const dir = await mkdtemp(join(tmpdir(), 'infinity-artifacts-'))
    const oldInfinityDir = process.env.WEBARENA_INFINITY_DIR
    // NOTE(review): presumably the grader needs this variable to be set even
    // though the evaluator is stubbed — confirm against the grader source.
    process.env.WEBARENA_INFINITY_DIR = '/tmp/webarena-infinity'

    /** Reads and parses one JSON artifact written by the grader. */
    const readArtifact = async (fileName: string): Promise<unknown> =>
      JSON.parse(
        await readFile(
          join(dir, 'grader-artifacts/infinity_state', fileName),
          'utf-8',
        ),
      )

    try {
      const grader = new InfinityStateGrader()

      // `runPythonEvaluator` is not part of the grader's public type, so the
      // instance is viewed through a structural type to replace the method
      // with a canned result instead of running the real evaluator.
      const internals = grader as unknown as {
        runPythonEvaluator(input: unknown): Promise<{
          output: { pass: boolean; reward: number; message: string }
          stderr: string
        }>
      }
      internals.runPythonEvaluator = async () => ({
        output: { pass: true, reward: 1, message: 'verified' },
        stderr: 'verifier log',
      })

      const input: GraderInput = {
        task: {
          query_id: 'infinity-elation-prescriptions-task_h69',
          query: 'Verify the app state',
          dataset: 'webarena-infinity',
        },
        messages: [],
        screenshotCount: 0,
        finalAnswer: null,
        taskArtifactDir: dir,
        outputDir: dir,
        infinityAppUrl: 'http://127.0.0.1:8123',
      }

      const result = await grader.grade(input)

      expect(result.pass).toBe(true)
      // The verifier artifact must record which app was checked and where.
      expect(await readArtifact('verifier.json')).toMatchObject({
        appName: 'elation-prescriptions',
        appServerUrl: 'http://127.0.0.1:8123',
      })
      // The evaluator artifact must carry the stubbed evaluator output.
      expect(await readArtifact('evaluator-output.json')).toMatchObject({
        message: 'verified',
      })
    } finally {
      // Assigning `undefined` to a process.env key stores the string
      // "undefined" in Node-compatible runtimes, so a previously unset
      // variable has to be deleted rather than reassigned.
      if (oldInfinityDir === undefined) {
        delete process.env.WEBARENA_INFINITY_DIR
      } else {
        process.env.WEBARENA_INFINITY_DIR = oldInfinityDir
      }
      // Remove the temp directory so repeated runs do not accumulate files.
      await rm(dir, { recursive: true, force: true })
    }
  })
})
|