Files
BrowserOS/packages/browseros-agent/apps/eval/tests/grading/infinity-artifacts.test.ts
Nikhil 84a79ba0a1 feat: refactor eval pipeline workflow (#875)
* feat(eval): add suite variant config bridge

* feat(eval): add stable run artifacts

* refactor(eval): add shared grader contract

* feat(eval): persist grader artifacts

* refactor(eval): rename runner layers

* refactor(eval): add executor backend boundary

* refactor(eval): split clado backend

* feat(eval): add workflow compatible cli

* feat(eval): add r2 publisher module

* ci(eval): migrate weekly workflow to eval cli

* docs(eval): document suite pipeline

* chore(eval): verify pipeline refactor

* fix: address review feedback for PR #875

* docs(eval): add env example

* docs(eval): explain suites and variants

* chore(eval): organize config layouts

* chore(eval): colocate grader python evaluators
2026-04-29 17:21:02 -07:00

68 lines
2.1 KiB
TypeScript
Vendored

import { describe, expect, it } from 'bun:test'
import { mkdtemp, readFile, rm } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { InfinityStateGrader } from '../../src/graders/benchmark/infinity-state'
import type { GraderInput } from '../../src/grading/types'
/**
 * Checks that grading a task with InfinityStateGrader leaves two JSON files
 * under `grader-artifacts/infinity_state/` in the task artifact directory:
 * `verifier.json` and `evaluator-output.json`.
 */
describe('InfinityStateGrader artifacts', () => {
  it('writes verifier and evaluator artifacts', async () => {
    const dir = await mkdtemp(join(tmpdir(), 'infinity-artifacts-'))
    const oldInfinityDir = process.env.WEBARENA_INFINITY_DIR
    process.env.WEBARENA_INFINITY_DIR = '/tmp/webarena-infinity'

    try {
      const grader = new InfinityStateGrader()

      // Swap runPythonEvaluator for a stub so the test supplies a canned
      // result instead of invoking the real evaluator. The cast is needed
      // because the method is not reachable through the public type.
      const internals = grader as unknown as {
        runPythonEvaluator(input: unknown): Promise<{
          output: { pass: boolean; reward: number; message: string }
          stderr: string
        }>
      }
      internals.runPythonEvaluator = async () => ({
        output: { pass: true, reward: 1, message: 'verified' },
        stderr: 'verifier log',
      })

      const input: GraderInput = {
        task: {
          query_id: 'infinity-elation-prescriptions-task_h69',
          query: 'Verify the app state',
          dataset: 'webarena-infinity',
        },
        messages: [],
        screenshotCount: 0,
        finalAnswer: null,
        taskArtifactDir: dir,
        outputDir: dir,
        infinityAppUrl: 'http://127.0.0.1:8123',
      }

      const result = await grader.grade(input)
      expect(result.pass).toBe(true)

      // NOTE(review): appName is presumably derived from query_id by the
      // grader; the derivation is not visible from this test.
      expect(
        JSON.parse(
          await readFile(
            join(dir, 'grader-artifacts/infinity_state/verifier.json'),
            'utf-8',
          ),
        ),
      ).toMatchObject({
        appName: 'elation-prescriptions',
        appServerUrl: 'http://127.0.0.1:8123',
      })

      expect(
        JSON.parse(
          await readFile(
            join(dir, 'grader-artifacts/infinity_state/evaluator-output.json'),
            'utf-8',
          ),
        ),
      ).toMatchObject({ message: 'verified' })
    } finally {
      // Assigning `undefined` to a process.env key stores the string
      // "undefined", so an originally-unset variable must be deleted rather
      // than reassigned to avoid leaking state into other tests.
      if (oldInfinityDir === undefined) {
        delete process.env.WEBARENA_INFINITY_DIR
      } else {
        process.env.WEBARENA_INFINITY_DIR = oldInfinityDir
      }
      // Remove the scratch directory so repeated runs do not accumulate
      // temp dirs; `force` keeps cleanup from throwing if it is already gone.
      await rm(dir, { recursive: true, force: true })
    }
  })
})