Files
BrowserOS/packages/browseros-agent/apps/eval/tests/cli/suite-command.test.ts
Nikhil 84a79ba0a1 feat: refactor eval pipeline workflow (#875)
* feat(eval): add suite variant config bridge

* feat(eval): add stable run artifacts

* refactor(eval): add shared grader contract

* feat(eval): persist grader artifacts

* refactor(eval): rename runner layers

* refactor(eval): add executor backend boundary

* refactor(eval): split clado backend

* feat(eval): add workflow compatible cli

* feat(eval): add r2 publisher module

* ci(eval): migrate weekly workflow to eval cli

* docs(eval): document suite pipeline

* chore(eval): verify pipeline refactor

* fix: address review feedback for PR #875

* docs(eval): add env example

* docs(eval): explain suites and variants

* chore(eval): organize config layouts

* chore(eval): colocate grader python evaluators
2026-04-29 17:21:02 -07:00

116 lines
3.4 KiB
TypeScript
Vendored

import { describe, expect, it } from 'bun:test'
import { mkdtemp, rm, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { basename, join } from 'node:path'
import {
  resolveSuiteCommand,
  runSuiteCommand,
} from '../../src/cli/commands/suite'
import type { RunEvalOptions } from '../../src/runner/types'
/**
 * Creates a throwaway suite fixture on disk for the CLI tests.
 *
 * A fresh directory is made under the OS temp dir (prefix `eval-suite-cli-`)
 * and populated with two files:
 *  - `agisdk-daily-10.json`: the suite definition, pretty-printed with a
 *    two-space indent
 *  - `tasks.jsonl`: an empty file matching the suite's `dataset` entry
 *
 * The directory is not removed here; the caller owns its lifetime.
 *
 * @returns the fixture directory and the path of the suite file inside it
 */
async function writeTempSuite(): Promise<{ dir: string; suitePath: string }> {
  const fixtureRoot = join(tmpdir(), 'eval-suite-cli-')
  const dir = await mkdtemp(fixtureRoot)

  const suitePath = join(dir, 'agisdk-daily-10.json')
  const datasetPath = join(dir, 'tasks.jsonl')

  // Key order is significant: it determines the serialized file contents.
  const suiteDefinition = {
    id: 'agisdk-daily-10',
    dataset: 'tasks.jsonl',
    agent: { type: 'single' },
    graders: ['agisdk_state_diff'],
    workers: 2,
    restartBrowserPerTask: true,
    browseros: {
      server_url: 'http://127.0.0.1:9110',
      headless: true,
    },
  }
  const serializedSuite = JSON.stringify(suiteDefinition, null, 2)

  // Suite file first, then the (empty) dataset it points at.
  await writeFile(suitePath, serializedSuite)
  await writeFile(datasetPath, '')

  return { dir, suitePath }
}
/**
 * CLI-level tests for the `suite` command.
 *
 * Covers resolving a checked-in config file, resolving a suite file plus
 * variant flags, and forwarding the resolved options to the injected
 * `runEval` dependency. Tests that create an on-disk fixture through
 * `writeTempSuite()` delete it in a `finally` block so temp directories do
 * not accumulate across runs.
 */
describe('suite command', () => {
  it('resolves an existing config through the config adapter', async () => {
    // NOTE(review): the config path is relative, so this presumably relies on
    // the working directory the test runner is started from; confirm.
    const resolved = await resolveSuiteCommand({
      configPath: 'apps/eval/configs/legacy/browseros-agent-weekly.json',
      env: {},
    })

    expect(resolved.kind).toBe('config')
    expect(resolved.suite.id).toBe('browseros-agent-weekly')
    expect(resolved.evalConfig.dataset).toBe(
      '../../data/webbench-2of4-50.jsonl',
    )
    expect(resolved.variant.publicMetadata.agent.apiKeyConfigured).toBe(true)
  })

  it('resolves a suite file and variant into a runnable eval config', async () => {
    const { dir, suitePath } = await writeTempSuite()
    try {
      const resolved = await resolveSuiteCommand({
        suitePath,
        variantId: 'kimi-fireworks',
        provider: 'openai-compatible',
        model: 'accounts/fireworks/models/kimi-k2p5',
        apiKey: 'test-key',
        baseUrl: 'https://api.fireworks.ai/inference/v1',
        env: {},
      })

      expect(resolved.kind).toBe('suite')
      expect(resolved.suite.id).toBe('agisdk-daily-10')
      // The dataset named in the suite file is expected to resolve next to it.
      expect(resolved.datasetPath).toBe(join(dir, 'tasks.jsonl'))
      expect(resolved.evalConfig.agent).toMatchObject({
        type: 'single',
        provider: 'openai-compatible',
        model: 'accounts/fireworks/models/kimi-k2p5',
        apiKey: 'test-key',
        baseUrl: 'https://api.fireworks.ai/inference/v1',
      })
      // The fixture's `workers: 2` is expected to surface as `num_workers`.
      expect(resolved.evalConfig.num_workers).toBe(2)
    } finally {
      // Remove the fixture directory even when an assertion above throws.
      await rm(dir, { recursive: true, force: true })
    }
  })

  it('runs config and suite commands through the runner dependency', async () => {
    // Every invocation of the stubbed runner is recorded here, in call order.
    const calls: RunEvalOptions[] = []

    await runSuiteCommand(
      {
        configPath: 'apps/eval/configs/legacy/browseros-agent-weekly.json',
        env: {},
      },
      {
        runEval: async (options) => {
          calls.push(options)
        },
      },
    )

    const { dir, suitePath } = await writeTempSuite()
    try {
      await runSuiteCommand(
        {
          suitePath,
          model: 'moonshotai/kimi-k2.5',
          provider: 'openai-compatible',
          env: {},
        },
        {
          runEval: async (options) => {
            calls.push(options)
          },
        },
      )

      expect(calls).toHaveLength(2)
      // calls[0] comes from the config invocation, calls[1] from the suite one.
      expect(calls[0].configPath.endsWith('browseros-agent-weekly.json')).toBe(
        true,
      )
      expect(basename(calls[1].configPath)).toBe('agisdk-daily-10.json')
      expect(calls[1].config).toBeDefined()
      expect(calls[1].dataPath?.endsWith('tasks.jsonl')).toBe(true)
    } finally {
      // Remove the fixture directory even when an assertion above throws.
      await rm(dir, { recursive: true, force: true })
    }
  })
})