Files
Nikhil 84a79ba0a1 feat: refactor eval pipeline workflow (#875)
* feat(eval): add suite variant config bridge

* feat(eval): add stable run artifacts

* refactor(eval): add shared grader contract

* feat(eval): persist grader artifacts

* refactor(eval): rename runner layers

* refactor(eval): add executor backend boundary

* refactor(eval): split clado backend

* feat(eval): add workflow compatible cli

* feat(eval): add r2 publisher module

* ci(eval): migrate weekly workflow to eval cli

* docs(eval): document suite pipeline

* chore(eval): verify pipeline refactor

* fix: address review feedback for PR #875

* docs(eval): add env example

* docs(eval): explain suites and variants

* chore(eval): organize config layouts

* chore(eval): colocate grader python evaluators
2026-04-29 17:21:02 -07:00

65 lines
1.9 KiB
TypeScript
Vendored

import { describe, expect, it } from 'bun:test'
import { parseEvalCliArgs } from '../../src/cli/args'
/**
 * Contract tests for `parseEvalCliArgs`.
 *
 * Cases covered:
 *  - `suite --config <path> --publish <target>` yields `configPath` and
 *    `publishTarget` on a `suite` command.
 *  - `suite --suite/--variant/--provider/--model/--base-url` yields the
 *    matching `suitePath`, `variantId`, `provider`, `model` and `baseUrl`.
 *  - the bare `-c <path>` shorthand is reported as the `legacy` command.
 *  - `run`, `grade` and `publish` invoked without their required options
 *    throw with a message naming the missing option.
 */
describe('parseEvalCliArgs', () => {
  // Shared by the workflow-compatible case and the legacy shorthand case.
  const legacyConfigPath = 'configs/legacy/browseros-agent-weekly.json'

  it('parses the workflow-compatible suite config command', () => {
    const argv = ['suite', '--config', legacyConfigPath, '--publish', 'r2']

    const parsed = parseEvalCliArgs(argv)

    expect(parsed).toEqual({
      command: 'suite',
      configPath: legacyConfigPath,
      publishTarget: 'r2',
    })
  })

  it('parses suite variant and model options', () => {
    const suitePath = 'configs/suites/agisdk-daily-10.json'
    const variantId = 'kimi-fireworks'
    const provider = 'openai-compatible'
    const model = 'accounts/fireworks/models/kimi-k2p5'
    const baseUrl = 'https://api.fireworks.ai/inference/v1'

    const parsed = parseEvalCliArgs([
      'suite',
      '--suite',
      suitePath,
      '--variant',
      variantId,
      '--provider',
      provider,
      '--model',
      model,
      '--base-url',
      baseUrl,
    ])

    expect(parsed).toEqual({
      command: 'suite',
      suitePath,
      variantId,
      provider,
      model,
      baseUrl,
    })
  })

  it('keeps the old config shorthand as legacy config mode', () => {
    const parsed = parseEvalCliArgs(['-c', legacyConfigPath])

    expect(parsed).toEqual({
      command: 'legacy',
      configPath: legacyConfigPath,
    })
  })

  it('rejects missing required command options with targeted errors', () => {
    // Checked in this order; the first failing expectation aborts the case.
    const rejections = [
      { argv: ['run'], message: 'run requires --config or --suite' },
      { argv: ['grade'], message: 'grade requires --run' },
      {
        argv: ['publish', '--run', 'results/run-1'],
        message: 'publish requires --target',
      },
    ]

    for (const { argv, message } of rejections) {
      const invoke = () => parseEvalCliArgs(argv)
      expect(invoke).toThrow(message)
    }
  })
})