Compare commits

...

1 Commits

Author SHA1 Message Date
shivammittal274
d6012304f6 feat(eval): add claude code eval agent 2026-05-01 02:13:37 +05:30
26 changed files with 1222 additions and 25 deletions

View File

@@ -9,6 +9,7 @@ Evaluation framework for BrowserOS browser automation agents. Runs tasks from st
- **BrowserOS binary** at `/Applications/BrowserOS.app` (macOS) or `BROWSEROS_BINARY` pointing at it
- **Bun** runtime
- **API keys** for your LLM provider (and `CLAUDE_CODE_OAUTH_TOKEN` if you use `performance_grader`)
- **Python 3.10+ with `agisdk`** for AGI SDK / REAL Bench grading. Set `BROWSEROS_EVAL_PYTHON` if your default `python3` is older.
## Quick Start
@@ -67,7 +68,7 @@ This lets us run the same suite against multiple model setups without copying th
```txt
agisdk-daily-10 + kimi-fireworks
agisdk-daily-10 + claude-sonnet
agisdk-daily-10 + claude-opus
agisdk-daily-10 + clado-action-000159
```
@@ -79,6 +80,7 @@ For `orchestrator-executor` suites, there can also be an executor model/backend.
|------|-------------|
| `single` | Single LLM agent driven by the BrowserOS tool loop (CDP) |
| `orchestrator-executor` | High-level orchestrator + per-step executor (LLM or Clado visual model) |
| `claude-code` | External Claude Code CLI driven through BrowserOS MCP |
### Single agent
@@ -119,6 +121,24 @@ The orchestrator works with any LLM provider. The executor can be another LLM, o
}
```
### Claude Code
Claude Code runs as an external `claude -p` subprocess. The eval runner passes a task-scoped MCP config that points Claude Code at the active worker's BrowserOS MCP endpoint, while the eval capture layer still saves messages, screenshots, trajectory metadata, and grader outputs.
```json
{
"agent": {
"type": "claude-code",
"model": "opus"
}
}
```
```bash
BROWSEROS_EVAL_PYTHON=/path/to/python3 bun run eval run --config configs/legacy/claude-code-agisdk-real.json
bun run eval suite --config configs/legacy/claude-code-agisdk-real.json --publish r2
```
## Graders
| Name | Description |
@@ -151,6 +171,7 @@ The `apiKey` field supports two formats:
| `CLADO_ACTION_MODEL`, `CLADO_ACTION_API_KEY`, `CLADO_ACTION_BASE_URL` | Clado executor defaults |
| `BROWSEROS_BINARY` | BrowserOS binary path in CI/local smoke runs |
| `BROWSEROS_SERVER_URL` | Optional grader MCP URL override |
| `BROWSEROS_EVAL_PYTHON` | Optional Python interpreter for JSON graders such as `agisdk_state_diff` |
| `WEBARENA_INFINITY_DIR` | Local WebArena-Infinity checkout for Infinity tasks |
| `NOPECHA_API_KEY` | CAPTCHA solver extension |
| `EVAL_R2_ACCOUNT_ID`, `EVAL_R2_ACCESS_KEY_ID`, `EVAL_R2_SECRET_ACCESS_KEY`, `EVAL_R2_BUCKET`, `EVAL_R2_CDN_BASE_URL` | R2 upload and viewer URL |
@@ -194,7 +215,7 @@ Published runs are available at `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": true
"headless": false
}
```

View File

@@ -7,7 +7,7 @@
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
},
"dataset": "../../data/webbench-2of4-50.jsonl",
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {
@@ -21,6 +21,6 @@
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"graders": ["agisdk_state_diff"],
"timeout_ms": 1800000
}

View File

@@ -23,7 +23,7 @@
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": true
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"

View File

@@ -0,0 +1,22 @@
{
"agent": {
"type": "claude-code",
"model": "opus"
},
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 1,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["agisdk_state_diff"],
"timeout_ms": 1800000
}

View File

@@ -14,7 +14,7 @@
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": true
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"

View File

@@ -0,0 +1,238 @@
import { writeFile } from 'node:fs/promises'
import { join } from 'node:path'
import { DEFAULT_TIMEOUT_MS } from '../../constants'
import type { ClaudeCodeAgentConfig, UIMessageStreamEvent } from '../../types'
import { withEvalTimeout } from '../../utils/with-eval-timeout'
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
import {
type ClaudeCodeProcessRunner,
createClaudeCodeProcessRunner,
} from './process-runner'
import {
ClaudeCodeStreamParser,
shouldCaptureScreenshotForTool,
} from './stream-parser'
export interface ClaudeCodeEvaluatorDeps {
processRunner?: ClaudeCodeProcessRunner
}
export class ClaudeCodeEvaluator implements AgentEvaluator {
private processRunner: ClaudeCodeProcessRunner
constructor(
private ctx: AgentContext,
deps: ClaudeCodeEvaluatorDeps = {},
) {
this.processRunner = deps.processRunner ?? createClaudeCodeProcessRunner()
}
async execute(): Promise<AgentResult> {
const { config, task, capture, taskOutputDir } = this.ctx
const startTime = Date.now()
const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS
await capture.messageLogger.logUser(task.query)
if (config.agent.type !== 'claude-code') {
throw new Error('ClaudeCodeEvaluator only supports claude-code config')
}
const agentConfig = config.agent
const mcpConfigPath = join(taskOutputDir, 'claude-code-mcp.json')
await writeFile(
mcpConfigPath,
JSON.stringify(
buildClaudeCodeMcpConfig(config.browseros.server_url),
null,
2,
),
)
const parser = new ClaudeCodeStreamParser()
const toolNamesById = new Map<string, string>()
const prompt = buildClaudeCodePrompt(task.query)
const args = buildClaudeCodeArgs({
prompt,
mcpConfigPath,
config: agentConfig,
})
const { terminationReason } = await withEvalTimeout(
timeoutMs,
capture,
async (signal) => {
const runResult = await this.processRunner.run({
executable: agentConfig.claudePath,
args,
cwd: taskOutputDir,
signal,
onStdoutLine: async (line) => {
const events = parser.pushLine(line)
for (const event of events) {
await this.handleStreamEvent(event, toolNamesById)
}
},
})
if (runResult.exitCode !== 0) {
const message =
runResult.stderr.trim() ||
`Claude Code exited with status ${runResult.exitCode}`
capture.addError('agent_execution', message, {
exitCode: runResult.exitCode,
})
if (!parser.getLastText()) {
throw new Error(message)
}
}
for (const error of runResult.streamErrors ?? []) {
capture.addWarning(
'message_logging',
`Claude Code stream event processing failed: ${error}`,
)
}
return runResult
},
)
const endTime = Date.now()
const finalAnswer = parser.getLastText() ?? capture.getLastAssistantText()
const metadata = {
query_id: task.query_id,
dataset: task.dataset,
query: task.query,
started_at: new Date(startTime).toISOString(),
completed_at: new Date(endTime).toISOString(),
total_duration_ms: endTime - startTime,
total_steps: parser.getToolCallCount() || capture.getScreenshotCount(),
termination_reason: terminationReason,
final_answer: finalAnswer,
errors: capture.getErrors(),
warnings: capture.getWarnings(),
device_pixel_ratio: capture.screenshot.getDevicePixelRatio(),
agent_config: {
type: 'claude-code' as const,
model: agentConfig.model,
},
grader_results: {},
}
await capture.trajectorySaver.saveMetadata(metadata)
return {
metadata,
messages: capture.getMessages(),
finalAnswer,
}
}
private async handleStreamEvent(
event: UIMessageStreamEvent,
toolNamesById: Map<string, string>,
): Promise<void> {
const { capture, task } = this.ctx
let screenshot: number | undefined
if (event.type === 'tool-input-available') {
toolNamesById.set(event.toolCallId, event.toolName)
if (isPageInput(event.input)) {
capture.setActivePageId(event.input.page)
}
}
if (
event.type === 'tool-output-available' ||
event.type === 'tool-output-error'
) {
const toolName = toolNamesById.get(event.toolCallId)
if (toolName && shouldCaptureScreenshotForTool(toolName)) {
screenshot = await this.captureScreenshot()
}
}
await capture.messageLogger.logStreamEvent(event, screenshot)
capture.emitEvent(task.query_id, {
...event,
...(screenshot !== undefined && { screenshot }),
})
}
private async captureScreenshot(): Promise<number | undefined> {
const { capture, task } = this.ctx
try {
const screenshot = await capture.screenshot.capture(
capture.getActivePageId(),
)
capture.emitEvent(task.query_id, {
type: 'screenshot-captured',
screenshot,
})
return screenshot
} catch {
return undefined
}
}
}
function isPageInput(input: unknown): input is { page: number } {
return (
typeof input === 'object' &&
input !== null &&
'page' in input &&
typeof input.page === 'number'
)
}
function buildClaudeCodePrompt(taskQuery: string): string {
return [
'You are running inside BrowserOS eval.',
'Use the BrowserOS MCP tools to interact with the already-open browser and complete the user task.',
'When the task is complete, respond with the final answer only.',
'If blocked, explain the blocker clearly.',
'',
`Task: ${taskQuery}`,
].join('\n')
}
function buildClaudeCodeArgs({
prompt,
mcpConfigPath,
config,
}: {
prompt: string
mcpConfigPath: string
config: ClaudeCodeAgentConfig
}): string[] {
const args = [
'-p',
prompt,
'--mcp-config',
mcpConfigPath,
'--strict-mcp-config',
'--output-format',
'stream-json',
'--verbose',
]
if (config.model) args.push('--model', config.model)
args.push(...config.extraArgs)
return args
}
function buildClaudeCodeMcpConfig(serverUrl: string) {
const trimmed = serverUrl.replace(/\/$/, '')
const url = trimmed.endsWith('/mcp') ? trimmed : `${trimmed}/mcp`
return {
mcpServers: {
browseros: {
type: 'http',
url,
headers: { 'X-BrowserOS-Source': 'sdk-internal' },
},
},
}
}

View File

@@ -0,0 +1,114 @@
export interface ClaudeCodeRunOptions {
executable: string
args: string[]
cwd: string
signal?: AbortSignal
onStdoutLine: (line: string) => Promise<void>
}
export interface ClaudeCodeRunResult {
exitCode: number
stderr: string
streamErrors?: string[]
}
export interface ClaudeCodeProcessRunner {
run(options: ClaudeCodeRunOptions): Promise<ClaudeCodeRunResult>
}
export interface SpawnOptions {
cwd: string
signal?: AbortSignal
onStdoutLine: (line: string) => Promise<void>
}
export interface CreateClaudeCodeProcessRunnerDeps {
spawn?: (cmd: string[], options: SpawnOptions) => Promise<ClaudeCodeRunResult>
}
export function createClaudeCodeProcessRunner(
deps: CreateClaudeCodeProcessRunnerDeps = {},
): ClaudeCodeProcessRunner {
const spawn = deps.spawn ?? spawnClaudeCode
return {
run: async ({ executable, args, cwd, signal, onStdoutLine }) =>
spawn([executable, ...args], { cwd, signal, onStdoutLine }),
}
}
async function spawnClaudeCode(
cmd: string[],
options: SpawnOptions,
): Promise<ClaudeCodeRunResult> {
const proc = Bun.spawn({
cmd,
cwd: options.cwd,
stdin: 'ignore',
stdout: 'pipe',
stderr: 'pipe',
})
const abort = () => {
try {
proc.kill('SIGTERM')
} catch {
// Process may already have exited.
}
}
options.signal?.addEventListener('abort', abort, { once: true })
try {
const streamErrors: string[] = []
const stdoutPromise = readLines(
proc.stdout,
options.onStdoutLine,
streamErrors,
)
const stderrPromise = new Response(proc.stderr).text()
const exitCode = await proc.exited
await stdoutPromise
const stderr = await stderrPromise
return { exitCode, stderr, streamErrors }
} finally {
options.signal?.removeEventListener('abort', abort)
}
}
async function readLines(
stream: ReadableStream<Uint8Array>,
onLine: (line: string) => Promise<void>,
streamErrors: string[],
): Promise<void> {
const reader = stream.getReader()
const decoder = new TextDecoder()
let buffer = ''
while (true) {
const { done, value } = await reader.read()
if (done) break
buffer += decoder.decode(value, { stream: true })
const lines = buffer.split('\n')
buffer = lines.pop() ?? ''
for (const line of lines) {
await emitLine(line, onLine, streamErrors)
}
}
buffer += decoder.decode()
if (buffer.length > 0) {
await emitLine(buffer, onLine, streamErrors)
}
}
async function emitLine(
line: string,
onLine: (line: string) => Promise<void>,
streamErrors: string[],
): Promise<void> {
try {
await onLine(line)
} catch (error) {
streamErrors.push(error instanceof Error ? error.message : String(error))
}
}

View File

@@ -0,0 +1,142 @@
import { randomUUID } from 'node:crypto'
import type { UIMessageStreamEvent } from '../../types'
type JsonObject = Record<string, unknown>
export class ClaudeCodeStreamParser {
private lastText: string | null = null
private toolCallCount = 0
pushLine(line: string): UIMessageStreamEvent[] {
const trimmed = line.trim()
if (!trimmed) return []
let parsed: unknown
try {
parsed = JSON.parse(trimmed)
} catch {
return []
}
if (!isObject(parsed)) return []
if (parsed.type === 'assistant') {
return this.parseAssistantMessage(parsed)
}
if (parsed.type === 'user') {
return this.parseUserMessage(parsed)
}
if (parsed.type === 'result' && typeof parsed.result === 'string') {
this.lastText = parsed.result
}
return []
}
getLastText(): string | null {
return this.lastText
}
getToolCallCount(): number {
return this.toolCallCount
}
private parseAssistantMessage(message: JsonObject): UIMessageStreamEvent[] {
const content = contentBlocks(message)
const events: UIMessageStreamEvent[] = []
for (const block of content) {
if (block.type === 'text' && typeof block.text === 'string') {
const id = randomUUID()
this.lastText = block.text
events.push(
{ type: 'text-start', id },
{ type: 'text-delta', id, delta: block.text },
{ type: 'text-end', id },
)
} else if (
block.type === 'tool_use' &&
typeof block.id === 'string' &&
typeof block.name === 'string'
) {
this.toolCallCount++
events.push({
type: 'tool-input-available',
toolCallId: block.id,
toolName: block.name,
input: block.input,
})
}
}
return events
}
private parseUserMessage(message: JsonObject): UIMessageStreamEvent[] {
const content = contentBlocks(message)
const events: UIMessageStreamEvent[] = []
for (const block of content) {
if (
block.type !== 'tool_result' ||
typeof block.tool_use_id !== 'string'
) {
continue
}
if (block.is_error === true) {
events.push({
type: 'tool-output-error',
toolCallId: block.tool_use_id,
errorText: stringifyToolContent(block.content),
})
} else {
events.push({
type: 'tool-output-available',
toolCallId: block.tool_use_id,
output: normalizeToolContent(block.content),
})
}
}
return events
}
}
export function shouldCaptureScreenshotForTool(toolName: string): boolean {
if (!toolName.startsWith('mcp__browseros__')) return false
return !toolName.endsWith('__take_screenshot')
}
function contentBlocks(message: JsonObject): JsonObject[] {
const inner = isObject(message.message) ? message.message : message
return Array.isArray(inner.content) ? inner.content.filter(isObject) : []
}
function isObject(value: unknown): value is JsonObject {
return typeof value === 'object' && value !== null
}
function normalizeToolContent(content: unknown): unknown {
if (!Array.isArray(content)) return content
return content.map((item) => {
if (
isObject(item) &&
item.type === 'text' &&
typeof item.text === 'string'
) {
return item.text
}
return item
})
}
function stringifyToolContent(content: unknown): string {
const normalized = normalizeToolContent(content)
if (typeof normalized === 'string') return normalized
try {
return JSON.stringify(normalized)
} catch {
return String(normalized)
}
}

View File

@@ -1,3 +1,4 @@
import { ClaudeCodeEvaluator } from './claude-code'
import { OrchestratorExecutorEvaluator } from './orchestrator-executor'
import { SingleAgentEvaluator } from './single-agent'
import type { AgentContext, AgentEvaluator } from './types'
@@ -8,6 +9,8 @@ export function createAgent(context: AgentContext): AgentEvaluator {
return new SingleAgentEvaluator(context)
case 'orchestrator-executor':
return new OrchestratorExecutorEvaluator(context)
case 'claude-code':
return new ClaudeCodeEvaluator(context)
}
}

View File

@@ -105,7 +105,10 @@ export class TrajectorySaver {
errors: [],
warnings: [],
agent_config: {
type: agentConfig.type as 'single' | 'orchestrator-executor',
type: agentConfig.type as
| 'single'
| 'orchestrator-executor'
| 'claude-code',
model: agentConfig.model,
},
grader_results: {},

View File

@@ -82,6 +82,16 @@ function suiteToEvalConfig(
})
}
if (suite.agent.type === 'claude-code') {
return EvalConfigSchema.parse({
...base,
agent: {
type: 'claude-code',
...(variant.agent.model && { model: variant.agent.model }),
},
})
}
const executorBackend = suite.agent.executorBackend ?? 'tool-loop'
const executor =
executorBackend === 'clado'
@@ -135,7 +145,10 @@ export async function resolveSuiteCommand(
const loaded = await loadSuite(options.suitePath)
const variant = resolveVariant({
variantId: options.variantId,
provider: options.provider,
provider:
loaded.suite.agent.type === 'claude-code'
? 'claude-code'
: options.provider,
model: options.model,
apiKey: options.apiKey,
baseUrl: options.baseUrl,

View File

@@ -2,6 +2,7 @@ export interface PythonEvaluatorOptions {
scriptPath: string
input: unknown
timeoutMs: number
pythonPath?: string
}
export interface PythonEvaluatorResult<T> {
@@ -15,7 +16,9 @@ export interface PythonEvaluatorResult<T> {
export async function runPythonJsonEvaluator<T>(
options: PythonEvaluatorOptions,
): Promise<PythonEvaluatorResult<T>> {
const proc = Bun.spawn(['python3', options.scriptPath], {
const pythonPath =
options.pythonPath || process.env.BROWSEROS_EVAL_PYTHON || 'python3'
const proc = Bun.spawn([pythonPath, options.scriptPath], {
stdin: 'pipe',
stdout: 'pipe',
stderr: 'pipe',

View File

@@ -33,6 +33,13 @@ function variantSource(config: EvalConfig): {
baseUrl?: string
supportsImages?: boolean
} {
if (config.agent.type === 'claude-code') {
return {
provider: 'claude-code',
model: config.agent.model ?? 'default',
}
}
const agent =
config.agent.type === 'single' ? config.agent : config.agent.orchestrator
if (!agent.model) {
@@ -76,10 +83,7 @@ export async function adaptEvalConfigFile(
suite: {
id,
dataset: evalConfig.dataset,
agent:
evalConfig.agent.type === 'single'
? { type: 'tool-loop' }
: { type: 'orchestrated', executorBackend: backend ?? 'tool-loop' },
agent: suiteAgent(evalConfig, backend),
graders: evalConfig.graders ?? [],
workers: evalConfig.num_workers,
restartBrowserPerTask: evalConfig.restart_server_per_task,
@@ -99,3 +103,17 @@ export async function adaptEvalConfigFile(
}),
}
}
function suiteAgent(
config: EvalConfig,
backend: ReturnType<typeof executorBackend>,
): EvalSuite['agent'] {
switch (config.agent.type) {
case 'single':
return { type: 'tool-loop' }
case 'orchestrator-executor':
return { type: 'orchestrated', executorBackend: backend ?? 'tool-loop' }
case 'claude-code':
return { type: 'claude-code' }
}
}

View File

@@ -57,10 +57,30 @@ export function resolveVariant(
options: ResolveVariantOptions = {},
): EvalVariant {
const env = options.env ?? process.env
const id = options.variantId ?? env.EVAL_VARIANT ?? 'default'
const provider =
options.provider ?? env.EVAL_AGENT_PROVIDER ?? 'openai-compatible'
const model = options.model ?? env.EVAL_AGENT_MODEL
if (provider === 'claude-code') {
const id = options.variantId ?? env.EVAL_VARIANT ?? 'claude-code'
return {
id,
agent: {
provider,
model: model ?? '',
},
publicMetadata: {
id,
agent: {
provider,
model: model || 'default',
apiKeyConfigured: false,
},
},
}
}
const id = options.variantId ?? env.EVAL_VARIANT ?? 'default'
const apiKey = options.apiKey ?? env.EVAL_AGENT_API_KEY
const apiKeyEnv =
options.apiKeyEnv ?? (options.apiKey ? undefined : 'EVAL_AGENT_API_KEY')

View File

@@ -8,6 +8,7 @@ export const SuiteAgentSchema = z
'single',
'orchestrated',
'orchestrator-executor',
'claude-code',
]),
executorBackend: z.enum(['tool-loop', 'clado']).optional(),
})

View File

@@ -19,9 +19,19 @@ export const OrchestratorExecutorConfigSchema = z.object({
}),
})
export const ClaudeCodeAgentConfigSchema = z
.object({
type: z.literal('claude-code'),
model: z.string().min(1).optional(),
claudePath: z.string().min(1).default('claude'),
extraArgs: z.array(z.string()).default([]),
})
.strict()
export const AgentConfigSchema = z.discriminatedUnion('type', [
SingleAgentConfigSchema,
OrchestratorExecutorConfigSchema,
ClaudeCodeAgentConfigSchema,
])
export const EvalConfigSchema = z.object({
@@ -53,5 +63,6 @@ export type SingleAgentConfig = z.infer<typeof SingleAgentConfigSchema>
export type OrchestratorExecutorConfig = z.infer<
typeof OrchestratorExecutorConfigSchema
>
export type ClaudeCodeAgentConfig = z.infer<typeof ClaudeCodeAgentConfigSchema>
export type AgentConfig = z.infer<typeof AgentConfigSchema>
export type EvalConfig = z.infer<typeof EvalConfigSchema>

View File

@@ -2,6 +2,8 @@
export {
type AgentConfig,
AgentConfigSchema,
type ClaudeCodeAgentConfig,
ClaudeCodeAgentConfigSchema,
type EvalConfig,
EvalConfigSchema,
type OrchestratorExecutorConfig,

View File

@@ -13,7 +13,7 @@ export const GraderResultSchema = z.object({
// Agent config in metadata
const AgentConfigMetaSchema = z
.object({
type: z.enum(['single', 'orchestrator-executor']),
type: z.enum(['single', 'orchestrator-executor', 'claude-code']),
model: z.string().optional(),
})
.passthrough()

View File

@@ -59,7 +59,7 @@ export async function validateConfig(
) {
envVarsToCheck.push(config.agent.apiKey)
}
} else {
} else if (config.agent.type === 'orchestrator-executor') {
const { orchestrator, executor } = config.agent
if (orchestrator.apiKey && isEnvVarName(orchestrator.apiKey)) {
envVarsToCheck.push(orchestrator.apiKey)

View File

@@ -0,0 +1,268 @@
import { describe, expect, it } from 'bun:test'
import { mkdtemp, readFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { createAgent } from '../../src/agents'
import { ClaudeCodeEvaluator } from '../../src/agents/claude-code'
import { CaptureContext } from '../../src/capture/context'
import {
AgentConfigSchema,
type EvalConfig,
EvalConfigSchema,
type Task,
TaskMetadataSchema,
} from '../../src/types'
function config(): EvalConfig {
return {
agent: {
type: 'claude-code',
model: 'opus',
claudePath: 'claude',
extraArgs: [],
},
dataset: 'data/test.jsonl',
num_workers: 1,
restart_server_per_task: false,
browseros: {
server_url: 'http://127.0.0.1:9110',
base_cdp_port: 9010,
base_server_port: 9110,
base_extension_port: 9310,
load_extensions: false,
headless: false,
},
graders: [],
}
}
const task: Task = {
query_id: 'task-1',
dataset: 'test',
query: 'Find the title',
graders: [],
metadata: {
original_task_id: 'task-1',
},
}
describe('ClaudeCodeEvaluator', () => {
it('accepts claude-code config defaults without permission mode', () => {
const agent = AgentConfigSchema.parse({ type: 'claude-code' })
expect(agent).toEqual({
type: 'claude-code',
claudePath: 'claude',
extraArgs: [],
})
})
it('accepts claude-code as a runnable eval agent', () => {
const parsed = EvalConfigSchema.parse({
agent: {
type: 'claude-code',
model: 'opus',
},
dataset: 'data/test-set.jsonl',
browseros: {
server_url: 'http://127.0.0.1:9110',
},
})
expect(parsed.agent.type).toBe('claude-code')
expect(parsed.agent.model).toBe('opus')
})
it('rejects unsupported claude-code settings instead of silently ignoring them', () => {
expect(
AgentConfigSchema.safeParse({
type: 'claude-code',
permissionMode: 'bypassPermissions',
}).success,
).toBe(false)
expect(
AgentConfigSchema.safeParse({
type: 'claude-code',
maxTurns: 3,
}).success,
).toBe(false)
})
it('allows claude-code in task metadata', () => {
const metadata = TaskMetadataSchema.parse({
query_id: 'task-1',
dataset: 'test',
query: 'Do the thing',
started_at: new Date().toISOString(),
completed_at: new Date().toISOString(),
total_duration_ms: 100,
total_steps: 1,
termination_reason: 'completed',
final_answer: 'done',
errors: [],
warnings: [],
agent_config: {
type: 'claude-code',
model: 'opus',
},
grader_results: {},
})
expect(metadata.agent_config.type).toBe('claude-code')
})
it('is created by the agent factory', async () => {
const outputDir = await mkdtemp(join(tmpdir(), 'claude-code-eval-'))
const { capture, taskOutputDir } = await CaptureContext.create({
serverUrl: 'http://127.0.0.1:9110',
outputDir,
taskId: task.query_id,
initialPageId: 1,
})
const agent = createAgent({
config: config(),
task,
workerIndex: 0,
initialPageId: 1,
outputDir,
taskOutputDir,
capture,
})
expect(agent).toBeInstanceOf(ClaudeCodeEvaluator)
})
it('runs claude code, logs messages, writes MCP config, and saves metadata', async () => {
const outputDir = await mkdtemp(join(tmpdir(), 'claude-code-eval-'))
const { capture, taskOutputDir } = await CaptureContext.create({
serverUrl: 'http://127.0.0.1:9110',
outputDir,
taskId: task.query_id,
initialPageId: 1,
})
const calls: Array<{ executable: string; args: string[]; cwd: string }> = []
const evaluator = new ClaudeCodeEvaluator(
{
config: config(),
task,
workerIndex: 0,
initialPageId: 1,
outputDir,
taskOutputDir,
capture,
},
{
processRunner: {
async run(options) {
calls.push(options)
await options.onStdoutLine(
JSON.stringify({
type: 'assistant',
message: {
content: [{ type: 'text', text: 'The title is Example' }],
},
}),
)
await options.onStdoutLine(
JSON.stringify({
type: 'result',
subtype: 'success',
result: 'The title is Example',
}),
)
return { exitCode: 0, stderr: '' }
},
},
},
)
const result = await evaluator.execute()
expect(result.finalAnswer).toBe('The title is Example')
expect(result.metadata.agent_config).toMatchObject({
type: 'claude-code',
model: 'opus',
})
expect(result.messages.some((msg) => msg.type === 'user')).toBe(true)
expect(result.messages.some((msg) => msg.type === 'text-delta')).toBe(true)
const mcpConfig = JSON.parse(
await readFile(join(taskOutputDir, 'claude-code-mcp.json'), 'utf-8'),
)
expect(mcpConfig.mcpServers.browseros).toMatchObject({
type: 'http',
url: 'http://127.0.0.1:9110/mcp',
headers: {
'X-BrowserOS-Source': 'sdk-internal',
},
})
expect(calls).toEqual([
expect.objectContaining({
executable: 'claude',
cwd: taskOutputDir,
args: [
'-p',
expect.stringContaining('Task: Find the title'),
'--mcp-config',
join(taskOutputDir, 'claude-code-mcp.json'),
'--strict-mcp-config',
'--output-format',
'stream-json',
'--verbose',
'--model',
'opus',
],
}),
])
expect(calls[0].args).not.toContain('--permission-mode')
})
it('records non-fatal stream processing errors as warnings', async () => {
const outputDir = await mkdtemp(join(tmpdir(), 'claude-code-eval-'))
const { capture, taskOutputDir } = await CaptureContext.create({
serverUrl: 'http://127.0.0.1:9110',
outputDir,
taskId: task.query_id,
initialPageId: 1,
})
const evaluator = new ClaudeCodeEvaluator(
{
config: config(),
task,
workerIndex: 0,
initialPageId: 1,
outputDir,
taskOutputDir,
capture,
},
{
processRunner: {
async run(options) {
await options.onStdoutLine(
JSON.stringify({
type: 'result',
subtype: 'success',
result: 'done',
}),
)
return {
exitCode: 0,
stderr: '',
streamErrors: ['bad stream line'],
}
},
},
},
)
const result = await evaluator.execute()
expect(result.finalAnswer).toBe('done')
expect(result.metadata.warnings).toEqual([
expect.objectContaining({
source: 'message_logging',
message: 'Claude Code stream event processing failed: bad stream line',
}),
])
})
})

View File

@@ -0,0 +1,78 @@
import { describe, expect, it } from 'bun:test'
import { chmod, mkdtemp, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { createClaudeCodeProcessRunner } from '../../src/agents/claude-code/process-runner'
async function writeStdoutScript(): Promise<string> {
const dir = await mkdtemp(join(tmpdir(), 'claude-code-runner-'))
const script = join(dir, 'stdout-lines')
await writeFile(script, '#!/bin/sh\nprintf "first\\nbad\\nlast\\n"\n')
await chmod(script, 0o755)
return script
}
describe('createClaudeCodeProcessRunner', () => {
it('passes executable and args to the spawn dependency', async () => {
const calls: unknown[] = []
const runner = createClaudeCodeProcessRunner({
spawn: async (cmd, options) => {
calls.push({ cmd, options })
await options.onStdoutLine('{"type":"result","result":"done"}')
return { exitCode: 0, stderr: '' }
},
})
const result = await runner.run({
executable: 'claude',
args: ['-p', 'hello'],
cwd: '/tmp',
signal: new AbortController().signal,
onStdoutLine: async () => {},
})
expect(result.exitCode).toBe(0)
expect(calls).toEqual([
{
cmd: ['claude', '-p', 'hello'],
options: expect.objectContaining({ cwd: '/tmp' }),
},
])
})
it('returns stderr and non-zero exit codes', async () => {
const runner = createClaudeCodeProcessRunner({
spawn: async () => ({ exitCode: 2, stderr: 'bad auth' }),
})
const result = await runner.run({
executable: 'claude',
args: [],
cwd: '/tmp',
signal: new AbortController().signal,
onStdoutLine: async () => {},
})
expect(result).toEqual({ exitCode: 2, stderr: 'bad auth' })
})
it('continues reading stdout after a line handler error', async () => {
const script = await writeStdoutScript()
const lines: string[] = []
const runner = createClaudeCodeProcessRunner()
const result = await runner.run({
executable: script,
args: [],
cwd: '/tmp',
onStdoutLine: async (line) => {
lines.push(line)
if (line === 'bad') throw new Error('bad line')
},
})
expect(result.exitCode).toBe(0)
expect(result.streamErrors).toEqual(['bad line'])
expect(lines).toEqual(['first', 'bad', 'last'])
})
})

View File

@@ -0,0 +1,102 @@
import { describe, expect, it } from 'bun:test'
import {
ClaudeCodeStreamParser,
shouldCaptureScreenshotForTool,
} from '../../src/agents/claude-code/stream-parser'
describe('ClaudeCodeStreamParser', () => {
it('maps assistant text and MCP tool use into eval stream events', () => {
const parser = new ClaudeCodeStreamParser()
const events = parser.pushLine(
JSON.stringify({
type: 'assistant',
message: {
content: [
{ type: 'text', text: 'I will navigate.' },
{
type: 'tool_use',
id: 'toolu_1',
name: 'mcp__browseros__navigate_page',
input: { page: 2, url: 'https://example.com' },
},
],
},
}),
)
expect(events).toEqual([
{ type: 'text-start', id: expect.any(String) },
{
type: 'text-delta',
id: expect.any(String),
delta: 'I will navigate.',
},
{ type: 'text-end', id: expect.any(String) },
{
type: 'tool-input-available',
toolCallId: 'toolu_1',
toolName: 'mcp__browseros__navigate_page',
input: { page: 2, url: 'https://example.com' },
},
])
expect(parser.getLastText()).toBe('I will navigate.')
expect(parser.getToolCallCount()).toBe(1)
})
it('maps Claude Code tool results into eval output events', () => {
const parser = new ClaudeCodeStreamParser()
const events = parser.pushLine(
JSON.stringify({
type: 'user',
message: {
content: [
{
type: 'tool_result',
tool_use_id: 'toolu_1',
content: 'Navigated successfully',
},
],
},
}),
)
expect(events).toEqual([
{
type: 'tool-output-available',
toolCallId: 'toolu_1',
output: 'Navigated successfully',
},
])
})
it('uses result messages as the authoritative final text', () => {
const parser = new ClaudeCodeStreamParser()
parser.pushLine(
JSON.stringify({
type: 'assistant',
message: {
content: [{ type: 'text', text: 'I will complete the task.' }],
},
}),
)
parser.pushLine(
JSON.stringify({
type: 'result',
subtype: 'success',
result: 'Final answer',
}),
)
expect(parser.getLastText()).toBe('Final answer')
})
it('identifies BrowserOS MCP tools that should trigger screenshots', () => {
expect(
shouldCaptureScreenshotForTool('mcp__browseros__navigate_page'),
).toBe(true)
expect(
shouldCaptureScreenshotForTool('mcp__browseros__take_screenshot'),
).toBe(false)
expect(shouldCaptureScreenshotForTool('Read')).toBe(false)
})
})

View File

@@ -7,8 +7,11 @@ import {
runSuiteCommand,
} from '../../src/cli/commands/suite'
import type { RunEvalOptions } from '../../src/runner/types'
import type { EvalSuite } from '../../src/suites/schema'
async function writeTempSuite(): Promise<{ dir: string; suitePath: string }> {
async function writeTempSuite(
overrides: Partial<EvalSuite> = {},
): Promise<{ dir: string; suitePath: string }> {
const dir = await mkdtemp(join(tmpdir(), 'eval-suite-cli-'))
const suitePath = join(dir, 'agisdk-daily-10.json')
await writeFile(
@@ -23,8 +26,9 @@ async function writeTempSuite(): Promise<{ dir: string; suitePath: string }> {
restartBrowserPerTask: true,
browseros: {
server_url: 'http://127.0.0.1:9110',
headless: true,
headless: false,
},
...overrides,
},
null,
2,
@@ -43,9 +47,7 @@ describe('suite command', () => {
expect(resolved.kind).toBe('config')
expect(resolved.suite.id).toBe('browseros-agent-weekly')
expect(resolved.evalConfig.dataset).toBe(
'../../data/webbench-2of4-50.jsonl',
)
expect(resolved.evalConfig.dataset).toBe('../../data/agisdk-real.jsonl')
expect(resolved.variant.publicMetadata.agent.apiKeyConfigured).toBe(true)
})
@@ -75,6 +77,25 @@ describe('suite command', () => {
expect(resolved.evalConfig.num_workers).toBe(2)
})
it('resolves claude-code suites without provider API credentials', async () => {
const { dir, suitePath } = await writeTempSuite({
agent: { type: 'claude-code' },
})
const resolved = await resolveSuiteCommand({
suitePath,
model: 'opus',
env: {},
})
expect(resolved.kind).toBe('suite')
expect(resolved.evalConfig.agent).toMatchObject({
type: 'claude-code',
model: 'opus',
})
expect(resolved.datasetPath).toBe(join(dir, 'tasks.jsonl'))
})
it('runs config and suite commands through the runner dependency', async () => {
const calls: RunEvalOptions[] = []
await runSuiteCommand(

View File

@@ -1,5 +1,5 @@
import { describe, expect, it } from 'bun:test'
import { mkdtemp, writeFile } from 'node:fs/promises'
import { chmod, mkdtemp, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { runPythonJsonEvaluator } from '../../src/grading/python-evaluator'
@@ -11,6 +11,17 @@ async function writeScript(source: string): Promise<string> {
return script
}
async function writePythonWrapper(): Promise<string> {
const dir = await mkdtemp(join(tmpdir(), 'eval-python-wrapper-'))
const wrapper = join(dir, 'python-wrapper')
await writeFile(
wrapper,
'#!/bin/sh\necho custom-python >&2\nexec python3 "$@"\n',
)
await chmod(wrapper, 0o755)
return wrapper
}
describe('runPythonJsonEvaluator', () => {
it('sends JSON on stdin, captures stderr, and parses stdout JSON', async () => {
const script = await writeScript(`
@@ -49,6 +60,34 @@ sys.exit(3)
).rejects.toThrow('bad verifier')
})
it('uses BROWSEROS_EVAL_PYTHON when provided', async () => {
const script = await writeScript(`
import json, sys
data = json.loads(sys.stdin.read())
print(json.dumps({"ok": data["ok"]}))
`)
const wrapper = await writePythonWrapper()
const previousPythonPath = process.env.BROWSEROS_EVAL_PYTHON
process.env.BROWSEROS_EVAL_PYTHON = wrapper
try {
const result = await runPythonJsonEvaluator<{ ok: boolean }>({
scriptPath: script,
input: { ok: true },
timeoutMs: 5_000,
})
expect(result.output).toEqual({ ok: true })
expect(result.stderr).toContain('custom-python')
} finally {
if (previousPythonPath === undefined) {
delete process.env.BROWSEROS_EVAL_PYTHON
} else {
process.env.BROWSEROS_EVAL_PYTHON = previousPythonPath
}
}
})
it('enforces timeouts', async () => {
const script = await writeScript(`
import time

View File

@@ -1,15 +1,18 @@
import { describe, expect, it } from 'bun:test'
import { mkdtemp, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { adaptEvalConfigFile } from '../../src/suites/config-adapter'
describe('adaptEvalConfigFile', () => {
it('preserves browseros-agent-weekly config semantics', async () => {
it('preserves browseros-agent-weekly AGI SDK config semantics', async () => {
const adapted = await adaptEvalConfigFile(
'apps/eval/configs/legacy/browseros-agent-weekly.json',
)
expect(adapted.suite.id).toBe('browseros-agent-weekly')
expect(adapted.suite.dataset).toBe('../../data/webbench-2of4-50.jsonl')
expect(adapted.suite.graders).toEqual(['performance_grader'])
expect(adapted.suite.dataset).toBe('../../data/agisdk-real.jsonl')
expect(adapted.suite.graders).toEqual(['agisdk_state_diff'])
expect(adapted.suite.workers).toBe(10)
expect(adapted.suite.restartBrowserPerTask).toBe(true)
expect(adapted.suite.timeoutMs).toBe(1_800_000)
@@ -34,4 +37,33 @@ describe('adaptEvalConfigFile', () => {
'secret-openrouter-value',
)
})
it('adapts claude-code configs without provider credentials', async () => {
const dir = await mkdtemp(join(tmpdir(), 'claude-code-config-'))
const configPath = join(dir, 'claude-code-agisdk.json')
await writeFile(
configPath,
JSON.stringify({
agent: {
type: 'claude-code',
model: 'opus',
},
dataset: 'tasks.jsonl',
num_workers: 1,
restart_server_per_task: false,
browseros: {
server_url: 'http://127.0.0.1:9110',
headless: false,
},
}),
)
const adapted = await adaptEvalConfigFile(configPath, { env: {} })
expect(adapted.suite.agent).toEqual({ type: 'claude-code' })
expect(adapted.variant.agent).toMatchObject({
provider: 'claude-code',
model: 'opus',
})
})
})

View File

@@ -35,6 +35,16 @@ describe('EvalSuiteSchema', () => {
expect(parsed.success).toBe(false)
})
it('validates claude-code suites', () => {
const suite = EvalSuiteSchema.parse({
id: 'claude-code-agisdk',
dataset: 'data/agisdk-real.jsonl',
agent: { type: 'claude-code' },
})
expect(suite.agent.type).toBe('claude-code')
})
it('validates the daily AGISDK 10-task suite', async () => {
const loaded = await loadSuite(
'apps/eval/configs/suites/agisdk-daily-10.json',
@@ -89,4 +99,40 @@ describe('resolveVariant', () => {
}),
).toThrow('EVAL_AGENT_API_KEY')
})
it('resolves claude-code variants without model or API key requirements', () => {
const variant = resolveVariant({
variantId: 'claude-opus',
provider: 'claude-code',
model: 'opus',
env: {},
})
expect(variant.id).toBe('claude-opus')
expect(variant.agent).toEqual({
provider: 'claude-code',
model: 'opus',
})
expect(variant.publicMetadata.agent).toEqual({
provider: 'claude-code',
model: 'opus',
apiKeyConfigured: false,
})
const defaultVariant = resolveVariant({
provider: 'claude-code',
env: {},
})
expect(defaultVariant.id).toBe('claude-code')
expect(defaultVariant.agent).toEqual({
provider: 'claude-code',
model: '',
})
expect(defaultVariant.publicMetadata.agent).toEqual({
provider: 'claude-code',
model: 'default',
apiKeyConfigured: false,
})
})
})