// BrowserOS/packages/browseros-agent/apps/eval/tests/grading/performance-artifacts.test.ts

import { describe, expect, it } from 'bun:test'
import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { PerformanceGrader } from '../../src/graders/performance/performance-grader'
import type { GraderInput } from '../../src/grading/types'

describe('PerformanceGrader artifacts', () => {
  /**
   * Reads and JSON-parses one file from the performance grader's artifact
   * directory (`grader-artifacts/performance_grader`) beneath `rootDir`.
   */
  const readArtifact = async (
    rootDir: string,
    fileName: string,
  ): Promise<unknown> =>
    JSON.parse(
      await readFile(
        join(rootDir, 'grader-artifacts/performance_grader', fileName),
        'utf-8',
      ),
    )

  /**
   * Grades a minimal fixture run with a stubbed `runAgent` and checks that
   * `metrics.json`, `axes.json`, and `agent-output.json` are written under
   * the output directory with the expected contents.
   */
  it('writes metrics, agent output, and axes artifacts', async () => {
    const dir = await mkdtemp(join(tmpdir(), 'performance-artifacts-'))
    try {
      // Fixture layout the grader is given: a screenshots folder and a
      // metadata file describing how the run terminated.
      await mkdir(join(dir, 'screenshots'))
      await writeFile(
        join(dir, 'metadata.json'),
        JSON.stringify({ termination_reason: 'completed' }),
      )

      const grader = new PerformanceGrader(undefined, undefined, 'claude-test')
      // The cast gives the test write access to `runAgent` so it can be
      // replaced with a canned result instead of the real implementation.
      const internals = grader as unknown as {
        runAgent(
          systemPrompt: string,
          userPrompt: string,
          outputDir: string,
        ): Promise<{
          type: 'result'
          subtype: string
          result: string
          total_cost_usd: number
          num_turns: number
          structured_output: unknown
        }>
      }
      internals.runAgent = async () => ({
        type: 'result',
        subtype: 'success',
        result: 'ok',
        total_cost_usd: 0.01,
        num_turns: 2,
        structured_output: {
          axes: [
            { axis: 'task_completion', score: 90, reasoning: 'completed' },
          ],
        },
      })

      const input: GraderInput = {
        task: {
          query_id: 'task-1',
          query: 'Find the answer',
          dataset: 'fixture',
        },
        // A single tool call, so the metrics artifact should report exactly
        // one total tool call.
        messages: [
          {
            type: 'tool-input-available',
            timestamp: '2026-04-29T00:00:00.000Z',
            toolCallId: 'call-1',
            toolName: 'browser_get_page_content',
            input: {},
          },
        ],
        screenshotCount: 1,
        finalAnswer: 'answer',
        taskArtifactDir: dir,
        outputDir: dir,
      }

      const result = await grader.grade(input)

      expect(result.details?.model).toBe('claude-test')
      expect(await readArtifact(dir, 'metrics.json')).toMatchObject({
        totalToolCalls: 1,
      })
      expect(await readArtifact(dir, 'axes.json')).toMatchObject({
        task_completion: { score: 90 },
      })
      expect(await readArtifact(dir, 'agent-output.json')).toMatchObject({
        subtype: 'success',
      })
    } finally {
      // Remove the temp directory even when an assertion fails, so repeated
      // runs do not accumulate leftover artifacts in the OS temp dir.
      await rm(dir, { recursive: true, force: true })
    }
  })
})