Compare commits

...

3 Commits

Author SHA1 Message Date
shivammittal274
3dd1e7918c feat(eval): capture server stdout/stderr and dump on health-check timeout
The AppManager was discarding server output, so 'Server health check
timed out' gave us no idea why the server was unhealthy. Pipe both
streams into a per-worker log file and tail the last 40 lines on
failure so the next CI run reveals the actual error.
2026-04-29 02:17:07 +05:30
shivammittal274
1bc988fd01 ci(eval): pull BrowserOS from rolling stable CDN URL
The pinned v0.44.0.1 .deb on GitHub releases regressed on Linux —
servers start but never become healthy. Switch to the canonical rolling
URL at cdn.browseros.com/download/BrowserOS.deb so CI tracks the same
stable channel users get from the marketing site.
2026-04-29 02:08:43 +05:30
shivammittal274
a1966911d9 refactor(eval): drop unused agents/graders, collapse registries
Sweep of dead code in the eval app: deleted gemini-computer-use and
yutori-navigator agents, fara/webvoyager/mind2web graders, eight
debug/analyze/test scripts, three stale planning docs, and the orphaned
eval-targets/coordinate-click testbed.

With two agents and three graders left, the Map-backed plugin registries
were over-engineered — collapsed both into plain switches. Removed the
now-dead GraderOptions plumbing (no remaining grader takes API keys),
dropped grader_api_key_env/grader_base_url/grader_model from the schema
and configs, and de-duped PASS_FAIL_GRADER_ORDER (was defined in three
places). Replaced the URL-parsing extractCdpPort hack in single-agent
and orchestrator-executor with workerIndex passed cleanly through
AgentContext.

README and --help text rewritten to match reality. Renamed
configs/test_*.json to test-*.json for kebab-case consistency.

Net: ~10,460 LOC removed across 60 files. Typecheck clean, all tests
pass.
2026-04-29 01:37:01 +05:30
61 changed files with 203 additions and 10590 deletions

View File

@@ -30,8 +30,9 @@ jobs:
       - name: Install BrowserOS
         run: |
-          wget -q https://github.com/browseros-ai/BrowserOS/releases/download/v0.44.0.1/BrowserOS_v0.44.0.1_amd64.deb
-          sudo dpkg -i BrowserOS_v0.44.0.1_amd64.deb
+          # Rolling stable channel — see https://cdn.browseros.com/download/BrowserOS.deb
+          wget -q -O BrowserOS.deb https://cdn.browseros.com/download/BrowserOS.deb
+          sudo dpkg -i BrowserOS.deb
           browseros --version || echo "BrowserOS installed at $(which browseros)"
       - name: Install Bun

View File

@@ -1,875 +0,0 @@
# Eval System - Production Grade Design Doc
## Current State Analysis
### What's Working Well
1. **Zod validation** - Already exists in `config-validator.ts`, reuses `LLMConfigSchema` from `@browseros/shared`
2. **Grader registry pattern** - `createGrader()` factory works well, easy to add new graders
3. **AgentEvaluator interface** - Clean interface: `execute() → AgentResult`
4. **Discriminated unions** - Messages, agent types use proper TypeScript patterns
5. **Capture utilities** - `ScreenshotCapture`, `MessageLogger`, `TrajectorySaver` are modular
### Key Problems
**1. No Agent Registry/Factory**
Agent creation is a hardcoded if-else chain in `task-executor.ts`:
```typescript
// Current approach - not scalable
if (this.config.agent.type === 'single') {
  const evaluator = new SingleAgentEvaluator(...)
} else if (this.config.agent.type === 'orchestrator-executor') {
  const evaluator = new OrchestratorExecutorEvaluator(...)
}
// Adding new agent = modify this file
```
**2. Heavy Server Dependency**
Imports from `@browseros/server`:
- `GeminiAgent` - Core agent (necessary)
- `ToolExecutionHooks` - Hook interface
- `ResolvedAgentConfig` - Agent config type
- `AgentExecutionError` - Error type
- `VercelAIContentGenerator` - Provider adapter
- Gateway client functions
**3. Scattered Types**
- `src/types.ts` - Main types
- `agents/types.ts` - Agent interface
- `agents/orchestrator-executor/types.ts` - Orchestrator types
- `runner/types.ts` - Runner types
- `graders/types.ts` - Grader types
**4. Duplicated Capture Logic**
Both agent evaluators duplicate:
- Initialize ScreenshotCapture
- Initialize MessageLogger
- Set up tool hooks
- Handle timeouts
- Collect errors/warnings
**5. No Unified Utils**
Hook setup, screenshot capture, and message-logging code is copy-pasted for each agent type.
---
## Design Goals
1. **Easy to add new agents** - Register new agent type, implement interface, done
2. **Shared capture infrastructure** - All agents use same screenshot/logging utils
3. **Type-safe with Zod** - Config validation at entry point
4. **Minimal server coupling** - Only import what's necessary
5. **Clear folder structure** - Types where they belong
6. **Production patterns** - Factory, registry, composition
---
## Proposed Architecture
### Folder Structure
```
eval/src/
├── index.ts                 # Entry point, CLI
├── types/
│   ├── index.ts             # Re-exports all types
│   ├── config.ts            # EvalConfig, AgentConfig (Zod schemas + types)
│   ├── task.ts              # Task, TaskMetadata
│   ├── message.ts           # Message discriminated union
│   ├── result.ts            # AgentResult, GraderResult
│   └── errors.ts            # ErrorSource, TaskError, EvalWarning
├── agents/
│   ├── index.ts             # Re-exports + auto-registration
│   ├── registry.ts          # Agent registry + factory
│   ├── types.ts             # AgentEvaluator interface, AgentContext
│   ├── single/
│   │   └── index.ts         # SingleAgentEvaluator
│   └── orchestrator-executor/
│       ├── index.ts         # OrchestratorExecutorEvaluator
│       ├── types.ts         # Orchestrator-specific types only
│       ├── orchestrator.ts
│       ├── orchestrator-agent.ts
│       ├── orchestrator-tools.ts
│       ├── executor.ts
│       └── executor-store.ts
├── capture/
│   ├── index.ts             # Re-exports
│   ├── types.ts             # CaptureContext interface
│   ├── context.ts           # CaptureContext class (bundles all capture)
│   ├── hooks.ts             # createCaptureHooks() utility
│   ├── screenshot.ts        # ScreenshotCapture
│   ├── message-logger.ts    # MessageLogger
│   ├── trajectory-saver.ts  # TrajectorySaver
│   └── window-manager.ts    # WindowManager
├── graders/
│   ├── index.ts             # Re-exports
│   ├── registry.ts          # Grader registry (existing pattern)
│   ├── types.ts             # Grader interface
│   ├── benchmark/
│   │   ├── webvoyager.ts
│   │   └── mind2web.ts
│   └── fara/
│       ├── alignment.ts
│       ├── rubric.ts
│       ├── multimodal.ts
│       └── combined.ts
├── runner/
│   ├── index.ts             # runEval() main entry
│   ├── types.ts             # RunEvalOptions, TaskResult, BatchSummary
│   ├── task-loader.ts
│   ├── task-executor.ts
│   └── parallel-executor.ts
└── utils/
    ├── env.ts               # resolveEnvValue() helper
    └── validation.ts        # Config validation logic
```
---
## Key Components
### 1. Type System (`types/`)
**`types/config.ts`** - Zod schemas + inferred types:
```typescript
import { LLMConfigSchema, LLMProviderSchema } from '@browseros/shared/schemas/llm'
import { z } from 'zod'
// Single agent config
export const SingleAgentConfigSchema = LLMConfigSchema.extend({
  type: z.literal('single'),
})
export type SingleAgentConfig = z.infer<typeof SingleAgentConfigSchema>

// Orchestrator-executor config
export const OrchestratorExecutorConfigSchema = z.object({
  type: z.literal('orchestrator-executor'),
  orchestrator: LLMConfigSchema.extend({
    maxTurns: z.number().int().min(1).optional(),
  }),
  executor: LLMConfigSchema.extend({
    maxStepsPerDelegation: z.number().int().min(1).optional(),
  }),
})
export type OrchestratorExecutorConfig = z.infer<typeof OrchestratorExecutorConfigSchema>

// Discriminated union
export const AgentConfigSchema = z.discriminatedUnion('type', [
  SingleAgentConfigSchema,
  OrchestratorExecutorConfigSchema,
])
export type AgentConfig = z.infer<typeof AgentConfigSchema>

// Full eval config
export const EvalConfigSchema = z.object({
  agent: AgentConfigSchema,
  dataset: z.string().min(1),
  output_dir: z.string().optional(),
  num_workers: z.number().int().min(1).max(20).default(1),
  browseros: z.object({
    server_url: z.string().url(),
  }),
  grader_model: z.string().optional(),
  grader_api_key_env: z.string().optional(),
  grader_base_url: z.string().url().optional(),
  timeout_ms: z.number().int().min(30000).max(3600000).optional(),
})
export type EvalConfig = z.infer<typeof EvalConfigSchema>
```
**`types/message.ts`** - Message types:
```typescript
import { z } from 'zod'
const BaseMessageSchema = z.object({
  timestamp: z.string().datetime(),
})

export const UserMessageSchema = BaseMessageSchema.extend({
  type: z.literal('user'),
  content: z.string(),
})

export const AssistantMessageSchema = BaseMessageSchema.extend({
  type: z.literal('assistant'),
  content: z.string(),
})

export const ToolCallMessageSchema = BaseMessageSchema.extend({
  type: z.literal('tool_call'),
  tool: z.string(),
  toolCallId: z.string(),
  params: z.record(z.unknown()),
})

export const ToolResultMessageSchema = BaseMessageSchema.extend({
  type: z.literal('tool_result'),
  toolCallId: z.string(),
  result: z.unknown(),
  isError: z.boolean(),
  screenshot: z.number().optional(),
})

export const ErrorMessageSchema = BaseMessageSchema.extend({
  type: z.literal('error'),
  content: z.string(),
  errorCode: z.string().optional(),
})

// Orchestrator-specific messages
export const DelegationMessageSchema = BaseMessageSchema.extend({
  type: z.literal('delegation'),
  instruction: z.string(),
  executorId: z.string(),
  maxSteps: z.number().optional(),
})

export const DelegationResultMessageSchema = BaseMessageSchema.extend({
  type: z.literal('delegation_result'),
  executorId: z.string(),
  summary: z.string(),
  status: z.enum(['done', 'blocked', 'max_steps']),
  stepsUsed: z.number(),
  currentUrl: z.string().optional(),
})

export const MessageSchema = z.discriminatedUnion('type', [
  UserMessageSchema,
  AssistantMessageSchema,
  ToolCallMessageSchema,
  ToolResultMessageSchema,
  ErrorMessageSchema,
  DelegationMessageSchema,
  DelegationResultMessageSchema,
])
export type Message = z.infer<typeof MessageSchema>
export type UserMessage = z.infer<typeof UserMessageSchema>
export type AssistantMessage = z.infer<typeof AssistantMessageSchema>
export type ToolCallMessage = z.infer<typeof ToolCallMessageSchema>
export type ToolResultMessage = z.infer<typeof ToolResultMessageSchema>
export type ErrorMessage = z.infer<typeof ErrorMessageSchema>
export type DelegationMessage = z.infer<typeof DelegationMessageSchema>
export type DelegationResultMessage = z.infer<typeof DelegationResultMessageSchema>
// Type guards
export const isToolCallMessage = (m: Message): m is ToolCallMessage => m.type === 'tool_call'
export const isDelegationMessage = (m: Message): m is DelegationMessage => m.type === 'delegation'
// ... etc
```
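Because `MessageSchema` is a discriminated union, a recorded trace can be re-validated line by line after a run. A minimal sketch of that check, assuming traces are written as one JSON message per line to a `messages.jsonl` file (the trace path and script location are illustrative, not fixed by this design):

```typescript
import { readFileSync } from 'node:fs'
// Import path assumes this script lives next to the types/ folder in eval/src/
import { MessageSchema, type Message } from './types/message'

// Parse and validate every line of a messages.jsonl trace.
// Throws with a precise Zod error if any line drifts from the schema.
export function loadTrace(path: string): Message[] {
  return readFileSync(path, 'utf8')
    .split('\n')
    .filter((line) => line.trim().length > 0)
    .map((line) => MessageSchema.parse(JSON.parse(line)))
}

// Example: count delegations in an orchestrator-executor trace (path illustrative)
const messages = loadTrace('./output/task-001/messages.jsonl')
const delegations = messages.filter((m) => m.type === 'delegation').length
console.log(`${messages.length} messages, ${delegations} delegations`)
```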
---
### 2. Agent Registry (`agents/registry.ts`)
```typescript
import type { AgentContext, AgentEvaluator } from './types'
type AgentFactory = (context: AgentContext) => AgentEvaluator
const registry = new Map<string, AgentFactory>()

/**
 * Register an agent type
 */
export function registerAgent(type: string, factory: AgentFactory): void {
  if (registry.has(type)) {
    throw new Error(`Agent type "${type}" already registered`)
  }
  registry.set(type, factory)
}

/**
 * Create agent evaluator from context
 */
export function createAgent(context: AgentContext): AgentEvaluator {
  const factory = registry.get(context.config.agent.type)
  if (!factory) {
    const available = Array.from(registry.keys()).join(', ')
    throw new Error(
      `Unknown agent type: "${context.config.agent.type}". Available: ${available}`
    )
  }
  return factory(context)
}

/**
 * Get all registered agent types
 */
export function getRegisteredAgentTypes(): string[] {
  return Array.from(registry.keys())
}
```
**`agents/index.ts`** - Auto-registration:
```typescript
import { registerAgent } from './registry'
import { SingleAgentEvaluator } from './single'
import { OrchestratorExecutorEvaluator } from './orchestrator-executor'
// Auto-register built-in agents
registerAgent('single', (ctx) => new SingleAgentEvaluator(ctx))
registerAgent('orchestrator-executor', (ctx) => new OrchestratorExecutorEvaluator(ctx))
// Re-exports
export { createAgent, registerAgent, getRegisteredAgentTypes } from './registry'
export type { AgentContext, AgentEvaluator, AgentResult } from './types'
```
---
### 3. Agent Context (`agents/types.ts`)
```typescript
import type { CaptureContext } from '../capture/types'
import type { EvalConfig, Task, TaskMetadata, Message } from '../types'
/**
 * All dependencies an agent needs - passed to factory
 */
export interface AgentContext {
  // Config
  config: EvalConfig
  task: Task
  // Browser window
  windowId: number
  tabId: number
  // Output
  outputDir: string     // Root output dir
  taskOutputDir: string // Task-specific: outputDir/query_id/
  // Capture infrastructure (pre-initialized)
  capture: CaptureContext
}

/**
 * Result returned by agent execution
 */
export interface AgentResult {
  metadata: TaskMetadata
  messages: Message[]
  finalAnswer: string | null
}

/**
 * Interface all agent evaluators must implement
 */
export interface AgentEvaluator {
  /**
   * Execute the agent on the task
   */
  execute(): Promise<AgentResult>
}
```
---
### 4. Capture Context (`capture/context.ts`)
Bundle all capture utilities:
```typescript
import { randomUUID } from 'node:crypto'
import type { ToolExecutionHooks, ToolExecutionResult } from '@browseros/server/agent'
import type { Message, TaskError, EvalWarning, ErrorSource } from '../types'
import { MessageLogger } from './message-logger'
import { ScreenshotCapture } from './screenshot'
import { TrajectorySaver } from './trajectory-saver'
export interface CaptureContextConfig {
  serverUrl: string
  outputDir: string
  taskId: string
  tabId: number
  windowId: number
}

/**
 * Unified capture context - bundles screenshot, message logging, errors/warnings
 */
export class CaptureContext {
  // Assigned in init() rather than the constructor, so definite-assignment
  // assertions are used here instead of readonly
  screenshot!: ScreenshotCapture
  messageLogger!: MessageLogger
  readonly trajectorySaver: TrajectorySaver
  private errors: TaskError[] = []
  private warnings: EvalWarning[] = []
  private currentToolCallId: string | null = null
  private readonly tabId: number
  private readonly windowId: number

  constructor(private config: CaptureContextConfig) {
    this.tabId = config.tabId
    this.windowId = config.windowId
    this.trajectorySaver = new TrajectorySaver(config.outputDir, config.taskId)
  }

  /**
   * Initialize - must be called before use
   */
  async init(): Promise<string> {
    const taskOutputDir = await this.trajectorySaver.init()
    this.screenshot = new ScreenshotCapture(this.config.serverUrl, taskOutputDir)
    await this.screenshot.init()
    this.messageLogger = new MessageLogger(taskOutputDir)
    return taskOutputDir
  }

  /**
   * Create tool execution hooks for GeminiAgent
   */
  createToolHooks(): ToolExecutionHooks {
    return {
      onBeforeToolCall: async (toolName: string, args: unknown) => {
        try {
          this.currentToolCallId = randomUUID()
          await this.messageLogger.logToolCall(
            toolName,
            this.currentToolCallId,
            args as Record<string, unknown>
          )
        } catch (err) {
          this.addWarning('message_logging', `Failed to log tool call ${toolName}: ${err}`)
        }
      },
      onAfterToolCall: async (toolName: string, result: ToolExecutionResult) => {
        let screenshotNum = 0
        // Capture screenshot
        try {
          screenshotNum = await this.screenshot.capture(this.tabId, this.windowId)
        } catch (err) {
          this.addWarning('screenshot', `Screenshot after ${toolName} failed: ${err}`)
          screenshotNum = this.screenshot.getCount()
        }
        // Log tool errors
        if (result.isError) {
          this.addWarning('mcp_tool', `Tool ${toolName} error: ${result.errorMessage}`)
        }
        // Log result
        if (this.currentToolCallId) {
          try {
            await this.messageLogger.logToolResult(
              this.currentToolCallId,
              result.isError ? { error: result.errorMessage } : result.parts,
              result.isError,
              screenshotNum
            )
          } catch (err) {
            this.addWarning('message_logging', `Failed to log tool result: ${err}`)
          }
        }
        this.currentToolCallId = null
      },
    }
  }

  // Error/warning collection
  addError(source: ErrorSource, message: string, details?: Record<string, unknown>): void {
    this.errors.push({ source, message, timestamp: new Date().toISOString(), details })
  }
  addWarning(source: ErrorSource, message: string): void {
    this.warnings.push({ source, message, timestamp: new Date().toISOString() })
    console.warn(`[${source}] ${message}`)
  }
  getErrors(): TaskError[] { return [...this.errors] }
  getWarnings(): EvalWarning[] { return [...this.warnings] }
  getMessages(): Message[] { return this.messageLogger.getMessages() }
  getScreenshotCount(): number { return this.screenshot.getCount() }
  getLastAssistantMessage(): string | null { return this.messageLogger.getLastAssistantMessage() }

  // Delegation logging (for orchestrator-executor)
  async logDelegation(instruction: string, executorId: string, maxSteps?: number): Promise<void> {
    await this.messageLogger.logDelegation(instruction, executorId, maxSteps)
  }
  async logDelegationResult(
    executorId: string,
    summary: string,
    status: 'done' | 'blocked' | 'max_steps',
    stepsUsed: number,
    currentUrl?: string
  ): Promise<void> {
    await this.messageLogger.logDelegationResult(executorId, summary, status, stepsUsed, currentUrl)
  }
}
```
---
### 5. Single Agent Evaluator (`agents/single/index.ts`)
Clean implementation using context:
```typescript
import { randomUUID } from 'node:crypto'
import { GeminiAgent } from '@browseros/server/agent'
import { AgentExecutionError } from '@browseros/server/agent/errors'
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import { MCPServerConfig } from '@google/gemini-cli-core'
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
import type { SingleAgentConfig, TaskMetadata } from '../../types'
import { resolveEnvValue } from '../../utils/env'
const DEFAULT_TIMEOUT_MS = 15 * 60 * 1000
export class SingleAgentEvaluator implements AgentEvaluator {
  constructor(private ctx: AgentContext) {}

  async execute(): Promise<AgentResult> {
    const startTime = Date.now()
    const { config, task, capture } = this.ctx
    const agentConfig = config.agent as SingleAgentConfig
    const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS
    // Log initial user message
    await capture.messageLogger.logUser(task.query)
    // Set up timeout
    const abortController = new AbortController()
    const timeoutHandle = setTimeout(() => abortController.abort(), timeoutMs)
    // Create agent
    const resolvedConfig: ResolvedAgentConfig = {
      conversationId: randomUUID(),
      provider: agentConfig.provider,
      model: agentConfig.model ?? 'gemini-2.0-flash',
      apiKey: resolveEnvValue(agentConfig.apiKey),
      baseUrl: agentConfig.baseUrl,
      sessionExecutionDir: '/tmp/browseros-eval',
      evalMode: true,
    }
    const mcpServers = {
      'browseros-mcp': new MCPServerConfig(
        undefined, undefined, undefined, undefined, undefined,
        `${config.browseros.server_url}/mcp`,
        { Accept: 'application/json, text/event-stream', 'X-BrowserOS-Source': 'eval' },
        undefined, undefined, true
      ),
    }
    const agent = await GeminiAgent.create(resolvedConfig, mcpServers)
    // Set capture hooks
    agent.setToolHooks(capture.createToolHooks())
    // Create mock stream to capture assistant messages
    let lastAssistantMessage = ''
    const mockStream = {
      write: async (data: string) => {
        if (data.includes('"type":"text-delta"')) {
          const match = data.match(/"delta":"((?:[^"\\]|\\.)*)"/)
          if (match) lastAssistantMessage += JSON.parse(`"${match[1]}"`)
        } else if (data.includes('"type":"finish"')) {
          if (lastAssistantMessage) {
            await capture.messageLogger.logAssistant(lastAssistantMessage)
            lastAssistantMessage = ''
          }
        }
      },
    }
    // Execute
    let terminationReason: TaskMetadata['termination_reason'] = 'completed'
    try {
      await agent.execute(
        task.query,
        mockStream as Parameters<typeof agent.execute>[1],
        abortController.signal,
        { windowId: this.ctx.windowId, activeTab: { id: this.ctx.tabId, url: task.start_url } }
      )
    } catch (err) {
      const error = err instanceof Error ? err : new Error(String(err))
      if (abortController.signal.aborted) {
        terminationReason = 'timeout'
        capture.addError('agent_execution', `Task timed out after ${timeoutMs / 1000}s`)
      } else {
        terminationReason = 'error'
        const msg = err instanceof AgentExecutionError && err.originalError
          ? `${error.message}: ${err.originalError.message}`
          : error.message
        capture.addError('agent_execution', msg, { stack: error.stack })
      }
      await capture.messageLogger.logError(error.message)
    } finally {
      clearTimeout(timeoutHandle)
    }
    // Build metadata
    const metadata: TaskMetadata = {
      query_id: task.query_id,
      dataset: task.dataset,
      query: task.query,
      started_at: new Date(startTime).toISOString(),
      completed_at: new Date().toISOString(),
      total_duration_ms: Date.now() - startTime,
      total_steps: capture.getScreenshotCount(),
      termination_reason: terminationReason,
      final_answer: capture.getLastAssistantMessage(),
      errors: capture.getErrors(),
      warnings: capture.getWarnings(),
      agent_config: { type: 'single', model: resolvedConfig.model },
      grader_results: {},
    }
    await capture.trajectorySaver.saveMetadata(metadata)
    return {
      metadata,
      messages: capture.getMessages(),
      finalAnswer: metadata.final_answer,
    }
  }
}
```
---
### 6. Task Executor (`runner/task-executor.ts`)
Uses agent registry:
```typescript
import { createAgent } from '../agents'
import type { AgentContext } from '../agents/types'
import { CaptureContext } from '../capture/context'
import type { WindowManager } from '../capture/window-manager'
// Assumed locations per the folder structure above
import type { GraderOptions } from '../graders/types'
import type { EvalConfig, Task } from '../types'
import type { TaskResult } from './types'

export class TaskExecutor {
  constructor(
    private config: EvalConfig,
    private outputDir: string,
    private windowManager: WindowManager,
    private graderOptions: GraderOptions | null,
  ) {}

  async execute(task: Task): Promise<TaskResult> {
    const startTime = Date.now()
    let window: { windowId: number; tabId: number } | null = null
    try {
      // Create window
      window = await this.windowManager.createWindow(task.query_id, task.start_url)
      // Initialize capture context
      const capture = new CaptureContext({
        serverUrl: this.config.browseros.server_url,
        outputDir: this.outputDir,
        taskId: task.query_id,
        tabId: window.tabId,
        windowId: window.windowId,
      })
      const taskOutputDir = await capture.init()
      // Build agent context
      const context: AgentContext = {
        config: this.config,
        task,
        windowId: window.windowId,
        tabId: window.tabId,
        outputDir: this.outputDir,
        taskOutputDir,
        capture,
      }
      // Create and execute agent (via registry)
      const agent = createAgent(context)
      const agentResult = await agent.execute()
      // Run graders (runGraders() elided here)
      const graderResults = await this.runGraders(task, agentResult)
      return {
        status: agentResult.metadata.termination_reason === 'timeout' ? 'timeout' : 'completed',
        task,
        agentResult,
        graderResults,
        durationMs: Date.now() - startTime,
      }
    } catch (error) {
      return {
        status: 'failed',
        task,
        error: error instanceof Error ? error : new Error(String(error)),
        errorSource: 'unknown',
        durationMs: Date.now() - startTime,
      }
    } finally {
      if (window) {
        await this.windowManager.closeWindow(task.query_id)
      }
    }
  }
}
```
---
## Server Dependencies
### What We MUST Import from Server
These are necessary - `GeminiAgent` IS the agent:
```typescript
// Core agent
import { GeminiAgent, type ToolExecutionHooks, type ToolExecutionResult } from '@browseros/server/agent'
import { AgentExecutionError } from '@browseros/server/agent/errors'
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
// Provider adapter (for orchestrator-agent)
import { VercelAIContentGenerator } from '@browseros/server/agent/provider-adapter'
// Gateway client (for browseros provider only)
import { fetchBrowserOSConfig, getLLMConfigFromProvider } from '@browseros/server/lib/clients/gateway'
```
### What Could Move to Shared (Future)
If we want to decouple more:
```typescript
// These types could be in @browseros/shared
export interface ToolExecutionHooks { ... }
export interface ToolExecutionResult { ... }
export interface ResolvedAgentConfig { ... }
```
But for now, importing from server is fine - eval is tightly coupled to server anyway.
---
## Import Guidelines
```typescript
// Shared package - schemas, constants
import { LLMConfigSchema, LLMProviderSchema, LLM_PROVIDERS } from '@browseros/shared/schemas/llm'
import { TIMEOUTS } from '@browseros/shared/constants/timeouts'
import { AGENT_LIMITS } from '@browseros/shared/constants/limits'
import type { BrowserContext } from '@browseros/shared/schemas/browser-context'
// Server - only agent-related imports
import { GeminiAgent, type ToolExecutionHooks } from '@browseros/server/agent'
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
// Internal eval types - from types/ folder
import type { EvalConfig, Task, Message, AgentResult } from '../types'
import type { AgentContext, AgentEvaluator } from '../agents/types'
```
---
## Adding a New Agent Type
1. Create folder: `agents/my-new-agent/`
2. Implement `AgentEvaluator` interface:
```typescript
// agents/my-new-agent/index.ts
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
export class MyNewAgentEvaluator implements AgentEvaluator {
  constructor(private ctx: AgentContext) {}
  async execute(): Promise<AgentResult> {
    const { config, task, capture } = this.ctx
    // Use capture.createToolHooks() for screenshot/logging
    // Use capture.messageLogger for messages
    // Use capture.addError/addWarning for errors
    // Return AgentResult
  }
}
```
3. Register in `agents/index.ts`:
```typescript
import { MyNewAgentEvaluator } from './my-new-agent'
registerAgent('my-new-agent', (ctx) => new MyNewAgentEvaluator(ctx))
```
4. Add config schema in `types/config.ts`:
```typescript
export const MyNewAgentConfigSchema = z.object({
  type: z.literal('my-new-agent'),
  // ... specific fields
})
export const AgentConfigSchema = z.discriminatedUnion('type', [
  SingleAgentConfigSchema,
  OrchestratorExecutorConfigSchema,
  MyNewAgentConfigSchema, // Add here
])
```
Done - no changes to runner code needed.
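To see the whole dispatch path end to end, here is a hedged sketch: a config naming the new agent type passes `EvalConfigSchema` (the discriminated union accepts it once `MyNewAgentConfigSchema` is in the list), and the runner later resolves it through the registry. File names and field values below are illustrative, not fixed by this design:

```typescript
import { EvalConfigSchema } from './types/config'

// A raw config as it might appear in a configs/*.json file (illustrative values)
const raw = {
  agent: { type: 'my-new-agent' },
  dataset: 'datasets/webvoyager.jsonl',
  num_workers: 2,
  browseros: { server_url: 'http://localhost:3000' },
}

// Zod narrows `config.agent` by its `type` discriminant; an unknown
// type fails validation here, before the registry is ever consulted.
const config = EvalConfigSchema.parse(raw)

// Later, the runner builds a full AgentContext and calls createAgent(context),
// which looks up 'my-new-agent' in the registry populated by agents/index.ts.
```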
---
## Implementation Order
1. **Phase 1: Types** (~1 hour)
- Create `types/` folder with proper structure
- Move/consolidate all types
- Add Zod schemas for messages
2. **Phase 2: Capture Context** (~1 hour)
- Create `CaptureContext` class
- Add delegation message methods
- Create `createToolHooks()` utility
3. **Phase 3: Agent Registry** (~30 min)
- Create `registry.ts`
- Create `AgentContext` interface
- Update exports
4. **Phase 4: Refactor Single Agent** (~1 hour)
- Use `AgentContext`
- Use `CaptureContext`
- Clean up code
5. **Phase 5: Refactor Orchestrator-Executor** (~2 hours)
- Use `AgentContext`
- Integrate `CaptureContext`
- Wire up hooks properly
6. **Phase 6: Update Runner** (~30 min)
- Use `createAgent()` instead of if-else
- Initialize `CaptureContext` in executor
7. **Phase 7: Testing** (~1 hour)
- Run single-agent eval
- Run orchestrator-executor eval
- Verify screenshots/messages captured
---
## Summary
| Before | After |
|--------|-------|
| If-else agent creation | Registry + factory pattern |
| Duplicated capture code | Shared `CaptureContext` |
| Scattered types | Organized `types/` folder |
| Copy-paste hooks | `createToolHooks()` utility |
| Tight coupling | Clear interfaces |
| Hard to add agents | Register + implement |

View File

@@ -1,431 +0,0 @@
# Implementation Phases - Parallel Execution Plan
## Dependency Graph
```
Phase 1: Types (4 parallel subagents)
    ├──────────────────┬─────────────────────────┐
    ▼                  ▼                         │
Phase 2: Capture   Phase 3: Agent Registry       │
(2 parallel)       (1 subagent)                  │
    │                  │                         │
    └────────┬─────────┘                         │
             ▼                                   │
Phase 4: Agent Refactors                         │
(2 parallel - after 2+3)                         │
             │                                   │
             ▼                                   │
Phase 5: Runner Update                           │
(1 subagent - after 4)                           │
             │                                   │
             ▼                                   │
Phase 6: Cleanup & Test ◄────────────────────────┘
(1 subagent)
```
---
## Phase 1: Types (4 Parallel Subagents)
No dependencies - can all run simultaneously.
### Subagent 1A: Config Types
```
Create /apps/eval/src/types/config.ts
Requirements:
1. Import LLMConfigSchema, LLMProviderSchema from @browseros/shared/schemas/llm
2. Import z from zod
Create Zod schemas:
- SingleAgentConfigSchema = LLMConfigSchema.extend({ type: z.literal('single') })
- OrchestratorExecutorConfigSchema with orchestrator + executor nested configs
- AgentConfigSchema = z.discriminatedUnion('type', [...])
- EvalConfigSchema with all fields (agent, dataset, output_dir, num_workers, browseros, grader_*, timeout_ms)
Export both schemas and inferred types (z.infer<>)
Reference: Current implementation in /apps/eval/src/utils/config-validator.ts (lines 1-42)
```
### Subagent 1B: Message Types
```
Create /apps/eval/src/types/message.ts
Requirements:
1. Use Zod for all schemas
2. Create BaseMessageSchema with timestamp field
Create schemas for:
- UserMessageSchema (type: 'user', content)
- AssistantMessageSchema (type: 'assistant', content)
- ToolCallMessageSchema (type: 'tool_call', tool, toolCallId, params)
- ToolResultMessageSchema (type: 'tool_result', toolCallId, result, isError, screenshot?)
- ErrorMessageSchema (type: 'error', content, errorCode?)
- DelegationMessageSchema (type: 'delegation', instruction, executorId, maxSteps?)
- DelegationResultMessageSchema (type: 'delegation_result', executorId, summary, status, stepsUsed, currentUrl?)
Create MessageSchema = z.discriminatedUnion('type', [...all schemas])
Export schemas, types, and type guards (isToolCallMessage, isDelegationMessage, etc.)
Reference: Current types in /apps/eval/src/types.ts (lines 62-127)
```
### Subagent 1C: Task & Result Types
```
Create /apps/eval/src/types/task.ts
Requirements:
1. Use Zod schemas with inferred types
Create:
- TaskMetadataSchema (original_task_id, website?, category?, additional?)
- TaskSchema (query_id, dataset, query, graders[], start_url?, setup_script?, metadata)
Export schemas and types.
---
Create /apps/eval/src/types/result.ts
Create:
- GraderResultSchema (score, pass, reasoning, details?)
- TaskMetadataSchema (query_id, dataset, query, started_at, completed_at, total_duration_ms, total_steps, termination_reason, final_answer, errors, warnings, agent_config, grader_results)
- AgentResultSchema (metadata, messages, finalAnswer)
Export schemas and types.
Reference: Current types in /apps/eval/src/types.ts (lines 6-20, 156-182)
```
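Note that the Subagent 1C spec above names a `TaskMetadataSchema` in both `task.ts` and `result.ts`, which would collide when both are re-exported from `types/index.ts`. A sketch of `task.ts` under that spec, with the task-side schema renamed to sidestep the collision (the rename, the `.url()` constraint on `start_url`, and the element types of `graders` and `additional` are my assumptions, not in the prompt); `result.ts` would follow the same pattern:

```typescript
import { z } from 'zod'

// Task-side metadata: renamed from the prompt's TaskMetadataSchema to avoid
// colliding with the result-side TaskMetadataSchema on re-export (assumption)
export const TaskSourceMetadataSchema = z.object({
  original_task_id: z.string(),
  website: z.string().optional(),
  category: z.string().optional(),
  additional: z.record(z.unknown()).optional(), // shape assumed
})

export const TaskSchema = z.object({
  query_id: z.string(),
  dataset: z.string(),
  query: z.string(),
  graders: z.array(z.string()), // grader names; element type assumed
  start_url: z.string().url().optional(),
  setup_script: z.string().optional(),
  metadata: TaskSourceMetadataSchema,
})

export type TaskSourceMetadata = z.infer<typeof TaskSourceMetadataSchema>
export type Task = z.infer<typeof TaskSchema>
```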
### Subagent 1D: Error Types + Index
```
Create /apps/eval/src/types/errors.ts
Create:
- ErrorSourceSchema = z.enum(['window_creation', 'agent_execution', 'mcp_tool', 'screenshot', 'grader', 'message_logging', 'cleanup', 'unknown'])
- TaskErrorSchema (source, message, timestamp, details?)
- EvalWarningSchema (source, message, timestamp)
Export schemas and types.
---
Create /apps/eval/src/types/index.ts
Re-export everything from:
- ./config
- ./message
- ./task
- ./result
- ./errors
This becomes the single import point: import { EvalConfig, Message, Task } from '../types'
Reference: Current types in /apps/eval/src/types.ts (lines 129-154)
```
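For concreteness, a compact sketch of `errors.ts` under the spec above. The field types mirror how `addError`/`addWarning` construct these objects elsewhere in this compare (ISO-string timestamp, optional details record); the `.omit` shortcut for the warning schema is my choice:

```typescript
import { z } from 'zod'

export const ErrorSourceSchema = z.enum([
  'window_creation', 'agent_execution', 'mcp_tool', 'screenshot',
  'grader', 'message_logging', 'cleanup', 'unknown',
])

export const TaskErrorSchema = z.object({
  source: ErrorSourceSchema,
  message: z.string(),
  timestamp: z.string().datetime(),
  details: z.record(z.unknown()).optional(),
})

// Warnings are errors without the details payload
export const EvalWarningSchema = TaskErrorSchema.omit({ details: true })

export type ErrorSource = z.infer<typeof ErrorSourceSchema>
export type TaskError = z.infer<typeof TaskErrorSchema>
export type EvalWarning = z.infer<typeof EvalWarningSchema>
```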
---
## Phase 2: Capture Infrastructure (2 Parallel Subagents)
**Depends on:** Phase 1 (types)
### Subagent 2A: CaptureContext Class
```
Create /apps/eval/src/capture/types.ts
Define interface:
- CaptureContextConfig { serverUrl, outputDir, taskId, tabId, windowId }
---
Create /apps/eval/src/capture/context.ts
Requirements:
1. Import ToolExecutionHooks, ToolExecutionResult from @browseros/server/agent
2. Import types from ../types
3. Import existing ScreenshotCapture, MessageLogger, TrajectorySaver
Implement CaptureContext class:
- Constructor takes CaptureContextConfig
- async init() - initializes screenshot, messageLogger, trajectorySaver, returns taskOutputDir
- createToolHooks(): ToolExecutionHooks - returns hooks for GeminiAgent
- addError(source, message, details?)
- addWarning(source, message)
- getErrors(), getWarnings(), getMessages(), getScreenshotCount(), getLastAssistantMessage()
- logDelegation(instruction, executorId, maxSteps?)
- logDelegationResult(executorId, summary, status, stepsUsed, currentUrl?)
Reference implementation details in DESIGN_DOC.md section "4. Capture Context"
Update /apps/eval/src/capture/index.ts to export CaptureContext
```
### Subagent 2B: MessageLogger Extensions
```
Update /apps/eval/src/capture/message-logger.ts
Add two new methods:
1. logDelegation(instruction: string, executorId: string, maxSteps?: number): Promise<void>
- Creates DelegationMessage with type: 'delegation'
- Appends to messages
2. logDelegationResult(executorId: string, summary: string, status: 'done' | 'blocked' | 'max_steps', stepsUsed: number, currentUrl?: string): Promise<void>
- Creates DelegationResultMessage with type: 'delegation_result'
- Appends to messages
Import DelegationMessage, DelegationResultMessage from ../types
Reference: Current MessageLogger in /apps/eval/src/capture/message-logger.ts
```
---
## Phase 3: Agent Registry (1 Subagent)
**Depends on:** Phase 1 (types)
**Can run parallel with:** Phase 2
### Subagent 3A: Agent Registry + Types
```
Create /apps/eval/src/agents/types.ts
Define:
- AgentContext interface:
  {
    config: EvalConfig
    task: Task
    windowId: number
    tabId: number
    outputDir: string
    taskOutputDir: string
    capture: CaptureContext
  }
- AgentResult interface (re-export from ../types or define here)
- AgentEvaluator interface { execute(): Promise<AgentResult> }
---
Create /apps/eval/src/agents/registry.ts
Implement:
- type AgentFactory = (context: AgentContext) => AgentEvaluator
- const registry = new Map<string, AgentFactory>()
- registerAgent(type: string, factory: AgentFactory): void
- createAgent(context: AgentContext): AgentEvaluator
- getRegisteredAgentTypes(): string[]
Reference: DESIGN_DOC.md section "2. Agent Registry"
---
Update /apps/eval/src/agents/index.ts
- Import registerAgent from ./registry
- Import SingleAgentEvaluator (will be updated later)
- Import OrchestratorExecutorEvaluator (will be updated later)
- Call registerAgent for both
- Re-export createAgent, registerAgent, getRegisteredAgentTypes
- Re-export types
Note: Registration calls will fail initially until agents are refactored.
That's OK - add TODO comments for now.
```
---
## Phase 4: Agent Refactors (2 Parallel Subagents)
**Depends on:** Phase 2 + Phase 3
### Subagent 4A: Single Agent Refactor
```
Refactor /apps/eval/src/agents/single-agent.ts
Changes:
1. Change constructor to accept AgentContext instead of individual params:
constructor(private ctx: AgentContext) {}
2. Use ctx.capture instead of creating ScreenshotCapture/MessageLogger:
- Remove local ScreenshotCapture initialization
- Remove local MessageLogger initialization
- Remove local hooks setup
- Use ctx.capture.createToolHooks() for GeminiAgent hooks
- Use ctx.capture.messageLogger.logUser/logAssistant
- Use ctx.capture.addError/addWarning
- Use ctx.capture.getMessages(), getScreenshotCount(), etc.
3. Build metadata using capture methods
4. Remove TrajectorySaver init (done in CaptureContext)
5. Keep the core agent execution logic (GeminiAgent.create, agent.execute)
Reference:
- Current implementation: /apps/eval/src/agents/single-agent.ts
- Target implementation: DESIGN_DOC.md section "5. Single Agent Evaluator"
```
### Subagent 4B: Orchestrator-Executor Refactor
```
Refactor /apps/eval/src/agents/orchestrator-executor/index.ts
Changes:
1. Change OrchestratorExecutorEvaluator constructor to accept AgentContext:
constructor(private ctx: AgentContext) {}
2. Initialize capture from context (already done in runner)
3. Add hook integration:
- Create executor hooks that use ctx.capture.createToolHooks()
- Wire hooks through Orchestrator → ExecutorStore → Executor
- Call ctx.capture.logDelegation() when orchestrator delegates
- Call ctx.capture.logDelegationResult() when executor returns
4. Update return to include messages:
return {
  metadata,
  messages: ctx.capture.getMessages(), // Now populated!
  finalAnswer,
}
Also update supporting files if needed:
- orchestrator.ts - add setExecutorHooks() method
- executor.ts - accept external hooks via setObservationHooks()
- executor-store.ts - pass hooks to new executors
Reference:
- Current: /apps/eval/src/agents/orchestrator-executor/index.ts
- Target: DESIGN_DOC.md and previous IMPLEMENTATION_PLAN.md
```
---
## Phase 5: Runner Update (1 Subagent)
**Depends on:** Phase 4
### Subagent 5A: Task Executor Update
````
Update /apps/eval/src/runner/task-executor.ts
Changes:
1. Import createAgent from ../agents instead of individual evaluators
2. Import CaptureContext from ../capture
3. In execute() method:
- Create CaptureContext and call init()
- Build AgentContext with all required fields
- Use createAgent(context) instead of if-else switch
- Remove the if (config.agent.type === 'single') / else if blocks
4. Remove direct imports of SingleAgentEvaluator, OrchestratorExecutorEvaluator
Before:
```typescript
if (this.config.agent.type === 'single') {
  const evaluator = new SingleAgentEvaluator(this.config, task, window.windowId, ...)
} else if (this.config.agent.type === 'orchestrator-executor') {
  const evaluator = new OrchestratorExecutorEvaluator(this.config, task, ...)
}
```
After:
```typescript
const capture = new CaptureContext({ serverUrl, outputDir, taskId, tabId, windowId })
const taskOutputDir = await capture.init()
const context: AgentContext = {
  config: this.config,
  task,
  windowId: window.windowId,
  tabId: window.tabId,
  outputDir: this.outputDir,
  taskOutputDir,
  capture,
}
const agent = createAgent(context)
const agentResult = await agent.execute()
```
Reference:
- Current: /apps/eval/src/runner/task-executor.ts (lines 143-186)
- Target: DESIGN_DOC.md section "6. Task Executor"
````
---
## Phase 6: Cleanup & Test (1 Subagent)
**Depends on:** Phase 5
### Subagent 6A: Cleanup Old Files + Verify
```
Tasks:
1. Delete old /apps/eval/src/types.ts (replaced by types/ folder)
2. Update all imports across the codebase:
- Imports of the form: import { EvalConfig, Task, Message } from '../types'
- remain unchanged (types/index.ts re-exports everything)
3. Update /apps/eval/src/utils/config-validator.ts:
- Import schemas from ../types/config instead of defining locally
- Remove duplicate schema definitions
4. Verify no TypeScript errors:
- Run: cd apps/eval && bun run typecheck
5. Test single-agent eval:
- Run: cd apps/eval && bun run eval -c configs/webvoyager-test.json
- Verify screenshots captured
- Verify messages.jsonl populated
6. Test orchestrator-executor eval:
- Run: cd apps/eval && bun run eval -c configs/orchestrator-executor-test.json
- Verify screenshots captured
- Verify messages.jsonl has delegation messages
- Verify graders pass (no "no_screenshots" error)
Report any issues found.
```
---
## Execution Summary
| Phase | Subagents | Can Parallelize? | Dependencies |
|-------|-----------|------------------|--------------|
| 1 | 4 (1A, 1B, 1C, 1D) | Yes - all parallel | None |
| 2 | 2 (2A, 2B) | Yes - both parallel | Phase 1 |
| 3 | 1 (3A) | Yes - parallel with Phase 2 | Phase 1 |
| 4 | 2 (4A, 4B) | Yes - both parallel | Phase 2 + 3 |
| 5 | 1 (5A) | No | Phase 4 |
| 6 | 1 (6A) | No | Phase 5 |
**Total: 11 subagent tasks**
**Parallel execution timeline:**
```
Time →
─────────────────────────────────────────────────────────────────
Phase 1: [1A] [1B] [1C] [1D]      (4 parallel)
Phase 2:      [2A] [2B]           (2 parallel)
Phase 3:      [3A]                (parallel with Phase 2)
Phase 4:           [4A] [4B]      (2 parallel)
Phase 5:                [5A]
Phase 6:                     [6A]
```
**Maximum parallelism: 4 subagents** (Phase 1)

View File

@@ -1,888 +0,0 @@
# Eval System - Production Grade Implementation Plan
## Overview
This plan outlines the changes needed to make the eval system production-grade with uniform agent observation across all agent patterns (single-agent, orchestrator-executor, future patterns).
**Goal:** All agent evaluators produce consistent `AgentResult` with screenshots, message traces, and verifiable action sequences.
---
## Phase 1: Type System Extensions
### 1.1 Add New Message Types
**File:** `src/types.ts`
Add delegation-specific message types for orchestrator pattern:
```typescript
// After ErrorMessage definition (~line 99)
export interface DelegationMessage extends BaseMessage {
  type: 'delegation'
  instruction: string
  executorId: string
  maxSteps?: number
}

export interface DelegationResultMessage extends BaseMessage {
  type: 'delegation_result'
  executorId: string
  summary: string
  status: 'done' | 'blocked' | 'max_steps'
  stepsUsed: number
  currentUrl?: string
}

// Update Message union (~line 101)
export type Message =
  | UserMessage
  | AssistantMessage
  | ToolCallMessage
  | ToolResultMessage
  | ErrorMessage
  | DelegationMessage // NEW
  | DelegationResultMessage // NEW

// Add type guards
export function isDelegationMessage(msg: Message): msg is DelegationMessage {
  return msg.type === 'delegation'
}
export function isDelegationResultMessage(msg: Message): msg is DelegationResultMessage {
  return msg.type === 'delegation_result'
}
```
### 1.2 Add Orchestrator Hook Types
**File:** `src/agents/orchestrator-executor/types.ts`
```typescript
// Add after existing types
export interface OrchestratorHooks {
  onDelegation?: (instruction: string, executorId: string, maxSteps?: number) => Promise<void>
  onDelegationResult?: (result: ExecutorResult) => Promise<void>
  onTurnStart?: (turn: number) => Promise<void>
  onTurnComplete?: (turn: number) => Promise<void>
  onComplete?: (answer: string) => Promise<void>
  onFailed?: (reason: string) => Promise<void>
}

export interface ExecutorObservationHooks {
  onBeforeToolCall?: (toolName: string, args: unknown) => Promise<string> // returns toolCallId
  onAfterToolCall?: (toolName: string, toolCallId: string, result: unknown, isError: boolean) => Promise<void>
}
```
---
## Phase 2: Unified Capture Infrastructure
### 2.1 Create EvalCapture Class
**File:** `src/capture/eval-capture.ts` (NEW)
```typescript
/**
 * EvalCapture - Unified capture infrastructure for all agent evaluators
 *
 * Combines screenshot capture, message logging, and provides hooks for
 * both single-agent and orchestrator-executor patterns.
 */
import { randomUUID } from 'node:crypto'
import type {
  AssistantMessage,
  DelegationMessage,
  DelegationResultMessage,
  ErrorMessage,
  Message,
  ToolCallMessage,
  ToolResultMessage,
  UserMessage,
} from '../types'
import { MessageLogger } from './message-logger'
import { ScreenshotCapture } from './screenshot'

export interface EvalCaptureConfig {
  serverUrl: string
  outputDir: string
  tabId: number
  windowId: number
}

export class EvalCapture {
  private screenshotCapture: ScreenshotCapture
  private messageLogger: MessageLogger
  private tabId: number
  private windowId: number
  private currentToolCallId: string | null = null

  constructor(config: EvalCaptureConfig) {
    this.screenshotCapture = new ScreenshotCapture(config.serverUrl, config.outputDir)
    this.messageLogger = new MessageLogger(config.outputDir)
    this.tabId = config.tabId
    this.windowId = config.windowId
  }

  async init(): Promise<void> {
    await this.screenshotCapture.init()
  }

  // ============================================================================
  // Screenshot Capture
  // ============================================================================
  async captureScreenshot(): Promise<number> {
    return this.screenshotCapture.capture(this.tabId, this.windowId)
  }
  getScreenshotCount(): number {
    return this.screenshotCapture.getCount()
  }

  // ============================================================================
  // Message Logging - Basic Types
  // ============================================================================
  async logUser(content: string): Promise<void> {
    await this.messageLogger.logUser(content)
  }
  async logAssistant(content: string): Promise<void> {
    await this.messageLogger.logAssistant(content)
  }
  async logError(content: string, errorCode?: string): Promise<void> {
    await this.messageLogger.logError(content, errorCode)
  }

  // ============================================================================
  // Tool Call Logging (for single-agent and executor)
  // ============================================================================
  async logToolCall(tool: string, params: Record<string, unknown>): Promise<string> {
    const toolCallId = randomUUID()
    this.currentToolCallId = toolCallId
    await this.messageLogger.logToolCall(tool, toolCallId, params)
    return toolCallId
  }
  async logToolResult(
    toolCallId: string,
    result: unknown,
    isError: boolean,
    screenshot?: number,
  ): Promise<void> {
    await this.messageLogger.logToolResult(toolCallId, result, isError, screenshot)
    this.currentToolCallId = null
  }
  getCurrentToolCallId(): string | null {
    return this.currentToolCallId
  }

  // ============================================================================
  // Delegation Logging (for orchestrator-executor)
  // ============================================================================
  async logDelegation(
    instruction: string,
    executorId: string,
    maxSteps?: number,
  ): Promise<void> {
    const message: DelegationMessage = {
      type: 'delegation',
      timestamp: new Date().toISOString(),
      instruction,
      executorId,
      ...(maxSteps !== undefined && { maxSteps }),
    }
    // Extend MessageLogger to handle this, or append directly
    await this.appendMessage(message)
  }
  async logDelegationResult(
    executorId: string,
    summary: string,
    status: 'done' | 'blocked' | 'max_steps',
    stepsUsed: number,
    currentUrl?: string,
  ): Promise<void> {
    const message: DelegationResultMessage = {
      type: 'delegation_result',
      timestamp: new Date().toISOString(),
      executorId,
      summary,
      status,
      stepsUsed,
      ...(currentUrl && { currentUrl }),
    }
    await this.appendMessage(message)
  }

  // ============================================================================
  // Helpers
  // ============================================================================
  private async appendMessage(message: Message): Promise<void> {
    // Access internal messages array and file
    // This requires either extending MessageLogger or using a shared approach
    const messages = this.messageLogger.getMessages()
    messages.push(message)
    // Write to file - MessageLogger needs extension for this
  }
  getMessages(): Message[] {
    return this.messageLogger.getMessages()
  }
  getLastAssistantMessage(): string | null {
    return this.messageLogger.getLastAssistantMessage()
  }
}
```
### 2.2 Extend MessageLogger for New Types
**File:** `src/capture/message-logger.ts`
Add methods for delegation messages:
```typescript
// Add after logError method
async logDelegation(
  instruction: string,
  executorId: string,
  maxSteps?: number,
): Promise<void> {
  const message: DelegationMessage = {
    type: 'delegation',
    timestamp: new Date().toISOString(),
    instruction,
    executorId,
    ...(maxSteps !== undefined && { maxSteps }),
  }
  await this.append(message)
}

async logDelegationResult(
  executorId: string,
  summary: string,
  status: 'done' | 'blocked' | 'max_steps',
  stepsUsed: number,
  currentUrl?: string,
): Promise<void> {
  const message: DelegationResultMessage = {
    type: 'delegation_result',
    timestamp: new Date().toISOString(),
    executorId,
    summary,
    status,
    stepsUsed,
    ...(currentUrl && { currentUrl }),
  }
  await this.append(message)
}
```
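Both methods delegate to a private `append` helper that this plan never shows. A minimal sketch of what it would look like, assuming MessageLogger keeps an in-memory `messages` array and writes a `messages.jsonl` file in its output directory (the field names and file name are assumptions, though `messages.jsonl` matches the output verified in the phase plan):

```typescript
import { appendFile } from 'node:fs/promises'
import { join } from 'node:path'
import type { Message } from '../types'

// Sketch only: the `messages` array and `outputDir` field are assumed,
// not taken from the actual MessageLogger implementation.
export class MessageLogger {
  private messages: Message[] = []
  constructor(private outputDir: string) {}

  // Keeps the in-memory array in sync and appends one JSON line per message.
  private async append(message: Message): Promise<void> {
    this.messages.push(message)
    const line = `${JSON.stringify(message)}\n`
    await appendFile(join(this.outputDir, 'messages.jsonl'), line, 'utf8')
  }
}
```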
---
## Phase 3: Executor Hook Integration
### 3.1 Modify Executor to Accept External Hooks
**File:** `src/agents/orchestrator-executor/executor.ts`
```typescript
// Add import
import type { ExecutorObservationHooks } from './types'
export class Executor {
private agent: GeminiAgent | null = null
private stepsUsed = 0
private currentUrl = ''
private config: ExecutorConfig
private serverUrl: string
private windowId: number
private tabId: number
private observationHooks?: ExecutorObservationHooks // NEW
// ... existing constructor ...
/**
* Set external observation hooks for capture integration
*/
setObservationHooks(hooks: ExecutorObservationHooks): void {
this.observationHooks = hooks
}
async execute(
instruction: string,
maxSteps?: number,
signal?: AbortSignal,
): Promise<Omit<ExecutorResult, 'executorId'>> {
// ... existing setup ...
// Track steps via hooks - MODIFIED to include external observation
let stepsThisRun = 0
const hooks: ToolExecutionHooks = {
onBeforeToolCall: async (toolName: string, args: unknown) => {
// Call external hook if set (for logging)
if (this.observationHooks?.onBeforeToolCall) {
await this.observationHooks.onBeforeToolCall(toolName, args)
}
},
onAfterToolCall: async (toolName: string, result: ToolExecutionResult) => {
stepsThisRun++
this.stepsUsed++
// Call external hook if set (for screenshot capture and logging)
if (this.observationHooks?.onAfterToolCall) {
const toolCallId = 'current' // Will be tracked by EvalCapture
await this.observationHooks.onAfterToolCall(
toolName,
toolCallId,
result.parts,
result.isError,
)
}
},
}
this.agent.setToolHooks(hooks)
// ... rest of execute method ...
}
}
```
### 3.2 Pass Hooks Through ExecutorStore
**File:** `src/agents/orchestrator-executor/executor-store.ts`
```typescript
import type { ExecutorObservationHooks } from './types'
export class ExecutorStore {
  private executors = new Map<string, Executor>()
  private observationHooks?: ExecutorObservationHooks // NEW

  /**
   * Set observation hooks that will be applied to all executors
   */
  setObservationHooks(hooks: ExecutorObservationHooks): void {
    this.observationHooks = hooks
    // Apply to existing executors
    for (const executor of this.executors.values()) {
      executor.setObservationHooks(hooks)
    }
  }

  getOrCreate(
    id: string,
    config: ExecutorConfig,
    serverUrl: string,
    windowId: number,
    tabId: number,
  ): Executor {
    if (!this.executors.has(id)) {
      const executor = new Executor(config, serverUrl, windowId, tabId)
      // Apply observation hooks to new executor
      if (this.observationHooks) {
        executor.setObservationHooks(this.observationHooks)
      }
      this.executors.set(id, executor)
    }
    return this.executors.get(id)!
  }

  // ... rest unchanged ...
}
```
---
## Phase 4: Orchestrator Hook Integration
### 4.1 Add Hooks to OrchestratorAgent
**File:** `src/agents/orchestrator-executor/orchestrator-agent.ts`
```typescript
import type { ExecutorObservationHooks, OrchestratorHooks } from './types'
export class OrchestratorAgent {
  private orchestratorHooks?: OrchestratorHooks // NEW

  private constructor(
    private client: GeminiClient,
    private geminiConfig: GeminiConfig,
    private state: OrchestratorState,
    private executorStore: ExecutorStore,
    private maxTurns: number,
  ) {}

  /**
   * Set orchestrator-level hooks for delegation tracking
   */
  setHooks(hooks: OrchestratorHooks): void {
    this.orchestratorHooks = hooks
  }

  /**
   * Set executor observation hooks (passed through to ExecutorStore)
   */
  setExecutorObservationHooks(hooks: ExecutorObservationHooks): void {
    this.executorStore.setObservationHooks(hooks)
  }

  /**
   * Get hooks for tool context (used by orchestrator-tools.ts)
   */
  getOrchestratorHooks(): OrchestratorHooks | undefined {
    return this.orchestratorHooks
  }

  async run(taskQuery: string): Promise<OrchestratorAgentResult> {
    let currentParts: Part[] = [{ text: taskQuery }]
    let turns = 0
    while (
      !this.state.isComplete &&
      !this.state.isFailed &&
      turns < this.maxTurns
    ) {
      turns++
      // Fire turn start hook
      await this.orchestratorHooks?.onTurnStart?.(turns)
      // ... existing turn logic ...
      // Fire turn complete hook
      await this.orchestratorHooks?.onTurnComplete?.(turns)
    }
    // Fire completion hooks
    if (this.state.isComplete && this.state.finalAnswer) {
      await this.orchestratorHooks?.onComplete?.(this.state.finalAnswer)
    } else if (this.state.isFailed && this.state.failureReason) {
      await this.orchestratorHooks?.onFailed?.(this.state.failureReason)
    }
    return {
      success: this.state.isComplete,
      answer: this.state.finalAnswer,
      reason: this.state.failureReason,
      delegationCount: this.state.delegationCount,
      totalExecutorSteps: this.state.totalExecutorSteps,
      turns,
    }
  }

  // ... rest unchanged ...
}
```
### 4.2 Fire Hooks in Orchestrator Tools
**File:** `src/agents/orchestrator-executor/orchestrator-tools.ts`
Modify the delegate tool handler to fire hooks:
```typescript
// In createOrchestratorTools function, modify the delegate tool handler
// Inside the delegate tool's handler:
handler: async (args) => {
  const { instruction, executorId, maxSteps } = args as DelegateParams
  // Fire delegation hook BEFORE execution
  const hooks = context.getOrchestratorHooks?.()
  const actualExecutorId = executorId ?? randomUUID()
  await hooks?.onDelegation?.(instruction, actualExecutorId, maxSteps)
  // Get or create executor
  const executor = context.executorStore.getOrCreate(
    actualExecutorId,
    context.executorConfig,
    context.serverUrl,
    context.windowId,
    context.tabId,
  )
  // Execute
  const result = await executor.execute(instruction, maxSteps)
  // Update state
  context.state.delegationCount++
  context.state.totalExecutorSteps += result.stepsUsed
  // Fire delegation result hook AFTER execution
  await hooks?.onDelegationResult?.({
    ...result,
    executorId: actualExecutorId,
  })
  // Return result to orchestrator
  return {
    executorId: actualExecutorId,
    ...result,
  }
}
```
---
## Phase 5: Update OrchestratorExecutorEvaluator
### 5.1 Full Integration
**File:** `src/agents/orchestrator-executor/index.ts`
```typescript
import { randomUUID } from 'node:crypto'
import { ScreenshotCapture } from '../../capture/screenshot'
import { MessageLogger } from '../../capture/message-logger'
import { TrajectorySaver } from '../../capture/trajectory-saver'
import type { ExecutorObservationHooks, OrchestratorHooks } from './types'

export class OrchestratorExecutorEvaluator implements AgentEvaluator {
  constructor(
    private config: EvalConfig,
    private task: Task,
    private windowId: number,
    private tabId: number,
    private outputDir: string,
  ) {}

  async execute(): Promise<AgentResult> {
    const startTime = Date.now()
    const timeoutMs = this.config.timeout_ms ?? DEFAULT_TIMEOUT_MS
    const errors: TaskError[] = []
    const warnings: EvalWarning[] = []
    const addError = (source: TaskError['source'], message: string, details?: Record<string, unknown>) => {
      errors.push({ source, message, timestamp: new Date().toISOString(), details })
    }
    const addWarning = (source: EvalWarning['source'], message: string) => {
      warnings.push({ source, message, timestamp: new Date().toISOString() })
      console.warn(`[${source}] ${message}`)
    }
    // Initialize trajectory saver
    const saver = new TrajectorySaver(this.outputDir, this.task.query_id)
    const taskOutputDir = await saver.init()
    // NEW: Initialize capture infrastructure (same as single-agent)
    const screenshotCapture = new ScreenshotCapture(
      this.config.browseros.server_url,
      taskOutputDir,
    )
    await screenshotCapture.init()
    const messageLogger = new MessageLogger(taskOutputDir)
    // Log initial user message
    await messageLogger.logUser(this.task.query)
    // Validate config type
    if (this.config.agent.type !== 'orchestrator-executor') {
      throw new Error('OrchestratorExecutorEvaluator requires orchestrator-executor config')
    }
    const agentConfig = this.config.agent as OrchestratorExecutorConfig
    const { orchestrator: orchestratorConfig, executor: executorConfig } =
      resolveAgentConfig(agentConfig)
    // Create orchestrator
    const orchestrator = new Orchestrator(
      orchestratorConfig,
      executorConfig,
      this.config.browseros.server_url,
      this.windowId,
      this.tabId,
    )
    // NEW: Set up executor observation hooks (for tool call/result capture)
    let currentToolCallId: string | null = null
    const executorHooks: ExecutorObservationHooks = {
      onBeforeToolCall: async (toolName: string, args: unknown) => {
        // Assign before the try block so a string id is always returned
        currentToolCallId = randomUUID()
        try {
          await messageLogger.logToolCall(toolName, currentToolCallId, args as Record<string, unknown>)
        } catch (err) {
          addWarning('message_logging', `Failed to log tool call ${toolName}: ${err instanceof Error ? err.message : String(err)}`)
        }
        return currentToolCallId
      },
      onAfterToolCall: async (toolName: string, _toolCallId: string, result: unknown, isError: boolean) => {
        let screenshotNum = 0
        // Capture screenshot after tool execution
        try {
          screenshotNum = await screenshotCapture.capture(this.tabId, this.windowId)
        } catch (err) {
          addWarning('screenshot', `Screenshot after ${toolName} failed: ${err instanceof Error ? err.message : String(err)}`)
          screenshotNum = screenshotCapture.getCount()
        }
        // Log tool errors
        if (isError) {
          addWarning('mcp_tool', `Tool ${toolName} returned error`)
        }
        if (!currentToolCallId) {
          addWarning('message_logging', 'Tool result without matching tool call')
          return
        }
        try {
          await messageLogger.logToolResult(currentToolCallId, result, isError, screenshotNum)
        } catch (err) {
          addWarning('message_logging', `Failed to log tool result: ${err instanceof Error ? err.message : String(err)}`)
        }
        currentToolCallId = null
      },
    }
    // NEW: Set up orchestrator hooks (for delegation tracking)
    const orchestratorHooks: OrchestratorHooks = {
      onDelegation: async (instruction: string, executorId: string, maxSteps?: number) => {
        try {
          await messageLogger.logDelegation(instruction, executorId, maxSteps)
        } catch (err) {
          addWarning('message_logging', `Failed to log delegation: ${err instanceof Error ? err.message : String(err)}`)
        }
      },
      onDelegationResult: async (result) => {
        try {
          await messageLogger.logDelegationResult(
            result.executorId,
            result.summary,
            result.status,
            result.stepsUsed,
            result.currentUrl,
          )
        } catch (err) {
          addWarning('message_logging', `Failed to log delegation result: ${err instanceof Error ? err.message : String(err)}`)
        }
      },
    }
    // Apply hooks to orchestrator
    orchestrator.setHooks(orchestratorHooks)
    orchestrator.setExecutorObservationHooks(executorHooks)
    // Set up timeout
    const abortController = new AbortController()
    const timeoutHandle = setTimeout(() => {
      abortController.abort()
    }, timeoutMs)
    let terminationReason: 'completed' | 'max_steps' | 'error' | 'timeout' = 'completed'
    let finalAnswer: string | null = null
    let orchestratorResult: Awaited<ReturnType<typeof orchestrator.run>> | null = null
    try {
      const runPromise = orchestrator.run(this.task.query)
      orchestratorResult = await Promise.race([
        runPromise,
        new Promise<never>((_, reject) => {
          abortController.signal.addEventListener('abort', () => {
            reject(new Error('Timeout'))
          })
        }),
      ])
      if (orchestratorResult.success) {
        finalAnswer = orchestratorResult.answer
        terminationReason = 'completed'
        // Log final assistant message
        if (finalAnswer) {
          await messageLogger.logAssistant(finalAnswer)
        }
      } else {
        terminationReason = 'error'
        addError('agent_execution', orchestratorResult.reason ?? 'Unknown failure')
        await messageLogger.logError(orchestratorResult.reason ?? 'Unknown failure')
      }
    } catch (err) {
      const error = err instanceof Error ? err : new Error(String(err))
      if (error.message === 'Timeout' || abortController.signal.aborted) {
        terminationReason = 'timeout'
        addError('agent_execution', `Task timed out after ${timeoutMs / 1000}s`)
      } else {
        terminationReason = 'error'
        addError('agent_execution', error.message, { stack: error.stack })
      }
      await messageLogger.logError(error.message)
    } finally {
      clearTimeout(timeoutHandle)
      orchestrator.getExecutorStore().clear()
    }
    const endTime = Date.now()
    // Create metadata
    const metadata: TaskMetadata = {
      query_id: this.task.query_id,
      dataset: this.task.dataset,
      query: this.task.query,
      started_at: new Date(startTime).toISOString(),
      completed_at: new Date(endTime).toISOString(),
      total_duration_ms: endTime - startTime,
      total_steps: screenshotCapture.getCount(), // Now accurate
      termination_reason: terminationReason,
      final_answer: finalAnswer,
      errors,
      warnings,
      agent_config: {
        type: 'orchestrator-executor',
        model: `${orchestratorConfig.model} / ${executorConfig.model}`,
      },
      grader_results: {},
    }
    await saver.saveMetadata(metadata)
    return {
      metadata,
      messages: messageLogger.getMessages(), // NOW POPULATED
      finalAnswer,
    }
  }
}
```
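For reference, the two hook interfaces consumed above plausibly look like the following. The authoritative definitions live in Phase 1's `types.ts` (not shown here); the shapes below are inferred from the call sites, so treat the exact names and optionality as assumptions:

```typescript
// Sketch inferred from the evaluator's call sites, not the canonical types.ts.
export interface ExecutorObservationHooks {
  // Returns a correlation id that the after-hook's toolCallId refers back to
  onBeforeToolCall: (toolName: string, args: unknown) => Promise<string | null>
  onAfterToolCall: (
    toolName: string,
    toolCallId: string,
    result: unknown,
    isError: boolean,
  ) => Promise<void>
}

export interface OrchestratorHooks {
  onDelegation: (
    instruction: string,
    executorId: string,
    maxSteps?: number,
  ) => Promise<void>
  onDelegationResult: (result: {
    executorId: string
    summary: string
    status: string // e.g. 'completed' or 'max_steps'; the exact union is a guess
    stepsUsed: number
    currentUrl?: string
  }) => Promise<void>
}
```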
---
## Phase 6: Orchestrator Class Updates
### 6.1 Add Hook Passthrough Methods
**File:** `src/agents/orchestrator-executor/orchestrator.ts`
```typescript
import type { ExecutorObservationHooks, OrchestratorHooks } from './types'
export class Orchestrator {
private agent: OrchestratorAgent | null = null
private executorStore: ExecutorStore
private pendingOrchestratorHooks?: OrchestratorHooks
private pendingExecutorHooks?: ExecutorObservationHooks
constructor(
private orchestratorConfig: OrchestratorConfig,
private executorConfig: ExecutorConfig,
private serverUrl: string,
private windowId: number,
private tabId: number,
) {
this.executorStore = new ExecutorStore()
}
/**
* Set orchestrator-level hooks (must be called before run())
*/
setHooks(hooks: OrchestratorHooks): void {
this.pendingOrchestratorHooks = hooks
if (this.agent) {
this.agent.setHooks(hooks)
}
}
/**
* Set executor observation hooks (must be called before run())
*/
setExecutorObservationHooks(hooks: ExecutorObservationHooks): void {
this.pendingExecutorHooks = hooks
this.executorStore.setObservationHooks(hooks)
if (this.agent) {
this.agent.setExecutorObservationHooks(hooks)
}
}
async run(taskQuery: string): Promise<OrchestratorAgentResult> {
this.agent = await OrchestratorAgent.create(
this.orchestratorConfig,
this.executorConfig,
this.serverUrl,
this.windowId,
this.tabId,
)
// Apply pending hooks
if (this.pendingOrchestratorHooks) {
this.agent.setHooks(this.pendingOrchestratorHooks)
}
if (this.pendingExecutorHooks) {
this.agent.setExecutorObservationHooks(this.pendingExecutorHooks)
}
const result = await this.agent.run(taskQuery)
this.executorStore = this.agent.getExecutorStore()
return result
}
getExecutorStore(): ExecutorStore {
return this.agent?.getExecutorStore() ?? this.executorStore
}
}
```
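The pending-hook fields are the key design choice here: the underlying `OrchestratorAgent` is only created inside `run()`, so hooks set beforehand are stashed and re-applied after `create()`. A minimal usage sketch, reusing the names from Phase 5 (`serverUrl` and friends stand in for the evaluator's fields):

```typescript
// Hooks must be attached before run(); the pending fields carry them
// into the OrchestratorAgent that run() creates.
const orchestrator = new Orchestrator(
  orchestratorConfig,
  executorConfig,
  serverUrl,
  windowId,
  tabId,
)
orchestrator.setHooks(orchestratorHooks)
orchestrator.setExecutorObservationHooks(executorHooks)
const result = await orchestrator.run(task.query)
```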
---
## Implementation Order
1. **Phase 1** - Type extensions (types.ts) - 30 min
2. **Phase 2** - MessageLogger extensions - 30 min
3. **Phase 3** - Executor hook integration - 1 hour
4. **Phase 4** - OrchestratorAgent hooks - 1 hour
5. **Phase 5** - OrchestratorExecutorEvaluator update - 1.5 hours
6. **Phase 6** - Orchestrator passthrough - 30 min
7. **Testing** - End-to-end verification - 1 hour
**Total estimated time:** ~6 hours
---
## Testing Checklist
- [ ] Single-agent eval still works (regression test)
- [ ] Orchestrator-executor produces screenshots in output folder
- [ ] Orchestrator-executor produces messages.jsonl with (see the example lines after this checklist):
- [ ] user message
- [ ] delegation messages
- [ ] tool_call messages (from executor)
- [ ] tool_result messages with screenshot numbers
- [ ] delegation_result messages
- [ ] assistant message (final answer)
- [ ] Graders pass with orchestrator-executor (no "no_screenshots" error)
- [ ] metadata.json has accurate `total_steps` count
- [ ] Error/warning capture works for both patterns
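For orientation, a passing orchestrator-executor run should yield messages.jsonl lines shaped roughly like this (the type names follow the checklist above; the exact field spellings are illustrative assumptions, not the logger's pinned wire format):

```jsonl
{"type": "user", "content": "Search an Xbox Wireless controller with green color..."}
{"type": "delegation", "executorId": "executor-1", "instruction": "Open amazon.com and search for the controller", "maxSteps": 15}
{"type": "tool_call", "toolCallId": "3f2a-...", "toolName": "click", "args": {"x": 125, "y": 42}}
{"type": "tool_result", "toolCallId": "3f2a-...", "isError": false, "screenshotNum": 3}
{"type": "delegation_result", "executorId": "executor-1", "status": "completed", "stepsUsed": 9, "summary": "Search results filtered to green controllers"}
{"type": "assistant", "content": "Found the green Xbox Wireless controller rated above 4 stars."}
```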
---
## Future Considerations
1. **New Agent Patterns:** Any new agent type just needs to:
   - Accept hooks in the constructor or via a setter
   - Fire hooks at the appropriate points
   - Use the shared capture infrastructure (see the sketch after this list)
2. **Grader Updates:** Graders may need updating to understand delegation messages
3. **Parallel Executors:** If the orchestrator delegates to multiple executors in parallel, concurrent screenshot capture needs explicit handling
4. **Memory/Performance:** Screenshot capture opens a new MCP connection per capture; consider connection pooling for high-volume evals
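A minimal sketch of point 1, assuming the hook shapes sketched earlier (the class and its methods are hypothetical, not an existing agent):

```typescript
// Hypothetical agent illustrating the three-point contract above.
class MyVisualAgentEvaluator {
  constructor(private hooks: ExecutorObservationHooks) {}

  async step(toolName: string, args: unknown): Promise<unknown> {
    // Fire the before-hook and keep the returned id for result correlation
    const toolCallId = await this.hooks.onBeforeToolCall(toolName, args)
    const { result, isError } = await this.dispatch(toolName, args)
    // Fire the after-hook so the shared capture infrastructure
    // (screenshots + message logging) runs without this class knowing about it
    await this.hooks.onAfterToolCall(toolName, toolCallId ?? '', result, isError)
    return result
  }

  private async dispatch(_toolName: string, _args: unknown) {
    // Real tool execution elided; the point is that capture concerns
    // never leak into the agent itself.
    return { result: null as unknown, isError: false }
  }
}
```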

View File

@@ -2,81 +2,67 @@
[![License: AGPL v3](https://img.shields.io/badge/License-AGPL%20v3-blue.svg)](../../../../LICENSE)
Evaluation framework for benchmarking BrowserOS browser automation agents. Runs tasks from standard datasets ([WebVoyager](https://arxiv.org/abs/2401.13919), [Mind2Web](https://arxiv.org/abs/2306.06070)), captures trajectories with screenshots, and grades results automatically.
Evaluation framework for BrowserOS browser automation agents. Runs tasks from standard datasets ([WebVoyager](https://arxiv.org/abs/2401.13919), [Mind2Web](https://arxiv.org/abs/2306.06070), AGI SDK / REAL Bench, WebArena-Infinity, WebBench), captures trajectories with screenshots, and grades results automatically.
## Prerequisites
- **BrowserOS binary** installed at `/Applications/BrowserOS.app` (macOS)
- **BrowserOS binary** at `/Applications/BrowserOS.app` (macOS) or `BROWSEROS_BINARY` pointing at it
- **Bun** runtime
- **API keys** for your chosen LLM provider and grader model
- **API keys** for your LLM provider (and `CLAUDE_CODE_OAUTH_TOKEN` if you use `performance_grader`)
## Quick Start
### 1. Set up environment
```bash
cd apps/eval
```
Edit `.env.development` and add your API keys:
```bash
# Pick ONE provider for the orchestrator (whichever you have access to)
OPENAI_API_KEY=sk-xxxxx
ANTHROPIC_API_KEY=sk-ant-xxxxx
FIREWORKS_API_KEY=fw_xxxxx
GOOGLE_API_KEY=AIza-xxxxx
# For grading results (OpenRouter recommended — gives access to many models)
OPENROUTER_API_KEY=sk-or-v1-xxxxx
```
### 2. Launch the dashboard
```bash
# Edit .env.development with your keys, then:
bun run eval
```
Opens the **Eval Dashboard** at `http://localhost:9900` in config mode.
Opens the eval dashboard at `http://localhost:9900` in config mode. From there: load a preset, edit settings, click **Run**.
### 3. Configure and run
From the dashboard:
1. **Load a preset** — select from the dropdown or click **Load File** to import a config JSON
2. **Edit settings** — change agent type, provider, model, API keys, dataset, workers, timeouts
3. **Save Config** — export your configuration for reuse
4. **Click Run** — starts the evaluation with live progress
### Alternative: Run from CLI
### CLI mode
```bash
bun run eval -c configs/orchestrator-executor-clado-test.json
bun run eval -c configs/browseros-agent-weekly.json
```
Runs immediately. Dashboard still available at `http://localhost:9900` for live progress.
## Agent Types
## Agent types
### Orchestrator-Executor with Clado
| Type | Description |
|------|-------------|
| `single` | Single LLM agent driven by the BrowserOS tool loop (CDP) |
| `orchestrator-executor` | High-level orchestrator + per-step executor (LLM or Clado visual model) |
The recommended architecture for visual model evals. Two tiers:
### Single agent
- **Orchestrator** — An LLM that plans and issues high-level instructions
- **Executor** — The **Clado Action** visual model that takes screenshots and predicts click/type/scroll coordinates
```json
{
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "moonshotai/kimi-k2.5",
"apiKey": "OPENROUTER_API_KEY",
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
}
}
```
The orchestrator works with **any LLM provider**. Pick whichever you have access to:
### Orchestrator-Executor
#### OpenAI orchestrator
The orchestrator works with any LLM provider. The executor can be another LLM, or the **Clado action** visual model that takes screenshots and predicts click/type/scroll coordinates.
```json
{
"agent": {
"type": "orchestrator-executor",
"orchestrator": {
"provider": "openai",
"model": "gpt-4o",
"apiKey": "OPENAI_API_KEY"
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1"
},
"executor": {
"provider": "clado-action",
@@ -84,73 +70,31 @@ The orchestrator works with **any LLM provider**. Pick whichever you have access
"apiKey": "",
"baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run"
}
},
"dataset": "../data/webvoyager_e2e_test.jsonl",
"output_dir": "../results/oe-clado-openai",
"num_workers": 3,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"headless": true
},
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1200000
}
}
```
#### Anthropic orchestrator
## Graders
| Name | Description |
|------|-------------|
| `performance_grader` | Multi-axis grader running on Claude Agent SDK (uses its own credentials via `CLAUDE_CODE_OAUTH_TOKEN`) |
| `agisdk_state_diff` | AGI SDK / REAL Bench environment state-diff grader (deterministic) |
| `infinity_state` | WebArena-Infinity verifier-script grader (deterministic) |
Set `graders` in your config to override the per-task `graders` field from the dataset:
```json
"orchestrator": {
"provider": "anthropic",
"model": "claude-sonnet-4-20250514",
"apiKey": "ANTHROPIC_API_KEY"
}
"graders": ["performance_grader"]
```
#### Google orchestrator
```json
"orchestrator": {
"provider": "google",
"model": "gemini-2.0-flash",
"apiKey": "GOOGLE_API_KEY"
}
```
#### Fireworks orchestrator (OpenAI-compatible)
```json
"orchestrator": {
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1"
}
```
The executor config stays the same across all orchestrator providers — it always uses the Clado action model.
### Other Agent Types
| Type | Description | Example config |
|------|-------------|----------------|
| `single` | Single LLM agent via Gemini CLI + MCP | `webvoyager-test.json` |
| `tool-loop` | AI SDK tool loop, connects via CDP | `tool-loop-test.json` |
| `gemini-computer-use` | Google native computer use API | `gemini-computer-use.json` |
| `yutori-navigator` | Yutori N1 visual model | `yutori-navigator.json` |
## Configuration Reference
## Configuration reference
### API keys
The `apiKey` field supports two formats:
- **Env var name**: `"OPENAI_API_KEY"` — resolved from `.env.development` at runtime
- **Direct value**: `"sk-xxxxx"` — used as-is (not recommended, prefer env vars)
- **Direct value**: `"sk-xxxxx"` — used as-is (not recommended)
### Supported providers
@@ -160,7 +104,7 @@ The `apiKey` field supports two formats:
| Anthropic | `anthropic` | No |
| Google | `google` | No |
| Azure OpenAI | `azure` | Yes |
| AWS Bedrock | `bedrock` | No (uses `region`, `accessKeyId`, `secretAccessKey`) |
| AWS Bedrock | `bedrock` | No |
| OpenRouter | `openrouter` | No |
| Fireworks, Together, etc. | `openai-compatible` | Yes |
| Ollama | `ollama` | No |
@@ -179,34 +123,27 @@ The `apiKey` field supports two formats:
}
```
Each worker gets its own Chrome instance. Worker N uses `base_port + N` for CDP and server ports. `base_extension_port` is still reserved as a legacy BrowserOS launch argument for compatibility with Chromium builds that still pass it.
Each worker gets its own Chrome instance. Worker N uses `base_port + N` for CDP and server ports.
### Execution settings
| Field | Description | Default |
|-------|-------------|---------|
| `num_workers` | Parallel workers (each gets its own Chrome) | `1` |
| `timeout_ms` | Per-task timeout in ms | `900000` (15 min) |
| `timeout_ms` | Per-task timeout in ms | `1800000` (30 min) |
| `restart_server_per_task` | Restart Chrome between tasks (cleaner state, slower) | `false` |
### Grading
Results are auto-graded after each task. The grader uses an LLM judge.
| Field | Description |
|-------|-------------|
| `grader_model` | Model for grading (e.g., `openai/gpt-4.1`) |
| `grader_api_key_env` | Env var name for grader API key |
| `grader_base_url` | API endpoint (e.g., `https://openrouter.ai/api/v1`) |
## Datasets
| File | Tasks | Description |
|------|-------|-------------|
| `webvoyager_e2e_test.jsonl` | 10 | WebVoyager test subset (quick smoke test) |
| `webvoyager.jsonl` | 643 | Full WebVoyager benchmark |
| `mind2web_e2e_test.jsonl` | 10 | Mind2Web test subset |
| `mind2web.jsonl` | 300 | Full Mind2Web benchmark |
| `mind2web.jsonl` | 300 | Online-Mind2Web |
| `webbench-{0,1,2}of4-50.jsonl` | 50 each | WebBench shards (50-task subsets) |
| `agisdk-real.jsonl` | 40 | AGI SDK / REAL Bench (action-only tasks) |
| `webarena-infinity-hard-50.jsonl` | 50 | WebArena-Infinity hard set |
| `browsecomp-medium-hard-50.jsonl` | 50 | BrowseComp medium-hard |
| `browsecomp-very-hard-50.jsonl` | 50 | BrowseComp very-hard |
Task format (JSONL, one per line):
@@ -215,7 +152,7 @@ Task format (JSONL, one per line):
"query_id": "Amazon--0",
"dataset": "webvoyager",
"query": "Search an Xbox Wireless controller with green color and rated above 4 stars.",
"graders": ["webvoyager_grader", "fara_combined"],
"graders": ["performance_grader"],
"start_url": "https://www.amazon.com/",
"metadata": { "original_task_id": "Amazon--0", "website": "Amazon" }
}
@@ -227,24 +164,25 @@ Results are saved to `output_dir`:
```
results/
oe-clado-openai/
Amazon--0/
metadata.json # Task result, timing, grader scores
messages.jsonl # Full message log
screenshots/
001.png # Step-by-step screenshots
002.png
summary.json # Aggregate pass rates
browseros-agent-weekly/
2026-04-29-1430/
Amazon--0/
metadata.json # Task result, timing, grader scores
messages.jsonl # Full message log
screenshots/
001.png # Step-by-step screenshots
002.png
summary.json # Aggregate pass rates
```
## Troubleshooting
**BrowserOS not found**: Expects `/Applications/BrowserOS.app/Contents/MacOS/BrowserOS`. Make sure it's installed.
**BrowserOS not found**: Expects `/Applications/BrowserOS.app/Contents/MacOS/BrowserOS`. Set `BROWSEROS_BINARY` to override.
**Port conflicts**: Each worker uses `base_port + workerIndex`. 3 workers on base 9110 → ports 9110, 9111, 9112. Stop other BrowserOS instances first.
**API key not resolving**: If your config has `"apiKey": "OPENAI_API_KEY"`, ensure the env var is set in `.env.development`.
**Tasks timing out**: Increase `timeout_ms`. Default is 15 minutes; complex tasks may need 20+ minutes.
**Tasks timing out**: Increase `timeout_ms`. Default is 30 minutes.
**Headless vs headed**: Set `"headless": false` to watch Chrome in real-time. Useful for debugging.
**Headless vs headed**: Set `"headless": false` to watch Chrome in real time.

View File

@@ -1,18 +0,0 @@
{
"agent": {
"type": "single",
"provider": "openrouter",
"model": "openai/gpt-4o",
"apiKey": "OPENROUTER_API_KEY"
},
"dataset": "data/webvoyager_e2e_test.jsonl",
"output_dir": "results",
"num_workers": 5,
"browseros": {
"server_url": "http://127.0.0.1:9110"
},
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 300000
}

View File

@@ -22,8 +22,5 @@
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1800000
}

View File

@@ -29,8 +29,5 @@
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1800000
}

View File

@@ -29,8 +29,5 @@
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1800000
}

View File

@@ -20,8 +20,5 @@
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 300000
}

View File

@@ -22,8 +22,5 @@
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1200000
}

View File

@@ -1,30 +0,0 @@
{
"agent": {
"type": "gemini-computer-use",
"apiKey": "GOOGLE_AI_API_KEY",
"screenSize": {
"width": 1440,
"height": 900
},
"turnLimit": 100
},
"dataset": "../data/test-set.jsonl",
"num_workers": 1,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1200000
}

View File

@@ -1,30 +0,0 @@
{
"agent": {
"type": "yutori-navigator",
"apiKey": "YUTORI_API_KEY",
"screenSize": {
"width": 1280,
"height": 800
},
"turnLimit": 100
},
"dataset": "../data/test-set.jsonl",
"num_workers": 1,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1200000
}

View File

@@ -1,5 +0,0 @@
{"query_id": "CoordClick--1", "dataset": "coordinate-click", "query": "Click on circle A located at the top-left corner of the page.", "graders": ["webvoyager_grader"], "start_url": "http://localhost:3100", "metadata": {"original_task_id": "CoordClick--1", "website": "eval-target", "category": "coordinate-prediction", "additional": {"ground_truth": "Circle A is clicked and shows data-clicked=true", "answer_type": "golden"}}}
{"query_id": "CoordClick--2", "dataset": "coordinate-click", "query": "Click on circle B located at the top-right corner of the page.", "graders": ["webvoyager_grader"], "start_url": "http://localhost:3100", "metadata": {"original_task_id": "CoordClick--2", "website": "eval-target", "category": "coordinate-prediction", "additional": {"ground_truth": "Circle B is clicked and shows data-clicked=true", "answer_type": "golden"}}}
{"query_id": "CoordClick--3", "dataset": "coordinate-click", "query": "Click on circle C located at the bottom-left corner of the page.", "graders": ["webvoyager_grader"], "start_url": "http://localhost:3100", "metadata": {"original_task_id": "CoordClick--3", "website": "eval-target", "category": "coordinate-prediction", "additional": {"ground_truth": "Circle C is clicked and shows data-clicked=true", "answer_type": "golden"}}}
{"query_id": "CoordClick--4", "dataset": "coordinate-click", "query": "Click on circle D located at the bottom-right corner of the page.", "graders": ["webvoyager_grader"], "start_url": "http://localhost:3100", "metadata": {"original_task_id": "CoordClick--4", "website": "eval-target", "category": "coordinate-prediction", "additional": {"ground_truth": "Circle D is clicked and shows data-clicked=true", "answer_type": "golden"}}}
{"query_id": "CoordClick--5", "dataset": "coordinate-click", "query": "Click on all four circles A, B, C, and D on the page.", "graders": ["webvoyager_grader"], "start_url": "http://localhost:3100", "metadata": {"original_task_id": "CoordClick--5", "website": "eval-target", "category": "coordinate-prediction", "additional": {"ground_truth": "All four circles are clicked and page shows ALL TARGETS HIT", "answer_type": "golden"}}}

View File

@@ -1,147 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Coordinate Click Test</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
width: 100vw;
height: 100vh;
overflow: hidden;
background: #1a1a2e;
font-family: system-ui, -apple-system, sans-serif;
}
.circle {
position: fixed;
border-radius: 50%;
background: #e94560;
display: flex;
align-items: center;
justify-content: center;
color: #fff;
font-weight: 700;
cursor: pointer;
user-select: none;
transition: background 0.2s, transform 0.15s;
clip-path: circle(50%);
}
.circle:hover { transform: scale(1.08); }
.circle[data-clicked="true"] {
background: #0f3460;
pointer-events: none;
}
/* A — top-left area, large */
.circle-a {
width: 80px;
height: 80px;
font-size: 24px;
top: 15%;
left: 10%;
}
/* B — right side, upper-middle, medium */
.circle-b {
width: 50px;
height: 50px;
font-size: 18px;
top: 30%;
right: 18%;
}
/* C — center-left, lower area, small */
.circle-c {
width: 30px;
height: 30px;
font-size: 13px;
bottom: 25%;
left: 35%;
}
/* D — bottom-right area, very small */
.circle-d {
width: 16px;
height: 16px;
font-size: 9px;
bottom: 12%;
right: 30%;
}
#status {
position: fixed;
top: 50%;
left: 50%;
transform: translate(-50%, -50%);
color: #eee;
font-size: 16px;
text-align: center;
pointer-events: none;
}
#status .count {
font-size: 48px;
font-weight: 700;
color: #0f3460;
}
.success-flash {
animation: flash 0.4s ease-out;
}
@keyframes flash {
0% { background: #16c79a; transform: scale(1.3); }
100% { background: #0f3460; transform: scale(1); }
}
</style>
</head>
<body>
<div id="circle-a" class="circle circle-a" data-target="A" data-clicked="false">A</div>
<div id="circle-b" class="circle circle-b" data-target="B" data-clicked="false">B</div>
<div id="circle-c" class="circle circle-c" data-target="C" data-clicked="false">C</div>
<div id="circle-d" class="circle circle-d" data-target="D" data-clicked="false">D</div>
<div id="status">
<div class="count" id="clicked-count">0</div>
<div>of 4 targets clicked</div>
</div>
<script>
const circles = document.querySelectorAll('.circle')
const countEl = document.getElementById('clicked-count')
let clicked = 0
circles.forEach(circle => {
circle.addEventListener('click', (e) => {
if (circle.dataset.clicked === 'true') return
const rect = circle.getBoundingClientRect()
const centerX = rect.left + rect.width / 2
const centerY = rect.top + rect.height / 2
const radius = rect.width / 2
const dx = e.clientX - centerX
const dy = e.clientY - centerY
if (dx * dx + dy * dy > radius * radius) return
circle.dataset.clicked = 'true'
circle.classList.add('success-flash')
clicked++
countEl.textContent = clicked
if (clicked === 4) {
document.getElementById('status').innerHTML =
'<div class="count" style="color:#16c79a">ALL TARGETS HIT</div>' +
'<div>4 of 4 targets clicked</div>'
document.body.dataset.allClicked = 'true'
}
})
})
</script>
</body>
</html>

View File

@@ -1,16 +0,0 @@
const server = Bun.serve({
port: 3100,
async fetch(req) {
const url = new URL(req.url)
const path = url.pathname === '/' ? '/index.html' : url.pathname
const file = Bun.file(import.meta.dir + path)
if (await file.exists()) {
return new Response(file)
}
return new Response('Not Found', { status: 404 })
},
})
console.log(`Coordinate click test running at http://localhost:${server.port}`)

View File

@@ -1,93 +0,0 @@
"""
Analyze how many WebBench tasks require authentication across ALL buckets.
Usage: python3 apps/eval/scripts/analyze-webbench-auth.py
"""
import json
import re
from collections import defaultdict
# Login/auth indicators in task text
AUTH_KEYWORDS = [
"log in", "login", "sign in", "signin", "sign up", "signup",
"your account", "your profile", "your wishlist", "your order",
"your cart", "your dashboard", "your settings", "your subscription",
"your inbox", "your message", "your review", "your playlist",
"your favorites", "your saved", "your history", "your list",
"your address", "your payment", "your booking", "your reservation",
"my account", "my profile", "my wishlist", "my order", "my cart",
"my dashboard", "my settings", "my subscription", "my inbox",
"my message", "my review", "my playlist", "my favorites",
"my saved", "my history", "my list", "my address", "my payment",
"my booking", "my reservation", "my bag",
"send a message", "post a comment", "write a review", "submit a review",
"leave a review", "publish", "upload a", "create a playlist",
"add to cart", "add to bag", "add to wishlist", "add to favorites",
"save to", "bookmark", "subscribe", "unsubscribe",
"delete your", "remove your", "delete my", "remove my",
"edit your", "edit my", "update your", "update my",
"change your", "change my", "modify your", "modify my",
]
# Categories that almost always need auth
WRITE_CATEGORIES = {"CREATE", "UPDATE", "DELETE"}
def needs_auth(task_text, category):
task_lower = task_text.lower()
# Check keywords
for kw in AUTH_KEYWORDS:
if kw in task_lower:
return True, f"keyword: '{kw}'"
# WRITE tasks that don't match keywords but still likely need auth
# (be conservative — some CREATE tasks like "create a search filter" don't need login)
return False, ""
# Load all datasets
for bucket in [0, 1, 2]:
full_path = f"apps/eval/data/webbench-{bucket}of4.jsonl"
tasks = []
with open(full_path) as f:
for line in f:
tasks.append(json.loads(line))
auth_tasks = []
no_auth_tasks = []
for t in tasks:
needs, reason = needs_auth(t["query"], t["metadata"]["category"])
if needs:
auth_tasks.append((t, reason))
else:
no_auth_tasks.append(t)
print(f"{'=' * 60}")
print(f"BUCKET {bucket}/4: {len(tasks)} total")
print(f" Needs auth: {len(auth_tasks)} ({len(auth_tasks)/len(tasks)*100:.0f}%)")
print(f" No auth: {len(no_auth_tasks)} ({len(no_auth_tasks)/len(tasks)*100:.0f}%)")
# Breakdown of no-auth tasks
cats = defaultdict(int)
diffs = defaultdict(int)
domains = set()
for t in no_auth_tasks:
cats[t["metadata"]["category"]] += 1
diffs[t["metadata"]["additional"]["difficulty"]] += 1
domains.add(t["metadata"]["website"])
cat_str = ", ".join(f"{c}({n})" for c, n in sorted(cats.items(), key=lambda x: -x[1]))
diff_str = ", ".join(f"{d}({n})" for d, n in sorted(diffs.items(), key=lambda x: -x[1]))
print(f" No-auth breakdown:")
print(f" categories: {cat_str}")
print(f" difficulty: {diff_str}")
print(f" websites: {len(domains)}")
# Sample no-auth tasks
print(f"\n Sample no-auth tasks:")
for t in no_auth_tasks[:8]:
print(f" [{t['metadata']['additional']['webbench_id']}] [{t['metadata']['category']}] {t['metadata']['website']}")
print(f" {t['query'][:150]}")
# Sample auth tasks (to verify detection)
print(f"\n Sample auth tasks (verify detection):")
for t, reason in auth_tasks[:5]:
print(f" [{t['metadata']['additional']['webbench_id']}] [{t['metadata']['category']}] {t['metadata']['website']} ({reason})")
print(f" {t['query'][:150]}")
print()

View File

@@ -1,214 +0,0 @@
"""
Analyze WebBench results across ALL 8 agents to stratify tasks by pass count.
Usage: python3 apps/eval/scripts/analyze-webbench.py
"""
import csv
import os
from collections import defaultdict
DATA_DIR = "apps/eval/data/webbench"
AGENTS = [
{"file": "anthropicfinal.csv", "eval_col": "Anthropic_Eval", "name": "Anthropic CUA"},
{"file": "skyvern2.0final.csv", "eval_col": "Skyvern2.0Eval", "name": "Skyvern 2.0"},
{"file": "skyvern2.0browserbasefinal.csv", "eval_col": "Browserbase_SkyvernEval", "name": "Skyvern BB"},
{"file": "openaicuafinal.csv", "eval_col": "CUAEval", "name": "OpenAI CUA"},
{"file": "browserusefinal.csv", "eval_col": "BUEval", "name": "BrowserUse"},
{"file": "convergencehitlfinal.csv", "eval_col": "convergence_hitl_eval", "name": "Convergence"},
{"file": "operatorhitlfinal.csv", "eval_col": "operator_hitl_eval", "name": "Operator"},
{"file": "rtrvrfinal.csv", "eval_col": "Human Label", "name": "RTRVR"},
]
def load_agent(agent):
path = os.path.join(DATA_DIR, agent["file"])
results = {}
with open(path, newline="", encoding="utf-8") as f:
reader = csv.DictReader(f)
for row in reader:
try:
task_id = int(row["ID"])
except (ValueError, KeyError):
continue
eval_val = row.get(agent["eval_col"], "")
results[task_id] = {
"eval": eval_val,
"difficulty": row.get("Difficulty", ""),
"category": row.get("Category", ""),
"task": row.get("Task", ""),
"url": row.get("Starting URL", ""),
}
return results
# Load all agents
print("Loading agents...")
agent_results = {}
for agent in AGENTS:
data = load_agent(agent)
agent_results[agent["name"]] = data
print(f" {agent['name']}: {len(data)} tasks")
# ─── INDIVIDUAL AGENT STATS ──────────────────────────────────────────
print("\n" + "=" * 70)
print("INDIVIDUAL AGENT PASS RATES")
print("=" * 70)
for agent in AGENTS:
name = agent["name"]
data = agent_results[name]
total = len(data)
passed = sum(1 for r in data.values() if r["eval"] and "success" in r["eval"].lower())
easy_total = sum(1 for r in data.values() if r["difficulty"] == "easy")
easy_pass = sum(1 for r in data.values() if r["difficulty"] == "easy" and r["eval"] and "success" in r["eval"].lower())
hard_total = sum(1 for r in data.values() if r["difficulty"] == "hard")
hard_pass = sum(1 for r in data.values() if r["difficulty"] == "hard" and r["eval"] and "success" in r["eval"].lower())
print(f"\n{name}: {passed}/{total} = {passed/total*100:.1f}%")
if easy_total:
print(f" easy: {easy_pass}/{easy_total} = {easy_pass/easy_total*100:.1f}%")
if hard_total:
print(f" hard: {hard_pass}/{hard_total} = {hard_pass/hard_total*100:.1f}%")
# ─── FULL-COVERAGE AGENTS (2452 tasks each) ──────────────────────────
# Anthropic CUA, Skyvern 2.0, Skyvern BB, OpenAI CUA
full_agents = ["Anthropic CUA", "Skyvern 2.0", "Skyvern BB", "OpenAI CUA"]
print("\n" + "=" * 70)
print(f"4 FULL-COVERAGE AGENTS: {', '.join(full_agents)}")
print("(each has ~2452 tasks)")
print("=" * 70)
# Collect IDs present in ALL 4 full agents
all_ids = None
for name in full_agents:
ids = set(agent_results[name].keys())
all_ids = ids if all_ids is None else all_ids & ids
print(f"Tasks in intersection: {len(all_ids)}")
by_pass = defaultdict(list)
for tid in sorted(all_ids):
pass_count = 0
info = {}
agent_evals = {}
for name in full_agents:
r = agent_results[name][tid]
is_success = "success" in r["eval"].lower() if r["eval"] else False
if is_success:
pass_count += 1
agent_evals[name] = "PASS" if is_success else "FAIL"
if not info:
info = r
by_pass[pass_count].append({
"id": tid, "pass_count": pass_count,
"difficulty": info["difficulty"], "category": info["category"],
"task": info["task"], "url": info["url"], "agents": agent_evals,
})
for pc in range(5):
tasks = by_pass[pc]
label = {0: "0/4 (ALL FAIL)", 4: "4/4 (ALL PASS)"}.get(pc, f"{pc}/4")
easy = sum(1 for t in tasks if t["difficulty"] == "easy")
hard = sum(1 for t in tasks if t["difficulty"] == "hard")
cats = defaultdict(int)
for t in tasks:
cats[t["category"]] += 1
urls = len(set(t["url"] for t in tasks))
cat_str = ", ".join(f"{c}({n})" for c, n in sorted(cats.items(), key=lambda x: -x[1]))
print(f"\n{label}: {len(tasks)} tasks")
print(f" easy: {easy}, hard: {hard}")
print(f" categories: {cat_str}")
print(f" unique websites: {urls}")
# ─── NOW ALSO CHECK: how many 0/4 tasks require login? ───────────────
print("\n" + "=" * 70)
print("0/4 TASKS: LOGIN vs NO-LOGIN breakdown")
print("=" * 70)
login_keywords = ["log in", "login", "sign in", "signin", "your account", "your profile",
"your wishlist", "your order", "your cart", "your dashboard", "your settings",
"your subscription", "your inbox", "your message", "your review",
"send a message", "post a comment", "write a review", "submit a",
"publish", "upload"]
zero_pass = by_pass[0]
login_tasks = []
no_login_tasks = []
for t in zero_pass:
task_lower = t["task"].lower()
needs_login = any(kw in task_lower for kw in login_keywords)
if needs_login:
login_tasks.append(t)
else:
no_login_tasks.append(t)
print(f" Likely needs login: {len(login_tasks)}")
print(f" Possibly no login: {len(no_login_tasks)}")
print(f"\n No-login 0/4 tasks by category:")
cats = defaultdict(int)
for t in no_login_tasks:
cats[t["category"]] += 1
cat_str = ", ".join(f"{c}({n})" for c, n in sorted(cats.items(), key=lambda x: -x[1]))
print(f" {cat_str}")
print(f"\n Sample no-login 0/4 tasks:")
for t in no_login_tasks[:10]:
print(f" [{t['id']}] [{t['difficulty']}] [{t['category']}] {t['url']}")
print(f" {t['task'][:180]}")
# ─── ALSO INCLUDE THE HITL AGENTS (smaller overlap) ──────────────────
hitl_agents = ["Convergence", "Operator", "RTRVR"]
print("\n" + "=" * 70)
print(f"HITL AGENTS: {', '.join(hitl_agents)}")
print("=" * 70)
for name in hitl_agents:
data = agent_results[name]
total = len(data)
passed = sum(1 for r in data.values() if r["eval"] and "success" in r["eval"].lower())
print(f" {name}: {passed}/{total} = {passed/total*100:.1f}%")
# See how HITL agents do on the same tasks as the 4 full agents
hitl_ids = None
for name in hitl_agents:
ids = set(agent_results[name].keys())
hitl_ids = ids if hitl_ids is None else hitl_ids & ids
common_hitl = all_ids & hitl_ids if hitl_ids else set()
print(f"\n Tasks in common (all 7 agents): {len(common_hitl)}")
if common_hitl:
by_pass_7 = defaultdict(list)
all_7 = full_agents + hitl_agents
for tid in sorted(common_hitl):
pass_count = 0
info = {}
for name in all_7:
r = agent_results[name].get(tid)
if r:
is_success = "success" in r["eval"].lower() if r["eval"] else False
if is_success:
pass_count += 1
if not info:
info = r
by_pass_7[pass_count].append({"id": tid, **info})
print("\n 7-AGENT PASS COUNT (on common subset):")
for pc in range(8):
if by_pass_7[pc]:
print(f" {pc}/7: {len(by_pass_7[pc])} tasks")
# ─── SUMMARY TABLE ───────────────────────────────────────────────────
print("\n" + "=" * 70)
print("SUMMARY FOR DATASET BUILDING")
print("=" * 70)
print(f"""
Pool sizes (4 full-coverage agents):
0/4 (all fail): {len(by_pass[0]):>4} (login-required: ~{len(login_tasks)}, no-login: ~{len(no_login_tasks)})
1/4: {len(by_pass[1]):>4}
2/4: {len(by_pass[2]):>4}
3/4: {len(by_pass[3]):>4}
4/4 (all pass): {len(by_pass[4]):>4}
─────────────────────
Total: {sum(len(v) for v in by_pass.values()):>4}
""")

View File

@@ -1,233 +0,0 @@
/**
* Analyze WebBench results across 4 agents to stratify tasks by pass count.
* Usage: bun apps/eval/scripts/analyze-webbench.ts
*/
import { parse } from 'csv-parse/sync'
const dataDir = 'apps/eval/data/webbench'
interface AgentConfig {
file: string
evalCol: string
name: string
}
const agents: AgentConfig[] = [
{ file: 'anthropicfinal.csv', evalCol: 'Anthropic_Eval', name: 'Anthropic' },
{ file: 'skyvern2.0final.csv', evalCol: 'Skyvern2.0Eval', name: 'Skyvern' },
{ file: 'openaicuafinal.csv', evalCol: 'CUAEval', name: 'OpenAI CUA' },
{ file: 'browserusefinal.csv', evalCol: 'BUEval', name: 'BrowserUse' },
]
type Row = Record<string, string>
// Parse each agent's results
const agentResults = new Map<
string,
Map<
number,
{
eval: string
difficulty: string
category: string
task: string
url: string
}
>
>()
for (const agent of agents) {
const text = await Bun.file(`${dataDir}/${agent.file}`).text()
const rows: Row[] = parse(text, {
columns: true,
skip_empty_lines: true,
relax_column_count: true,
})
const results = new Map<
number,
{
eval: string
difficulty: string
category: string
task: string
url: string
}
>()
for (const row of rows) {
const id = parseInt(row.ID, 10)
if (Number.isNaN(id)) continue
results.set(id, {
eval: row[agent.evalCol] || '',
difficulty: row.Difficulty || '',
category: row.Category || '',
task: row.Task || '',
url: row['Starting URL'] || '',
})
}
agentResults.set(agent.name, results)
console.log(`${agent.name}: ${results.size} tasks loaded`)
}
// Collect all task IDs seen by any agent (a union, not an intersection;
// BrowserUse covers only ~658 tasks, so missing entries become 'N/A' below)
const allIds = new Set<number>()
for (const [, results] of agentResults) {
for (const id of results.keys()) allIds.add(id)
}
// Build pass count per task
interface TaskStats {
id: number
passCount: number
difficulty: string
category: string
task: string
url: string
agents: Record<string, string>
}
const taskStats: TaskStats[] = []
const _fullAgentNames = agents.map((a) => a.name)
for (const id of allIds) {
let passCount = 0
let _presentCount = 0
const agentEvals: Record<string, string> = {}
let difficulty = ''
let category = ''
let task = ''
let url = ''
for (const agent of agents) {
const result = agentResults.get(agent.name)?.get(id)
if (result) {
_presentCount++
const isSuccess = result.eval?.toLowerCase().includes('success')
if (isSuccess) passCount++
agentEvals[agent.name] = isSuccess ? 'PASS' : 'FAIL'
if (!difficulty) difficulty = result.difficulty
if (!category) category = result.category
if (!task) task = result.task
if (!url) url = result.url
} else {
agentEvals[agent.name] = 'N/A'
}
}
taskStats.push({
id,
passCount,
difficulty,
category,
task,
url,
agents: agentEvals,
})
}
// Group by pass count
const byPassCount: Record<number, TaskStats[]> = {
0: [],
1: [],
2: [],
3: [],
4: [],
}
for (const t of taskStats) {
byPassCount[t.passCount].push(t)
}
console.log('\n═══════════════════════════════════════════════════')
console.log('TASKS BY PASS COUNT (how many agents succeeded)')
console.log('═══════════════════════════════════════════════════\n')
for (let pc = 0; pc <= 4; pc++) {
const tasks = byPassCount[pc]
const label =
pc === 0 ? '0/4 (ALL FAIL)' : pc === 4 ? '4/4 (ALL PASS)' : `${pc}/4`
console.log(`${label}: ${tasks.length} tasks`)
// Breakdown by difficulty
const easy = tasks.filter((t) => t.difficulty === 'easy').length
const hard = tasks.filter((t) => t.difficulty === 'hard').length
console.log(` easy: ${easy}, hard: ${hard}`)
// Breakdown by category
const byCat: Record<string, number> = {}
for (const t of tasks) {
byCat[t.category] = (byCat[t.category] || 0) + 1
}
console.log(
` categories: ${Object.entries(byCat)
.sort((a, b) => b[1] - a[1])
.map(([c, n]) => `${c}(${n})`)
.join(', ')}`,
)
console.log()
}
// Now handle BrowserUse only having 658 tasks — let's also do a 3-agent view (Anthropic, Skyvern, OpenAI)
console.log('\n═══════════════════════════════════════════════════')
console.log('3-AGENT VIEW (Anthropic + Skyvern + OpenAI CUA)')
console.log('(BrowserUse only has 658 tasks, so this is more complete)')
console.log('═══════════════════════════════════════════════════\n')
const threeAgents = ['Anthropic', 'Skyvern', 'OpenAI CUA']
const byPassCount3: Record<number, TaskStats[]> = { 0: [], 1: [], 2: [], 3: [] }
for (const t of taskStats) {
let pc3 = 0
let allPresent = true
for (const a of threeAgents) {
if (t.agents[a] === 'N/A') {
allPresent = false
break
}
if (t.agents[a] === 'PASS') pc3++
}
if (!allPresent) continue
if (!byPassCount3[pc3]) byPassCount3[pc3] = []
byPassCount3[pc3].push(t)
}
let total3 = 0
for (let pc = 0; pc <= 3; pc++) {
const tasks = byPassCount3[pc]
total3 += tasks.length
const label =
pc === 0 ? '0/3 (ALL FAIL)' : pc === 3 ? '3/3 (ALL PASS)' : `${pc}/3`
console.log(`${label}: ${tasks.length} tasks`)
const easy = tasks.filter((t) => t.difficulty === 'easy').length
const hard = tasks.filter((t) => t.difficulty === 'hard').length
console.log(` easy: ${easy}, hard: ${hard}`)
const byCat: Record<string, number> = {}
for (const t of tasks) {
byCat[t.category] = (byCat[t.category] || 0) + 1
}
console.log(
` categories: ${Object.entries(byCat)
.sort((a, b) => b[1] - a[1])
.map(([c, n]) => `${c}(${n})`)
.join(', ')}`,
)
// Show unique websites count
const uniqueUrls = new Set(tasks.map((t) => t.url))
console.log(` unique websites: ${uniqueUrls.size}`)
console.log()
}
console.log(`Total tasks in 3-agent intersection: ${total3}`)
// Quick sample of 0/3 tasks (hardest)
console.log('\n── Sample 0/3 (all fail) tasks ──')
byPassCount3[0].slice(0, 5).forEach((t) => {
console.log(` [${t.id}] [${t.difficulty}] [${t.category}] ${t.url}`)
console.log(` ${t.task.slice(0, 150)}`)
})
console.log('\n── Sample 1/3 tasks ──')
byPassCount3[1].slice(0, 5).forEach((t) => {
console.log(` [${t.id}] [${t.difficulty}] [${t.category}] ${t.url}`)
console.log(` ${t.task.slice(0, 150)}`)
})

View File

@@ -1,340 +0,0 @@
#!/usr/bin/env bun
/**
* Annotate Screenshots with Tool Coordinates
*
* Reads messages.jsonl from an eval run and annotates screenshots with
* coordinate markers showing where browser actions (click, fill, hover, drag)
* actually landed.
*
* Coordinates are in CSS pixels (returned by tool outputs). They're mapped to
* screenshot pixels using: screenshot_xy = css_xy × devicePixelRatio
*
* Usage:
* bun run apps/eval/scripts/annotate-screenshots.ts <results-folder> [--dpr=2]
*
* Options:
* --dpr=N devicePixelRatio (default: 2). Use the value from take_screenshot output.
*
* Output:
* Creates an 'annotated' folder inside the screenshots directory.
*/
import {
copyFileSync,
existsSync,
mkdirSync,
readdirSync,
readFileSync,
} from 'node:fs'
import { basename, join } from 'node:path'
import sharp from 'sharp'
interface ActionInfo {
screenshotNum: number
toolName: string
cssX: number
cssY: number
// For drag: second coordinate
cssX2?: number
cssY2?: number
}
const COORDINATE_TOOLS = new Set([
'click',
'click_at',
'fill',
'hover',
'hover_at',
'type_at',
'drag',
'drag_at',
])
/**
* Parse CSS coordinates from tool output text.
*
* Formats returned by tools:
* "Clicked [47] at (125, 42)"
* "Typed 5 characters into [12] at (300, 150)"
* "Hovered over [31] at (200, 88)"
* "Clicked at (125, 42)"
* "Hovered at (125, 42)"
* "Typed 10 chars at (125, 42)"
* "Dragged [10] (50, 100) → [20] (400, 300)"
* "Dragged from (50, 100) to (400, 300)"
*/
function parseCoordinates(
toolName: string,
output: unknown,
): { x: number; y: number; x2?: number; y2?: number } | null {
const text = extractText(output)
if (!text) return null
// Drag with two coordinate pairs: "(x1, y1) → ... (x2, y2)" or "from (x1, y1) to (x2, y2)"
if (toolName === 'drag' || toolName === 'drag_at') {
const dragMatch = text.match(
/\((\d+),\s*(\d+)\).*?(?:→|to)\s*.*?\((\d+),\s*(\d+)\)/,
)
if (dragMatch) {
return {
x: Number(dragMatch[1]),
y: Number(dragMatch[2]),
x2: Number(dragMatch[3]),
y2: Number(dragMatch[4]),
}
}
}
// Single coordinate: "at (x, y)" or just "(x, y)"
const singleMatch = text.match(/\((\d+),\s*(\d+)\)/)
if (singleMatch) {
return { x: Number(singleMatch[1]), y: Number(singleMatch[2]) }
}
return null
}
function extractText(output: unknown): string | null {
if (typeof output === 'string') return output
if (Array.isArray(output)) {
for (const item of output) {
if (item?.type === 'text' && typeof item.text === 'string')
return item.text
}
}
if (output && typeof output === 'object' && 'text' in output) {
return String((output as Record<string, unknown>).text)
}
return null
}
/**
* Parse messages.jsonl to extract actions with coordinates
*/
function parseMessages(messagesPath: string): ActionInfo[] {
const content = readFileSync(messagesPath, 'utf-8')
const lines = content.trim().split('\n')
const messages = lines.map((line) => JSON.parse(line))
const actions: ActionInfo[] = []
const pendingTools = new Map<
string,
{ toolName: string; screenshotNum: number }
>()
let screenshotNum = 0
for (const msg of messages) {
if (msg.type === 'tool-input-available') {
pendingTools.set(msg.toolCallId, {
toolName: msg.toolName,
screenshotNum: -1,
})
}
if (msg.type === 'tool-output-available') {
screenshotNum++
const pending = pendingTools.get(msg.toolCallId)
if (!pending) continue
if (!COORDINATE_TOOLS.has(pending.toolName)) {
pendingTools.delete(msg.toolCallId)
continue
}
const coords = parseCoordinates(pending.toolName, msg.output)
if (coords) {
actions.push({
screenshotNum,
toolName: pending.toolName,
cssX: coords.x,
cssY: coords.y,
cssX2: coords.x2,
cssY2: coords.y2,
})
}
pendingTools.delete(msg.toolCallId)
}
}
return actions
}
async function annotateScreenshot(
inputPath: string,
outputPath: string,
action: ActionInfo | null,
dpr: number,
): Promise<void> {
if (!action) {
copyFileSync(inputPath, outputPath)
return
}
const image = sharp(inputPath)
const metadata = await image.metadata()
// biome-ignore lint/style/noNonNullAssertion: sharp metadata always has dimensions for valid images
const imgWidth = metadata.width!
// biome-ignore lint/style/noNonNullAssertion: sharp metadata always has dimensions for valid images
const imgHeight = metadata.height!
const sx = Math.round(action.cssX * dpr)
const sy = Math.round(action.cssY * dpr)
let markersSvg = ''
// Primary marker (red crosshair)
markersSvg += `
<circle cx="${sx}" cy="${sy}" r="25" fill="none" stroke="red" stroke-width="4"/>
<circle cx="${sx}" cy="${sy}" r="6" fill="red" fill-opacity="0.6"/>
<line x1="${sx - 40}" y1="${sy}" x2="${sx - 10}" y2="${sy}" stroke="red" stroke-width="3"/>
<line x1="${sx + 10}" y1="${sy}" x2="${sx + 40}" y2="${sy}" stroke="red" stroke-width="3"/>
<line x1="${sx}" y1="${sy - 40}" x2="${sx}" y2="${sy - 10}" stroke="red" stroke-width="3"/>
<line x1="${sx}" y1="${sy + 10}" x2="${sx}" y2="${sy + 40}" stroke="red" stroke-width="3"/>
`
// Drag target marker (orange)
if (action.cssX2 !== undefined && action.cssY2 !== undefined) {
const sx2 = Math.round(action.cssX2 * dpr)
const sy2 = Math.round(action.cssY2 * dpr)
markersSvg += `
<circle cx="${sx2}" cy="${sy2}" r="25" fill="none" stroke="orange" stroke-width="4"/>
<circle cx="${sx2}" cy="${sy2}" r="6" fill="orange" fill-opacity="0.6"/>
<line x1="${sx}" y1="${sy}" x2="${sx2}" y2="${sy2}" stroke="orange" stroke-width="2" stroke-dasharray="8,4"/>
`
}
// Info box
const label2 =
action.cssX2 !== undefined
? ` → (${action.cssX2}, ${action.cssY2}) css`
: ''
const infoText = `${action.toolName}: (${action.cssX}, ${action.cssY}) css × ${dpr} dpr = (${sx}, ${sy}) px${label2}`
markersSvg += `
<rect x="10" y="10" width="${Math.min(infoText.length * 8 + 20, imgWidth - 20)}" height="50" fill="rgba(0,0,0,0.9)" rx="5"/>
<text x="20" y="30" fill="red" font-family="monospace" font-size="14" font-weight="bold">
Screenshot ${action.screenshotNum}: AFTER ${action.toolName}
</text>
<text x="20" y="50" fill="white" font-family="monospace" font-size="12">
${infoText}
</text>
`
const svg = `<svg width="${imgWidth}" height="${imgHeight}">${markersSvg}</svg>`
await image
.composite([{ input: Buffer.from(svg), top: 0, left: 0 }])
.toFile(outputPath)
}
async function main() {
const args = process.argv.slice(2)
const flags = args.filter((a) => a.startsWith('--'))
const positional = args.filter((a) => !a.startsWith('--'))
if (positional.length === 0) {
console.log(
'Usage: bun run apps/eval/scripts/annotate-screenshots.ts <results-folder> [--dpr=2]',
)
console.log('')
console.log('Example:')
console.log(
' bun run apps/eval/scripts/annotate-screenshots.ts apps/eval/results/single/Amazon--3',
)
process.exit(1)
}
const dprFlag = flags.find((f) => f.startsWith('--dpr='))
let dpr = dprFlag ? Number(dprFlag.split('=')[1]) : 0
// Try reading DPR from metadata.json if not explicitly provided
if (!dpr) {
const metadataPath = join(positional[0], 'metadata.json')
if (existsSync(metadataPath)) {
const meta = JSON.parse(readFileSync(metadataPath, 'utf-8'))
dpr = meta.device_pixel_ratio ?? 0
if (dpr) console.log(`Read devicePixelRatio=${dpr} from metadata.json`)
}
}
if (!dpr) {
console.error(
'Error: devicePixelRatio not found in metadata.json. Provide --dpr=N flag.',
)
process.exit(1)
}
const resultsFolder = positional[0]
const messagesPath = join(resultsFolder, 'messages.jsonl')
const screenshotsDir = join(resultsFolder, 'screenshots')
const annotatedDir = join(screenshotsDir, 'annotated')
if (!existsSync(messagesPath)) {
console.error(`Error: messages.jsonl not found at ${messagesPath}`)
process.exit(1)
}
if (!existsSync(screenshotsDir)) {
console.error(`Error: screenshots directory not found at ${screenshotsDir}`)
process.exit(1)
}
mkdirSync(annotatedDir, { recursive: true })
console.log(`devicePixelRatio: ${dpr}`)
console.log('Parsing messages.jsonl...')
const actions = parseMessages(messagesPath)
console.log(`Found ${actions.length} actions with coordinates:`)
for (const action of actions) {
const dragInfo =
action.cssX2 !== undefined ? ` → (${action.cssX2}, ${action.cssY2})` : ''
console.log(
` Screenshot ${action.screenshotNum}: ${action.toolName} at (${action.cssX}, ${action.cssY})${dragInfo} css → (${Math.round(action.cssX * dpr)}, ${Math.round(action.cssY * dpr)}) px`,
)
}
console.log('')
const screenshots = readdirSync(screenshotsDir)
.filter((f) => f.endsWith('.png') && !f.includes('annotated'))
.sort((a, b) => {
const numA = parseInt(basename(a, '.png'), 10)
const numB = parseInt(basename(b, '.png'), 10)
return numA - numB
})
console.log(`Found ${screenshots.length} screenshots`)
const firstMeta = await sharp(join(screenshotsDir, screenshots[0])).metadata()
console.log(`Screenshot dimensions: ${firstMeta.width} x ${firstMeta.height}`)
console.log('')
const actionByScreenshot = new Map<number, ActionInfo>()
for (const action of actions) {
actionByScreenshot.set(action.screenshotNum, action)
}
console.log('Annotating screenshots...')
for (const ss of screenshots) {
const ssNum = parseInt(basename(ss, '.png'), 10)
const inputPath = join(screenshotsDir, ss)
const outputPath = join(annotatedDir, `${ssNum}_annotated.png`)
const action = actionByScreenshot.get(ssNum) || null
if (action) {
console.log(` ${ss} → annotated (${action.toolName})`)
} else {
console.log(` ${ss} → copied (no coordinates)`)
}
await annotateScreenshot(inputPath, outputPath, action, dpr)
}
console.log('')
console.log(`Done! Annotated screenshots saved to: ${annotatedDir}`)
}
main().catch((err) => {
console.error('Error:', err)
process.exit(1)
})

View File

@@ -1,249 +0,0 @@
/**
* Long-running stress test to simulate eval behavior
* Run with: bun apps/eval/scripts/debug-long-run.ts
*/
import { Client } from '@modelcontextprotocol/sdk/client/index.js'
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js'
const SERVER_URL = 'http://127.0.0.1:9110'
const MCP_URL = `${SERVER_URL}/mcp`
// Simulate 60 turns like the failing task had
const NUM_TURNS = 60
const SCREENSHOT_EVERY_N_TURNS = 1
async function checkBrowserReady(): Promise<boolean> {
try {
const res = await fetch(`${SERVER_URL}/health`, {
signal: AbortSignal.timeout(5000),
})
if (!res.ok) return false
const data = (await res.json()) as { cdpConnected?: boolean }
return data.cdpConnected === true
} catch {
return false
}
}
async function callMcpTool(
name: string,
args: Record<string, unknown> = {},
timeoutMs: number = 65000,
): Promise<{ success: boolean; error?: string; duration: number }> {
const start = Date.now()
const client = new Client({ name: 'long-run-test', version: '1.0.0' })
const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))
try {
await client.connect(transport)
const toolPromise = client.callTool({ name, arguments: args })
const timeoutPromise = new Promise<never>((_, reject) =>
setTimeout(
() => reject(new Error(`Timeout after ${timeoutMs}ms`)),
timeoutMs,
),
)
const result = await Promise.race([toolPromise, timeoutPromise])
const duration = Date.now() - start
const res = result as Record<string, unknown>
if (res.isError) {
const content = res.content as
| Array<{ type: string; text?: string }>
| undefined
const errorText =
content?.find((c) => c.type === 'text')?.text || 'Unknown error'
return { success: false, error: errorText, duration }
}
return { success: true, duration }
} catch (error) {
return {
success: false,
error: error instanceof Error ? error.message : String(error),
duration: Date.now() - start,
}
} finally {
try {
await transport.close()
} catch {}
}
}
async function main() {
console.log('='.repeat(60))
console.log('Long-Running Stress Test (simulating eval)')
console.log('='.repeat(60))
console.log(
`Simulating ${NUM_TURNS} turns with screenshots every ${SCREENSHOT_EVERY_N_TURNS} turn(s)`,
)
console.log()
// Create window
console.log('Creating window...')
let windowId = 0
let tabId = 0
const client = new Client({ name: 'long-run-test', version: '1.0.0' })
const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))
try {
await client.connect(transport)
const result = await client.callTool({
name: 'browser_create_window',
arguments: { url: 'https://example.com', focused: false },
})
// Try structured content first
const createRes = result as Record<string, unknown>
const structured = createRes.structuredContent as
| Record<string, number>
| undefined
windowId = structured?.windowId ?? 0
tabId = structured?.tabId ?? 0
// Fall back to parsing text
if (!windowId || !tabId) {
const content = createRes.content as
| Array<{ type: string; text?: string }>
| undefined
const text = content?.find((c) => c.type === 'text')?.text || ''
const windowMatch = text.match(/window\s+(\d+)/i)
const tabMatch =
text.match(/Tab ID:\s*(\d+)/i) || text.match(/tab\s+(\d+)/i)
if (windowMatch) windowId = parseInt(windowMatch[1], 10)
if (tabMatch) tabId = parseInt(tabMatch[1], 10)
}
} finally {
try {
await transport.close()
} catch {}
}
if (!windowId || !tabId) {
console.log('❌ Could not determine window/tab IDs')
console.log('Trying to get from list tabs...')
// Try listing tabs
const client2 = new Client({ name: 'long-run-test', version: '1.0.0' })
const transport2 = new StreamableHTTPClientTransport(new URL(MCP_URL))
try {
await client2.connect(transport2)
const tabs = await client2.callTool({
name: 'browser_list_tabs',
arguments: {},
})
console.log('Tabs response:', JSON.stringify(tabs, null, 2))
} finally {
try {
await transport2.close()
} catch {}
}
return
}
console.log(`Window: ${windowId}, Tab: ${tabId}`)
console.log()
await new Promise((r) => setTimeout(r, 2000))
// Stats
let screenshotSuccess = 0
let screenshotFail = 0
let toolSuccess = 0
let toolFail = 0
let browserDisconnects = 0
const startTime = Date.now()
// Simulate turns
for (let turn = 1; turn <= NUM_TURNS; turn++) {
const _turnStart = Date.now()
// Random tool calls to simulate agent behavior
const tools = [
{
name: 'browser_get_interactive_elements',
args: { tabId, windowId, simplified: true },
},
{ name: 'browser_list_tabs', args: { windowId } },
{ name: 'browser_get_active_tab', args: { windowId } },
]
// Pick a random tool
const tool = tools[Math.floor(Math.random() * tools.length)]
const toolRes = await callMcpTool(tool.name, tool.args, 30000)
if (toolRes.success) {
toolSuccess++
} else {
toolFail++
console.log(` Turn ${turn}: ❌ ${tool.name} failed: ${toolRes.error}`)
}
// Screenshot every N turns
if (turn % SCREENSHOT_EVERY_N_TURNS === 0) {
const ssRes = await callMcpTool(
'browser_get_screenshot',
{ tabId, windowId, size: 'small' },
65000,
)
if (ssRes.success) {
screenshotSuccess++
} else {
screenshotFail++
console.log(` Turn ${turn}: ❌ Screenshot failed: ${ssRes.error}`)
}
}
// Check browser status
const browserReady = await checkBrowserReady()
if (!browserReady) {
browserDisconnects++
console.log(` Turn ${turn}: ⚠️ Browser became unavailable!`)
}
// Progress
if (turn % 10 === 0) {
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1)
console.log(
`Turn ${turn}/${NUM_TURNS} - Screenshots: ${screenshotSuccess}/${turn}, Tools: ${toolSuccess}/${turn}, Disconnects: ${browserDisconnects}, Elapsed: ${elapsed}s`,
)
}
// Small delay between turns
await new Promise((r) => setTimeout(r, 200))
}
// Cleanup
console.log('\nClosing window...')
await callMcpTool('browser_close_window', { windowId })
// Summary
const totalTime = ((Date.now() - startTime) / 1000).toFixed(1)
console.log(`\n${'='.repeat(60)}`)
console.log('SUMMARY')
console.log('='.repeat(60))
console.log(`Total time: ${totalTime}s`)
console.log(
`Screenshots: ${screenshotSuccess}/${NUM_TURNS} (${((screenshotSuccess / NUM_TURNS) * 100).toFixed(1)}%)`,
)
console.log(
`Tool calls: ${toolSuccess}/${NUM_TURNS} (${((toolSuccess / NUM_TURNS) * 100).toFixed(1)}%)`,
)
console.log(`Browser disconnects: ${browserDisconnects}`)
if (screenshotFail > 0 || toolFail > 0 || browserDisconnects > 0) {
console.log('\n⚠ Issues detected during long run!')
} else {
console.log('\n✅ All operations completed successfully!')
}
}
main().catch(console.error)

View File

@@ -1,307 +0,0 @@
/**
* Debug script to test MCP server stability
* Run with: bun apps/eval/scripts/debug-mcp.ts
*/
import { Client } from '@modelcontextprotocol/sdk/client/index.js'
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js'
const SERVER_URL = 'http://127.0.0.1:9110'
const MCP_URL = `${SERVER_URL}/mcp`
interface TestResult {
test: string
success: boolean
duration: number
error?: string
}
const results: TestResult[] = []
async function checkHealth(): Promise<boolean> {
try {
const res = await fetch(`${SERVER_URL}/health`, {
signal: AbortSignal.timeout(5000),
})
return res.ok
} catch {
return false
}
}
async function checkBrowserReady(): Promise<boolean> {
try {
const res = await fetch(`${SERVER_URL}/health`, {
signal: AbortSignal.timeout(5000),
})
if (!res.ok) return false
const data = (await res.json()) as { cdpConnected?: boolean }
return data.cdpConnected === true
} catch {
return false
}
}
async function callMcpTool(
name: string,
args: Record<string, unknown> = {},
timeoutMs: number = 30000,
): Promise<{
success: boolean
result?: unknown
error?: string
duration: number
}> {
const start = Date.now()
const client = new Client({ name: 'debug-script', version: '1.0.0' })
const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))
try {
await client.connect(transport)
const toolPromise = client.callTool({ name, arguments: args })
// Track the timer so it can be cleared once the race settles; otherwise a
// pending 30-65s timeout keeps the event loop (and the script) alive.
let timeoutId: ReturnType<typeof setTimeout> | null = null
const timeoutPromise = new Promise<never>((_, reject) => {
timeoutId = setTimeout(
() => reject(new Error(`Timeout after ${timeoutMs}ms`)),
timeoutMs,
)
})
let result: unknown
try {
result = await Promise.race([toolPromise, timeoutPromise])
} finally {
if (timeoutId) clearTimeout(timeoutId)
}
const duration = Date.now() - start
if ((result as any).isError) {
const errorText =
(result as any).content?.find((c: any) => c.type === 'text')?.text ||
'Unknown error'
return { success: false, error: errorText, duration }
}
return { success: true, result, duration }
} catch (error) {
return {
success: false,
error: error instanceof Error ? error.message : String(error),
duration: Date.now() - start,
}
} finally {
try {
await transport.close()
} catch {}
}
}
async function runTest(name: string, fn: () => Promise<void>): Promise<void> {
const start = Date.now()
try {
await fn()
results.push({ test: name, success: true, duration: Date.now() - start })
console.log(`✅ ${name} (${Date.now() - start}ms)`)
} catch (error) {
const errorMsg = error instanceof Error ? error.message : String(error)
results.push({
test: name,
success: false,
duration: Date.now() - start,
error: errorMsg,
})
console.log(`❌ ${name}: ${errorMsg} (${Date.now() - start}ms)`)
}
}
async function main() {
console.log('='.repeat(60))
console.log('MCP Server Debug Script')
console.log('='.repeat(60))
console.log(`Server URL: ${SERVER_URL}`)
console.log()
// Phase 1: Basic connectivity
console.log('\n--- Phase 1: Basic Connectivity ---\n')
await runTest('Health check', async () => {
const healthy = await checkHealth()
if (!healthy) throw new Error('Server not healthy')
})
await runTest('Browser status', async () => {
const connected = await checkBrowserReady()
if (!connected) throw new Error('Browser not ready')
})
// Phase 2: List tools
console.log('\n--- Phase 2: List Tools ---\n')
let tools: string[] = []
await runTest('List MCP tools', async () => {
const client = new Client({ name: 'debug-script', version: '1.0.0' })
const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))
try {
await client.connect(transport)
const result = await client.listTools()
tools = result.tools.map((t) => t.name)
console.log(` Found ${tools.length} tools`)
} finally {
try {
await transport.close()
} catch {}
}
})
// Phase 3: Create window and test tools
console.log('\n--- Phase 3: Window & Screenshot Tests ---\n')
let windowId: number | null = null
let tabId: number | null = null
await runTest('Create window', async () => {
const res = await callMcpTool('browser_create_window', {
url: 'https://example.com',
focused: false,
})
if (!res.success) throw new Error(res.error)
const structured = (res.result as any)?.structuredContent
windowId = structured?.windowId
tabId = structured?.tabId
if (!windowId || !tabId) {
// Try parsing from text
const text =
(res.result as any)?.content?.find((c: any) => c.type === 'text')
?.text || ''
const windowMatch = text.match(/window\s+(\d+)/i)
const tabMatch = text.match(/tab\s+(?:ID:\s*)?(\d+)/i)
if (windowMatch) windowId = parseInt(windowMatch[1], 10)
if (tabMatch) tabId = parseInt(tabMatch[1], 10)
}
if (!windowId || !tabId) throw new Error('Could not get windowId/tabId')
console.log(` Window: ${windowId}, Tab: ${tabId}`)
})
// Wait for page to load
await new Promise((r) => setTimeout(r, 2000))
// Phase 4: Screenshot stress test
console.log('\n--- Phase 4: Screenshot Stress Test (10 screenshots) ---\n')
let screenshotSuccesses = 0
let screenshotFailures = 0
for (let i = 1; i <= 10; i++) {
const res = await callMcpTool(
'browser_get_screenshot',
{
tabId,
windowId,
size: 'small',
},
65000,
)
if (res.success) {
screenshotSuccesses++
console.log(` Screenshot ${i}: ✅ (${res.duration}ms)`)
} else {
screenshotFailures++
console.log(` Screenshot ${i}: ❌ ${res.error} (${res.duration}ms)`)
}
// Check browser status between screenshots
const extConnected = await checkBrowserReady()
if (!extConnected) {
console.log(` ⚠️ Browser became unavailable after screenshot ${i}!`)
}
// Small delay between screenshots
await new Promise((r) => setTimeout(r, 500))
}
console.log(
`\n Screenshot results: ${screenshotSuccesses}/10 success, ${screenshotFailures}/10 failed`,
)
// Phase 5: Other tool tests
console.log('\n--- Phase 5: Other Tool Tests ---\n')
await runTest('Get active tab', async () => {
const res = await callMcpTool('browser_get_active_tab', { windowId })
if (!res.success) throw new Error(res.error)
})
await runTest('List tabs', async () => {
const res = await callMcpTool('browser_list_tabs', { windowId })
if (!res.success) throw new Error(res.error)
})
await runTest('Get interactive elements', async () => {
const res = await callMcpTool('browser_get_interactive_elements', {
tabId,
windowId,
simplified: true,
})
if (!res.success) throw new Error(res.error)
})
await runTest('Navigate', async () => {
const res = await callMcpTool('browser_navigate', {
url: 'https://google.com',
tabId,
windowId,
})
if (!res.success) throw new Error(res.error)
})
await new Promise((r) => setTimeout(r, 2000))
await runTest('Get content snapshot', async () => {
const res = await callMcpTool('browser_get_content', { tabId, windowId })
if (!res.success) throw new Error(res.error)
})
// Phase 6: Cleanup
console.log('\n--- Phase 6: Cleanup ---\n')
if (windowId) {
await runTest('Close window', async () => {
const res = await callMcpTool('browser_close_window', { windowId })
if (!res.success) throw new Error(res.error)
})
}
// Final browser readiness check
await runTest('Final browser status', async () => {
const connected = await checkBrowserReady()
if (!connected) throw new Error('Browser not ready')
})
// Summary
console.log(`\n${'='.repeat(60)}`)
console.log('SUMMARY')
console.log('='.repeat(60))
const passed = results.filter((r) => r.success).length
const failed = results.filter((r) => !r.success).length
const avgDuration =
results.reduce((a, b) => a + b.duration, 0) / results.length
console.log(`Total tests: ${results.length}`)
console.log(`Passed: ${passed}`)
console.log(`Failed: ${failed}`)
console.log(`Avg duration: ${avgDuration.toFixed(0)}ms`)
console.log(
`Screenshot success rate: ${screenshotSuccesses}/10 (${screenshotSuccesses * 10}%)`,
)
if (failed > 0) {
console.log('\nFailed tests:')
for (const r of results.filter((r) => !r.success)) {
console.log(` - ${r.test}: ${r.error}`)
}
}
console.log()
}
main().catch(console.error)

View File

@@ -1,647 +0,0 @@
/**
* Test script to validate failure scenario handling
* Run with: bun apps/eval/scripts/test-failure-scenarios.ts
*
* This script simulates various failure scenarios and shows the recovery flow.
* Run each scenario individually to see how the system handles it.
*/
import { dirname, join } from 'node:path'
import { fileURLToPath } from 'node:url'
import { type Subprocess, spawn, spawnSync } from 'bun'
// Ports from config.dev.json - must match BrowserOS server_config.json
const EVAL_PORTS = {
cdp: 9005,
server: 9105, // http_mcp in config.dev.json
} as const
const MONOREPO_ROOT = join(dirname(fileURLToPath(import.meta.url)), '../../..')
// ============================================================================
// Utility Functions (copied from parallel-executor for testing)
// ============================================================================
function log(category: string, message: string): void {
const timestamp = new Date().toISOString().split('T')[1].slice(0, 12)
console.log(`[${timestamp}] [${category}] ${message}`)
}
function killPort(port: number): void {
log('UTIL', `Killing processes on port ${port}`)
spawnSync({
cmd: ['sh', '-c', `lsof -ti:${port} | xargs kill -9 2>/dev/null || true`],
})
}
function isBrowserOSAppRunning(): boolean {
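// pgrep -f matches against the full command line, so any process whose
// invocation mentions "BrowserOS" (app or helpers) counts as running.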
const result = spawnSync({
cmd: ['sh', '-c', 'pgrep -f "BrowserOS" 2>/dev/null || true'],
})
const output = result.stdout?.toString().trim() ?? ''
return output.length > 0
}
async function killBrowserOSApp(): Promise<void> {
log('BROWSEROS', 'Killing BrowserOS application...')
spawnSync({
cmd: ['sh', '-c', 'pkill -9 -f "BrowserOS" 2>/dev/null || true'],
})
killPort(EVAL_PORTS.cdp)
for (let i = 0; i < 10; i++) {
if (!isBrowserOSAppRunning()) {
log('BROWSEROS', 'Application killed')
return
}
await sleep(500)
}
log('BROWSEROS', 'Warning: Application may not have fully terminated')
}
async function launchBrowserOSApp(): Promise<boolean> {
log(
'BROWSEROS',
`Launching BrowserOS (server disabled, CDP=${EVAL_PORTS.cdp})...`,
)
spawnSync({
cmd: [
'open',
'-a',
'BrowserOS',
'--args',
'--disable-browseros-server',
`--browseros-cdp-port=${EVAL_PORTS.cdp}`,
],
})
for (let i = 0; i < 30; i++) {
await sleep(1000)
if (isBrowserOSAppRunning()) {
log(
'BROWSEROS',
'Application launched, waiting for initialization (8s)...',
)
await sleep(8000)
return true
}
}
log('BROWSEROS', 'Failed to launch application')
return false
}
async function waitForPortFree(
port: number,
maxAttempts = 30,
): Promise<boolean> {
for (let i = 0; i < maxAttempts; i++) {
const result = spawnSync({
cmd: ['sh', '-c', `lsof -ti:${port} 2>/dev/null`],
})
if (!result.stdout || result.stdout.toString().trim() === '') {
return true
}
await sleep(500)
}
return false
}
async function waitForServerHealth(
port: number,
maxAttempts = 60,
): Promise<boolean> {
for (let i = 0; i < maxAttempts; i++) {
try {
const res = await fetch(`http://127.0.0.1:${port}/health`, {
signal: AbortSignal.timeout(1000),
})
if (res.ok) return true
} catch {
/* not ready */
}
await sleep(500)
}
return false
}
async function waitForBrowserReady(
port: number,
maxAttempts = 60,
): Promise<boolean> {
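// Debounce: require three consecutive cdpConnected=true health checks
// (500ms apart) before declaring the browser ready, so a briefly flapping
// CDP connection is not mistaken for a stable one.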
let connectedCount = 0
for (let i = 0; i < maxAttempts; i++) {
try {
const res = await fetch(`http://127.0.0.1:${port}/health`, {
signal: AbortSignal.timeout(2000),
})
if (res.ok) {
const data = (await res.json()) as { cdpConnected?: boolean }
if (data.cdpConnected) {
connectedCount++
if (connectedCount >= 3) return true
} else {
connectedCount = 0
}
}
} catch {
connectedCount = 0
}
await sleep(500)
}
return false
}
async function checkBrowserReady(port: number): Promise<boolean> {
try {
const res = await fetch(`http://127.0.0.1:${port}/health`, {
signal: AbortSignal.timeout(3000),
})
if (res.ok) {
const data = (await res.json()) as { cdpConnected?: boolean }
return data.cdpConnected === true
}
} catch {
/* failed */
}
return false
}
function sleep(ms: number): Promise<void> {
return new Promise((r) => setTimeout(r, ms))
}
let serverProc: Subprocess | null = null
async function startServer(): Promise<Subprocess> {
log('SERVER', 'Cleaning up ports...')
killPort(EVAL_PORTS.server)
await waitForPortFree(EVAL_PORTS.server, 30)
log('SERVER', 'Starting server process...')
const proc = spawn({
cmd: [
'bun',
'apps/server/src/index.ts',
'--server-port',
String(EVAL_PORTS.server),
'--cdp-port',
String(EVAL_PORTS.cdp),
],
cwd: MONOREPO_ROOT,
stdout: 'pipe',
stderr: 'pipe',
env: { ...process.env, NODE_ENV: 'development' },
})
serverProc = proc
log('SERVER', `Server started with PID ${proc.pid}`)
return proc
}
async function stopServer(proc: Subprocess): Promise<void> {
log('SERVER', 'Stopping server...')
try {
proc.kill('SIGKILL')
await Promise.race([proc.exited, sleep(5000)])
} catch {
/* ignore */
}
serverProc = null
log('SERVER', 'Server stopped')
}
// ============================================================================
// Scenario Tests
// ============================================================================
async function scenario1_AppNotRunningAtStart(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 1: BrowserOS App Not Running at Start')
console.log('='.repeat(70))
console.log(
'Expected: Detect missing app → Launch app → Wait for init → Continue\n',
)
// Kill the app first
await killBrowserOSApp()
await sleep(2000)
// Now check what happens
log('CHECK', `Is BrowserOS running? ${isBrowserOSAppRunning()}`)
if (!isBrowserOSAppRunning()) {
log('FLOW', '→ App not running, attempting to launch...')
const launched = await launchBrowserOSApp()
if (launched) {
log('FLOW', '→ App launched successfully')
log('CHECK', `Is BrowserOS running now? ${isBrowserOSAppRunning()}`)
} else {
log('FLOW', '→ FAILED to launch app')
log(
'RESULT',
'Task would FAIL with: "BrowserOS application is not running"',
)
return
}
}
log('RESULT', 'SUCCESS - App is now running, can proceed with server start')
}
async function scenario2_BrowserNotReady(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 2: Browser Does Not Become Ready Within 30 Seconds')
console.log('='.repeat(70))
console.log(
'Expected: Wait 30s → Restart BrowserOS app → Retry → Success or fail after 3 attempts\n',
)
// Make sure app is running first
if (!isBrowserOSAppRunning()) {
log('SETUP', 'Launching BrowserOS for test...')
await launchBrowserOSApp()
}
const MAX_RETRIES = 3
let browserOSRestartAttempted = false
for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
log('ATTEMPT', `Server start attempt ${attempt}/${MAX_RETRIES}`)
try {
const proc = await startServer()
log('WAIT', 'Waiting for server health...')
const healthy = await waitForServerHealth(EVAL_PORTS.server, 30)
if (!healthy) {
throw new Error('Server health check failed')
}
log('HEALTH', 'Server health OK')
log('WAIT', 'Waiting for browser readiness (30s timeout)...')
const browserReady = await waitForBrowserReady(EVAL_PORTS.server, 60)
if (!browserReady) {
log('TIMEOUT', 'Browser did not become ready within 30 seconds')
await stopServer(proc)
if (!browserOSRestartAttempted) {
log('RECOVERY', '→ Restarting BrowserOS application...')
await killBrowserOSApp()
await sleep(2000)
const restarted = await launchBrowserOSApp()
browserOSRestartAttempted = true
if (restarted) {
log('RECOVERY', '→ BrowserOS restarted, will retry server')
continue
} else {
log('RECOVERY', '→ FAILED to restart BrowserOS')
}
}
throw new Error('Browser did not become ready')
}
log('CONNECTED', 'Browser ready!')
await stopServer(proc)
log('RESULT', 'SUCCESS - Would proceed with task execution')
return
} catch (error) {
log('ERROR', `Attempt ${attempt} failed: ${error}`)
if (attempt === MAX_RETRIES) {
log('RESULT', 'FAILURE - All retries exhausted, task would fail')
}
}
await sleep(5000)
}
}
async function scenario3_ServerCrashesMidTask(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 3: Server Process Crashes Mid-Task')
console.log('='.repeat(70))
console.log(
'Expected: Task fails → Clean up ports → Next task restarts fresh\n',
)
if (!isBrowserOSAppRunning()) {
log('SETUP', 'Launching BrowserOS for test...')
await launchBrowserOSApp()
}
const proc = await startServer()
log('WAIT', 'Waiting for server to be ready...')
const healthy = await waitForServerHealth(EVAL_PORTS.server, 30)
if (!healthy) {
log('SETUP', 'Server failed to become healthy')
return
}
const browserReady = await waitForBrowserReady(EVAL_PORTS.server, 60)
if (!browserReady) {
log('SETUP', 'Browser did not become ready')
await stopServer(proc)
return
}
log('READY', 'Server and browser ready')
log('SIMULATE', 'Simulating server crash by killing the process...')
// Kill the server to simulate crash
proc.kill('SIGKILL')
await sleep(1000)
// Check what we see now
log('CHECK', 'Checking server health after crash...')
const stillHealthy = await waitForServerHealth(EVAL_PORTS.server, 5)
log('CHECK', `Server health: ${stillHealthy ? 'OK' : 'FAILED'}`)
log('CHECK', 'Checking browser readiness...')
const stillConnected = await checkBrowserReady(EVAL_PORTS.server)
log('CHECK', `Browser ready: ${stillConnected}`)
if (!stillHealthy || !stillConnected) {
log('DETECTED', '→ Infrastructure failure detected!')
log(
'RECOVERY',
'→ In real flow: Would clean up ports and restart for next task',
)
killPort(EVAL_PORTS.server)
log('CLEANUP', 'Ports cleaned')
log('RESULT', 'Task would FAIL, but next task gets clean environment')
}
}
async function scenario4_ToolTimeout(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 4: Tool Execution Timeout')
console.log('='.repeat(70))
console.log(
'Expected: Tool times out → Error contains "timeout" → Classified as infra error → Clean restart\n',
)
// Simulate what happens when we get a timeout error
const errorMessage = 'MCP tool call timed out after 65000ms'
log('ERROR', `Received error: "${errorMessage}"`)
const isInfraError =
errorMessage.includes('BrowserOS') ||
errorMessage.includes('server') ||
errorMessage.includes('not connected') ||
errorMessage.includes('timed out') ||
errorMessage.includes('timeout')
log('CLASSIFY', `Is infrastructure error? ${isInfraError}`)
if (isInfraError) {
log('FLOW', '→ Error classified as infrastructure failure')
log('FLOW', '→ Would kill ports for clean next-task state')
log('FLOW', '→ killPort(9110)')
log('FLOW', '→ killPort(9310)')
log('RESULT', 'Task FAILS, but ports cleaned for next task')
} else {
log('FLOW', '→ Error classified as task-specific failure')
log('RESULT', 'Task FAILS, environment not reset')
}
}
async function scenario5_BrowserUnavailableMidTask(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 5: Browser Becomes Unavailable Mid-Task (App Crashes)')
console.log('='.repeat(70))
console.log(
'Expected: Tool call fails → "not connected" error → Kill app → Restart for next task\n',
)
if (!isBrowserOSAppRunning()) {
log('SETUP', 'Launching BrowserOS for test...')
await launchBrowserOSApp()
}
const proc = await startServer()
log('WAIT', 'Waiting for server to be ready...')
await waitForServerHealth(EVAL_PORTS.server, 30)
await waitForBrowserReady(EVAL_PORTS.server, 60)
log('READY', 'Server and browser ready')
log('SIMULATE', 'Simulating BrowserOS crash by killing the app...')
await killBrowserOSApp()
await sleep(2000)
// Check browser status
log('CHECK', 'Checking browser readiness after app crash...')
const stillConnected = await checkBrowserReady(EVAL_PORTS.server)
log('CHECK', `Browser ready: ${stillConnected}`)
if (!stillConnected) {
log('DETECTED', '→ Browser became unavailable!')
const errorMessage = 'BrowserOS helper service not connected'
log('ERROR', `Tool call would fail with: "${errorMessage}"`)
const isInfraError = errorMessage.includes('not connected')
log('CLASSIFY', `Is infrastructure error? ${isInfraError}`)
if (isInfraError) {
log('RECOVERY', '→ Cleaning up for next task...')
await stopServer(proc)
killPort(EVAL_PORTS.server)
log('RECOVERY', '→ Next task would check if BrowserOS is running...')
const appRunning = isBrowserOSAppRunning()
log('CHECK', `BrowserOS running: ${appRunning}`)
if (!appRunning) {
log('RECOVERY', '→ Would launch BrowserOS app')
await launchBrowserOSApp()
}
log('RESULT', 'Current task FAILS, next task gets fresh environment')
}
} else {
await stopServer(proc)
}
}
async function scenario6_GracefulShutdown(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 6: Graceful Shutdown (Ctrl+C)')
console.log('='.repeat(70))
console.log('Expected: SIGINT received → Kill server → Clean ports → Exit\n')
log('INFO', 'In real flow, signal handlers are registered at startup:')
log('CODE', ' process.on("SIGINT", cleanup)')
log('CODE', ' process.on("SIGTERM", cleanup)')
log('CODE', ' process.on("uncaughtException", cleanup)')
log('FLOW', 'When Ctrl+C is pressed:')
log('FLOW', ' 1. isShuttingDown = true (prevent duplicate cleanup)')
log('FLOW', ' 2. Kill server process if running')
log('FLOW', ' 3. Kill processes on ports 9110, 9310')
log('FLOW', ' 4. Exit with code 0')
log('RESULT', 'Clean shutdown, no orphaned processes')
}
async function scenario7_ConsecutiveFailures(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 7: Consecutive Task Failures')
console.log('='.repeat(70))
console.log(
'Expected: Each failed task cleans up → Next task gets fresh start\n',
)
const tasks = ['task-1', 'task-2', 'task-3']
for (const taskId of tasks) {
log('TASK', `=== Starting ${taskId} ===`)
// Check if app is running
log('CHECK', `BrowserOS running: ${isBrowserOSAppRunning()}`)
if (!isBrowserOSAppRunning()) {
log('FLOW', '→ Would launch BrowserOS')
}
// Simulate infrastructure check before task
log('FLOW', '→ Start server')
log('FLOW', '→ Wait for health')
log('FLOW', '→ Wait for browser readiness')
// Simulate task failure
const failureReason =
taskId === 'task-1'
? 'Browser did not become ready'
: taskId === 'task-2'
? 'Tool timed out after 65000ms'
: 'BrowserOS helper service not connected'
log('ERROR', `Task failed: ${failureReason}`)
const isInfraError =
failureReason.includes('timed out') ||
failureReason.includes('timeout') ||
failureReason.includes('not connected')
if (isInfraError) {
log('CLEANUP', '→ Detected infra error, cleaning ports')
log('CLEANUP', '→ killPort(9110)')
}
log('CLEANUP', '→ Stop server')
log('CLEANUP', '→ Wait 2s before next task')
console.log()
}
log('RESULT', 'Each task failure is isolated, next task starts clean')
}
// ============================================================================
// Main Menu
// ============================================================================
async function main() {
console.log('='.repeat(70))
console.log('Failure Scenario Test Suite')
console.log('='.repeat(70))
console.log(`Server Port: ${EVAL_PORTS.server}`)
console.log(`CDP Port: ${EVAL_PORTS.cdp}`)
console.log()
const scenarios = [
{
num: 1,
name: 'BrowserOS App Not Running at Start',
fn: scenario1_AppNotRunningAtStart,
},
{
num: 2,
name: 'Browser Does Not Become Ready (30s timeout)',
fn: scenario2_BrowserNotReady,
},
{
num: 3,
name: 'Server Process Crashes Mid-Task',
fn: scenario3_ServerCrashesMidTask,
},
{
num: 4,
name: 'Tool Execution Timeout (simulated)',
fn: scenario4_ToolTimeout,
},
{
num: 5,
name: 'Browser Becomes Unavailable Mid-Task (App Crash)',
fn: scenario5_BrowserUnavailableMidTask,
},
{
num: 6,
name: 'Graceful Shutdown (explanation)',
fn: scenario6_GracefulShutdown,
},
{
num: 7,
name: 'Consecutive Task Failures (simulated)',
fn: scenario7_ConsecutiveFailures,
},
]
console.log('Available scenarios:')
for (const s of scenarios) {
console.log(` ${s.num}. ${s.name}`)
}
console.log(' all. Run all scenarios')
console.log()
const arg = process.argv[2]
if (!arg) {
console.log(
'Usage: bun apps/eval/scripts/test-failure-scenarios.ts <scenario-number|all>',
)
console.log('Example: bun apps/eval/scripts/test-failure-scenarios.ts 1')
console.log('Example: bun apps/eval/scripts/test-failure-scenarios.ts all')
process.exit(0)
}
// Setup cleanup handler
const cleanup = async () => {
console.log('\n[CLEANUP] Cleaning up...')
if (serverProc) {
try {
serverProc.kill('SIGKILL')
} catch {}
}
killPort(EVAL_PORTS.server)
process.exit(0)
}
process.on('SIGINT', cleanup)
if (arg === 'all') {
for (const s of scenarios) {
await s.fn()
await sleep(3000)
}
} else {
const num = parseInt(arg, 10)
const scenario = scenarios.find((s) => s.num === num)
if (!scenario) {
console.log(`Unknown scenario: ${arg}`)
process.exit(1)
}
await scenario.fn()
}
// Cleanup
if (serverProc) {
await stopServer(serverProc)
}
console.log(`\n${'='.repeat(70)}`)
console.log('Test completed')
console.log('='.repeat(70))
}
main().catch(console.error)

View File

@@ -1,542 +0,0 @@
/**
* Test script to validate the complete eval lifecycle
* Run with: bun apps/eval/scripts/test-lifecycle.ts
*
* Tests:
* 1. BrowserOS app detection
* 2. Server start/stop
* 3. Browser readiness with verification
* 4. Window create/close
* 5. Screenshot capture
* 6. Multiple tasks in sequence with server restart
*/
import { dirname, join } from 'node:path'
import { fileURLToPath } from 'node:url'
import { Client } from '@modelcontextprotocol/sdk/client/index.js'
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js'
import { type Subprocess, spawn, spawnSync } from 'bun'
// Ports from config.dev.json - must match BrowserOS launch args
const EVAL_PORTS = {
cdp: 9005,
server: 9105, // http_mcp in config.dev.json
} as const
const MONOREPO_ROOT = join(dirname(fileURLToPath(import.meta.url)), '../../..')
const MCP_URL = `http://127.0.0.1:${EVAL_PORTS.server}/mcp`
let currentServerPid: number | null = null
// ============================================================================
// Utility Functions (same as parallel-executor)
// ============================================================================
function killPort(port: number): void {
spawnSync({
cmd: ['sh', '-c', `lsof -ti:${port} | xargs kill -9 2>/dev/null || true`],
})
}
function isBrowserOSAppRunning(): boolean {
const result = spawnSync({
cmd: ['sh', '-c', 'pgrep -f "BrowserOS" 2>/dev/null || true'],
})
const output = result.stdout?.toString().trim() ?? ''
return output.length > 0
}
async function _killBrowserOSApp(): Promise<void> {
console.log(' Killing BrowserOS app...')
spawnSync({
cmd: ['sh', '-c', 'pkill -9 -f "BrowserOS" 2>/dev/null || true'],
})
killPort(EVAL_PORTS.cdp)
for (let i = 0; i < 10; i++) {
if (!isBrowserOSAppRunning()) return
await new Promise((r) => setTimeout(r, 500))
}
}
async function _launchBrowserOSApp(): Promise<boolean> {
console.log(
` Launching BrowserOS (server disabled, CDP=${EVAL_PORTS.cdp})...`,
)
spawnSync({
cmd: [
'open',
'-a',
'BrowserOS',
'--args',
'--disable-browseros-server',
`--remote-debugging-port=${EVAL_PORTS.cdp}`,
`--browseros-cdp-port=${EVAL_PORTS.cdp}`,
`--browseros-mcp-port=${EVAL_PORTS.server}`,
],
})
for (let i = 0; i < 30; i++) {
await new Promise((r) => setTimeout(r, 1000))
if (isBrowserOSAppRunning()) {
await new Promise((r) => setTimeout(r, 8000))
return true
}
}
return false
}
async function waitForPortFree(
port: number,
maxAttempts = 30,
): Promise<boolean> {
for (let i = 0; i < maxAttempts; i++) {
const result = spawnSync({
cmd: ['sh', '-c', `lsof -ti:${port} 2>/dev/null`],
})
if (!result.stdout || result.stdout.toString().trim() === '') {
return true
}
await new Promise((resolve) => setTimeout(resolve, 500))
}
return false
}
async function waitForServerHealth(
serverPort: number,
maxAttempts = 60,
): Promise<boolean> {
for (let i = 0; i < maxAttempts; i++) {
try {
const response = await fetch(`http://127.0.0.1:${serverPort}/health`, {
signal: AbortSignal.timeout(1000),
})
if (response.ok) return true
} catch {
/* not ready */
}
await new Promise((resolve) => setTimeout(resolve, 500))
}
return false
}
async function waitForBrowserReady(
serverPort: number,
maxAttempts = 90,
): Promise<boolean> {
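// Debounce: three consecutive cdpConnected=true checks are required before
// the browser counts as ready.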
let connectedCount = 0
for (let i = 0; i < maxAttempts; i++) {
try {
const response = await fetch(`http://127.0.0.1:${serverPort}/health`, {
signal: AbortSignal.timeout(2000),
})
if (response.ok) {
const data = (await response.json()) as { cdpConnected?: boolean }
if (data.cdpConnected) {
connectedCount++
if (connectedCount >= 3) return true
} else {
connectedCount = 0
}
}
} catch {
connectedCount = 0
}
await new Promise((resolve) => setTimeout(resolve, 500))
}
return false
}
async function startServer(): Promise<Subprocess> {
killPort(EVAL_PORTS.server)
await waitForPortFree(EVAL_PORTS.server, 30)
const serverProc = spawn({
cmd: [
'bun',
'apps/server/src/index.ts',
'--server-port',
String(EVAL_PORTS.server),
'--cdp-port',
String(EVAL_PORTS.cdp),
],
cwd: MONOREPO_ROOT,
stdout: 'pipe',
stderr: 'pipe',
env: { ...process.env, NODE_ENV: 'development' },
})
currentServerPid = serverProc.pid
return serverProc
}
async function stopServer(proc: Subprocess): Promise<void> {
try {
proc.kill('SIGKILL')
await Promise.race([
proc.exited,
new Promise((resolve) => setTimeout(resolve, 5000)),
])
} catch {
/* ignore */
}
currentServerPid = null
}
async function callMcpTool(
name: string,
args: Record<string, unknown> = {},
timeoutMs = 60000,
): Promise<{ success: boolean; result?: any; error?: string }> {
const client = new Client({ name: 'lifecycle-test', version: '1.0.0' })
const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))
try {
await client.connect(transport)
const toolPromise = client.callTool({ name, arguments: args })
// Clear the timeout once the race settles (same pattern as the action
// mapper) so pending timers don't delay process exit.
let timeoutId: ReturnType<typeof setTimeout> | null = null
const timeoutPromise = new Promise<never>((_, reject) => {
timeoutId = setTimeout(
() => reject(new Error(`Timeout after ${timeoutMs}ms`)),
timeoutMs,
)
})
let result: unknown
try {
result = await Promise.race([toolPromise, timeoutPromise])
} finally {
if (timeoutId) clearTimeout(timeoutId)
}
if ((result as any).isError) {
const errorText =
(result as any).content?.find((c: any) => c.type === 'text')?.text ||
'Unknown error'
return { success: false, error: errorText }
}
return { success: true, result }
} catch (error) {
return {
success: false,
error: error instanceof Error ? error.message : String(error),
}
} finally {
try {
await transport.close()
} catch {}
}
}
// ============================================================================
// Tests
// ============================================================================
async function testBrowserOSDetection(): Promise<boolean> {
console.log('\n=== Test 1: BrowserOS App Detection ===')
const running = isBrowserOSAppRunning()
console.log(` BrowserOS running: ${running}`)
if (!running) {
console.log(' ❌ BrowserOS app is not running. Please start it.')
return false
}
console.log(' ✅ BrowserOS app detected')
return true
}
async function testServerStartStop(): Promise<boolean> {
console.log('\n=== Test 2: Server Start/Stop ===')
console.log(' Starting server...')
const proc = await startServer()
console.log(` Server PID: ${proc.pid}`)
console.log(' Waiting for health...')
const healthy = await waitForServerHealth(EVAL_PORTS.server, 30)
if (!healthy) {
console.log(' ❌ Server health check failed')
await stopServer(proc)
return false
}
console.log(' ✅ Server healthy')
console.log(' Waiting for browser readiness...')
const browserReady = await waitForBrowserReady(EVAL_PORTS.server, 60)
if (!browserReady) {
console.log(' ❌ Browser did not become ready')
await stopServer(proc)
return false
}
console.log(' ✅ Browser ready')
console.log(' Stopping server...')
await stopServer(proc)
console.log(' ✅ Server stopped')
return true
}
async function testWindowLifecycle(): Promise<boolean> {
console.log('\n=== Test 3: Window Create/Close ===')
console.log(' Starting server...')
const proc = await startServer()
const healthy = await waitForServerHealth(EVAL_PORTS.server, 30)
if (!healthy) {
console.log(' ❌ Server health check failed')
await stopServer(proc)
return false
}
const browserReady = await waitForBrowserReady(EVAL_PORTS.server, 60)
if (!browserReady) {
console.log(' ❌ Browser did not become ready')
await stopServer(proc)
return false
}
console.log(' Creating window...')
const createResult = await callMcpTool('browser_create_window', {
url: 'https://example.com',
focused: false,
})
if (!createResult.success) {
console.log(` ❌ Failed to create window: ${createResult.error}`)
await stopServer(proc)
return false
}
const windowId = createResult.result?.structuredContent?.windowId
const tabId = createResult.result?.structuredContent?.tabId
console.log(` ✅ Window created: windowId=${windowId}, tabId=${tabId}`)
// Wait for page load
await new Promise((r) => setTimeout(r, 2000))
// Take screenshot
console.log(' Taking screenshot...')
const ssResult = await callMcpTool('browser_get_screenshot', {
tabId,
windowId,
size: 'small',
})
if (!ssResult.success) {
console.log(` ❌ Screenshot failed: ${ssResult.error}`)
} else {
console.log(' ✅ Screenshot captured')
}
// Close window
console.log(' Closing window...')
const closeResult = await callMcpTool('browser_close_window', { windowId })
if (!closeResult.success) {
console.log(
` ⚠️ Close window returned error (may be expected): ${closeResult.error}`,
)
} else {
console.log(' ✅ Window closed')
}
console.log(' Stopping server...')
await stopServer(proc)
console.log(' ✅ Server stopped')
return true
}
async function testMultipleTasksWithRestart(): Promise<boolean> {
console.log('\n=== Test 4: Multiple Tasks with Server Restart ===')
const tasks = [
{ id: 'task-1', url: 'https://example.com' },
{ id: 'task-2', url: 'https://google.com' },
{ id: 'task-3', url: 'https://github.com' },
]
let successCount = 0
for (const task of tasks) {
console.log(`\n --- Task: ${task.id} ---`)
// Start server
console.log(' Starting server...')
const proc = await startServer()
const healthy = await waitForServerHealth(EVAL_PORTS.server, 30)
if (!healthy) {
console.log(` ❌ Task ${task.id}: Server health failed`)
await stopServer(proc)
continue
}
const browserReady = await waitForBrowserReady(EVAL_PORTS.server, 60)
if (!browserReady) {
console.log(` ❌ Task ${task.id}: Browser not ready`)
await stopServer(proc)
continue
}
// Create window
const createResult = await callMcpTool('browser_create_window', {
url: task.url,
focused: false,
})
if (!createResult.success) {
console.log(
` ❌ Task ${task.id}: Window creation failed - ${createResult.error}`,
)
await stopServer(proc)
continue
}
const windowId = createResult.result?.structuredContent?.windowId
console.log(` Window created: ${windowId}`)
await new Promise((r) => setTimeout(r, 2000))
// Close window
await callMcpTool('browser_close_window', { windowId })
console.log(` Window closed`)
// Stop server
await stopServer(proc)
console.log(` Server stopped`)
successCount++
console.log(` ✅ Task ${task.id} completed`)
// Delay between tasks
await new Promise((r) => setTimeout(r, 2000))
}
console.log(`\n Results: ${successCount}/${tasks.length} tasks successful`)
return successCount === tasks.length
}
async function testBrowserStability(): Promise<boolean> {
console.log('\n=== Test 5: Browser Stability (30 seconds) ===')
console.log(' Starting server...')
const proc = await startServer()
const healthy = await waitForServerHealth(EVAL_PORTS.server, 30)
if (!healthy) {
console.log(' ❌ Server health check failed')
await stopServer(proc)
return false
}
const browserReady = await waitForBrowserReady(EVAL_PORTS.server, 60)
if (!browserReady) {
console.log(' ❌ Browser did not become ready')
await stopServer(proc)
return false
}
console.log(' Monitoring browser readiness for 30 seconds...')
let disconnects = 0
const checkInterval = 2000
const totalChecks = 30000 / checkInterval
for (let i = 0; i < totalChecks; i++) {
try {
const response = await fetch(
`http://127.0.0.1:${EVAL_PORTS.server}/health`,
{
signal: AbortSignal.timeout(2000),
},
)
const data = (await response.json()) as { cdpConnected?: boolean }
if (!data.cdpConnected) {
disconnects++
console.log(
` ⚠️ Browser became unavailable at check ${i + 1}/${totalChecks}`,
)
}
} catch {
disconnects++
console.log(` ⚠️ Failed to check browser at ${i + 1}/${totalChecks}`)
}
await new Promise((r) => setTimeout(r, checkInterval))
}
await stopServer(proc)
if (disconnects > 0) {
console.log(` ❌ Browser had ${disconnects} readiness failures`)
return false
}
console.log(' ✅ Browser stayed ready for 30 seconds')
return true
}
// ============================================================================
// Main
// ============================================================================
async function main() {
console.log('='.repeat(60))
console.log('Eval Lifecycle Test Suite')
console.log('='.repeat(60))
console.log(`Server Port: ${EVAL_PORTS.server}`)
console.log(`CDP Port: ${EVAL_PORTS.cdp}`)
const results: { name: string; passed: boolean }[] = []
// Test 1: BrowserOS Detection
results.push({
name: 'BrowserOS Detection',
passed: await testBrowserOSDetection(),
})
if (!results[0].passed) {
console.log('\n❌ Cannot continue without BrowserOS app running')
process.exit(1)
}
// Test 2: Server Start/Stop
results.push({
name: 'Server Start/Stop',
passed: await testServerStartStop(),
})
// Test 3: Window Lifecycle
results.push({
name: 'Window Lifecycle',
passed: await testWindowLifecycle(),
})
// Test 4: Multiple Tasks
results.push({
name: 'Multiple Tasks',
passed: await testMultipleTasksWithRestart(),
})
// Test 5: Browser Stability
results.push({
name: 'Browser Stability',
passed: await testBrowserStability(),
})
// Summary
console.log(`\n${'='.repeat(60)}`)
console.log('SUMMARY')
console.log('='.repeat(60))
const passed = results.filter((r) => r.passed).length
const failed = results.filter((r) => !r.passed).length
for (const r of results) {
console.log(` ${r.passed ? '✅' : '❌'} ${r.name}`)
}
console.log(`\nTotal: ${passed} passed, ${failed} failed`)
if (failed > 0) {
process.exit(1)
}
}
main().catch((error) => {
console.error('Test suite failed:', error)
if (currentServerPid) {
try {
process.kill(currentServerPid, 'SIGKILL')
} catch {}
}
process.exit(1)
})

View File

@@ -1,180 +0,0 @@
/**
* Test script for the PerformanceGrader.
*
* Runs against a real trajectory and logs:
* - Pre-computed metrics passed to the agent
* - Every tool call the agent makes (what it reads/greps)
* - The final grading result with per-axis scores
*
* Uses the running Claude Code process for auth (no API key needed).
*
* Usage: bun run apps/eval/scripts/test-performance-grader.ts [output-dir]
*/
import { readFile } from 'node:fs/promises'
import { join } from 'node:path'
import { query } from '@anthropic-ai/claude-agent-sdk'
import {
buildUserPrompt,
DEFAULT_AXES,
PERFORMANCE_SYSTEM_PROMPT,
} from '../src/graders/performance/axes'
import { extractMetrics } from '../src/graders/performance/metadata-extractor'
import {
DEFAULT_MAX_BUDGET_USD,
DEFAULT_MAX_TURNS,
DEFAULT_PASS_THRESHOLD,
} from '../src/graders/performance/performance-grader'
import {
PERFORMANCE_EVAL_SCHEMA,
type PerformanceEvalResponse,
} from '../src/graders/performance/types'
import { MessageSchema } from '../src/types/message'
const DEFAULT_SAMPLE = 'results/webvoyager-restart/Allrecipes--0'
async function main() {
const outputDir = process.argv[2]
? process.argv[2]
: join(process.cwd(), DEFAULT_SAMPLE)
console.log(`\n=== Performance Grader Test ===`)
console.log(`Output dir: ${outputDir}\n`)
// 1. Load messages
const rawLines = (await readFile(join(outputDir, 'messages.jsonl'), 'utf-8'))
.split('\n')
.filter(Boolean)
const messages = rawLines.map((line) => MessageSchema.parse(JSON.parse(line)))
console.log(`Loaded ${messages.length} messages from messages.jsonl`)
// 2. Load metadata
const metadata = JSON.parse(
await readFile(join(outputDir, 'metadata.json'), 'utf-8'),
)
console.log(`Task: ${metadata.query}`)
console.log(`Duration: ${metadata.total_duration_ms}ms`)
console.log(`Screenshots: ${metadata.total_steps}`)
// 3. Extract metrics
const metrics = extractMetrics(
messages,
metadata.total_steps,
metadata.termination_reason || 'unknown',
)
console.log(`\n--- Pre-Computed Metrics (passed to agent) ---`)
console.log(JSON.stringify(metrics, null, 2))
// 4. Build prompt
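// The /g flag substitutes every {screenshot_count} occurrence, in case the
// template repeats the placeholder.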
const systemPrompt = PERFORMANCE_SYSTEM_PROMPT.replace(
/\{screenshot_count\}/g,
String(metrics.screenshotCount),
)
const userPrompt = buildUserPrompt(
metadata.query,
metadata.final_answer,
metrics,
DEFAULT_AXES,
)
console.log(`\nPrompt size: ${userPrompt.length} chars`)
console.log(`System prompt size: ${systemPrompt.length} chars`)
// 5. Run agent — log every tool call to see its trajectory
console.log(`\n=== Agent Trajectory ===\n`)
let turnCount = 0
let toolCallCount = 0
for await (const message of query({
prompt: userPrompt,
options: {
model: 'claude-sonnet-4-20250514',
cwd: outputDir,
systemPrompt,
allowedTools: ['Read', 'Glob', 'Grep'],
permissionMode: 'bypassPermissions',
allowDangerouslySkipPermissions: true,
maxTurns: DEFAULT_MAX_TURNS,
maxBudgetUsd: DEFAULT_MAX_BUDGET_USD,
outputFormat: {
type: 'json_schema',
schema: PERFORMANCE_EVAL_SCHEMA,
},
env: {
...process.env,
CLAUDECODE: '',
},
},
})) {
if (message.type === 'assistant') {
turnCount++
console.log(`--- Turn ${turnCount} ---`)
for (const block of message.message.content) {
if (block.type === 'text' && block.text) {
const preview =
block.text.length > 400
? `${block.text.slice(0, 400)}...`
: block.text
console.log(` [text] ${preview}`)
}
if (block.type === 'tool_use') {
toolCallCount++
const input = block.input as Record<string, unknown>
// Show what the agent is reading/grepping
if (block.name === 'Read') {
console.log(
` [tool #${toolCallCount}] Read → ${input.file_path}${input.limit ? ` (lines ${input.offset || 1}-${(input.offset || 1) + Number(input.limit)})` : ''}`,
)
} else if (block.name === 'Grep') {
console.log(
` [tool #${toolCallCount}] Grep → pattern="${input.pattern}" path="${input.path || '.'}"`,
)
} else if (block.name === 'Glob') {
console.log(` [tool #${toolCallCount}] Glob → ${input.pattern}`)
} else {
console.log(
` [tool #${toolCallCount}] ${block.name}(${JSON.stringify(input).slice(0, 150)})`,
)
}
}
}
}
if (message.type === 'result') {
console.log(`\n=== Result ===`)
console.log(`Status: ${message.subtype}`)
console.log(`Turns: ${message.num_turns}`)
console.log(`Tool calls: ${toolCallCount}`)
console.log(`Cost: $${message.total_cost_usd.toFixed(4)}`)
if (message.subtype === 'success') {
console.log(`\n--- Scores ---`)
const axes = (
message.structured_output as PerformanceEvalResponse | undefined
)?.axes
if (Array.isArray(axes)) {
let composite = 0
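// Composite = Σ(axis score × axis weight); weights come from DEFAULT_AXES,
// and any axis the model invents gets weight 0.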
for (const a of axes) {
const def = DEFAULT_AXES.find((d) => d.name === a.axis)
const weight = def?.weight ?? 0
composite += a.score * weight
console.log(
` ${a.axis}: ${a.score}/100 (weight: ${weight}) — ${a.reasoning}`,
)
}
console.log(`\n Composite: ${composite.toFixed(1)}/100`)
console.log(
` Pass (>= ${DEFAULT_PASS_THRESHOLD}): ${composite >= DEFAULT_PASS_THRESHOLD ? 'YES' : 'NO'}`,
)
}
} else {
console.log(`Error: ${message.result}`)
}
}
}
}
main().catch(console.error)

View File

@@ -1,200 +0,0 @@
/**
* Validation script for Gemini Computer Use integration
* Run: bun apps/eval/scripts/validate-computer-use-tools.ts
*/
import { Client } from '@modelcontextprotocol/sdk/client/index.js'
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js'
const MCP_URL = process.env.MCP_URL || 'http://127.0.0.1:9105/mcp'
interface McpToolResult {
content: Array<{
type: string
text?: string
data?: string
mimeType?: string
}>
isError?: boolean
}
async function callMcpTool(
serverUrl: string,
name: string,
args: Record<string, unknown> = {},
): Promise<McpToolResult> {
const client = new Client({ name: 'validate-computer-use', version: '1.0.0' })
const transport = new StreamableHTTPClientTransport(new URL(serverUrl), {
requestInit: { headers: { 'X-BrowserOS-Source': 'validation' } },
})
try {
await client.connect(transport)
return (await client.callTool({ name, arguments: args })) as McpToolResult
} finally {
try {
await transport.close()
} catch {}
}
}
async function validateTools() {
console.log('🔍 Validating MCP tools for Gemini Computer Use integration\n')
console.log(`MCP URL: ${MCP_URL}\n`)
// Get active tab first
console.log('1. Getting active tab...')
const tabResult = await callMcpTool(MCP_URL, 'browser_get_active_tab', {})
if (tabResult.isError) {
console.error('❌ Failed to get active tab:', tabResult.content)
process.exit(1)
}
const tabText = tabResult.content.find((c) => c.type === 'text')?.text ?? ''
const tabIdMatch = tabText.match(/ID: (\d+)/)
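// Fall back to tabId 1 if the text format ever changes; good enough for a
// one-off validation script.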
const tabId = tabIdMatch ? parseInt(tabIdMatch[1], 10) : 1
console.log(` ✅ Active tab ID: ${tabId}\n`)
// Validate each tool needed for Computer Use
const toolTests = [
{
name: 'browser_get_screenshot',
args: { tabId, size: 'medium' },
description: 'Screenshot capture',
validate: (r: McpToolResult) => r.content.some((c) => c.type === 'image'),
},
{
name: 'browser_click_coordinates',
args: { tabId, x: 100, y: 100 },
description: 'Click at coordinates',
validate: (r: McpToolResult) => !r.isError,
},
{
name: 'browser_type_at_coordinates',
args: { tabId, x: 100, y: 100, text: 'test' },
description: 'Type at coordinates',
validate: (r: McpToolResult) => !r.isError,
},
{
name: 'browser_scroll_down',
args: { tabId },
description: 'Scroll down',
validate: (r: McpToolResult) => !r.isError,
},
{
name: 'browser_scroll_up',
args: { tabId },
description: 'Scroll up',
validate: (r: McpToolResult) => !r.isError,
},
{
name: 'browser_send_keys',
args: { tabId, key: 'Enter' },
description: 'Send keyboard key',
validate: (r: McpToolResult) => !r.isError,
},
{
name: 'browser_execute_javascript',
args: { tabId, code: 'window.location.href' },
description: 'Execute JavaScript (for go_back/forward workaround)',
validate: (r: McpToolResult) => !r.isError,
},
]
let passed = 0
let failed = 0
for (const test of toolTests) {
process.stdout.write(`2. Testing ${test.name} (${test.description})... `)
try {
const result = await callMcpTool(MCP_URL, test.name, test.args)
if (test.validate(result)) {
console.log('✅')
passed++
} else {
console.log('❌ Validation failed')
console.log(' Result:', JSON.stringify(result, null, 2))
failed++
}
} catch (err) {
console.log('❌ Error:', err instanceof Error ? err.message : err)
failed++
}
}
console.log(`\n${'='.repeat(50)}`)
console.log(`Results: ${passed} passed, ${failed} failed`)
console.log('='.repeat(50))
if (failed === 0) {
console.log(
'\n✅ All tools validated! Gemini Computer Use integration should work.',
)
console.log('\nGaps to address with workarounds:')
console.log(' - key_combination: Use browser_execute_javascript')
console.log(
' - go_back/go_forward: Use browser_execute_javascript with history.back()/forward()',
)
console.log(
' - type_text_at with press_enter: chain browser_send_keys after typing',
)
} else {
console.log('\n⚠️ Some tools failed. Check that your server is running.')
}
}
// Validate Gemini API access
async function validateGeminiApi() {
const apiKey = process.env.GOOGLE_AI_API_KEY || process.env.GEMINI_API_KEY
if (!apiKey) {
console.log('\n⚠️ GOOGLE_AI_API_KEY not set - skipping API validation')
return
}
console.log('\n3. Validating Gemini Computer Use API access...')
const MODEL = 'gemini-2.5-computer-use-preview-10-2025'
const url = `https://generativelanguage.googleapis.com/v1beta/models/${MODEL}:generateContent`
// Minimal test - just check if model is accessible
const testPayload = {
contents: [{ role: 'user', parts: [{ text: 'test' }] }],
}
try {
const response = await fetch(url, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'x-goog-api-key': apiKey,
},
body: JSON.stringify(testPayload),
})
if (response.ok) {
console.log(' ✅ Gemini Computer Use API is accessible')
} else {
const error = await response.json()
console.log(
' ❌ API error:',
error.error?.message || response.statusText,
)
}
} catch (err) {
console.log(
' ❌ Network error:',
err instanceof Error ? err.message : err,
)
}
}
async function main() {
try {
await validateTools()
await validateGeminiApi()
} catch (err) {
console.error('Validation failed:', err)
process.exit(1)
}
}
main()

View File

@@ -62,9 +62,6 @@ const PASS_FAIL_GRADER_ORDER = [
'agisdk_state_diff',
'infinity_state',
'performance_grader',
'webvoyager_grader',
'fara_combined',
'fara_grader',
]
function requireEnv(name: string): string {
@@ -334,9 +331,7 @@ const html = `<!DOCTYPE html>
? 'Orch-Exec'
: r.agentType === 'single'
? 'Tool Loop'
: r.agentType === 'gemini-computer-use'
? 'Gemini CU'
: r.agentType || '—'
: r.agentType || '—'
return `<tr data-config="${escHtml(r.runId)}" data-search="${escHtml(`${r.date} ${r.runId} ${r.model} ${r.dataset} ${archLabel}`)}">
<td>${escHtml(r.date)}</td>
<td class="mono">${escHtml(r.runId)}</td>
@@ -385,7 +380,6 @@ const html = `<!DOCTYPE html>
var latest = runs[runs.length - 1];
var archLabel = latest.agentType === 'orchestrator-executor' ? 'Orchestrator-Executor'
: latest.agentType === 'single' ? 'Single Agent (Tool Loop)'
: latest.agentType === 'gemini-computer-use' ? 'Gemini Computer Use'
: latest.agentType || 'Unknown';
var scoreColor = latest.avgScore >= 75 ? '#3fb950' : latest.avgScore >= 40 ? '#f0883e' : '#f85149';
el.innerHTML =

View File

@@ -1,643 +0,0 @@
/**
* Maps Gemini Computer Use actions to MCP tool calls
*
* Coordinate System:
* - Screenshots captured with size='large' (1028px width, aspect ratio preserved)
* - Gemini outputs normalized coordinates (0-999) relative to the screenshot
* - We convert these to actual viewport coordinates by:
* 1. Getting the real viewport dimensions via JavaScript
* 2. Scaling normalized coords to actual viewport pixels
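*
* Example: with a detected 1440x900 viewport, a normalized click at
* (500, 250) scales to (round(500/1000*1440), round(250/1000*900)) = (720, 225).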
*/
import { Client } from '@modelcontextprotocol/sdk/client/index.js'
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js'
import type { ActionContext, ComputerUseAction, ScreenSize } from './types'
import { DEFAULTS } from './types'
interface McpToolResult {
content: Array<{
type: string
text?: string
data?: string
mimeType?: string
}>
isError?: boolean
}
const MCP_TIMEOUT_MS = 30000
export class ActionMapper {
private ctx: ActionContext
private cachedViewport: ScreenSize | null = null
constructor(ctx: ActionContext) {
this.ctx = ctx
}
// Store debug info about viewport detection for inclusion in responses
private viewportDebugInfo: string = ''
/**
* Get the actual browser viewport size via JavaScript
* Caches the result to avoid repeated calls
* Also stores debug info for troubleshooting
*/
async getViewportSize(): Promise<ScreenSize> {
if (this.cachedViewport) {
return this.cachedViewport
}
try {
const result = await this.callMcp('browser_execute_javascript', {
tabId: this.ctx.tabId,
windowId: this.ctx.windowId,
code: '[window.innerWidth, window.innerHeight]',
})
const textContent =
result.content.find((c) => c.type === 'text')?.text ?? ''
// Check for error in result
if (result.isError) {
this.viewportDebugInfo = `[VIEWPORT ERROR] JS execution failed: ${textContent}. Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
console.warn(this.viewportDebugInfo)
return this.ctx.screenSize
}
// Response format can be multiline:
// "Result: [1440, 900]" or "Result: [\n 1200,\n 712\n]"
const arrayMatch = textContent.match(/\[\s*(\d+)\s*,\s*(\d+)\s*\]/s)
if (arrayMatch) {
const width = parseInt(arrayMatch[1], 10)
const height = parseInt(arrayMatch[2], 10)
if (width > 0 && height > 0) {
this.cachedViewport = { width, height }
this.viewportDebugInfo = `[VIEWPORT OK] Detected: ${width}x${height} (raw response: "${textContent.substring(0, 100)}")`
console.log(this.viewportDebugInfo)
return this.cachedViewport
} else {
this.viewportDebugInfo = `[VIEWPORT PARSE ERROR] Invalid dimensions: ${width}x${height} from "${textContent}". Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
console.warn(this.viewportDebugInfo)
}
} else {
this.viewportDebugInfo = `[VIEWPORT PARSE ERROR] Could not parse response: "${textContent}". Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
console.warn(this.viewportDebugInfo)
}
} catch (error) {
const errMsg = error instanceof Error ? error.message : String(error)
this.viewportDebugInfo = `[VIEWPORT EXCEPTION] ${errMsg}. Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
console.warn(this.viewportDebugInfo)
}
// Fallback to configured screenSize
return this.ctx.screenSize
}
/**
* Get the current viewport debug info
*/
getViewportDebugInfo(): string {
return this.viewportDebugInfo
}
/**
* Clear cached viewport (call when tab/window changes or before new task)
*/
clearViewportCache(): void {
this.cachedViewport = null
}
/**
* Scale normalized coordinate (0-999) to actual viewport pixel value
*/
private async scaleCoordinates(
normalizedX: number,
normalizedY: number,
): Promise<{ x: number; y: number }> {
const viewport = await this.getViewportSize()
return {
x: Math.round((normalizedX / 1000) * viewport.width),
y: Math.round((normalizedY / 1000) * viewport.height),
}
}
/**
* Call an MCP tool
*/
private async callMcp(
name: string,
args: Record<string, unknown> = {},
): Promise<McpToolResult> {
const client = new Client({
name: 'gemini-computer-use',
version: '1.0.0',
})
const transport = new StreamableHTTPClientTransport(
new URL(this.ctx.mcpUrl),
{
requestInit: {
headers: { 'X-BrowserOS-Source': 'gemini-computer-use' },
},
},
)
try {
await client.connect(transport)
const toolCallPromise = client.callTool({ name, arguments: args })
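// Race the tool call against a timeout, clearing the timer in the inner
// finally so a completed call doesn't leave a live 30s timer behind.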
let timeoutId: ReturnType<typeof setTimeout> | null = null
const timeoutPromise = new Promise<never>((_, reject) => {
timeoutId = setTimeout(
() =>
reject(
new Error(`MCP tool call timed out after ${MCP_TIMEOUT_MS}ms`),
),
MCP_TIMEOUT_MS,
)
})
try {
return (await Promise.race([
toolCallPromise,
timeoutPromise,
])) as McpToolResult
} finally {
if (timeoutId) clearTimeout(timeoutId)
}
} finally {
try {
await transport.close()
} catch {
// Ignore close errors
}
}
}
/**
* Execute a Computer Use action by mapping to MCP tools
*/
async execute(
action: ComputerUseAction,
): Promise<{ success: boolean; message: string }> {
const { tabId, windowId } = this.ctx
try {
switch (action.name) {
case 'click_at': {
const viewport = await this.getViewportSize()
const { x, y } = await this.scaleCoordinates(
action.args.x,
action.args.y,
)
await this.callMcp('browser_click_coordinates', {
tabId,
windowId,
x,
y,
})
// Return original coordinates + debug info for troubleshooting
// Debug info shows: model input → viewport coords, viewport size, and any errors
const debugInfo = `[DEBUG: input=(${action.args.x},${action.args.y}) → viewport=(${x},${y}), viewport=${viewport.width}x${viewport.height}] ${this.viewportDebugInfo}`
return {
success: true,
message: `Clicked at (${action.args.x}, ${action.args.y}). ${debugInfo}`,
}
}
case 'type_text_at': {
const viewport = await this.getViewportSize()
const { x, y } = await this.scaleCoordinates(
action.args.x,
action.args.y,
)
const { text, press_enter, clear_before_typing } = action.args
// Clear field first if requested (select all + delete)
if (clear_before_typing) {
await this.callMcp('browser_click_coordinates', {
tabId,
windowId,
x,
y,
})
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: `document.execCommand('selectAll')`,
})
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: 'Delete',
})
}
// Type the text
await this.callMcp('browser_type_at_coordinates', {
tabId,
windowId,
x,
y,
text,
})
// Press Enter if requested
if (press_enter) {
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: 'Enter',
})
}
// Return original coordinates + debug info
const debugInfo = `[DEBUG: input=(${action.args.x},${action.args.y}) → viewport=(${x},${y}), viewport=${viewport.width}x${viewport.height}] ${this.viewportDebugInfo}`
return {
success: true,
message: `Typed "${text.substring(0, 50)}${text.length > 50 ? '...' : ''}" at (${action.args.x}, ${action.args.y}). ${debugInfo}`,
}
}
case 'navigate': {
await this.callMcp('browser_navigate', {
tabId,
windowId,
url: action.args.url,
})
return { success: true, message: `Navigated to ${action.args.url}` }
}
case 'scroll_document': {
const { direction } = action.args
if (direction === 'up') {
await this.callMcp('browser_scroll_up', { tabId, windowId })
} else if (direction === 'down') {
await this.callMcp('browser_scroll_down', { tabId, windowId })
} else {
// Left/right scroll via JavaScript
const scrollCode =
direction === 'left'
? 'window.scrollBy(-window.innerWidth, 0)'
: 'window.scrollBy(window.innerWidth, 0)'
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: scrollCode,
})
}
return { success: true, message: `Scrolled ${direction}` }
}
case 'scroll_at': {
const { x, y } = await this.scaleCoordinates(
action.args.x,
action.args.y,
)
const { direction, magnitude = 500 } = action.args
// Click at position first to focus element
await this.callMcp('browser_click_coordinates', {
tabId,
windowId,
x,
y,
})
// Scale magnitude from 0-999 to actual pixels
const viewport = await this.getViewportSize()
const scrollAmount = Math.round((magnitude / 1000) * viewport.height)
// Use JavaScript scrollBy for precise control with magnitude
const scrollCode =
direction === 'up'
? `window.scrollBy(0, -${scrollAmount})`
: direction === 'down'
? `window.scrollBy(0, ${scrollAmount})`
: direction === 'left'
? `window.scrollBy(-${scrollAmount}, 0)`
: `window.scrollBy(${scrollAmount}, 0)`
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: scrollCode,
})
// Return original coordinates to avoid confusing the model
return {
success: true,
message: `Scrolled ${direction} at (${action.args.x}, ${action.args.y})`,
}
}
case 'key_combination': {
const { keys } = action.args
// Map common key combinations to JavaScript or available keys
const keyMap: Record<string, () => Promise<void>> = {
'Control+a': async () => {
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: `document.execCommand('selectAll')`,
})
},
'Control+c': async () => {
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: `document.execCommand('copy')`,
})
},
'Control+v': async () => {
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: `document.execCommand('paste')`,
})
},
'Control+z': async () => {
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: `document.execCommand('undo')`,
})
},
Enter: async () => {
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: 'Enter',
})
},
Escape: async () => {
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: 'Escape',
})
},
Tab: async () => {
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: 'Tab',
})
},
Backspace: async () => {
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: 'Backspace',
})
},
Delete: async () => {
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: 'Delete',
})
},
ArrowUp: async () => {
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: 'ArrowUp',
})
},
ArrowDown: async () => {
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: 'ArrowDown',
})
},
ArrowLeft: async () => {
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: 'ArrowLeft',
})
},
ArrowRight: async () => {
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: 'ArrowRight',
})
},
}
// Normalize key string (case insensitive for modifiers)
const normalizedKeys = keys
.replace(/ctrl/i, 'Control')
.replace(/cmd/i, 'Control')
const handler = keyMap[normalizedKeys] || keyMap[keys]
if (handler) {
await handler()
} else {
const keyName = keys.split('+').pop() || ''
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: `
const event = new KeyboardEvent('keydown', {
key: ${JSON.stringify(keyName)},
ctrlKey: ${keys.toLowerCase().includes('control')},
shiftKey: ${keys.toLowerCase().includes('shift')},
altKey: ${keys.toLowerCase().includes('alt')},
metaKey: ${keys.toLowerCase().includes('meta')},
bubbles: true
});
document.activeElement?.dispatchEvent(event);
`,
})
}
return { success: true, message: `Pressed ${keys}` }
}
case 'hover_at': {
const { x, y } = await this.scaleCoordinates(
action.args.x,
action.args.y,
)
// Simulate hover via JavaScript mouseover event
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: `
const elem = document.elementFromPoint(${x}, ${y});
if (elem) {
const event = new MouseEvent('mouseover', { bubbles: true, clientX: ${x}, clientY: ${y} });
elem.dispatchEvent(event);
}
`,
})
// Return original coordinates to avoid confusing the model
return {
success: true,
message: `Hovered at (${action.args.x}, ${action.args.y})`,
}
}
case 'go_back': {
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: 'history.back()',
})
return { success: true, message: 'Navigated back' }
}
case 'go_forward': {
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: 'history.forward()',
})
return { success: true, message: 'Navigated forward' }
}
case 'wait_5_seconds': {
await new Promise((resolve) => setTimeout(resolve, 5000))
return { success: true, message: 'Waited 5 seconds' }
}
case 'drag_and_drop': {
const start = await this.scaleCoordinates(
action.args.x,
action.args.y,
)
const end = await this.scaleCoordinates(
action.args.destination_x,
action.args.destination_y,
)
// Simulate drag and drop via JavaScript
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: `
const startElem = document.elementFromPoint(${start.x}, ${start.y});
const endElem = document.elementFromPoint(${end.x}, ${end.y});
if (startElem && endElem) {
const dragStart = new DragEvent('dragstart', { bubbles: true, clientX: ${start.x}, clientY: ${start.y} });
const drop = new DragEvent('drop', { bubbles: true, clientX: ${end.x}, clientY: ${end.y} });
const dragEnd = new DragEvent('dragend', { bubbles: true });
startElem.dispatchEvent(dragStart);
endElem.dispatchEvent(drop);
startElem.dispatchEvent(dragEnd);
}
`,
})
// Return original coordinates to avoid confusing the model
return {
success: true,
message: `Dragged from (${action.args.x}, ${action.args.y}) to (${action.args.destination_x}, ${action.args.destination_y})`,
}
}
default: {
const _exhaustive: never = action
return {
success: false,
message: `Unknown action: ${JSON.stringify(action)}`,
}
}
}
} catch (error) {
const message = error instanceof Error ? error.message : String(error)
return { success: false, message: `Action failed: ${message}` }
}
}
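// Worked example (editor-added, illustrative): the model emits 0-999
// normalized coordinates, which scaleCoordinates maps onto the live
// viewport. Assuming a 1440x900 viewport:
//   x = Math.round((500 / 1000) * 1440)  // 720
//   y = Math.round((499 / 1000) * 900)   // 449
// matching the input=(...) → viewport=(...) pairs in the DEBUG strings above.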
/**
* Capture a screenshot via MCP with retry logic
*
* Uses Gemini's recommended screenshot size (1440x900) for optimal model performance.
* Viewport detection (see getViewportSize) keeps the coordinate mapping accurate.
*/
async captureScreenshot(retries = 2): Promise<string | null> {
const { width, height } = DEFAULTS.screenshotSize
for (let attempt = 0; attempt <= retries; attempt++) {
try {
const result = await this.callMcp('browser_get_screenshot', {
tabId: this.ctx.tabId,
windowId: this.ctx.windowId,
width,
height,
showHighlights: false,
})
if (result.isError) {
const errorText =
result.content?.find((c) => c.type === 'text')?.text ??
'Unknown error'
if (attempt < retries) {
console.warn(
`Screenshot attempt ${attempt + 1} failed: ${errorText}, retrying...`,
)
await new Promise((r) => setTimeout(r, 500))
continue
}
console.warn('Screenshot capture failed:', errorText)
return null
}
const imageContent = result.content.find((c) => c.type === 'image')
if (imageContent?.data) {
return imageContent.data
}
if (attempt < retries) {
console.warn(
`Screenshot attempt ${attempt + 1}: No image data, retrying...`,
)
await new Promise((r) => setTimeout(r, 500))
continue
}
return null
} catch (error) {
if (attempt < retries) {
console.warn(
`Screenshot attempt ${attempt + 1} error:`,
error,
'retrying...',
)
await new Promise((r) => setTimeout(r, 500))
continue
}
console.warn('Screenshot capture error:', error)
return null
}
}
return null
}
/**
* Get current page URL via MCP
*/
async getCurrentUrl(): Promise<string> {
try {
const result = await this.callMcp('browser_execute_javascript', {
tabId: this.ctx.tabId,
windowId: this.ctx.windowId,
code: 'window.location.href',
})
const textContent =
result.content.find((c) => c.type === 'text')?.text ?? ''
// Extract URL from result text
const urlMatch = textContent.match(/Result:\s*"?([^"\n]+)"?/)
return urlMatch?.[1] ?? 'unknown'
} catch {
return 'unknown'
}
}
}

View File

@@ -1,327 +0,0 @@
/**
* Gemini Computer Use Agent
* Implements the agent loop that calls Gemini API and executes actions
* Uses UIMessageStreamEvent format for logging compatibility
*/
import { randomUUID } from 'node:crypto'
import { ActionMapper } from './action-mapper'
import {
type ComputerUseAction,
DEFAULTS,
type GeminiComputerUseAgentConfig,
type GeminiContent,
type GeminiPart,
type GeminiResponse,
} from './types'
const GEMINI_API_BASE = 'https://generativelanguage.googleapis.com/v1beta'
interface StreamWriter {
write: (data: string) => Promise<void>
}
type ActionHook = (
action: ComputerUseAction,
result: { success: boolean; message: string },
) => Promise<void>
/**
* Emit SSE-formatted UIMessageStreamEvent
*/
function emitEvent(
writer: StreamWriter,
event: Record<string, unknown>,
): Promise<void> {
return writer.write(`data: ${JSON.stringify(event)}\n\n`)
}
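// Wire format produced by emitEvent (illustrative): each event is one SSE
// frame, e.g. a 'start' event is written as
//   data: {"type":"start","messageId":"<uuid>"}\n\n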
export class GeminiComputerUseAgent {
private config: GeminiComputerUseAgentConfig
private actionMapper: ActionMapper
private actionHook?: ActionHook
private contents: GeminiContent[] = []
constructor(config: GeminiComputerUseAgentConfig) {
this.config = config
this.actionMapper = new ActionMapper({
mcpUrl: config.mcpUrl,
tabId: config.tabId,
windowId: config.windowId,
screenSize: config.screenSize,
})
}
/**
* Set a hook to be called after each action execution
*/
setActionHook(hook: ActionHook): void {
this.actionHook = hook
}
/**
* Call the Gemini Computer Use API
*/
private async callGeminiApi(): Promise<GeminiResponse> {
const url = `${GEMINI_API_BASE}/models/${DEFAULTS.model}:generateContent`
const response = await fetch(url, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'x-goog-api-key': this.config.apiKey,
},
body: JSON.stringify({
contents: this.contents,
tools: [
{
computer_use: {
environment: 'ENVIRONMENT_BROWSER',
},
},
],
}),
})
if (!response.ok) {
const errorBody = await response.text()
throw new Error(
`Gemini API error: ${response.status} ${response.statusText} - ${errorBody}`,
)
}
return response.json()
}
/**
* Extract function calls from a Gemini response
*/
private extractFunctionCalls(response: GeminiResponse): ComputerUseAction[] {
const candidate = response.candidates?.[0]
if (!candidate?.content?.parts) {
return []
}
const actions: ComputerUseAction[] = []
for (const part of candidate.content.parts) {
if (part.functionCall) {
const { name, args } = part.functionCall
// Construct action object
actions.push({ name, args: args ?? {} } as ComputerUseAction)
}
}
return actions
}
/**
* Extract text response from Gemini response
*/
private extractTextResponse(response: GeminiResponse): string | null {
const candidate = response.candidates?.[0]
if (!candidate?.content?.parts) {
return null
}
const textParts = candidate.content.parts
.map((p) => p.text)
.filter((text): text is string => text !== undefined)
return textParts.length > 0 ? textParts.join('\n') : null
}
/**
* Build function response parts for the next turn
*/
private buildFunctionResponses(
actions: ComputerUseAction[],
currentUrl: string,
screenshotBase64: string | null,
): GeminiPart[] {
const parts: GeminiPart[] = []
for (const action of actions) {
parts.push({
functionResponse: {
name: action.name,
response: { url: currentUrl },
},
})
}
// Add screenshot as inline data
if (screenshotBase64) {
parts.push({
inlineData: {
mimeType: 'image/png',
data: screenshotBase64,
},
})
}
return parts
}
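// Worked example (illustrative): for two executed actions plus a fresh
// screenshot, buildFunctionResponses returns three parts in order:
//   { functionResponse: { name: 'click_at',     response: { url } } },
//   { functionResponse: { name: 'type_text_at', response: { url } } },
//   { inlineData: { mimeType: 'image/png', data: screenshotBase64 } }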
/**
* Execute the agent loop
*/
async execute(
query: string,
streamWriter: StreamWriter,
signal: AbortSignal,
): Promise<{ finalText: string | null; totalActions: number }> {
let totalActions = 0
let finalText: string | null = null
// Wait for page to stabilize before first screenshot
await new Promise((resolve) => setTimeout(resolve, 2000))
// Capture initial screenshot with retries
let initialScreenshot: string | null = null
for (let attempt = 1; attempt <= 3; attempt++) {
initialScreenshot = await this.actionMapper.captureScreenshot()
if (initialScreenshot) break
console.warn(`Initial screenshot attempt ${attempt} failed, retrying...`)
await new Promise((resolve) => setTimeout(resolve, 1000))
}
if (!initialScreenshot) {
throw new Error('Failed to capture initial screenshot after 3 attempts')
}
// Build initial content
const initialParts: GeminiPart[] = [
{ text: query },
{ inlineData: { mimeType: 'image/png', data: initialScreenshot } },
]
this.contents.push({ role: 'user', parts: initialParts })
const messageId = randomUUID()
await emitEvent(streamWriter, { type: 'start', messageId })
let finished = false
for (let turn = 0; turn < this.config.turnLimit; turn++) {
if (signal.aborted) {
await emitEvent(streamWriter, { type: 'abort' })
break
}
// Start step (turn)
await emitEvent(streamWriter, { type: 'start-step' })
// Call Gemini API
let response: GeminiResponse
try {
response = await this.callGeminiApi()
} catch (error) {
const errorMsg = error instanceof Error ? error.message : String(error)
await emitEvent(streamWriter, {
type: 'error',
errorText: `API error: ${errorMsg}`,
})
throw error
}
// Check for API errors
if (response.error) {
await emitEvent(streamWriter, {
type: 'error',
errorText: response.error.message,
})
throw new Error(`Gemini API error: ${response.error.message}`)
}
// Extract text response
const textResponse = this.extractTextResponse(response)
if (textResponse) {
finalText = textResponse
const textId = randomUUID()
await emitEvent(streamWriter, { type: 'text-start', id: textId })
await emitEvent(streamWriter, {
type: 'text-delta',
id: textId,
delta: textResponse,
})
await emitEvent(streamWriter, { type: 'text-end', id: textId })
}
// Extract function calls
const actions = this.extractFunctionCalls(response)
// If no actions, task is complete
if (actions.length === 0) {
await emitEvent(streamWriter, { type: 'finish-step' })
await emitEvent(streamWriter, {
type: 'finish',
finishReason: 'completed',
})
finished = true
break
}
// Add model response to conversation
const candidate = response.candidates?.[0]
if (candidate?.content) {
this.contents.push(candidate.content)
}
// Execute each action
for (const action of actions) {
if (signal.aborted) break
const toolCallId = randomUUID()
// Tool input events
await emitEvent(streamWriter, {
type: 'tool-input-start',
toolCallId,
toolName: action.name,
})
await emitEvent(streamWriter, {
type: 'tool-input-available',
toolCallId,
toolName: action.name,
input: action.args,
})
const result = await this.actionMapper.execute(action)
totalActions++
// Tool output event
await emitEvent(streamWriter, {
type: 'tool-output-available',
toolCallId,
output: result,
})
// Call action hook (for screenshot capture)
if (this.actionHook) {
await this.actionHook(action, result)
}
}
// Capture new screenshot and URL
const newScreenshot = await this.actionMapper.captureScreenshot()
const currentUrl = await this.actionMapper.getCurrentUrl()
// Build function responses and add to conversation
const functionResponseParts = this.buildFunctionResponses(
actions,
currentUrl,
newScreenshot,
)
this.contents.push({ role: 'user', parts: functionResponseParts })
// Finish step (turn)
await emitEvent(streamWriter, { type: 'finish-step' })
}
if (!finished && !signal.aborted) {
await emitEvent(streamWriter, {
type: 'finish',
finishReason: 'max_turns',
})
}
return { finalText, totalActions }
}
}
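// Minimal usage sketch (editor-added; the MCP URL and query are illustrative):
//
//   const agent = new GeminiComputerUseAgent({
//     apiKey: process.env.GOOGLE_AI_API_KEY ?? '',
//     turnLimit: DEFAULTS.turnLimit,
//     screenSize: DEFAULTS.screenSize,
//     tabId: 0,
//     windowId: 0,
//     mcpUrl: 'http://localhost:9100/mcp',
//   })
//   const writer = { write: async (s: string) => { process.stdout.write(s) } }
//   const { finalText, totalActions } = await agent.execute(
//     'Find the pricing page',
//     writer,
//     new AbortController().signal,
//   )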

View File

@@ -1,97 +0,0 @@
/**
* Gemini Computer Use Evaluator
* Implements AgentEvaluator interface for the eval framework
*/
import { DEFAULT_TIMEOUT_MS } from '../../constants'
import type { GeminiComputerUseConfig, TaskMetadata } from '../../types'
import { resolveEnvValue } from '../../utils/resolve-env'
import { withEvalTimeout } from '../../utils/with-eval-timeout'
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
import { GeminiComputerUseAgent } from './agent'
import { DEFAULTS } from './types'
export class GeminiComputerUseEvaluator implements AgentEvaluator {
constructor(private ctx: AgentContext) {}
async execute(): Promise<AgentResult> {
const { config, task, capture, windowId = 0, tabId = 0 } = this.ctx
const agentConfig = config.agent as GeminiComputerUseConfig
const startTime = Date.now()
const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS
await capture.messageLogger.logUser(task.query)
const apiKey = resolveEnvValue(agentConfig.apiKey)
if (!apiKey) {
throw new Error(
`API key not found. Set ${agentConfig.apiKey} environment variable or provide the key directly.`,
)
}
const agent = new GeminiComputerUseAgent({
apiKey,
turnLimit: agentConfig.turnLimit ?? DEFAULTS.turnLimit,
screenSize: agentConfig.screenSize ?? DEFAULTS.screenSize,
tabId,
windowId,
mcpUrl: `${config.browseros.server_url}/mcp`,
})
agent.setActionHook(async (_action, _result) => {
try {
await capture.screenshot.capture(capture.getActivePageId())
} catch (err) {
console.warn('Screenshot capture failed in hook:', err)
}
})
const streamWriter = capture.createStreamWriter()
let finalText: string | null = null
let totalActions = 0
const { terminationReason } = await withEvalTimeout(
timeoutMs,
capture,
async (signal) => {
const result = await agent.execute(task.query, streamWriter, signal)
finalText = result.finalText
totalActions = result.totalActions
return result
},
)
const endTime = Date.now()
const metadata: TaskMetadata = {
query_id: task.query_id,
dataset: task.dataset,
query: task.query,
started_at: new Date(startTime).toISOString(),
completed_at: new Date(endTime).toISOString(),
total_duration_ms: endTime - startTime,
total_steps: totalActions,
termination_reason: terminationReason,
final_answer: finalText ?? capture.getLastAssistantText(),
errors: capture.getErrors(),
warnings: capture.getWarnings(),
agent_config: {
type: 'gemini-computer-use',
model: DEFAULTS.model,
turnLimit: agentConfig.turnLimit ?? DEFAULTS.turnLimit,
screenSize: agentConfig.screenSize ?? DEFAULTS.screenSize,
},
grader_results: {},
}
await capture.trajectorySaver.saveMetadata(metadata)
return {
metadata,
messages: capture.getMessages(),
finalAnswer: finalText ?? capture.getLastAssistantText(),
}
}
}
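// Sketch of the agent block this evaluator expects in an eval config
// (editor-added; values are illustrative, field names follow the code above):
//
//   "agent": {
//     "type": "gemini-computer-use",
//     "apiKey": "GOOGLE_AI_API_KEY",  // env var name, resolved via resolveEnvValue
//     "turnLimit": 30,
//     "screenSize": { "width": 1440, "height": 900 }
//   }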

View File

@@ -1,156 +0,0 @@
/**
* Types for Gemini Computer Use agent
*/
import { z } from 'zod'
// Gemini Computer Use predefined actions (from API docs)
export const ComputerUseActionSchema = z.discriminatedUnion('name', [
z.object({
name: z.literal('click_at'),
args: z.object({
x: z.number().min(0).max(999),
y: z.number().min(0).max(999),
}),
}),
z.object({
name: z.literal('type_text_at'),
args: z.object({
x: z.number().min(0).max(999),
y: z.number().min(0).max(999),
text: z.string(),
press_enter: z.boolean().optional(),
clear_before_typing: z.boolean().optional(),
}),
}),
z.object({
name: z.literal('navigate'),
args: z.object({
url: z.string(),
}),
}),
z.object({
name: z.literal('scroll_document'),
args: z.object({
direction: z.enum(['up', 'down', 'left', 'right']),
}),
}),
z.object({
name: z.literal('scroll_at'),
args: z.object({
x: z.number().min(0).max(999),
y: z.number().min(0).max(999),
direction: z.enum(['up', 'down', 'left', 'right']),
magnitude: z.number().optional(),
}),
}),
z.object({
name: z.literal('key_combination'),
args: z.object({
keys: z.string(),
}),
}),
z.object({
name: z.literal('hover_at'),
args: z.object({
x: z.number().min(0).max(999),
y: z.number().min(0).max(999),
}),
}),
z.object({
name: z.literal('go_back'),
args: z.object({}).optional(),
}),
z.object({
name: z.literal('go_forward'),
args: z.object({}).optional(),
}),
z.object({
name: z.literal('wait_5_seconds'),
args: z.object({}).optional(),
}),
z.object({
name: z.literal('drag_and_drop'),
args: z.object({
x: z.number().min(0).max(999),
y: z.number().min(0).max(999),
destination_x: z.number().min(0).max(999),
destination_y: z.number().min(0).max(999),
}),
}),
])
export type ComputerUseAction = z.infer<typeof ComputerUseActionSchema>
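// Illustrative sketch (editor-added): raw Gemini function calls can be
// validated before execution; the agent above constructs actions directly.
const exampleAction = ComputerUseActionSchema.safeParse({
  name: 'click_at',
  args: { x: 500, y: 120 },
})
// exampleAction.success === true; on success, .data narrows to the
// 'click_at' member of the discriminated union.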
// Screen size configuration
export interface ScreenSize {
width: number
height: number
}
// Context for action execution
export interface ActionContext {
mcpUrl: string
tabId: number
windowId: number
screenSize: ScreenSize
}
// Gemini API types
export interface GeminiContent {
role: 'user' | 'model'
parts: GeminiPart[]
}
export interface GeminiPart {
text?: string
inlineData?: {
mimeType: string
data: string
}
functionCall?: {
name: string
args?: Record<string, unknown>
}
functionResponse?: {
name: string
response: Record<string, unknown>
}
}
export interface GeminiResponse {
candidates?: Array<{
content: GeminiContent
finishReason?: string
}>
error?: {
message: string
code: number
}
}
// Safety decision from Computer Use
export interface SafetyDecision {
decision: 'allow' | 'require_confirmation' | 'block'
explanation?: string
}
// Agent configuration
export interface GeminiComputerUseAgentConfig {
apiKey: string
turnLimit: number
screenSize: ScreenSize
tabId: number
windowId: number
mcpUrl: string
}
// Defaults
export const DEFAULTS = {
// Gemini's recommended screenshot size for optimal model accuracy
screenshotSize: { width: 1440, height: 900 },
// Fallback viewport size (used when actual viewport can't be determined)
screenSize: { width: 1440, height: 900 },
turnLimit: 30,
model: 'gemini-2.5-computer-use-preview-10-2025',
} as const

View File

@@ -1,26 +1,14 @@
import { GeminiComputerUseEvaluator } from './gemini-computer-use'
import { OrchestratorExecutorEvaluator } from './orchestrator-executor'
import { registerAgent } from './registry'
import { SingleAgentEvaluator } from './single-agent'
import { YutoriNavigatorEvaluator } from './yutori-navigator'
import type { AgentContext, AgentEvaluator } from './types'
// Register built-in agent types
registerAgent('single', (ctx) => new SingleAgentEvaluator(ctx))
registerAgent(
'orchestrator-executor',
(ctx) => new OrchestratorExecutorEvaluator(ctx),
)
registerAgent(
'gemini-computer-use',
(ctx) => new GeminiComputerUseEvaluator(ctx),
)
registerAgent('yutori-navigator', (ctx) => new YutoriNavigatorEvaluator(ctx))
export function createAgent(context: AgentContext): AgentEvaluator {
switch (context.config.agent.type) {
case 'single':
return new SingleAgentEvaluator(context)
case 'orchestrator-executor':
return new OrchestratorExecutorEvaluator(context)
}
}
// Re-exports
export {
createAgent,
getRegisteredAgentTypes,
isAgentTypeRegistered,
registerAgent,
} from './registry'
export type { AgentContext, AgentEvaluator, AgentResult } from './types'

View File

@@ -14,7 +14,6 @@ import { CdpBackend } from '@browseros/server/browser/backends/cdp'
import { CaptchaWaiter } from '../../capture/captcha-waiter'
import { DEFAULT_TIMEOUT_MS } from '../../constants'
import type {
EvalConfig,
OrchestratorExecutorConfig,
TaskMetadata,
UIMessageStreamEvent,
@@ -30,15 +29,6 @@ import { Executor, type ExecutorCallbacks } from './executor'
import { OrchestratorAgent } from './orchestrator-agent'
import type { ExecutorFactory, ExecutorResult } from './types'
function extractCdpPort(config: EvalConfig): number {
const serverUrl = config.browseros.server_url
const match = serverUrl.match(/:(\d+)$/)
if (!match) return config.browseros.base_cdp_port
const serverPort = Number.parseInt(match[1], 10)
const workerOffset = serverPort - config.browseros.base_server_port
return config.browseros.base_cdp_port + workerOffset
}
interface ResolvedConfigs {
orchestratorConfig: ResolvedAgentConfig & { maxTurns?: number }
executorConfig: ResolvedAgentConfig
@@ -124,7 +114,7 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
constructor(private ctx: AgentContext) {}
async execute(): Promise<AgentResult> {
const { config, task, capture } = this.ctx
const { config, task, capture, workerIndex } = this.ctx
const startTime = Date.now()
const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS
@@ -140,8 +130,8 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
const { orchestratorConfig, executorConfig, isCladoAction } =
await resolveAgentConfig(agentConfig)
// Connect to Chrome via CDP
const cdpPort = extractCdpPort(config)
// Connect to Chrome via CDP — same per-worker offset used by app-manager.
const cdpPort = config.browseros.base_cdp_port + workerIndex
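// e.g. with base_cdp_port 9222 (illustrative), worker 0 attaches to 9222 and worker 2 to 9224.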
const cdp = new CdpBackend({ port: cdpPort })
await cdp.connect()
const browser = new Browser(cdp)

View File

@@ -1,51 +0,0 @@
import type { AgentContext, AgentEvaluator } from './types'
/**
* Factory function signature for creating agents
*/
type AgentFactory = (context: AgentContext) => AgentEvaluator
/**
* Registry of agent factories by type
*/
const registry = new Map<string, AgentFactory>()
/**
* Register an agent type with its factory function
* @throws If type is already registered
*/
export function registerAgent(type: string, factory: AgentFactory): void {
if (registry.has(type)) {
throw new Error(`Agent type "${type}" is already registered`)
}
registry.set(type, factory)
}
/**
* Create an agent evaluator from context
* @throws If agent type is not registered
*/
export function createAgent(context: AgentContext): AgentEvaluator {
const factory = registry.get(context.config.agent.type)
if (!factory) {
const available = Array.from(registry.keys()).join(', ')
throw new Error(
`Unknown agent type: "${context.config.agent.type}". Available types: ${available || 'none'}`,
)
}
return factory(context)
}
/**
* Get list of all registered agent types
*/
export function getRegisteredAgentTypes(): string[] {
return Array.from(registry.keys())
}
/**
* Check if an agent type is registered
*/
export function isAgentTypeRegistered(type: string): boolean {
return registry.has(type)
}

View File

@@ -9,25 +9,16 @@ import { CdpBackend } from '@browseros/server/browser/backends/cdp'
import { registry } from '@browseros/server/tools/registry'
import { CaptchaWaiter } from '../capture/captcha-waiter'
import { DEFAULT_TIMEOUT_MS } from '../constants'
import type { EvalConfig, TaskMetadata } from '../types'
import type { TaskMetadata } from '../types'
import { resolveProviderConfig } from '../utils/resolve-provider-config'
import { withEvalTimeout } from '../utils/with-eval-timeout'
import type { AgentContext, AgentEvaluator, AgentResult } from './types'
function extractCdpPort(config: EvalConfig): number {
const serverUrl = config.browseros.server_url
const match = serverUrl.match(/:(\d+)$/)
if (!match) return config.browseros.base_cdp_port
const serverPort = Number.parseInt(match[1], 10)
const workerOffset = serverPort - config.browseros.base_server_port
return config.browseros.base_cdp_port + workerOffset
}
export class SingleAgentEvaluator implements AgentEvaluator {
constructor(private ctx: AgentContext) {}
async execute(): Promise<AgentResult> {
const { config, task, capture } = this.ctx
const { config, task, capture, workerIndex } = this.ctx
const startTime = Date.now()
const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS
@@ -50,8 +41,8 @@ export class SingleAgentEvaluator implements AgentEvaluator {
supportsImages,
}
// Connect to Chrome via CDP
const cdpPort = extractCdpPort(config)
// Connect to Chrome via CDP — same per-worker offset used by app-manager.
const cdpPort = config.browseros.base_cdp_port + workerIndex
const cdp = new CdpBackend({ port: cdpPort })
await cdp.connect()

View File

@@ -1,26 +1,17 @@
import type { CaptureContext } from '../capture/context'
import type { EvalConfig, Message, Task, TaskMetadata } from '../types'
/**
* All dependencies an agent evaluator needs - passed via factory
*/
export interface AgentContext {
// Configuration
config: EvalConfig
task: Task
workerIndex: number
// Page resolved once at task start (fresh browser has exactly one page)
// Resolved once at task start (fresh browser has exactly one page).
initialPageId: number
// Browser window info for agents that operate on explicit window/tab ids
windowId?: number
tabId?: number
// Output paths
outputDir: string // Root output directory
taskOutputDir: string // Task-specific: outputDir/query_id/
// Capture infrastructure (pre-initialized by runner)
capture: CaptureContext
}

View File

@@ -1,677 +0,0 @@
/**
* Maps Yutori n1 actions to MCP tool calls
*
* Coordinate System:
* - n1 outputs normalized coordinates in 1000x1000 grid
* - Screenshots captured at 1280x800 (Yutori's recommended WXGA 16:10 size)
* - We scale normalized coords to actual viewport pixels
*
* Action Mapping (prioritize MCP tools over execute_javascript):
* - click → browser_click_coordinates ✅
* - type → browser_type_at_coordinates (uses last clicked coords) ✅
* - scroll up/down → browser_scroll_up/down ✅
* - scroll left/right → browser_execute_javascript (no horizontal scroll tool)
* - key_press → browser_send_keys (for supported keys) ✅
* - hover → browser_execute_javascript (no dedicated MCP tool)
* - drag → browser_execute_javascript (no dedicated MCP tool)
* - wait → setTimeout
* - refresh → browser_execute_javascript (no dedicated MCP tool)
* - go_back → browser_execute_javascript (no dedicated MCP tool)
* - goto_url → browser_navigate ✅
* - stop → returns answer (no MCP call)
* - read_texts_and_links → browser_get_page_content ✅
*/
import { Client } from '@modelcontextprotocol/sdk/client/index.js'
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js'
import sharp from 'sharp'
import type { ActionContext, N1Action, ScreenSize } from './types'
import { DEFAULTS } from './types'
/**
* Convert PNG base64 to WebP base64 for smaller payload size.
* Yutori n1 recommends WebP format for better compression.
*/
async function convertToWebP(pngBase64: string): Promise<string> {
const pngBuffer = Buffer.from(pngBase64, 'base64')
const webpBuffer = await sharp(pngBuffer)
.webp({ quality: 80 }) // Good balance of quality and size
.toBuffer()
return webpBuffer.toString('base64')
}
interface McpToolResult {
content: Array<{
type: string
text?: string
data?: string
mimeType?: string
}>
isError?: boolean
}
const MCP_TIMEOUT_MS = 30000
// Scroll amount per unit (n1 recommends treating each amount as 10-15% of screen)
const SCROLL_PERCENT_PER_UNIT = 0.12 // 12% of viewport per scroll unit
export class ActionMapper {
private ctx: ActionContext
private cachedViewport: ScreenSize | null = null
// Track last clicked coordinates for type action (n1 type has no coords)
private lastClickCoordinates: { x: number; y: number } | null = null
constructor(ctx: ActionContext) {
this.ctx = ctx
}
// Store debug info about viewport detection for inclusion in responses
private viewportDebugInfo: string = ''
/**
* Get the actual browser viewport size via JavaScript
* This is critical for correct coordinate mapping:
* - Screenshot is captured at a fixed 1280x800 size (16:10)
* - Clicks must be at actual viewport coordinates
* - We scale: (normalized/1000) * viewport
* Caches the result to avoid repeated calls
* Also stores debug info for troubleshooting
*/
async getViewportSize(): Promise<ScreenSize> {
if (this.cachedViewport) {
return this.cachedViewport
}
try {
const result = await this.callMcp('browser_execute_javascript', {
tabId: this.ctx.tabId,
windowId: this.ctx.windowId,
code: '[window.innerWidth, window.innerHeight]',
})
const textContent =
result.content.find((c) => c.type === 'text')?.text ?? ''
// Check for error in result
if (result.isError) {
this.viewportDebugInfo = `[VIEWPORT ERROR] JS execution failed: ${textContent}. Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
console.warn(this.viewportDebugInfo)
return this.ctx.screenSize
}
// Parse array format - can be multiline: [1440, 900] or "Result: [\n 1200,\n 712\n]"
const arrayMatch = textContent.match(/\[\s*(\d+)\s*,\s*(\d+)\s*\]/s)
if (arrayMatch) {
const width = parseInt(arrayMatch[1], 10)
const height = parseInt(arrayMatch[2], 10)
if (width > 0 && height > 0) {
this.cachedViewport = { width, height }
this.viewportDebugInfo = `[VIEWPORT OK] Detected: ${width}x${height} (raw: "${textContent.substring(0, 100)}")`
console.log(this.viewportDebugInfo)
return this.cachedViewport
} else {
this.viewportDebugInfo = `[VIEWPORT PARSE ERROR] Invalid dimensions: ${width}x${height} from "${textContent}". Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
console.warn(this.viewportDebugInfo)
}
} else {
this.viewportDebugInfo = `[VIEWPORT PARSE ERROR] Could not parse: "${textContent}". Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
console.warn(this.viewportDebugInfo)
}
} catch (error) {
const errMsg = error instanceof Error ? error.message : String(error)
this.viewportDebugInfo = `[VIEWPORT EXCEPTION] ${errMsg}. Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
console.warn(this.viewportDebugInfo)
}
// Fallback to config screenSize
return this.ctx.screenSize
}
/**
* Clear cached viewport (call when tab/window changes or before new task)
*/
clearViewportCache(): void {
this.cachedViewport = null
}
/**
* Reset all tracked state (call before starting a new task)
*/
reset(): void {
this.cachedViewport = null
this.lastClickCoordinates = null
}
/**
* Scale normalized coordinate (0-1000) to actual viewport pixel value
*
* How it works:
* - Screenshot is captured at a fixed 1280x800 size (16:10)
* - n1 predicts normalized coords (0-1000) for that screenshot
* - Since aspect ratio is preserved, we can scale directly to viewport
* - Formula: actualX = (normalizedX / 1000) * viewport.width
*/
private async scaleCoordinates(
normalizedX: number,
normalizedY: number,
): Promise<{ x: number; y: number }> {
const viewport = await this.getViewportSize()
return {
x: Math.round((normalizedX / DEFAULTS.normalizedMax) * viewport.width),
y: Math.round((normalizedY / DEFAULTS.normalizedMax) * viewport.height),
}
}
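// Worked example (illustrative): with a 1280x800 viewport, n1 coordinates
// (500, 250) map to:
//   x = Math.round((500 / 1000) * 1280)  // 640
//   y = Math.round((250 / 1000) * 800)   // 200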
/**
* Call an MCP tool
*/
private async callMcp(
name: string,
args: Record<string, unknown> = {},
): Promise<McpToolResult> {
const client = new Client({
name: 'yutori-navigator',
version: '1.0.0',
})
const transport = new StreamableHTTPClientTransport(
new URL(this.ctx.mcpUrl),
{
requestInit: {
headers: { 'X-BrowserOS-Source': 'yutori-navigator' },
},
},
)
try {
await client.connect(transport)
const toolCallPromise = client.callTool({ name, arguments: args })
let timeoutId: ReturnType<typeof setTimeout> | null = null
const timeoutPromise = new Promise<never>((_, reject) => {
timeoutId = setTimeout(
() =>
reject(
new Error(`MCP tool call timed out after ${MCP_TIMEOUT_MS}ms`),
),
MCP_TIMEOUT_MS,
)
})
try {
return (await Promise.race([
toolCallPromise,
timeoutPromise,
])) as McpToolResult
} finally {
if (timeoutId) clearTimeout(timeoutId)
}
} finally {
try {
await transport.close()
} catch {
// Ignore close errors
}
}
}
/**
* Execute an n1 action by mapping to MCP tools
* Prioritizes native MCP tools over browser_execute_javascript for reliability
* Returns the result message and optionally the stop answer
*/
async execute(
action: N1Action,
): Promise<{ success: boolean; message: string; stopAnswer?: string }> {
const { tabId, windowId } = this.ctx
try {
switch (action.action_type) {
case 'click': {
const [normX, normY] = action.center_coordinates
const viewport = await this.getViewportSize()
const { x, y } = await this.scaleCoordinates(normX, normY)
// Track coordinates for subsequent type action (n1 type has no coords)
this.lastClickCoordinates = { x, y }
await this.callMcp('browser_click_coordinates', {
tabId,
windowId,
x,
y,
})
// Return original coordinates + debug info
const debugInfo = `[DEBUG: input=(${normX},${normY}) → viewport=(${x},${y}), viewport=${viewport.width}x${viewport.height}] ${this.viewportDebugInfo}`
return {
success: true,
message: `Clicked at (${normX}, ${normY}). ${debugInfo}`,
}
}
case 'type': {
const { text, press_enter_after, clear_before_typing } = action
// n1 type action has no coordinates - it expects element to be focused
// Use last clicked coordinates with browser_type_at_coordinates
if (!this.lastClickCoordinates) {
// Fallback: target the center of the screen if no prior click
const viewport = await this.getViewportSize()
this.lastClickCoordinates = {
x: Math.round(viewport.width / 2),
y: Math.round(viewport.height / 2),
}
}
const { x, y } = this.lastClickCoordinates
// Clear field first if requested using native MCP tools
if (clear_before_typing) {
// Click to focus the field before clearing
await this.callMcp('browser_click_coordinates', {
tabId,
windowId,
x,
y,
})
// Use Delete key to clear
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: 'Delete',
})
}
// Use browser_type_at_coordinates - the proper MCP tool for typing
await this.callMcp('browser_type_at_coordinates', {
tabId,
windowId,
x,
y,
text,
})
// Press Enter if requested using native MCP tool
if (press_enter_after) {
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: 'Enter',
})
}
// n1 type action has no coordinates - don't include viewport coords in response
return {
success: true,
message: `Typed "${text.substring(0, 50)}${text.length > 50 ? '...' : ''}"`,
}
}
case 'scroll': {
const { direction, center_coordinates, amount } = action
const [normX, normY] = center_coordinates
const { x, y } = await this.scaleCoordinates(normX, normY)
// Track coordinates
this.lastClickCoordinates = { x, y }
// Click at position first to focus element (for scrollable containers)
await this.callMcp('browser_click_coordinates', {
tabId,
windowId,
x,
y,
})
// For vertical scroll (up/down): use native MCP scroll tools
// For horizontal scroll (left/right): use JS (no MCP tool available)
if (direction === 'up' || direction === 'down') {
const scrollTool =
direction === 'up' ? 'browser_scroll_up' : 'browser_scroll_down'
// Calculate how many scroll calls based on amount:
// each n1 amount unit is ~10-15% of the viewport, while one call to our
// scroll tool moves a full viewport, so roughly one call per 5 units
const scrollCount = Math.max(1, Math.round(amount / 5))
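// e.g. amount 2 → 1 call, amount 8 → 2 calls, amount 10 → 2 calls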
for (let i = 0; i < scrollCount; i++) {
await this.callMcp(scrollTool, { tabId, windowId })
// Small delay between scrolls for stability
if (i < scrollCount - 1) {
await new Promise((r) => setTimeout(r, 100))
}
}
// Return original normalized coordinates
return {
success: true,
message: `Scrolled ${direction} at (${normX}, ${normY})`,
}
} else {
// Horizontal scroll - no MCP tool, use JS
const viewport = await this.getViewportSize()
const scrollPixels = Math.round(
amount * SCROLL_PERCENT_PER_UNIT * viewport.width,
)
const scrollCode =
direction === 'left'
? `window.scrollBy(-${scrollPixels}, 0)`
: `window.scrollBy(${scrollPixels}, 0)`
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: scrollCode,
})
// Return original normalized coordinates
return {
success: true,
message: `Scrolled ${direction} at (${normX}, ${normY})`,
}
}
}
case 'key_press': {
const { key_comb } = action
// Map keys to browser_send_keys supported keys
// browser_send_keys supports: Enter, Delete, Backspace, Tab, Escape,
// ArrowUp, ArrowDown, ArrowLeft, ArrowRight, Home, End, PageUp, PageDown
const keyMap: Record<string, string> = {
Enter: 'Enter',
Escape: 'Escape',
Tab: 'Tab',
Backspace: 'Backspace',
Delete: 'Delete',
ArrowUp: 'ArrowUp',
ArrowDown: 'ArrowDown',
ArrowLeft: 'ArrowLeft',
ArrowRight: 'ArrowRight',
Home: 'Home',
End: 'End',
PageUp: 'PageUp',
PageDown: 'PageDown',
// Alternative names n1 might use
Return: 'Enter',
Esc: 'Escape',
Up: 'ArrowUp',
Down: 'ArrowDown',
Left: 'ArrowLeft',
Right: 'ArrowRight',
}
const mappedKey = keyMap[key_comb]
if (mappedKey) {
// Use native MCP tool
await this.callMcp('browser_send_keys', {
tabId,
windowId,
key: mappedKey,
})
} else {
// For complex key combinations (Ctrl+A, etc.), use JavaScript
const parts = key_comb.split('+')
const mainKey = parts.pop() || ''
const modifiers = parts.map((p) => p.toLowerCase())
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: `
const event = new KeyboardEvent('keydown', {
key: '${mainKey}',
code: 'Key${mainKey.toUpperCase()}',
ctrlKey: ${modifiers.includes('control') || modifiers.includes('ctrl')},
shiftKey: ${modifiers.includes('shift')},
altKey: ${modifiers.includes('alt')},
metaKey: ${modifiers.includes('meta') || modifiers.includes('cmd')},
bubbles: true
});
document.activeElement?.dispatchEvent(event);
`,
})
}
return { success: true, message: `Pressed ${key_comb}` }
}
case 'hover': {
// No dedicated MCP hover tool - use JS
const [normX, normY] = action.center_coordinates
const { x, y } = await this.scaleCoordinates(normX, normY)
// Track coordinates
this.lastClickCoordinates = { x, y }
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: `
const elem = document.elementFromPoint(${x}, ${y});
if (elem) {
const event = new MouseEvent('mouseover', {
bubbles: true,
clientX: ${x},
clientY: ${y}
});
elem.dispatchEvent(event);
}
`,
})
// Return original normalized coordinates
return { success: true, message: `Hovered at (${normX}, ${normY})` }
}
case 'drag': {
// No dedicated MCP drag tool - use JS
const [startNormX, startNormY] = action.start_coordinates
const [endNormX, endNormY] = action.center_coordinates
const start = await this.scaleCoordinates(startNormX, startNormY)
const end = await this.scaleCoordinates(endNormX, endNormY)
// Track end coordinates
this.lastClickCoordinates = end
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: `
const startElem = document.elementFromPoint(${start.x}, ${start.y});
const endElem = document.elementFromPoint(${end.x}, ${end.y});
if (startElem && endElem) {
const dragStart = new DragEvent('dragstart', {
bubbles: true,
clientX: ${start.x},
clientY: ${start.y}
});
const drop = new DragEvent('drop', {
bubbles: true,
clientX: ${end.x},
clientY: ${end.y}
});
const dragEnd = new DragEvent('dragend', { bubbles: true });
startElem.dispatchEvent(dragStart);
endElem.dispatchEvent(drop);
startElem.dispatchEvent(dragEnd);
}
`,
})
// Return original normalized coordinates
return {
success: true,
message: `Dragged from (${startNormX}, ${startNormY}) to (${endNormX}, ${endNormY})`,
}
}
case 'wait': {
// n1 uses this for page loads
await new Promise((resolve) => setTimeout(resolve, 2000))
return { success: true, message: 'Waited 2 seconds' }
}
case 'refresh': {
// No dedicated MCP refresh tool - use JS
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: 'location.reload()',
})
// Wait for page to start reloading
await new Promise((resolve) => setTimeout(resolve, 1000))
return { success: true, message: 'Refreshed page' }
}
case 'go_back': {
// No dedicated MCP go_back tool - use JS
await this.callMcp('browser_execute_javascript', {
tabId,
windowId,
code: 'history.back()',
})
return { success: true, message: 'Navigated back' }
}
case 'goto_url': {
// Use native MCP navigate tool
await this.callMcp('browser_navigate', {
tabId,
windowId,
url: action.url,
})
return { success: true, message: `Navigated to ${action.url}` }
}
case 'read_texts_and_links': {
// Use native MCP tool
const result = await this.callMcp('browser_get_page_content', {
tabId,
windowId,
type: 'text-with-links',
})
const content =
result.content.find((c) => c.type === 'text')?.text ?? ''
return {
success: true,
message: `Read page content (${content.length} chars)`,
}
}
case 'stop': {
// Stop action - task is complete, return the answer
return {
success: true,
message: 'Task completed',
stopAnswer: action.answer,
}
}
default: {
const _exhaustive: never = action
return {
success: false,
message: `Unknown action: ${JSON.stringify(action)}`,
}
}
}
} catch (error) {
const message = error instanceof Error ? error.message : String(error)
return { success: false, message: `Action failed: ${message}` }
}
}
/**
* Capture a screenshot via MCP with retry logic
*
* Uses Yutori's recommended screenshot size (1280x800) for optimal model performance.
* Viewport detection (see getViewportSize) keeps the coordinate mapping accurate.
*
* Returns WebP base64 string
*/
async captureScreenshot(retries = 2): Promise<string | null> {
const { width, height } = DEFAULTS.screenshotSize
for (let attempt = 0; attempt <= retries; attempt++) {
try {
const result = await this.callMcp('browser_get_screenshot', {
tabId: this.ctx.tabId,
windowId: this.ctx.windowId,
width,
height,
showHighlights: false,
})
if (result.isError) {
const errorText =
result.content?.find((c) => c.type === 'text')?.text ??
'Unknown error'
if (attempt < retries) {
console.warn(
`Screenshot attempt ${attempt + 1} failed: ${errorText}, retrying...`,
)
await new Promise((r) => setTimeout(r, 500))
continue
}
console.warn('Screenshot capture failed:', errorText)
return null
}
const imageContent = result.content.find((c) => c.type === 'image')
if (imageContent?.data) {
// Convert PNG to WebP for smaller payload (n1 recommends WebP)
try {
const webpBase64 = await convertToWebP(imageContent.data)
return webpBase64
} catch (conversionError) {
console.warn('WebP conversion failed, using PNG:', conversionError)
return imageContent.data
}
}
if (attempt < retries) {
console.warn(
`Screenshot attempt ${attempt + 1}: No image data, retrying...`,
)
await new Promise((r) => setTimeout(r, 500))
continue
}
return null
} catch (error) {
if (attempt < retries) {
console.warn(
`Screenshot attempt ${attempt + 1} error:`,
error,
'retrying...',
)
await new Promise((r) => setTimeout(r, 500))
continue
}
console.warn('Screenshot capture error:', error)
return null
}
}
return null
}
/**
* Get current page URL via MCP
*/
async getCurrentUrl(): Promise<string> {
try {
const result = await this.callMcp('browser_execute_javascript', {
tabId: this.ctx.tabId,
windowId: this.ctx.windowId,
code: 'window.location.href',
})
const textContent =
result.content.find((c) => c.type === 'text')?.text ?? ''
const urlMatch = textContent.match(/Result:\s*"?([^"\n]+)"?/)
return urlMatch?.[1] ?? 'unknown'
} catch {
return 'unknown'
}
}
}

View File

@@ -1,353 +0,0 @@
/**
* Yutori Navigator n1 Agent
*
* Implements the agent loop that calls Yutori n1 API and executes actions.
* Uses UIMessageStreamEvent format for logging compatibility.
*
* The n1 API follows the OpenAI Chat Completions interface with a special 'observation' role
* for screenshots. Full conversation history must be maintained.
*/
import { randomUUID } from 'node:crypto'
import { ActionMapper } from './action-mapper'
import {
DEFAULTS,
type N1Action,
type N1ChatCompletionResponse,
type N1Message,
N1ResponseSchema,
YUTORI_API_BASE,
type YutoriNavigatorAgentConfig,
} from './types'
interface StreamWriter {
write: (data: string) => Promise<void>
}
type ActionHook = (
action: N1Action,
result: { success: boolean; message: string },
) => Promise<void>
/**
* Emit SSE-formatted UIMessageStreamEvent
*/
function emitEvent(
writer: StreamWriter,
event: Record<string, unknown>,
): Promise<void> {
return writer.write(`data: ${JSON.stringify(event)}\n\n`)
}
export class YutoriNavigatorAgent {
private config: YutoriNavigatorAgentConfig
private actionMapper: ActionMapper
private actionHook?: ActionHook
private messages: N1Message[] = []
constructor(config: YutoriNavigatorAgentConfig) {
this.config = config
this.actionMapper = new ActionMapper({
mcpUrl: config.mcpUrl,
tabId: config.tabId,
windowId: config.windowId,
screenSize: config.screenSize,
})
}
/**
* Set a hook to be called after each action execution
*/
setActionHook(hook: ActionHook): void {
this.actionHook = hook
}
/**
* Build observation message with screenshot and optional URL
*/
private buildObservationMessage(
screenshotBase64: string,
currentUrl?: string,
): N1Message {
const content: N1Message['content'] = []
// Include URL if available (recommended by Yutori for better attribution)
if (currentUrl) {
content.push({
type: 'text',
text: `Current URL: ${currentUrl}`,
})
}
// Add screenshot as base64 data URL (WebP for smaller payload)
content.push({
type: 'image_url',
image_url: {
url: `data:image/webp;base64,${screenshotBase64}`,
},
})
return {
role: 'observation',
content,
}
}
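// Shape of a resulting observation message (illustrative):
//   {
//     role: 'observation',
//     content: [
//       { type: 'text', text: 'Current URL: https://example.com' },
//       { type: 'image_url', image_url: { url: 'data:image/webp;base64,...' } },
//     ],
//   }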
/**
* Call the Yutori n1 API
*/
private async callN1Api(): Promise<N1ChatCompletionResponse> {
const url = `${YUTORI_API_BASE}/chat/completions`
const response = await fetch(url, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${this.config.apiKey}`,
},
body: JSON.stringify({
model: DEFAULTS.model,
messages: this.messages,
temperature: DEFAULTS.temperature,
}),
})
if (!response.ok) {
const errorBody = await response.text()
throw new Error(
`Yutori n1 API error: ${response.status} ${response.statusText} - ${errorBody}`,
)
}
return response.json()
}
/**
* Parse n1 response content to extract thoughts and actions
*/
private parseN1Response(
content: string,
): { thoughts: string; actions: N1Action[] } | null {
try {
const parsed = JSON.parse(content)
const validated = N1ResponseSchema.safeParse(parsed)
if (validated.success) {
return validated.data
}
console.warn('n1 response validation failed:', validated.error.message)
// Try to extract what we can
return {
thoughts: parsed.thoughts ?? '',
actions: Array.isArray(parsed.actions) ? parsed.actions : [],
}
} catch (error) {
console.warn('Failed to parse n1 response:', error)
return null
}
}
/**
* Execute the agent loop
*/
async execute(
query: string,
streamWriter: StreamWriter,
signal: AbortSignal,
): Promise<{ finalText: string | null; totalActions: number }> {
let totalActions = 0
let finalText: string | null = null
// Wait for page to stabilize before first screenshot
await new Promise((resolve) => setTimeout(resolve, 2000))
// Capture initial screenshot with retries
let initialScreenshot: string | null = null
for (let attempt = 1; attempt <= 3; attempt++) {
initialScreenshot = await this.actionMapper.captureScreenshot()
if (initialScreenshot) break
console.warn(`Initial screenshot attempt ${attempt} failed, retrying...`)
await new Promise((resolve) => setTimeout(resolve, 1000))
}
if (!initialScreenshot) {
throw new Error('Failed to capture initial screenshot after 3 attempts')
}
// Get initial URL
const initialUrl = await this.actionMapper.getCurrentUrl()
// Build initial messages
// 1. User message with task
this.messages.push({
role: 'user',
content: [{ type: 'text', text: query }],
})
// 2. Initial observation with screenshot
this.messages.push(
this.buildObservationMessage(initialScreenshot, initialUrl),
)
// Emit start event
const messageId = randomUUID()
await emitEvent(streamWriter, { type: 'start', messageId })
let finished = false
for (let turn = 0; turn < this.config.turnLimit; turn++) {
if (signal.aborted) {
await emitEvent(streamWriter, { type: 'abort' })
break
}
// Start step (turn)
await emitEvent(streamWriter, { type: 'start-step' })
// Call n1 API
let response: N1ChatCompletionResponse
try {
response = await this.callN1Api()
} catch (error) {
const errorMsg = error instanceof Error ? error.message : String(error)
await emitEvent(streamWriter, {
type: 'error',
errorText: `API error: ${errorMsg}`,
})
throw error
}
// Extract response content
const choice = response.choices?.[0]
if (!choice?.message?.content) {
await emitEvent(streamWriter, {
type: 'error',
errorText: 'Empty response from n1 API',
})
throw new Error('Empty response from n1 API')
}
const assistantContent = choice.message.content
// Parse the JSON response
const parsed = this.parseN1Response(assistantContent)
if (!parsed) {
await emitEvent(streamWriter, {
type: 'error',
errorText: 'Failed to parse n1 response',
})
throw new Error('Failed to parse n1 response')
}
const { thoughts, actions } = parsed
// Emit thoughts as text
if (thoughts) {
finalText = thoughts
const textId = randomUUID()
await emitEvent(streamWriter, { type: 'text-start', id: textId })
await emitEvent(streamWriter, {
type: 'text-delta',
id: textId,
delta: thoughts,
})
await emitEvent(streamWriter, { type: 'text-end', id: textId })
}
// Check for stop action or no actions
const stopAction = actions.find((a) => a.action_type === 'stop')
if (stopAction && stopAction.action_type === 'stop') {
finalText = stopAction.answer
await emitEvent(streamWriter, { type: 'finish-step' })
await emitEvent(streamWriter, {
type: 'finish',
finishReason: 'completed',
})
finished = true
break
}
if (actions.length === 0) {
await emitEvent(streamWriter, { type: 'finish-step' })
await emitEvent(streamWriter, {
type: 'finish',
finishReason: 'completed',
})
finished = true
break
}
// Add assistant response to conversation history
this.messages.push({
role: 'assistant',
content: assistantContent,
})
// Execute each action
for (const action of actions) {
if (signal.aborted) break
// Skip stop actions (handled above)
if (action.action_type === 'stop') continue
const toolCallId = randomUUID()
// Tool input events
await emitEvent(streamWriter, {
type: 'tool-input-start',
toolCallId,
toolName: action.action_type,
})
await emitEvent(streamWriter, {
type: 'tool-input-available',
toolCallId,
toolName: action.action_type,
input: action,
})
const result = await this.actionMapper.execute(action)
totalActions++
// Check if this was a stop action that returned an answer
if (result.stopAnswer) {
finalText = result.stopAnswer
}
// Tool output event
await emitEvent(streamWriter, {
type: 'tool-output-available',
toolCallId,
output: result,
})
// Call action hook (for screenshot capture)
if (this.actionHook) {
await this.actionHook(action, result)
}
}
// Capture new screenshot and URL for next turn
const newScreenshot = await this.actionMapper.captureScreenshot()
const currentUrl = await this.actionMapper.getCurrentUrl()
// Add observation for next turn (n1 requires full history)
if (newScreenshot) {
this.messages.push(
this.buildObservationMessage(newScreenshot, currentUrl),
)
}
// Finish step (turn)
await emitEvent(streamWriter, { type: 'finish-step' })
}
if (!finished && !signal.aborted) {
await emitEvent(streamWriter, {
type: 'finish',
finishReason: 'max_turns',
})
}
return { finalText, totalActions }
}
}

View File

@@ -1,97 +0,0 @@
/**
* Yutori Navigator Evaluator
* Implements AgentEvaluator interface for the eval framework
*/
import { DEFAULT_TIMEOUT_MS } from '../../constants'
import type { TaskMetadata, YutoriNavigatorConfig } from '../../types'
import { resolveEnvValue } from '../../utils/resolve-env'
import { withEvalTimeout } from '../../utils/with-eval-timeout'
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
import { YutoriNavigatorAgent } from './agent'
import { DEFAULTS } from './types'
export class YutoriNavigatorEvaluator implements AgentEvaluator {
constructor(private ctx: AgentContext) {}
async execute(): Promise<AgentResult> {
const { config, task, capture, windowId = 0, tabId = 0 } = this.ctx
const agentConfig = config.agent as YutoriNavigatorConfig
const startTime = Date.now()
const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS
await capture.messageLogger.logUser(task.query)
const apiKey = resolveEnvValue(agentConfig.apiKey)
if (!apiKey) {
throw new Error(
`API key not found. Set ${agentConfig.apiKey} environment variable or provide the key directly.`,
)
}
const agent = new YutoriNavigatorAgent({
apiKey,
turnLimit: agentConfig.turnLimit ?? DEFAULTS.turnLimit,
screenSize: agentConfig.screenSize ?? DEFAULTS.screenSize,
tabId,
windowId,
mcpUrl: `${config.browseros.server_url}/mcp`,
})
agent.setActionHook(async (_action, _result) => {
try {
await capture.screenshot.capture(capture.getActivePageId())
} catch (err) {
console.warn('Screenshot capture failed in hook:', err)
}
})
const streamWriter = capture.createStreamWriter()
let finalText: string | null = null
let totalActions = 0
const { terminationReason } = await withEvalTimeout(
timeoutMs,
capture,
async (signal) => {
const result = await agent.execute(task.query, streamWriter, signal)
finalText = result.finalText
totalActions = result.totalActions
return result
},
)
const endTime = Date.now()
const metadata: TaskMetadata = {
query_id: task.query_id,
dataset: task.dataset,
query: task.query,
started_at: new Date(startTime).toISOString(),
completed_at: new Date(endTime).toISOString(),
total_duration_ms: endTime - startTime,
total_steps: totalActions,
termination_reason: terminationReason,
final_answer: finalText ?? capture.getLastAssistantText(),
errors: capture.getErrors(),
warnings: capture.getWarnings(),
agent_config: {
type: 'yutori-navigator',
model: DEFAULTS.model,
turnLimit: agentConfig.turnLimit ?? DEFAULTS.turnLimit,
screenSize: agentConfig.screenSize ?? DEFAULTS.screenSize,
},
grader_results: {},
}
await capture.trajectorySaver.saveMetadata(metadata)
return {
metadata,
messages: capture.getMessages(),
finalAnswer: finalText ?? capture.getLastAssistantText(),
}
}
}

View File

@@ -1,158 +0,0 @@
/**
* Types for Yutori Navigator n1 agent
*
* n1 is a pixels-to-actions LLM that follows the OpenAI Chat Completions interface.
* Coordinates are normalized to 1000x1000 grid.
* Recommended screenshot size: 1280x800 (WXGA 16:10)
*/
import { z } from 'zod'
// n1 action schemas based on API documentation
export const N1ActionSchema = z.discriminatedUnion('action_type', [
z.object({
action_type: z.literal('click'),
center_coordinates: z.tuple([z.number(), z.number()]),
}),
z.object({
action_type: z.literal('scroll'),
direction: z.enum(['up', 'down', 'left', 'right']),
center_coordinates: z.tuple([z.number(), z.number()]),
amount: z.number().int().min(1).max(10),
}),
z.object({
action_type: z.literal('type'),
text: z.string(),
press_enter_after: z.boolean().optional(),
clear_before_typing: z.boolean().optional(),
}),
z.object({
action_type: z.literal('key_press'),
key_comb: z.string(), // Playwright keyboard press format
}),
z.object({
action_type: z.literal('hover'),
center_coordinates: z.tuple([z.number(), z.number()]),
}),
z.object({
action_type: z.literal('drag'),
start_coordinates: z.tuple([z.number(), z.number()]),
center_coordinates: z.tuple([z.number(), z.number()]), // destination
}),
z.object({
action_type: z.literal('wait'),
}),
z.object({
action_type: z.literal('refresh'),
}),
z.object({
action_type: z.literal('go_back'),
}),
z.object({
action_type: z.literal('goto_url'),
url: z.string(),
}),
z.object({
action_type: z.literal('read_texts_and_links'),
}),
z.object({
action_type: z.literal('stop'),
answer: z.string(),
}),
])
export type N1Action = z.infer<typeof N1ActionSchema>
// n1 API response format
export const N1ResponseSchema = z.object({
thoughts: z.string(),
actions: z.array(N1ActionSchema),
})
export type N1Response = z.infer<typeof N1ResponseSchema>
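// Illustrative sketch (editor-added): a typical n1 turn validated with
// N1ResponseSchema; the thoughts and query text are made up.
const exampleTurn = N1ResponseSchema.parse({
  thoughts: 'The search box is focused; submit the query.',
  actions: [
    { action_type: 'type', text: 'browser agents', press_enter_after: true },
  ],
})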
// Screen size configuration
export interface ScreenSize {
width: number
height: number
}
// Context for action execution
export interface ActionContext {
mcpUrl: string
tabId: number
windowId: number
screenSize: ScreenSize
}
// OpenAI-compatible message types for n1 API
export type N1MessageRole = 'user' | 'assistant' | 'observation'
export interface N1TextContent {
type: 'text'
text: string
}
export interface N1ImageContent {
type: 'image_url'
image_url: {
url: string // Can be URL or data:image/webp;base64,...
}
}
export type N1ContentPart = N1TextContent | N1ImageContent
export interface N1Message {
role: N1MessageRole
content: string | N1ContentPart[]
}
export interface N1ChatCompletionRequest {
model: string
messages: N1Message[]
temperature?: number
}
export interface N1ChatCompletionResponse {
id: string
object: string
created: number
model: string
choices: Array<{
index: number
message: {
role: 'assistant'
content: string // JSON string containing N1Response
}
finish_reason: string
}>
usage?: {
prompt_tokens: number
completion_tokens: number
total_tokens: number
}
}
// Agent configuration
export interface YutoriNavigatorAgentConfig {
apiKey: string
turnLimit: number
screenSize: ScreenSize
tabId: number
windowId: number
mcpUrl: string
}
// Defaults based on Yutori documentation
export const DEFAULTS = {
// WXGA 16:10 - Yutori's recommended screenshot size
screenshotSize: { width: 1280, height: 800 },
screenSize: { width: 1280, height: 800 },
turnLimit: 30,
model: 'n1-preview-2025-11',
temperature: 0.3,
// n1 uses 1000x1000 normalized coordinate system
normalizedMax: 1000,
} as const
export const YUTORI_API_BASE = 'https://api.yutori.com/v1'
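Because n1 emits coordinates on the fixed 1000x1000 grid regardless of the actual viewport, action execution needs a scaling step. A minimal sketch using the constants above (the helper name is illustrative, not part of this module):

```typescript
// Hypothetical helper: map n1's normalized [0, 1000] coordinates to screen pixels.
function denormalize(
  [nx, ny]: [number, number],
  screen: ScreenSize = DEFAULTS.screenSize,
): { x: number; y: number } {
  return {
    x: Math.round((nx / DEFAULTS.normalizedMax) * screen.width),
    y: Math.round((ny / DEFAULTS.normalizedMax) * screen.height),
  }
}

// A click at [500, 500] on the default 1280x800 screen lands at (640, 400).
```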

View File

@@ -190,8 +190,6 @@
<select id="cfg-agent-type" onchange="onAgentTypeChange(this.value)">
<option value="single">Single Agent</option>
<option value="orchestrator-executor">Orchestrator-Executor</option>
<option value="gemini-computer-use">Gemini Computer Use</option>
<option value="yutori-navigator">Yutori Navigator</option>
</select>
</div>
@@ -280,50 +278,6 @@
</div>
</div>
<!-- Gemini Computer Use fields -->
<div class="agent-fields" id="fields-gemini-computer-use">
<div class="config-field">
<label>API Key <span class="req">*</span></label>
<input type="password" id="cfg-gemini-apikey" placeholder="GOOGLE_AI_API_KEY">
</div>
<div class="config-row">
<div class="config-field">
<label>Screen Width</label>
<input type="number" id="cfg-gemini-width" value="1440" min="800" max="2560">
</div>
<div class="config-field">
<label>Screen Height</label>
<input type="number" id="cfg-gemini-height" value="900" min="600" max="1440">
</div>
</div>
<div class="config-field">
<label>Turn Limit</label>
<input type="number" id="cfg-gemini-turns" value="30" min="1" max="100">
</div>
</div>
<!-- Yutori Navigator fields -->
<div class="agent-fields" id="fields-yutori-navigator">
<div class="config-field">
<label>API Key <span class="req">*</span></label>
<input type="password" id="cfg-yutori-apikey" placeholder="YUTORI_API_KEY">
</div>
<div class="config-row">
<div class="config-field">
<label>Screen Width</label>
<input type="number" id="cfg-yutori-width" value="1280" min="800" max="2560">
</div>
<div class="config-field">
<label>Screen Height</label>
<input type="number" id="cfg-yutori-height" value="800" min="600" max="1440">
</div>
</div>
<div class="config-field">
<label>Turn Limit</label>
<input type="number" id="cfg-yutori-turns" value="30" min="1" max="100">
</div>
</div>
</div>
<!-- Infrastructure (center) -->
@@ -420,23 +374,10 @@
<label>Graders</label>
<div style="display: flex; flex-direction: column; gap: 4px; margin-top: 2px;">
<div class="config-field-inline"><input type="checkbox" id="cfg-grader-performance" value="performance_grader"><label for="cfg-grader-performance">Performance Grader</label></div>
<div class="config-field-inline"><input type="checkbox" id="cfg-grader-webvoyager" value="webvoyager_grader"><label for="cfg-grader-webvoyager">WebVoyager Grader</label></div>
<div class="config-field-inline"><input type="checkbox" id="cfg-grader-fara" value="fara_combined"><label for="cfg-grader-fara">Fara Combined</label></div>
<div class="config-field-inline"><input type="checkbox" id="cfg-grader-mind2web" value="mind2web_judge"><label for="cfg-grader-mind2web">Mind2Web Judge</label></div>
<div class="config-field-inline"><input type="checkbox" id="cfg-grader-agisdk" value="agisdk_state_diff"><label for="cfg-grader-agisdk">AGI SDK State Diff</label></div>
<div class="config-field-inline"><input type="checkbox" id="cfg-grader-infinity" value="infinity_state"><label for="cfg-grader-infinity">Infinity State</label></div>
</div>
</div>
<div class="config-field">
<label>Grader Model</label>
<input type="text" id="cfg-grader-model" placeholder="e.g. openai/gpt-4.1">
</div>
<div class="config-field">
<label>Grader API Key</label>
<input type="password" id="cfg-grader-key-env" placeholder="Key or env var e.g. OPENROUTER_API_KEY">
</div>
<div class="config-field">
<label>Grader Base URL</label>
<input type="text" id="cfg-grader-baseurl" placeholder="https://openrouter.ai/api/v1">
</div>
</div>
<!-- Actions bar (full width) -->
@@ -514,7 +455,7 @@ let passCount = 0;
let failCount = 0;
let loadedConfigName = null;
const PASS_FAIL_GRADER_ORDER = ['performance_grader', 'webvoyager_grader', 'fara_combined', 'fara_grader'];
const PASS_FAIL_GRADER_ORDER = ['agisdk_state_diff', 'infinity_state', 'performance_grader'];
function getPrimaryGrader(graderResults) {
for (const name of PASS_FAIL_GRADER_ORDER) {
if (graderResults[name]) return graderResults[name];
@@ -751,20 +692,6 @@ function fillForm(cfg) {
setVal('cfg-exec-model', exec.model);
setVal('cfg-exec-apikey', exec.apiKey);
setVal('cfg-exec-baseurl', exec.baseUrl);
} else if (type === 'gemini-computer-use') {
setVal('cfg-gemini-apikey', agent.apiKey);
if (agent.screenSize) {
setVal('cfg-gemini-width', agent.screenSize.width);
setVal('cfg-gemini-height', agent.screenSize.height);
}
setVal('cfg-gemini-turns', agent.turnLimit);
} else if (type === 'yutori-navigator') {
setVal('cfg-yutori-apikey', agent.apiKey);
if (agent.screenSize) {
setVal('cfg-yutori-width', agent.screenSize.width);
setVal('cfg-yutori-height', agent.screenSize.height);
}
setVal('cfg-yutori-turns', agent.turnLimit);
}
// Infrastructure
@@ -797,17 +724,13 @@ function fillForm(cfg) {
// Grader checkboxes
const graderMap = {
'performance_grader': 'cfg-grader-performance',
'webvoyager_grader': 'cfg-grader-webvoyager',
'fara_combined': 'cfg-grader-fara',
'mind2web_judge': 'cfg-grader-mind2web',
'agisdk_state_diff': 'cfg-grader-agisdk',
'infinity_state': 'cfg-grader-infinity',
};
const configGraders = cfg.graders || [];
for (const [name, id] of Object.entries(graderMap)) {
document.getElementById(id).checked = configGraders.includes(name);
}
setVal('cfg-grader-model', cfg.grader_model);
setVal('cfg-grader-key-env', cfg.grader_api_key_env);
setVal('cfg-grader-baseurl', cfg.grader_base_url);
}
function setVal(id, val) {
@@ -848,26 +771,6 @@ function buildConfigFromForm() {
baseUrl: getVal('cfg-exec-baseurl') || undefined,
},
};
} else if (type === 'gemini-computer-use') {
agent = {
type: 'gemini-computer-use',
apiKey: getVal('cfg-gemini-apikey'),
screenSize: {
width: parseInt(getVal('cfg-gemini-width'), 10) || 1440,
height: parseInt(getVal('cfg-gemini-height'), 10) || 900,
},
turnLimit: parseInt(getVal('cfg-gemini-turns'), 10) || 30,
};
} else if (type === 'yutori-navigator') {
agent = {
type: 'yutori-navigator',
apiKey: getVal('cfg-yutori-apikey'),
screenSize: {
width: parseInt(getVal('cfg-yutori-width'), 10) || 1280,
height: parseInt(getVal('cfg-yutori-height'), 10) || 800,
},
turnLimit: parseInt(getVal('cfg-yutori-turns'), 10) || 30,
};
}
// Dataset: use dropdown value unless custom is selected
@@ -894,16 +797,10 @@ function buildConfigFromForm() {
if (outputDir) config.output_dir = outputDir;
const timeoutMs = parseInt(getVal('cfg-timeout'), 10);
if (timeoutMs) config.timeout_ms = timeoutMs;
const selectedGraders = ['cfg-grader-performance', 'cfg-grader-webvoyager', 'cfg-grader-fara', 'cfg-grader-mind2web']
const selectedGraders = ['cfg-grader-performance', 'cfg-grader-agisdk', 'cfg-grader-infinity']
.filter(id => document.getElementById(id).checked)
.map(id => document.getElementById(id).value);
if (selectedGraders.length) config.graders = selectedGraders;
const graderModel = getVal('cfg-grader-model');
if (graderModel) config.grader_model = graderModel;
const graderKeyEnv = getVal('cfg-grader-key-env');
if (graderKeyEnv) config.grader_api_key_env = graderKeyEnv;
const graderBaseUrl = getVal('cfg-grader-baseurl');
if (graderBaseUrl) config.grader_base_url = graderBaseUrl;
return config;
}
@@ -1417,8 +1314,6 @@ function renderGraderPanel() {
let bodyHtml = '';
if (primaryName === 'performance_grader') {
bodyHtml = renderPerformanceGrader(primaryResult);
} else if (primaryName === 'fara_combined' || primaryName === 'fara_grader') {
bodyHtml = renderFaraCombined(primaryResult);
} else {
bodyHtml = renderGenericGrader(primaryResult);
}
@@ -1477,35 +1372,6 @@ function renderPerformanceGrader(result) {
return html;
}
function renderFaraCombined(result) {
const details = result.details || {};
const verifiers = details.verifiers;
const voting = details.votingResult;
if (!verifiers || typeof verifiers !== 'object') {
return renderGenericGrader(result);
}
let html = '';
if (voting) {
html += `<div style="font-size:11px;color:#8b949e;margin-bottom:8px">Majority vote: ${voting.passCount}/${voting.totalVerifiers} passed &rarr; <strong style="color:${voting.decision === 'PASS' ? '#3fb950' : '#f85149'}">${voting.decision}</strong></div>`;
}
html += '<div class="grader-verifiers">';
for (const [name, v] of Object.entries(verifiers)) {
const badge = v.pass ? '<span class="grader-verifier-badge pass">PASS</span>' : '<span class="grader-verifier-badge fail">FAIL</span>';
const score = typeof v.score === 'number' ? `${(v.score * 100).toFixed(0)}%` : '';
const label = name.charAt(0).toUpperCase() + name.slice(1);
html += `
<div class="grader-verifier">
<span class="grader-verifier-name">${label}</span>
${badge}
<span style="font-size:11px;color:#8b949e;margin-left:auto">${score}</span>
</div>
`;
}
html += '</div>';
return html;
}
function renderGenericGrader(result) {
const reasoning = result.reasoning || '';
if (!reasoning) return '';

View File

@@ -4,7 +4,6 @@ import { Hono } from 'hono'
import { streamSSE } from 'hono/streaming'
import { ParallelExecutor } from '../runner/parallel-executor'
import { loadTasks } from '../runner/task-loader'
import { resolveGraderOptions } from '../runner/types'
import { EvalConfigSchema, type Task } from '../types'
// ============================================================================
@@ -431,14 +430,11 @@ app.post('/api/run', async (c) => {
const configLabel = body.configName || 'dashboard'
dashboardState.init(tasks, configLabel, config.agent.type, outputDir)
const graderOptions = resolveGraderOptions(config)
// Run eval in background — don't await
const executor = new ParallelExecutor({
numWorkers: config.num_workers || 1,
config,
outputDir,
graderOptions,
restartServerPerTask: config.restart_server_per_task,
onEvent: (taskId, event) =>
dashboardState.broadcastStreamEvent(taskId, event),

View File

@@ -1,355 +0,0 @@
import { readFile } from 'node:fs/promises'
import { join } from 'node:path'
import OpenAI from 'openai'
import type { ChatCompletionContentPart } from 'openai/resources/chat/completions'
import { type GraderResult, isToolInputAvailable } from '../../types'
import type { Grader, GraderInput } from '../types'
/**
* Mind2Web WebJudge Grader - 3-step automatic evaluation
* Reference: https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/src/methods
*
* Steps:
* 1. Key Point Identification - Extract critical requirements from task
* 2. Key Screenshot Identification - Score screenshots for relevance (1-5)
* 3. Outcome Judgment - Final success/failure determination
*/
// ============================================================================
// Prompts (Exact from Online-Mind2Web repository)
// ============================================================================
const STEP1_KEY_POINTS_SYSTEM = `You are an expert tasked with analyzing a given task to identify the key points explicitly stated in the task description.
**Objective**: Carefully analyze the task description and extract the critical elements explicitly mentioned in the task for achieving its goal.
**Instructions**:
1. Read the task description carefully.
2. Identify and extract **key points** directly stated in the task description.
- A **key point** is a critical element, condition, or step explicitly mentioned in the task description.
- Do not infer or add any unstated elements.
- Words such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" must go through the sort function(e.g., the key point should be "Filter by highest").
**Respond with**:
- **Key Points**: A numbered list of the explicit key points for completing this task, one per line, without explanations or additional details.`
const STEP2_IMAGE_SCORING_SYSTEM = `You are an expert evaluator tasked with determining whether an image contains information about the necessary steps to complete a task.
**Objective**: Analyze the provided image and decide if it shows essential steps or evidence required for completing the task. Use your reasoning to explain your decision before assigning a score.
**Instructions**:
1. Provide a detailed description of the image, including its contents, visible elements, text (if any), and any notable features.
2. Carefully examine the image and evaluate whether it contains necessary steps or evidence crucial to task completion:
- Identify key points that could be relevant to task completion, such as actions, progress indicators, tool usage, applied filters, or step-by-step instructions.
- Does the image show actions, progress indicators, or critical information directly related to completing the task?
- Is this information indispensable for understanding or ensuring task success?
- If the image contains partial but relevant information, consider its usefulness rather than dismissing it outright.
3. Provide your response in the following format:
- **Reasoning**: Explain your thought process and observations. Mention specific elements in the image that indicate necessary steps, evidence, or lack thereof.
- **Score**: Assign a score based on the reasoning, using the following scale:
- **1**: The image does not contain any necessary steps or relevant information.
- **2**: The image contains minimal or ambiguous information, unlikely to be essential.
- **3**: The image includes some relevant steps or hints but lacks clarity or completeness.
- **4**: The image contains important steps or evidence that are highly relevant but not fully comprehensive.
- **5**: The image clearly displays necessary steps or evidence crucial for completing the task.
Respond with:
1. **Reasoning**: [Your explanation]
2. **Score**: [1-5]`
const STEP3_OUTCOME_SYSTEM = `You are an expert in evaluating the performance of a web navigation agent. The agent is designed to help a human user navigate a website to complete a task. Given the user's task, the agent's action history, key points for task completion, some potentially important web pages in the agent's trajectory and their reasons, your goal is to determine whether the agent has completed the task and achieved all requirements.
Your response must strictly follow the following evaluation criteria!
*Important Evaluation Criteria*:
1: The filtered results must be displayed correctly. If filters were not properly applied (i.e., missing selection, missing confirmation, or no visible effect in results), the task is not considered successful.
2: You must carefully check whether these snapshots and action history meet these key points. Ensure that specific filter conditions, such as "best," "highest," "cheapest," "latest," "most recent," "lowest," "closest," "highest-rated," "largest," and "newest" are correctly applied using the filter function(e.g., sort function).
3: Certain key points or requirements should be applied by the filter. Otherwise, a search with all requirements as input will be deemed a failure since it cannot guarantee that all results meet the requirements!
4: If the task requires filtering by a specific range of money, years, or the number of beds and bathrooms, the applied filter must exactly match the given requirement. Any deviation results in failure. To ensure the task is successful, the applied filter must precisely match the specified range without being too broad or too narrow.
Examples of Failure Cases:
- If the requirement is less than $50, but the applied filter is less than $25, it is a failure.
- If the requirement is $1500-$2500, but the applied filter is $2000-$2500, it is a failure.
- If the requirement is $25-$200, but the applied filter is $0-$200, it is a failure.
- If the required years are 2004-2012, but the filter applied is 2001-2012, it is a failure.
- If the required years are before 2015, but the applied filter is 2000-2014, it is a failure.
- If the task requires exactly 2 beds, but the filter applied is 2+ beds, it is a failure.
5: Some tasks require a submission action or a display of results to be considered successful.
6: If the retrieved information is invalid or empty(e.g., No match was found), but the agent has correctly performed the required action, it should still be considered successful.
7: If the current page already displays all available items, then applying a filter is not necessary. As long as the agent selects items that meet the requirements (e.g., the cheapest or lowest price), the task is still considered successful.
*IMPORTANT*
Format your response into two lines as shown below:
Thoughts: <your thoughts and reasoning process based on double-checking each key points and the evaluation criteria>
Status: "success" or "failure"`
// ============================================================================
// Mind2Web WebJudge Grader Implementation
// ============================================================================
export class Mind2WebJudgeGrader implements Grader {
name = 'mind2web_judge'
private client: OpenAI
private model: string
private scoreThreshold = 3
private maxImages = 50
constructor(apiKey: string, baseURL?: string, model?: string) {
this.client = new OpenAI({
apiKey,
baseURL: baseURL || undefined,
})
this.model = model || 'gpt-4o'
}
async grade(input: GraderInput): Promise<GraderResult> {
try {
// Step 1: Identify key points from task
const keyPoints = await this.identifyKeyPoints(input.task.query)
// Step 2: Score screenshots and filter relevant ones
const screenshotResults = await this.scoreScreenshots(
input.task.query,
keyPoints,
input.outputDir,
input.screenshotCount,
)
// Step 3: Final outcome judgment
const actionHistory = this.extractActionHistory(input.messages)
const outcome = await this.judgeOutcome(
input.task.query,
keyPoints,
actionHistory,
screenshotResults.relevantImages,
screenshotResults.thoughts,
)
return {
score: outcome.success ? 1 : 0,
pass: outcome.success,
reasoning: outcome.reasoning,
details: {
keyPoints,
screenshotsEvaluated: screenshotResults.totalEvaluated,
screenshotsRelevant: screenshotResults.relevantImages.length,
model: this.model,
},
}
} catch (error) {
return {
score: 0,
pass: false,
reasoning: `Grader error: ${error instanceof Error ? error.message : String(error)}`,
details: { error: true },
}
}
}
/**
* Step 1: Key Point Identification
*/
private async identifyKeyPoints(task: string): Promise<string> {
const response = await this.client.chat.completions.create({
model: this.model,
temperature: 0,
messages: [
{ role: 'system', content: STEP1_KEY_POINTS_SYSTEM },
{ role: 'user', content: `Task: ${task}` },
],
max_tokens: 512,
})
const content = response.choices[0]?.message?.content || ''
// Extract key points section
if (content.includes('**Key Points**:')) {
return content.split('**Key Points**:')[1].trim()
}
if (content.includes('Key Points:')) {
return content.split('Key Points:')[1].trim()
}
return content
}
/**
* Step 2: Key Screenshot Identification
*/
private async scoreScreenshots(
task: string,
keyPoints: string,
outputDir: string,
screenshotCount: number,
): Promise<{
relevantImages: { data: string; score: number }[]
thoughts: string[]
totalEvaluated: number
}> {
const relevantImages: { data: string; score: number }[] = []
const thoughts: string[] = []
let totalEvaluated = 0
// Evaluate each screenshot
for (let i = 1; i <= screenshotCount; i++) {
try {
const filepath = join(outputDir, 'screenshots', `${i}.png`)
const buffer = await readFile(filepath)
const base64 = buffer.toString('base64')
const imageUrl = `data:image/png;base64,${base64}`
totalEvaluated++
// Score this image
const response = await this.client.chat.completions.create({
model: this.model,
temperature: 0,
messages: [
{ role: 'system', content: STEP2_IMAGE_SCORING_SYSTEM },
{
role: 'user',
content: [
{
type: 'text',
text: `**Task**: ${task}\n\n**Key Points for Task Completion**: ${keyPoints}\n\nThe snapshot of the web page is shown in the image.`,
},
{
type: 'image_url',
image_url: { url: imageUrl, detail: 'high' },
},
],
},
],
max_tokens: 512,
})
const content = response.choices[0]?.message?.content || ''
// Extract score
const scoreMatch = content.match(/Score[:\s]*\**\s*([1-5])/i)
const score = scoreMatch ? parseInt(scoreMatch[1], 10) : 1
// Extract reasoning/thought
const thoughtMatch = content.match(
/\*\*Reasoning\*\*:?\s*([\s\S]*?)(?=\n\n|\*\*Score|$)/i,
)
const thought = thoughtMatch
? thoughtMatch[1].trim().replace(/\n/g, ' ')
: content.split('\n')[0]
// Keep if above threshold
if (score >= this.scoreThreshold) {
relevantImages.push({ data: imageUrl, score })
thoughts.push(`Screenshot ${i} (score ${score}): ${thought}`)
}
} catch {
// Skip missing files
}
}
// Limit to max images
if (relevantImages.length > this.maxImages) {
relevantImages.splice(0, relevantImages.length - this.maxImages)
thoughts.splice(0, thoughts.length - this.maxImages)
}
return { relevantImages, thoughts, totalEvaluated }
}
/**
* Step 3: Outcome Judgment
*/
private async judgeOutcome(
task: string,
keyPoints: string,
actionHistory: string[],
relevantImages: { data: string; score: number }[],
thoughts: string[],
): Promise<{ success: boolean; reasoning: string }> {
// Format action history
const actionsFormatted = actionHistory
.map((action, i) => `${i + 1}. ${action}`)
.join('\n')
// Format thoughts
const thoughtsFormatted = thoughts
.map((thought, i) => `${i + 1}. ${thought}`)
.join('\n')
// Build message content
const messageContent: ChatCompletionContentPart[] = []
if (relevantImages.length > 0) {
messageContent.push({
type: 'text',
text: `User Task: ${task}
Key Points: ${keyPoints}
Action History:
${actionsFormatted || 'No actions recorded'}
The potentially important snapshots of the webpage in the agent's trajectory and their reasons:
${thoughtsFormatted || 'No relevant screenshots identified'}`,
})
// Add images
for (const img of relevantImages) {
messageContent.push({
type: 'image_url',
image_url: { url: img.data, detail: 'high' as const },
})
}
} else {
// No images - text only
messageContent.push({
type: 'text',
text: `User Task: ${task}
Key Points: ${keyPoints}
Action History:
${actionsFormatted || 'No actions recorded'}`,
})
}
const response = await this.client.chat.completions.create({
model: this.model,
temperature: 0,
messages: [
{ role: 'system', content: STEP3_OUTCOME_SYSTEM },
{ role: 'user', content: messageContent },
],
max_tokens: 1000,
})
const content = response.choices[0]?.message?.content || ''
const statusMatch = content.match(/Status:\s*"?(success|failure)"?/i)
const isSuccess = statusMatch
? statusMatch[1].toLowerCase() === 'success'
: false
return {
success: isSuccess,
reasoning: content,
}
}
/**
* Extract action history from messages
*/
private extractActionHistory(messages: GraderInput['messages']): string[] {
const actions: string[] = []
for (const msg of messages) {
if (isToolInputAvailable(msg)) {
const params = JSON.stringify(msg.input as Record<string, unknown>)
actions.push(`${msg.toolName}(${params})`)
}
}
return actions
}
}
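Standalone usage looked roughly like the sketch below. The exact `GraderInput` shape lives in `../types`, so the literal here is inferred from the fields this grader actually reads and is cast accordingly; the env var name is an assumption:

```typescript
const grader = new Mind2WebJudgeGrader(
  process.env.OPENROUTER_API_KEY ?? '',
  'https://openrouter.ai/api/v1', // optional baseURL
  'gpt-4o',
)

// Only the fields read above are shown; the real GraderInput carries more.
const result = await grader.grade({
  task: { query: 'Filter rentals to exactly 2 beds under $2,000' },
  messages: [],            // tool-call messages feed extractActionHistory()
  outputDir: './runs/q1',  // screenshots/1.png ... screenshots/N.png live here
  screenshotCount: 12,
} as GraderInput)

console.log(result.pass, result.reasoning)
```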

View File

@@ -1,138 +0,0 @@
import { readFile } from 'node:fs/promises'
import { join } from 'node:path'
import OpenAI from 'openai'
import type { GraderResult } from '../../types'
import type { Grader, GraderInput } from '../types'
/**
* WebVoyager Grader - Exact implementation based on original WebVoyager auto_eval.py
* Reference: https://github.com/MinorJerry/WebVoyager/blob/main/evaluation/auto_eval.py
*
* Uses GPT-4V to evaluate task completion by analyzing screenshots and final response.
*/
const WEBVOYAGER_SYSTEM_PROMPT = `As an evaluator, you will be presented with three primary components to assist you in your role:
1. Web Task Instruction: This is a clear and specific directive provided in natural language, detailing the online activity to be carried out. These requirements may include conducting searches, verifying information, comparing prices, checking availability, or any other action relevant to the specified web service (such as Amazon, Apple, ArXiv, BBC News, Booking etc).
2. Result Screenshots: This is a visual representation of the screen showing the result or intermediate state of performing a web task. It serves as visual proof of the actions taken in response to the instruction.
3. Result Response: This is a textual response obtained after the execution of the web task. It serves as textual result in response to the instruction.
-- You DO NOT NEED to interact with web pages or perform actions such as booking flights or conducting searches on websites.
-- You SHOULD NOT make assumptions based on information not presented in the screenshot when comparing it to the instructions.
-- Your primary responsibility is to conduct a thorough assessment of the web task instruction against the outcome depicted in the screenshot and in the response, evaluating whether the actions taken align with the given instructions.
-- NOTE that the instruction may involve more than one task, for example, locating the garage and summarizing the review. Failing to complete either task, such as not providing a summary, should be considered unsuccessful.
-- NOTE that the screenshot is authentic, but the response provided by LLM is generated at the end of web browsing, and there may be discrepancies between the text and the screenshots.
-- Note the difference: 1) Result response may contradict the screenshot, then the content of the screenshot prevails, 2) The content in the Result response is not mentioned on the screenshot, choose to believe the content.
You should elaborate on how you arrived at your final evaluation and then provide a definitive verdict on whether the task has been successfully accomplished, either as 'SUCCESS' or 'NOT SUCCESS'.`
export class WebVoyagerGrader implements Grader {
name = 'webvoyager_grader'
private client: OpenAI
private maxScreenshots = 15
private model: string
constructor(apiKey: string, baseURL?: string, model?: string) {
this.client = new OpenAI({
apiKey,
baseURL: baseURL || undefined,
})
this.model = model || 'gpt-4o'
}
async grade(input: GraderInput): Promise<GraderResult> {
// Load screenshots (last N screenshots)
const startNum = Math.max(
1,
input.screenshotCount - this.maxScreenshots + 1,
)
const endNum = input.screenshotCount
const images: { type: 'image_url'; image_url: { url: string } }[] = []
const loadedScreenshots: number[] = []
for (let i = startNum; i <= endNum; i++) {
try {
const filepath = join(input.outputDir, 'screenshots', `${i}.png`)
const buffer = await readFile(filepath)
const base64 = buffer.toString('base64')
images.push({
type: 'image_url',
image_url: { url: `data:image/png;base64,${base64}` },
})
loadedScreenshots.push(i)
} catch {
// Skip missing files
}
}
if (images.length === 0) {
return {
score: 0,
pass: false,
reasoning: 'No screenshots available for evaluation',
}
}
// Build user prompt (matching original WebVoyager format)
const userPrompt = `TASK: ${input.task.query}
Result Response: ${input.finalAnswer || '[No response provided]'}
${images.length} screenshots at the end:`
try {
const response = await this.client.chat.completions.create({
model: this.model,
temperature: 0,
seed: 42,
messages: [
{ role: 'system', content: WEBVOYAGER_SYSTEM_PROMPT },
{
role: 'user',
content: [
{ type: 'text', text: userPrompt },
...images,
{ type: 'text', text: 'Your verdict:\n' },
],
},
],
max_tokens: 1000,
})
const content = response.choices[0]?.message?.content || ''
// Parse verdict (matching original logic)
// "NOT SUCCESS" must be checked first as it contains "SUCCESS"
let isSuccess: boolean
if (content.toUpperCase().includes('NOT SUCCESS')) {
isSuccess = false
} else if (content.toUpperCase().includes('SUCCESS')) {
isSuccess = true
} else {
// Ambiguous response - default to failure
isSuccess = false
}
return {
score: isSuccess ? 1 : 0,
pass: isSuccess,
reasoning: content,
details: {
screenshotsEvaluated: images.length,
screenshotRange: `${loadedScreenshots[0]}-${loadedScreenshots[loadedScreenshots.length - 1]}`,
model: this.model,
promptTokens: response.usage?.prompt_tokens,
completionTokens: response.usage?.completion_tokens,
},
}
} catch (error) {
return {
score: 0,
pass: false,
reasoning: `Grader error: ${error instanceof Error ? error.message : String(error)}`,
details: { error: true },
}
}
}
}
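The order of the two `includes` checks above is load-bearing, since the failure token is a superstring of the success token. A quick illustration:

```typescript
const verdict = 'Filters were missing, so the task is NOT SUCCESS.'.toUpperCase()

verdict.includes('SUCCESS')     // true - checking this first would mis-grade a failure as a pass
verdict.includes('NOT SUCCESS') // true - hence this branch must run first
```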

View File

@@ -1,234 +0,0 @@
import OpenAI from 'openai'
import {
countToolCalls,
type GraderResult,
isToolInputAvailable,
} from '../../types'
import type { Grader, GraderInput } from '../types'
/**
* Fara Alignment Verifier
*
* Based on the Fara paper (Microsoft Research, 2024):
* "A text-only verifier designed to judge whether the actions taken and final
* response of a trajectory aligns with the given task. The purpose of this
* verifier is to give a high-level judgement of whether the trajectory likely
* satisfies the intent of the task."
*
* For transactional tasks: verifies whether the trajectory correctly identified
* target URLs matching requested products/services.
*
* For information-seeking tasks: checks whether the response correctly answers
* the input question.
*/
const ALIGNMENT_SYSTEM_PROMPT = `You are an expert evaluator verifying if a web agent's trajectory aligns with the given task intent.
Your role is to provide a high-level judgment of whether the agent's actions and final response satisfy the intent of the task.
**Evaluation Criteria:**
1. **Task Intent Alignment**: Do the actions taken directly address what the task is asking for?
2. **Action Relevance**: Were the actions purposeful and directed toward completing the task?
- Did the agent navigate to relevant pages?
- Did it interact with appropriate elements (buttons, forms, links)?
- Were there unnecessary detours or irrelevant actions?
3. **Response Accuracy** (for information-seeking tasks):
- Does the final response correctly answer the question asked?
- Is the information retrieved from the correct source?
4. **Target Completion** (for transactional tasks):
- Did the agent reach the correct destination (product page, search results, etc.)?
- Were the correct parameters/filters applied?
**Output Format:**
Provide your analysis, then conclude with a clear verdict.
VERDICT: PASS or FAIL
REASONING: <One sentence summary of your decision>`
export class FaraAlignmentGrader implements Grader {
name = 'fara_alignment'
private client: OpenAI
private model: string
private maxRetries = 3
private retryDelayMs = 1000
constructor(apiKey: string, baseUrl?: string, model?: string) {
this.client = new OpenAI({
apiKey,
baseURL: baseUrl || undefined,
})
this.model = model || 'gpt-4o-mini'
}
async grade(input: GraderInput): Promise<GraderResult> {
const actionSequence = this.extractActionSequence(input)
const taskType = this.classifyTaskType(input.task.query)
const userPrompt = `**Task:** ${input.task.query}
**Task Type:** ${taskType}
**Action Sequence:**
${actionSequence || 'No actions taken'}
**Final Response:** ${input.finalAnswer || '[No response provided]'}
Evaluate whether this trajectory aligns with the task intent and provide your verdict.`
try {
const response = await this.callWithRetry(userPrompt)
const content = response.choices[0]?.message?.content || ''
const isPass = this.parseVerdict(content)
return {
score: isPass ? 1 : 0,
pass: isPass,
reasoning: content,
details: {
verifier: 'alignment',
taskType,
actionCount: countToolCalls(input.messages),
model: this.model,
promptTokens: response.usage?.prompt_tokens,
completionTokens: response.usage?.completion_tokens,
},
}
} catch (error) {
return {
score: 0,
pass: false,
reasoning: `Alignment verifier error: ${error instanceof Error ? error.message : String(error)}`,
details: { error: true, verifier: 'alignment' },
}
}
}
private extractActionSequence(input: GraderInput): string {
const actions: string[] = []
let stepNum = 1
for (const msg of input.messages) {
if (isToolInputAvailable(msg)) {
const paramsStr = this.formatParams(
msg.input as Record<string, unknown>,
)
actions.push(`${stepNum}. ${msg.toolName}(${paramsStr})`)
stepNum++
}
}
return actions.join('\n')
}
private formatParams(params: Record<string, unknown>): string {
const entries = Object.entries(params)
if (entries.length === 0) return ''
return entries
.map(([key, value]) => {
const strValue =
typeof value === 'string'
? `"${value.substring(0, 100)}${value.length > 100 ? '...' : ''}"`
: JSON.stringify(value)
return `${key}=${strValue}`
})
.join(', ')
}
private classifyTaskType(query: string): string {
const lowerQuery = query.toLowerCase()
const infoKeywords = [
'find',
'search',
'look up',
'what is',
'how to',
'tell me',
'show me',
'get information',
'check',
'verify',
'confirm',
'list',
'summarize',
'review',
]
const transactionalKeywords = [
'buy',
'purchase',
'add to cart',
'book',
'reserve',
'order',
'subscribe',
'sign up',
'register',
'download',
'submit',
'apply',
]
for (const keyword of transactionalKeywords) {
if (lowerQuery.includes(keyword)) {
return 'transactional'
}
}
for (const keyword of infoKeywords) {
if (lowerQuery.includes(keyword)) {
return 'information-seeking'
}
}
return 'general'
}
private parseVerdict(content: string): boolean {
const upperContent = content.toUpperCase()
if (upperContent.includes('VERDICT: PASS')) {
return true
}
if (upperContent.includes('VERDICT: FAIL')) {
return false
}
if (upperContent.includes('VERDICT:')) {
const verdictMatch = upperContent.match(/VERDICT:\s*(PASS|FAIL)/)
if (verdictMatch) {
return verdictMatch[1] === 'PASS'
}
}
return false
}
private async callWithRetry(
userPrompt: string,
attempt = 1,
): Promise<OpenAI.Chat.Completions.ChatCompletion> {
try {
return await this.client.chat.completions.create({
model: this.model,
temperature: 0,
messages: [
{ role: 'system', content: ALIGNMENT_SYSTEM_PROMPT },
{ role: 'user', content: userPrompt },
],
max_tokens: 1000,
})
} catch (error) {
if (attempt < this.maxRetries) {
const delay = this.retryDelayMs * 2 ** (attempt - 1)
await new Promise((resolve) => setTimeout(resolve, delay))
return this.callWithRetry(userPrompt, attempt + 1)
}
throw error
}
}
}
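One subtlety in the (private) `classifyTaskType` heuristic: transactional keywords are checked before information-seeking ones, so mixed queries resolve to `transactional`. Illustrative expected outputs, derived from the keyword lists above (sample queries are made up):

```typescript
// Expected classifications for sample queries (illustration only):
//   "Book a table for two tonight"    -> 'transactional'        (matches "book")
//   "Find and book the cheapest room" -> 'transactional'        (both match; transactional wins)
//   "What is the refund policy?"      -> 'information-seeking'  (matches "what is")
//   "Translate this page to French"   -> 'general'              (no keyword hit)
```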

View File

@@ -1,284 +0,0 @@
import type { GraderResult } from '../../types'
import type { Grader, GraderInput } from '../types'
import { FaraAlignmentGrader } from './alignment'
import { FaraMultimodalGrader } from './multimodal'
import { FaraRubricGrader } from './rubric'
/**
* Fara Combined Verifier (3-Verifier System)
*
* Based on the Fara paper (Microsoft Research, 2024):
* "Before using any tasks for training, three verifier agents evaluate if a task
* was 'successful': The Alignment Verifier checks if the trajectory of actions
* match the task's intent; the Rubric Verifier defines completion criteria and
* scores the trajectory against them; and the Multimodal Verifier reviews screenshots
* and responses to confirm visual evidence supports successful completion."
*
* Decision Strategy: Majority Voting
* - All three verifiers run independently
* - A trajectory passes if at least 2 of 3 verifiers pass
* - Combined score is the average of individual scores
* - Detailed breakdown of each verifier's decision is provided
*
* This combined approach addresses different failure modes:
* - Alignment: catches trajectories that wander off-task
* - Rubric: catches partial completions via granular scoring
* - Multimodal: catches hallucinations via visual evidence verification
*/
interface VerifierResult {
name: string
pass: boolean
score: number
reasoning: string
details?: Record<string, unknown>
}
export class FaraCombinedGrader implements Grader {
name = 'fara_combined'
private alignmentGrader: FaraAlignmentGrader
private rubricGrader: FaraRubricGrader
private multimodalGrader: FaraMultimodalGrader
private runInParallel: boolean
constructor(
apiKey: string,
baseUrl?: string,
model?: string,
options?: { parallel?: boolean },
) {
this.alignmentGrader = new FaraAlignmentGrader(
apiKey,
baseUrl,
model || 'gpt-4o-mini',
)
this.rubricGrader = new FaraRubricGrader(
apiKey,
baseUrl,
model || 'gpt-4o-mini',
)
this.multimodalGrader = new FaraMultimodalGrader(
apiKey,
baseUrl,
model || 'gpt-4o',
)
this.runInParallel = options?.parallel ?? true
}
async grade(input: GraderInput): Promise<GraderResult> {
try {
const verifierResults: VerifierResult[] = []
if (this.runInParallel) {
// Run all verifiers in parallel for speed
const [alignmentResult, rubricResult, multimodalResult] =
await Promise.all([
this.runVerifier('alignment', () =>
this.alignmentGrader.grade(input),
),
this.runVerifier('rubric', () => this.rubricGrader.grade(input)),
this.runVerifier('multimodal', () =>
this.multimodalGrader.grade(input),
),
])
verifierResults.push(alignmentResult, rubricResult, multimodalResult)
} else {
// Run sequentially (useful for debugging or rate limiting)
verifierResults.push(
await this.runVerifier('alignment', () =>
this.alignmentGrader.grade(input),
),
)
verifierResults.push(
await this.runVerifier('rubric', () =>
this.rubricGrader.grade(input),
),
)
verifierResults.push(
await this.runVerifier('multimodal', () =>
this.multimodalGrader.grade(input),
),
)
}
// Majority voting: pass if at least 2 of 3 verifiers pass
const passCount = verifierResults.filter((r) => r.pass).length
const majorityPass = passCount >= 2
// Combined score: average of individual scores
const averageScore =
verifierResults.reduce((sum, r) => sum + r.score, 0) /
verifierResults.length
// Build combined reasoning
const combinedReasoning = this.formatCombinedReasoning(
verifierResults,
majorityPass,
passCount,
)
return {
score: averageScore,
pass: majorityPass,
reasoning: combinedReasoning,
details: {
verifier: 'combined',
votingResult: {
passCount,
totalVerifiers: 3,
majorityThreshold: 2,
decision: majorityPass ? 'PASS' : 'FAIL',
},
verifiers: {
alignment: {
pass: verifierResults[0].pass,
score: verifierResults[0].score,
details: verifierResults[0].details,
},
rubric: {
pass: verifierResults[1].pass,
score: verifierResults[1].score,
details: verifierResults[1].details,
},
multimodal: {
pass: verifierResults[2].pass,
score: verifierResults[2].score,
details: verifierResults[2].details,
},
},
},
}
} catch (error) {
return {
score: 0,
pass: false,
reasoning: `Combined verifier error: ${error instanceof Error ? error.message : String(error)}`,
details: { error: true, verifier: 'combined' },
}
}
}
private async runVerifier(
name: string,
graderFn: () => Promise<GraderResult>,
): Promise<VerifierResult> {
try {
const result = await graderFn()
return {
name,
pass: result.pass,
score: result.score,
reasoning: result.reasoning,
details: result.details,
}
} catch (error) {
return {
name,
pass: false,
score: 0,
reasoning: `${name} verifier error: ${error instanceof Error ? error.message : String(error)}`,
details: { error: true },
}
}
}
private formatCombinedReasoning(
results: VerifierResult[],
majorityPass: boolean,
passCount: number,
): string {
const lines: string[] = []
lines.push('# Fara 3-Verifier Combined Evaluation\n')
lines.push(
`**Final Decision:** ${majorityPass ? 'PASS' : 'FAIL'} (${passCount}/3 verifiers passed)`,
)
lines.push(`**Majority Threshold:** 2/3 verifiers must pass\n`)
lines.push('---\n')
// Alignment Verifier Summary
const alignment = results[0]
lines.push(`## 1. Alignment Verifier: ${alignment.pass ? 'PASS' : 'FAIL'}`)
lines.push(`Score: ${alignment.score}`)
lines.push(`${this.truncateReasoning(alignment.reasoning, 500)}\n`)
// Rubric Verifier Summary
const rubric = results[1]
lines.push(`## 2. Rubric Verifier: ${rubric.pass ? 'PASS' : 'FAIL'}`)
lines.push(`Score: ${(rubric.score * 100).toFixed(1)}%`)
if (rubric.details && 'percentage' in rubric.details) {
lines.push(
`Rubric Score: ${rubric.details.percentage}% (threshold: ${rubric.details.threshold}%)`,
)
}
lines.push(`${this.truncateReasoning(rubric.reasoning, 500)}\n`)
// Multimodal Verifier Summary
const multimodal = results[2]
lines.push(
`## 3. Multimodal Verifier: ${multimodal.pass ? 'PASS' : 'FAIL'}`,
)
lines.push(`Score: ${multimodal.score}`)
if (multimodal.details) {
if ('responseConsistent' in multimodal.details) {
lines.push(
`Response Consistent: ${multimodal.details.responseConsistent ? 'Yes' : 'No'}`,
)
}
if ('taskSatisfied' in multimodal.details) {
lines.push(
`Task Satisfied: ${multimodal.details.taskSatisfied ? 'Yes' : 'No'}`,
)
}
if ('relevantScreenshots' in multimodal.details) {
lines.push(
`Screenshots Analyzed: ${multimodal.details.relevantScreenshots}/${multimodal.details.totalScreenshots}`,
)
}
}
lines.push(`${this.truncateReasoning(multimodal.reasoning, 500)}\n`)
lines.push('---\n')
lines.push('**Voting Summary:**')
lines.push(`- Alignment: ${alignment.pass ? 'YES' : 'NO'}`)
lines.push(`- Rubric: ${rubric.pass ? 'YES' : 'NO'}`)
lines.push(`- Multimodal: ${multimodal.pass ? 'YES' : 'NO'}`)
lines.push(
`- **Result: ${majorityPass ? 'MAJORITY PASS' : 'MAJORITY FAIL'}**`,
)
return lines.join('\n')
}
private truncateReasoning(reasoning: string, maxLength: number): string {
if (reasoning.length <= maxLength) {
return reasoning
}
return `${reasoning.substring(0, maxLength)}...`
}
}
/**
* Factory function to create Fara graders
*/
export function createFaraGrader(
type: 'alignment' | 'rubric' | 'multimodal' | 'combined',
apiKey: string,
baseUrl?: string,
model?: string,
): Grader {
switch (type) {
case 'alignment':
return new FaraAlignmentGrader(apiKey, baseUrl, model)
case 'rubric':
return new FaraRubricGrader(apiKey, baseUrl, model)
case 'multimodal':
return new FaraMultimodalGrader(apiKey, baseUrl, model)
case 'combined':
return new FaraCombinedGrader(apiKey, baseUrl, model)
default:
throw new Error(`Unknown Fara grader type: ${type}`)
}
}
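Wiring the combined grader was a one-liner through the factory. A minimal sketch (the env var name is an assumption; `input` is assembled by the runner elsewhere):

```typescript
declare const input: GraderInput // provided by the eval runner

// Omit the model argument to keep the per-verifier defaults
// (gpt-4o-mini for alignment/rubric, gpt-4o for multimodal);
// passing one overrides all three verifiers.
const grader = createFaraGrader(
  'combined',
  process.env.OPENROUTER_API_KEY ?? '',
  'https://openrouter.ai/api/v1',
)

const result = await grader.grade(input) // passes iff >= 2 of 3 verifiers pass
```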

View File

@@ -1,449 +0,0 @@
import { readFile } from 'node:fs/promises'
import { join } from 'node:path'
import OpenAI from 'openai'
import type { ChatCompletionContentPart } from 'openai/resources/chat/completions'
import type { GraderResult } from '../../types'
import type { Grader, GraderInput } from '../types'
/**
* Fara Multimodal Verifier
*
* Based on the Fara paper (Microsoft Research, 2024):
* "This verifier inspects the screenshots and final response of the trajectory
* to check whether the task was successfully completed. The verifier first selects
* the most relevant screenshots from the trajectory based on the task ranked by
* how informative they are."
*
* Two-phase evaluation:
* 1. Select most relevant screenshots based on task relevance
* 2. Judge:
* a) Whether the final response is fully consistent with screenshot evidence
* b) Whether the content in screenshots appears to satisfy the task
*
* "The Multimodal Verifier is especially important for combating hallucinations."
*/
const SCREENSHOT_SELECTION_PROMPT = `You are an expert evaluator selecting the most relevant screenshots from a web agent's trajectory.
**Instructions:**
1. You will see multiple screenshots from an agent's web navigation
2. Score each screenshot from 1-5 based on relevance to the task:
- 1: Not relevant at all
- 2: Minimal relevance
- 3: Somewhat relevant
- 4: Highly relevant
- 5: Critical/essential for verifying task completion
**Output Format:**
Return a JSON object:
{
"scores": [
{"index": <1-based index>, "score": <1-5>, "reason": "Brief reason"}
]
}`
const MULTIMODAL_VERIFICATION_PROMPT = `You are an expert evaluator verifying web agent task completion using visual evidence.
**Your role is to verify two critical aspects:**
1. **Response-Screenshot Consistency**: Is the agent's final response fully consistent with what is shown in the screenshots?
- Does the response accurately describe information visible in screenshots?
- Are there any claims in the response not supported by visual evidence?
- Look for hallucinations - information the agent claims but cannot be verified
2. **Task Completion Evidence**: Do the screenshots show evidence that the task was successfully completed?
- Can you see the target page, information, or action result?
- Is there visual confirmation of the requested action/information?
- For search tasks: are correct search results visible?
- For navigation tasks: did the agent reach the target page?
- For information tasks: is the answer visible on screen?
**Important:** The Multimodal Verifier is especially important for combating hallucinations. Be skeptical of claims not supported by visual evidence.
**Output Format:**
Provide your analysis, then conclude with:
RESPONSE_CONSISTENT: YES or NO
TASK_SATISFIED: YES or NO
VERDICT: PASS or FAIL
REASONING: <One sentence summary>`
interface ScreenshotScore {
index: number
score: number
reason: string
}
export class FaraMultimodalGrader implements Grader {
name = 'fara_multimodal'
private client: OpenAI
private model: string
private relevanceThreshold = 3
private maxSelectedScreenshots = 10
private maxEvaluationScreenshots = 30
private maxRetries = 3
private retryDelayMs = 1000
constructor(apiKey: string, baseUrl?: string, model?: string) {
this.client = new OpenAI({
apiKey,
baseURL: baseUrl || undefined,
})
this.model = model || 'gpt-4o'
}
async grade(input: GraderInput): Promise<GraderResult> {
try {
// Load available screenshots
const allScreenshots = await this.loadScreenshots(
input.outputDir,
input.screenshotCount,
)
if (allScreenshots.length === 0) {
return {
score: 0,
pass: false,
reasoning: 'No screenshots available for multimodal verification',
details: { verifier: 'multimodal', error: 'no_screenshots' },
}
}
// Step 1: Select most relevant screenshots
const selectedScreenshots = await this.selectRelevantScreenshots(
input.task.query,
allScreenshots,
)
if (selectedScreenshots.length === 0) {
return {
score: 0,
pass: false,
reasoning:
'No relevant screenshots found for verification. All screenshots scored below relevance threshold.',
details: {
verifier: 'multimodal',
totalScreenshots: allScreenshots.length,
relevantScreenshots: 0,
threshold: this.relevanceThreshold,
},
}
}
// Step 2: Verify task completion with selected screenshots
const verification = await this.verifyWithScreenshots(
input.task.query,
input.finalAnswer,
selectedScreenshots,
)
const isPass =
verification.responseConsistent && verification.taskSatisfied
return {
score: isPass ? 1 : 0,
pass: isPass,
reasoning: verification.fullReasoning,
details: {
verifier: 'multimodal',
totalScreenshots: allScreenshots.length,
relevantScreenshots: selectedScreenshots.length,
selectedIndices: selectedScreenshots.map((s) => s.index),
responseConsistent: verification.responseConsistent,
taskSatisfied: verification.taskSatisfied,
model: this.model,
},
}
} catch (error) {
return {
score: 0,
pass: false,
reasoning: `Multimodal verifier error: ${error instanceof Error ? error.message : String(error)}`,
details: { error: true, verifier: 'multimodal' },
}
}
}
private async loadScreenshots(
outputDir: string,
screenshotCount: number,
): Promise<{ index: number; data: string }[]> {
const screenshots: { index: number; data: string }[] = []
// Sample screenshots if too many
const indices: number[] = []
if (screenshotCount <= this.maxEvaluationScreenshots) {
for (let i = 1; i <= screenshotCount; i++) {
indices.push(i)
}
} else {
// Sample evenly across the trajectory, always include first, last, and recent
const step = Math.floor(screenshotCount / this.maxEvaluationScreenshots)
for (let i = 1; i <= screenshotCount; i += step) {
indices.push(i)
}
// Always include the last few screenshots (most likely to show completion)
for (let i = screenshotCount - 4; i <= screenshotCount; i++) {
if (i > 0 && !indices.includes(i)) {
indices.push(i)
}
}
indices.sort((a, b) => a - b)
}
for (const i of indices) {
try {
const filepath = join(outputDir, 'screenshots', `${i}.png`)
const buffer = await readFile(filepath)
const base64 = buffer.toString('base64')
screenshots.push({
index: i,
data: `data:image/png;base64,${base64}`,
})
} catch {
// Skip missing files
}
}
return screenshots
}
private async selectRelevantScreenshots(
task: string,
screenshots: { index: number; data: string }[],
): Promise<{ index: number; data: string; score: number }[]> {
if (screenshots.length <= this.maxSelectedScreenshots) {
return screenshots.map((s) => ({ ...s, score: 5 }))
}
// Use batched evaluation to score screenshots
const batchSize = 5
const allScores: ScreenshotScore[] = []
for (let i = 0; i < screenshots.length; i += batchSize) {
const batch = screenshots.slice(i, i + batchSize)
const scores = await this.scoreScreenshotBatch(task, batch, i)
allScores.push(...scores)
}
// Filter by threshold and sort by score
const relevant = allScores
.filter((s) => s.score >= this.relevanceThreshold)
.sort((a, b) => b.score - a.score)
.slice(0, this.maxSelectedScreenshots)
// If not enough relevant screenshots, include the highest scored ones anyway
if (relevant.length < 3 && allScores.length > 0) {
const topScores = allScores
.sort((a, b) => b.score - a.score)
.slice(0, Math.min(5, allScores.length))
for (const score of topScores) {
if (!relevant.find((r) => r.index === score.index)) {
relevant.push(score)
}
}
}
return relevant.map((score) => ({
index: score.index,
data: screenshots.find((s) => s.index === score.index)?.data ?? '',
score: score.score,
}))
}
private async scoreScreenshotBatch(
task: string,
batch: { index: number; data: string }[],
_startOffset: number,
): Promise<ScreenshotScore[]> {
const content: ChatCompletionContentPart[] = [
{
type: 'text',
text: `Task: ${task}\n\nScore the following ${batch.length} screenshots for relevance to this task. Screenshots are numbered ${batch[0].index} to ${batch[batch.length - 1].index}.`,
},
]
for (const screenshot of batch) {
content.push({
type: 'text',
text: `\n--- Screenshot ${screenshot.index} ---`,
})
content.push({
type: 'image_url',
image_url: { url: screenshot.data, detail: 'low' },
})
}
try {
const response = await this.callWithRetry(
[
{ role: 'system', content: SCREENSHOT_SELECTION_PROMPT },
{ role: 'user', content },
],
true,
)
const responseContent = response.choices[0]?.message?.content || ''
return this.parseScreenshotScores(responseContent, batch)
} catch {
// On error, give all screenshots average score
return batch.map((s) => ({
index: s.index,
score: 3,
reason: 'Could not evaluate',
}))
}
}
private parseScreenshotScores(
content: string,
batch: { index: number; data: string }[],
): ScreenshotScore[] {
try {
const jsonMatch = content.match(/\{[\s\S]*\}/)
if (jsonMatch) {
const parsed = JSON.parse(jsonMatch[0])
if (parsed.scores && Array.isArray(parsed.scores)) {
return parsed.scores.map((s: Partial<ScreenshotScore>) => ({
index: s.index ?? batch[0].index,
score: Math.min(5, Math.max(1, s.score ?? 3)),
reason: s.reason ?? 'No reason provided',
}))
}
}
} catch {
// Fall through
}
// Default scores
return batch.map((s) => ({
index: s.index,
score: 3,
reason: 'Could not parse score',
}))
}
private async verifyWithScreenshots(
task: string,
finalAnswer: string | null,
screenshots: { index: number; data: string; score: number }[],
): Promise<{
responseConsistent: boolean
taskSatisfied: boolean
fullReasoning: string
}> {
const content: ChatCompletionContentPart[] = [
{
type: 'text',
text: `**Task:** ${task}\n\n**Agent's Final Response:** ${finalAnswer || '[No response provided]'}\n\n**Selected Screenshots (${screenshots.length} most relevant):**`,
},
]
for (const screenshot of screenshots) {
content.push({
type: 'text',
text: `\n--- Screenshot ${screenshot.index} (relevance score: ${screenshot.score}/5) ---`,
})
content.push({
type: 'image_url',
image_url: { url: screenshot.data, detail: 'high' },
})
}
content.push({
type: 'text',
text: '\nVerify the task completion based on the screenshots and final response.',
})
const response = await this.callWithRetry([
{ role: 'system', content: MULTIMODAL_VERIFICATION_PROMPT },
{ role: 'user', content },
])
const responseContent = response.choices[0]?.message?.content || ''
return this.parseVerification(responseContent)
}
private parseVerification(content: string): {
responseConsistent: boolean
taskSatisfied: boolean
fullReasoning: string
} {
const upperContent = content.toUpperCase()
// Parse RESPONSE_CONSISTENT
let responseConsistent = false
if (upperContent.includes('RESPONSE_CONSISTENT: YES')) {
responseConsistent = true
} else if (upperContent.includes('RESPONSE_CONSISTENT: NO')) {
responseConsistent = false
} else {
// Fallback: check if there's any indication
responseConsistent =
!upperContent.includes('HALLUCINATION') &&
!upperContent.includes('INCONSISTENT') &&
!upperContent.includes('NOT SUPPORTED')
}
// Parse TASK_SATISFIED
let taskSatisfied = false
if (upperContent.includes('TASK_SATISFIED: YES')) {
taskSatisfied = true
} else if (upperContent.includes('TASK_SATISFIED: NO')) {
taskSatisfied = false
} else {
// Fallback: check verdict
if (upperContent.includes('VERDICT: PASS')) {
taskSatisfied = true
}
}
// Override with final verdict if present
if (upperContent.includes('VERDICT: FAIL')) {
// If explicit fail, at least one criterion failed
if (
!upperContent.includes('RESPONSE_CONSISTENT:') &&
!upperContent.includes('TASK_SATISFIED:')
) {
responseConsistent = false
taskSatisfied = false
}
}
return {
responseConsistent,
taskSatisfied,
fullReasoning: content,
}
}
private async callWithRetry(
messages: OpenAI.Chat.Completions.ChatCompletionMessageParam[],
useJsonFormat = false,
attempt = 1,
): Promise<OpenAI.Chat.Completions.ChatCompletion> {
try {
const options: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming =
{
model: this.model,
temperature: 0,
messages,
max_tokens: 2000,
}
if (useJsonFormat) {
options.response_format = { type: 'json_object' }
}
return await this.client.chat.completions.create(options)
} catch (error) {
if (attempt < this.maxRetries) {
const delay = this.retryDelayMs * 2 ** (attempt - 1)
await new Promise((resolve) => setTimeout(resolve, delay))
return this.callWithRetry(messages, useJsonFormat, attempt + 1)
}
throw error
}
}
}
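For intuition, the sampling in `loadScreenshots` trades coverage for cost. A standalone mirror of that index-selection logic (written fresh for this worked example, not imported from the class):

```typescript
// Mirror of the index-selection logic above, for a worked example.
function sampleIndices(count: number, max = 30): number[] {
  const indices: number[] = []
  if (count <= max) {
    for (let i = 1; i <= count; i++) indices.push(i)
  } else {
    const step = Math.floor(count / max)
    for (let i = 1; i <= count; i += step) indices.push(i)
    // Always include the last few frames, which most often show completion.
    for (let i = count - 4; i <= count; i++) {
      if (i > 0 && !indices.includes(i)) indices.push(i)
    }
    indices.sort((a, b) => a - b)
  }
  return indices
}

// sampleIndices(100) -> 37 indices: 1, 4, ..., 100 plus late frames 96, 98, 99.
// Note the cap is a soft target: the result can exceed maxEvaluationScreenshots.
```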

View File

@@ -1,447 +0,0 @@
import OpenAI from 'openai'
import { type GraderResult, isToolInputAvailable } from '../../types'
import type { Grader, GraderInput } from '../types'
/**
* Fara Rubric Verifier
*
* Based on the Fara paper (Microsoft Research, 2024):
* "The Rubric Verifier generates a rubric for each task and judges the
* corresponding trajectory against the rubric, crediting points for partial
* completion of various sub-goals. Each rubric is expressed as a list of
* criteria that a trajectory would likely need to meet in order to be successful."
*
* Two-step process:
* 1. Generate task-specific rubric with criteria and point values
* 2. Score trajectory against rubric, calculating proportion of points satisfied
*
 * Uses a pass threshold of 0.8 - trajectories scoring at or above it, with all
 * required criteria met, are marked successful.
*/
const RUBRIC_GENERATION_PROMPT = `You are an expert evaluator creating a rubric for assessing web agent task completion.
Given a task, generate a detailed rubric with specific, measurable criteria that a web agent would need to satisfy to successfully complete the task.
**Instructions:**
1. Break down the task into discrete, verifiable sub-goals
2. Assign point values based on importance (total should sum to 100)
3. Make criteria specific and observable from the action sequence
4. Include both process criteria (correct navigation, interactions) and outcome criteria (final result)
**Output Format:**
Return a JSON object with the following structure:
{
"criteria": [
{
"id": 1,
"description": "Description of criterion",
"points": <number>,
"required": <boolean>
}
],
"total_points": 100
}
**Guidelines:**
- Mark criteria as "required": true if failure means the task cannot be successful
- Include 4-8 criteria for most tasks
- Ensure criteria are observable from action sequence and final response
- Consider edge cases and partial completions`
const RUBRIC_SCORING_PROMPT = `You are an expert evaluator scoring a web agent's trajectory against a rubric.
**Instructions:**
1. Carefully review each criterion in the rubric
2. Determine if the agent's actions and response satisfy each criterion
3. Award full points, partial points (if applicable), or zero points for each criterion
4. Provide clear justification for each score
**Output Format:**
Return a JSON object with the following structure:
{
"scores": [
{
"criterion_id": <number>,
"points_earned": <number>,
"max_points": <number>,
"satisfied": <boolean>,
"justification": "Brief explanation"
}
],
"total_earned": <number>,
"total_possible": <number>,
"percentage": <number>,
"required_criteria_met": <boolean>,
"summary": "Overall assessment summary"
}`
interface RubricCriterion {
id: number
description: string
points: number
required: boolean
}
interface RubricScore {
criterion_id: number
points_earned: number
max_points: number
satisfied: boolean
justification: string
}
interface Rubric {
criteria: RubricCriterion[]
total_points: number
}
interface ScoringResult {
scores: RubricScore[]
total_earned: number
total_possible: number
percentage: number
required_criteria_met: boolean
summary: string
}
export class FaraRubricGrader implements Grader {
name = 'fara_rubric'
private client: OpenAI
private model: string
private passThreshold = 0.8
private maxRetries = 3
private retryDelayMs = 1000
constructor(apiKey: string, baseUrl?: string, model?: string) {
this.client = new OpenAI({
apiKey,
baseURL: baseUrl || undefined,
})
this.model = model || 'gpt-4o-mini'
}
async grade(input: GraderInput): Promise<GraderResult> {
try {
// Step 1: Generate rubric for the task
const rubric = await this.generateRubric(input.task.query)
// Step 2: Score trajectory against rubric
const actionSequence = this.extractActionSequence(input)
const scoringResult = await this.scoreAgainstRubric(
input.task.query,
rubric,
actionSequence,
input.finalAnswer,
)
const score = scoringResult.percentage / 100
const isPass =
score >= this.passThreshold && scoringResult.required_criteria_met
return {
score,
pass: isPass,
reasoning: this.formatReasoning(rubric, scoringResult),
details: {
verifier: 'rubric',
rubric: rubric.criteria,
scores: scoringResult.scores,
totalEarned: scoringResult.total_earned,
totalPossible: scoringResult.total_possible,
percentage: scoringResult.percentage,
threshold: this.passThreshold * 100,
requiredCriteriaMet: scoringResult.required_criteria_met,
model: this.model,
},
}
} catch (error) {
return {
score: 0,
pass: false,
reasoning: `Rubric verifier error: ${error instanceof Error ? error.message : String(error)}`,
details: { error: true, verifier: 'rubric' },
}
}
}
private async generateRubric(task: string): Promise<Rubric> {
const response = await this.callWithRetry([
{ role: 'system', content: RUBRIC_GENERATION_PROMPT },
{
role: 'user',
content: `Generate a rubric for evaluating this web task:\n\n${task}`,
},
])
const content = response.choices[0]?.message?.content || ''
return this.parseRubric(content)
}
private async scoreAgainstRubric(
task: string,
rubric: Rubric,
actionSequence: string,
finalAnswer: string | null,
): Promise<ScoringResult> {
const rubricJson = JSON.stringify(rubric, null, 2)
const userPrompt = `**Task:** ${task}
**Rubric:**
${rubricJson}
**Agent Action Sequence:**
${actionSequence || 'No actions taken'}
**Final Response:** ${finalAnswer || '[No response provided]'}
Score this trajectory against each criterion in the rubric.`
const response = await this.callWithRetry([
{ role: 'system', content: RUBRIC_SCORING_PROMPT },
{ role: 'user', content: userPrompt },
])
const content = response.choices[0]?.message?.content || ''
return this.parseScoringResult(content, rubric)
}
private parseRubric(content: string): Rubric {
try {
const jsonMatch = content.match(/\{[\s\S]*\}/)
if (jsonMatch) {
const parsed = JSON.parse(jsonMatch[0])
if (
parsed.criteria &&
Array.isArray(parsed.criteria) &&
parsed.criteria.length > 0
) {
return {
criteria: parsed.criteria.map(
(c: Partial<RubricCriterion>, idx: number) => ({
id: c.id ?? idx + 1,
description: c.description ?? `Criterion ${idx + 1}`,
points: c.points ?? 25,
required: c.required ?? false,
}),
),
total_points:
parsed.total_points ||
parsed.criteria.reduce(
(sum: number, c: Partial<RubricCriterion>) =>
sum + (c.points ?? 25),
0,
),
}
}
}
} catch {
// Fall through to default rubric
}
return this.getDefaultRubric()
}
private getDefaultRubric(): Rubric {
return {
criteria: [
{
id: 1,
description: 'Agent navigated to relevant pages for the task',
points: 25,
required: true,
},
{
id: 2,
description: 'Agent performed correct interactions (clicks, inputs)',
points: 25,
required: false,
},
{
id: 3,
description: 'Agent reached the target state or information',
points: 30,
required: true,
},
{
id: 4,
description: 'Final response accurately addresses the task',
points: 20,
required: false,
},
],
total_points: 100,
}
}
private parseScoringResult(content: string, rubric: Rubric): ScoringResult {
try {
const jsonMatch = content.match(/\{[\s\S]*\}/)
if (jsonMatch) {
const parsed = JSON.parse(jsonMatch[0])
if (parsed.scores && Array.isArray(parsed.scores)) {
const totalEarned =
parsed.total_earned ??
parsed.scores.reduce(
(sum: number, s: Partial<RubricScore>) =>
sum + (s.points_earned ?? 0),
0,
)
const totalPossible =
parsed.total_possible ??
rubric.total_points ??
parsed.scores.reduce(
(sum: number, s: Partial<RubricScore>) =>
sum + (s.max_points ?? 0),
0,
)
const requiredCriteriaMet =
parsed.required_criteria_met ??
this.checkRequiredCriteria(parsed.scores, rubric)
return {
scores: parsed.scores.map(
(s: Partial<RubricScore>, idx: number) => ({
criterion_id: s.criterion_id ?? idx + 1,
points_earned: s.points_earned ?? 0,
max_points: s.max_points ?? 25,
satisfied: s.satisfied ?? false,
justification: s.justification ?? 'No justification provided',
}),
),
total_earned: totalEarned,
total_possible: totalPossible,
percentage:
parsed.percentage ??
(totalPossible > 0
? Math.round((totalEarned / totalPossible) * 100)
: 0),
required_criteria_met: requiredCriteriaMet,
summary: parsed.summary ?? 'Scoring completed',
}
}
}
} catch {
// Fall through to default scoring
}
return this.getDefaultScoringResult(rubric)
}
private checkRequiredCriteria(
scores: Partial<RubricScore>[],
rubric: Rubric,
): boolean {
const requiredIds = rubric.criteria
.filter((c) => c.required)
.map((c) => c.id)
for (const reqId of requiredIds) {
const score = scores.find((s) => s.criterion_id === reqId)
if (!score || !score.satisfied) {
return false
}
}
return true
}
private getDefaultScoringResult(rubric: Rubric): ScoringResult {
return {
scores: rubric.criteria.map((c) => ({
criterion_id: c.id,
points_earned: 0,
max_points: c.points,
satisfied: false,
justification: 'Unable to evaluate',
})),
total_earned: 0,
total_possible: rubric.total_points,
percentage: 0,
required_criteria_met: false,
summary: 'Unable to parse scoring result',
}
}
private formatReasoning(rubric: Rubric, result: ScoringResult): string {
const lines: string[] = []
lines.push('**Rubric Evaluation**\n')
lines.push(
`Score: ${result.total_earned}/${result.total_possible} (${result.percentage}%)`,
)
lines.push(`Threshold: ${this.passThreshold * 100}%`)
lines.push(
`Required Criteria Met: ${result.required_criteria_met ? 'Yes' : 'No'}\n`,
)
lines.push('**Criteria Scores:**')
for (const score of result.scores) {
const criterion = rubric.criteria.find((c) => c.id === score.criterion_id)
const status = score.satisfied ? 'PASS' : 'FAIL'
const required = criterion?.required ? ' [REQUIRED]' : ''
lines.push(
`- ${criterion?.description ?? `Criterion ${score.criterion_id}`}${required}: ${score.points_earned}/${score.max_points} (${status})`,
)
lines.push(` Justification: ${score.justification}`)
}
lines.push(`\n**Summary:** ${result.summary}`)
return lines.join('\n')
}
private extractActionSequence(input: GraderInput): string {
const actions: string[] = []
let stepNum = 1
for (const msg of input.messages) {
if (isToolInputAvailable(msg)) {
const paramsStr = this.formatParams(
msg.input as Record<string, unknown>,
)
actions.push(`${stepNum}. ${msg.toolName}(${paramsStr})`)
stepNum++
}
}
return actions.join('\n')
}
private formatParams(params: Record<string, unknown>): string {
const entries = Object.entries(params)
if (entries.length === 0) return ''
return entries
.map(([key, value]) => {
const strValue =
typeof value === 'string'
? `"${value.substring(0, 100)}${value.length > 100 ? '...' : ''}"`
: JSON.stringify(value)
return `${key}=${strValue}`
})
.join(', ')
}
private async callWithRetry(
messages: OpenAI.Chat.Completions.ChatCompletionMessageParam[],
attempt = 1,
): Promise<OpenAI.Chat.Completions.ChatCompletion> {
try {
return await this.client.chat.completions.create({
model: this.model,
temperature: 0,
messages,
max_tokens: 2000,
response_format: { type: 'json_object' },
})
} catch (error) {
if (attempt < this.maxRetries) {
const delay = this.retryDelayMs * 2 ** (attempt - 1)
await new Promise((resolve) => setTimeout(resolve, delay))
return this.callWithRetry(messages, attempt + 1)
}
throw error
}
}
}
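
The pass decision combines the point ratio with the required-criteria gate. A worked example with made-up numbers:

```typescript
// Illustrative scoring result for a 100-point rubric:
const totalEarned = 85
const totalPossible = 100
const requiredCriteriaMet = true

const score = totalEarned / totalPossible // 0.85
const pass = score >= 0.8 && requiredCriteriaMet // true
// At the same 0.85, a single failed *required* criterion flips pass to false.
```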

View File

@@ -1,86 +1,23 @@
import type { GraderResult } from '../types'
import { AgisdkStateDiffGrader } from './benchmark/agisdk-state-diff'
import { InfinityStateGrader } from './benchmark/infinity-state'
import { Mind2WebJudgeGrader } from './benchmark/mind2web'
import { WebVoyagerGrader } from './benchmark/webvoyager'
import { FaraAlignmentGrader } from './fara/alignment'
import { FaraCombinedGrader } from './fara/combined'
import { FaraMultimodalGrader } from './fara/multimodal'
import { FaraRubricGrader } from './fara/rubric'
import { PerformanceGrader } from './performance/performance-grader'
import type { Grader, GraderInput } from './types'
interface GraderOptions {
apiKey: string
baseUrl?: string
model?: string
}
export const PASS_FAIL_GRADER_ORDER = [
'agisdk_state_diff',
'infinity_state',
'performance_grader',
] as const
export function createGrader(
name: string,
options: GraderOptions | null,
): Grader | null {
export function createGrader(name: string): Grader | null {
switch (name) {
// Deterministic benchmark graders (no LLM judge)
case 'agisdk_state_diff':
return new AgisdkStateDiffGrader()
case 'infinity_state':
return new InfinityStateGrader()
// LLM-based benchmark graders
case 'webvoyager_grader':
if (!options?.apiKey) return null
return new WebVoyagerGrader(
options.apiKey,
options.baseUrl,
options.model,
)
case 'mind2web_judge':
case 'mind2web_grader':
if (!options?.apiKey) return null
return new Mind2WebJudgeGrader(
options.apiKey,
options.baseUrl,
options.model,
)
// Fara individual verifiers
case 'fara_alignment':
if (!options?.apiKey) return null
return new FaraAlignmentGrader(
options.apiKey,
options.baseUrl,
options.model || 'gpt-4o-mini',
)
case 'fara_rubric':
if (!options?.apiKey) return null
return new FaraRubricGrader(
options.apiKey,
options.baseUrl,
options.model || 'gpt-4o-mini',
)
case 'fara_multimodal':
if (!options?.apiKey) return null
return new FaraMultimodalGrader(
options.apiKey,
options.baseUrl,
options.model || 'gpt-4o',
)
// Fara combined 3-verifier system (majority voting)
case 'fara_grader':
case 'fara_combined':
if (!options?.apiKey) return null
return new FaraCombinedGrader(
options.apiKey,
options.baseUrl,
options.model,
)
// Multi-axis performance grader (Claude Agent SDK — uses its own Claude default model)
case 'performance_grader':
return new PerformanceGrader()
default:
console.warn(`Unknown grader: ${name}`)
return null
@@ -90,22 +27,20 @@ export function createGrader(
export async function runGraders(
graderNames: string[],
input: GraderInput,
options: GraderOptions | null,
): Promise<Record<string, GraderResult>> {
const results: Record<string, GraderResult> = {}
for (const name of graderNames) {
const grader = createGrader(name, options)
if (grader) {
try {
console.log(` Running grader: ${name}`)
results[name] = await grader.grade(input)
} catch (error) {
results[name] = {
score: 0,
pass: false,
reasoning: `Error running grader: ${error}`,
}
const grader = createGrader(name)
if (!grader) continue
try {
console.log(` Running grader: ${name}`)
results[name] = await grader.grade(input)
} catch (error) {
results[name] = {
score: 0,
pass: false,
reasoning: `Error running grader: ${error}`,
}
}
}
@@ -113,15 +48,4 @@ export async function runGraders(
return results
}
// Export grader classes for direct use
export {
AgisdkStateDiffGrader,
FaraAlignmentGrader,
FaraCombinedGrader,
FaraMultimodalGrader,
FaraRubricGrader,
InfinityStateGrader,
Mind2WebJudgeGrader,
PerformanceGrader,
WebVoyagerGrader,
}
export { AgisdkStateDiffGrader, InfinityStateGrader, PerformanceGrader }
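
A usage sketch against the collapsed registry (the GraderInput assembly is elided here; the task executor builds it, as shown later in this diff):

```typescript
// No GraderOptions argument anymore: the graders needing LLM keys are gone.
const results = await runGraders(
  ['agisdk_state_diff', 'performance_grader'],
  graderInput,
)
for (const [name, r] of Object.entries(results)) {
  console.log(`${name}: ${r.pass ? 'PASS' : 'FAIL'} (score=${r.score})`)
}
```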

View File

@@ -13,31 +13,34 @@ const { values } = parseArgs({
if (values.help) {
console.log(`
Web Agent Eval System
BrowserOS Eval
Usage:
bun run eval # Opens dashboard in config mode
bun run eval --config <config.json> # Runs eval with config file
Config file should include:
- agent: Agent configuration (single or orchestrator-executor)
- dataset: Path to dataset JSONL file
- output_dir: Output directory for results (optional, default: ./results)
- num_workers: Number of parallel workers
- browseros.server_url: BrowserOS server URL
- grader_model, grader_api_key_env, grader_base_url: Grader settings (optional)
- timeout_ms: Task timeout in ms (optional)
Available agent types:
- single Single LLM agent driven by the BrowserOS tool loop
- orchestrator-executor High-level planner + visual/text executor
Preset configs available in configs/:
- configs/webvoyager-full.json Full WebVoyager evaluation
- configs/mind2web-full.json Full Mind2Web evaluation
- configs/webvoyager-test.json WebVoyager test subset (10 tasks)
- configs/mind2web-test.json Mind2Web test subset (10 tasks)
Available graders:
- performance_grader Multi-axis grader using Claude Agent SDK
- agisdk_state_diff AGI SDK / REAL Bench state-diff grader
- infinity_state WebArena-Infinity verifier-script grader
Preset configs in configs/:
- browseros-agent-weekly.json Weekly eval (single agent)
- browseros-oe-agent-weekly.json Weekly eval (orchestrator + LLM executor)
- browseros-oe-clado-weekly.json Weekly eval (orchestrator + Clado executor)
- agisdk-real-smoke.json AGI SDK smoke run
- infinity-hard-50.json WebArena-Infinity hard-50 set
- test-webvoyager.json WebVoyager test
- test-mind2web.json Mind2Web test
Examples:
bun run eval # Dashboard config mode
bun run eval -c configs/webvoyager-test.json # WebVoyager test
bun run eval -c configs/mind2web-full.json # Full Mind2Web eval
bun run eval # Dashboard config mode
bun run eval -c configs/browseros-agent-weekly.json
bun run eval -c configs/test-webvoyager.json
`)
process.exit(0)
}
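
A minimal config matching the keys listed in the help text might look like this sketch (all values illustrative; the presets in configs/ are the authoritative examples):

```typescript
const exampleConfig = {
  agent: { type: 'single', provider: 'browseros' }, // agent fields beyond `type` are assumed here
  dataset: 'datasets/test-webvoyager.jsonl', // illustrative path
  output_dir: './results',
  num_workers: 2,
  browseros: { server_url: 'http://localhost:9100' }, // port is illustrative
  graders: ['performance_grader'],
  timeout_ms: 600_000,
}
```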

View File

@@ -16,16 +16,23 @@
import {
existsSync,
mkdtempSync,
openSync,
readFileSync,
rmSync,
writeFileSync,
} from 'node:fs'
import { readFile } from 'node:fs/promises'
import { dirname, join } from 'node:path'
import { fileURLToPath } from 'node:url'
import { type Subprocess, spawn, spawnSync } from 'bun'
import type { EvalPorts } from '../utils/dev-config'
import { sleep } from '../utils/sleep'
export interface EvalPorts {
cdp: number
server: number
extension: number
}
const MAX_RESTART_ATTEMPTS = 3
const CDP_WAIT_TIMEOUT_MS = 30_000
const SERVER_HEALTH_TIMEOUT_MS = 30_000
@@ -48,6 +55,7 @@ export class BrowserOSAppManager {
private ports: EvalPorts
private chromeProc: Subprocess | null = null
private serverProc: Subprocess | null = null
private serverLogPath: string | null = null
private tempDir: string | null = null
private readonly workerIndex: number
private readonly loadExtensions: boolean
@@ -178,24 +186,47 @@ export class BrowserOSAppManager {
VITE_BROWSEROS_SERVER_PORT: String(server),
}
this.serverLogPath = join(this.tempDir, 'server.log')
const serverLogFd = openSync(this.serverLogPath, 'w')
this.serverProc = spawn({
cmd: ['bun', 'run', '--filter', '@browseros/server', 'start'],
cwd: MONOREPO_ROOT,
stdout: 'ignore',
stderr: 'ignore',
stdout: serverLogFd,
stderr: serverLogFd,
env: serverEnv,
})
console.log(
` [W${this.workerIndex}] Server started (PID: ${this.serverProc.pid})`,
` [W${this.workerIndex}] Server started (PID: ${this.serverProc.pid}) — log: ${this.serverLogPath}`,
)
// --- Wait for Server Health ---
if (!(await this.waitForServerHealth())) {
await this.dumpServerLog()
throw new Error('Server health check timed out')
}
console.log(` [W${this.workerIndex}] Server healthy`)
}
private async dumpServerLog(): Promise<void> {
if (!this.serverLogPath) return
try {
const contents = await readFile(this.serverLogPath, 'utf-8')
const lines = contents.split('\n').filter((l) => l.length > 0)
const tail = lines.slice(-40)
console.warn(
` [W${this.workerIndex}] --- last ${tail.length} server log lines ---`,
)
for (const line of tail) {
console.warn(` [W${this.workerIndex}] | ${line}`)
}
console.warn(` [W${this.workerIndex}] --- end server log ---`)
} catch (err) {
console.warn(
` [W${this.workerIndex}] Could not read server log: ${err instanceof Error ? err.message : String(err)}`,
)
}
}
private async waitForCdp(): Promise<boolean> {
const startTime = Date.now()
while (Date.now() - startTime < CDP_WAIT_TIMEOUT_MS) {
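
The capture change above boils down to handing Bun's spawn a raw file descriptor. An isolated sketch (path and command illustrative):

```typescript
import { openSync } from 'node:fs'
import { readFile } from 'node:fs/promises'
import { spawn } from 'bun'

const logPath = '/tmp/server.log' // lives in a per-worker temp dir in the real code
const fd = openSync(logPath, 'w')
const proc = spawn({
  cmd: ['bun', 'run', 'start'],
  stdout: fd, // Bun accepts a file descriptor for stdio, as the diff relies on
  stderr: fd, // both streams land in one log
})
console.log('spawned', proc.pid)

// On a failed health check, tail instead of discarding:
const tail = (await readFile(logPath, 'utf-8')).split('\n').slice(-40)
console.warn(tail.join('\n'))
```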

View File

@@ -19,17 +19,12 @@ import {
} from './task-loader'
import type {
BatchSummary,
GraderOptions,
RunEvalOptions,
TaskResult,
TaskResultSummary,
TaskSource,
} from './types'
import {
getPrimaryGraderResult,
isSuccessfulResult,
resolveGraderOptions,
} from './types'
import { getPrimaryGraderResult, isSuccessfulResult } from './types'
// ============================================================================
// Main Entry Point
@@ -58,7 +53,6 @@ export async function runEval(options: RunEvalOptions): Promise<void> {
// Step 4: Setup
await mkdir(resolvedPaths.outputDir, { recursive: true })
const graderOptions = resolveGraderOptions(config)
// Step 5: Start dashboard
startDashboard({
@@ -69,12 +63,7 @@ export async function runEval(options: RunEvalOptions): Promise<void> {
})
// Step 6: Execute tasks (parallel or sequential based on num_workers)
const results = await executeTasks(
tasks,
config,
resolvedPaths.outputDir,
graderOptions,
)
const results = await executeTasks(tasks, config, resolvedPaths.outputDir)
// Step 7: Summary
const summary = buildSummary(results)
@@ -187,7 +176,6 @@ async function executeTasks(
tasks: Task[],
config: EvalConfig,
outputDir: string,
graderOptions: GraderOptions | null,
): Promise<TaskResult[]> {
console.log(`\n${'='.repeat(60)}`)
console.log('STARTING EVALUATION')
@@ -204,7 +192,6 @@ async function executeTasks(
numWorkers,
config,
outputDir,
graderOptions,
restartServerPerTask: config.restart_server_per_task,
onEvent: (taskId, event) =>
dashboardState.broadcastStreamEvent(taskId, event),

View File

@@ -9,10 +9,9 @@
*/
import type { EvalConfig, Task } from '../types'
import type { EvalPorts } from '../utils/dev-config'
import { BrowserOSAppManager } from './browseros-app-manager'
import { BrowserOSAppManager, type EvalPorts } from './browseros-app-manager'
import { createTaskExecutor } from './task-executor'
import type { GraderOptions, TaskResult } from './types'
import type { TaskResult } from './types'
// ============================================================================
// Types
@@ -22,7 +21,6 @@ export interface ParallelExecutorConfig {
numWorkers: number
config: EvalConfig
outputDir: string
graderOptions: GraderOptions | null
restartServerPerTask?: boolean
onEvent?: (taskId: string, event: Record<string, unknown>) => void
}
@@ -162,7 +160,6 @@ export class ParallelExecutor {
workerConfig,
workerIndex,
this.config.outputDir,
this.config.graderOptions,
this.config.onEvent,
)
@@ -267,16 +264,3 @@ export class ParallelExecutor {
}
}
}
// ============================================================================
// Factory
// ============================================================================
export async function executeTasksInParallel(
tasks: Task[],
config: ParallelExecutorConfig,
onProgress?: ProgressCallback,
): Promise<TaskResult[]> {
const executor = new ParallelExecutor(config)
return executor.execute(tasks, onProgress)
}
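
With the factory gone, the remaining caller constructs the executor directly. A sketch, assuming evalConfig and tasks are in scope:

```typescript
const executor = new ParallelExecutor({
  numWorkers: 4,
  config: evalConfig, // the validated EvalConfig
  outputDir: './results',
})
const results = await executor.execute(tasks)
```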

View File

@@ -10,7 +10,7 @@ import { runGraders } from '../graders/registry'
import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
import { callMcpTool } from '../utils/mcp-client'
import { InfinityAppManager } from './infinity-app-manager'
import type { GraderOptions, TaskResult } from './types'
import type { TaskResult } from './types'
// ============================================================================
// Errors
@@ -40,7 +40,6 @@ export class TaskExecutionError extends Error {
// ============================================================================
export interface TaskExecutorDeps {
graderOptions: GraderOptions | null
onEvent?: (taskId: string, event: Record<string, unknown>) => void
}
@@ -225,6 +224,7 @@ export class TaskExecutor {
const context: AgentContext = {
config: this.config,
task,
workerIndex: this.workerIndex,
initialPageId: pageId,
outputDir: this.outputDir,
taskOutputDir,
@@ -259,27 +259,23 @@ export class TaskExecutor {
}
try {
const graderResults = await runGraders(
graderNames,
{
task: {
query_id: task.query_id,
query: task.query,
dataset: task.dataset,
},
messages: agentResult.messages,
screenshotCount:
agentResult.metadata.screenshot_count ??
agentResult.metadata.total_steps,
finalAnswer: agentResult.finalAnswer,
expectedAnswer: (task.metadata?.additional as Record<string, unknown>)
?.answer as string | undefined,
outputDir: join(this.outputDir, task.query_id),
mcpUrl: `${this.config.browseros.server_url}/mcp`,
infinityAppUrl,
},
this.deps.graderOptions,
)
const graderResults = await runGraders(graderNames, {
task: {
query_id: task.query_id,
query: task.query,
dataset: task.dataset,
},
messages: agentResult.messages,
screenshotCount:
agentResult.metadata.screenshot_count ??
agentResult.metadata.total_steps,
finalAnswer: agentResult.finalAnswer,
expectedAnswer: (task.metadata?.additional as Record<string, unknown>)
?.answer as string | undefined,
outputDir: join(this.outputDir, task.query_id),
mcpUrl: `${this.config.browseros.server_url}/mcp`,
infinityAppUrl,
})
try {
const saver = new TrajectorySaver(this.outputDir, task.query_id)
@@ -314,11 +310,7 @@ export function createTaskExecutor(
config: EvalConfig,
workerIndex: number,
outputDir: string,
graderOptions: GraderOptions | null,
onEvent?: (taskId: string, event: Record<string, unknown>) => void,
): TaskExecutor {
return new TaskExecutor(config, workerIndex, outputDir, {
graderOptions,
onEvent,
})
return new TaskExecutor(config, workerIndex, outputDir, { onEvent })
}
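
The trimmed factory now takes four arguments. A usage sketch (the event handler is illustrative):

```typescript
const executor = createTaskExecutor(
  config, // EvalConfig
  0, // workerIndex: flows into AgentContext, replacing the old extractCdpPort URL parsing
  './results',
  (taskId, event) => console.log(taskId, event),
)
```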

View File

@@ -145,7 +145,7 @@ function createSingleTask(query: string, startUrl?: string): TaskLoadResult {
query_id: `single-${Date.now()}`,
dataset: 'manual',
query: query.trim(),
graders: ['fara_alignment'],
graders: ['performance_grader'],
start_url: startUrl,
metadata: {
original_task_id: 'manual',

View File

@@ -1,4 +1,5 @@
import type { AgentResult } from '../agents/types'
import { PASS_FAIL_GRADER_ORDER } from '../graders/registry'
import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
// ============================================================================
@@ -99,15 +100,6 @@ export interface TaskResultSummary {
// Pass/Fail Determination
// ============================================================================
export const PASS_FAIL_GRADER_ORDER = [
'agisdk_state_diff',
'infinity_state',
'performance_grader',
'webvoyager_grader',
'fara_combined',
'fara_grader',
] as const
export function getPrimaryGraderResult(
graderResults: Record<string, { pass: boolean; score: number }>,
): { name: string; pass: boolean; score: number } | null {
@@ -122,31 +114,3 @@ export function getPrimaryGraderResult(
}
return null
}
// ============================================================================
// Grader Options
// ============================================================================
export interface GraderOptions {
apiKey: string
baseUrl?: string
model?: string
}
export function resolveGraderOptions(config: EvalConfig): GraderOptions | null {
const keyValue = config.grader_api_key_env || 'OPENAI_API_KEY'
// If it looks like an env var name (ALL_CAPS), resolve from env; otherwise use directly
const apiKey = /^[A-Z][A-Z0-9_]*$/.test(keyValue)
? process.env[keyValue]
: keyValue
if (!apiKey) {
return null
}
return {
apiKey,
baseUrl: config.grader_base_url,
model: config.grader_model,
}
}
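
Assuming getPrimaryGraderResult walks PASS_FAIL_GRADER_ORDER front to back (its loop body is elided in this hunk), the earliest configured grader decides pass/fail:

```typescript
const primary = getPrimaryGraderResult({
  performance_grader: { pass: true, score: 0.9 },
  agisdk_state_diff: { pass: false, score: 0 },
})
// Under that assumption: primary?.name === 'agisdk_state_diff' with pass === false,
// because agisdk_state_diff precedes performance_grader in the order.
```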

View File

@@ -19,37 +19,9 @@ export const OrchestratorExecutorConfigSchema = z.object({
}),
})
export const GeminiComputerUseConfigSchema = z.object({
type: z.literal('gemini-computer-use'),
apiKey: z
.string()
.describe('API key or env var name (e.g., GOOGLE_AI_API_KEY)'),
screenSize: z
.object({
width: z.number().int().min(800).max(2560).default(1440),
height: z.number().int().min(600).max(1440).default(900),
})
.optional(),
turnLimit: z.number().int().min(1).max(100).default(30),
})
export const YutoriNavigatorConfigSchema = z.object({
type: z.literal('yutori-navigator'),
apiKey: z.string().describe('API key or env var name (e.g., YUTORI_API_KEY)'),
screenSize: z
.object({
width: z.number().int().min(800).max(2560).default(1280),
height: z.number().int().min(600).max(1440).default(800),
})
.optional(),
turnLimit: z.number().int().min(1).max(100).default(30),
})
export const AgentConfigSchema = z.discriminatedUnion('type', [
SingleAgentConfigSchema,
OrchestratorExecutorConfigSchema,
GeminiComputerUseConfigSchema,
YutoriNavigatorConfigSchema,
])
export const EvalConfigSchema = z.object({
@@ -67,9 +39,6 @@ export const EvalConfigSchema = z.object({
headless: z.boolean().optional().default(false),
}),
graders: z.array(z.string()).optional(),
grader_model: z.string().optional(),
grader_api_key_env: z.string().optional(),
grader_base_url: z.string().url().optional(),
timeout_ms: z.number().int().min(30000).max(3600000).optional(),
captcha: z
.object({
@@ -84,9 +53,5 @@ export type SingleAgentConfig = z.infer<typeof SingleAgentConfigSchema>
export type OrchestratorExecutorConfig = z.infer<
typeof OrchestratorExecutorConfigSchema
>
export type GeminiComputerUseConfig = z.infer<
typeof GeminiComputerUseConfigSchema
>
export type YutoriNavigatorConfig = z.infer<typeof YutoriNavigatorConfigSchema>
export type AgentConfig = z.infer<typeof AgentConfigSchema>
export type EvalConfig = z.infer<typeof EvalConfigSchema>
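
One observable effect of the trimmed union, as a quick sketch:

```typescript
// Zod discriminated unions reject variants that are no longer members.
const res = AgentConfigSchema.safeParse({ type: 'yutori-navigator' })
console.log(res.success) // false: the agent type was removed above
```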

View File

@@ -4,14 +4,10 @@ export {
AgentConfigSchema,
type EvalConfig,
EvalConfigSchema,
type GeminiComputerUseConfig,
GeminiComputerUseConfigSchema,
type OrchestratorExecutorConfig,
OrchestratorExecutorConfigSchema,
type SingleAgentConfig,
SingleAgentConfigSchema,
type YutoriNavigatorConfig,
YutoriNavigatorConfigSchema,
} from './config'
// Error types
export {

View File

@@ -13,12 +13,7 @@ export const GraderResultSchema = z.object({
// Agent config in metadata
const AgentConfigMetaSchema = z
.object({
type: z.enum([
'single',
'orchestrator-executor',
'gemini-computer-use',
'yutori-navigator',
]),
type: z.enum(['single', 'orchestrator-executor']),
model: z.string().optional(),
})
.passthrough()

View File

@@ -48,43 +48,25 @@ export async function validateConfig(
// Check if API key env vars are set (apiKey field contains env var name)
const envVarsToCheck: string[] = []
const isEnvVarName = (s: string) => /^[A-Z][A-Z0-9_]*$/.test(s)
if (config.agent.type === 'single') {
// Skip API key check for browseros provider (uses server's built-in auth)
if (
config.agent.provider !== LLM_PROVIDERS.BROWSEROS &&
config.agent.apiKey
config.agent.apiKey &&
isEnvVarName(config.agent.apiKey)
) {
// If apiKey looks like an env var name, check if it's set
if (/^[A-Z][A-Z0-9_]*$/.test(config.agent.apiKey)) {
envVarsToCheck.push(config.agent.apiKey)
}
envVarsToCheck.push(config.agent.apiKey)
}
} else if (config.agent.type === 'orchestrator-executor') {
if (config.agent.orchestrator.apiKey) {
if (/^[A-Z][A-Z0-9_]*$/.test(config.agent.orchestrator.apiKey)) {
envVarsToCheck.push(config.agent.orchestrator.apiKey)
}
} else {
const { orchestrator, executor } = config.agent
if (orchestrator.apiKey && isEnvVarName(orchestrator.apiKey)) {
envVarsToCheck.push(orchestrator.apiKey)
}
if (config.agent.executor.apiKey) {
if (/^[A-Z][A-Z0-9_]*$/.test(config.agent.executor.apiKey)) {
envVarsToCheck.push(config.agent.executor.apiKey)
}
if (executor.apiKey && isEnvVarName(executor.apiKey)) {
envVarsToCheck.push(executor.apiKey)
}
} else if (config.agent.type === 'gemini-computer-use') {
// Gemini Computer Use agent
if (config.agent.apiKey) {
if (/^[A-Z][A-Z0-9_]*$/.test(config.agent.apiKey)) {
envVarsToCheck.push(config.agent.apiKey)
}
}
}
// Grader API key is checked at runtime - just warn if not set
const graderKeyEnv = config.grader_api_key_env || 'OPENAI_API_KEY'
if (!process.env[graderKeyEnv]) {
warnings.push(
`Grader API key not set (${graderKeyEnv}). Grading will fail.`,
)
}
for (const envVar of [...new Set(envVarsToCheck)]) {
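
The hoisted isEnvVarName helper encodes the ALL_CAPS convention once. For illustration:

```typescript
const isEnvVarName = (s: string) => /^[A-Z][A-Z0-9_]*$/.test(s)

isEnvVarName('OPENAI_API_KEY') // true  -> looked up in process.env
isEnvVarName('sk-live-abc123') // false -> treated as a literal key value
```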

View File

@@ -1,39 +0,0 @@
/**
* Development configuration utilities
* Reads ports from config.dev.json to stay in sync with BrowserOS
*/
import { readFileSync } from 'node:fs'
import { dirname, join } from 'node:path'
import { fileURLToPath } from 'node:url'
const MONOREPO_ROOT = join(
dirname(fileURLToPath(import.meta.url)),
'../../../..',
)
interface DevConfig {
ports: {
cdp: number
http_mcp: number
agent: number
extension: number
}
}
export interface EvalPorts {
cdp: number
server: number
extension: number
}
export function getEvalPorts(): EvalPorts {
const configPath = join(MONOREPO_ROOT, 'config.dev.json')
const config = JSON.parse(readFileSync(configPath, 'utf-8')) as DevConfig
return {
cdp: config.ports.cdp,
server: config.ports.http_mcp,
extension: config.ports.extension,
}
}