Compare commits

...

33 Commits

Author SHA1 Message Date
Nikhil Sonti
6ee306236e chore(eval): colocate grader python evaluators 2026-04-29 17:16:58 -07:00
Nikhil Sonti
0afc59cda1 chore(eval): organize config layouts 2026-04-29 17:01:25 -07:00
Nikhil Sonti
eb8faa931a docs(eval): explain suites and variants 2026-04-29 16:38:54 -07:00
Nikhil Sonti
be70170313 docs(eval): add env example 2026-04-29 16:10:27 -07:00
Nikhil Sonti
0661197f5b fix: address review feedback for PR #875 2026-04-29 16:00:56 -07:00
Nikhil Sonti
c4e7824266 chore(eval): verify pipeline refactor 2026-04-29 15:47:09 -07:00
Nikhil Sonti
22f71a36c5 docs(eval): document suite pipeline 2026-04-29 15:45:27 -07:00
Nikhil Sonti
d49986d0b3 ci(eval): migrate weekly workflow to eval cli 2026-04-29 15:43:56 -07:00
Nikhil Sonti
acdd394585 feat(eval): add r2 publisher module 2026-04-29 15:42:58 -07:00
Nikhil Sonti
219fdf1e28 feat(eval): add workflow compatible cli 2026-04-29 15:40:05 -07:00
Nikhil Sonti
014f71d227 refactor(eval): split clado backend 2026-04-29 15:34:09 -07:00
Nikhil Sonti
876dea4d56 refactor(eval): add executor backend boundary 2026-04-29 15:28:56 -07:00
Nikhil Sonti
fca7d4cbcb refactor(eval): rename runner layers 2026-04-29 15:27:12 -07:00
Nikhil Sonti
e1bfadb075 feat(eval): persist grader artifacts 2026-04-29 15:25:42 -07:00
Nikhil Sonti
aa0d9b96ef refactor(eval): add shared grader contract 2026-04-29 15:23:41 -07:00
Nikhil Sonti
1c9604b5fa feat(eval): add stable run artifacts 2026-04-29 15:22:10 -07:00
Nikhil Sonti
685266a1d8 feat(eval): add suite variant config bridge 2026-04-29 15:20:45 -07:00
Nikhil
561f2baf97 fix(eval): split AGISDK smoke and full configs (#871)
* fix(eval): split agisdk smoke and full configs

* fix(eval): default agisdk smoke to openrouter
2026-04-29 13:38:55 -07:00
shivammittal274
df0f45dd29 Feat: eval debug dev ci (#869)
* chore(eval): instrument server startup to root-cause dev CI health-check timeouts

Three diagnostics + one config swap to investigate why the eval-weekly
workflow has been failing on dev since 2026-04-25 with "Server health
check timed out" (every worker, every retry).

Background:
- Last successful weekly eval on dev: 2026-04-18 (sha f5a2b73)
- Since then, ~30 server commits landed including Lima/VM runtime,
  OpenClaw service, ACL system, ACP SDK — 108 server files changed,
  ~13K LOC added.
- Server process spawns cleanly in CI (PID logged) but never binds
  /health within the 30s eval-side timeout. Static analysis finds no
  obvious blocker; we need runtime evidence.

Changes:

1. apps/server/package.json — add `start:ci` script (no `--watch`).
   The default `start` uses `bun --watch` which forks a child process
   that watches every file in the import graph. Dev's graph is ~108
   files larger than main's; on a cold CI runner the watcher setup is a
   plausible source of multi-second startup overhead.

2. apps/eval/src/runner/browseros-app-manager.ts:
   - Use `start:ci` when `process.env.CI` is set (true on
     GitHub-hosted runners by default), else `start`.
   - Capture per-worker server stderr to /tmp/browseros-server-logs/
     instead of ignoring it. Without this we have no visibility into
     why the server is hung pre-/health.
   - Bump SERVER_HEALTH_TIMEOUT_MS 30s -> 90s. Dev's larger module
     graph may simply need more cold-start time on CI.

3. .github/workflows/eval-weekly.yml — upload the server logs dir as a
   workflow artifact (always, not just on success) so we can post-mortem
   any startup failure on the next run.

4. configs/agisdk-real-smoke.json — swap K2.5 from OpenRouter ->
   Fireworks (bypasses the OpenRouter per-key spend cap that has been
   eating recent runs) and drop num_workers 10 -> 4 (well below the
   Fireworks per-account TPM threshold that overwhelmed the original
   2026-04-23 run).

Plan: trigger the eval-weekly workflow on this branch with the agisdk
config and observe (a) whether it gets past server startup, and
(b) if it doesn't, what the captured server stderr says.

* fix(eval): capture stdout too — pino logger writes to stdout, not stderr

Previous diagnostic patch only redirected stderr; the captured per-worker
log files came back as 0 bytes because the server uses pino which writes
all log output to stdout (fd 1), not stderr (fd 2). Capture both into
the same file.
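
The fix amounts to handing the same fd to both stdio slots on the spawn. A minimal Node sketch of the idea (illustrative, not the actual browseros-app-manager code):

```typescript
import { openSync, closeSync, readFileSync, mkdtempSync } from "node:fs";
import { spawnSync } from "node:child_process";
import { tmpdir } from "node:os";
import { join } from "node:path";

// Open one log file and pass the SAME fd as both stdout (fd 1) and
// stderr (fd 2) of the child, so pino's stdout output and any stderr
// output land in the same per-worker log file.
const logPath = join(mkdtempSync(join(tmpdir(), "srv-logs-")), "worker-0.log");
const logFd = openSync(logPath, "a");
spawnSync(
  process.execPath,
  ["-e", 'console.log("from stdout"); console.error("from stderr");'],
  { stdio: ["ignore", logFd, logFd] }, // fd 1 AND fd 2 -> same file
);
closeSync(logFd);
const captured = readFileSync(logPath, "utf8");
```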

* fix(server): catch sync throw from OpenClaw constructor on Linux

The container runtime constructor in OpenClawService throws synchronously
on non-darwin platforms, e.g. GitHub Actions Linux runners. The existing
.catch() on tryAutoStart() only handles async throws inside auto-start —
the sync throw from configureOpenClawService(...) itself propagates up
through Application.start() and crashes the process via index.ts:48
(process.exit(EXIT_CODES.GENERAL_ERROR)).

This is what's been killing dev's eval-weekly CI: the server crashes in
milliseconds, the eval client polls /health, gets nothing, times out.

Fix: wrap the configureOpenClawService call in try/catch matching the
existing .catch() intent (best-effort, don't crash). Server continues
without OpenClaw on platforms where it can't initialize.

Verified by reading captured server stdout from run 25123195126:
  Failed to start server: error: browseros-vm currently supports macOS only
    at buildContainerRuntime (container-runtime-factory.ts:54:11)
    at new OpenClawService (openclaw-service.ts:652:15)
    at configureOpenClawService (openclaw-service.ts:1527:19)
    at start (main.ts:127:5)
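
The shape of the fix is a plain try/catch around the sync call. A hedged sketch (configureOpenClaw and warn stand in for the real configureOpenClawService and logger):

```typescript
// Best-effort configuration matching the intent of the existing .catch():
// a sync throw on unsupported platforms must not crash the server.
function configureBestEffort(
  configureOpenClaw: () => void,
  warn: (msg: string) => void,
): boolean {
  try {
    configureOpenClaw(); // may throw synchronously on non-darwin platforms
    return true;
  } catch (err) {
    warn(`OpenClaw unavailable, continuing without it: ${String(err)}`);
    return false;
  }
}
```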

* fix(server): defer OpenClaw chat client port lookup to request time

apps/server/src/api/server.ts:149 was calling getOpenClawService().getPort()
synchronously when constructing the OpenClawGatewayChatClient inside the
createHttpServer object literal. On non-darwin platforms this throws via
the OpenClawService constructor → buildContainerRuntime, escaping the
try/catch added in 5cf7b765 (which only protected the configureOpenClawService
call further down in main.ts).

Every other getOpenClawService() reference in server.ts is already wrapped
in an arrow function. This was the lone holdout. Make it lazy too: change
the chat client constructor to take getHostPort: () => number instead of
hostPort: number, evaluate it inside streamTurn at request time. Behavior
on darwin is unchanged.

This unblocks dev's eval-weekly CI on Linux runners where OpenClaw isn't
available — the chat endpoint isn't exercised by the eval, so a deferred
throw is acceptable.
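
The lazy-port pattern is small enough to sketch in full. This is an illustrative reduction, not the real chat client:

```typescript
// Take a thunk instead of a number: construction can no longer throw on
// platforms without OpenClaw; the throw is deferred to request time.
class OpenClawGatewayChatClient {
  constructor(private readonly getHostPort: () => number) {}

  streamTurn(): string {
    // Port resolved here, inside the request path, not at server construction.
    return `ws://127.0.0.1:${this.getHostPort()}`;
  }
}
```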

* fix(server): allow Linux to skip OpenClaw via BROWSEROS_SKIP_OPENCLAW=1

Earlier surgical fixes (try/catch in main.ts, lazy chat client port) didn't
unblock dev's Linux CI — same throw kept reproducing. Whether this is bun
caching stale stack frames or a missed eager call site, the safer move is
to fix it at the root: make buildContainerRuntime never throw on Linux
when the runner has explicitly opted out.

Adds BROWSEROS_SKIP_OPENCLAW env check alongside the existing NODE_ENV=test
escape hatch in container-runtime-factory.ts. When set, returns the existing
UnsupportedPlatformTestRuntime stub — server boots normally, /health binds,
any actual OpenClaw API call still fails loudly at request time.

eval-weekly.yml sets the flag for the Linux runner. Darwin behavior and
non-CI Linux behavior unchanged (without the flag they still throw).
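
The factory logic described above can be sketched as follows; names and the runtime shape are assumptions modeled on the message, not the actual container-runtime-factory.ts:

```typescript
interface ContainerRuntime { kind: string }

function buildRuntime(
  platform: string,
  env: Record<string, string | undefined>,
): ContainerRuntime {
  if (platform === "darwin") return { kind: "vm" };
  if (env.NODE_ENV === "test" || env.BROWSEROS_SKIP_OPENCLAW === "1") {
    // Explicit opt-out: return the stub so the server boots and /health
    // binds; actual OpenClaw API calls still fail loudly at request time.
    return { kind: "unsupported-platform-stub" };
  }
  throw new Error("browseros-vm currently supports macOS only");
}
```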

* feat(eval): align Clado action executor with new endpoint contract

David Shan shared the updated Clado BrowserOS Action Model spec.
Changes to match it:

- Bump endpoint URL + model id to the 000159-merged checkpoint
  (clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef)
  in browseros-oe-clado-weekly.json and the README example.
- CLADO_REQUEST_TIMEOUT_MS 120s → 360s. Cold start can take ~5 min;
  the 2-min ceiling was failing every cold-start request.
- Treat HTTP 200 with action=null / parse_error as an INVALID step
  instead of aborting the executor loop. The model can self-correct
  on the next call. Cap consecutive parse failures at 3 to avoid
  infinite loops.
- Capture final_answer from end actions. Surface it in the observation
  back to the orchestrator so the orchestrator's task answer can use
  the model's declared result.
- Add macOS Cmd-* key mappings (M-a, M-c, M-v, M-x → Meta+A/C/V/X).
- Switch screenshot format from webp → png to match the documented
  "PNG or JPEG" contract.
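
The parse-failure cap can be sketched as a streak counter over step results. A hypothetical reduction of the executor-loop logic (field names follow the response contract quoted above; the function name is illustrative):

```typescript
type CladoStep = { action: string | null; parse_error?: string };

const MAX_CONSECUTIVE_PARSE_FAILURES = 3;

// A null action or parse_error counts as an INVALID step rather than an
// abort; a successful parse resets the streak so the model can
// self-correct. Only MAX consecutive failures stop the loop.
function classifySteps(steps: CladoStep[]): "ok" | "abort" {
  let streak = 0;
  for (const step of steps) {
    if (step.action === null || step.parse_error) {
      streak += 1;
      if (streak >= MAX_CONSECUTIVE_PARSE_FAILURES) return "abort";
    } else {
      streak = 0;
    }
  }
  return "ok";
}
```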

* chore(eval): refresh test-clado-api script for new Clado contract

Updated the local smoke-test to match the new Clado endpoint and
response contract:

- New action + health URLs (000159-merged checkpoint).
- Drop the grounding-model branch (orchestrator-executor doesn't
  use it; the README David shared only documents the action model).
- Health-check waits up to 6 minutes for cold start with a 30s
  warning so the operator knows it's spinning up.
- Print every documented response field (action, x/y, text, key,
  direction, amount, drag start/end, time, final_answer, thinking,
  parse_error, inference_time_seconds).
- Three-step run that exercises a click, a typing continuation
  with formatted history, and an end+final_answer probe.

* chore(eval): point clado weekly config at agisdk-real

Switches the orchestrator-executor + Clado weekly config to run on the
AGI SDK / REAL Bench task set with the deterministic agisdk_state_diff
grader. Matches the orchestrator-executor smoke target (Fireworks K2.5
orchestrator + Clado action executor) we want to track week-over-week.

* chore(eval): run clado weekly headless

Default to headless so the weekly job (and local repros) don't pop ten
visible Chrome windows. Set headless=false locally if you need to watch
a worker.

* fix(eval): address Greptile P1+P2 on server log fd handling

P1: openSync was outside the mkdirSync try/catch, so a swallowed mkdir
failure (e.g. unwritable custom BROWSEROS_SERVER_LOG_DIR) would leave the
log directory missing and crash the server spawn with ENOENT. Move openSync
into the same try block; fall back to /dev/null so spawn always succeeds.

P2: the log fd was opened on every server start but never closed. Each
restart attempt leaked one fd across all workers — over a long eval run
that could exhaust the process fd limit. Track the fd on the manager and
closeSync it in killApp() right after the server process exits (the child's
dup keeps the file open until it exits, so we don't truncate output).
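
The P1 fix is essentially one shared try block with a /dev/null fallback. A minimal sketch with illustrative names (not the actual manager code):

```typescript
import { mkdirSync, openSync, closeSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import { tmpdir } from "node:os";

// mkdir and open share ONE try block, so a swallowed mkdir failure can't
// leave openSync pointing at a missing directory (the P1 ENOENT crash).
function openServerLogFd(logDir: string, fileName: string): number {
  try {
    mkdirSync(logDir, { recursive: true });
    return openSync(join(logDir, fileName), "a");
  } catch {
    // Unwritable log dir: fall back to /dev/null so spawn always succeeds.
    return openSync("/dev/null", "a");
  }
}
```

Per the P2 fix, the returned fd would be tracked on the manager and passed to closeSync once the server process exits, rather than leaked on every restart.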
2026-04-30 01:33:49 +05:30
Nikhil
edfc5c751c fix: align OpenClaw gateway image with VM cache (#868)
* fix: load OpenClaw gateway image from VM cache

* fix: use container port for OpenClaw ACP bridge

* fix: address review feedback for PR #868
2026-04-29 12:11:00 -07:00
Nikhil
471256f31c fix: stop passing native permission flags to ACP adapters (#867) 2026-04-29 11:07:51 -07:00
Nikhil
4c90ca696b fix(agents): connect OpenClaw ACP inside gateway container (#866) 2026-04-29 11:07:29 -07:00
Nikhil
f2ac87d7c3 feat: show created agents in sidepanel (#865)
* feat(agent): list created agents in sidepanel target catalog

* feat(agent): show created agents in sidepanel selector

* feat(server): add sidepanel chat route for created agents

* feat(agent): route sidepanel agent sends by agent id

* chore(agent): retire virtual sidepanel acp targets

* fix: address review feedback for PR #865
2026-04-29 10:15:58 -07:00
shivammittal274
231bd6821d fix(eval): pin agisdk version + exclude 4 invalid tasks (Phase 2 dataset hygiene) (#844)
* chore(eval): pin agisdk version to prevent silent dataset drift

`pip install agisdk` previously fetched whatever version pip resolved at
CI time. If agisdk publishes a new version with changed task definitions
or grader behavior, the weekly eval silently shifts under our feet —
making "did the score move because of code or data?" unanswerable.

Pin to agisdk==0.3.5 (the version we currently develop against). Bump
intentionally with a documented re-baseline run.

* fix(eval): exclude 4 more tasks identified by 8-trial never-passing audit

After 8 trials across K2.5 + Opus 4.6 (Phase 1 and Phase 2), 5 tasks
never passed. Per-task root-cause investigation via parallel deep-dive
subagents flagged 4 of them as fundamentally unfixable in the eval
pipeline as it stands; the 5th (`dashdish-5`) is a prompt-rule fix
that stays in.

- gocalendar-7: goal/grader contradiction. Goal says "move event to
  July 19, 10 AM"; grader expects `eventsDiff.updated.*.start ==
  "2024-07-18T17:00Z"` (= July 18, 10 AM PDT — same day, 1 hour shift).
  Even after the Phase 2 HTML5 dnd dispatch fix correctly populates
  `eventsDiff.updated`, the values are July 19 (matching the goal),
  which the grader rejects.

- staynb-5: grader hardcodes literal `'Oct 13 2025'` and `'Oct 23 2025'`
  year strings. The staynb date picker interprets bare "Oct 13" as the
  most-recent-past instance (currently 2024 since today is 2026), not
  2025. No agent can produce a persisted date string containing 2025.

- staynb-9: under-specified task. Goal says "maximum number of guests
  supported"; grader requires the very specific string "32 Guests, 16
  Infants" — encoding UI knowledge (Adults+Children=Guests display,
  Infants render separately, per-category cap=16, Pets excluded) that
  isn't in the prompt. Even Opus 4.6 stopped at 16 across 3 trials.

- opendining-3: grader requires `contains(booking.date, '2024-07-20')`
  but the React-controlled date textbox flakily no-ops on `fill`. 3/8
  trial pass rate is essentially coin-flip noise driven by tool-fidelity
  variance rather than agent capability. Removing to reduce score noise;
  Phase 2 fill post-validate warning helps when it does work, but the
  task's signal-to-noise is too low for the eval set.

Dataset goes from 40 -> 36 tasks. Total EXCLUDED_TASKS now 11 entries.

Validated by 8-trial pass-record audit; deep-dive notes saved to
plans/audits/.
2026-04-29 22:07:53 +05:30
Dani Akash
a228c278c6 feat(agents): background-resilient chat — turns survive tab disconnect (#863)
* feat(agents): decouple chat turn lifecycle from SSE response

Introduce a per-process ActiveTurnRegistry that owns each agent turn's
lifecycle and a ring-buffered event stream, so chat tabs that close,
refresh, or navigate away no longer cancel the in-flight turn. New
endpoints:

  POST   /agents/:id/chat          starts a turn (now returns 409 when
                                   one is already running, with the
                                   active turnId for attaching)
  GET    /agents/:id/chat/active   reports the running turn for a UI
                                   that just mounted
  GET    /agents/:id/chat/stream   subscribes to a turn; supports
                                   Last-Event-ID resume via per-event
                                   seq ids
  POST   /agents/:id/chat/cancel   explicit cancel — fetch abort no
                                   longer affects the underlying turn

The chat hook now captures X-Turn-Id, tracks lastSeq from SSE id lines,
re-attaches on mount when the server still has an active turn, and
routes Stop through the cancel endpoint. The runtime call uses the
registry's per-turn AbortController instead of the HTTP request signal,
which is the core decoupling that lets turns outlive their initiator.
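
The ring-buffered event stream with Last-Event-ID resume reduces to a drop-oldest buffer keyed by monotonic seq ids. A minimal sketch (not the actual ActiveTurnRegistry API; capacity is illustrative):

```typescript
// Drop-oldest ring buffer of turn events. Because the terminal frame is
// always the most recently pushed, drop-oldest never evicts it.
class TurnEventBuffer {
  private events: Array<{ seq: number; data: string }> = [];
  private nextSeq = 1;

  constructor(private readonly capacity = 4) {}

  push(data: string): number {
    const seq = this.nextSeq++;
    this.events.push({ seq, data });
    if (this.events.length > this.capacity) this.events.shift(); // drop-oldest
    return seq;
  }

  // Last-Event-ID style resume: replay everything after the client's lastSeq.
  after(lastSeq: number): string[] {
    return this.events.filter((e) => e.seq > lastSeq).map((e) => e.data);
  }
}
```

A reconnecting subscriber would send its lastSeq (from the SSE id lines) and receive only the events it missed, up to the buffer's capacity.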

* feat(agents): add ActiveTurnRegistry primitive backing the new chat lifecycle

The previous commit referenced these files in tests and the harness
service but global gitignore swallowed them on the first add.

The registry owns the per-turn ring buffer (drop-oldest, terminal frame
preserved), the per-turn AbortController, and subscriber fan-out used
by /chat/stream resume.
2026-04-29 21:01:06 +05:30
Dani Akash
e2ec1991cf feat(agents): redesign the agent command center for multi-adapter use (#861)
* feat(agents): redesign agent rail to match the rest of the app

Reshape the `/agents` page so it reads as a sibling of `/scheduled`
and `/soul` and adapts to the multi-adapter world (OpenClaw, Claude
Code, Codex). Visual scaffolding only in this commit — per-agent
liveness state ships as `unknown` until the server-side activity
tracker lands.

  - New `AgentsHeader` mirrors `SoulHeader`/`ScheduledTasksHeader`:
    accent bot tile, title, descriptive subtitle, "+ New Agent"
    button. Replaces the loose top toolbar that mixed page-level and
    OpenClaw-lifecycle controls.
  - New `GatewayStatusBar` collects the OpenClaw lifecycle pills
    (running, control plane connected) plus the Terminal/Refresh
    affordances into a single labeled bar that only renders when the
    gateway is running AND there is at least one OpenClaw agent in
    the merged list.
  - New `AgentRowCard` per agent: adapter tile with liveness dot,
    name + status badge, adapter/model/reasoning chips, last-used
    relative time + truncated workspace path, primary "Chat" button,
    overflow menu (Copy id / Rename* / Reset history* / Delete).
    Rename + Reset are disabled with "coming soon" tooltips until
    the corresponding endpoints ship; Delete is hidden for the
    protected `main` agent.
  - New `AgentsEmptyState` mirrors the scheduled-tasks empty card.
  - New `AdapterIcon` + `LivenessDot` + `agent-display.helpers.ts`
    keep the row card focused on layout; helpers cover display name
    fallbacks for legacy `oc-<uuid>` titles, workspace label rules,
    and a tiny relative-time formatter.
  - `AgentList` now sorts by `lastUsedAt` desc with `null`s falling
    to the bottom; the gateway's `main` agent is pinned to the top
    only while it has zero turns so a fresh install has an obvious
    starting point. The list also threads a per-agent activity map
    so future commits can light up working/idle/asleep without
    reshuffling the API.
  - `AgentsPage` swaps to the standard `fade-in slide-in-from-bottom-5
    animate-in space-y-6 duration-500` shell and threads a
    `harnessAgentLookup` Map down to the row card so adapter chips
    and reasoning effort render correctly without a re-fetch.

* feat(agents): wire per-agent liveness end-to-end into the rail

Closes the placeholder `unknown` dot from the redesign's first
commit. The rail now shows real working / idle / asleep / error
states per agent, with `lastUsedAt` driving the recency sort.

Server side:
  - `AgentHarnessService` keeps an in-memory activity tracker keyed
    by agentId. `notifyTurnStarted` flips an entry to `working`,
    `notifyTurnEnded({ok})` either drops it (success) or pins it to
    `error` (failure / error event).
  - `send()` wraps the runtime stream so the lifecycle hook fires
    exactly once on natural close, error event, downstream cancel,
    or thrown setup. The runtime itself stays unchanged — fork is
    contained at the harness layer.
  - New `listAgentsWithActivity()` method enriches every agent with
    `{ status, lastUsedAt }`. lastUsedAt is read from the acpx
    session record's last persisted item via `runtime.getHistory`,
    so it survives server restart even though the activity map
    doesn't.
  - Status derivation: `working`/`error` take precedence; otherwise
    timestamp-based — `idle` until 15 min of silence, then `asleep`.
    Never-used agents resolve to `idle` (asleep implies "was active,
    went quiet").
  - `GET /agents` returns the enriched shape.
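
The status derivation above is a small pure function. A sketch under the stated rules (names are illustrative, not the harness service API):

```typescript
type AgentStatus = "working" | "error" | "idle" | "asleep";

const ASLEEP_AFTER_MS = 15 * 60 * 1000; // 15 min of silence -> asleep

function deriveStatus(
  tracked: "working" | "error" | undefined, // in-memory activity entry, if any
  lastUsedAt: number | null,                // epoch ms from persisted history
  now: number,
): AgentStatus {
  if (tracked) return tracked; // working/error take precedence
  if (lastUsedAt === null) return "idle"; // never-used: asleep implies "was active"
  return now - lastUsedAt >= ASLEEP_AFTER_MS ? "asleep" : "idle";
}
```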

Client side:
  - `HarnessAgent` UI type extended with optional `status` +
    `lastUsedAt` so older deployments still typecheck.
  - `useHarnessAgents` flips on `refetchInterval: 5_000` (with
    `refetchIntervalInBackground: false` so hidden tabs go quiet)
    so the per-row dots and last-used copy stay fresh without a
    websocket.
  - `AgentsPage` builds an activity map from the harness listing
    response and threads it into `AgentList` → `AgentRowCard`. The
    sort by `lastUsedAt` desc (already in the row card) now has
    real data to operate on.

Tests:
  - New `marks an agent working while a turn streams and idle once
    it ends` exercises the wrap; uses a held upstream stream so
    the in-flight `working` state is observable.
  - New `flips to error when a turn emits an error event`.

* fix(agents): dedupe agent rail when /claw/agents and /agents share an id

The agents page was rendering every OpenClaw agent twice — once from
the legacy `/claw/agents` listing (`useOpenClawAgents`) and once from
the harness `/agents` listing (`useHarnessAgents`). Post Step 9
backfill the harness store contains every gateway agent, so the
overlap is the rule, not the exception.

Mirror the dedup the chat-panel layout already does: when a gateway
agent's id appears in the harness listing, drop the legacy entry and
keep the harness one (it has adapter/model/reasoning/status/lastUsedAt
the chat path actually consumes).

* feat(agents): swap GatewayStatusBar refresh icon for a Restart Gateway button + tooltips

The manual refresh became redundant once `useHarnessAgents` and
`useOpenClawStatus` started polling on a 5s interval — every visible
field self-refreshes within seconds. The previous AgentsPageHeader
had a real Restart action that the redesign dropped; reinstate it on
the bar so a wedged gateway is one click away again.

  - GatewayStatusBar: dropped the `RotateCcw` refresh icon and the
    `onRefresh` prop. Added `onRestart` + `actionInProgress` props;
    the button shows a spinner while a gateway lifecycle mutation is
    in flight.
  - Both Terminal and Restart Gateway buttons get tooltips explaining
    what they do — Terminal as a power-user shell escape hatch,
    Restart for unsticking a wedged gateway or after manual config
    edits.
  - AgentsPage: drop the now-unused `refreshAll` helper and the
    `refetchStatus`/`refetchAdapters`/`refetchOpenClawAgents`
    destructures it depended on. Wire `restartOpenClaw` (already
    pulled from `useOpenClawMutations`) through
    `runWithPageErrorHandling` like the legacy header did.

* feat(agents): consolidate gateway status into the /agents listing

Folds the gateway lifecycle snapshot into the harness listing so the
agents page polls one endpoint instead of two. Drops the dead
`/claw/status` call from the command center while keeping every UI
affordance the page already shipped (Running / Control plane
connected pills, GatewayStateCards setup/start prompts,
ControlPlaneAlert for degraded states).

Server side:
  - `OpenClawProvisioner.getStatus()` (optional) — when wired, returns
    the same `GatewayStatusSnapshot` shape `/claw/status` does.
  - `AgentHarnessService.getGatewayStatus()` — best-effort wrapper
    around the provisioner method; logs and swallows errors so a
    transient gateway issue doesn't 500 the listing endpoint.
  - `GET /agents` now returns `{agents, gateway}` in a single
    `Promise.all`. Both fields are independent — agents enrichment
    succeeds even if the gateway snapshot is null.
  - `server.ts` wires `getOpenClawService().getStatus()` into the
    provisioner accessor object alongside `createAgent` /
    `removeAgent` / `listAgents`.

Client side:
  - `useHarnessAgents` returns `{harnessAgents, gateway}` (plus the
    legacy `agents` mapping). Same 5s `refetchInterval` as before —
    one round-trip drives the per-row liveness AND the gateway pills.
  - `AgentsPage` drops `useOpenClawStatus` entirely; `status` comes
    from the harness query. Loader + error/lifecycle plumbing
    rewired around the harness query's loading/error.
  - `agents-page-utils.getInlineError` and `getAgentsLoading` lose
    the now-redundant `statusError` / `statusLoading` /
    `openClawAgentsEnabled` params.

The chat-panel layout (`agent-command-layout.tsx`) still consumes
`useOpenClawStatus(5000)` for now — left intact per the user's "only
the command center" scope. Folding that one in is a separate,
smaller pass once we're sure no regression slipped here.

* test(agents): teach the route fake service about the new listing shape

PR #861 CI surfaced two failures in tests/api/routes/agents.test.ts:
both call `GET /agents` and the route handler now invokes
`service.listAgentsWithActivity()` + `service.getGatewayStatus()`
which the fake created here didn't implement. Add both methods to
the fake (returning idle / null) and update the empty-list assertion
to expect the new `{agents, gateway}` envelope.
2026-04-29 19:03:29 +05:30
Dani Akash
0c84547e8f feat(agents): migrate OpenClaw chat onto the unified harness/ACP path (#859)
* chore(acp): smoke-test ACP capabilities against running gateway

Adds apps/server/scripts/acp-smoke.ts which spawns `openclaw acp`
inside the gateway container and exercises every method we plan to
depend on: initialize, newSession, prompt (text + image), cancel,
listSessions, loadSession.

SDK pinned to 0.19.1 (Bun's minimum-release-age policy blocks 0.20+
which were released < 7 days ago).

Findings (full notes in plan outcomes):
- promptCapabilities advertises image:true but the model does NOT see
  image bytes — silently dropped at the bridge.
- sessionCapabilities advertises {list:{}} but session/list throws
  "Method not found": stale capability advertising.
- loadSession works; replays user/assistant/thought text and
  session_info/usage/commands updates. No tool_call replay, as
  documented.
- cancel works end-to-end: stopReason=cancelled.
- closeSession/resumeSession are not on ClientSideConnection in
  0.19.1; kill child to close, use loadSession for rebind.

Plan revisions triggered by spike are recorded in
plans/browseros-ai/BrowserOS/features/2026-04-28-2310-claude-code-acp-implementation-roadmap.md.

* chore(acp): re-run smoke on SDK 0.21.0 and add mode/config/auth scenarios

After bypassing Bun's minimum-release-age and upgrading the SDK to
0.21.0, restore the previously-skipped resume/close paths and add
three new scenarios: mode (setSessionMode), config (setSessionConfigOption,
correct configId field), and auth (authenticate noop).

Findings, all bridge-side (independent of SDK):
- session/list, session/resume, session/close all throw -32601 on
  OpenClaw 2026.4.12 — capability advertising is stale.
- Image content blocks silently dropped; model never sees the bytes.
- setSessionMode and setSessionConfigOption work; latter requires
  `configId` (not `optionId`) per the schema.
- loadSession replays user/assistant/thought text + session_info +
  usage + available_commands; no tool_call replay (documented).
- authenticate is a noop on OpenClaw (no authMethods advertised).

Plan outcomes updated with full method-support matrix.

* chore(deps): promote @agentclientprotocol/sdk to a runtime dependency

The smoke script in apps/server/scripts/acp-smoke.ts used the SDK as
devDependency. The upcoming ACP bridge (apps/server/src/api/services/acp/)
needs it at runtime, not just for tooling. Move the entry from
devDependencies to dependencies, alphabetically first under @a*.

Pinned to 0.21.0 — same version the smoke script validated against.
README gains a small Dependencies note pointing at the future bridge
location.

No code changes yet. The bridge wiring lands in subsequent commits.

* fix(openclaw): wire LlmProvider.supportsImages through to OpenClaw model config

When BrowserOS sets up a custom OpenAI-compat provider on the gateway,
the agent UI's "Supports Image" flag (LlmProviderConfig.supportsImages)
was being dropped on the floor. As a result the persisted model entry
had no `input` field, OpenClaw defaulted it to ['text'], and image_url
content parts were silently stripped before the model saw them.

Fix:
- Extend OpenClawSetupInput / OpenClawAgentMutationInput on the agent
  side (useOpenClaw.ts) and the route body schema + SetupInput +
  createAgent input on the server side with `supportsImages?: boolean`.
- AgentsPage forwards `llmOption?.supportsImages` from the selected
  LlmProviderConfig in both handleSetup and handleCreate.
- provider-map.resolveSupportedOpenClawProvider emits
  `input: ['text', 'image']` on the model entry when the flag is
  truthy; otherwise emits the explicit `['text']` so the value is
  always pinned (avoids relying on OpenClaw's implicit default).
- applyBrowserosConfig adds `tools.media.image.enabled = true` to the
  bootstrap batch so the gateway's image-understanding pipeline is
  always wired up — per-model `input` still gates which models see
  images, this just enables the global path.

ACP image content blocks are still dropped by the OpenClaw bridge —
that's a separate bridge bug, not addressed here. This commit
restores image support for the OpenAI-compat /v1/chat/completions
path that the upcoming ACP chat panel will use as a carve-out for
image-bearing prompts.

Existing custom-provider configs are NOT auto-migrated; users will
re-acquire image support either by re-running setup or by editing
their model entries' `input` field manually. A migration pass for
legacy installs is not in scope for this commit because the
"supportsImages" intent isn't recoverable from the persisted config
alone — the source of truth is the LlmProvider record on the agent
side.
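
The provider-map rule reduces to one branch. A tiny sketch (function name is illustrative):

```typescript
// Always pin the model entry's `input` list explicitly, in both branches,
// so nothing relies on OpenClaw's implicit ['text'] default.
function modelInput(supportsImages: boolean | undefined): string[] {
  return supportsImages ? ["text", "image"] : ["text"];
}
```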

* feat(agents): add OpenClaw to AgentAdapter union and catalog

Extends AgentAdapter to 'claude' | 'codex' | 'openclaw' and adds the
OpenClaw entry to AGENT_ADAPTER_CATALOG. The new entry has:

- defaultModelId: 'default' — OpenClaw's ACP bridge does not surface
  per-session model selection (verified during the ACP spike), so
  models live in the OpenClawService config, not in the adapter
  catalog. AgentDefinition.modelId carries the gateway-side model
  name for display only.
- models: [] — empty list signals "no per-session model picker" in
  the UI; isSupportedAgentModel('openclaw', undefined|'default')
  returns true via the existing fallback path.
- reasoningEfforts mirror OpenClaw's session-level `thought_level`
  config option (off / minimal / low / medium / high / adaptive).

Also extends:
- isAgentAdapter type guard recognizes 'openclaw'
- HarnessAgentAdapter union on the extension side
- agents.test.ts createAgent fake type
- agent-catalog.test.ts asserts on the new entry, empty model list
  passthrough behavior, and OpenClaw's reasoning effort set

Lockfile delta is the workspace SDK pin reconciling 0.20.0 (taken
from dev's lock) up to our package.json's 0.21.0 (added in
c1d987ea). acpx still uses 0.20.0 transitively — both are present.

No runtime wiring yet — the registry override and AcpxRuntime
plumbing land in subsequent commits.

* feat(agents): plumb OpenClaw gateway accessors into AcpxRuntime

Adds an optional `openclawGateway` accessor to AcpxRuntime so the
upcoming registry override (Step 4) can spawn `openclaw acp` inside
the gateway container with the right port, token, and container/VM
identity. All accessors are getter-shaped so values stay live across
gateway restarts (port can change, token can rotate).

The accessor is threaded:
  server.ts → createAgentRoutes → AgentHarnessService → AcpxRuntime
                            ↘ sidepanel lazy AcpxRuntime

Also adds OpenClawService.getGatewayToken() returning the in-memory
token string. We pass it via OPENCLAW_GATEWAY_TOKEN env var on the
spawn (per OpenClaw's documented env-var precedence) instead of via
`--token` flag (which leaks to ps aux) or `--token-file` path (no
discrete token file lives inside the container — the token is nested
inside openclaw.json).

Wiring is dormant — the registry override that consumes these
accessors lands in Step 4. Typecheck + existing acpx/harness/routes
tests pass unchanged.

* refactor(agents): scrub local plan-step references from code comments

Replaces forward-looking comments that referenced internal plan
steps (e.g. "Step 4 wires this into…") with comments that justify
the code on its own merits. Plan files live locally on the
contributor's machine, so cross-references are noise to the rest of
the project.

No behavior change.

* feat(agents): spawn openclaw ACP adapter inside the gateway container

When the harness resolves the `openclaw` adapter, it now returns a
command that runs `openclaw acp` inside the bundled gateway container
via `limactl shell <vm> -- nerdctl exec -i ... openclaw acp --url
ws://127.0.0.1:<port>`. This reuses the openclaw binary already
installed alongside the gateway — no host-side openclaw install is
required.

Auth: the gateway token is injected via OPENCLAW_GATEWAY_TOKEN on
the container exec rather than `--token` on the openclaw CLI, so
the secret never appears in `ps aux`.

Banner output: OPENCLAW_HIDE_BANNER=1 and OPENCLAW_SUPPRESS_NOTES=1
keep stdout JSON-RPC-clean.

LIMA_HOME: prefixed via `env LIMA_HOME=<path>` on the resolved
command so the spawned limactl finds the BrowserOS-owned VM (the
server doesn't set LIMA_HOME on its own process env).

When the gateway accessor is absent, falls through to acpx's
built-in openclaw adapter which assumes a host-side install — that
branch will fail at spawn time with a descriptive error.

Verified end-to-end via the existing acp-smoke script during the
Step 0 spike.
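
As a rough sketch, the resolved spawn command could be assembled like this. Argument order and option names are assumptions for illustration; only the env-var-over-flag choice and the LIMA_HOME prefix come from the commit above:

```typescript
// Illustrative command builder — not the actual BrowserOS implementation.
function buildOpenclawAcpArgv(opts: {
  limaHome: string
  vm: string
  container: string
  port: number
  token: string
}): string[] {
  return [
    // `env LIMA_HOME=...` prefix so the spawned limactl finds the right VM.
    'env', `LIMA_HOME=${opts.limaHome}`,
    'limactl', 'shell', opts.vm, '--',
    'nerdctl', 'exec', '-i',
    // The secret travels as an env var on the container exec, never as a
    // CLI flag, so it does not show up in `ps aux`.
    '-e', `OPENCLAW_GATEWAY_TOKEN=${opts.token}`,
    '-e', 'OPENCLAW_HIDE_BANNER=1',
    '-e', 'OPENCLAW_SUPPRESS_NOTES=1',
    opts.container,
    'openclaw', 'acp', '--url', `ws://127.0.0.1:${opts.port}`,
  ]
}
```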

* feat(agents): dual-create OpenClaw harness agents on the gateway

When the harness creates an `openclaw` adapter agent, it now also
provisions a matching agent on the OpenClaw gateway via the existing
CLI path (OpenClawService.createAgent). Symmetric on delete: gateway
removeAgent runs alongside the harness-store delete.

- Adds an OpenClawProvisioner interface (decoupled from OpenClawService
  for testability) and injects it through AgentHarnessService.
- createAgent rolls back the harness record if gateway provisioning
  fails; deleteAgent tolerates gateway-side failures so harness
  identity stays consistent with the user-facing UI.
- New OpenClawProvisionerUnavailableError surfaces as a 503 when an
  openclaw create request lands on a harness with no provisioner
  wired in (instead of a generic 500).
- FileAgentStore mints openclaw agent ids with an 'oc-' prefix so
  the id satisfies the gateway's `^[a-z][a-z0-9-]*$` agent name
  pattern. Other adapters keep raw UUIDs to preserve compatibility.
- POST /agents body schema accepts providerType / providerName /
  baseUrl / apiKey / supportsImages, forwarded to the provisioner
  when adapter='openclaw'.

The agents-page UI still routes openclaw create through the legacy
/claw/agents flow; switching that path to the harness is a separate
UI cutover.

Tests cover: gateway dual-create on success, rollback on gateway
failure, 503 when provisioner is missing, and tolerant delete on
gateway-side failure.
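
The dual-create-with-rollback flow reduces to the following sketch; the interfaces are simplified stand-ins for the real AgentHarnessService / OpenClawProvisioner types:

```typescript
interface Provisioner {
  createAgent(id: string): Promise<void>
}
interface HarnessStore {
  create(id: string): Promise<void>
  delete(id: string): Promise<void>
}

// Gateway agent names must match ^[a-z][a-z0-9-]*$, so openclaw ids get an
// 'oc-' prefix; other adapters keep raw UUIDs for compatibility.
function mintAgentId(adapter: string, uuid: string): string {
  return adapter === 'openclaw' ? `oc-${uuid}` : uuid
}

async function createOpenclawAgent(
  store: HarnessStore,
  provisioner: Provisioner | undefined,
  id: string,
): Promise<void> {
  if (!provisioner) {
    // Surfaced upstream as a 503 instead of a generic 500.
    throw new Error('OpenClawProvisionerUnavailable')
  }
  await store.create(id)
  try {
    await provisioner.createAgent(id)
  } catch (err) {
    // Roll back the harness record so harness identity stays consistent
    // with what the gateway actually holds.
    await store.delete(id)
    throw err
  }
}
```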

* fix(agents): skip catalog model validation for OpenClaw adapter

OpenClaw agents resolve their model from the gateway-side provider
config (set at agent-create time via OpenClawService) rather than
from the harness catalog, which has an empty `models: []` entry by
design. Without this carve-out, every OpenClaw create body fails
parsing with "Invalid modelId" because no concrete model id can
satisfy isSupportedAgentModel('openclaw', ...).

The reasoning-effort check still runs against the catalog (those
values map directly to OpenClaw's session `thought_level` config
option).

* fix(agents): pass --session to openclaw bridge so newSession routes correctly

acpx's AcpClient.createSession calls connection.newSession with cwd
and mcpServers but never forwards the sessionKey. Without it, the
openclaw bridge falls back to a synthetic acp:<uuid> session that
doesn't resolve to any provisioned gateway agent — every harness
chat returns a generic "Internal error" from -32603.

Fix: bake `--session <key>` into the resolved spawn command. The
bridge then uses that as the default session key for any newSession
the bridge receives, routing the turn to the matching gateway agent.

Per-session keying means each openclaw agent gets its own
AcpxCoreRuntime instance (cached by sessionKey on top of the
existing cwd/permissionMode key). This adds one extra runtime per
active openclaw session — claude/codex are unaffected.

Test asserts the resolved command includes the right --session arg.
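
The per-session runtime caching can be pictured with a key helper like this (the key shape is an assumption; the behavior — openclaw keyed by session on top of cwd/permissionMode, claude/codex unaffected — is from the commit):

```typescript
// Hypothetical cache-key helper for the AcpxCoreRuntime cache.
function runtimeCacheKey(opts: {
  adapter: string
  cwd: string
  permissionMode: string
  sessionKey?: string
}): string {
  const base = `${opts.adapter}:${opts.cwd}:${opts.permissionMode}`
  return opts.adapter === 'openclaw' && opts.sessionKey
    ? `${base}:${opts.sessionKey}`
    : base
}
```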

* fix(agents): suppress BrowserOS MCP for openclaw bridge

The openclaw ACP bridge rejects newSession when mcpServers is non-empty
because its provider tooling comes from the gateway, not from ACP-side
MCP servers. Forwarding the BrowserOS HTTP MCP made every harness chat
fail with a JSON-RPC -32603 "Internal error" before the session was even
opened. Claude/codex still need the BrowserOS MCP for browser tooling,
so the carve-out is keyed off whether the runtime is for an openclaw
session.

* feat(agents): route OpenClaw chat through the harness behind a flag

Adds the `feature.useAcpxForOpenClaw` extension storage flag. When on,
OpenClaw agents in the agent-command chat panel use the harness
/agents/<id>/chat SSE and harness history hook instead of the legacy
/claw/agents/<id>/chat. When off, behavior is unchanged.

Also dedupes the agent rail when the same id appears in both stores
(dual-created agents from /claw/agents and /agents) by preferring the
harness entry — without this, every dual-created OpenClaw agent shows
up twice after Step 5.

Image attachments are temporarily disabled when the harness path is
active; the carve-out lands in the next commit.

* fix(agents): keep legacy OpenClaw agents on ClawChat

The previous commit's flag-gated branch routed every `source='openclaw'`
agent through `/agents/<id>/chat` when the flag was on, but the layout
dedup means the only agents that ever reach that branch are legacy
gateway-only entries (`main`, orphan agents from rolled-back creates) —
which by definition have no harness record, so the harness path 404s
and chat is unusable. Source is the only routing signal again: harness
agents go through the harness, legacy agents stay on ClawChat. The
storage flag stays for Step 9/10's migration story.

* feat(agents): expose OpenClaw in sidepanel and route through gateway main

`buildSidepanelChatTargets` now emits a single default ACP target for
adapters with no per-session model picker (OpenClaw, whose model is
configured on the gateway-side agent). Without this, OpenClaw never
appeared in the sidepanel target picker because the catalog entry has
`models: []`.

Sidepanel sessions don't have a dedicated provisioned gateway agent.
The openclaw bridge `--session` flag previously got the raw sidepanel
key (`sidepanel:<convId>:openclaw:...`), which doesn't match any
gateway agent — newSession was accepted but every prompt hung
forever. The bridge command now rewrites non-harness session keys
onto the always-present `main` gateway agent, encoding the original
key as a channel suffix to keep state segregated per conversation.
Verified end-to-end via curl: sidepanel openclaw chat streams
`text-delta` + `finish: stop`.
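
A minimal sketch of that rewrite; the exact key grammar is an assumption, and only the `main` fallback plus the channel-suffix idea are taken from the commit:

```typescript
// Hypothetical rewrite: harness keys pass through; anything else is routed
// onto the always-present `main` gateway agent, with the original key folded
// into a channel suffix so per-conversation state stays segregated.
function resolveGatewaySessionKey(sessionKey: string): string {
  const harnessKey = /^agent:[a-z][a-z0-9-]*:main$/
  if (harnessKey.test(sessionKey)) return sessionKey
  const channel = sessionKey.toLowerCase().replace(/[^a-z0-9-]+/g, '-')
  return `agent:main:${channel}`
}
```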

* feat(agents): backfill harness records for legacy gateway agents

Reframes Step 9 of the OpenClaw-on-acpx migration. The plan's literal
Step 9 (route OpenClaw history through the harness when the flag is on)
was already a no-op after the Step 6 walkback — history is routed by
source today. The actual blocker for Steps 10–13 was that legacy
gateway-only agents (e.g. `main`, orphans from rolled-back creates) had
no harness record, so they could never migrate to the harness path
without breaking chat.

`AgentHarnessService.reconcileWithGateway()` now lists every gateway
agent and upserts a matching harness record for any that are missing.
The pass runs lazily on first `listAgents()` call (memoized on success,
retried on failure so a gateway-down boot doesn't permanently disable
backfill). Verified end-to-end: the legacy `agent` agent now streams
`text_delta` + `done(end_turn)` through `/agents/agent/chat`, with the
bridge resolving to the gateway's `agent` record via the existing
`agent:<name>:main` session-key format.

After this, every OpenClaw agent surfaces as `source='agent-harness'`
post-dedup, the legacy `useClawChatHistory` hook becomes unreachable
for OpenClaw, and Steps 11–13 (delete legacy chat/history paths) are
unblocked.

* fix(agents): drop duplicate OpenClaw entry from NewAgentDialog adapter list

The adapter Select hardcoded an `<SelectItem value="openclaw">OpenClaw</SelectItem>`
on top of iterating `adapters`, which now includes OpenClaw after the
catalog change. The dropdown rendered "OpenClaw" twice — once at the
top, once at the bottom of the list. The literal was a pre-catalog
artifact; removing it leaves a single OpenClaw entry sourced from the
catalog. Routing into `handleOpenClawCreate` is unchanged because
the value (`'openclaw'`) is identical either way.

* fix(agents): always reconcile harness with gateway on list, just dedupe concurrent calls

Memoizing the first successful reconcile meant new gateway agents (created
via the legacy /claw/agents path or out-of-band CLI) never appeared in the
harness until server restart. The Promise now serves as a concurrent-call
dedupe only — cleared on settle — so every listAgents call picks up the
current gateway state. Reconcile is one cheap idempotent CLI call.

* chore(agents): remove dormant useAcpxForOpenClaw flag

The flag was scaffolded in Step 6 but its routing effect was walked
back the same day after it broke chat for legacy gateway-only agents.
After the Step 9 backfill, every OpenClaw agent has a harness record
and routes through the harness path purely from `source='agent-harness'`
— no flag is consulted anywhere. Remove the dead storage item, hook,
and stale comment.

* refactor(agents): drop legacy /claw/agents/:id/history endpoint

The harness /agents/:id/sessions/main/history endpoint replaced this
once every OpenClaw agent got a harness record (Step 9 backfill).
Routing is fully source-driven now, so the UI's useClawChatHistory
hook is never enabled today — verified live: legacy URL returns 404,
harness history hydrates correctly for the same agent.

Removes the GET /claw/agents/:id/history route, OpenClawService's
getAgentHistoryPage method plus its cursor/limit helpers and the
history-only types it owned (BrowserOSOpenClawHistoryPageResponse,
HistoryPageInput, normalizeHistoryLimit, encodeHistoryCursor,
decodeHistoryCursor, jsonlEventsToHistoryItems), and the route +
service tests that covered the dropped endpoint.

OpenClawJsonlReader stays alive — still feeds /claw/dashboard,
/claw/agents/:id/sessions, and the boot-time clawSession seed.
Removing those is its own follow-up since the dashboard would need
a harness-side replacement first.

* feat(agents): wire image attachments through the harness ACP path

Composer attachments now flow into the ACP `prompt` request as
spec-compliant `image` content blocks alongside the user's text. End
to end:

  composer → chatWithHarnessAgent({attachments}) →
  POST /agents/:id/chat {message, attachments} →
  parseChatBody decodes data: URLs to {mediaType, base64} →
  AgentHarnessService.send forwards →
  AcpxRuntime.send forwards →
  acpx startTurn({attachments}) → ACP image blocks

UI no longer disables the attach button on harness agents — the
gating was just a placeholder before this commit landed. Verified
end to end with a 1×1 red PNG against a Claude harness agent: model
replies "Red." correctly.

OpenClaw's `acp` bridge still drops image content blocks upstream
(verified by the same probe — Kimi-k2p5 reports "I don't see an
image"). That's an upstream openclaw limitation, not a harness-side
gap; Claude/Codex agents work as advertised today.
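
The decode step in the pipeline above might look like this; the image block shape loosely follows ACP image content blocks, and the field names are illustrative:

```typescript
// Sketch of the parseChatBody decode: a base64 data: URL becomes
// { mediaType, base64 } for the ACP image content block.
function decodeDataUrl(dataUrl: string): { mediaType: string; base64: string } {
  const match = /^data:([^;,]+);base64,(.+)$/.exec(dataUrl)
  if (!match) throw new Error('attachment must be a base64 data: URL')
  return { mediaType: match[1], base64: match[2] }
}

// Image blocks ride alongside the user's text in the prompt request.
function toPromptBlocks(text: string, attachments: string[]) {
  return [
    { type: 'text', text },
    ...attachments.map((a) => {
      const { mediaType, base64 } = decodeDataUrl(a)
      return { type: 'image', mimeType: mediaType, data: base64 }
    }),
  ]
}
```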

* chore(openclaw): delete OpenClawJsonlReader and JSONL-backed routes

* chore(openclaw): remove legacy /claw/agents/:id/chat and /queue routes

* chore(agents): collapse chat panel to harness-only path

* feat(agents): route OpenClaw image turns through the gateway HTTP client

The OpenClaw `acp` bridge silently drops ACP `image` content blocks
(verified during dogfood — model says "I don't see an image"). When
the user attaches images to an OpenClaw agent, the harness now diverts
that turn to the gateway's HTTP `/v1/chat/completions` endpoint, which
accepts OpenAI-style `image_url` parts and forwards them natively to
the provider.

  - New `OpenClawGatewayChatClient` translates an OpenAI streaming
    response into the same `AgentStreamEvent` shape the rest of the
    harness already consumes, so the chat panel renders identically
    whether a turn went through ACP or the gateway carve-out.
  - `AcpxRuntime.send` forks at the top: openclaw + any image
    attachment + a wired gateway client → `sendOpenclawViaGateway`.
    Other turns (text-only openclaw, claude, codex) take the existing
    ACP path unchanged.
  - The diverted path reads the prior turn history from the acpx
    session record so context is preserved, builds the OpenAI
    multimodal user message with text + image_url parts, and pumps
    the gateway SSE back to the caller through a tee that accumulates
    the assistant text. On natural completion, persists a synthetic
    user+assistant message pair to the acpx session record so reload
    shows the image turn in history.
  - Wired `OpenClawGatewayChatClient` into `AgentHarnessService` via
    `server.ts` (gateway port + token accessor, just like the existing
    `openclawGateway`).

Persistence note: the acpx record requires User messages to carry an
`id` and Agent messages to carry `tool_results` — without them the
record fails to round-trip through `parseSessionRecord`. The persist
helper now sets both.

Limitation by design: image recognition only works if the OpenClaw
agent's provider supports vision (e.g. Claude-via-OpenClaw, GPT-4o).
The pipeline routes images correctly to the provider regardless;
text-only providers like Kimi-k2p5 will reply "I don't see an image"
because the model itself has no vision capability — that's a provider
config issue, not a routing bug. The unit test asserts the image_url
part is present in the OpenAI request the gateway client sends.

The wider plan (background-resilient chat, queue, replay) remains in
`plans/.../2026-04-29-1527-...-background-resilient-chat-and-image-uploads.md`
as Phases 3–12; this commit ships only Phases 1–2.
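
The event translation at the heart of the gateway carve-out can be sketched as follows; the chunk shape is standard OpenAI streaming, while the AgentStreamEvent variants are assumed from the event names used elsewhere in this log:

```typescript
// Illustrative translation — OpenAI streaming chunks become the same
// AgentStreamEvent shape the harness chat panel already consumes.
type AgentStreamEvent =
  | { type: 'text-delta'; text: string }
  | { type: 'finish'; reason: string }

function translateChunk(chunk: {
  choices: { delta?: { content?: string }; finish_reason?: string | null }[]
}): AgentStreamEvent | null {
  const choice = chunk.choices[0]
  if (choice?.delta?.content) {
    return { type: 'text-delta', text: choice.delta.content }
  }
  if (choice?.finish_reason) {
    return { type: 'finish', reason: choice.finish_reason }
  }
  return null // keep-alive or empty chunk: nothing to emit
}
```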

* feat(agents): validate inbound image attachments on /agents/:id/chat

The harness chat body parser was accepting any mediaType and any
dataUrl length. The composer enforces these caps client-side but the
endpoint also serves direct curl/script callers, so the server has to
defend itself.

Restores the same caps the legacy /claw/agents/:id/chat parser had
before it was deleted in the migration:

  - 10 attachments per message
  - 5 MB raw image bytes (≈ 6.7 MB once base64-encoded plus prefix)
  - PNG / JPEG / WebP / GIF only
  - Must start with `data:`

Each violation returns 400 with a specific error message instead of
silently dropping or forwarding the payload.
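
The restored caps can be sketched as a single validator; the limits are the ones listed above, while the function and field names are illustrative:

```typescript
const MAX_ATTACHMENTS = 10
const MAX_IMAGE_BYTES = 5 * 1024 * 1024 // ≈ 6.7 MB once base64-encoded
const ALLOWED_TYPES = new Set([
  'image/png', 'image/jpeg', 'image/webp', 'image/gif',
])

// Returns a specific error message (mapped to a 400 by the caller) or null.
function validateAttachments(dataUrls: string[]): string | null {
  if (dataUrls.length > MAX_ATTACHMENTS) return 'too many attachments'
  for (const url of dataUrls) {
    if (!url.startsWith('data:')) return 'attachment must be a data: URL'
    const match = /^data:([^;,]+);base64,(.*)$/.exec(url)
    if (!match || !ALLOWED_TYPES.has(match[1])) return 'unsupported media type'
    // 4 base64 chars encode 3 raw bytes; this bounds the decoded size.
    const rawBytes = Math.floor((match[2].length * 3) / 4)
    if (rawBytes > MAX_IMAGE_BYTES) return 'image too large'
  }
  return null
}
```
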
2026-04-29 16:37:03 +05:30
Nikhil
2ff5c12840 feat: add sidepanel ACP chat targets (#857)
* feat(agent): add sidepanel chat target catalog

* feat(agent): show acp models in sidepanel selector

* feat(server): adapt acp events to ui message streams

* feat(server): add sidepanel acp chat route

* feat(agent): route sidepanel chat through acp targets

* chore: self-review fixes

* fix: address review feedback for PR #857
2026-04-28 18:23:38 -07:00
Nikhil
d87422eea1 fix: hide BrowserOS ACP wrapper in history (#856) 2026-04-28 17:31:11 -07:00
Nikhil
1946ca0cf8 chore: clean up unused agent sdk (#855) 2026-04-28 17:21:46 -07:00
Nikhil
754f7d0e1d test: cover terminal limactl resolver errors (#854) 2026-04-28 17:12:08 -07:00
Nikhil
85bb3f7b42 fix: avoid eager limactl resolution in server tests (#853) 2026-04-28 16:56:41 -07:00
Nikhil
cb32b8191d fix: show rich ACP harness history from ACPX (#852)
* fix: load ACP harness history from ACPX

* fix: address ACP history review comments
2026-04-28 16:40:22 -07:00
199 changed files with 13515 additions and 10515 deletions

View File

@@ -14,7 +14,7 @@ on:
config:
description: 'Eval config file (relative to apps/eval/)'
required: false
default: 'configs/browseros-agent-weekly.json'
default: 'configs/legacy/browseros-agent-weekly.json'
permissions:
contents: read
@@ -42,10 +42,12 @@ jobs:
- name: Install dependencies
working-directory: packages/browseros-agent
run: bun install --ignore-scripts && bun run build:agent-sdk
run: bun install --ignore-scripts
- name: Install Python eval dependencies
run: pip install agisdk requests
# agisdk pinned so silent upstream releases can't shift task definitions
# or grader behavior. Bump intentionally with a documented re-baseline.
run: pip install agisdk==0.3.5 requests
- name: Clone WebArena-Infinity
run: git clone --depth 1 https://github.com/web-arena-x/webarena-infinity.git /tmp/webarena-infinity
@@ -60,33 +62,27 @@ jobs:
curl -sL -o /tmp/nopecha.zip https://github.com/NopeCHALLC/nopecha-extension/releases/latest/download/chromium_automation.zip
unzip -qo /tmp/nopecha.zip -d extensions/nopecha
- name: Run eval
- name: Run eval and publish to R2
working-directory: packages/browseros-agent/apps/eval
env:
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
BROWSEROS_BINARY: /usr/bin/browseros
WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
run: |
echo "Running eval with config: $EVAL_CONFIG"
xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts -c "$EVAL_CONFIG"
- name: Upload runs to R2
if: success()
working-directory: packages/browseros-agent/apps/eval
env:
EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
BROWSEROS_BINARY: /usr/bin/browseros
WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
# OpenClaw container runtime is macOS-only; opt the Linux runner
# into the no-op stub so the server can boot and the eval can run.
BROWSEROS_SKIP_OPENCLAW: '1'
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
run: |
CONFIG_NAME=$(basename "$EVAL_CONFIG" .json)
bun scripts/upload-run.ts "results/$CONFIG_NAME"
echo "Running eval with config: $EVAL_CONFIG"
xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts suite --config "$EVAL_CONFIG" --publish r2
- name: Generate trend report
if: success()
@@ -107,3 +103,11 @@ jobs:
with:
name: eval-report-${{ github.run_id }}
path: /tmp/eval-report.html
- name: Upload server stderr logs (for post-mortem on startup failures)
if: always()
uses: actions/upload-artifact@v4
with:
name: browseros-server-logs-${{ github.run_id }}
path: /tmp/browseros-server-logs/
if-no-files-found: ignore

View File

@@ -1,4 +1,4 @@
name: build-agent
name: Publish VM Agent Cache
on:
workflow_dispatch:
@@ -16,7 +16,7 @@ on:
pull_request:
paths:
- "packages/browseros-agent/packages/build-tools/**"
- ".github/workflows/build-agent.yml"
- ".github/workflows/publish-vm-agent-cache.yml"
env:
BUN_VERSION: "1.3.6"
@@ -48,6 +48,8 @@ jobs:
include:
- arch: arm64
runner: ubuntu-24.04-arm
- arch: x64
runner: ubuntu-24.04
runs-on: ${{ matrix.runner }}
steps:
- uses: actions/checkout@v4
@@ -74,7 +76,15 @@ jobs:
smoke:
needs: build
runs-on: ubuntu-24.04-arm
strategy:
fail-fast: false
matrix:
include:
- arch: arm64
runner: ubuntu-24.04-arm
- arch: x64
runner: ubuntu-24.04
runs-on: ${{ matrix.runner }}
steps:
- uses: actions/checkout@v4
- uses: oven-sh/setup-bun@v2
@@ -82,7 +92,7 @@ jobs:
bun-version: ${{ env.BUN_VERSION }}
- uses: actions/download-artifact@v4
with:
name: tarball-${{ inputs.agent || 'openclaw' }}-arm64
name: tarball-${{ inputs.agent || 'openclaw' }}-${{ matrix.arch }}
path: dist/images
- name: Install podman
run: |
@@ -96,12 +106,12 @@ jobs:
AGENT: ${{ inputs.agent || 'openclaw' }}
run: |
set -euo pipefail
tarball="$(find "$GITHUB_WORKSPACE/dist/images" -name "${AGENT}-*-arm64.tar.gz" -print -quit)"
tarball="$(find "$GITHUB_WORKSPACE/dist/images" -name "${AGENT}-*-${{ matrix.arch }}.tar.gz" -print -quit)"
if [ -z "$tarball" ]; then
echo "missing arm64 tarball artifact for ${AGENT}" >&2
echo "missing ${{ matrix.arch }} tarball artifact for ${AGENT}" >&2
exit 1
fi
bun run smoke:tarball -- --agent "$AGENT" --arch arm64 --tarball "$tarball"
bun run smoke:tarball -- --agent "$AGENT" --arch "${{ matrix.arch }}" --tarball "$tarball"
publish:
needs: [build, smoke]

View File

@@ -1,168 +1,11 @@
name: Release BrowserOS Agent SDK
name: Release BrowserOS Agent SDK (disabled)
on:
workflow_dispatch:
concurrency:
group: release-agent-sdk
cancel-in-progress: false
jobs:
publish:
if: github.ref == 'refs/heads/main'
disabled:
if: ${{ false }}
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write
defaults:
run:
working-directory: packages/browseros-agent/packages/agent-sdk
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- uses: oven-sh/setup-bun@v2
- uses: actions/setup-node@v6
with:
node-version: "20"
registry-url: "https://registry.npmjs.org"
- name: Install dependencies
run: bun ci
working-directory: packages/browseros-agent
- name: Build
run: bun run build
- name: Test
run: bun test
- name: Get version
id: version
run: |
echo "version=$(node -p "require('./package.json').version")" >> "$GITHUB_OUTPUT"
echo "release_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
- name: Generate release notes
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
SDK_PATH="packages/browseros-agent/packages/agent-sdk"
CURRENT_TAG="agent-sdk-v${{ steps.version.outputs.version }}"
# Find the previous tag, excluding the current version's tag
# (which may already exist from a prior failed run)
PREV_TAG=$(git tag -l "agent-sdk-v*" --sort=-v:refname | grep -v "^${CURRENT_TAG}$" | head -n 1)
if [ -z "$PREV_TAG" ]; then
echo "Initial release" > /tmp/release-notes.md
else
# Get commits scoped to the SDK directory
COMMITS=$(git log "$PREV_TAG"..HEAD --pretty=format:"%H" -- "$SDK_PATH")
if [ -z "$COMMITS" ]; then
echo "No notable changes." > /tmp/release-notes.md
else
echo "## What's Changed" > /tmp/release-notes.md
echo "" >> /tmp/release-notes.md
# For each commit, find the associated PR and format with author
CONTRIBUTORS=""
while IFS= read -r SHA; do
# Get commit subject and author
SUBJECT=$(git log -1 --pretty=format:"%s" "$SHA")
AUTHOR=$(git log -1 --pretty=format:"%an" "$SHA")
GITHUB_USER=$(gh api "/repos/${{ github.repository }}/commits/${SHA}" --jq '.author.login // empty' 2>/dev/null)
# Find associated PR number
PR_NUM=$(gh api "/repos/${{ github.repository }}/commits/${SHA}/pulls" --jq '.[0].number // empty' 2>/dev/null)
# Format line: skip PR number if already in the commit subject
# (squash merges include "(#123)" in the subject automatically)
if [ -n "$PR_NUM" ] && ! echo "$SUBJECT" | grep -qF "(#${PR_NUM})"; then
echo "- ${SUBJECT} (#${PR_NUM})" >> /tmp/release-notes.md
else
echo "- ${SUBJECT}" >> /tmp/release-notes.md
fi
done <<< "$COMMITS"
fi
fi
working-directory: ${{ github.workspace }}
- name: Publish
run: npm publish --access public
env:
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
- name: Create GitHub release
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
TAG="agent-sdk-v${{ steps.version.outputs.version }}"
RELEASE_SHA="${{ steps.version.outputs.release_sha }}"
TITLE="BrowserOS Agent SDK - v${{ steps.version.outputs.version }}"
# Create or reuse tag (idempotent for re-runs)
if git rev-parse "$TAG" >/dev/null 2>&1; then
echo "Tag $TAG already exists, skipping tag creation"
else
git tag "$TAG" "$RELEASE_SHA"
fi
# Push tag (skip if already on remote)
if git ls-remote --tags origin "$TAG" | grep -q "$TAG"; then
echo "Tag $TAG already on remote, skipping push"
else
git push origin "$TAG"
fi
# Create or update release
if gh release view "$TAG" >/dev/null 2>&1; then
echo "Release $TAG already exists, updating"
gh release edit "$TAG" --title "$TITLE" --notes-file /tmp/release-notes.md
else
gh release create "$TAG" --title "$TITLE" --notes-file /tmp/release-notes.md
fi
working-directory: ${{ github.workspace }}
- name: Update CHANGELOG.md via PR
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
VERSION="${{ steps.version.outputs.version }}"
DATE=$(date -u +"%Y-%m-%d")
BRANCH="docs/agent-sdk-changelog-v${VERSION}"
CHANGELOG="packages/browseros-agent/packages/agent-sdk/CHANGELOG.md"
# Return to main before branching
git checkout main
# Use head/tail to safely insert without sed quoting issues
{
head -n 1 "$CHANGELOG"
echo ""
echo "## v${VERSION} (${DATE})"
echo ""
cat /tmp/release-notes.md
echo ""
tail -n +2 "$CHANGELOG"
} > /tmp/new-changelog.md
mv /tmp/new-changelog.md "$CHANGELOG"
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
git checkout -b "$BRANCH"
git add "$CHANGELOG"
git commit -m "docs: update agent-sdk changelog for v${VERSION}"
git push origin "$BRANCH"
gh pr create \
--title "docs: update agent-sdk changelog for v${VERSION}" \
--body "Auto-generated changelog update for BrowserOS Agent SDK v${VERSION}." \
--base main \
--head "$BRANCH"
gh pr merge "$BRANCH" --squash --auto || true
working-directory: ${{ github.workspace }}
- run: echo "Agent SDK publishing is disabled."

View File

@@ -54,10 +54,10 @@ jobs:
command: (cd apps/server && bun run test:integration)
junit_path: test-results/server-integration.xml
needs_browser: true
- suite: server-sdk
command: (cd apps/server && bun run test:sdk)
junit_path: test-results/server-sdk.xml
needs_browser: true
- suite: server-lib
command: (cd apps/server && bun run test:lib)
junit_path: test-results/server-lib.xml
needs_browser: false
- suite: server-root
command: (cd apps/server && bun run test:root)
junit_path: test-results/server-root.xml
@@ -70,10 +70,6 @@ jobs:
command: bun run test:eval
junit_path: test-results/eval.xml
needs_browser: false
- suite: agent-sdk
command: bun run test:agent-sdk
junit_path: test-results/agent-sdk.xml
needs_browser: false
- suite: build
command: bun run test:build
junit_path: test-results/build.xml

View File

@@ -1,6 +1,6 @@
# BrowserOS Agent
The agent platform powering [BrowserOS](https://github.com/browseros-ai/BrowserOS) — contains the MCP server, agent UI, CLI, evaluation framework, and SDK.
The agent platform powering [BrowserOS](https://github.com/browseros-ai/BrowserOS) — contains the MCP server, agent UI, CLI, and evaluation framework.
## Monorepo Structure
@@ -12,7 +12,6 @@ apps/
eval/ # Evaluation framework for benchmarking agents
packages/
agent-sdk/ # Node.js SDK (@browseros-ai/agent-sdk)
cdp-protocol/ # Type-safe Chrome DevTools Protocol bindings
shared/ # Shared constants (ports, timeouts, limits)
```
@@ -23,7 +22,6 @@ packages/
| `apps/agent` | Agent UI — Chrome extension for the chat interface |
| `apps/cli` | Go CLI — control BrowserOS from the terminal or AI coding agents |
| `apps/eval` | Benchmark framework — WebVoyager, Mind2Web evaluation |
| `packages/agent-sdk` | Node.js SDK for browser automation with natural language |
| `packages/cdp-protocol` | Auto-generated CDP type bindings used by the server |
| `packages/shared` | Shared constants used across packages |

View File

@@ -0,0 +1,50 @@
import type { Provider } from './chatComponentTypes'
export interface ProviderOptionGroup {
key: 'llm' | 'acp'
label: string
options: Provider[]
}
export function groupProviderOptions(
providers: Provider[],
): ProviderOptionGroup[] {
const llm = providers.filter((provider) => provider.kind !== 'acp')
const acp = providers.filter((provider) => provider.kind === 'acp')
return [
...(llm.length
? [{ key: 'llm' as const, label: 'AI Providers', options: llm }]
: []),
...(acp.length
? [{ key: 'acp' as const, label: 'Agents', options: acp }]
: []),
]
}
export function getProviderSearchValue(
provider: Provider,
groupLabel: string,
): string {
return [
provider.id,
provider.name,
provider.type,
groupLabel,
provider.adapterName,
provider.modelLabel,
]
.filter(Boolean)
.join(' ')
}
export function getProviderSubtitle(provider: Provider): string | undefined {
if (provider.kind !== 'acp') return undefined
return [
provider.adapterName,
provider.modelLabel,
provider.modelControl === 'best-effort' ? 'best effort' : undefined,
]
.filter(Boolean)
.join(' · ')
}

View File

@@ -0,0 +1,72 @@
import { describe, expect, it } from 'bun:test'
import {
getProviderSearchValue,
getProviderSubtitle,
groupProviderOptions,
} from './ChatProviderSelector.helpers'
import type { Provider } from './chatComponentTypes'
const options: Provider[] = [
{ kind: 'llm', id: 'browseros', name: 'BrowserOS', type: 'browseros' },
{
kind: 'llm',
id: 'anthropic-sonnet',
name: 'Anthropic Sonnet',
type: 'anthropic',
},
{
kind: 'acp',
id: 'agent-claude-review',
name: 'Review Bot',
type: 'acp',
adapterName: 'Claude Code',
modelLabel: 'Haiku',
modelControl: 'best-effort',
},
{
kind: 'acp',
id: 'agent-codex-browser',
name: 'Browser Driver',
type: 'acp',
adapterName: 'Codex',
modelLabel: 'GPT-5.5',
modelControl: 'runtime-supported',
},
]
describe('groupProviderOptions', () => {
it('groups normal providers separately from created agents', () => {
expect(groupProviderOptions(options)).toEqual([
{
key: 'llm',
label: 'AI Providers',
options: [options[0], options[1]],
},
{
key: 'acp',
label: 'Agents',
options: [options[2], options[3]],
},
])
})
})
describe('getProviderSearchValue', () => {
it('matches created-agent group labels and item labels', () => {
expect(getProviderSearchValue(options[2], 'Agents')).toContain('Agents')
expect(getProviderSearchValue(options[2], 'Agents')).toContain('Review Bot')
expect(getProviderSearchValue(options[2], 'Agents')).toContain(
'Claude Code',
)
})
})
describe('getProviderSubtitle', () => {
it('describes created-agent runtime context without model-target copy', () => {
expect(getProviderSubtitle(options[2])).toBe(
'Claude Code · Haiku · best effort',
)
expect(getProviderSubtitle(options[3])).toBe('Codex · GPT-5.5')
expect(getProviderSubtitle(options[0])).toBeUndefined()
})
})

View File

@@ -1,4 +1,4 @@
import { Check, Plus } from 'lucide-react'
import { Bot, Check, Plus } from 'lucide-react'
import type { FC, PropsWithChildren } from 'react'
import { useState } from 'react'
import {
@@ -17,6 +17,11 @@ import {
import { BrowserOSIcon, ProviderIcon } from '@/lib/llm-providers/providerIcons'
import type { ProviderType } from '@/lib/llm-providers/types'
import { cn } from '@/lib/utils'
import {
getProviderSearchValue,
getProviderSubtitle,
groupProviderOptions,
} from './ChatProviderSelector.helpers'
import type { Provider } from './chatComponentTypes'
interface ChatProviderSelectorProps {
@@ -29,54 +34,58 @@ export const ChatProviderSelector: FC<
PropsWithChildren<ChatProviderSelectorProps>
> = ({ children, providers, selectedProvider, onSelectProvider }) => {
const [open, setOpen] = useState(false)
const groups = groupProviderOptions(providers)
return (
<Popover open={open} onOpenChange={setOpen}>
<PopoverTrigger asChild>{children}</PopoverTrigger>
<PopoverContent side="bottom" align="start" className="w-48 p-0">
<PopoverContent side="bottom" align="start" className="w-64 p-0">
<Command>
<CommandInput placeholder="Search providers..." className="h-9" />
<CommandInput
placeholder="Search providers or agents..."
className="h-9"
/>
<CommandList>
<div className="my-2 px-2 font-semibold text-muted-foreground text-xs uppercase tracking-wide">
AI Provider
</div>
<CommandEmpty>No provider found</CommandEmpty>
<CommandGroup>
{providers.map((provider) => {
const isSelected = selectedProvider.id === provider.id
return (
<CommandItem
key={provider.id}
value={`${provider.id} ${provider.name}`}
onSelect={() => {
onSelectProvider(provider)
setOpen(false)
}}
className={cn(
'flex w-full items-center gap-3 rounded-md p-2 transition-colors',
isSelected && 'bg-[var(--accent-orange)]/10',
)}
>
<span className="text-muted-foreground">
{provider.type === 'browseros' ? (
<BrowserOSIcon size={18} />
) : (
<ProviderIcon
type={provider.type as ProviderType}
size={18}
/>
{groups.map((group) => (
<CommandGroup key={group.key} heading={group.label}>
{group.options.map((provider) => {
const isSelected = selectedProvider.id === provider.id
const subtitle = getProviderSubtitle(provider)
return (
<CommandItem
key={provider.id}
value={getProviderSearchValue(provider, group.label)}
onSelect={() => {
onSelectProvider(provider)
setOpen(false)
}}
className={cn(
'flex w-full items-center gap-3 rounded-md p-2 transition-colors',
isSelected && 'bg-[var(--accent-orange)]/10',
)}
</span>
<span className="flex-1 text-left text-sm">
{provider.name}
</span>
{isSelected && (
<Check className="h-3.5 w-3.5 text-[var(--accent-orange)]" />
)}
</CommandItem>
)
})}
</CommandGroup>
>
<span className="text-muted-foreground">
<ProviderOptionIcon provider={provider} />
</span>
<span className="min-w-0 flex-1 text-left">
<span className="block truncate text-sm">
{provider.name}
</span>
{subtitle && (
<span className="block truncate text-muted-foreground text-xs">
{subtitle}
</span>
)}
</span>
{isSelected && (
<Check className="h-3.5 w-3.5 text-[var(--accent-orange)]" />
)}
</CommandItem>
)
})}
</CommandGroup>
))}
<div className="border-border border-t p-1">
<button
type="button"
@@ -96,3 +105,9 @@ export const ChatProviderSelector: FC<
</Popover>
)
}
function ProviderOptionIcon({ provider }: { provider: Provider }) {
if (provider.kind === 'acp') return <Bot size={18} />
if (provider.type === 'browseros') return <BrowserOSIcon size={18} />
return <ProviderIcon type={provider.type as ProviderType} size={18} />
}

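The grouping helpers imported at the top of this file (`groupProviderOptions`, `getProviderSubtitle`, `getProviderSearchValue`) live in `ChatProviderSelector.helpers` and aren't part of this diff. A minimal sketch of what they might do, assuming the `Provider` shape from `chatComponentTypes` — the bodies here are illustrative guesses, not the actual helpers:

```typescript
interface Provider {
  id: string
  name: string
  type: string
  kind: 'llm' | 'acp'
  modelLabel?: string
}

interface ProviderGroup {
  key: string
  label: string
  options: Provider[]
}

// Hypothetical sketch: split providers into LLM-provider and agent
// buckets so the Command list can render one CommandGroup per bucket.
function groupProviderOptions(providers: Provider[]): ProviderGroup[] {
  const llm = providers.filter((p) => p.kind === 'llm')
  const acp = providers.filter((p) => p.kind === 'acp')
  const groups: ProviderGroup[] = []
  if (llm.length > 0)
    groups.push({ key: 'llm', label: 'AI Providers', options: llm })
  if (acp.length > 0)
    groups.push({ key: 'acp', label: 'Agents', options: acp })
  return groups
}

// Agents surface a secondary line (e.g. the model label); plain LLM
// providers render only their name.
function getProviderSubtitle(provider: Provider): string | undefined {
  return provider.kind === 'acp' ? provider.modelLabel : undefined
}

// Search value folds in the group label so typing "agent" matches
// every entry in the Agents group.
function getProviderSearchValue(provider: Provider, groupLabel: string): string {
  return `${provider.id} ${provider.name} ${groupLabel}`.toLowerCase()
}
```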

@@ -1,7 +1,14 @@
import type { ProviderType } from '@/lib/llm-providers/types'
export type ChatProviderType = ProviderType | 'acp'
export interface Provider {
id: string
name: string
type: ProviderType
type: ChatProviderType
kind: 'llm' | 'acp'
agentId?: string
adapterName?: string
modelLabel?: string
modelControl?: 'runtime-supported' | 'best-effort'
}


@@ -1,5 +1,5 @@
import { ArrowLeft, Bot, Home } from 'lucide-react'
import { type FC, useEffect, useMemo, useRef, useState } from 'react'
import { type FC, useEffect, useMemo, useRef } from 'react'
import { Navigate, useNavigate, useParams, useSearchParams } from 'react-router'
import { Button } from '@/components/ui/button'
import {
@@ -16,9 +16,7 @@ import {
flattenHistoryPages,
} from './claw-chat-types'
import { useAgentConversation } from './useAgentConversation'
import { useClawChatHistory } from './useClawChatHistory'
import { useHarnessChatHistory } from './useHarnessChatHistory'
import { useOutboundQueue } from './useOutboundQueue'
function StatusBadge({ status }: { status: string }) {
return (
@@ -176,19 +174,10 @@ function getAgentEntryMeta(agent: AgentEntry | undefined): string {
return getModelDisplayName(agent?.model) ?? 'OpenClaw agent'
}
function getConversationStatusCopy(status: string | undefined): string {
if (status === 'running') return 'Ready'
if (status === 'starting') return 'Connecting'
if (status === 'error') return 'Attention'
if (status === 'stopped') return 'Offline'
return 'Setup'
}
function AgentConversationController({
agentId,
initialMessage,
onInitialMessageConsumed,
status,
agents,
agentPathPrefix,
createAgentPath,
@@ -196,7 +185,6 @@ function AgentConversationController({
agentId: string
initialMessage: string | null
onInitialMessageConsumed: () => void
status: ReturnType<typeof useAgentCommandData>['status']
agents: AgentEntry[]
agentPathPrefix: string
createAgentPath: string
@@ -204,121 +192,49 @@ function AgentConversationController({
const navigate = useNavigate()
const initialMessageSentRef = useRef<string | null>(null)
const onInitialMessageConsumedRef = useRef(onInitialMessageConsumed)
const [streamSessionKey, setStreamSessionKey] = useState<string | null>(null)
const agent = agents.find((entry) => entry.agentId === agentId)
const agentName = agent?.name || agentId || 'Agent'
const isAgentHarnessAgent = agent?.source === 'agent-harness'
const clawHistoryQuery = useClawChatHistory({
agentId,
sessionKey: streamSessionKey,
enabled: Boolean(agent) && !isAgentHarnessAgent,
})
const harnessHistoryQuery = useHarnessChatHistory(
agentId,
Boolean(agent) && isAgentHarnessAgent,
)
// Routing is now harness-only. Every OpenClaw agent has a harness
// record after the gateway → harness backfill, so the chat panel
// always talks to /agents/<id>/chat. The legacy ClawChat surface
// was deleted with the /claw/agents/:id/chat server route.
const harnessHistoryQuery = useHarnessChatHistory(agentId, Boolean(agent))
const historyMessages = useMemo(
() =>
flattenHistoryPages(
isAgentHarnessAgent
? harnessHistoryQuery.data
? [harnessHistoryQuery.data]
: []
: (clawHistoryQuery.data?.pages ?? []),
harnessHistoryQuery.data ? [harnessHistoryQuery.data] : [],
),
[
clawHistoryQuery.data?.pages,
harnessHistoryQuery.data,
isAgentHarnessAgent,
],
[harnessHistoryQuery.data],
)
const chatHistory = useMemo(
() => buildChatHistoryFromClawMessages(historyMessages),
[historyMessages],
)
const resolvedSessionKey =
streamSessionKey ??
(isAgentHarnessAgent
? null
: (clawHistoryQuery.data?.pages?.[0]?.sessionKey ?? null))
const { turns, streaming, send } = useAgentConversation(agentId, {
runtime: isAgentHarnessAgent ? 'agent-harness' : 'openclaw',
sessionKey: resolvedSessionKey,
runtime: 'agent-harness',
sessionKey: null,
history: chatHistory,
onComplete: () => {
if (isAgentHarnessAgent) {
void harnessHistoryQuery.refetch()
}
},
onSessionKeyChange: (sessionKey) => {
setStreamSessionKey(sessionKey)
void harnessHistoryQuery.refetch()
},
onSessionKeyChange: () => {},
})
const visibleTurns = useMemo(
() =>
isAgentHarnessAgent
? filterTurnsPersistedInHistory(turns, historyMessages)
: turns,
[historyMessages, isAgentHarnessAgent, turns],
() => filterTurnsPersistedInHistory(turns, historyMessages),
[historyMessages, turns],
)
const outboundQueue = useOutboundQueue({
agentId,
sessionKey: resolvedSessionKey,
enabled: Boolean(agent) && !isAgentHarnessAgent,
})
onInitialMessageConsumedRef.current = onInitialMessageConsumed
// Refetch history whenever a server-dispatched queue item completes.
// The server worker streams the queued turn into OpenClaw directly, so
// the client never observes the live tokens — we only see the new
// assistant turn once the JSONL is updated. Watching the queue for
// any 'sending' item dropping out is the cleanest "turn finalized"
// signal we have without exposing per-turn SSE.
const previousSendingIdsRef = useRef<Set<string>>(new Set())
useEffect(() => {
if (isAgentHarnessAgent) return
const currentSending = new Set(
outboundQueue.queue
.filter((item) => item.status === 'sending')
.map((item) => item.id),
)
const dropped = [...previousSendingIdsRef.current].filter(
(id) => !currentSending.has(id),
)
previousSendingIdsRef.current = currentSending
if (dropped.length > 0) {
void clawHistoryQuery.refetch()
}
}, [clawHistoryQuery, isAgentHarnessAgent, outboundQueue.queue])
const disabled =
!agent || (!isAgentHarnessAgent && status?.status !== 'running')
// Two-part gate: cover both "still fetching" AND "just got enabled but
// hasn't started fetching yet". When `enabled` flips true (baseUrl
// resolves), there's a render frame where React Query reports
// isLoading=false but hasn't run the queryFn yet — `isFetched` is still
// false. Without this we render EmptyState during that one frame.
const isInitialLoading =
!isAgentHarnessAgent &&
(clawHistoryQuery.isLoading ||
(!clawHistoryQuery.isFetched && !clawHistoryQuery.isError))
const disabled = !agent
const historyReady =
(isAgentHarnessAgent &&
(harnessHistoryQuery.isFetched || harnessHistoryQuery.isError)) ||
(!isAgentHarnessAgent &&
(clawHistoryQuery.isFetched || clawHistoryQuery.isError))
harnessHistoryQuery.isFetched || harnessHistoryQuery.isError
const initialMessageKey = initialMessage
? `${agentId}:${initialMessage}`
: null
const error = isAgentHarnessAgent
? (harnessHistoryQuery.error ?? null)
: (clawHistoryQuery.error ?? null)
const error = harnessHistoryQuery.error ?? null
const enqueueRef = useRef(outboundQueue.enqueue)
enqueueRef.current = outboundQueue.enqueue
const sendRef = useRef(send)
sendRef.current = send
@@ -340,18 +256,8 @@ function AgentConversationController({
initialMessageSentRef.current = initialMessageKey
onInitialMessageConsumedRef.current()
if (isAgentHarnessAgent) {
void sendRef.current({ text: query })
} else {
enqueueRef.current({ text: query })
}
}, [
disabled,
historyReady,
initialMessage,
initialMessageKey,
isAgentHarnessAgent,
])
void sendRef.current({ text: query })
}, [disabled, historyReady, initialMessage, initialMessageKey])
const handleSelectAgent = (entry: AgentEntry) => {
navigate(`${agentPathPrefix}/${entry.agentId}`)
@@ -364,27 +270,13 @@ function AgentConversationController({
historyMessages={historyMessages}
turns={visibleTurns}
streaming={streaming}
isInitialLoading={
isAgentHarnessAgent ? harnessHistoryQuery.isLoading : isInitialLoading
}
isInitialLoading={harnessHistoryQuery.isLoading}
error={error}
hasNextPage={
isAgentHarnessAgent ? false : Boolean(clawHistoryQuery.hasNextPage)
}
isFetchingNextPage={
isAgentHarnessAgent ? false : clawHistoryQuery.isFetchingNextPage
}
onFetchNextPage={() => {
if (!isAgentHarnessAgent) {
void clawHistoryQuery.fetchNextPage()
}
}}
hasNextPage={false}
isFetchingNextPage={false}
onFetchNextPage={() => {}}
onRetry={() => {
if (isAgentHarnessAgent) {
void harnessHistoryQuery.refetch()
} else {
void clawHistoryQuery.refetch()
}
void harnessHistoryQuery.refetch()
}}
/>
@@ -404,32 +296,14 @@ function AgentConversationController({
name: a.name,
dataUrl: a.dataUrl,
}))
if (isAgentHarnessAgent) {
void send({ text: input.text, attachments, attachmentPreviews })
} else {
outboundQueue.enqueue({
text: input.text,
attachments,
attachmentPreviews,
history: chatHistory,
})
}
void send({ text: input.text, attachments, attachmentPreviews })
}}
onCreateAgent={() => navigate(createAgentPath)}
streaming={streaming}
disabled={disabled}
status={isAgentHarnessAgent ? 'running' : status?.status}
attachmentsEnabled={!isAgentHarnessAgent}
status="running"
attachmentsEnabled={true}
placeholder={`Message ${agentName}...`}
outboundQueue={
isAgentHarnessAgent ? undefined : outboundQueue.queue
}
onCancelQueued={
isAgentHarnessAgent ? undefined : outboundQueue.cancel
}
onRetryQueued={
isAgentHarnessAgent ? undefined : outboundQueue.retry
}
/>
</div>
</div>
@@ -453,7 +327,7 @@ export const AgentCommandConversation: FC<AgentCommandConversationProps> = ({
const { agentId } = useParams<{ agentId: string }>()
const [searchParams, setSearchParams] = useSearchParams()
const navigate = useNavigate()
const { status, agents } = useAgentCommandData()
const { agents } = useAgentCommandData()
const shouldRedirectHome = !agentId
const resolvedAgentId = agentId ?? ''
const agent = agents.find((entry) => entry.agentId === resolvedAgentId)
@@ -471,10 +345,11 @@ export const AgentCommandConversation: FC<AgentCommandConversationProps> = ({
navigate(`${agentPathPrefix}/${entry.agentId}`)
}
const statusCopy =
agent?.source === 'agent-harness'
? 'Ready'
: getConversationStatusCopy(status?.status)
// Every visible agent runs through the harness now, so per-agent
// runtime status doesn't gate chat the way OpenClaw's legacy
// gateway lifecycle did. Show "Ready" once the agent record is
// resolved from the rail, "Setup" otherwise.
const statusCopy = agent ? 'Ready' : 'Setup'
return (
<div className="absolute inset-0 overflow-hidden bg-background md:pl-[theme(spacing.14)]">
@@ -500,7 +375,6 @@ export const AgentCommandConversation: FC<AgentCommandConversationProps> = ({
key={resolvedAgentId}
agentId={resolvedAgentId}
agents={agents}
status={status}
initialMessage={initialMessage}
onInitialMessageConsumed={() =>
setSearchParams({}, { replace: true })


@@ -1,5 +1,4 @@
import {
AlertTriangle,
ArrowRight,
Bot,
ChevronDown,
@@ -9,7 +8,6 @@ import {
Loader2,
Mic,
Paperclip,
RefreshCw,
Square,
X,
} from 'lucide-react'
@@ -38,7 +36,6 @@ import { cn } from '@/lib/utils'
import { useVoiceInput } from '@/lib/voice/useVoiceInput'
import { useWorkspace } from '@/lib/workspace/use-workspace'
import { AgentSelector } from './AgentSelector'
import type { OutboundMessage } from './useOutboundQueue'
export interface ConversationInputSendInput {
text: string
@@ -57,15 +54,6 @@ interface ConversationInputProps {
placeholder?: string
attachmentsEnabled?: boolean
variant?: 'home' | 'conversation'
// Outbound queue: when present, the composer renders the queue strip
// above the textarea and lets the user keep sending while a previous
// turn is in flight. Optional so non-conversation variants (the home
// page) can opt out — the queue only makes sense in the conversation
// page where each enqueued message will eventually be delivered to the
// active agent.
outboundQueue?: OutboundMessage[]
onCancelQueued?: (id: string) => void
onRetryQueued?: (id: string) => void
}
function InputActionButton({
@@ -311,9 +299,6 @@ export const ConversationInput: FC<ConversationInputProps> = ({
placeholder,
attachmentsEnabled = true,
variant = 'conversation',
outboundQueue,
onCancelQueued,
onRetryQueued,
}) => {
const [input, setInput] = useState('')
const [selectedTabs, setSelectedTabs] = useState<chrome.tabs.Tab[]>([])
@@ -394,15 +379,10 @@ export const ConversationInput: FC<ConversationInputProps> = ({
}
const hasContent = input.trim().length > 0 || attachments.length > 0
const queueEnabled = outboundQueue !== undefined
const handleSend = () => {
const text = input.trim()
// The outbound queue accepts new messages while streaming; legacy
// direct-send callers (e.g., the home composer) keep the original
// streaming-blocks-send semantic.
if (disabled || isStaging) return
if (!queueEnabled && streaming) return
if (disabled || isStaging || streaming) return
if (!text && attachments.length === 0) return
onSend({ text, attachments })
setInput('')
@@ -494,13 +474,6 @@ export const ConversationInput: FC<ConversationInputProps> = ({
error={attachmentError}
/>
) : null}
{queueEnabled && outboundQueue && outboundQueue.length > 0 ? (
<OutboundQueueStrip
messages={outboundQueue}
onCancel={onCancelQueued}
onRetry={onRetryQueued}
/>
) : null}
<div
className={cn(
'flex gap-3',
@@ -556,10 +529,7 @@ export const ConversationInput: FC<ConversationInputProps> = ({
!!disabled ||
voice.isRecording ||
voice.isTranscribing ||
// Only block on `streaming` for the legacy direct-send path
// (no queue). With the queue active the press always
// succeeds — it just enqueues instead of dispatching.
(!queueEnabled && streaming)
streaming
}
onClick={handleSend}
// Spinner stays the user-facing "agent is busy" hint; with the
@@ -595,117 +565,6 @@ export const ConversationInput: FC<ConversationInputProps> = ({
)
}
function OutboundQueueStrip({
messages,
onCancel,
onRetry,
}: {
messages: OutboundMessage[]
onCancel?: (id: string) => void
onRetry?: (id: string) => void
}) {
return (
<div className="border-border/40 border-b px-4 pt-3 pb-2">
<ul className="flex flex-col gap-1">
{messages.map((message) => (
<OutboundQueueItem
key={message.id}
message={message}
onCancel={onCancel}
onRetry={onRetry}
/>
))}
</ul>
</div>
)
}
function OutboundQueueItem({
message,
onCancel,
onRetry,
}: {
message: OutboundMessage
onCancel?: (id: string) => void
onRetry?: (id: string) => void
}) {
const preview = message.text.trim() || '(attachments only)'
return (
<li className="flex items-center gap-2 rounded-md px-2 py-1 text-xs">
<OutboundQueueStatusIcon status={message.status} />
<span className="min-w-0 flex-1 truncate text-muted-foreground">
{preview}
</span>
{message.attachmentPreviews.length > 0 ? (
<span className="inline-flex items-center gap-1 text-muted-foreground/70">
<Paperclip className="size-3" />
<span className="tabular-nums">
{message.attachmentPreviews.length}
</span>
</span>
) : null}
{message.status === 'queued' && onCancel ? (
<button
type="button"
onClick={() => onCancel(message.id)}
className="ml-1 inline-flex size-5 items-center justify-center rounded-full text-muted-foreground hover:bg-accent hover:text-foreground"
aria-label="Cancel queued message"
title="Cancel"
>
<X className="size-3" />
</button>
) : null}
{message.status === 'failed' ? (
<span className="ml-1 inline-flex items-center gap-2 text-destructive">
<span className="max-w-[160px] truncate" title={message.error}>
{message.error ?? 'Failed'}
</span>
{onRetry ? (
<button
type="button"
onClick={() => onRetry(message.id)}
className="inline-flex size-5 items-center justify-center rounded-full hover:bg-accent hover:text-foreground"
aria-label="Retry failed message"
title="Retry"
>
<RefreshCw className="size-3" />
</button>
) : null}
{onCancel ? (
<button
type="button"
onClick={() => onCancel(message.id)}
className="inline-flex size-5 items-center justify-center rounded-full hover:bg-accent hover:text-foreground"
aria-label="Discard failed message"
title="Discard"
>
<X className="size-3" />
</button>
) : null}
</span>
) : null}
</li>
)
}
function OutboundQueueStatusIcon({
status,
}: {
status: OutboundMessage['status']
}) {
if (status === 'sending') {
return (
<Loader2 className="size-3.5 shrink-0 animate-spin text-muted-foreground" />
)
}
if (status === 'failed') {
return <AlertTriangle className="size-3.5 shrink-0 text-destructive" />
}
return (
<span className="inline-block size-2 shrink-0 rounded-full bg-muted-foreground/40" />
)
}
function AttachmentStrip({
attachments,
onRemove,


@@ -26,7 +26,15 @@ export const AgentCommandLayout: FC = () => {
const { agents: harnessAgents, loading: harnessAgentsLoading } =
useHarnessAgents()
const visibleOpenClawAgents = openClawEnabled ? openClawAgents : []
const agents = [...visibleOpenClawAgents, ...harnessAgents]
// Dual-created OpenClaw agents appear in both `/claw/agents` (gateway
// record) and `/agents` (harness record) under the same id. Prefer the
// harness entry so the chat panel can route through the harness path
// and the rail doesn't show duplicates.
const harnessAgentIds = new Set(harnessAgents.map((entry) => entry.agentId))
const dedupedOpenClawAgents = visibleOpenClawAgents.filter(
(entry) => !harnessAgentIds.has(entry.agentId),
)
const agents = [...dedupedOpenClawAgents, ...harnessAgents]
return (
<Outlet

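The dedup step above can be sketched in isolation. `mergeAgentLists` is a hypothetical name — the real code inlines this in the layout — but the mechanics are the same: build a `Set` of harness ids, filter the gateway list against it, then concatenate so the harness record wins for dual-created agents:

```typescript
interface AgentEntry {
  agentId: string
  name: string
  source: 'openclaw' | 'agent-harness'
}

// Dual-created agents appear in both lists under the same agentId.
// Prefer the harness record so the rail shows no duplicates and the
// chat panel routes through the harness path.
function mergeAgentLists(
  openClawAgents: AgentEntry[],
  harnessAgents: AgentEntry[],
): AgentEntry[] {
  const harnessIds = new Set(harnessAgents.map((entry) => entry.agentId))
  const deduped = openClawAgents.filter(
    (entry) => !harnessIds.has(entry.agentId),
  )
  return [...deduped, ...harnessAgents]
}
```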
View File

@@ -23,9 +23,9 @@ export interface BrowserOSChatHistoryToolCall {
toolName: string
label: string
subject?: string
status: 'completed' | 'failed'
input?: Record<string, unknown>
output?: string
status: 'pending' | 'running' | 'completed' | 'failed'
input?: unknown
output?: unknown
error?: string
durationMs?: number
}


@@ -0,0 +1,71 @@
import { buildToolLabel } from '../../../lib/tool-labels'
import type { HarnessAgentHistoryPage } from '../agents/agent-harness-types'
import type {
AgentHistoryPageResponse,
BrowserOSChatHistoryItem,
BrowserOSChatHistoryToolCall,
} from './claw-chat-types'
export function mapHarnessHistoryPage(
page: HarnessAgentHistoryPage,
): AgentHistoryPageResponse {
const items: BrowserOSChatHistoryItem[] = page.items.map((item, index) => {
const toolCalls = item.toolCalls?.map(
(tool): BrowserOSChatHistoryToolCall => {
const input = asRecord(tool.input)
const { label, subject } = buildToolLabel(tool.toolName, input)
return {
toolName: tool.toolName,
label,
status: tool.status,
...(tool.toolCallId ? { toolCallId: tool.toolCallId } : {}),
...(subject ? { subject } : {}),
...(tool.input !== undefined ? { input: tool.input } : {}),
...(tool.output !== undefined ? { output: tool.output } : {}),
...(tool.error ? { error: tool.error } : {}),
...(tool.durationMs != null ? { durationMs: tool.durationMs } : {}),
}
},
)
return {
id: item.id,
role: item.role,
text: item.text,
timestamp: item.createdAt,
messageSeq: index + 1,
sessionKey: 'main',
source: 'user-chat',
...(item.reasoning ? { reasoning: item.reasoning } : {}),
...(toolCalls && toolCalls.length > 0 ? { toolCalls } : {}),
}
})
const updatedAt =
page.items.length > 0
? Math.max(...page.items.map((item) => item.createdAt))
: Date.now()
return {
agentId: page.agentId,
sessionKey: 'main',
session: {
key: 'main',
updatedAt,
sessionId: 'main',
agentId: page.agentId,
kind: 'agent-harness',
source: 'user-chat',
},
items,
page: {
hasMore: false,
limit: items.length,
},
}
}
function asRecord(value: unknown): Record<string, unknown> | undefined {
return value && typeof value === 'object' && !Array.isArray(value)
? (value as Record<string, unknown>)
: undefined
}


@@ -1,13 +1,12 @@
import { useEffect, useRef, useState } from 'react'
import {
type AgentHarnessStreamEvent,
attachToHarnessTurn,
cancelHarnessTurn,
chatWithHarnessAgent,
fetchActiveHarnessTurn,
} from '@/entrypoints/app/agents/useAgents'
import {
chatWithAgent,
type OpenClawChatHistoryMessage,
type OpenClawStreamEvent,
} from '@/entrypoints/app/agents/useOpenClaw'
import type { OpenClawChatHistoryMessage } from '@/entrypoints/app/agents/useOpenClaw'
import type {
AgentConversationTurn,
AssistantPart,
@@ -29,7 +28,10 @@ export interface SendInput {
}
interface UseAgentConversationOptions {
runtime?: 'openclaw' | 'agent-harness'
// The hook always speaks to the harness chat path now; the OpenClaw
// legacy /claw/agents/:id/chat surface was removed in Step 12. The
// option remains for forward-compatibility.
runtime?: 'agent-harness'
sessionKey?: string | null
history?: OpenClawChatHistoryMessage[]
onComplete?: () => void
@@ -49,6 +51,11 @@ export function useAgentConversation(
const streamAbortRef = useRef<AbortController | null>(null)
const onCompleteRef = useRef(options.onComplete)
const onSessionKeyChangeRef = useRef(options.onSessionKeyChange)
// Per-turn resume bookkeeping. `turnId` is captured from the response
// header; `lastSeq` advances with every SSE event so a reconnect can
// resume via Last-Event-ID.
const turnIdRef = useRef<string | null>(null)
const lastSeqRef = useRef<number | null>(null)
useEffect(() => {
sessionKeyRef.current = options.sessionKey ?? ''
@@ -72,6 +79,12 @@ export function useAgentConversation(
}
}, [])
// Indirection for the resume effect below: lets it call the latest
// event handler without re-subscribing on every render.
const processEventRef = useRef<(event: AgentHarnessStreamEvent) => void>(
() => {},
)
const updateCurrentTurnParts = (
updater: (parts: AssistantPart[]) => AssistantPart[],
) => {
@@ -82,85 +95,6 @@ export function useAgentConversation(
})
}
const processStreamEvent = (event: OpenClawStreamEvent) => {
switch (event.type) {
case 'text-delta': {
appendTextDelta((event.data.text as string) ?? '')
break
}
case 'thinking': {
appendThinkingDelta((event.data.text as string) ?? '')
break
}
case 'tool-start': {
const rawName = (event.data.toolName as string) ?? 'unknown'
const args = event.data.args as Record<string, unknown> | undefined
const { label, subject } = buildToolLabel(rawName, args)
const tool = {
id: (event.data.toolCallId as string) ?? crypto.randomUUID(),
name: rawName,
label,
subject,
status: 'running' as const,
}
updateCurrentTurnParts((parts) => {
const last = parts[parts.length - 1]
if (last?.kind === 'tool-batch') {
return [
...parts.slice(0, -1),
{ ...last, tools: [...last.tools, tool] },
]
}
return [...parts, { kind: 'tool-batch', tools: [tool] }]
})
break
}
case 'tool-end': {
const toolId = event.data.toolCallId as string
const toolStatus: 'completed' | 'error' =
(event.data.status as string) === 'error' ? 'error' : 'completed'
const durationMs = event.data.durationMs as number | undefined
updateCurrentTurnParts((parts) => {
for (let i = parts.length - 1; i >= 0; i--) {
const part = parts[i]
if (
part.kind === 'tool-batch' &&
part.tools.some((t) => t.id === toolId)
) {
const updatedTools = part.tools.map((t) =>
t.id === toolId ? { ...t, status: toolStatus, durationMs } : t,
)
return [
...parts.slice(0, i),
{ ...part, tools: updatedTools },
...parts.slice(i + 1),
]
}
}
return parts
})
break
}
case 'done': {
markCurrentTurnDone()
break
}
case 'error': {
const msg =
(event.data.message as string) ??
(event.data.error as string) ??
'Unknown error'
appendErrorText(msg)
break
}
}
}
const appendTextDelta = (delta: string) => {
textAccRef.current += delta
const text = textAccRef.current
@@ -275,6 +209,79 @@ export function useAgentConversation(
break
}
}
processEventRef.current = processAgentHarnessStreamEvent
// On mount (and whenever the agent changes), check whether the
// server has an in-flight turn for this agent and reattach to it.
// This is what makes the chat resilient across tab close/reopen,
// refresh, and navigation: the runtime call kept running on the
// server while we were away. Effect only depends on `agentId` —
// the event handler is read off a ref so this doesn't re-subscribe
// every render.
useEffect(() => {
let cancelled = false
const abortController = new AbortController()
const attemptResume = async () => {
try {
const active = await fetchActiveHarnessTurn(agentId)
if (cancelled || !active || active.status !== 'running') return
if (streamAbortRef.current) return // a fresh send already in flight
// Stage a placeholder turn so the streamed events have a row
// to render into. We don't have the user message text on
// resume; the assistant turn is what we're catching up on.
setTurns((prev) => [
...prev,
{
id: crypto.randomUUID(),
userText: '',
parts: [],
done: false,
timestamp: active.startedAt,
},
])
textAccRef.current = ''
thinkAccRef.current = ''
turnIdRef.current = active.turnId
lastSeqRef.current = null
streamAbortRef.current = abortController
setStreaming(true)
const response = await attachToHarnessTurn(agentId, {
turnId: active.turnId,
signal: abortController.signal,
})
if (!response.ok) return
await consumeSSEStream<AgentHarnessStreamEvent>(
response,
(event, meta) => {
if (typeof meta.seq === 'number') lastSeqRef.current = meta.seq
processEventRef.current(event)
},
abortController.signal,
)
} catch {
// Resume is best-effort; transient errors fall back to the
// user starting a new turn manually.
} finally {
if (!cancelled) {
if (streamAbortRef.current === abortController) {
streamAbortRef.current = null
}
turnIdRef.current = null
lastSeqRef.current = null
setStreaming(false)
}
}
}
void attemptResume()
return () => {
cancelled = true
abortController.abort()
}
}, [agentId])
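The per-turn resume bookkeeping above boils down to tracking the last SSE sequence number so a reconnect can present it as `Last-Event-ID` and have the server replay only missed events. A standalone sketch of that tracker (`makeSeqTracker` is a hypothetical helper, not code from this PR; the hook keeps the same state in refs):

```typescript
// Hypothetical sketch: record the seq from each SSE event's metadata,
// and expose the resume header to send when reattaching to a turn.
function makeSeqTracker() {
  let lastSeq: number | null = null
  return {
    // Called once per streamed event; ignores events without a seq.
    record(meta: { seq?: number }) {
      if (typeof meta.seq === 'number') lastSeq = meta.seq
    },
    // Headers for the reattach request; empty until a seq is seen.
    resumeHeaders(): Record<string, string> {
      return lastSeq != null ? { 'Last-Event-ID': String(lastSeq) } : {}
    },
    // Cleared between turns, mirroring lastSeqRef.current = null.
    reset() {
      lastSeq = null
    },
  }
}
```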
const send = async (input: string | SendInput) => {
const normalized: SendInput =
@@ -304,17 +311,25 @@ export function useAgentConversation(
streamAbortRef.current = abortController
try {
const response =
options.runtime === 'agent-harness'
? await chatWithHarnessAgent(agentId, trimmed, abortController.signal)
: await chatWithAgent(
agentId,
trimmed,
sessionKeyRef.current || undefined,
historyRef.current,
abortController.signal,
attachments,
)
let response = await chatWithHarnessAgent(
agentId,
trimmed,
abortController.signal,
attachments,
)
// 409 means the server already has an active turn for this
// agent (e.g. a previous tab kicked one off and we're a fresh
// mount that missed the resume window). Attach to it instead of
// double-sending.
if (response.status === 409) {
const body = (await response.json()) as { turnId?: string }
if (body.turnId) {
response = await attachToHarnessTurn(agentId, {
turnId: body.turnId,
signal: abortController.signal,
})
}
}
const responseSessionKey =
response.headers.get('X-Session-Key') ??
response.headers.get('X-Session-Id')
@@ -322,6 +337,11 @@ export function useAgentConversation(
sessionKeyRef.current = responseSessionKey
onSessionKeyChangeRef.current?.(responseSessionKey)
}
const responseTurnId = response.headers.get('X-Turn-Id')
if (responseTurnId) {
turnIdRef.current = responseTurnId
lastSeqRef.current = null
}
if (!response.ok) {
const err = await response.text()
updateCurrentTurnParts((parts) => [
@@ -330,19 +350,14 @@ export function useAgentConversation(
])
return
}
if (options.runtime === 'agent-harness') {
await consumeSSEStream<AgentHarnessStreamEvent>(
response,
processAgentHarnessStreamEvent,
abortController.signal,
)
} else {
await consumeSSEStream<OpenClawStreamEvent>(
response,
processStreamEvent,
abortController.signal,
)
}
await consumeSSEStream<AgentHarnessStreamEvent>(
response,
(event, meta) => {
if (typeof meta.seq === 'number') lastSeqRef.current = meta.seq
processAgentHarnessStreamEvent(event)
},
abortController.signal,
)
} catch (err) {
if (abortController.signal.aborted) return
const msg = err instanceof Error ? err.message : String(err)
@@ -354,14 +369,35 @@ export function useAgentConversation(
if (streamAbortRef.current === abortController) {
streamAbortRef.current = null
}
turnIdRef.current = null
lastSeqRef.current = null
onCompleteRef.current?.()
setStreaming(false)
}
}
const resetConversation = () => {
/**
* Stop button. The fetch abort only detaches *this* SSE subscriber
* now — the underlying turn would otherwise keep running on the
* server. So we explicitly cancel via the new endpoint, then unwind
* the local stream.
*/
const stop = async () => {
const turnId = turnIdRef.current ?? undefined
streamAbortRef.current?.abort()
streamAbortRef.current = null
try {
await cancelHarnessTurn(agentId, {
turnId,
reason: 'user pressed stop',
})
} catch {
// Best-effort — UI already aborted.
}
}
const resetConversation = () => {
void stop()
setTurns([])
setStreaming(false)
}
@@ -371,6 +407,7 @@ export function useAgentConversation(
streaming,
sessionKey: sessionKeyRef.current,
send,
stop,
resetConversation,
}
}

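The 409 guard inside `send` above can be factored into a small helper for clarity. `sendOrAttach`, `sendTurn`, and `attachTurn` are illustrative names standing in for `chatWithHarnessAgent` and `attachToHarnessTurn`, not the real API — the point is the control flow: a 409 means the server already has an active turn, so attach to it instead of double-sending:

```typescript
// Hypothetical sketch of the 409-attach guard. The server responds
// 409 with { turnId } when a turn is already in flight for the agent
// (e.g. started from another tab); we attach to that turn's stream
// rather than starting a second one.
async function sendOrAttach(
  sendTurn: () => Promise<Response>,
  attachTurn: (turnId: string) => Promise<Response>,
): Promise<Response> {
  let response = await sendTurn()
  if (response.status === 409) {
    const body = (await response.json()) as { turnId?: string }
    if (body.turnId) {
      response = await attachTurn(body.turnId)
    }
  }
  return response
}
```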

@@ -1,71 +0,0 @@
import { useInfiniteQuery } from '@tanstack/react-query'
import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
import type { AgentHistoryPageResponse } from './claw-chat-types'
const HISTORY_QUERY_KEY = 'claw-agent-history'
async function fetchClawJson<T>(url: string): Promise<T> {
const response = await fetch(url)
if (!response.ok) {
let message = `Request failed with status ${response.status}`
try {
const body = (await response.json()) as { error?: string }
if (body.error) message = body.error
} catch {}
throw new Error(message)
}
return response.json() as Promise<T>
}
function buildClawUrl(baseUrl: string, path: string): URL {
return new URL(`/claw${path}`, baseUrl)
}
export function useClawChatHistory({
agentId,
sessionKey,
enabled = true,
limit = 50,
}: {
agentId: string
// null lets the server resolve the most recent user-chat session for the
// agent — avoids an extra /session round-trip and the race that came with it.
sessionKey: string | null
enabled?: boolean
limit?: number
}) {
const {
baseUrl,
isLoading: urlLoading,
error: urlError,
} = useAgentServerUrl()
const query = useInfiniteQuery<AgentHistoryPageResponse, Error>({
queryKey: [HISTORY_QUERY_KEY, baseUrl, agentId, sessionKey],
initialPageParam: undefined as string | undefined,
queryFn: async ({ pageParam }) => {
const url = buildClawUrl(baseUrl as string, `/agents/${agentId}/history`)
url.searchParams.set('limit', String(limit))
if (sessionKey) {
url.searchParams.set('sessionKey', sessionKey)
}
if (typeof pageParam === 'string' && pageParam) {
url.searchParams.set('cursor', pageParam)
}
return fetchClawJson<AgentHistoryPageResponse>(url.toString())
},
getNextPageParam: (lastPage) =>
lastPage.page.hasMore ? lastPage.page.cursor : undefined,
enabled: enabled && Boolean(baseUrl) && !urlLoading && Boolean(agentId),
})
return {
...query,
error: query.error ?? urlError,
isLoading: query.isLoading || urlLoading,
}
}


@@ -0,0 +1,55 @@
import { describe, expect, it } from 'bun:test'
import { mapHarnessHistoryPage } from './harness-history-mapper'
describe('mapHarnessHistoryPage', () => {
it('maps rich harness history into chat history items', () => {
const page = mapHarnessHistoryPage({
agentId: 'agent-1',
sessionId: 'main',
items: [
{
id: 'agent:agent-1:main:1',
agentId: 'agent-1',
sessionId: 'main',
role: 'assistant',
text: 'Done.',
createdAt: 1000,
reasoning: { text: 'checking state' },
toolCalls: [
{
toolCallId: 'tool-1',
toolName: 'read_file',
status: 'completed',
input: { path: 'src/index.ts' },
output: 'file contents',
},
],
},
],
})
expect(page.items).toEqual([
{
id: 'agent:agent-1:main:1',
role: 'assistant',
text: 'Done.',
timestamp: 1000,
messageSeq: 1,
sessionKey: 'main',
source: 'user-chat',
reasoning: { text: 'checking state' },
toolCalls: [
{
toolCallId: 'tool-1',
toolName: 'read_file',
label: 'Read file',
subject: 'index.ts',
status: 'completed',
input: { path: 'src/index.ts' },
output: 'file contents',
},
],
},
])
})
})

View File

@@ -1,11 +1,8 @@
import { useQuery } from '@tanstack/react-query'
import type { HarnessAgentHistoryPage } from '@/entrypoints/app/agents/agent-harness-types'
import { fetchHarnessAgentHistory } from '@/entrypoints/app/agents/useAgents'
import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
import type {
AgentHistoryPageResponse,
BrowserOSChatHistoryItem,
} from './claw-chat-types'
import type { AgentHistoryPageResponse } from './claw-chat-types'
import { mapHarnessHistoryPage } from './harness-history-mapper'
const HISTORY_QUERY_KEY = 'harness-agent-history'
@@ -30,39 +27,3 @@ export function useHarnessChatHistory(agentId: string, enabled = true) {
isLoading: query.isLoading || urlLoading,
}
}
function mapHarnessHistoryPage(
page: HarnessAgentHistoryPage,
): AgentHistoryPageResponse {
const items: BrowserOSChatHistoryItem[] = page.items.map((item, index) => ({
id: item.id,
role: item.role,
text: item.text,
timestamp: item.createdAt,
messageSeq: index + 1,
sessionKey: 'main',
source: 'user-chat',
}))
const updatedAt =
page.items.length > 0
? Math.max(...page.items.map((item) => item.createdAt))
: Date.now()
return {
agentId: page.agentId,
sessionKey: 'main',
session: {
key: 'main',
updatedAt,
sessionId: 'main',
agentId: page.agentId,
kind: 'agent-harness',
source: 'user-chat',
},
items,
page: {
hasMore: false,
limit: items.length,
},
}
}

View File

@@ -1,271 +0,0 @@
import { useCallback, useEffect, useRef, useState } from 'react'
import type { OpenClawChatHistoryMessage } from '@/entrypoints/app/agents/useOpenClaw'
import type { UserAttachmentPreview } from '@/lib/agent-conversations/types'
import type { ServerAttachmentPayload } from '@/lib/attachments'
import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
export type OutboundMessageStatus = 'queued' | 'sending' | 'failed'
export interface OutboundMessage {
id: string
text: string
attachments: ServerAttachmentPayload[]
attachmentPreviews: UserAttachmentPreview[]
status: OutboundMessageStatus
error?: string
createdAt: number
}
export interface OutboundQueueEnqueueInput {
text: string
attachments?: ServerAttachmentPayload[]
attachmentPreviews?: UserAttachmentPreview[]
history?: OpenClawChatHistoryMessage[]
}
export interface OutboundQueueApi {
queue: OutboundMessage[]
enqueue(input: OutboundQueueEnqueueInput): void
cancel(id: string): void
retry(id: string): void
}
interface UseOutboundQueueOptions {
agentId: string | null | undefined
sessionKey?: string | null
enabled?: boolean
}
interface ServerQueuedItem {
id: string
status: 'queued' | 'dispatching' | 'failed'
message: string
attachmentsPreview: Array<{
kind: 'image' | 'file'
mediaType: string
name?: string
}>
error?: string
createdAt: number
}
function makeId(): string {
if (typeof crypto !== 'undefined' && crypto.randomUUID) {
return crypto.randomUUID()
}
return `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`
}
/**
* Server-backed outbound message queue. The browser is purely a
* projection of server state — closing the tab is safe because the queue
* keeps draining server-side via the OutboundQueueService.
*
* Single id-keyed list: the client generates the queue id and hands it
* to the server in the POST body, so the optimistic row and the SSE
* snapshot reconcile on the same key from frame zero — there is no
* window in which the message renders twice.
*/
export function useOutboundQueue(
options: UseOutboundQueueOptions,
): OutboundQueueApi {
const { agentId, enabled = true, sessionKey } = options
const { baseUrl } = useAgentServerUrl()
const sessionKeyRef = useRef<string | null | undefined>(sessionKey)
sessionKeyRef.current = sessionKey
const [items, setItems] = useState<OutboundMessage[]>([])
// Track which ids the server has confirmed seeing in any SSE snapshot.
// We use this to know whether a missing-from-snapshot id is "drained
// by the server" (drop it) or "still in flight client-side" (keep
// showing the optimistic row).
const everSeenByServerRef = useRef<Set<string>>(new Set())
// Local-only attachment previews, keyed by queue id. Data URLs never
// leave the browser — the SSE feed only carries metadata, so we hold
// them here so the chip strip keeps rendering after server takeover.
const previewMapRef = useRef<Map<string, UserAttachmentPreview[]>>(new Map())
useEffect(() => {
if (!enabled || !baseUrl || !agentId) {
setItems([])
everSeenByServerRef.current = new Set()
previewMapRef.current = new Map()
return
}
let cancelled = false
const url = `${baseUrl}/claw/agents/${encodeURIComponent(agentId)}/queue/stream`
const source = new EventSource(url)
source.onmessage = (event) => {
if (cancelled) return
try {
const parsed = JSON.parse(event.data) as { items: ServerQueuedItem[] }
const snapshotIds = new Set(parsed.items.map((item) => item.id))
for (const id of snapshotIds) everSeenByServerRef.current.add(id)
setItems((prev) => {
const next: OutboundMessage[] = parsed.items.map((item) => ({
id: item.id,
text: item.message,
attachments: [],
attachmentPreviews: previewMapRef.current.get(item.id) ?? [],
status: serverStatusToClient(item.status),
error: item.error,
createdAt: item.createdAt,
}))
// Carry forward any optimistic / failed entries the server
// doesn't know about yet (POST in flight) or has finished
// dispatching but the client wants to keep visible (failed).
const carried = prev.filter((local) => {
if (snapshotIds.has(local.id)) return false
if (everSeenByServerRef.current.has(local.id)) {
// Server saw it before and it's gone now — drained.
previewMapRef.current.delete(local.id)
return false
}
return local.status !== 'failed' || Boolean(local.error)
})
return [...carried, ...next]
})
} catch {
// Malformed event — ignore; next snapshot will recover.
}
}
source.onerror = () => {
// Auto-reconnects; nothing to do here.
}
return () => {
cancelled = true
source.close()
}
}, [baseUrl, agentId, enabled])
const enqueue = useCallback(
(input: OutboundQueueEnqueueInput) => {
if (!enabled || !baseUrl || !agentId) return
const trimmed = input.text.trim()
const attachments = input.attachments ?? []
if (!trimmed && attachments.length === 0) return
const id = makeId()
const previews = input.attachmentPreviews ?? []
previewMapRef.current.set(id, previews)
setItems((prev) => [
...prev,
{
id,
text: trimmed,
attachments,
attachmentPreviews: previews,
status: 'queued',
createdAt: Date.now(),
},
])
void (async () => {
try {
const response = await fetch(
`${baseUrl}/claw/agents/${encodeURIComponent(agentId)}/queue`,
{
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
id,
message: trimmed,
attachments: attachments.length > 0 ? attachments : undefined,
sessionKey: sessionKeyRef.current ?? undefined,
history: input.history,
}),
},
)
if (!response.ok) {
const text = await response.text().catch(() => '')
previewMapRef.current.delete(id)
setItems((prev) =>
prev.map((item) =>
item.id === id
? {
...item,
status: 'failed',
error:
text || `Failed to enqueue (status ${response.status})`,
}
: item,
),
)
}
} catch (err) {
// Only mark as failed if the SSE snapshot hasn't already
// taken ownership of the entry (i.e. the request actually
// reached the server).
if (everSeenByServerRef.current.has(id)) return
previewMapRef.current.delete(id)
setItems((prev) =>
prev.map((item) =>
item.id === id
? {
...item,
status: 'failed',
error:
err instanceof Error
? err.message
: 'Failed to enqueue message',
}
: item,
),
)
}
})()
},
[baseUrl, agentId, enabled],
)
const cancel = useCallback(
(id: string) => {
// If the server has never seen this id, just drop it locally.
if (!everSeenByServerRef.current.has(id)) {
previewMapRef.current.delete(id)
setItems((prev) => prev.filter((item) => item.id !== id))
return
}
if (!enabled || !baseUrl || !agentId) return
void fetch(
`${baseUrl}/claw/agents/${encodeURIComponent(agentId)}/queue/${encodeURIComponent(id)}`,
{ method: 'DELETE' },
).catch(() => {})
},
[baseUrl, agentId, enabled],
)
const retry = useCallback(
(id: string) => {
if (!everSeenByServerRef.current.has(id)) {
// Optimistic-only entry, never made it to the server. Reset
// status so the user can press Send again.
setItems((prev) =>
prev.map((item) =>
item.id === id
? { ...item, status: 'queued', error: undefined }
: item,
),
)
return
}
if (!enabled || !baseUrl || !agentId) return
void fetch(
`${baseUrl}/claw/agents/${encodeURIComponent(agentId)}/queue/${encodeURIComponent(id)}/retry`,
{ method: 'POST' },
).catch(() => {})
},
[baseUrl, agentId, enabled],
)
return { queue: items, enqueue, cancel, retry }
}
function serverStatusToClient(
status: ServerQueuedItem['status'],
): OutboundMessageStatus {
if (status === 'dispatching') return 'sending'
if (status === 'failed') return 'failed'
return 'queued'
}
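The carry-forward rule inside the SSE handler above — keep optimistic rows the server has never seen, drop ids the server acknowledged once but no longer reports — reduces to a small pure function. A sketch under simplified types; `carryForward` and `LocalRow` are illustrative names, not part of the PR:

```typescript
interface LocalRow {
  id: string
  status: 'queued' | 'sending' | 'failed'
  error?: string
}

// snapshotIds: ids present in the current SSE snapshot.
// everSeen: ids the server has acknowledged in any earlier snapshot.
// Returns the locally-held rows that should still render alongside the
// snapshot: unseen in-flight rows stay, previously-seen-now-missing
// rows are treated as drained and dropped.
function carryForward(
  prev: LocalRow[],
  snapshotIds: Set<string>,
  everSeen: Set<string>,
): LocalRow[] {
  return prev.filter((local) => {
    if (snapshotIds.has(local.id)) return false // server owns it now
    if (everSeen.has(local.id)) return false // drained server-side
    return local.status !== 'failed' || Boolean(local.error)
  })
}
```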

View File

@@ -0,0 +1,42 @@
import { Bot, Cpu, Sparkles } from 'lucide-react'
import type { FC } from 'react'
import type { HarnessAgentAdapter } from './agent-harness-types'
/**
* Single icon component for any adapter the agent rail can render.
* Falls back to a generic bot when the adapter is unknown so future
* adapters land without a code change at the call site.
*/
interface AdapterIconProps {
adapter: HarnessAgentAdapter | 'unknown'
className?: string
}
export const AdapterIcon: FC<AdapterIconProps> = ({ adapter, className }) => {
switch (adapter) {
case 'claude':
// Claude Code — text-based agent, sparkles to evoke the "AI assistant" feel.
return <Sparkles className={className} aria-label="Claude Code" />
case 'codex':
// Codex — code-leaning, CPU mark.
return <Cpu className={className} aria-label="Codex" />
case 'openclaw':
// OpenClaw — bot/automation framing.
return <Bot className={className} aria-label="OpenClaw" />
default:
return <Bot className={className} aria-label="Agent" />
}
}
export function adapterLabel(adapter: HarnessAgentAdapter | 'unknown'): string {
switch (adapter) {
case 'claude':
return 'Claude Code'
case 'codex':
return 'Codex'
case 'openclaw':
return 'OpenClaw'
default:
return 'Agent'
}
}

View File

@@ -1,117 +1,108 @@
import { Bot, Cpu, Loader2, MessageSquare, Plus, Trash2 } from 'lucide-react'
import type { FC } from 'react'
import { Badge } from '@/components/ui/badge'
import { Button } from '@/components/ui/button'
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'
import { Loader2 } from 'lucide-react'
import { type FC, useMemo } from 'react'
import { AgentRowCard } from './AgentRowCard'
import { AgentsEmptyState } from './AgentsEmptyState'
import type { HarnessAgent, HarnessAgentAdapter } from './agent-harness-types'
import type { AgentListItem } from './agents-page-types'
import type { AgentLiveness } from './LivenessDot'
interface AgentListProps {
agents: AgentListItem[]
/**
* Optional per-agent activity metadata. Keyed by `agentId`. Missing
* entries fall back to status='unknown' / lastUsedAt=null and the
* row renders an "unknown" dot. The server will populate this once
* the activity tracker ships; the page works without it.
*/
activity?: Record<
string,
{ status: AgentLiveness; lastUsedAt: number | null }
>
/**
* Lookup table from harness agent id → adapter + reasoning effort,
* sourced from `useHarnessAgents`. Lets the row card render the
* correct adapter icon and chips for harness agents (legacy
* /claw/agents entries fall back to inferring from `runtimeLabel`).
*/
harnessAgentLookup?: Map<string, HarnessAgent>
loading: boolean
deletingAgentKey: string | null
onChatAgent: (agent: AgentListItem) => void
onCreateAgent: () => void
onDeleteAgent: (agent: AgentListItem) => void
}
export const AgentList: FC<AgentListProps> = ({
agents,
activity,
harnessAgentLookup,
loading,
deletingAgentKey,
onChatAgent,
onCreateAgent,
onDeleteAgent,
}) => {
// Sort by recency: most recently used first; never-used agents drop
// to the bottom in id-stable order so the list doesn't reshuffle on
// every refresh. The pinned exception is the gateway's `main` agent
// when it's never been touched — keep it at the top so a fresh
// install has an obvious starting point.
const ordered = useMemo(() => {
const withScore = agents.map((agent) => {
const lastUsedAt = activity?.[agent.agentId]?.lastUsedAt ?? null
return { agent, lastUsedAt }
})
return withScore
.sort((a, b) => {
const aPinned = a.agent.agentId === 'main' && a.lastUsedAt === null
const bPinned = b.agent.agentId === 'main' && b.lastUsedAt === null
if (aPinned && !bPinned) return -1
if (!aPinned && bPinned) return 1
const aValue = a.lastUsedAt ?? -Infinity
const bValue = b.lastUsedAt ?? -Infinity
if (aValue !== bValue) return bValue - aValue
return a.agent.agentId.localeCompare(b.agent.agentId)
})
.map((entry) => entry.agent)
}, [activity, agents])
if (loading && agents.length === 0) {
return (
<div className="flex h-36 items-center justify-center rounded-lg border border-border/70">
<div className="flex h-36 items-center justify-center rounded-xl border border-border border-dashed bg-card/50">
<Loader2 className="size-5 animate-spin text-muted-foreground" />
</div>
)
}
if (agents.length === 0) {
return (
<Card>
<CardContent className="flex h-48 flex-col items-center justify-center gap-4 text-center">
<div className="flex size-10 items-center justify-center rounded-lg bg-muted text-muted-foreground">
<Bot className="size-5" />
</div>
<div className="space-y-1">
<h2 className="font-medium text-base">No agents</h2>
<p className="text-muted-foreground text-sm">
Create an OpenClaw, Claude Code, or Codex agent.
</p>
</div>
<Button variant="outline" onClick={onCreateAgent}>
<Plus className="mr-2 size-4" />
New Agent
</Button>
</CardContent>
</Card>
)
return <AgentsEmptyState onCreateAgent={onCreateAgent} />
}
return (
<div className="grid gap-3">
{agents.map((agent) => (
<Card key={agent.key} className="rounded-lg border-border/70">
<CardHeader className="flex flex-row items-center justify-between gap-4 py-3">
<div className="flex min-w-0 items-center gap-3">
<div className="flex size-10 shrink-0 items-center justify-center rounded-lg bg-muted text-muted-foreground">
{agent.source === 'openclaw' ? (
<Cpu className="size-5" />
) : (
<Bot className="size-5" />
)}
</div>
<div className="min-w-0">
<CardTitle className="truncate text-base">
{agent.name}
</CardTitle>
<div className="mt-1 flex flex-wrap items-center gap-2 text-muted-foreground text-xs">
<Badge variant="outline" className="rounded-md">
{agent.runtimeLabel}
</Badge>
<span>{agent.modelLabel}</span>
<Badge variant="outline" className="rounded-md">
main
</Badge>
</div>
<p className="mt-1 truncate font-mono text-muted-foreground text-xs">
{agent.detail}
</p>
</div>
</div>
<div className="flex shrink-0 items-center gap-1">
<Button
variant="ghost"
size="sm"
onClick={() => onChatAgent(agent)}
disabled={!agent.canChat}
>
<MessageSquare className="mr-1 size-4" />
Chat
</Button>
{agent.canDelete ? (
<Button
variant="ghost"
size="icon"
title="Delete agent"
onClick={() => onDeleteAgent(agent)}
disabled={deletingAgentKey === agent.key}
>
{deletingAgentKey === agent.key ? (
<Loader2 className="size-4 animate-spin" />
) : (
<Trash2 className="size-4 text-destructive" />
)}
</Button>
) : null}
</div>
</CardHeader>
</Card>
))}
{ordered.map((agent) => {
const harness = harnessAgentLookup?.get(agent.agentId)
const adapter: HarnessAgentAdapter | undefined =
harness?.adapter ?? inferAdapterFromLabel(agent.runtimeLabel)
return (
<AgentRowCard
key={agent.key}
agent={agent}
status={activity?.[agent.agentId]?.status}
lastUsedAt={activity?.[agent.agentId]?.lastUsedAt}
adapter={adapter}
reasoningEffort={harness?.reasoningEffort ?? null}
onDelete={onDeleteAgent}
deleting={deletingAgentKey === agent.key}
/>
)
})}
</div>
)
}
function inferAdapterFromLabel(label: string): HarnessAgentAdapter | undefined {
const lower = label?.toLowerCase()
if (lower === 'claude code') return 'claude'
if (lower === 'codex') return 'codex'
if (lower === 'openclaw') return 'openclaw'
return undefined
}
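The ordering rule the `useMemo` above implements — an untouched `main` agent pinned first, then recency descending, then id-stable ordering for never-used agents — can be isolated as a pure function. A sketch with the entry shape flattened; `orderAgents` is an illustrative name:

```typescript
interface Entry {
  agentId: string
  lastUsedAt: number | null
}

function orderAgents(entries: Entry[]): Entry[] {
  return [...entries].sort((a, b) => {
    // A never-used `main` agent pins to the top as the obvious start.
    const aPinned = a.agentId === 'main' && a.lastUsedAt === null
    const bPinned = b.agentId === 'main' && b.lastUsedAt === null
    if (aPinned && !bPinned) return -1
    if (!aPinned && bPinned) return 1
    // Most recently used first; never-used agents sink to the bottom.
    const aValue = a.lastUsedAt ?? -Infinity
    const bValue = b.lastUsedAt ?? -Infinity
    if (aValue !== bValue) return bValue - aValue
    // Stable id order so the list doesn't reshuffle on refresh.
    return a.agentId.localeCompare(b.agentId)
  })
}
```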

View File

@@ -0,0 +1,270 @@
import {
Copy,
Loader2,
MessageSquare,
MoreHorizontal,
Pencil,
RotateCcw,
Trash2,
} from 'lucide-react'
import type { FC } from 'react'
import { useNavigate } from 'react-router'
import { toast } from 'sonner'
import { Badge } from '@/components/ui/badge'
import { Button } from '@/components/ui/button'
import {
DropdownMenu,
DropdownMenuContent,
DropdownMenuItem,
DropdownMenuSeparator,
DropdownMenuTrigger,
} from '@/components/ui/dropdown-menu'
import {
Tooltip,
TooltipContent,
TooltipProvider,
TooltipTrigger,
} from '@/components/ui/tooltip'
import { cn } from '@/lib/utils'
import { AdapterIcon, adapterLabel } from './AdapterIcon'
import {
canDelete as canDeleteAgent,
canRename as canRenameAgent,
displayName,
formatRelativeTime,
workspaceLabel,
} from './agent-display.helpers'
import type { HarnessAgentAdapter } from './agent-harness-types'
import type { AgentListItem } from './agents-page-types'
import { type AgentLiveness, LivenessDot } from './LivenessDot'
interface AgentRowCardProps {
agent: AgentListItem
/**
* Per-agent extras the listing surface provides on top of the
* minimal `AgentListItem` shape. `lastUsedAt` survives server
* restart (sourced from acpx session record); `status` is in-memory
* server-side.
*/
status?: AgentLiveness
lastUsedAt?: number | null
/** Adapter the agent belongs to. Drives icon + label. */
adapter?: HarnessAgentAdapter
/** Reasoning effort chip (claude/codex/openclaw catalog). */
reasoningEffort?: string | null
/** Modeled directly off the inbound delete handler so the parent owns the dialog. */
onDelete: (agent: AgentListItem) => void
/** Whether THIS agent is mid-delete; renders a spinner in place of the trash icon. */
deleting?: boolean
}
export const AgentRowCard: FC<AgentRowCardProps> = ({
agent,
status = 'unknown',
lastUsedAt,
adapter,
reasoningEffort,
onDelete,
deleting,
}) => {
const navigate = useNavigate()
const adapterId = adapter ?? inferAdapterFromListItem(agent)
const workspace = workspaceLabel(agent)
const lastUsedLabel = formatRelativeTime(lastUsedAt ?? null)
const allowDelete = canDeleteAgent(agent)
const allowRename = canRenameAgent(agent)
const handleChat = () => navigate(`/agents/${agent.agentId}`)
const handleCopyId = async () => {
try {
await navigator.clipboard.writeText(agent.agentId)
toast.success('Agent id copied')
} catch {
toast.error('Could not copy agent id')
}
}
return (
<div
className={cn(
'group rounded-xl border border-border bg-card p-4 shadow-sm transition-all',
'hover:border-[var(--accent-orange)]/50 hover:shadow-sm',
)}
>
<div className="flex items-start gap-4">
{/* Adapter tile + liveness dot in the corner. */}
<div className="relative shrink-0">
<div className="flex h-12 w-12 items-center justify-center rounded-xl bg-muted text-muted-foreground">
<AdapterIcon adapter={adapterId} className="h-6 w-6" />
</div>
<LivenessDot
status={status}
detail={livenessDetail(status, lastUsedAt)}
className="absolute -right-0.5 -bottom-0.5"
/>
</div>
<div className="min-w-0 flex-1">
<div className="mb-1 flex items-center gap-2">
<span className="truncate font-semibold">{displayName(agent)}</span>
{status === 'working' && (
<Badge
variant="secondary"
className="bg-amber-50 text-amber-900 hover:bg-amber-50"
>
Working
</Badge>
)}
{status === 'asleep' && (
<Badge variant="outline" className="text-muted-foreground">
Asleep
</Badge>
)}
{status === 'error' && (
<Badge variant="destructive">Attention</Badge>
)}
</div>
<div className="mb-2 flex flex-wrap items-center gap-1.5 text-xs">
<Badge variant="secondary" className="font-normal">
{adapterLabel(adapterId)}
</Badge>
{agent.modelLabel && agent.modelLabel !== 'default' && (
<Badge variant="outline" className="font-normal">
{agent.modelLabel}
</Badge>
)}
{reasoningEffort && reasoningEffort !== 'medium' && (
<Badge variant="outline" className="font-normal">
{reasoningEffort}
</Badge>
)}
</div>
<div className="flex flex-wrap items-center gap-2 text-muted-foreground text-xs">
<span>Last used {lastUsedLabel}</span>
{workspace && (
<>
                <span aria-hidden>·</span>
<span className="truncate font-mono" title={workspace}>
{workspace}
</span>
</>
)}
</div>
</div>
<div className="flex shrink-0 items-center gap-2">
<Button variant="outline" size="sm" onClick={handleChat}>
<MessageSquare className="mr-1.5 h-3 w-3" />
Chat
</Button>
<DropdownMenu>
<DropdownMenuTrigger asChild>
<Button
variant="ghost"
size="icon"
aria-label={`More actions for ${displayName(agent)}`}
className="h-8 w-8"
>
<MoreHorizontal className="h-4 w-4" />
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent align="end" className="w-44">
<DropdownMenuItem onSelect={() => void handleCopyId()}>
<Copy className="mr-2 h-3.5 w-3.5" />
Copy id
</DropdownMenuItem>
<RenameMenuItem disabled={!allowRename} />
<ResetHistoryMenuItem />
<DropdownMenuSeparator />
<DropdownMenuItem
onSelect={() => onDelete(agent)}
disabled={!allowDelete || deleting}
className="text-destructive focus:text-destructive"
>
{deleting ? (
<Loader2 className="mr-2 h-3.5 w-3.5 animate-spin" />
) : (
<Trash2 className="mr-2 h-3.5 w-3.5" />
)}
Delete
</DropdownMenuItem>
</DropdownMenuContent>
</DropdownMenu>
</div>
</div>
</div>
)
}
const RenameMenuItem: FC<{ disabled: boolean }> = ({ disabled }) => {
const item = (
<DropdownMenuItem disabled className="text-muted-foreground">
<Pencil className="mr-2 h-3.5 w-3.5" />
Rename
</DropdownMenuItem>
)
if (!disabled) return item
// Disabled but with a hint so users know it's coming, not broken.
return (
<TooltipProvider delayDuration={300}>
<Tooltip>
<TooltipTrigger asChild>
<span className="block w-full">{item}</span>
</TooltipTrigger>
<TooltipContent side="left" className="text-xs">
Rename coming soon
</TooltipContent>
</Tooltip>
</TooltipProvider>
)
}
const ResetHistoryMenuItem: FC = () => {
const item = (
<DropdownMenuItem disabled className="text-muted-foreground">
<RotateCcw className="mr-2 h-3.5 w-3.5" />
Reset history
</DropdownMenuItem>
)
return (
<TooltipProvider delayDuration={300}>
<Tooltip>
<TooltipTrigger asChild>
<span className="block w-full">{item}</span>
</TooltipTrigger>
<TooltipContent side="left" className="text-xs">
Reset history coming soon
</TooltipContent>
</Tooltip>
</TooltipProvider>
)
}
function inferAdapterFromListItem(
agent: AgentListItem,
): HarnessAgentAdapter | 'unknown' {
const label = agent.runtimeLabel?.toLowerCase()
if (label?.includes('claude')) return 'claude'
if (label?.includes('codex')) return 'codex'
if (label?.includes('openclaw')) return 'openclaw'
return 'unknown'
}
function livenessDetail(
status: AgentLiveness,
lastUsedAt: number | null | undefined,
): string | undefined {
if (lastUsedAt == null) return undefined
const diffMin = Math.floor((Date.now() - lastUsedAt) / 60_000)
if (status === 'idle') return `Idle for ${Math.max(0, diffMin)} min`
if (status === 'asleep') {
if (diffMin < 60) return `Asleep — quiet for ${diffMin} min`
const hr = Math.floor(diffMin / 60)
return `Asleep — quiet for ${hr} hr`
}
if (status === 'working') return 'Working on a turn'
if (status === 'error') return 'Attention — last turn failed'
return undefined
}

View File

@@ -0,0 +1,32 @@
import { Bot, Plus } from 'lucide-react'
import type { FC } from 'react'
import { Button } from '@/components/ui/button'
interface AgentsEmptyStateProps {
onCreateAgent: () => void
}
export const AgentsEmptyState: FC<AgentsEmptyStateProps> = ({
onCreateAgent,
}) => {
return (
<div className="rounded-xl border border-border border-dashed bg-card/50 p-12 text-center">
<div className="mx-auto mb-4 flex h-12 w-12 items-center justify-center rounded-xl bg-[var(--accent-orange)]/10">
<Bot className="h-6 w-6 text-[var(--accent-orange)]" />
</div>
<h3 className="mb-1 font-semibold">No agents yet</h3>
<p className="mx-auto mb-4 max-w-sm text-muted-foreground text-sm">
Spin up an OpenClaw, Claude Code, or Codex agent to chat with, schedule,
or run in the background.
</p>
<Button
onClick={onCreateAgent}
variant="outline"
className="border-[var(--accent-orange)] bg-[var(--accent-orange)]/10 text-[var(--accent-orange)] hover:bg-[var(--accent-orange)]/20 hover:text-[var(--accent-orange)]"
>
<Plus className="mr-1.5 h-4 w-4" />
Create your first agent
</Button>
</div>
)
}

View File

@@ -0,0 +1,41 @@
import { Bot, Plus } from 'lucide-react'
import type { FC } from 'react'
import { Button } from '@/components/ui/button'
interface AgentsHeaderProps {
onCreateAgent: () => void
}
/**
* Mirrors the visual shape of `SoulHeader` and `ScheduledTasksHeader`
* so the page reads as part of the same family. Loose lifecycle
* controls that used to sit next to the title moved into
* `GatewayStatusBar` — they're OpenClaw-specific and don't apply to
* Claude/Codex agents.
*/
export const AgentsHeader: FC<AgentsHeaderProps> = ({ onCreateAgent }) => {
return (
<div className="rounded-xl border border-border bg-card p-6 shadow-sm transition-all hover:shadow-md">
<div className="flex items-start gap-4">
<div className="flex h-12 w-12 shrink-0 items-center justify-center rounded-xl bg-[var(--accent-orange)]/10">
<Bot className="h-6 w-6 text-[var(--accent-orange)]" />
</div>
<div className="flex-1">
<h2 className="mb-1 font-semibold text-xl">Agents</h2>
          <p className="text-muted-foreground text-sm">
            Chat with, schedule, and run OpenClaw, Claude Code, and Codex
            agents in the background.
          </p>
</div>
<Button
onClick={onCreateAgent}
className="border-[var(--accent-orange)] bg-[var(--accent-orange)]/10 text-[var(--accent-orange)] hover:bg-[var(--accent-orange)]/20 hover:text-[var(--accent-orange)]"
variant="outline"
>
<Plus className="mr-1.5 h-4 w-4" />
New Agent
</Button>
</div>
</div>
)
}

View File

@@ -3,8 +3,9 @@ import { type FC, useMemo, useState } from 'react'
import { useNavigate } from 'react-router'
import { useLlmProviders } from '@/lib/llm-providers/useLlmProviders'
import { AgentList } from './AgentList'
import { AgentsHeader } from './AgentsHeader'
import { AgentTerminal } from './AgentTerminal'
import type { HarnessAgentAdapter } from './agent-harness-types'
import type { HarnessAgent, HarnessAgentAdapter } from './agent-harness-types'
import { createAgentPageActions } from './agents-page-actions'
import {
useDefaultAgentName,
@@ -29,9 +30,9 @@ import {
toHarnessListItem,
toOpenClawListItem,
} from './agents-page-utils'
import { GatewayStatusBar } from './GatewayStatusBar'
import { NewAgentDialog } from './NewAgentDialog'
import {
AgentsPageHeader,
ControlPlaneAlert,
GatewayStateCards,
InlineErrorAlert,
@@ -44,42 +45,35 @@ import {
useDeleteHarnessAgent,
useHarnessAgents,
} from './useAgents'
import {
useOpenClawAgents,
useOpenClawMutations,
useOpenClawStatus,
} from './useOpenClaw'
import { useOpenClawAgents, useOpenClawMutations } from './useOpenClaw'
export const AgentsPage: FC = () => {
const navigate = useNavigate()
const {
status,
loading: statusLoading,
error: statusError,
refetch: refetchStatus,
} = useOpenClawStatus()
const { providers, defaultProviderId } = useLlmProviders()
const {
adapters,
loading: adaptersLoading,
error: adaptersError,
refetch: refetchAdapters,
} = useAgentAdapters()
// The harness listing now carries the gateway lifecycle snapshot
// alongside the agents — one polling source for everything the
// agents page renders. The legacy `/claw/status` poll is dead from
// this surface; the chat-panel layout still uses it for now.
const {
harnessAgents,
gateway: status,
loading: harnessAgentsLoading,
error: harnessAgentsError,
} = useHarnessAgents()
const openClawAgentsEnabled =
status?.status === 'running' && status.controlPlaneStatus === 'connected'
const {
agents: openClawAgents,
loading: openClawAgentsLoading,
error: openClawAgentsError,
refetch: refetchOpenClawAgents,
} = useOpenClawAgents(openClawAgentsEnabled)
const {
harnessAgents,
loading: harnessAgentsLoading,
error: harnessAgentsError,
refetch: refetchHarnessAgents,
} = useHarnessAgents()
const createHarnessAgent = useCreateHarnessAgent()
const deleteHarnessAgent = useDeleteHarnessAgent()
const {
@@ -87,7 +81,6 @@ export const AgentsPage: FC = () => {
createAgent: createOpenClawAgent,
deleteAgent: deleteOpenClawAgent,
startOpenClaw,
stopOpenClaw,
restartOpenClaw,
reconnectOpenClaw,
actionInProgress,
@@ -158,42 +151,68 @@ export const AgentsPage: FC = () => {
openClawAgentsEnabled,
openClawAgents,
)
const agentListItems = useMemo(
() => [
...visibleOpenClawAgents.map((agent) =>
const agentListItems = useMemo(() => {
// Dual-created OpenClaw agents (and the backfilled `main`/orphans
// post Step 9) live in both `/claw/agents` and `/agents` under the
// same id. Prefer the harness entry — it carries adapter/model/
// reasoning/lastUsedAt/status that the chat path actually uses —
// and drop the legacy duplicate so the rail doesn't show every
// OpenClaw agent twice.
const harnessIds = new Set(harnessAgents.map((agent) => agent.id))
const dedupedOpenClawAgents = visibleOpenClawAgents.filter(
(agent) => !harnessIds.has(agent.agentId),
)
return [
...dedupedOpenClawAgents.map((agent) =>
toOpenClawListItem(agent, openClawManageable),
),
...harnessAgents.map(toHarnessListItem),
],
[harnessAgents, openClawManageable, visibleOpenClawAgents],
)
]
}, [harnessAgents, openClawManageable, visibleOpenClawAgents])
// Lookup map so AgentList can render adapter chips, reasoning, etc.
// Computed up here to keep all hooks above the early returns below.
const harnessAgentLookup = useMemo(() => {
const map = new Map<string, HarnessAgent>()
for (const agent of harnessAgents) map.set(agent.id, agent)
return map
}, [harnessAgents])
// Activity map keyed by agentId. Sourced from the harness listing's
// server-side enrichment (`status` + `lastUsedAt`). Legacy gateway
  // agents that don't have a harness record yet (rare post-backfill)
  // are simply absent from the map and render with the default
  // `unknown` dot until reconciliation picks them up.
const agentActivity = useMemo(() => {
const map: Record<
string,
{
status: 'working' | 'idle' | 'asleep' | 'error'
lastUsedAt: number | null
}
> = {}
for (const agent of harnessAgents) {
if (!agent.status) continue
map[agent.id] = {
status: agent.status,
lastUsedAt: agent.lastUsedAt ?? null,
}
}
return map
}, [harnessAgents])
const inlineError = getInlineError({
lifecyclePending,
pageError,
statusError,
openClawAgentsError,
adaptersError,
harnessAgentsError,
})
const agentsLoading = getAgentsLoading({
statusLoading,
adaptersLoading,
harnessAgentsLoading,
openClawAgentsEnabled,
openClawAgentsLoading,
})
const creatingAgent = creatingOpenClawAgent || createHarnessAgent.isPending
const deletingAgent = deletingOpenClawAgent || deleteHarnessAgent.isPending
const refreshAll = async () => {
await Promise.all([
refetchStatus(),
refetchAdapters(),
refetchHarnessAgents(),
openClawAgentsEnabled ? refetchOpenClawAgents() : Promise.resolve(),
])
}
const handleHarnessAdapterChange = (adapter: HarnessAgentAdapter) => {
const descriptor = adapters.find((entry) => entry.id === adapter)
setHarnessAdapterId(adapter)
@@ -239,7 +258,9 @@ export const AgentsPage: FC = () => {
)
}
if (statusLoading && !status) {
// First-paint loader: until the harness listing has resolved at
// least once we don't know which adapters / agents to render.
if (harnessAgentsLoading && !status) {
return (
<div className="flex items-center justify-center py-20">
<Loader2 className="size-6 animate-spin text-muted-foreground" />
@@ -255,27 +276,18 @@ export const AgentsPage: FC = () => {
const recoveryDetail = status ? getRecoveryDetail(status) : null
const controlPlaneCopy = getControlPlaneCopyForStatus(status)
// Bar only makes sense when the gateway is meaningfully alive AND
// there's at least one OpenClaw agent in the merged list. Hide it
// for Claude/Codex-only setups so the page stays uncluttered.
const showGatewayStatusBar =
status?.status === 'running' &&
(visibleOpenClawAgents.length > 0 ||
harnessAgents.some((agent) => agent.adapter === 'openclaw'))
return (
<div className="min-h-full bg-background px-6 py-8">
<div className="mx-auto flex w-full max-w-5xl flex-col gap-6">
<AgentsPageHeader
actionInProgress={actionInProgress}
controlPlaneBusy={gatewayUiState.controlPlaneBusy}
reconnecting={reconnecting}
status={status}
onCreateAgent={() => setCreateOpen(true)}
onOpenTerminal={() => setShowTerminal(true)}
onReconnect={() => {
void runWithPageErrorHandling(reconnectOpenClaw)
}}
onRefresh={() => void refreshAll()}
onRestart={() => {
void runWithPageErrorHandling(restartOpenClaw)
}}
onStop={() => {
void runWithPageErrorHandling(stopOpenClaw)
}}
/>
<div className="fade-in slide-in-from-bottom-5 mx-auto flex w-full max-w-5xl animate-in flex-col gap-6 duration-500">
<AgentsHeader onCreateAgent={() => setCreateOpen(true)} />
{lifecycleBanner ? <LifecycleAlert message={lifecycleBanner} /> : null}
@@ -315,11 +327,23 @@ export const AgentsPage: FC = () => {
}}
/>
{showGatewayStatusBar ? (
<GatewayStatusBar
status={status}
actionInProgress={actionInProgress}
onOpenTerminal={() => setShowTerminal(true)}
onRestart={() => {
void runWithPageErrorHandling(restartOpenClaw)
}}
/>
) : null}
<AgentList
agents={agentListItems}
activity={agentActivity}
harnessAgentLookup={harnessAgentLookup}
loading={agentsLoading}
deletingAgentKey={deletingAgent ? deletingAgentKey : null}
onChatAgent={(agent) => navigate(`/agents/${agent.agentId}`)}
onCreateAgent={() => setCreateOpen(true)}
onDeleteAgent={(agent) => {
void handleDelete(agent)


@@ -0,0 +1,206 @@
import { Loader2, RotateCcw, Terminal } from 'lucide-react'
import type { FC, ReactNode } from 'react'
import { Badge } from '@/components/ui/badge'
import { Button } from '@/components/ui/button'
import { Separator } from '@/components/ui/separator'
import {
Tooltip,
TooltipContent,
TooltipProvider,
TooltipTrigger,
} from '@/components/ui/tooltip'
import { cn } from '@/lib/utils'
import type { OpenClawStatus } from './useOpenClaw'
interface GatewayStatusBarProps {
status: OpenClawStatus | null
/** Disabled while a gateway lifecycle mutation is mid-flight. */
actionInProgress: boolean
onOpenTerminal: () => void
onRestart: () => void
}
/**
* Compact one-line status bar for the OpenClaw gateway. Renders the
* lifecycle pills (Running / Control plane connected) plus a Terminal
* escape hatch and a Restart Gateway action. Lives between the page
* header and the agent list when at least one OpenClaw agent is in
* the merged list; collapses to nothing for Claude/Codex-only setups.
*
 * Status is sourced from the `gateway` field of `GET /agents` — the
 * agents page no longer polls `/claw/status` directly. One endpoint,
 * one 5s interval, no duplicate state.
*/
export const GatewayStatusBar: FC<GatewayStatusBarProps> = ({
status,
actionInProgress,
onOpenTerminal,
onRestart,
}) => {
if (!status) return null
const runningPill = pillForRuntimeStatus(status.status)
const controlPlanePill = pillForControlPlane(status.controlPlaneStatus)
return (
<div className="rounded-xl border border-border bg-card px-4 py-3 shadow-sm">
<div className="flex items-center gap-3 text-sm">
<span className="font-medium text-muted-foreground">
OpenClaw gateway
</span>
<Badge
variant={runningPill.variant}
className={cn('gap-1.5', runningPill.className)}
>
<span
className={cn(
'inline-block h-1.5 w-1.5 rounded-full',
runningPill.dot,
)}
/>
{runningPill.label}
</Badge>
<Badge
variant={controlPlanePill.variant}
className={cn('gap-1.5', controlPlanePill.className)}
>
<span
className={cn(
'inline-block h-1.5 w-1.5 rounded-full',
controlPlanePill.dot,
)}
/>
{controlPlanePill.label}
</Badge>
<Separator orientation="vertical" className="h-4" />
<WithTooltip label="Open a shell into the OpenClaw gateway container for raw CLI access (config edits, session inspection).">
<Button variant="ghost" size="sm" onClick={onOpenTerminal}>
<Terminal className="mr-1.5 h-3.5 w-3.5" />
Terminal
</Button>
</WithTooltip>
<WithTooltip label="Restart the OpenClaw gateway. Useful when the gateway is stuck or after editing provider config.">
<Button
variant="ghost"
size="sm"
onClick={onRestart}
disabled={actionInProgress}
className="ml-auto"
>
{actionInProgress ? (
<Loader2 className="mr-1.5 h-3.5 w-3.5 animate-spin" />
) : (
<RotateCcw className="mr-1.5 h-3.5 w-3.5" />
)}
Restart Gateway
</Button>
</WithTooltip>
</div>
</div>
)
}
const WithTooltip: FC<{ label: string; children: ReactNode }> = ({
label,
children,
}) => (
<TooltipProvider delayDuration={250}>
<Tooltip>
<TooltipTrigger asChild>{children}</TooltipTrigger>
<TooltipContent side="bottom" className="max-w-xs text-xs">
{label}
</TooltipContent>
</Tooltip>
</TooltipProvider>
)
type PillKind = {
variant: 'default' | 'secondary' | 'outline' | 'destructive'
label: string
dot: string
className?: string
}
function pillForRuntimeStatus(status: OpenClawStatus['status']): PillKind {
switch (status) {
case 'running':
return {
variant: 'secondary',
label: 'Running',
dot: 'bg-emerald-500',
className: 'bg-emerald-50 text-emerald-900 hover:bg-emerald-50',
}
case 'starting':
return {
variant: 'secondary',
label: 'Starting',
dot: 'bg-amber-500 animate-pulse',
className: 'bg-amber-50 text-amber-900 hover:bg-amber-50',
}
case 'stopped':
return {
variant: 'outline',
label: 'Stopped',
dot: 'bg-muted-foreground/40',
}
case 'error':
return {
variant: 'destructive',
label: 'Error',
dot: 'bg-destructive-foreground',
}
default:
return {
variant: 'outline',
label: 'Unknown',
dot: 'bg-muted-foreground/40',
}
}
}
function pillForControlPlane(
status: OpenClawStatus['controlPlaneStatus'],
): PillKind {
switch (status) {
case 'connected':
return {
variant: 'secondary',
label: 'Control plane connected',
dot: 'bg-emerald-500',
className: 'bg-emerald-50 text-emerald-900 hover:bg-emerald-50',
}
case 'connecting':
return {
variant: 'secondary',
label: 'Connecting',
dot: 'bg-amber-500 animate-pulse',
className: 'bg-amber-50 text-amber-900 hover:bg-amber-50',
}
case 'reconnecting':
return {
variant: 'secondary',
label: 'Reconnecting',
dot: 'bg-amber-500 animate-pulse',
className: 'bg-amber-50 text-amber-900 hover:bg-amber-50',
}
case 'recovering':
return {
variant: 'secondary',
label: 'Recovering',
dot: 'bg-amber-500 animate-pulse',
className: 'bg-amber-50 text-amber-900 hover:bg-amber-50',
}
case 'failed':
return {
variant: 'destructive',
label: 'Needs attention',
dot: 'bg-destructive-foreground',
}
default:
return {
variant: 'outline',
label: 'Disconnected',
dot: 'bg-muted-foreground/40',
}
}
}


@@ -0,0 +1,83 @@
import type { FC } from 'react'
import {
Tooltip,
TooltipContent,
TooltipProvider,
TooltipTrigger,
} from '@/components/ui/tooltip'
import { cn } from '@/lib/utils'
export type AgentLiveness = 'working' | 'idle' | 'asleep' | 'error' | 'unknown'
interface LivenessDotProps {
status: AgentLiveness
/**
* Optional human-friendly secondary line, e.g. "Idle for 4 min" or
* "Asleep — no activity for 22 min". When absent the tooltip just
* reads the status label.
*/
detail?: string
className?: string
}
const VARIANT: Record<
AgentLiveness,
{ dot: string; ring: string; label: string }
> = {
working: {
// Animated amber pulse + soft halo so the eye catches an active
// agent in a long list without the dot screaming for attention.
dot: 'bg-amber-500 animate-pulse',
ring: 'ring-2 ring-amber-200',
label: 'Working on a turn',
},
idle: {
dot: 'bg-emerald-500',
ring: 'ring-2 ring-emerald-100',
label: 'Idle',
},
asleep: {
dot: 'bg-muted-foreground/40',
ring: 'ring-2 ring-muted',
label: 'Asleep',
},
error: {
dot: 'bg-destructive',
ring: 'ring-2 ring-destructive/30',
label: 'Attention',
},
unknown: {
dot: 'bg-muted-foreground/30',
ring: 'ring-2 ring-muted',
label: 'Status unknown',
},
}
export const LivenessDot: FC<LivenessDotProps> = ({
status,
detail,
className,
}) => {
const variant = VARIANT[status]
return (
<TooltipProvider delayDuration={150}>
<Tooltip>
<TooltipTrigger asChild>
<span
role="img"
aria-label={detail ?? variant.label}
className={cn(
'inline-block h-3 w-3 rounded-full',
variant.dot,
variant.ring,
className,
)}
/>
</TooltipTrigger>
<TooltipContent side="right" className="text-xs">
{detail ?? variant.label}
</TooltipContent>
</Tooltip>
</TooltipProvider>
)
}


@@ -154,7 +154,6 @@ export const NewAgentDialog: FC<NewAgentDialogProps> = ({
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem value="openclaw">OpenClaw</SelectItem>
{adapters.map((adapter) => (
<SelectItem key={adapter.id} value={adapter.id}>
{adapter.name}


@@ -0,0 +1,84 @@
import type { AgentListItem } from './agents-page-types'
/**
* Display rules for the redesigned agent rows. Pure helpers — no React,
* no API calls — so they're trivial to unit-test and the row card stays
* focused on layout.
*/
const UUID_PATTERN =
/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i
const OC_UUID_PATTERN =
/^oc-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i
/**
* The agent rail used to render whatever the gateway returned for `name`.
* Post-migration that's frequently the agent's UUID — readable to nobody.
* Prefer the explicit `name` when it differs meaningfully from the id;
 * otherwise fall back to a short prefix users can recognize at a
 * glance.
*/
export function displayName(agent: AgentListItem): string {
const name = agent.name?.trim()
const id = agent.agentId
if (!name || name === id) {
if (OC_UUID_PATTERN.test(id)) return id.slice(0, 11) // "oc-XXXXXXXX"
if (UUID_PATTERN.test(id)) return id.slice(0, 8)
return id
}
return name
}
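
The prefix rules above can be exercised in isolation. The sketch below re-declares the fallback locally for illustration (the patterns and slice lengths mirror the diff; `shortDisplayName` is a hypothetical standalone name, not the exported helper):

```typescript
// Illustrative sketch of the display-name fallback: an explicit name wins,
// otherwise a UUID-shaped id collapses to a short recognizable prefix.
const UUID_PATTERN =
  /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i
const OC_UUID_PATTERN =
  /^oc-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i

function shortDisplayName(name: string | undefined, id: string): string {
  const trimmed = name?.trim()
  if (!trimmed || trimmed === id) {
    if (OC_UUID_PATTERN.test(id)) return id.slice(0, 11) // "oc-" + 8 hex chars
    if (UUID_PATTERN.test(id)) return id.slice(0, 8) // first UUID group
    return id
  }
  return trimmed
}
```

An "oc-" id like `oc-12345678-1234-1234-1234-123456789abc` renders as `oc-12345678`; a bare UUID renders as its first eight hex characters.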
export function canDelete(agent: AgentListItem): boolean {
// The gateway's protected `main` agent must not be deletable. The
// server enforces this too, but disabling the menu item avoids users
// hitting an opaque 400.
if (agent.agentId === 'main') return false
return agent.canDelete
}
/**
* Rename will be wired to a future `PATCH /agents/:id` endpoint. The
 * legacy `/claw/agents` create flow named the agent on the gateway via
 * the `name` field, but the field isn't editable post-create today.
*/
export function canRename(_agent: AgentListItem): boolean {
return false
}
/**
* The detail line carries the agent's workspace path. The `detail`
* field on AgentListItem already holds it for OpenClaw entries
* (`/home/node/.openclaw/workspace-...`); for harness agents it's the
* synthetic `<adapter>:main` marker that's not informative — hide it.
*/
export function workspaceLabel(agent: AgentListItem): string | null {
if (!agent.detail) return null
if (/^(claude|codex|openclaw):main$/.test(agent.detail)) return null
return agent.detail
}
const ONE_MINUTE = 60_000
const ONE_HOUR = 60 * ONE_MINUTE
const ONE_DAY = 24 * ONE_HOUR
/**
* Lightweight relative-time formatter. We don't want to drag in
* `dayjs/relativeTime` just for a few labels.
*/
export function formatRelativeTime(epochMs: number | null): string {
if (epochMs === null || !Number.isFinite(epochMs)) return 'never'
const diff = Math.max(0, Date.now() - epochMs)
if (diff < ONE_MINUTE) return 'just now'
if (diff < ONE_HOUR) {
const m = Math.floor(diff / ONE_MINUTE)
return `${m} min ago`
}
if (diff < ONE_DAY) {
const h = Math.floor(diff / ONE_HOUR)
return h === 1 ? '1 hr ago' : `${h} hr ago`
}
const d = Math.floor(diff / ONE_DAY)
return d === 1 ? '1 day ago' : `${d} days ago`
}
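
The bucket thresholds above can be checked with a small sketch. This re-implementation mirrors the diff's `formatRelativeTime` but takes an injectable `now` parameter (an addition for testability; the real helper reads `Date.now()` directly):

```typescript
// Illustrative re-declaration of the relative-time buckets: minute,
// hour, and day thresholds, with singular/plural handling.
const ONE_MINUTE = 60_000
const ONE_HOUR = 60 * ONE_MINUTE
const ONE_DAY = 24 * ONE_HOUR

function formatRelativeTime(
  epochMs: number | null,
  now: number = Date.now(),
): string {
  if (epochMs === null || !Number.isFinite(epochMs)) return 'never'
  const diff = Math.max(0, now - epochMs) // clamp future timestamps to "just now"
  if (diff < ONE_MINUTE) return 'just now'
  if (diff < ONE_HOUR) return `${Math.floor(diff / ONE_MINUTE)} min ago`
  if (diff < ONE_DAY) {
    const h = Math.floor(diff / ONE_HOUR)
    return h === 1 ? '1 hr ago' : `${h} hr ago`
  }
  const d = Math.floor(diff / ONE_DAY)
  return d === 1 ? '1 day ago' : `${d} days ago`
}
```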


@@ -1,6 +1,6 @@
import type { AgentEntry } from './useOpenClaw'
export type HarnessAgentAdapter = 'claude' | 'codex'
export type HarnessAgentAdapter = 'claude' | 'codex' | 'openclaw'
export type AgentHarnessStreamEvent =
| {
@@ -33,6 +33,8 @@ export type AgentHarnessStreamEvent =
code?: string
}
export type HarnessAgentLiveness = 'working' | 'idle' | 'asleep' | 'error'
export interface HarnessAgent {
id: string
name: string
@@ -43,6 +45,17 @@ export interface HarnessAgent {
sessionKey: string
createdAt: number
updatedAt: number
/**
* Server-derived liveness state. When the listing endpoint hasn't
* been enriched yet (older deployments) this is undefined and the UI
* falls back to `unknown`.
*/
status?: HarnessAgentLiveness
/**
* Wall-clock ms of the last persisted turn. `null` for never-used
* agents. Drives the recency sort and the "Last used X min ago" copy.
*/
lastUsedAt?: number | null
}
export interface HarnessAdapterDescriptor {
@@ -62,19 +75,36 @@ export interface CreateHarnessAgentInput {
reasoningEffort?: string
}
export interface HarnessTranscriptEntry {
export interface HarnessHistoryReasoning {
text: string
durationMs?: number
}
export interface HarnessHistoryToolCall {
toolCallId?: string
toolName: string
status: 'pending' | 'running' | 'completed' | 'failed'
input?: unknown
output?: unknown
error?: string
durationMs?: number
}
export interface HarnessHistoryEntry {
id: string
agentId: string
sessionId: 'main'
role: 'user' | 'assistant'
text: string
createdAt: number
reasoning?: HarnessHistoryReasoning
toolCalls?: HarnessHistoryToolCall[]
}
export interface HarnessAgentHistoryPage {
agentId: string
sessionId: 'main'
items: HarnessTranscriptEntry[]
items: HarnessHistoryEntry[]
}
export function mapHarnessAgentToEntry(agent: HarnessAgent): AgentEntry {


@@ -138,24 +138,20 @@ export function getVisibleOpenClawAgents(
}
export function getAgentsLoading(input: {
statusLoading: boolean
adaptersLoading: boolean
harnessAgentsLoading: boolean
openClawAgentsEnabled: boolean
openClawAgentsLoading: boolean
}): boolean {
return (
input.statusLoading ||
input.adaptersLoading ||
input.harnessAgentsLoading ||
(input.openClawAgentsEnabled && input.openClawAgentsLoading)
input.openClawAgentsLoading
)
}
export function getInlineError(input: {
lifecyclePending: boolean
pageError: string | null
statusError: Error | null
openClawAgentsError: Error | null
adaptersError: Error | null
harnessAgentsError: Error | null
@@ -163,7 +159,6 @@ export function getInlineError(input: {
if (input.lifecyclePending) return null
return (
input.pageError ??
input.statusError?.message ??
input.openClawAgentsError?.message ??
input.adaptersError?.message ??
input.harnessAgentsError?.message ??


@@ -10,6 +10,17 @@ import {
type HarnessAgentHistoryPage,
mapHarnessAgentToEntry,
} from './agent-harness-types'
import type { OpenClawStatus } from './useOpenClaw'
/**
* Combined response shape of `GET /agents`. The page polls this once
* and consumes both fields, replacing the dedicated `/claw/status`
* poll the previous design carried.
*/
interface HarnessAgentsResponse {
agents: HarnessAgent[]
gateway: OpenClawStatus | null
}
export type { AgentHarnessStreamEvent }
@@ -69,21 +80,31 @@ export function useHarnessAgents(enabled = true) {
error: urlError,
} = useAgentServerUrl()
const query = useQuery<HarnessAgent[], Error>({
const query = useQuery<HarnessAgentsResponse, Error>({
queryKey: [AGENT_QUERY_KEYS.agents, baseUrl],
queryFn: async () => {
const data = await agentsFetch<{ agents: HarnessAgent[] }>(
const data = await agentsFetch<HarnessAgentsResponse>(
baseUrl as string,
'/',
)
return data.agents ?? []
return {
agents: data.agents ?? [],
gateway: data.gateway ?? null,
}
},
enabled: Boolean(baseUrl) && !urlLoading && enabled,
// Poll every 5s so the per-agent liveness state (working / idle /
// asleep / error) and last-used timestamps stay fresh without a
// websocket. `refetchIntervalInBackground: false` lets a hidden
// tab go quiet — react-query's default, made explicit.
refetchInterval: 5_000,
refetchIntervalInBackground: false,
})
return {
agents: (query.data ?? []).map(mapHarnessAgentToEntry),
harnessAgents: query.data ?? [],
agents: (query.data?.agents ?? []).map(mapHarnessAgentToEntry),
harnessAgents: query.data?.agents ?? [],
gateway: query.data?.gateway ?? null,
loading: query.isLoading || urlLoading,
error: query.error ?? urlError,
refetch: query.refetch,
@@ -141,16 +162,95 @@ export async function chatWithHarnessAgent(
agentId: string,
message: string,
signal?: AbortSignal,
attachments?: ReadonlyArray<unknown>,
): Promise<Response> {
const baseUrl = await getAgentServerUrl()
return fetch(`${baseUrl}/agents/${encodeURIComponent(agentId)}/chat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ message }),
body: JSON.stringify({
message,
...(attachments && attachments.length > 0 ? { attachments } : {}),
}),
signal,
})
}
/**
* Subscribe to an existing turn (the server's `ActiveTurnRegistry`
* decoupled the turn lifecycle from POST /chat). `lastSeq` lets the
* client resume after a disconnect — the server replays buffered
* frames with seq > lastSeq, then tails new ones.
*/
export async function attachToHarnessTurn(
agentId: string,
options: { turnId?: string; lastSeq?: number; signal?: AbortSignal } = {},
): Promise<Response> {
const baseUrl = await getAgentServerUrl()
const url = new URL(
`${baseUrl}/agents/${encodeURIComponent(agentId)}/chat/stream`,
)
if (options.turnId) url.searchParams.set('turnId', options.turnId)
const headers: Record<string, string> = {}
if (typeof options.lastSeq === 'number') {
headers['Last-Event-ID'] = String(options.lastSeq)
}
return fetch(url.toString(), { signal: options.signal, headers })
}
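
The resume mechanics above reduce to building a URL plus an optional `Last-Event-ID` header. The sketch below extracts that request-shaping into a hypothetical pure helper (`buildAttachRequest` is an illustrative name, not part of the diff) so the behavior is easy to see without a live server:

```typescript
// Illustrative extraction of attachToHarnessTurn's request shaping:
// optional turnId goes in the query string, optional lastSeq becomes
// a Last-Event-ID header the server uses to replay buffered frames.
function buildAttachRequest(
  baseUrl: string,
  agentId: string,
  options: { turnId?: string; lastSeq?: number } = {},
): { url: string; headers: Record<string, string> } {
  const url = new URL(
    `${baseUrl}/agents/${encodeURIComponent(agentId)}/chat/stream`,
  )
  if (options.turnId) url.searchParams.set('turnId', options.turnId)
  const headers: Record<string, string> = {}
  if (typeof options.lastSeq === 'number') {
    headers['Last-Event-ID'] = String(options.lastSeq)
  }
  return { url: url.toString(), headers }
}
```

A reconnecting client would pass the last sequence number it saw; a fresh attach sends no header and receives the whole buffer.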
export interface HarnessActiveTurnInfo {
turnId: string
agentId: string
sessionId: 'main'
status: 'running' | 'done' | 'error' | 'cancelled'
lastSeq: number
startedAt: number
endedAt?: number
}
/**
* Discover an in-flight turn for an agent. Used on chat mount so the
* UI reattaches instead of starting a new turn after a tab/refresh.
*/
export async function fetchActiveHarnessTurn(
agentId: string,
): Promise<HarnessActiveTurnInfo | null> {
const baseUrl = await getAgentServerUrl()
const response = await fetch(
`${baseUrl}/agents/${encodeURIComponent(agentId)}/chat/active`,
)
if (!response.ok) return null
const body = (await response.json()) as {
active: HarnessActiveTurnInfo | null
}
return body.active
}
/**
* Stop button. Hits the explicit cancel endpoint instead of just
* aborting the fetch (which now only detaches *this* subscriber from
* the buffer; the underlying turn would otherwise keep running).
*/
export async function cancelHarnessTurn(
agentId: string,
options: { turnId?: string; reason?: string } = {},
): Promise<{ cancelled: boolean }> {
const baseUrl = await getAgentServerUrl()
const response = await fetch(
`${baseUrl}/agents/${encodeURIComponent(agentId)}/chat/cancel`,
{
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
...(options.turnId ? { turnId: options.turnId } : {}),
...(options.reason ? { reason: options.reason } : {}),
}),
},
)
if (!response.ok) return { cancelled: false }
return (await response.json()) as { cancelled: boolean }
}
export async function fetchHarnessAgentHistory(
agentId: string,
): Promise<HarnessAgentHistoryPage> {


@@ -1,5 +1,4 @@
import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
import { getAgentServerUrl } from '@/lib/browseros/helpers'
import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
export interface AgentEntry {
@@ -319,25 +318,3 @@ export function buildChatHistoryFromTurns(
return messages
}
export async function chatWithAgent(
agentId: string,
message: string,
sessionKey?: string,
history: OpenClawChatHistoryMessage[] = [],
signal?: AbortSignal,
attachments?: ReadonlyArray<unknown>,
): Promise<Response> {
const baseUrl = await getAgentServerUrl()
return fetch(`${baseUrl}/claw/agents/${agentId}/chat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
message,
sessionKey,
history,
...(attachments && attachments.length > 0 ? { attachments } : {}),
}),
signal,
})
}


@@ -164,9 +164,17 @@ export const NewScheduledTaskDialog: FC<NewScheduledTaskDialogProps> = ({
const resolvedProvider: Provider | null = (() => {
const id = selectedProviderId ?? defaultProviderId
const found = providers.find((p) => p.id === id)
if (found) return { id: found.id, name: found.name, type: found.type }
if (found) {
return {
kind: 'llm' as const,
id: found.id,
name: found.name,
type: found.type,
}
}
if (providers[0])
return {
kind: 'llm' as const,
id: providers[0].id,
name: providers[0].name,
type: providers[0].type,
@@ -175,6 +183,7 @@ export const NewScheduledTaskDialog: FC<NewScheduledTaskDialogProps> = ({
})()
const providerOptions: Provider[] = providers.map((p) => ({
kind: 'llm',
id: p.id,
name: p.name,
type: p.type,


@@ -1,4 +1,4 @@
import { Github, History, Plus, SettingsIcon } from 'lucide-react'
import { Bot, Github, History, Plus, SettingsIcon } from 'lucide-react'
import type { FC } from 'react'
import { Link, useLocation, useNavigate } from 'react-router'
import { ChatProviderSelector } from '@/components/chat/ChatProviderSelector'
@@ -64,7 +64,9 @@ export const ChatHeader: FC<ChatHeaderProps> = ({
className="group relative inline-flex cursor-pointer items-center gap-2 rounded-lg p-2 text-muted-foreground transition-colors hover:bg-muted/50 hover:text-foreground data-[state=open]:bg-accent"
title="Change AI Provider"
>
{selectedProvider.type === 'browseros' ? (
{selectedProvider.kind === 'acp' ? (
<Bot className="h-[18px] w-[18px]" />
) : selectedProvider.type === 'browseros' ? (
<BrowserOSIcon size={18} />
) : (
<ProviderIcon


@@ -0,0 +1,258 @@
import { describe, expect, it } from 'bun:test'
import type {
HarnessAdapterDescriptor,
HarnessAgent,
} from '@/entrypoints/app/agents/agent-harness-types'
import type { LlmProviderConfig } from '@/lib/llm-providers/types'
import {
buildSidepanelChatTargets,
persistSidepanelChatTargetSelection,
resolveSidepanelChatTarget,
type SidepanelChatTargetSelection,
toLlmProviderConfig,
} from './sidepanel-chat-targets'
const timestamp = 1000
const providers: LlmProviderConfig[] = [
{
id: 'browseros',
type: 'browseros',
name: 'BrowserOS',
baseUrl: 'https://api.browseros.com/v1',
modelId: 'browseros-auto',
supportsImages: true,
contextWindow: 200000,
temperature: 0.2,
createdAt: timestamp,
updatedAt: timestamp,
},
{
id: 'anthropic-sonnet',
type: 'anthropic',
name: 'Anthropic Sonnet',
modelId: 'claude-sonnet-4-6',
apiKey: 'sk-ant',
supportsImages: true,
contextWindow: 200000,
temperature: 0.2,
createdAt: timestamp,
updatedAt: timestamp,
},
]
const adapters: HarnessAdapterDescriptor[] = [
{
id: 'claude',
name: 'Claude Code',
defaultModelId: 'haiku',
defaultReasoningEffort: 'medium',
modelControl: 'best-effort',
models: [
{ id: 'sonnet', label: 'Sonnet' },
{ id: 'haiku', label: 'Haiku', recommended: true },
],
reasoningEfforts: [
{ id: 'medium', label: 'Medium', recommended: true },
{ id: 'high', label: 'High' },
],
},
{
id: 'codex',
name: 'Codex',
defaultModelId: 'gpt-5.5',
defaultReasoningEffort: 'medium',
modelControl: 'runtime-supported',
models: [{ id: 'gpt-5.5', label: 'GPT-5.5', recommended: true }],
reasoningEfforts: [{ id: 'medium', label: 'Medium', recommended: true }],
},
{
id: 'openclaw',
name: 'OpenClaw',
defaultModelId: 'default',
defaultReasoningEffort: 'medium',
modelControl: 'best-effort',
models: [],
reasoningEfforts: [
{ id: 'medium', label: 'Medium', recommended: true },
{ id: 'high', label: 'High' },
],
},
]
const agents: HarnessAgent[] = [
{
id: 'agent-codex',
name: 'Review Bot',
adapter: 'codex',
modelId: 'gpt-5.5',
reasoningEffort: 'medium',
permissionMode: 'approve-all',
sessionKey: 'agent:agent-codex:main',
createdAt: timestamp,
updatedAt: timestamp,
},
{
id: 'agent-openclaw',
name: 'Research Claw',
adapter: 'openclaw',
modelId: 'default',
reasoningEffort: 'high',
permissionMode: 'approve-all',
sessionKey: 'agent:agent-openclaw:main',
createdAt: timestamp,
updatedAt: timestamp,
},
]
describe('buildSidepanelChatTargets', () => {
it('returns LLM targets plus one ACP target per persisted harness agent', () => {
const targets = buildSidepanelChatTargets({ providers, adapters, agents })
expect(targets.map((target) => target.id)).toEqual([
'browseros',
'anthropic-sonnet',
'agent-codex',
'agent-openclaw',
])
})
it('does not emit catalog-only ACP targets without persisted agents', () => {
const targets = buildSidepanelChatTargets({
providers,
adapters,
agents: [],
})
expect(targets.map((target) => target.id)).toEqual([
'browseros',
'anthropic-sonnet',
])
})
it('uses the created OpenClaw agent name instead of a generic adapter target', () => {
const targets = buildSidepanelChatTargets({ providers, adapters, agents })
const openclaw = targets.find((target) => target.id === 'agent-openclaw')
expect(openclaw).toMatchObject({
kind: 'acp',
id: 'agent-openclaw',
agentId: 'agent-openclaw',
adapter: 'openclaw',
adapterName: 'OpenClaw',
modelId: 'default',
modelLabel: 'default',
name: 'Research Claw',
modelControl: 'best-effort',
reasoningEffort: 'high',
})
})
it('preserves adapter metadata for created agent targets', () => {
const targets = buildSidepanelChatTargets({ providers, adapters, agents })
const codex = targets.find((target) => target.id === 'agent-codex')
expect(codex).toMatchObject({
kind: 'acp',
agentId: 'agent-codex',
adapter: 'codex',
adapterName: 'Codex',
modelId: 'gpt-5.5',
modelLabel: 'GPT-5.5',
modelControl: 'runtime-supported',
recommended: true,
reasoningEffort: 'medium',
reasoningEffortLabel: 'Medium',
})
})
it('still returns LLM targets when agents and adapters are unavailable', () => {
expect(
buildSidepanelChatTargets({ providers, adapters: [], agents: [] }),
).toEqual([
{
kind: 'llm',
id: 'browseros',
name: 'BrowserOS',
type: 'browseros',
provider: providers[0],
},
{
kind: 'llm',
id: 'anthropic-sonnet',
name: 'Anthropic Sonnet',
type: 'anthropic',
provider: providers[1],
},
])
})
})
describe('resolveSidepanelChatTarget', () => {
it('resolves selected LLM targets back to their provider config', () => {
const targets = buildSidepanelChatTargets({ providers, adapters, agents })
const resolved = resolveSidepanelChatTarget({
targets,
defaultProviderId: 'browseros',
selection: { kind: 'llm', id: 'anthropic-sonnet' },
})
expect(resolved?.kind).toBe('llm')
expect(toLlmProviderConfig(resolved)?.modelId).toBe('claude-sonnet-4-6')
})
it('falls back to the current default LLM provider when a persisted ACP target is stale', () => {
const targets = buildSidepanelChatTargets({
providers,
adapters,
agents: [],
})
expect(
resolveSidepanelChatTarget({
targets,
defaultProviderId: 'anthropic-sonnet',
selection: { kind: 'acp', id: 'agent-codex' },
}),
).toMatchObject({
kind: 'llm',
id: 'anthropic-sonnet',
})
})
it('falls back when an old catalog-style ACP target id is persisted', () => {
const targets = buildSidepanelChatTargets({ providers, adapters, agents })
expect(
resolveSidepanelChatTarget({
targets,
defaultProviderId: 'anthropic-sonnet',
selection: { kind: 'acp', id: 'acp:codex:gpt-5.5:medium' },
}),
).toMatchObject({
kind: 'llm',
id: 'anthropic-sonnet',
})
})
})
describe('persistSidepanelChatTargetSelection', () => {
it('stores only target identity and does not mutate LLM provider arrays', async () => {
let savedSelection: SidepanelChatTargetSelection | null = null
const originalProviders = providers.map((provider) => ({ ...provider }))
const targets = buildSidepanelChatTargets({ providers, adapters, agents })
const target = targets.find((candidate) => candidate.id === 'agent-codex')
await persistSidepanelChatTargetSelection(target, {
setValue: async (value) => {
savedSelection = value
},
})
expect(savedSelection as SidepanelChatTargetSelection | null).toEqual({
kind: 'acp',
id: 'agent-codex',
})
expect(providers).toEqual(originalProviders)
})
})


@@ -0,0 +1,178 @@
import type {
HarnessAdapterDescriptor,
HarnessAgent,
HarnessAgentAdapter,
} from '@/entrypoints/app/agents/agent-harness-types'
import type { LlmProviderConfig, ProviderType } from '@/lib/llm-providers/types'
export type SidepanelTargetKind = 'llm' | 'acp'
export type SidepanelChatTarget =
| {
kind: 'llm'
id: string
name: string
type: ProviderType
provider: LlmProviderConfig
}
| {
kind: 'acp'
id: string
name: string
type: 'acp'
agentId: string
adapter: HarnessAgentAdapter
adapterName: string
modelId: string
modelLabel: string
modelControl: HarnessAdapterDescriptor['modelControl']
recommended?: boolean
reasoningEffort: string
reasoningEffortLabel?: string
}
export type SidepanelChatTargetSelection = Pick<
SidepanelChatTarget,
'kind' | 'id'
>
interface BuildSidepanelChatTargetsInput {
providers: LlmProviderConfig[]
adapters: HarnessAdapterDescriptor[]
agents?: HarnessAgent[]
}
interface ResolveSidepanelChatTargetInput {
targets: SidepanelChatTarget[]
defaultProviderId: string
selection?: SidepanelChatTargetSelection | null
}
interface SidepanelChatTargetSelectionWriter {
setValue(value: SidepanelChatTargetSelection | null): Promise<void>
}
interface SidepanelChatTargetSelectionReader {
getValue(): Promise<SidepanelChatTargetSelection | null>
}
type SidepanelChatTargetSelectionStore = SidepanelChatTargetSelectionReader &
SidepanelChatTargetSelectionWriter
let sidepanelChatTargetSelectionStorage:
| SidepanelChatTargetSelectionStore
| undefined
export function buildSidepanelChatTargets({
providers,
adapters,
agents = [],
}: BuildSidepanelChatTargetsInput): SidepanelChatTarget[] {
return [
...providers.map(toLlmTarget),
...agents.map((agent) => toAcpTargetForAgent(agent, adapters)),
]
}
function toAcpTargetForAgent(
agent: HarnessAgent,
adapters: HarnessAdapterDescriptor[],
): SidepanelChatTarget {
const adapter = adapters.find((entry) => entry.id === agent.adapter)
const modelId = agent.modelId ?? adapter?.defaultModelId ?? 'default'
const reasoningEffort =
agent.reasoningEffort ?? adapter?.defaultReasoningEffort ?? 'medium'
const model = adapter?.models.find((entry) => entry.id === modelId)
const reasoning = adapter?.reasoningEfforts.find(
(effort) => effort.id === reasoningEffort,
)
return {
kind: 'acp',
id: agent.id,
name: agent.name,
type: 'acp',
agentId: agent.id,
adapter: agent.adapter,
adapterName: adapter?.name ?? formatAdapterName(agent.adapter),
modelId,
modelLabel: model?.label ?? modelId,
modelControl: adapter?.modelControl ?? 'best-effort',
recommended: model?.recommended,
reasoningEffort,
reasoningEffortLabel: reasoning?.label,
}
}
function formatAdapterName(adapter: HarnessAgentAdapter): string {
if (adapter === 'claude') return 'Claude Code'
if (adapter === 'codex') return 'Codex'
if (adapter === 'openclaw') return 'OpenClaw'
return adapter
}
export function resolveSidepanelChatTarget({
targets,
defaultProviderId,
selection,
}: ResolveSidepanelChatTargetInput): SidepanelChatTarget | undefined {
if (selection) {
const selected = targets.find(
(target) => target.kind === selection.kind && target.id === selection.id,
)
if (selected) return selected
}
return (
targets.find(
(target) => target.kind === 'llm' && target.id === defaultProviderId,
) ?? targets.find((target) => target.kind === 'llm')
)
}
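
The fallback chain here is: persisted selection if it still resolves, else the default LLM provider, else the first LLM target. A minimal sketch with the targets reduced to their `kind`/`id` identity (local types for illustration only):

```typescript
// Illustrative restatement of resolveSidepanelChatTarget's fallback chain,
// over identity-only targets.
type TargetId = { kind: 'llm' | 'acp'; id: string }

function resolveTarget(
  targets: TargetId[],
  defaultProviderId: string,
  selection?: TargetId | null,
): TargetId | undefined {
  if (selection) {
    const hit = targets.find(
      (t) => t.kind === selection.kind && t.id === selection.id,
    )
    if (hit) return hit // persisted selection still exists
  }
  return (
    targets.find((t) => t.kind === 'llm' && t.id === defaultProviderId) ??
    targets.find((t) => t.kind === 'llm') // last resort: any LLM target
  )
}
```

Note the fallback never lands on an ACP target; a stale persisted agent selection degrades to an LLM provider, matching the test expectations in the spec file above.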
export function toLlmProviderConfig(
target: SidepanelChatTarget | undefined,
): LlmProviderConfig | undefined {
return target?.kind === 'llm' ? target.provider : undefined
}
export async function persistSidepanelChatTargetSelection(
target: SidepanelChatTarget | undefined,
store?: SidepanelChatTargetSelectionWriter,
): Promise<void> {
const targetStore = store ?? (await getSidepanelChatTargetSelectionStorage())
await targetStore.setValue(
target ? { kind: target.kind, id: target.id } : null,
)
}
export async function loadSidepanelChatTargetSelection(
store?: SidepanelChatTargetSelectionReader,
): Promise<SidepanelChatTargetSelection | null> {
const targetStore = store ?? (await getSidepanelChatTargetSelectionStorage())
return targetStore.getValue()
}
function toLlmTarget(provider: LlmProviderConfig): SidepanelChatTarget {
return {
kind: 'llm',
id: provider.id,
name: provider.name,
type: provider.type,
provider,
}
}
async function getSidepanelChatTargetSelectionStorage(): Promise<SidepanelChatTargetSelectionStore> {
if (sidepanelChatTargetSelectionStorage) {
return sidepanelChatTargetSelectionStorage
}
const { storage } = await import('@wxt-dev/storage')
sidepanelChatTargetSelectionStorage =
storage.defineItem<SidepanelChatTargetSelection | null>(
'local:sidepanel-chat-target-selection',
{ fallback: null },
)
return sidepanelChatTargetSelectionStorage
}


@@ -1,9 +1,21 @@
import { useEffect, useRef } from 'react'
import { useCallback, useEffect, useMemo, useRef, useState } from 'react'
import useDeepCompareEffect from 'use-deep-compare-effect'
import {
useAgentAdapters,
useHarnessAgents,
} from '@/entrypoints/app/agents/useAgents'
import type { LlmProviderConfig } from '@/lib/llm-providers/types'
import { useLlmProviders } from '@/lib/llm-providers/useLlmProviders'
import { type McpServer, useMcpServers } from '@/lib/mcp/mcpServerStorage'
import { usePersonalization } from '@/lib/personalization/personalizationStorage'
import {
buildSidepanelChatTargets,
loadSidepanelChatTargetSelection,
persistSidepanelChatTargetSelection,
resolveSidepanelChatTarget,
type SidepanelChatTarget,
type SidepanelChatTargetSelection,
} from './sidepanel-chat-targets'
const constructMcpServers = (servers: McpServer[]) => {
return servers
@@ -23,14 +35,53 @@ const constructCustomServers = (servers: McpServer[]) => {
export const useChatRefs = () => {
const { servers: mcpServers } = useMcpServers()
const {
providers: llmProviders,
selectedProvider: selectedLlmProvider,
setDefaultProvider,
isLoading: isLoadingProviders,
} = useLlmProviders()
const { adapters, loading: isLoadingAdapters } = useAgentAdapters()
const { harnessAgents, loading: isLoadingAgents } = useHarnessAgents()
const { personalization } = usePersonalization()
const [targetSelection, setTargetSelection] =
useState<SidepanelChatTargetSelection | null>(null)
useEffect(() => {
let cancelled = false
loadSidepanelChatTargetSelection().then((selection) => {
if (!cancelled) setTargetSelection(selection)
})
return () => {
cancelled = true
}
}, [])
const chatTargets = useMemo(
() =>
buildSidepanelChatTargets({
providers: llmProviders,
adapters,
agents: harnessAgents,
}),
[llmProviders, adapters, harnessAgents],
)
const selectedChatTarget = useMemo(
() =>
resolveSidepanelChatTarget({
targets: chatTargets,
defaultProviderId: selectedLlmProvider?.id ?? llmProviders[0]?.id ?? '',
selection: targetSelection,
}),
[chatTargets, llmProviders, selectedLlmProvider, targetSelection],
)
const selectedLlmProviderRef = useRef<LlmProviderConfig | null>(
selectedLlmProvider,
)
const selectedChatTargetRef = useRef<SidepanelChatTarget | undefined>(
selectedChatTarget,
)
const enabledMcpServersRef = useRef(constructMcpServers(mcpServers))
const enabledCustomServersRef = useRef(constructCustomServers(mcpServers))
const personalizationRef = useRef(personalization)
@@ -41,16 +92,36 @@ export const useChatRefs = () => {
enabledCustomServersRef.current = constructCustomServers(mcpServers)
}, [selectedLlmProvider, mcpServers])
useEffect(() => {
selectedChatTargetRef.current = selectedChatTarget
}, [selectedChatTarget])
useEffect(() => {
personalizationRef.current = personalization
}, [personalization])
const selectChatTarget = useCallback(
async (target: SidepanelChatTarget | undefined) => {
selectedChatTargetRef.current = target
setTargetSelection(target ? { kind: target.kind, id: target.id } : null)
await persistSidepanelChatTargetSelection(target)
},
[],
)
return {
selectedLlmProviderRef,
selectedChatTargetRef,
enabledMcpServersRef,
enabledCustomServersRef,
personalizationRef,
llmProviders,
setDefaultProvider,
chatTargets,
selectedChatTarget,
selectChatTarget,
selectedLlmProvider,
isLoadingProviders,
isLoadingProviders:
isLoadingProviders || isLoadingAdapters || isLoadingAgents,
}
}

View File

@@ -0,0 +1,153 @@
import { describe, expect, it } from 'bun:test'
import type { LlmProviderConfig } from '@/lib/llm-providers/types'
import type { ChatMode } from './chatTypes'
import type { SidepanelChatTarget } from './sidepanel-chat-targets'
import { buildSidepanelPreparedSendMessagesRequest } from './useChatSessionRequest'
const conversationId = '00000000-0000-4000-8000-000000000001'
describe('buildSidepanelPreparedSendMessagesRequest', () => {
it('keeps LLM targets on the existing /chat request body', () => {
const request = buildSidepanelPreparedSendMessagesRequest({
agentServerUrl: 'http://127.0.0.1:5151',
target: llmTarget,
fallbackProvider,
message: 'Summarize this page',
...commonRequestInput(),
})
expect(request.api).toBe('http://127.0.0.1:5151/chat')
expect(request.body).toMatchObject({
message: 'Summarize this page',
conversationId,
provider: 'browseros',
providerType: 'browseros',
providerName: 'BrowserOS',
model: 'gpt-5',
mode: 'agent',
browserContext: {
activeTab: { id: 10, url: 'https://example.com', title: 'Example' },
enabledMcpServers: ['slack'],
},
userSystemPrompt: 'Be concise',
userWorkingDir: '/tmp/work',
previousConversation: [{ role: 'assistant', content: 'Prior answer' }],
selectedText: 'selected text',
selectedTextSource: {
url: 'https://example.com',
title: 'Example',
},
})
})
it('sends created-agent targets to the agent-id sidepanel route', () => {
const request = buildSidepanelPreparedSendMessagesRequest({
agentServerUrl: 'http://127.0.0.1:5151',
target: acpTarget,
fallbackProvider,
message: 'Inspect the current tab',
approvalResponses: [
{ approvalId: 'approval-1', approved: true, reason: 'ok' },
],
...commonRequestInput(),
})
expect(request.api).toBe(
'http://127.0.0.1:5151/agents/agent-codex/sidepanel/chat',
)
expect(request.body).toEqual({
conversationId,
message: 'Inspect the current tab',
browserContext: {
activeTab: { id: 10, url: 'https://example.com', title: 'Example' },
enabledMcpServers: ['slack'],
},
userSystemPrompt: 'Be concise',
userWorkingDir: '/tmp/work',
selectedText: 'selected text',
selectedTextSource: {
url: 'https://example.com',
title: 'Example',
},
})
})
it('keeps tool approval retry payloads scoped to LLM chat', () => {
const request = buildSidepanelPreparedSendMessagesRequest({
agentServerUrl: 'http://127.0.0.1:5151',
target: llmTarget,
fallbackProvider,
approvalResponses: [
{ approvalId: 'approval-1', approved: false, reason: 'no' },
],
...commonRequestInput(),
})
expect(request.api).toBe('http://127.0.0.1:5151/chat')
expect(request.body).toMatchObject({
message: '',
toolApprovalResponses: [
{ approvalId: 'approval-1', approved: false, reason: 'no' },
],
})
})
})
function commonRequestInput() {
return {
conversationId,
mode: 'agent' as ChatMode,
browserContext: {
activeTab: { id: 10, url: 'https://example.com', title: 'Example' },
enabledMcpServers: ['slack'],
},
userSystemPrompt: 'Be concise',
userWorkingDir: '/tmp/work',
previousConversation: [
{ role: 'assistant' as const, content: 'Prior answer' },
],
declinedApps: ['gmail'],
aclRules: [{ id: 'rule-1', sitePattern: '*://*/*', enabled: true }],
selectedText: 'selected text',
selectedTextSource: {
url: 'https://example.com',
title: 'Example',
},
toolApprovalConfig: { categories: { navigation: true } },
}
}
const fallbackProvider: LlmProviderConfig = {
id: 'browseros',
type: 'browseros',
name: 'BrowserOS',
modelId: 'gpt-5',
supportsImages: true,
contextWindow: 128000,
temperature: 0.7,
createdAt: 1000,
updatedAt: 1000,
}
const llmTarget: SidepanelChatTarget = {
kind: 'llm',
id: fallbackProvider.id,
name: fallbackProvider.name,
type: fallbackProvider.type,
provider: fallbackProvider,
}
const acpTarget: SidepanelChatTarget = {
kind: 'acp',
id: 'agent-codex',
name: 'Review bot',
type: 'acp',
agentId: 'agent-codex',
adapter: 'codex',
adapterName: 'Codex',
modelId: 'gpt-5.5',
modelLabel: 'GPT-5.5',
modelControl: 'best-effort',
reasoningEffort: 'medium',
reasoningEffortLabel: 'Medium',
}

View File

@@ -26,15 +26,14 @@ import { useInvalidateCredits } from '@/lib/credits/useCredits'
import { declinedAppsStorage } from '@/lib/declined-apps/storage'
import { useGraphqlQuery } from '@/lib/graphql/useGraphqlQuery'
import { createDefaultBrowserOSProvider } from '@/lib/llm-providers/storage'
import { useLlmProviders } from '@/lib/llm-providers/useLlmProviders'
import {
type ApprovalResponseData,
buildChatRequestBody,
type ChatRequestBrowserContext,
import type {
ApprovalResponseData,
ChatRequestBrowserContext,
} from '@/lib/messaging/server/buildChatRequestBody'
import { track } from '@/lib/metrics/track'
import { searchActionsStorage } from '@/lib/search-actions/searchActionsStorage'
import { selectedTextStorage } from '@/lib/selected-text/selectedTextStorage'
import { sentry } from '@/lib/sentry/sentry'
import { stopAgentStorage } from '@/lib/stop-agent/stop-agent-storage'
import {
type ApprovalResponse,
@@ -52,7 +51,12 @@ import {
import { selectedWorkspaceStorage } from '@/lib/workspace/workspace-storage'
import type { ChatMode } from './chatTypes'
import { GetConversationWithMessagesDocument } from './graphql/chatSessionDocument'
import { toLlmProviderConfig } from './sidepanel-chat-targets'
import { useChatRefs } from './useChatRefs'
import {
buildSidepanelPreparedSendMessagesRequest,
toProviderOption,
} from './useChatSessionRequest'
import { useExecutionHistoryTracker } from './useExecutionHistoryTracker'
import { useNotifyActiveTab } from './useNotifyActiveTab'
import { useRemoteConversationSave } from './useRemoteConversationSave'
@@ -186,16 +190,19 @@ const buildRequestBrowserContext = ({
export const useChatSession = (options?: ChatSessionOptions) => {
const {
selectedLlmProviderRef,
selectedChatTargetRef,
enabledMcpServersRef,
enabledCustomServersRef,
personalizationRef,
setDefaultProvider,
chatTargets,
selectedChatTarget,
selectChatTarget,
selectedLlmProvider,
isLoadingProviders,
} = useChatRefs()
const invalidateCredits = useInvalidateCredits()
const { providers: llmProviders, setDefaultProvider } = useLlmProviders()
const {
baseUrl: agentServerUrl,
isLoading: isLoadingAgentUrl,
@@ -218,11 +225,7 @@ export const useChatSession = (options?: ChatSessionOptions) => {
agentUrlRef.current = agentServerUrl
}, [agentServerUrl])
const providers: Provider[] = llmProviders.map((p) => ({
id: p.id,
name: p.name,
type: p.type,
}))
const providers: Provider[] = chatTargets.map(toProviderOption)
const [mode, setMode] = useState<ChatMode>('agent')
const [textToAction, setTextToAction] = useState<Map<string, ChatAction>>(
@@ -324,15 +327,8 @@ export const useChatSession = (options?: ChatSessionOptions) => {
textToActionRef.current = textToAction
}, [mode, textToAction])
const selectedProvider = selectedLlmProvider
? {
id: selectedLlmProvider.id,
name: selectedLlmProvider.name,
type:
selectedLlmProvider.id === 'browseros'
? ('browseros' as const)
: selectedLlmProvider.type,
}
const selectedProvider = selectedChatTarget
? toProviderOption(selectedChatTarget)
: providers[0]
const {
@@ -346,7 +342,8 @@ export const useChatSession = (options?: ChatSessionOptions) => {
} = useChat({
transport: new DefaultChatTransport({
prepareSendMessagesRequest: async ({ messages }) => {
const provider =
const target = selectedChatTargetRef.current
const fallbackProvider =
selectedLlmProviderRef.current ?? createDefaultBrowserOSProvider()
const activeTabsList = await chrome.tabs.query({
active: true,
@@ -395,51 +392,46 @@ export const useChatSession = (options?: ChatSessionOptions) => {
personalizationRef.current,
)
const approvalResponses = extractApprovalResponses(messages)
const commonRequest = {
conversationId: conversationIdRef.current,
mode: currentMode,
browserContext: requestBrowserContext,
userSystemPrompt,
userWorkingDir: workingDirRef.current,
previousConversation,
declinedApps,
aclRules: enabledAclRules,
toolApprovalConfig: approvalConfig,
}
const approvalResponses =
target?.kind === 'acp' ? null : extractApprovalResponses(messages)
if (approvalResponses) {
return {
api: `${agentUrlRef.current}/chat`,
body: buildChatRequestBody({
conversationId: conversationIdRef.current,
provider,
mode: currentMode,
browserContext: requestBrowserContext,
userSystemPrompt,
userWorkingDir: workingDirRef.current,
previousConversation,
declinedApps,
aclRules: enabledAclRules,
toolApprovalConfig: approvalConfig,
toolApprovalResponses: approvalResponses,
}),
}
return buildSidepanelPreparedSendMessagesRequest({
agentServerUrl: agentUrlRef.current ?? undefined,
target,
fallbackProvider,
...commonRequest,
approvalResponses,
})
}
const message = getLastMessageText(messages)
const result = {
api: `${agentUrlRef.current}/chat`,
body: buildChatRequestBody({
message,
conversationId: conversationIdRef.current,
provider,
mode: currentMode,
browserContext: requestBrowserContext,
userSystemPrompt,
userWorkingDir: workingDirRef.current,
previousConversation,
declinedApps,
aclRules: enabledAclRules,
selectedText: activeTabSelection?.text,
selectedTextSource: activeTabSelection
? {
url: activeTabSelection.url,
title: activeTabSelection.title,
}
: undefined,
toolApprovalConfig: approvalConfig,
}),
}
const result = buildSidepanelPreparedSendMessagesRequest({
agentServerUrl: agentUrlRef.current ?? undefined,
target,
fallbackProvider,
message,
...commonRequest,
selectedText: activeTabSelection?.text,
selectedTextSource: activeTabSelection
? {
url: activeTabSelection.url,
title: activeTabSelection.title,
}
: undefined,
})
// Track which tab's selection was sent so we can clear it on success
pendingSelectionTabKeyRef.current =
@@ -451,7 +443,7 @@ export const useChatSession = (options?: ChatSessionOptions) => {
sendAutomaticallyWhen: () => {
if (approvalJustRespondedRef.current) {
approvalJustRespondedRef.current = false
return true
return selectedChatTargetRef.current?.kind !== 'acp'
}
return false
},
@@ -686,10 +678,22 @@ export const useChatSession = (options?: ChatSessionOptions) => {
}, [dispatchMessage, isIntegrationsSynced])
const sendMessage = (params: { text: string; action?: ChatAction }) => {
const target = selectedChatTargetRef.current
const llmTargetProvider = toLlmProviderConfig(target)
const agentTarget = target?.kind === 'acp' ? target : undefined
track(MESSAGE_SENT_EVENT, {
mode,
provider_type: selectedLlmProvider?.type,
model: selectedLlmProvider?.modelId,
provider_id:
agentTarget?.agentId ??
llmTargetProvider?.id ??
selectedLlmProvider?.id,
provider_type: agentTarget ? 'acp' : llmTargetProvider?.type,
agent_id: agentTarget?.agentId,
adapter: agentTarget?.adapter,
model:
agentTarget?.modelId ??
llmTargetProvider?.modelId ??
selectedLlmProvider?.modelId,
})
if (!isIntegrationsSyncedRef.current) {
@@ -741,14 +745,54 @@ export const useChatSession = (options?: ChatSessionOptions) => {
addToolApprovalResponse(params)
}
const resetConversationState = () => {
stop()
void finishExecutionTask({ isAbort: true })
setConversationId(crypto.randomUUID())
setMessages([])
setTextToAction(new Map())
setLiked({})
setDisliked({})
setRestoredConversationId(null)
resetRemoteConversation()
}
const handleSelectProvider = (provider: Provider) => {
const fullProvider = llmProviders.find((p) => p.id === provider.id)
const target = chatTargets.find(
(candidate) =>
candidate.id === provider.id && candidate.kind === provider.kind,
)
if (!target) return
const previousTarget = selectedChatTargetRef.current
track(PROVIDER_SELECTED_EVENT, {
provider_id: provider.id,
provider_type: provider.type,
model_id: fullProvider?.modelId,
provider_id: target.id,
provider_type: target.kind === 'acp' ? 'acp' : target.type,
model_id:
target.kind === 'acp' ? target.modelId : target.provider.modelId,
agent_id: target.kind === 'acp' ? target.agentId : undefined,
adapter: target.kind === 'acp' ? target.adapter : undefined,
})
setDefaultProvider(provider.id)
void selectChatTarget(target).catch((error) => {
sentry.captureException(error, {
extra: {
message: 'Failed to persist sidepanel chat target selection',
targetId: target.id,
targetKind: target.kind,
},
})
})
if (target.kind === 'llm') setDefaultProvider(target.provider.id)
if (
previousTarget &&
(previousTarget.kind !== target.kind ||
previousTarget.id !== target.id) &&
messagesRef.current.length > 0
) {
resetConversationState()
}
}
const getActionForMessage = (message: UIMessage) => {
@@ -762,15 +806,7 @@ export const useChatSession = (options?: ChatSessionOptions) => {
const resetConversation = () => {
track(CONVERSATION_RESET_EVENT, { message_count: messages.length })
stop()
void finishExecutionTask({ isAbort: true })
setConversationId(crypto.randomUUID())
setMessages([])
setTextToAction(new Map())
setLiked({})
setDisliked({})
setRestoredConversationId(null)
resetRemoteConversation()
resetConversationState()
}
const isRestoringConversation =

View File

@@ -0,0 +1,74 @@
import type { Provider } from '../../../components/chat/chatComponentTypes'
import type { LlmProviderConfig } from '../../../lib/llm-providers/types'
import {
type ApprovalResponseData,
buildChatRequestBody,
} from '../../../lib/messaging/server/buildChatRequestBody'
import {
type SidepanelChatTarget,
toLlmProviderConfig,
} from './sidepanel-chat-targets'
type LlmChatRequestBodyInput = Parameters<typeof buildChatRequestBody>[0]
type CommonSidepanelRequestInput = Omit<
LlmChatRequestBodyInput,
'provider' | 'message' | 'toolApprovalResponses' | 'isScheduledTask'
>
interface BuildSidepanelPreparedSendMessagesRequestInput
extends CommonSidepanelRequestInput {
agentServerUrl: string | undefined
target: SidepanelChatTarget | undefined
fallbackProvider: LlmProviderConfig
message?: string
approvalResponses?: ApprovalResponseData[] | null
}
export function buildSidepanelPreparedSendMessagesRequest({
agentServerUrl,
target,
fallbackProvider,
message,
approvalResponses,
...common
}: BuildSidepanelPreparedSendMessagesRequestInput) {
if (target?.kind === 'acp') {
return {
api: `${agentServerUrl}/agents/${encodeURIComponent(target.agentId)}/sidepanel/chat`,
body: {
conversationId: common.conversationId,
message: message ?? '',
browserContext: common.browserContext,
userSystemPrompt: common.userSystemPrompt,
userWorkingDir: common.userWorkingDir,
selectedText: common.selectedText,
selectedTextSource: common.selectedTextSource,
},
}
}
const provider = toLlmProviderConfig(target) ?? fallbackProvider
return {
api: `${agentServerUrl}/chat`,
body: buildChatRequestBody({
...common,
provider,
message,
toolApprovalResponses: approvalResponses ?? undefined,
}),
}
}
export function toProviderOption(target: SidepanelChatTarget): Provider {
return {
id: target.id,
name: target.name,
type: target.type,
kind: target.kind,
agentId: target.kind === 'acp' ? target.agentId : undefined,
adapterName: target.kind === 'acp' ? target.adapterName : undefined,
modelLabel: target.kind === 'acp' ? target.modelLabel : undefined,
modelControl: target.kind === 'acp' ? target.modelControl : undefined,
}
}

View File

@@ -2,29 +2,75 @@ function isAbortError(error: unknown): boolean {
return error instanceof DOMException && error.name === 'AbortError'
}
export interface ParsedSSEEvent<T> {
data: T
/** Numeric `id:` line on the same SSE event, if any. */
seq?: number
}
export function parseSSELines<T>(buffer: string): {
events: T[]
events: ParsedSSEEvent<T>[]
remainder: string
} {
// SSE events are separated by blank lines. Buffer lines until we hit
// a blank, then assemble each event. Lines we recognise: `id: <n>`
// and `data: <payload>`. Everything else is ignored.
const events: ParsedSSEEvent<T>[] = []
const lines = buffer.split('\n')
const remainder = lines.pop() ?? ''
const events: T[] = []
for (const line of lines) {
if (!line.startsWith('data: ')) continue
const payload = line.slice(6)
if (payload === '[DONE]') continue
try {
events.push(JSON.parse(payload) as T)
} catch {}
// Find the last blank-line boundary; everything after it is the
// remainder (next event partially received).
let lastBoundary = -1
for (let i = lines.length - 1; i >= 0; i--) {
if (lines[i] === '') {
lastBoundary = i
break
}
}
const completeLines = lastBoundary >= 0 ? lines.slice(0, lastBoundary) : []
const remainder =
lastBoundary >= 0 ? lines.slice(lastBoundary + 1).join('\n') : buffer
let currentSeq: number | undefined
let currentData: string | null = null
const flush = () => {
if (currentData != null && currentData !== '[DONE]') {
try {
events.push({
data: JSON.parse(currentData) as T,
seq: currentSeq,
})
} catch {
// ignore
}
}
currentSeq = undefined
currentData = null
}
for (const line of completeLines) {
if (line === '') {
flush()
continue
}
if (line.startsWith('id: ')) {
const n = Number.parseInt(line.slice(4).trim(), 10)
if (Number.isFinite(n)) currentSeq = n
continue
}
if (line.startsWith('data: ')) {
currentData = line.slice(6)
}
}
// Catch a complete trailing event with no terminating blank line —
// shouldn't happen in well-formed SSE, but be tolerant.
flush()
return { events, remainder }
}
export async function consumeSSEStream<T>(
response: Response,
onEvent: (event: T) => void,
onEvent: (event: T, meta: { seq?: number }) => void,
signal?: AbortSignal,
): Promise<void> {
const reader = response.body?.getReader()
@@ -49,7 +95,7 @@ export async function consumeSSEStream<T>(
buffer = remainder
for (const event of events) {
onEvent(event)
onEvent(event.data, { seq: event.seq })
}
}
} catch (error) {
@@ -64,7 +110,7 @@ export async function consumeSSEStream<T>(
if (buffer) {
const { events } = parseSSELines<T>(buffer)
for (const event of events) {
onEvent(event)
onEvent(event.data, { seq: event.seq })
}
}
}
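As a usage sketch of the parsing contract above (the remainder holds the unfinished trailing event, numeric `id:` lines become `seq`, `[DONE]` is skipped), here is a standalone miniature. It mirrors the patched parser's behavior for illustration rather than importing the module:

```typescript
// Minimal sketch of the buffered-SSE contract from the patch above: a
// standalone re-implementation for illustration, not the module's export.
interface SketchEvent<T> {
  data: T
  seq?: number // numeric `id:` line on the same event, if present
}

function parseChunk<T>(buffer: string): {
  events: SketchEvent<T>[]
  remainder: string
} {
  // Everything after the last blank-line boundary is an unfinished event;
  // hand it back so the caller can prepend the next network chunk.
  const boundary = buffer.lastIndexOf('\n\n')
  if (boundary < 0) return { events: [], remainder: buffer }
  const events: SketchEvent<T>[] = []
  let seq: number | undefined
  for (const line of buffer.slice(0, boundary + 1).split('\n')) {
    if (line.startsWith('id: ')) {
      const n = Number.parseInt(line.slice(4), 10)
      if (Number.isFinite(n)) seq = n
    } else if (line.startsWith('data: ') && line.slice(6) !== '[DONE]') {
      try {
        events.push({ data: JSON.parse(line.slice(6)) as T, seq })
      } catch {
        // tolerate malformed payloads, as the real parser does
      }
      seq = undefined
    }
  }
  return { events, remainder: buffer.slice(boundary + 2) }
}
```

Feeding `'id: 7\ndata: {"a":1}\n\nid: 8\ndata: {"a"'` yields one complete event with `seq: 7`, while the partial second event stays in the remainder for the next chunk.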

View File

@@ -0,0 +1,51 @@
# Copy to .env.development for local eval runs.
# Provider keys used by existing config files.
OPENROUTER_API_KEY=
FIREWORKS_API_KEY=
ANTHROPIC_API_KEY=
OPENAI_API_KEY=
GOOGLE_GENERATIVE_AI_API_KEY=
# Claude Agent SDK token used by performance_grader.
CLAUDE_CODE_OAUTH_TOKEN=
# Suite-mode model selection.
EVAL_VARIANT=local
EVAL_AGENT_PROVIDER=openai-compatible
EVAL_AGENT_MODEL=
EVAL_AGENT_API_KEY=
EVAL_AGENT_BASE_URL=
EVAL_AGENT_SUPPORTS_IMAGES=true
# Optional suite-mode executor override for orchestrator suites.
EVAL_EXECUTOR_MODEL=
EVAL_EXECUTOR_API_KEY=
EVAL_EXECUTOR_BASE_URL=
# Clado visual action executor.
CLADO_ACTION_MODEL=
CLADO_ACTION_API_KEY=
CLADO_ACTION_BASE_URL=
# Backward-compatible alias used by older local scripts.
CLADO_ACTION_URL=
# BrowserOS runner.
BROWSEROS_BINARY=/Applications/BrowserOS.app/Contents/MacOS/BrowserOS
BROWSEROS_SERVER_URL=http://127.0.0.1:9110
BROWSEROS_SERVER_LOG_DIR=/tmp/browseros-server-logs
BROWSEROS_CONFIG_URL=
# Captcha solver extension.
NOPECHA_API_KEY=
# WebArena-Infinity.
WEBARENA_INFINITY_DIR=
INFINITY_APP_URL=
# R2 publishing and weekly report.
EVAL_R2_ACCOUNT_ID=
EVAL_R2_ACCESS_KEY_ID=
EVAL_R2_SECRET_ACCESS_KEY=
EVAL_R2_BUCKET=browseros-eval
EVAL_R2_CDN_BASE_URL=https://eval.browseros.com

View File

@@ -14,6 +14,7 @@ Evaluation framework for BrowserOS browser automation agents. Runs tasks from st
```bash
cd apps/eval
cp .env.example .env.development
# Edit .env.development with your keys, then:
bun run eval
```
@@ -23,11 +24,55 @@ Opens the eval dashboard at `http://localhost:9900` in config mode. From there:
### CLI mode
```bash
bun run eval -c configs/browseros-agent-weekly.json
bun run eval -c configs/legacy/browseros-agent-weekly.json
bun run eval suite --config configs/legacy/browseros-agent-weekly.json --publish r2
```
Runs immediately. Dashboard still available at `http://localhost:9900` for live progress.
The `suite` command is the workflow-compatible full loop: execute tasks, run graders, write artifacts, and optionally publish to R2. The old `-c` form remains supported during migration.
```bash
bun run eval run --config configs/legacy/browseros-agent-weekly.json
bun run eval suite --suite configs/suites/agisdk-daily-10.json --variant kimi-fireworks --publish r2
bun run eval grade --run results/browseros-agent-weekly/2026-04-29-1430
bun run eval publish --run results/browseros-agent-weekly/2026-04-29-1430 --target r2
```
Config files live in two groups:
```txt
configs/legacy/ # Complete EvalConfig files used by older workflows and the dashboard
configs/suites/ # Suite definitions; model/provider comes from CLI flags or env
```
Suite mode takes model settings from CLI flags first, then env:
```bash
EVAL_VARIANT=kimi-fireworks \
EVAL_AGENT_PROVIDER=openai-compatible \
EVAL_AGENT_MODEL=accounts/fireworks/models/kimi-k2p5 \
EVAL_AGENT_API_KEY=$FIREWORKS_API_KEY \
EVAL_AGENT_BASE_URL=https://api.fireworks.ai/inference/v1 \
bun run eval suite --suite configs/suites/agisdk-daily-10.json --publish r2
```
### Suites and variants
A **suite** is what we run: the task dataset, graders, worker count, timeout, and browser settings. For example, `agisdk-daily-10` means "run these 10 AGI SDK tasks and grade them with `agisdk_state_diff`."
A **variant** is the model setup we are testing on that suite. `EVAL_VARIANT` is just the human-readable name for that setup. The actual model connection still comes from `EVAL_AGENT_PROVIDER`, `EVAL_AGENT_MODEL`, `EVAL_AGENT_API_KEY`, and `EVAL_AGENT_BASE_URL`.
This lets us run the same suite against multiple model setups without copying the benchmark config:
```txt
agisdk-daily-10 + kimi-fireworks
agisdk-daily-10 + claude-sonnet
agisdk-daily-10 + clado-action-000159
```
For `orchestrator-executor` suites, there can also be a separate executor model or backend. The `EVAL_AGENT_*` vars describe the main agent or orchestrator; the optional `EVAL_EXECUTOR_*` or `CLADO_ACTION_*` vars describe the delegated executor.
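The flag-over-env precedence described above can be sketched as follows. The env var names are taken from this README, but the resolver shape itself is illustrative, not the CLI's actual implementation:

```typescript
// Illustrative sketch of flag-over-env precedence for suite-mode variants.
// Env var names match this README; the resolver shape is hypothetical.
interface VariantConfig {
  variant: string
  provider: string
  model: string
  apiKey: string
  baseUrl?: string
}

function resolveVariant(
  flags: Partial<VariantConfig>,
  env: Record<string, string | undefined>,
): VariantConfig {
  // CLI flags win; env vars fill the gaps; sensible fallbacks last.
  return {
    variant: flags.variant ?? env.EVAL_VARIANT ?? 'local',
    provider: flags.provider ?? env.EVAL_AGENT_PROVIDER ?? 'openai-compatible',
    model: flags.model ?? env.EVAL_AGENT_MODEL ?? '',
    apiKey: flags.apiKey ?? env.EVAL_AGENT_API_KEY ?? '',
    baseUrl: flags.baseUrl ?? env.EVAL_AGENT_BASE_URL,
  }
}
```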
## Agent types
| Type | Description |
@@ -66,9 +111,9 @@ The orchestrator works with any LLM provider. The executor can be another LLM, o
},
"executor": {
"provider": "clado-action",
"model": "qwen3-vl-30b-a3b-instruct",
"model": "Qwen3.5-35B-A3B-action-000159-merged",
"apiKey": "",
"baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run"
"baseUrl": "https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run"
}
}
}
@@ -96,6 +141,20 @@ The `apiKey` field supports two formats:
- **Env var name**: `"OPENAI_API_KEY"` — resolved from `.env.development` at runtime
- **Direct value**: `"sk-xxxxx"` — used as-is (not recommended)
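The two `apiKey` formats above boil down to a small resolution step. The helper below is a sketch, not the framework's actual code, and the all-caps heuristic for detecting an env var name is an assumption:

```typescript
// Sketch of the two apiKey formats described above: an env var name is
// looked up at runtime, anything else is used verbatim. The all-caps
// identifier heuristic is an assumption for illustration.
function resolveApiKey(
  field: string,
  env: Record<string, string | undefined>,
): string {
  const looksLikeEnvName = /^[A-Z][A-Z0-9_]*$/.test(field)
  if (looksLikeEnvName) return env[field] ?? ''
  return field // direct value, e.g. "sk-xxxxx"
}
```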
### Environment variables
| Variable | Used for |
|----------|----------|
| `EVAL_AGENT_PROVIDER`, `EVAL_AGENT_MODEL`, `EVAL_AGENT_API_KEY`, `EVAL_AGENT_BASE_URL`, `EVAL_AGENT_SUPPORTS_IMAGES` | Suite variant model selection |
| `FIREWORKS_API_KEY`, `OPENROUTER_API_KEY`, `ANTHROPIC_API_KEY`, provider-specific keys | Config-file or provider-backed model calls |
| `EVAL_EXECUTOR_MODEL`, `EVAL_EXECUTOR_API_KEY`, `EVAL_EXECUTOR_BASE_URL` | Suite-mode orchestrator executor override |
| `CLADO_ACTION_MODEL`, `CLADO_ACTION_API_KEY`, `CLADO_ACTION_BASE_URL` | Clado executor defaults |
| `BROWSEROS_BINARY` | BrowserOS binary path in CI/local smoke runs |
| `BROWSEROS_SERVER_URL` | Optional grader MCP URL override |
| `WEBARENA_INFINITY_DIR` | Local WebArena-Infinity checkout for Infinity tasks |
| `NOPECHA_API_KEY` | CAPTCHA solver extension |
| `EVAL_R2_ACCOUNT_ID`, `EVAL_R2_ACCESS_KEY_ID`, `EVAL_R2_SECRET_ACCESS_KEY`, `EVAL_R2_BUCKET`, `EVAL_R2_CDN_BASE_URL` | R2 upload and viewer URL |
### Supported providers
| Provider | `provider` value | Requires `baseUrl` |
@@ -110,6 +169,20 @@ The `apiKey` field supports two formats:
| Ollama | `ollama` | No |
| Clado Action (executor only) | `clado-action` | Yes |
### R2 publishing
`suite --config ... --publish r2` and `publish --target r2` upload the run artifacts plus `viewer.html` to the viewer-compatible R2 layout:
```bash
export EVAL_R2_ACCOUNT_ID=...
export EVAL_R2_ACCESS_KEY_ID=...
export EVAL_R2_SECRET_ACCESS_KEY=...
export EVAL_R2_BUCKET=browseros-eval
export EVAL_R2_CDN_BASE_URL=https://eval.browseros.com
```
Published runs are available at `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
### BrowserOS infrastructure
```json
@@ -137,10 +210,12 @@ Each worker gets its own Chrome instance. Worker N uses `base_port + N` for CDP
| File | Tasks | Description |
|------|-------|-------------|
| `agisdk-daily-10.jsonl` | 10 | Daily AGI SDK / REAL Bench subset |
| `webvoyager.jsonl` | 643 | Full WebVoyager benchmark |
| `mind2web.jsonl` | 300 | Online-Mind2Web |
| `webbench-{0,1,2}of4-50.jsonl` | 50 each | WebBench shards (50-task subsets) |
| `agisdk-real.jsonl` | 40 | AGI SDK / REAL Bench (action-only tasks) |
| `agisdk-real-smoke.jsonl` | 1 | AGI SDK / REAL Bench smoke task |
| `agisdk-real.jsonl` | 36 | AGI SDK / REAL Bench (action-only tasks) |
| `webarena-infinity-hard-50.jsonl` | 50 | WebArena-Infinity hard set |
| `browsecomp-medium-hard-50.jsonl` | 50 | BrowseComp medium-hard |
| `browsecomp-very-hard-50.jsonl` | 50 | BrowseComp very-hard |
@@ -167,14 +242,19 @@ results/
browseros-agent-weekly/
2026-04-29-1430/
Amazon--0/
attempt.json # Stable attempt summary for viewer/reporting
metadata.json # Task result, timing, grader scores
grades.json # Compact grader results
messages.jsonl # Full message log
grader-artifacts/ # Grader-specific inputs/outputs/stderr
screenshots/
001.png # Step-by-step screenshots
002.png
summary.json # Aggregate pass rates
```
R2 publishing preserves the same task files under `runs/<run-id>/...`, writes `runs/<run-id>/manifest.json`, and uploads `viewer.html` at the bucket root. The viewer URL is `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
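The R2 layout above maps to object keys like so. The key shapes follow this README (`runs/<run-id>/...`, a per-run manifest, `viewer.html` at the bucket root); the helpers themselves are illustrative, not the publisher's actual code:

```typescript
// Sketch of the R2 object layout described above. Key shapes follow the
// README; these helpers are illustrative, not the publisher module.
function r2KeysForRun(runId: string, taskFiles: string[]): string[] {
  return [
    ...taskFiles.map((file) => `runs/${runId}/${file}`),
    `runs/${runId}/manifest.json`,
    'viewer.html', // uploaded at the bucket root
  ]
}

function viewerUrl(cdnBaseUrl: string, runId: string): string {
  return `${cdnBaseUrl}/viewer.html?run=${encodeURIComponent(runId)}`
}
```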
## Troubleshooting
**BrowserOS not found**: The runner expects `/Applications/BrowserOS.app/Contents/MacOS/BrowserOS` by default. Set `BROWSEROS_BINARY` to override.

View File

@@ -7,8 +7,8 @@
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
},
"dataset": "../data/agisdk-real.jsonl",
"num_workers": 10,
"dataset": "../../data/agisdk-real-smoke.jsonl",
"num_workers": 1,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",

View File

@@ -0,0 +1,26 @@
{
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1",
"supportsImages": true
},
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 4,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["agisdk_state_diff"],
"timeout_ms": 1800000
}

View File

@@ -7,7 +7,7 @@
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
},
"dataset": "../data/webbench-2of4-50.jsonl",
"dataset": "../../data/webbench-2of4-50.jsonl",
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {

View File

@@ -14,7 +14,7 @@
"baseUrl": "https://api.fireworks.ai/inference/v1"
}
},
"dataset": "../data/webbench-2of4-50.jsonl",
"dataset": "../../data/webbench-2of4-50.jsonl",
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {

View File

@@ -9,12 +9,12 @@
},
"executor": {
"provider": "clado-action",
"model": "qwen3-vl-30b-a3b-instruct",
"model": "Qwen3.5-35B-A3B-action-000159-merged",
"apiKey": "",
"baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run"
"baseUrl": "https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run"
}
},
"dataset": "../data/webbench-2of4-50.jsonl",
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {
@@ -23,11 +23,11 @@
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
"headless": true
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"graders": ["agisdk_state_diff"],
"timeout_ms": 1800000
}

View File

@@ -7,7 +7,7 @@
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
},
"dataset": "../data/webarena-infinity-hard-50.jsonl",
"dataset": "../../data/webarena-infinity-hard-50.jsonl",
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {

View File

@@ -5,7 +5,7 @@
"model": "openai/gpt-4.1",
"apiKey": "OPENROUTER_API_KEY"
},
"dataset": "../data/mind2web.jsonl",
"dataset": "../../data/mind2web.jsonl",
"num_workers": 5,
"restart_server_per_task": true,
"browseros": {

View File

@@ -7,7 +7,7 @@
"baseUrl": "https://api.fireworks.ai/inference/v1",
"supportsImages": true
},
"dataset": "../data/webvoyager.jsonl",
"dataset": "../../data/webvoyager.jsonl",
"num_workers": 3,
"restart_server_per_task": true,
"browseros": {

View File

@@ -0,0 +1,22 @@
{
"id": "agisdk-daily-10",
"dataset": "../../data/agisdk-daily-10.jsonl",
"agent": {
"type": "single"
},
"graders": ["agisdk_state_diff"],
"workers": 1,
"restartBrowserPerTask": true,
"timeoutMs": 1800000,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": true
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
}
}

View File

@@ -0,0 +1,22 @@
{
"id": "agisdk-real-smoke",
"dataset": "../../data/agisdk-real-smoke.jsonl",
"agent": {
"type": "single"
},
"graders": ["agisdk_state_diff"],
"workers": 1,
"restartBrowserPerTask": true,
"timeoutMs": 1800000,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
}
}

View File

@@ -0,0 +1,22 @@
{
"id": "agisdk-real",
"dataset": "../../data/agisdk-real.jsonl",
"agent": {
"type": "single"
},
"graders": ["agisdk_state_diff"],
"workers": 1,
"restartBrowserPerTask": true,
"timeoutMs": 1800000,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
}
}

View File

@@ -0,0 +1,10 @@
{"query_id": "agisdk-dashdish-10", "dataset": "agisdk-real", "query": "Place an order from \"Souvla\" for a \"Medium Classic Cheeseburger\" and a \"Small Bacon Double Cheeseburger\" with \"Standard Delivery\" as the method with the default charged options.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-10", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
{"query_id": "agisdk-fly-unified-5", "dataset": "agisdk-real", "query": "Find me the cheapest fare for a flight from Orlando to Milwaukee on December 5th, 2024 and book it.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/30, Security Code: 420 Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-5", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "United Airlines"}}}
{"query_id": "agisdk-udriver-10", "dataset": "agisdk-real", "query": "Order me a ride for 4pm, I'll be at the de Young muesum headed to the Waterbar, fanciest option possible please.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-10", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
{"query_id": "agisdk-udriver-9", "dataset": "agisdk-real", "query": "Book me a ride from the thai restaurant I last took a ride to for later today at 2pm, I'll be at 333 Apartments on Fremont", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-9", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-9", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Uber"}}}
{"query_id": "agisdk-topwork-4", "dataset": "agisdk-real", "query": "Create a job post for a UI/UX Designer with expertise in Figma, Sketch, and Adobe Creative Suite, including project details, timeline, and required skills (Wireframing, Prototyping, Responsive Design).", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-4", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
{"query_id": "agisdk-gocalendar-4", "dataset": "agisdk-real", "query": "Change the \"Team Check-In\" event on July 18, 2024, name to \"Project Kickoff\" and update the location to \"Zoom\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-4", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
{"query_id": "agisdk-staynb-6", "dataset": "agisdk-real", "query": "Find and book the stay with the best value for money (cheapest stay with the best reviews) for 1 day. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-6", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-6", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
{"query_id": "agisdk-udriver-11", "dataset": "agisdk-real", "query": "I need to go from Pacific Catch on Chestnut back home to 333 Fremont now. If the fancy version is within ten dollars of the regular one, book that.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-11", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-11", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
{"query_id": "agisdk-networkin-5", "dataset": "agisdk-real", "query": "Send a connection request to John Smith.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-5", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-5", "challenge_type": "action", "difficulty": "easy", "similar_to": "LinkedIn"}}}
{"query_id": "agisdk-zilloft-6", "dataset": "agisdk-real", "query": "Select a property listed in San Francisco as \"Condos\" within a price range under $300,000 and request a tour for tomorrow at 4:00 PM. Use these contact details: Name: Sarah Brown, Email: sarahbrown@example.com, Phone: 555-987-6543.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-6", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "Zillow"}}}

View File

@@ -0,0 +1 @@
{"query_id": "agisdk-dashdish-10", "dataset": "agisdk-real", "query": "Place an order from \"Souvla\" for a \"Medium Classic Cheeseburger\" and a \"Small Bacon Double Cheeseburger\" with \"Standard Delivery\" as the method with the default charged options.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-10", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}

View File

@@ -32,9 +32,5 @@
{"query_id": "agisdk-networkin-10", "dataset": "agisdk-real", "query": "Generate a polite follow-up message for a previous unanswered chat, starting with \"Following up on\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-10", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-10", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
{"query_id": "agisdk-gomail-3", "dataset": "agisdk-real", "query": "Compose a new email to jonathan.smith@example.com with the subject \"Meeting Notes\" and body \"Please find the meeting notes attached.\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-3", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "Gmail"}}}
{"query_id": "agisdk-udriver-6", "dataset": "agisdk-real", "query": "Me and 4 friends need a ride from the Palace Hotel to dinner at Osha Thai leaving now", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-6", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-6", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
{"query_id": "agisdk-staynb-9", "dataset": "agisdk-real", "query": "Book a stay with the maximum number of guests supported. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-9", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-9", "challenge_type": "action", "difficulty": "hard", "similar_to": "Airbnb"}}}
{"query_id": "agisdk-zilloft-3", "dataset": "agisdk-real", "query": "Find a home in San Diego priced under $150,000 with at least 2 bedrooms and request a tour. Use these details: Contact Name: John Doe, Email: johndoe@example.com, Phone: 555-123-4567, Tour Time: 2:00 PM, Tour Date: First available.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-3", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-3", "challenge_type": "retrieval-action", "difficulty": "easy", "similar_to": "Zillow"}}}
{"query_id": "agisdk-fly-unified-6", "dataset": "agisdk-real", "query": "Reserve me a seat for the flight from Austin to Pittsburgh departing on December 11th, 2024 at 8:00 in Basic Economy.\nPassenger: Alice Brown\nDate of Birth: 05/20/1992\nSex: Female\nSeat Selection: Yes (Aisle seat)\nPayment: Credit Card (378342143523967), Exp: 09/27, security code: 332 Address: 789 Pine St, Los Angeles, CA, 90012, USA, Phone: 555-456-7890, Email: alicebrown@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-6", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "United Airlines"}}}
{"query_id": "agisdk-opendining-3", "dataset": "agisdk-real", "query": "Book a table at \"The Royal Dine\" for a party of 4 on July 20, 2024, at 7 PM. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-3", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "OpenTable"}}}
{"query_id": "agisdk-gocalendar-7", "dataset": "agisdk-real", "query": "Reschedule the \"Morning Coffee with sister\" event from July 18, 2024, at 9 AM to July 19, 2024, at 10AM using drag-and-drop functionality", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-7", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-7", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
{"query_id": "agisdk-staynb-5", "dataset": "agisdk-real", "query": "Use the search bar to look for a stay. For the \"Where\" section, use the \"Search by region\" popover and select \"Europe\". Set the check-in date to October 13th and the check-out date to October 23rd. For the \"Who\" section, select 1 infant, 2 children, and 2 adults. Press the search button, select the first stay, and book it.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-5", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-5", "challenge_type": "action", "difficulty": "medium", "similar_to": "Airbnb"}}}

View File

@@ -64,6 +64,37 @@ EXCLUDED_TASKS = {
# the grader's first criterion (search history contains "stanford") was
# never triggered server-side. Eval-site bug.
"networkin-9",
# Goal text instructs "move event to July 19, 10 AM" but the grader expects
# `eventsDiff.updated.*.start == "2024-07-18T17:00:00Z"` (= July 18, 10 AM
# PDT — same day, 1 hour shift). Goal contradicts grader: following the
# goal yields July 19 timestamps; satisfying the grader requires ignoring
# the explicit "to July 19" instruction. Confirmed via 8-trial deep-dive:
# never passed even after the Phase 2 HTML5 dnd dispatch fix made the drag
# actually populate `eventsDiff.updated` (now produces July 19 values, but
# grader rejects them).
"gocalendar-7",
# Grader hardcodes literal year strings `'Oct 13 2025'` / `'Oct 23 2025'`
# in checkin/checkout criteria. Today is 2026, and the staynb date picker
# interprets bare "Oct 13" as the most recent past instance — currently
# 2024, not 2025. Even a perfectly-acting agent cannot produce a booking
# whose persisted date contains "2025". Confirmed via 8 trials, 0 passes.
"staynb-5",
# Goal says "maximum number of guests supported"; grader expects the very
# specific string "32 Guests, 16 Infants" — which requires the agent to
# know that (a) Adults+Children sum into the displayed "Guests" count,
# (b) Infants render separately, (c) Pets are excluded, (d) per-category
# cap is 16 despite no UI affordance signalling it. None of this is in
# the prompt. 8 trials, 0 passes; even Opus 4.6 stopped at 16 (one
# category maxed). Task is under-specified relative to grader expectation.
"staynb-9",
# Grader requires `contains(booking.date, '2024-07-20')` but the eval-site
# date picker is a React-controlled textbox that the agent's `fill` tool
# frequently no-ops on. 3 of 8 trials passed (when fill happened to stick),
# 5 failed with `actual_value: False` (booking persisted with the eval-site
# default search date, not Jul 20). Effectively a coin-flip task that
# exercises tool-fidelity flakiness rather than agent capability —
# contributes noise, not signal. Excluding for eval reliability.
"opendining-3",
}
# Far-future replacement used by `freshen_goal_dates` when a task's hardcoded
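The comments above document why each excluded task is dropped. As a hypothetical sketch (the real implementation lives in the Python grader module; names and shapes here are assumed), such an exclusion set filters dataset records by their original task id:

```typescript
// Hypothetical filter mirroring the EXCLUDED_TASKS set above; not the
// pipeline's actual code.
const EXCLUDED = new Set(['gocalendar-7', 'staynb-5', 'staynb-9', 'opendining-3'])

interface Task {
  query_id: string
  metadata: { original_task_id: string }
}

function dropExcluded(tasks: Task[]): Task[] {
  return tasks.filter((t) => !EXCLUDED.has(t.metadata.original_task_id))
}

const sample: Task[] = [
  { query_id: 'agisdk-gocalendar-7', metadata: { original_task_id: 'gocalendar-7' } },
  { query_id: 'agisdk-networkin-5', metadata: { original_task_id: 'networkin-5' } },
]
const kept = dropExcluded(sample)
console.log(kept.map((t) => t.query_id).join(','))
```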

View File

@@ -1,34 +1,73 @@
/**
- * Test script for Clado API endpoints (grounding + action models)
+ * Smoke-test for the Clado BrowserOS Action endpoint.
*
* Health-checks the model, then runs a generate call and prints every
* field the new contract documents (action, coordinates, text, key,
* direction, scroll/drag fields, wait, end+final_answer, thinking,
* parse_error, raw_response).
*
* Usage:
* bun apps/eval/scripts/test-clado-api.ts [screenshot-path]
*
* If no screenshot provided, captures one from a running BrowserOS server.
* If no screenshot path is given, captures one over MCP from a
* running BrowserOS server (default http://127.0.0.1:9110, override
* with BROWSEROS_URL).
*
* Cold start can take ~5 minutes; the script waits up to 6.
*/
import { readFile } from 'node:fs/promises'
import { resolve } from 'node:path'
const ACTION_URL =
- 'https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run'
+ 'https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run'
const ACTION_HEALTH_URL =
'https://clado-ai--clado-browseros-action-actionmodel-health.modal.run'
const GROUNDING_URL =
'https://clado-ai--clado-browseros-grounding-groundingmodel-generate.modal.run'
const GROUNDING_HEALTH_URL =
'https://clado-ai--clado-browseros-grounding-groundingmodel-health.modal.run'
'https://clado-ai--clado-browseros-action-000159-merged-actionmod-5e5033.modal.run'
async function checkHealth(name: string, url: string): Promise<boolean> {
console.log(`\n--- ${name} health check ---`)
console.log(` URL: ${url}`)
const COLD_START_BUDGET_MS = 360_000 // 6 min — Clado cold start is ~5 min
const COLD_START_WARN_MS = 30_000
interface CladoResponse {
action?: string | null
thinking?: string | null
raw_response?: string
parse_error?: string | null
inference_time_seconds?: number
x?: number
y?: number
text?: string
key?: string
direction?: string
amount?: number
startX?: number
startY?: number
endX?: number
endY?: number
time?: number
final_answer?: string | null
}
async function checkHealth(): Promise<boolean> {
console.log(`\n--- Action model health ---`)
console.log(` URL: ${ACTION_HEALTH_URL}`)
console.log(
` Note: cold start can take ~5 min; waiting up to ${COLD_START_BUDGET_MS / 1000}s.`,
)
const start = performance.now()
const warn = setTimeout(() => {
console.log(
` ...still waiting (${COLD_START_WARN_MS / 1000}s in) — model is likely cold-starting on Modal.`,
)
}, COLD_START_WARN_MS)
try {
- const resp = await fetch(url, { signal: AbortSignal.timeout(30_000) })
+ const resp = await fetch(ACTION_HEALTH_URL, {
+   signal: AbortSignal.timeout(COLD_START_BUDGET_MS),
+ })
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
const body = await resp.text()
console.log(` Status: ${resp.status} (${elapsed}s)`)
- console.log(` Body: ${body.slice(0, 200)}`)
+ console.log(` Body: ${body.slice(0, 400)}`)
return resp.ok
} catch (err) {
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
@@ -36,63 +75,34 @@ async function checkHealth(name: string, url: string): Promise<boolean> {
` FAILED (${elapsed}s): ${err instanceof Error ? err.message : err}`,
)
return false
} finally {
clearTimeout(warn)
}
}
- async function testGenerate(
-   name: string,
-   url: string,
+ async function generate(
+   label: string,
  payload: Record<string, unknown>,
- ): Promise<Record<string, unknown> | null> {
-   console.log(`\n--- ${name} generate ---`)
-   console.log(` URL: ${url}`)
+ ): Promise<CladoResponse | null> {
+   console.log(`\n--- ${label} ---`)
+   console.log(` URL: ${ACTION_URL}`)
console.log(` Instruction: ${payload.instruction}`)
console.log(
` Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`,
` Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`,
)
if (payload.history) console.log(` History: ${payload.history}`)
if (payload.history && payload.history !== 'None') {
console.log(` History: ${payload.history}`)
}
const start = performance.now()
let resp: Response
try {
- const resp = await fetch(url, {
+ resp = await fetch(ACTION_URL, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload),
- signal: AbortSignal.timeout(120_000),
+ signal: AbortSignal.timeout(COLD_START_BUDGET_MS),
})
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
if (!resp.ok) {
const body = await resp.text()
console.log(` FAILED: HTTP ${resp.status} (${elapsed}s)`)
console.log(` Body: ${body.slice(0, 400)}`)
return null
}
const result = (await resp.json()) as Record<string, unknown>
console.log(` Status: ${resp.status} (${elapsed}s)`)
console.log(` Action: ${result.action}`)
if (result.x !== null && result.x !== undefined)
console.log(` Coordinates: (${result.x}, ${result.y})`)
if (result.text)
console.log(` Text: ${(result.text as string).slice(0, 100)}`)
if (result.key) console.log(` Key: ${result.key}`)
if (result.inference_time_seconds)
console.log(` Inference: ${result.inference_time_seconds}s`)
// Show thinking if present
const raw = result.raw_response as string | undefined
if (raw) {
const thinkMatch = raw.match(/<thinking>([\s\S]*?)<\/thinking>/)
if (thinkMatch) {
const thinking = thinkMatch[1].trim()
console.log(
` Thinking: ${thinking.slice(0, 200)}${thinking.length > 200 ? '...' : ''}`,
)
}
}
return result
} catch (err) {
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
console.log(
@@ -100,6 +110,50 @@ async function testGenerate(
)
return null
}
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
if (!resp.ok) {
const body = await resp.text()
console.log(` HTTP ${resp.status} ${resp.statusText} (${elapsed}s)`)
console.log(` Body: ${body.slice(0, 400)}`)
return null
}
const result = (await resp.json()) as CladoResponse
console.log(` HTTP ${resp.status} (${elapsed}s)`)
console.log(` action: ${result.action ?? 'null'}`)
if (result.parse_error) {
console.log(` parse_error: ${result.parse_error}`)
}
if (result.thinking) {
const trimmed = result.thinking.replace(/\s+/g, ' ').trim()
console.log(
` thinking: ${trimmed.slice(0, 240)}${trimmed.length > 240 ? '…' : ''}`,
)
}
if (typeof result.x === 'number' || typeof result.y === 'number') {
console.log(` x, y: ${result.x}, ${result.y}`)
}
if (typeof result.text === 'string')
console.log(` text: ${result.text.slice(0, 120)}`)
if (typeof result.key === 'string')
console.log(` key: ${result.key}`)
if (typeof result.direction === 'string')
console.log(` direction: ${result.direction}`)
if (typeof result.amount === 'number')
console.log(` amount: ${result.amount}`)
if (typeof result.startX === 'number' || typeof result.endX === 'number') {
console.log(
` drag: (${result.startX}, ${result.startY}) → (${result.endX}, ${result.endY})`,
)
}
if (typeof result.time === 'number')
console.log(` time: ${result.time}s`)
if (result.final_answer)
console.log(` final_answer: ${result.final_answer.slice(0, 240)}`)
if (typeof result.inference_time_seconds === 'number')
console.log(` inference_time_seconds: ${result.inference_time_seconds}`)
return result
}
async function loadScreenshot(path?: string): Promise<string> {
@@ -110,10 +164,9 @@ async function loadScreenshot(path?: string): Promise<string> {
return data.toString('base64')
}
// Try to capture from a running BrowserOS server
const serverUrl = process.env.BROWSEROS_URL || 'http://127.0.0.1:9110'
console.log(
- `No screenshot path provided. Trying to capture from ${serverUrl}...`,
+ `No screenshot path provided. Capturing from ${serverUrl} via MCP...`,
)
const { Client } = await import('@modelcontextprotocol/sdk/client/index.js')
@@ -134,82 +187,101 @@ async function loadScreenshot(path?: string): Promise<string> {
arguments: { format: 'png', page: 1 },
})) as { content: Array<{ type: string; data?: string }> }
- const imageContent = result.content?.find((c) => c.type === 'image')
- if (!imageContent?.data)
-   throw new Error('No image data in screenshot response')
+ const image = result.content?.find((c) => c.type === 'image')
+ if (!image?.data)
+   throw new Error('No image data in take_screenshot response')
console.log(
- `Captured screenshot (${(imageContent.data.length / 1024).toFixed(0)} KB base64)`,
+ `Captured screenshot (${(image.data.length / 1024).toFixed(0)} KB base64)`,
)
- return imageContent.data
+ return image.data
} finally {
try {
await transport.close()
} catch {}
} catch {
/* ignore */
}
}
}
function summarize(history: CladoResponse[]): string {
if (history.length === 0) return 'None'
return history
.map((h) => {
switch (h.action) {
case 'click':
case 'double_click':
case 'right_click':
case 'hover':
return `${h.action}(${h.x}, ${h.y})`
case 'type':
return `type(${JSON.stringify(h.text ?? '')})`
case 'press_key':
return `press_key(${JSON.stringify(h.key ?? '')})`
case 'scroll':
return `scroll(${h.direction ?? 'down'})`
case 'drag':
return `drag(${h.startX},${h.startY} -> ${h.endX},${h.endY})`
case 'wait':
return `wait(${h.time ?? 1}s)`
case 'end':
return 'end()'
default:
return h.action ?? 'invalid'
}
})
.join(' -> ')
}
async function main() {
const screenshotPath = process.argv[2]
console.log('=== Clado action endpoint smoke test ===')
console.log('=== Clado API Test ===\n')
// Health checks (parallel)
const [actionHealthy, groundingHealthy] = await Promise.all([
checkHealth('Action Model', ACTION_HEALTH_URL),
checkHealth('Grounding Model', GROUNDING_HEALTH_URL),
])
if (!actionHealthy && !groundingHealthy) {
console.log('\nBoth endpoints are down. Exiting.')
const healthy = await checkHealth()
if (!healthy) {
console.log('\nHealth check failed. Exiting.')
process.exit(1)
}
// Load screenshot
let imageBase64: string
try {
imageBase64 = await loadScreenshot(screenshotPath)
imageBase64 = await loadScreenshot(process.argv[2])
} catch (err) {
console.log(
`\nFailed to load screenshot: ${err instanceof Error ? err.message : err}`,
)
console.log(
'Provide a screenshot path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
'Pass a path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
)
process.exit(1)
}
const instruction = 'Click on the search button or search bar'
const history: CladoResponse[] = []
// Test grounding model
if (groundingHealthy) {
await testGenerate('Grounding Model', GROUNDING_URL, {
instruction,
// Step 1: open task — let the model decide what to do.
const step1 = await generate('Step 1: cold task', {
instruction: 'Find the search bar and click it',
image_base64: imageBase64,
history: 'None',
})
if (step1?.action) history.push(step1)
// Step 2: continuation with history, asks for typing.
if (step1?.action) {
const step2 = await generate('Step 2: with history', {
instruction: 'Type "hello world" into the search bar',
image_base64: imageBase64,
history: summarize(history),
})
} else {
console.log('\nSkipping grounding model (unhealthy)')
if (step2?.action) history.push(step2)
}
// Test action model (no history)
if (actionHealthy) {
const result = await testGenerate('Action Model (step 1)', ACTION_URL, {
instruction,
image_base64: imageBase64,
history: 'None',
})
// Test action model with history (simulate multi-turn)
if (result && result.action === 'click') {
await testGenerate('Action Model (step 2, with history)', ACTION_URL, {
instruction: 'Type "hello world" in the search bar',
image_base64: imageBase64,
history: `click(${result.x}, ${result.y})`,
})
}
} else {
console.log('\nSkipping action model (unhealthy)')
}
// Step 3: ask for end with a final answer to exercise that field.
await generate('Step 3: ask for end+final_answer', {
instruction:
'You have completed the task. Reply with end() and final_answer="done".',
image_base64: imageBase64,
history: summarize(history),
})
console.log('\n=== Done ===')
}

View File

@@ -1,349 +1,43 @@
#!/usr/bin/env bun
/**
* Upload eval runs to R2.
*
* Two modes:
* bun scripts/upload-run.ts results/browseros-agent-weekly/2026-03-21-1730
* → uploads that specific run
*
* bun scripts/upload-run.ts results/browseros-agent-weekly
* → finds all timestamped subfolders, uploads any not yet in R2
*
* Env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY
* EVAL_R2_BUCKET (default: browseros-eval)
* EVAL_R2_CDN_BASE_URL (default: https://eval.browseros.com)
*/
import { readdir, readFile, stat } from 'node:fs/promises'
import { basename, dirname, extname, join } from 'node:path'
import {
- GetObjectCommand,
- PutObjectCommand,
- S3Client,
- } from '@aws-sdk/client-s3'
+ loadR2ConfigFromEnv,
+ R2Publisher,
+ } from '../src/publishing/r2-publisher'
const CONCURRENCY = 20
const CONTENT_TYPES: Record<string, string> = {
'.json': 'application/json',
'.jsonl': 'application/x-ndjson',
'.png': 'image/png',
}
interface R2Config {
accountId: string
accessKeyId: string
secretAccessKey: string
bucket: string
cdnBaseUrl: string
}
function loadConfig(): R2Config {
const accountId = process.env.EVAL_R2_ACCOUNT_ID
const accessKeyId = process.env.EVAL_R2_ACCESS_KEY_ID
const secretAccessKey = process.env.EVAL_R2_SECRET_ACCESS_KEY
if (!accountId || !accessKeyId || !secretAccessKey) {
console.error(
'Missing required env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY',
)
process.exit(1)
}
return {
accountId,
accessKeyId,
secretAccessKey,
bucket: process.env.EVAL_R2_BUCKET || 'browseros-eval',
cdnBaseUrl: (
process.env.EVAL_R2_CDN_BASE_URL || 'https://eval.browseros.com'
).replace(/\/+$/, ''),
}
}
function createClient(config: R2Config): S3Client {
return new S3Client({
region: 'auto',
endpoint: `https://${config.accountId}.r2.cloudflarestorage.com`,
credentials: {
accessKeyId: config.accessKeyId,
secretAccessKey: config.secretAccessKey,
},
})
}
async function upload(
client: S3Client,
bucket: string,
key: string,
body: Buffer,
contentType: string,
) {
await client.send(
new PutObjectCommand({
Bucket: bucket,
Key: key,
Body: body,
ContentType: contentType,
}),
)
}
async function collectFiles(dir: string): Promise<string[]> {
const files: string[] = []
const entries = await readdir(dir, { withFileTypes: true })
for (const entry of entries) {
const full = join(dir, entry.name)
if (entry.isDirectory()) {
files.push(...(await collectFiles(full)))
} else {
files.push(full)
}
}
return files
}
async function runPool<T>(
items: T[],
concurrency: number,
fn: (item: T) => Promise<void>,
) {
let i = 0
const workers = Array.from({ length: concurrency }, async () => {
while (i < items.length) {
const idx = i++
await fn(items[idx])
}
})
await Promise.all(workers)
}
// Check if a run has already been uploaded to R2
async function isUploaded(
client: S3Client,
bucket: string,
runId: string,
): Promise<boolean> {
try {
await client.send(
new GetObjectCommand({
Bucket: bucket,
Key: `runs/${runId}/manifest.json`,
}),
)
return true
} catch {
return false
}
}
// Detect if a directory is a run dir (has task subdirs with metadata.json)
// vs a config dir (has timestamped subdirs like 2026-03-21-1730/)
async function isRunDir(dir: string): Promise<boolean> {
const entries = await readdir(dir, { withFileTypes: true })
const subdirs = entries.filter((e) => e.isDirectory())
for (const subdir of subdirs) {
const metaPath = join(dir, subdir.name, 'metadata.json')
const metaStat = await stat(metaPath).catch(() => null)
if (metaStat?.isFile()) return true
}
return false
}
async function uploadSingleRun(
runDir: string,
runId: string,
r2Config: R2Config,
client: S3Client,
): Promise<void> {
const taskDirs = await readdir(runDir, { withFileTypes: true })
const taskEntries = taskDirs.filter((d) => d.isDirectory())
if (taskEntries.length === 0) {
console.warn(` No task subdirectories in ${runId}, skipping`)
return
}
const manifestTasks: Record<string, unknown>[] = []
const jobs: { key: string; filePath: string; contentType: string }[] = []
// Extract agent config from first task
let agentConfig: Record<string, unknown> | undefined
let dataset: string | undefined
for (const taskDir of taskEntries) {
const taskId = taskDir.name
const taskPath = join(runDir, taskId)
const metaPath = join(taskPath, 'metadata.json')
let meta: Record<string, unknown> = {}
try {
meta = JSON.parse(await readFile(metaPath, 'utf-8'))
} catch {
continue
}
if (!agentConfig && meta.agent_config)
agentConfig = meta.agent_config as Record<string, unknown>
if (!dataset && meta.dataset) dataset = meta.dataset as string
const files = await collectFiles(taskPath)
let screenshotCount = 0
for (const file of files) {
const relative = file.slice(taskPath.length + 1)
const ext = extname(file)
if (relative.startsWith('screenshots/') && ext === '.png')
screenshotCount++
jobs.push({
key: `runs/${runId}/${taskId}/${relative}`,
filePath: file,
contentType: CONTENT_TYPES[ext] || 'application/octet-stream',
})
}
manifestTasks.push({
queryId: meta.query_id || taskId,
query: meta.query || '',
startUrl: meta.start_url || '',
status:
meta.termination_reason === 'completed'
? 'completed'
: meta.termination_reason || 'unknown',
durationMs: meta.total_duration_ms || 0,
screenshotCount: (meta.screenshot_count as number) || screenshotCount,
graderResults: meta.grader_results || {},
})
}
if (manifestTasks.length === 0) {
console.warn(` No completed tasks in ${runId}, skipping`)
return
}
console.log(
` Uploading ${jobs.length} files across ${manifestTasks.length} tasks...`,
)
let uploaded = 0
await runPool(jobs, CONCURRENCY, async (job) => {
const body = await readFile(job.filePath)
await upload(client, r2Config.bucket, job.key, body, job.contentType)
uploaded++
if (uploaded % 50 === 0 || uploaded === jobs.length) {
console.log(` ${uploaded}/${jobs.length}`)
}
})
// Read summary.json if it exists
let summaryData: Record<string, unknown> | undefined
try {
summaryData = JSON.parse(
await readFile(join(runDir, 'summary.json'), 'utf-8'),
)
} catch {}
// Upload manifest
const manifest = {
runId,
uploadedAt: new Date().toISOString(),
agentConfig,
dataset,
summary: summaryData
? {
passRate: summaryData.passRate,
avgDurationMs: summaryData.avgDurationMs,
}
: undefined,
tasks: manifestTasks,
}
const manifestBody = Buffer.from(JSON.stringify(manifest, null, 2))
await upload(
client,
r2Config.bucket,
`runs/${runId}/manifest.json`,
manifestBody,
'application/json',
)
// Upload viewer.html to bucket root
const viewerPath = join(
import.meta.dir,
'..',
'src',
'dashboard',
'viewer.html',
)
const viewerBody = await readFile(viewerPath)
await upload(client, r2Config.bucket, 'viewer.html', viewerBody, 'text/html')
console.log(` Uploaded ${uploaded + 2} files`)
console.log(` ${r2Config.cdnBaseUrl}/viewer.html?run=${runId}`)
}
async function main() {
async function main(): Promise<void> {
const inputDir = process.argv[2]
if (!inputDir) {
console.error(
throw new Error(
'Usage:\n' +
' bun scripts/upload-run.ts results/config-name/2026-03-21-1730 (specific run)\n' +
' bun scripts/upload-run.ts results/config-name (all un-uploaded runs)',
)
process.exit(1)
}
const dirStat = await stat(inputDir).catch(() => null)
if (!dirStat?.isDirectory()) {
console.error(`Not a directory: ${inputDir}`)
process.exit(1)
}
const r2Config = loadConfig()
const client = createClient(r2Config)
if (await isRunDir(inputDir)) {
// Single run: results/config-name/2026-03-21-1730
const timestamp = basename(inputDir)
const configName = basename(dirname(inputDir))
const runId = `${configName}-${timestamp}`
console.log(`Uploading run: ${runId}`)
await uploadSingleRun(inputDir, runId, r2Config, client)
} else {
// Config dir: results/config-name/ — upload all un-uploaded runs
const configName = basename(inputDir)
const entries = await readdir(inputDir, { withFileTypes: true })
const runDirs = entries
.filter((e) => e.isDirectory())
.map((e) => e.name)
.sort()
if (runDirs.length === 0) {
console.error('No run subdirectories found')
process.exit(1)
}
console.log(
`Found ${runDirs.length} runs for config "${configName}", checking R2...`,
)
let uploadedCount = 0
for (const dir of runDirs) {
const runId = `${configName}-${dir}`
const alreadyUploaded = await isUploaded(client, r2Config.bucket, runId)
if (alreadyUploaded) {
console.log(` ${runId}: already uploaded, skipping`)
continue
}
console.log(` ${runId}: uploading...`)
await uploadSingleRun(join(inputDir, dir), runId, r2Config, client)
uploadedCount++
}
console.log(
`\nDone. Uploaded ${uploadedCount} new run(s), ${runDirs.length - uploadedCount} already in R2.`,
' bun scripts/upload-run.ts results/config-name/2026-03-21-1730\n' +
' bun scripts/upload-run.ts results/config-name',
)
}
const publisher = new R2Publisher({ config: loadR2ConfigFromEnv() })
const result = await publisher.publishPath(inputDir)
for (const run of result.uploadedRuns) {
console.log(`Uploaded ${run.uploadedFiles} files for ${run.runId}`)
console.log(run.viewerUrl)
}
for (const runId of result.skippedRuns) {
console.log(`${runId}: already uploaded, skipping`)
}
console.log(
`Done. Uploaded ${result.uploadedRuns.length} run(s), skipped ${result.skippedRuns.length}.`,
)
}
main()
main().catch((error) => {
console.error(error instanceof Error ? error.message : String(error))
process.exit(1)
})

View File

@@ -0,0 +1,191 @@
import type {
CladoAction,
CladoActionResponse,
RawCladoActionPayload,
} from './types'
/** Parses Clado's structured response plus any raw `<answer>` blocks into executable actions. */
export function parseCladoActions(
prediction: CladoActionResponse,
): CladoAction[] {
const actionFromField =
typeof prediction.action === 'string' ? prediction.action : null
const rawActions = parseCladoActionsFromRawResponse(prediction.raw_response)
const primaryFromRaw = rawActions[0] ?? null
const mergedPrimary = {
...primaryFromRaw,
...prediction,
action: actionFromField ?? primaryFromRaw?.action,
}
const normalized: CladoAction[] = []
const primary = normalizeCladoActionPayload(mergedPrimary)
if (primary) normalized.push(primary)
for (const candidate of rawActions.slice(1)) {
const parsed = normalizeCladoActionPayload(candidate)
if (!parsed) continue
const prev = normalized[normalized.length - 1]
if (
!prev ||
getCladoActionSignature(prev) !== getCladoActionSignature(parsed)
) {
normalized.push(parsed)
}
}
return normalized
}
export function normalizeCladoActionPayload(
payload: RawCladoActionPayload,
): CladoAction | null {
if (!payload.action || typeof payload.action !== 'string') {
return null
}
return {
action: payload.action,
x: typeof payload.x === 'number' ? payload.x : undefined,
y: typeof payload.y === 'number' ? payload.y : undefined,
text: typeof payload.text === 'string' ? payload.text : undefined,
key: typeof payload.key === 'string' ? payload.key : undefined,
direction:
typeof payload.direction === 'string' ? payload.direction : undefined,
startX: typeof payload.startX === 'number' ? payload.startX : undefined,
startY: typeof payload.startY === 'number' ? payload.startY : undefined,
endX: typeof payload.endX === 'number' ? payload.endX : undefined,
endY: typeof payload.endY === 'number' ? payload.endY : undefined,
amount: typeof payload.amount === 'number' ? payload.amount : undefined,
time: typeof payload.time === 'number' ? payload.time : undefined,
final_answer:
typeof payload.final_answer === 'string'
? payload.final_answer
: undefined,
}
}
export function parseCladoActionsFromRawResponse(
rawResponse: string | undefined,
): RawCladoActionPayload[] {
if (!rawResponse) return []
const matches = [
...rawResponse.matchAll(/<answer>\s*([\s\S]*?)\s*<\/answer>/gi),
]
const parsed: RawCladoActionPayload[] = []
for (const match of matches) {
try {
parsed.push(JSON.parse(match[1]) as RawCladoActionPayload)
} catch {
// Ignore malformed answer blocks so one bad block does not drop the whole prediction.
}
}
return parsed
}
export function extractCladoThinking(
rawResponse: string | undefined,
): string | undefined {
if (!rawResponse) return undefined
const matches = [
...rawResponse.matchAll(/<thinking>\s*([\s\S]*?)\s*<\/thinking>/gi),
]
if (matches.length === 0) return undefined
const merged = matches
.map((match) => match[1]?.replace(/\s+/g, ' ').trim() ?? '')
.filter((value) => value.length > 0)
.join(' ')
if (!merged) return undefined
return merged
}
export function summarizeCladoPrediction(
prediction: CladoActionResponse,
): Record<string, unknown> {
const preview =
typeof prediction.raw_response === 'string' &&
prediction.raw_response.length > 0
? prediction.raw_response.slice(0, 240)
: undefined
return {
action: prediction.action,
x: prediction.x,
y: prediction.y,
text: prediction.text,
key: prediction.key,
direction: prediction.direction,
startX: prediction.startX,
startY: prediction.startY,
endX: prediction.endX,
endY: prediction.endY,
amount: prediction.amount,
time: prediction.time,
inference_time_seconds: prediction.inference_time_seconds,
raw_response_preview: preview,
}
}
export function getCladoActionSignature(action: CladoAction): string {
switch (action.action) {
case 'click':
case 'double_click':
case 'right_click':
case 'hover':
return `${action.action}:${action.x ?? 'x'}:${action.y ?? 'y'}`
case 'type':
return `${action.action}:${(action.text ?? '').slice(0, 16)}`
case 'press_key':
return `${action.action}:${action.key ?? 'key'}`
case 'scroll':
return `${action.action}:${action.direction ?? 'down'}:${action.amount ?? 500}`
case 'drag':
return `${action.action}:${action.startX}:${action.startY}:${action.endX}:${action.endY}`
case 'wait':
return `${action.action}:${action.time ?? 1}`
case 'end':
return action.final_answer
? `end(${action.final_answer.slice(0, 32)})`
: 'end()'
case 'invalid':
return `invalid(${(action.text ?? '').slice(0, 40)})`
default:
return action.action
}
}
export function formatCladoHistory(actions: CladoAction[]): string {
if (actions.length === 0) return 'None'
const parts = actions.map((action) => {
switch (action.action) {
case 'click':
case 'double_click':
case 'right_click':
case 'hover':
return `${action.action}(${Math.round(action.x ?? 500)}, ${Math.round(action.y ?? 500)})`
case 'type': {
const text = (action.text ?? '').replace(/'/g, "\\'")
return `type('${text}')`
}
case 'press_key':
return `press_key('${action.key ?? 'Enter'}')`
case 'scroll':
return `scroll(${action.direction ?? 'down'})`
case 'drag':
return `drag(${Math.round(action.startX ?? 500)},${Math.round(action.startY ?? 500)} -> ${Math.round(action.endX ?? 500)},${Math.round(action.endY ?? 500)})`
case 'wait':
return `wait(${Math.round(action.time ?? 1)}s)`
case 'end':
return 'end()'
case 'invalid':
return 'invalid()'
default:
return action.action
}
})
return parts.join(' -> ')
}

View File

@@ -0,0 +1,123 @@
import {
CLADO_PAGE_SCOPED_TOOLS,
type CladoActionPoint,
type CladoViewport,
} from './types'
export function clampCladoNormalizedCoordinate(value: number): number {
return Math.min(999, Math.max(0, Math.round(value)))
}
/** Converts Clado's 0-1000 normalized coordinate space into BrowserOS viewport pixels. */
export function resolveCladoPoint(
viewport: CladoViewport,
normalizedX: number | undefined,
normalizedY: number | undefined,
): CladoActionPoint {
const nx = clampCladoNormalizedCoordinate(normalizedX ?? 500)
const ny = clampCladoNormalizedCoordinate(normalizedY ?? 500)
return {
x: Math.round((nx / 1000) * viewport.width),
y: Math.round((ny / 1000) * viewport.height),
}
}
/** Adapts Clado action tool arguments to the BrowserOS MCP tool argument contract. */
export function prepareCladoToolArgs(
toolName: string,
args: Record<string, unknown>,
pageId: number,
): Record<string, unknown> {
const prepared: Record<string, unknown> = { ...args }
if (
toolName === 'evaluate_script' &&
typeof prepared.function === 'string' &&
prepared.expression === undefined
) {
prepared.expression = toCladoEvaluateExpression(prepared.function)
delete prepared.function
}
if (
toolName === 'click_at' &&
typeof prepared.dblClick === 'boolean' &&
prepared.clickCount === undefined
) {
prepared.clickCount = prepared.dblClick ? 2 : 1
delete prepared.dblClick
}
if (
CLADO_PAGE_SCOPED_TOOLS.has(toolName) &&
typeof prepared.page !== 'number'
) {
prepared.page = pageId
}
return prepared
}
export function toCladoEvaluateExpression(rawFunction: unknown): string {
const source = String(rawFunction).trim()
if (source.startsWith('() =>') || source.startsWith('async () =>')) {
return `(${source})()`
}
if (source.startsWith('function')) {
return `(${source})()`
}
return source
}
export function normalizeCladoPressKey(key: string | undefined): string {
const raw = (key ?? '').trim()
if (!raw) throw new Error('press_key action missing key field')
const map: Record<string, string> = {
'C-a': 'Control+A',
'C-c': 'Control+C',
'C-v': 'Control+V',
'C-x': 'Control+X',
'C-z': 'Control+Z',
'C-y': 'Control+Y',
'C-s': 'Control+S',
'C-t': 'Control+T',
'C-w': 'Control+W',
'C-h': 'Control+H',
'C-f': 'Control+F',
'C-+': 'Control++',
'C--': 'Control+-',
'C-tab': 'Control+Tab',
'C-S-tab': 'Control+Shift+Tab',
'C-S-n': 'Control+Shift+N',
'C-down': 'Control+ArrowDown',
'M-a': 'Meta+A',
'M-c': 'Meta+C',
'M-v': 'Meta+V',
'M-x': 'Meta+X',
'M-f4': 'Alt+F4',
}
return map[raw] ?? raw
}
export function normalizeCladoDirection(
direction: string | undefined,
): 'up' | 'down' | 'left' | 'right' {
if (
direction === 'up' ||
direction === 'down' ||
direction === 'left' ||
direction === 'right'
) {
return direction
}
return 'down'
}
export function normalizeCladoScrollAmount(amount: number | undefined): number {
if (typeof amount !== 'number') return 500
if (amount <= 0) return 100
const clamped = Math.min(amount, 1000)
return Math.max(100, Math.round((clamped / 1000) * 900))
}

View File

@@ -0,0 +1,68 @@
import { CLADO_REQUEST_TIMEOUT_MS } from '../../../../constants'
import { formatCladoHistory } from './clado-actions'
import type { CladoAction, CladoActionResponse } from './types'
export interface CladoActionClientOptions {
baseUrl?: string
apiKey?: string
}
export interface CladoActionPredictionInput {
instruction: string
imageBase64: string
actionHistory: CladoAction[]
signal?: AbortSignal
}
/** Calls the Clado action model without exposing credentials in process arguments or artifacts. */
export class CladoActionClient {
constructor(private readonly options: CladoActionClientOptions) {}
async requestActionPrediction(
input: CladoActionPredictionInput,
): Promise<CladoActionResponse> {
if (!this.options.baseUrl) {
throw new Error('executor.baseUrl must be set for clado-action provider')
}
const requestController = new AbortController()
const onAbort = () => requestController.abort()
input.signal?.addEventListener('abort', onAbort, { once: true })
const timeoutHandle = setTimeout(() => {
requestController.abort()
}, CLADO_REQUEST_TIMEOUT_MS)
try {
const headers: Record<string, string> = {
'Content-Type': 'application/json',
}
if (this.options.apiKey) {
headers.Authorization = `Bearer ${this.options.apiKey}`
}
const response = await fetch(this.options.baseUrl, {
method: 'POST',
headers,
body: JSON.stringify({
instruction: input.instruction,
image_base64: input.imageBase64,
history: formatCladoHistory(input.actionHistory),
}),
signal: requestController.signal,
})
if (!response.ok) {
const body = await response.text()
throw new Error(
`HTTP ${response.status} ${response.statusText}: ${body.slice(0, 400)}`,
)
}
return (await response.json()) as CladoActionResponse
} finally {
clearTimeout(timeoutHandle)
input.signal?.removeEventListener('abort', onAbort)
}
}
}

View File

@@ -0,0 +1,78 @@
export const CLADO_ACTION_PROVIDER = 'clado-action'
export const CLADO_PAGE_SCOPED_TOOLS = new Set<string>([
'take_screenshot',
'evaluate_script',
'click',
'click_at',
'hover',
'hover_at',
'clear',
'fill',
'press_key',
'type_at',
'drag',
'drag_at',
'scroll',
'handle_dialog',
'select_option',
'navigate_page',
'close_page',
'wait_for',
])
export interface CladoActionResponse {
action?: string | null
x?: number
y?: number
text?: string
key?: string
direction?: string
startX?: number
startY?: number
endX?: number
endY?: number
amount?: number
time?: number
final_answer?: string | null
inference_time_seconds?: number
raw_response?: string
thinking?: string | null
parse_error?: string | null
}
export interface CladoViewport {
width: number
height: number
}
export interface CladoAction {
action: string
x?: number
y?: number
text?: string
key?: string
direction?: string
startX?: number
startY?: number
endX?: number
endY?: number
amount?: number
time?: number
final_answer?: string
}
export type RawCladoActionPayload = Partial<
Omit<CladoAction, 'final_answer'>
> & {
final_answer?: string | null
}
export interface CladoActionPoint {
x: number
y: number
}
export function isCladoActionProvider(provider: string): boolean {
return provider === CLADO_ACTION_PROVIDER
}

View File

@@ -0,0 +1,45 @@
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import type { Browser } from '@browseros/server/browser'
import type { ExecutorCallbacks } from '../../orchestrator-executor/executor'
import type { ExecutorBackend, ExecutorBackendKind } from '../executor-backend'
import { ExecutorAdapterBackend } from './tool-loop-backend'
export interface CreateExecutorBackendOptions {
backendKind?: ExecutorBackendKind
provider?: string
configTemplate?: ResolvedAgentConfig
browser?: Browser | null
serverUrl?: string
windowId?: number
tabId?: number
initialPageId?: number
callbacks?: ExecutorCallbacks
executor?: ExecutorBackend
}
export function backendKindForProvider(provider: string): ExecutorBackendKind {
return provider === 'clado-action' ? 'clado' : 'tool-loop'
}
/** Creates the backend used for one orchestrator delegation. */
export function createExecutorBackend(
options: CreateExecutorBackendOptions,
): ExecutorBackend {
const kind =
options.backendKind ??
backendKindForProvider(
options.provider ?? options.configTemplate?.provider ?? '',
)
return new ExecutorAdapterBackend({
kind,
configTemplate: options.configTemplate,
browser: options.browser,
serverUrl: options.serverUrl,
windowId: options.windowId,
tabId: options.tabId,
initialPageId: options.initialPageId,
callbacks: options.callbacks,
executor: options.executor,
})
}

View File

@@ -0,0 +1,72 @@
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import type { Browser } from '@browseros/server/browser'
import {
Executor,
type ExecutorCallbacks,
} from '../../orchestrator-executor/executor'
import type {
DelegationResult,
ExecutorBackend,
ExecutorBackendKind,
} from '../executor-backend'
interface ExecutorRunner {
execute(instruction: string, signal?: AbortSignal): Promise<DelegationResult>
close(): Promise<void>
getTotalSteps(): number
}
export interface ExecutorAdapterBackendOptions {
kind: ExecutorBackendKind
configTemplate?: ResolvedAgentConfig
browser?: Browser | null
serverUrl?: string
windowId?: number
tabId?: number
initialPageId?: number
callbacks?: ExecutorCallbacks
executor?: ExecutorRunner
}
export class ExecutorAdapterBackend implements ExecutorBackend {
readonly kind: ExecutorBackendKind
private readonly executor: ExecutorRunner
constructor(options: ExecutorAdapterBackendOptions) {
this.kind = options.kind
this.executor =
options.executor ??
new Executor(
required(options.configTemplate, 'configTemplate'),
options.browser ?? null,
required(options.serverUrl, 'serverUrl'),
{
isCladoAction: options.kind === 'clado',
windowId: options.windowId,
tabId: options.tabId,
initialPageId: options.initialPageId,
callbacks: options.callbacks,
},
)
}
execute(
instruction: string,
signal?: AbortSignal,
): Promise<DelegationResult> {
return this.executor.execute(instruction, signal)
}
close(): Promise<void> {
return this.executor.close()
}
getTotalSteps(): number {
return this.executor.getTotalSteps()
}
}
function required<T>(value: T | undefined, name: string): T {
if (value === undefined) throw new Error(`${name} is required`)
return value
}

View File

@@ -0,0 +1,11 @@
import type { ExecutorResult } from '../orchestrator-executor/types'
export type ExecutorBackendKind = 'tool-loop' | 'clado'
export type DelegationResult = ExecutorResult
export interface ExecutorBackend {
readonly kind: ExecutorBackendKind
execute(instruction: string, signal?: AbortSignal): Promise<DelegationResult>
close(): Promise<void>
getTotalSteps(): number
}

View File

@@ -1,98 +1,47 @@
import { randomUUID } from 'node:crypto'
import {
CLADO_REQUEST_TIMEOUT_MS,
MAX_ACTIONS_PER_DELEGATION,
} from '../../constants'
import { MAX_ACTIONS_PER_DELEGATION } from '../../constants'
import { McpClient, type McpToolResult } from '../../utils/mcp-client'
import { sleep } from '../../utils/sleep'
import {
extractCladoThinking,
formatCladoHistory,
getCladoActionSignature,
parseCladoActions,
summarizeCladoPrediction,
} from '../orchestrated/backends/clado/clado-actions'
import {
normalizeCladoDirection,
normalizeCladoPressKey,
normalizeCladoScrollAmount,
prepareCladoToolArgs,
resolveCladoPoint,
} from '../orchestrated/backends/clado/clado-browser-driver'
import { CladoActionClient } from '../orchestrated/backends/clado/clado-client'
import {
CLADO_ACTION_PROVIDER,
type CladoAction,
type CladoActionPoint,
type CladoActionResponse,
type CladoViewport,
isCladoActionProvider,
} from '../orchestrated/backends/clado/types'
import type { ExecutorCallbacks } from './executor'
import type { ExecutorConfig, ExecutorResult } from './types'
const CLADO_ACTION_PROVIDER = 'clado-action'
const PAGE_SCOPED_TOOLS = new Set<string>([
'take_screenshot',
'evaluate_script',
'click',
'click_at',
'hover',
'hover_at',
'clear',
'fill',
'press_key',
'type_at',
'drag',
'drag_at',
'scroll',
'handle_dialog',
'select_option',
'navigate_page',
'close_page',
'wait_for',
])
interface CladoActionResponse {
action?: string
x?: number
y?: number
text?: string
key?: string
direction?: string
startX?: number
startY?: number
endX?: number
endY?: number
amount?: number
time?: number
inference_time_seconds?: number
raw_response?: string
}
interface Viewport {
width: number
height: number
}
interface CladoAction {
action: string
x?: number
y?: number
text?: string
key?: string
direction?: string
startX?: number
startY?: number
endX?: number
endY?: number
amount?: number
time?: number
}
type RawActionPayload = Partial<CladoAction>
interface ActionPoint {
x: number
y: number
}
const MAX_CONSECUTIVE_PARSE_FAILURES = 3
function asErrorMessage(error: unknown): string {
return error instanceof Error ? error.message : String(error)
}
function clampNormalized(value: number): number {
return Math.min(999, Math.max(0, Math.round(value)))
}
function isCladoProvider(provider: string): boolean {
return provider === CLADO_ACTION_PROVIDER
}
export class CladoActionExecutor {
private readonly mcpClient: McpClient
private readonly cladoClient: CladoActionClient
private readonly pageId: number
private callbacks: ExecutorCallbacks = {}
private stepsUsed = 0
private viewport: Viewport | null = null
private lastPoint: ActionPoint | null = null
private viewport: CladoViewport | null = null
private lastPoint: CladoActionPoint | null = null
private currentUrl = ''
constructor(
@@ -102,12 +51,16 @@ export class CladoActionExecutor {
readonly _tabId?: number,
initialPageId?: number,
) {
if (!isCladoProvider(config.provider)) {
if (!isCladoActionProvider(config.provider)) {
throw new Error(
`CladoActionExecutor requires provider="${CLADO_ACTION_PROVIDER}"`,
)
}
this.mcpClient = new McpClient(`${serverUrl}/mcp`)
this.cladoClient = new CladoActionClient({
baseUrl: config.baseUrl,
apiKey: config.apiKey,
})
this.pageId = initialPageId ?? 1
}
@@ -135,6 +88,8 @@ export class CladoActionExecutor {
const actionHistory: CladoAction[] = []
let predictionCalls = 0
const thinkingTrace: string[] = []
let consecutiveParseFailures = 0
let finalAnswer: string | undefined
let status: ExecutorResult['status'] = 'done'
let reason = 'Goal executed.'
@@ -155,7 +110,7 @@ export class CladoActionExecutor {
break
}
const historyForPrediction = this.formatHistory(actionHistory)
const historyForPrediction = formatCladoHistory(actionHistory)
const actionToolCallId = randomUUID()
const predictionInput = {
instruction,
@@ -177,7 +132,7 @@ export class CladoActionExecutor {
signal,
)
predictionCalls++
const thinking = this.extractThinking(prediction.raw_response)
const thinking = extractCladoThinking(prediction.raw_response)
if (thinking) {
const previous = thinkingTrace[thinkingTrace.length - 1]
if (previous !== thinking) {
@@ -207,8 +162,19 @@ export class CladoActionExecutor {
break
}
const predictedActions = this.parseActions(prediction)
const predictedActions = parseCladoActions(prediction)
if (predictedActions.length === 0) {
// Per Clado contract: HTTP 200 with action=null on parse failure.
// Count as an invalid step so the model can self-correct on the
// next call instead of dropping the trajectory.
consecutiveParseFailures++
const parseError =
prediction.parse_error ?? 'no parsable <answer> in raw_response'
actionHistory.push({
action: 'invalid',
text: `parse_error: ${parseError}`,
})
this.stepsUsed++
await this.callbacks.onStepFinish?.({
toolCalls: [
{
@@ -222,16 +188,23 @@ export class CladoActionExecutor {
toolCallId: actionToolCallId,
toolName: 'clado_action_predict',
output: {
prediction: this.summarizePrediction(prediction),
prediction: summarizeCladoPrediction(prediction),
parsedActions: [],
parseError,
consecutiveParseFailures,
},
},
],
})
status = 'blocked'
reason = 'Clado action response did not contain a valid action.'
break
if (consecutiveParseFailures >= MAX_CONSECUTIVE_PARSE_FAILURES) {
status = 'blocked'
reason = `Clado returned ${consecutiveParseFailures} consecutive unparseable responses.`
break
}
continue
}
consecutiveParseFailures = 0
let requestedStop = false
const executionNotes: string[] = []
@@ -257,7 +230,7 @@ export class CladoActionExecutor {
toolCallId: actionToolCallId,
toolName: 'clado_action_predict',
output: {
prediction: this.summarizePrediction(prediction),
prediction: summarizeCladoPrediction(prediction),
parsedActions: predictedActions,
executed: executionNotes,
},
@@ -272,7 +245,12 @@ export class CladoActionExecutor {
actionHistory.push(predictedAction)
if (predictedAction.action === 'end') {
reason = 'Model requested end() and marked task complete.'
if (predictedAction.final_answer) {
finalAnswer = predictedAction.final_answer
reason = `Model requested end() with final_answer: ${predictedAction.final_answer.slice(0, 240)}`
} else {
reason = 'Model requested end() and marked task complete.'
}
requestedStop = true
break
}
@@ -293,7 +271,7 @@ export class CladoActionExecutor {
toolCallId: actionToolCallId,
toolName: 'clado_action_predict',
output: {
prediction: this.summarizePrediction(prediction),
prediction: summarizeCladoPrediction(prediction),
parsedActions: predictedActions,
executed: executionNotes,
},
@@ -327,6 +305,7 @@ export class CladoActionExecutor {
actions: actionHistory,
url: this.currentUrl,
thinkingTrace,
finalAnswer,
})
return {
@@ -344,121 +323,12 @@ export class CladoActionExecutor {
actionHistory: CladoAction[],
signal?: AbortSignal,
): Promise<CladoActionResponse> {
if (!this.config.baseUrl) {
throw new Error('executor.baseUrl must be set for clado-action provider')
}
const requestController = new AbortController()
const onAbort = () => requestController.abort()
signal?.addEventListener('abort', onAbort, { once: true })
const timeoutHandle = setTimeout(() => {
requestController.abort()
}, CLADO_REQUEST_TIMEOUT_MS)
try {
const headers: Record<string, string> = {
'Content-Type': 'application/json',
}
if (this.config.apiKey) {
headers.Authorization = `Bearer ${this.config.apiKey}`
}
const response = await fetch(this.config.baseUrl, {
method: 'POST',
headers,
body: JSON.stringify({
instruction,
image_base64: imageBase64,
history: this.formatHistory(actionHistory),
}),
signal: requestController.signal,
})
if (!response.ok) {
const body = await response.text()
throw new Error(
`HTTP ${response.status} ${response.statusText}: ${body.slice(0, 400)}`,
)
}
return (await response.json()) as CladoActionResponse
} finally {
clearTimeout(timeoutHandle)
signal?.removeEventListener('abort', onAbort)
}
}
private parseActions(prediction: CladoActionResponse): CladoAction[] {
const actionFromField =
typeof prediction.action === 'string' ? prediction.action : null
const rawActions = this.parseActionsFromRawResponse(prediction.raw_response)
const primaryFromRaw = rawActions[0] ?? null
const mergedPrimary = {
...primaryFromRaw,
...prediction,
action: actionFromField ?? primaryFromRaw?.action,
}
const normalized: CladoAction[] = []
const primary = this.normalizeActionPayload(mergedPrimary)
if (primary) normalized.push(primary)
for (const candidate of rawActions.slice(1)) {
const parsed = this.normalizeActionPayload(candidate)
if (!parsed) continue
const prev = normalized[normalized.length - 1]
if (
!prev ||
this.getActionSignature(prev) !== this.getActionSignature(parsed)
) {
normalized.push(parsed)
}
}
return normalized
}
private normalizeActionPayload(
payload: RawActionPayload,
): CladoAction | null {
if (!payload.action || typeof payload.action !== 'string') {
return null
}
return {
action: payload.action,
x: typeof payload.x === 'number' ? payload.x : undefined,
y: typeof payload.y === 'number' ? payload.y : undefined,
text: typeof payload.text === 'string' ? payload.text : undefined,
key: typeof payload.key === 'string' ? payload.key : undefined,
direction:
typeof payload.direction === 'string' ? payload.direction : undefined,
startX: typeof payload.startX === 'number' ? payload.startX : undefined,
startY: typeof payload.startY === 'number' ? payload.startY : undefined,
endX: typeof payload.endX === 'number' ? payload.endX : undefined,
endY: typeof payload.endY === 'number' ? payload.endY : undefined,
amount: typeof payload.amount === 'number' ? payload.amount : undefined,
time: typeof payload.time === 'number' ? payload.time : undefined,
}
}
private parseActionsFromRawResponse(
rawResponse: string | undefined,
): RawActionPayload[] {
if (!rawResponse) return []
const matches = [
...rawResponse.matchAll(/<answer>\s*([\s\S]*?)\s*<\/answer>/gi),
]
const parsed: RawActionPayload[] = []
for (const match of matches) {
try {
parsed.push(JSON.parse(match[1]) as RawActionPayload)
} catch {
// ignore malformed answer blocks
}
}
return parsed
return this.cladoClient.requestActionPrediction({
instruction,
imageBase64,
actionHistory,
signal,
})
}
private async executeAction(
@@ -529,14 +399,14 @@ export class CladoActionExecutor {
}
case 'press_key': {
const key = this.normalizePressKey(action.key)
const key = normalizeCladoPressKey(action.key)
await this.runTool('press_key', { key }, signal)
return `Pressed key "${key}".`
}
case 'scroll': {
const direction = this.normalizeDirection(action.direction)
const amountPx = this.normalizeScrollAmount(action.amount)
const direction = normalizeCladoDirection(action.direction)
const amountPx = normalizeCladoScrollAmount(action.amount)
const ticks = Math.max(1, Math.round(amountPx / 120))
await this.runTool('scroll', { direction, amount: ticks }, signal)
@@ -578,7 +448,9 @@ export class CladoActionExecutor {
}
case 'end': {
return 'Model requested end().'
return action.final_answer
? `Model requested end() with final_answer: ${action.final_answer.slice(0, 240)}`
: 'Model requested end().'
}
default: {
@@ -588,9 +460,10 @@ export class CladoActionExecutor {
}
private async captureScreenshotBase64(signal?: AbortSignal): Promise<string> {
// Clado contract is PNG or JPEG; use PNG for lossless input.
const result = await this.runTool(
'take_screenshot',
{ format: 'webp', quality: 80 },
{ format: 'png' },
signal,
)
@@ -604,7 +477,7 @@ export class CladoActionExecutor {
return image.data
}
private async getViewport(signal?: AbortSignal): Promise<Viewport> {
private async getViewport(signal?: AbortSignal): Promise<CladoViewport> {
if (this.viewport) return this.viewport
try {
@@ -635,15 +508,9 @@ export class CladoActionExecutor {
normalizedX: number | undefined,
normalizedY: number | undefined,
signal?: AbortSignal,
): Promise<ActionPoint> {
): Promise<CladoActionPoint> {
const viewport = await this.getViewport(signal)
const nx = clampNormalized(normalizedX ?? 500)
const ny = clampNormalized(normalizedY ?? 500)
return {
x: Math.round((nx / 1000) * viewport.width),
y: Math.round((ny / 1000) * viewport.height),
}
return resolveCladoPoint(viewport, normalizedX, normalizedY)
}
private async getCurrentUrl(signal?: AbortSignal): Promise<string> {
@@ -670,7 +537,7 @@ export class CladoActionExecutor {
throw new Error('aborted')
}
const toolArgs = this.prepareToolArgs(toolName, args)
const toolArgs = prepareCladoToolArgs(toolName, args, this.pageId)
try {
const raw = await this.mcpClient.callTool(toolName, toolArgs)
@@ -689,211 +556,22 @@ export class CladoActionExecutor {
}
}
private prepareToolArgs(
toolName: string,
args: Record<string, unknown>,
): Record<string, unknown> {
const prepared: Record<string, unknown> = { ...args }
if (
toolName === 'evaluate_script' &&
typeof prepared.function === 'string' &&
prepared.expression === undefined
) {
prepared.expression = this.toEvaluateExpression(prepared.function)
delete prepared.function
}
if (
toolName === 'click_at' &&
typeof prepared.dblClick === 'boolean' &&
prepared.clickCount === undefined
) {
prepared.clickCount = prepared.dblClick ? 2 : 1
delete prepared.dblClick
}
// Use fixed page ID for all page-scoped tools (single-page operation)
if (PAGE_SCOPED_TOOLS.has(toolName) && typeof prepared.page !== 'number') {
prepared.page = this.pageId
}
return prepared
}
private toEvaluateExpression(rawFunction: unknown): string {
const source = String(rawFunction).trim()
if (source.startsWith('() =>') || source.startsWith('async () =>')) {
return `(${source})()`
}
if (source.startsWith('function')) {
return `(${source})()`
}
return source
}
private normalizePressKey(key: string | undefined): string {
const raw = (key ?? '').trim()
if (!raw) throw new Error('press_key action missing key field')
const map: Record<string, string> = {
'C-a': 'Control+A',
'C-c': 'Control+C',
'C-v': 'Control+V',
'C-x': 'Control+X',
'C-z': 'Control+Z',
'C-y': 'Control+Y',
'C-s': 'Control+S',
'C-t': 'Control+T',
'C-w': 'Control+W',
'C-h': 'Control+H',
'C-f': 'Control+F',
'C-+': 'Control++',
'C--': 'Control+-',
'C-tab': 'Control+Tab',
'C-S-tab': 'Control+Shift+Tab',
'C-S-n': 'Control+Shift+N',
'C-down': 'Control+ArrowDown',
'M-f4': 'Alt+F4',
}
return map[raw] ?? raw
}
private normalizeDirection(
direction: string | undefined,
): 'up' | 'down' | 'left' | 'right' {
if (
direction === 'up' ||
direction === 'down' ||
direction === 'left' ||
direction === 'right'
) {
return direction
}
return 'down'
}
private normalizeScrollAmount(amount: number | undefined): number {
if (typeof amount !== 'number') return 500
if (amount <= 0) return 100
const clamped = Math.min(amount, 1000)
return Math.max(100, Math.round((clamped / 1000) * 900))
}
private summarizePrediction(
prediction: CladoActionResponse,
): Record<string, unknown> {
const preview =
typeof prediction.raw_response === 'string' &&
prediction.raw_response.length > 0
? prediction.raw_response.slice(0, 240)
: undefined
return {
action: prediction.action,
x: prediction.x,
y: prediction.y,
text: prediction.text,
key: prediction.key,
direction: prediction.direction,
startX: prediction.startX,
startY: prediction.startY,
endX: prediction.endX,
endY: prediction.endY,
amount: prediction.amount,
time: prediction.time,
inference_time_seconds: prediction.inference_time_seconds,
raw_response_preview: preview,
}
}
private extractThinking(rawResponse: string | undefined): string | undefined {
if (!rawResponse) return undefined
const matches = [
...rawResponse.matchAll(/<thinking>\s*([\s\S]*?)\s*<\/thinking>/gi),
]
if (matches.length === 0) return undefined
const merged = matches
.map((match) => match[1]?.replace(/\s+/g, ' ').trim() ?? '')
.filter((value) => value.length > 0)
.join(' ')
if (!merged) return undefined
return merged
}
private getActionSignature(action: CladoAction): string {
switch (action.action) {
case 'click':
case 'double_click':
case 'right_click':
case 'hover':
return `${action.action}:${action.x ?? 'x'}:${action.y ?? 'y'}`
case 'type':
return `${action.action}:${(action.text ?? '').slice(0, 16)}`
case 'press_key':
return `${action.action}:${action.key ?? 'key'}`
case 'scroll':
return `${action.action}:${action.direction ?? 'down'}:${action.amount ?? 500}`
case 'drag':
return `${action.action}:${action.startX}:${action.startY}:${action.endX}:${action.endY}`
case 'wait':
return `${action.action}:${action.time ?? 1}`
case 'end':
return 'end()'
default:
return action.action
}
}
private formatHistory(actions: CladoAction[]): string {
if (actions.length === 0) return 'None'
const parts = actions.map((action) => {
switch (action.action) {
case 'click':
case 'double_click':
case 'right_click':
case 'hover':
return `${action.action}(${Math.round(action.x ?? 500)}, ${Math.round(action.y ?? 500)})`
case 'type': {
const text = (action.text ?? '').replace(/'/g, "\\'")
return `type('${text}')`
}
case 'press_key':
return `press_key('${action.key ?? 'Enter'}')`
case 'scroll':
return `scroll(${action.direction ?? 'down'})`
case 'drag':
return `drag(${Math.round(action.startX ?? 500)},${Math.round(action.startY ?? 500)} -> ${Math.round(action.endX ?? 500)},${Math.round(action.endY ?? 500)})`
case 'wait':
return `wait(${Math.round(action.time ?? 1)}s)`
case 'end':
return 'end()'
default:
return action.action
}
})
return parts.join(' -> ')
}
private buildObservation(params: {
status: ExecutorResult['status']
reason: string
actions: CladoAction[]
url: string
thinkingTrace: string[]
finalAnswer?: string
}): string {
const { status, reason, actions, url, thinkingTrace } = params
const { status, reason, actions, url, thinkingTrace, finalAnswer } = params
const actionSummary =
actions.length === 0
? 'No actions were executed.'
: actions
.slice(-5)
.map(
(action, idx) => `${idx + 1}. ${this.getActionSignature(action)}`,
(action, idx) => `${idx + 1}. ${getCladoActionSignature(action)}`,
)
.join('\n')
const thinkingSummary =
@@ -907,6 +585,7 @@ export class CladoActionExecutor {
`Status: ${status}`,
`Reason: ${reason}`,
`URL: ${url || 'unknown'}`,
finalAnswer ? `Final answer: ${finalAnswer}` : '',
'',
'Recent actions:',
actionSummary,

View File

@@ -24,8 +24,9 @@ import {
resolveProviderConfig,
} from '../../utils/resolve-provider-config'
import { withEvalTimeout } from '../../utils/with-eval-timeout'
import { createExecutorBackend } from '../orchestrated/backends/create-executor-backend'
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
import { Executor, type ExecutorCallbacks } from './executor'
import type { ExecutorCallbacks } from './executor'
import { OrchestratorAgent } from './orchestrator-agent'
import type { ExecutorFactory, ExecutorResult } from './types'
@@ -235,12 +236,13 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
await capture.messageLogger.logStreamEvent(delegateInputEvent)
capture.emitEvent(task.query_id, delegateInputEvent)
const executor = new Executor(
executorConfig,
const executor = createExecutorBackend({
backendKind: isCladoAction ? 'clado' : 'tool-loop',
configTemplate: executorConfig,
browser,
config.browseros.server_url,
{ isCladoAction, callbacks },
)
serverUrl: config.browseros.server_url,
callbacks,
})
let result: ExecutorResult
try {
result = await executor.execute(instruction, signal)

View File

@@ -57,6 +57,20 @@ export class TrajectorySaver {
)
}
async saveAttempt(attempt: Record<string, unknown>): Promise<void> {
await writeFile(
join(this.outputDir, 'attempt.json'),
JSON.stringify(attempt, null, 2),
)
}
async saveGrades(graderResults: Record<string, GraderResult>): Promise<void> {
await writeFile(
join(this.outputDir, 'grades.json'),
JSON.stringify(graderResults, null, 2),
)
}
async loadMetadata(): Promise<TaskMetadata> {
const content = await readFile(
join(this.outputDir, 'metadata.json'),
@@ -70,6 +84,7 @@ export class TrajectorySaver {
): Promise<void> {
const metadata = await this.loadMetadata()
metadata.grader_results = graderResults
await this.saveGrades(graderResults)
await this.saveMetadata(metadata)
}

View File

@@ -0,0 +1,170 @@
import { parseArgs } from 'node:util'
export type PublishTarget = 'r2'
export interface LegacyCliArgs {
command: 'legacy'
configPath?: string
help?: boolean
}
export interface SuiteCliArgs {
command: 'suite'
configPath?: string
suitePath?: string
variantId?: string
provider?: string
model?: string
apiKey?: string
baseUrl?: string
publishTarget?: PublishTarget
}
export interface RunCliArgs
extends Omit<SuiteCliArgs, 'command' | 'publishTarget'> {
command: 'run'
}
export interface GradeCliArgs {
command: 'grade'
runDir: string
}
export interface PublishCliArgs {
command: 'publish'
runDir: string
target: PublishTarget
}
export type EvalCliArgs =
| LegacyCliArgs
| SuiteCliArgs
| RunCliArgs
| GradeCliArgs
| PublishCliArgs
const COMMANDS = new Set(['suite', 'run', 'grade', 'publish'])
function stringValue(value: string | boolean | undefined): string | undefined {
return typeof value === 'string' && value.length > 0 ? value : undefined
}
function publishTarget(value: string | undefined): PublishTarget | undefined {
if (value === undefined) return undefined
if (value === 'r2') return 'r2'
throw new Error(`Unsupported publish target: ${value}`)
}
function requireOne(
command: string,
configPath: string | undefined,
suitePath: string | undefined,
): void {
if (!configPath && !suitePath) {
throw new Error(`${command} requires --config or --suite`)
}
if (configPath && suitePath) {
throw new Error(`${command} accepts either --config or --suite, not both`)
}
}
function parseSuiteLikeArgs(
command: 'suite' | 'run',
argv: string[],
): SuiteCliArgs | RunCliArgs {
const { values } = parseArgs({
args: argv,
options: {
config: { type: 'string' },
suite: { type: 'string' },
variant: { type: 'string' },
provider: { type: 'string' },
model: { type: 'string' },
'api-key': { type: 'string' },
'base-url': { type: 'string' },
publish: { type: 'string' },
},
})
const configPath = stringValue(values.config)
const suitePath = stringValue(values.suite)
requireOne(command, configPath, suitePath)
const parsed: SuiteCliArgs | RunCliArgs =
command === 'suite' ? { command: 'suite' } : { command: 'run' }
if (configPath) parsed.configPath = configPath
if (suitePath) parsed.suitePath = suitePath
const variantId = stringValue(values.variant)
if (variantId) parsed.variantId = variantId
const provider = stringValue(values.provider)
if (provider) parsed.provider = provider
const model = stringValue(values.model)
if (model) parsed.model = model
const apiKey = stringValue(values['api-key'])
if (apiKey) parsed.apiKey = apiKey
const baseUrl = stringValue(values['base-url'])
if (baseUrl) parsed.baseUrl = baseUrl
if (command === 'suite') {
const target = publishTarget(stringValue(values.publish))
if (target) {
const suiteArgs = parsed as SuiteCliArgs
suiteArgs.publishTarget = target
}
}
return parsed
}
function parseLegacyArgs(argv: string[]): LegacyCliArgs {
const { values } = parseArgs({
args: argv,
options: {
config: { type: 'string', short: 'c' },
help: { type: 'boolean', short: 'h' },
},
})
const parsed: LegacyCliArgs = { command: 'legacy' }
const configPath = stringValue(values.config)
if (configPath) parsed.configPath = configPath
if (values.help) parsed.help = true
return parsed
}
/** Parses the eval CLI command without running browser or publishing side effects. */
export function parseEvalCliArgs(argv: string[]): EvalCliArgs {
const [command, ...rest] = argv
if (!COMMANDS.has(command ?? '')) {
return parseLegacyArgs(argv)
}
switch (command) {
case 'suite':
return parseSuiteLikeArgs('suite', rest)
case 'run':
return parseSuiteLikeArgs('run', rest)
case 'grade': {
const { values } = parseArgs({
args: rest,
options: { run: { type: 'string' } },
})
const runDir = stringValue(values.run)
if (!runDir) throw new Error('grade requires --run')
return { command: 'grade', runDir }
}
case 'publish': {
const { values } = parseArgs({
args: rest,
options: { run: { type: 'string' }, target: { type: 'string' } },
})
const runDir = stringValue(values.run)
if (!runDir) throw new Error('publish requires --run')
const target = publishTarget(stringValue(values.target))
if (!target) throw new Error('publish requires --target')
return { command: 'publish', runDir, target }
}
default:
return parseLegacyArgs(argv)
}
}

View File

@@ -0,0 +1,84 @@
import { readdir, readFile, stat } from 'node:fs/promises'
import { join } from 'node:path'
import { TrajectorySaver } from '../../capture/trajectory-saver'
import { runGraders } from '../../grading/grader-runner'
import { type Message, MessageSchema, TaskMetadataSchema } from '../../types'
import type { GradeCliArgs } from '../args'
async function loadMessages(taskDir: string): Promise<Message[]> {
const content = await readFile(
join(taskDir, 'messages.jsonl'),
'utf-8',
).catch(() => '')
return content
.split('\n')
.filter((line) => line.trim().length > 0)
.map((line) => MessageSchema.parse(JSON.parse(line)))
}
async function findTaskDirs(runDir: string): Promise<string[]> {
const entries = await readdir(runDir, { withFileTypes: true })
const taskDirs: string[] = []
for (const entry of entries) {
if (!entry.isDirectory()) continue
const taskDir = join(runDir, entry.name)
const metadata = await stat(join(taskDir, 'metadata.json')).catch(
() => null,
)
if (metadata?.isFile()) taskDirs.push(taskDir)
}
return taskDirs
}
/** Re-runs graders for task artifacts that already contain metadata and messages. */
export async function runGradeCommand(args: GradeCliArgs): Promise<void> {
const runStat = await stat(args.runDir).catch(() => null)
if (!runStat?.isDirectory()) {
throw new Error(`Not a run directory: ${args.runDir}`)
}
const taskDirs = await findTaskDirs(args.runDir)
if (taskDirs.length === 0) {
throw new Error(`No task metadata found under ${args.runDir}`)
}
let graded = 0
for (const taskDir of taskDirs) {
const metadata = TaskMetadataSchema.parse(
JSON.parse(await readFile(join(taskDir, 'metadata.json'), 'utf-8')),
)
const graderNames = Object.keys(metadata.grader_results ?? {})
if (graderNames.length === 0) {
console.warn(`Skipping ${metadata.query_id}: no existing grader names`)
continue
}
const messages = await loadMessages(taskDir)
const graderResults = await runGraders(graderNames, {
task: {
query_id: metadata.query_id,
query: metadata.query,
dataset: metadata.dataset,
},
messages,
screenshotCount: metadata.screenshot_count ?? metadata.total_steps,
finalAnswer: metadata.final_answer,
taskArtifactDir: taskDir,
outputDir: taskDir,
mcpUrl: `${process.env.BROWSEROS_SERVER_URL || 'http://127.0.0.1:9110'}/mcp`,
})
await new TrajectorySaver(
args.runDir,
metadata.query_id,
).updateGraderResults(graderResults)
graded++
}
if (graded === 0) {
throw new Error(
`No tasks with existing grader names found under ${args.runDir}`,
)
}
console.log(`Re-graded ${graded} task(s) in ${args.runDir}`)
}

View File

@@ -0,0 +1,25 @@
import { publishPathToR2 } from '../../publishing/r2-publisher'
import type { PublishCliArgs, PublishTarget } from '../args'
export interface PublishRunOptions {
runDir: string
target: PublishTarget
}
/** Publishes run artifacts through the R2 viewer upload path. */
export async function publishRun(options: PublishRunOptions): Promise<void> {
if (options.target !== 'r2') {
throw new Error(`Unsupported publish target: ${options.target}`)
}
const result = await publishPathToR2(options.runDir)
for (const run of result.uploadedRuns) {
console.log(run.viewerUrl)
}
for (const runId of result.skippedRuns) {
console.log(`${runId}: already uploaded, skipping`)
}
}
export async function runPublishCommand(args: PublishCliArgs): Promise<void> {
await publishRun({ runDir: args.runDir, target: args.target })
}

View File

@@ -0,0 +1,21 @@
import type { RunCliArgs } from '../args'
import { runSuiteCommand, type SuiteCommandDeps } from './suite'
/** Executes tasks from a config or suite without publishing artifacts. */
export async function runRunCommand(
args: RunCliArgs,
deps: SuiteCommandDeps = {},
): Promise<void> {
await runSuiteCommand(
{
configPath: args.configPath,
suitePath: args.suitePath,
variantId: args.variantId,
provider: args.provider,
model: args.model,
apiKey: args.apiKey,
baseUrl: args.baseUrl,
},
deps,
)
}

View File

@@ -0,0 +1,187 @@
import type { RunEvalOptions, RunEvalResult } from '../../runner/types'
import { runEval as defaultRunEval } from '../../runs/eval-runner'
import {
type AdaptedEvalConfig,
adaptEvalConfigFile,
} from '../../suites/config-adapter'
import { loadSuite } from '../../suites/load-suite'
import { type EvalVariant, resolveVariant } from '../../suites/resolve-variant'
import type { EvalSuite } from '../../suites/schema'
import { type EvalConfig, EvalConfigSchema } from '../../types'
import type { PublishTarget } from '../args'
type Env = Record<string, string | undefined>
export interface SuiteCommandOptions {
configPath?: string
suitePath?: string
variantId?: string
provider?: string
model?: string
apiKey?: string
baseUrl?: string
publishTarget?: PublishTarget
env?: Env
}
export type ResolvedSuiteCommand =
| (AdaptedEvalConfig & { kind: 'config'; datasetPath?: undefined })
| {
kind: 'suite'
suitePath: string
suite: EvalSuite
variant: EvalVariant
datasetPath: string
evalConfig: EvalConfig
}
export interface SuiteCommandDeps {
runEval?: (options: RunEvalOptions) => Promise<RunEvalResult | undefined>
publishRun?: (options: {
runDir: string
target: PublishTarget
}) => Promise<void>
}
function ensureRunnableSuite(suite: EvalSuite): void {
if (!suite.browseros) {
throw new Error('suite browseros config is required to run suite commands')
}
}
function suiteToEvalConfig(
suite: EvalSuite,
datasetPath: string,
variant: EvalVariant,
env: Env,
): EvalConfig {
ensureRunnableSuite(suite)
const base = {
dataset: datasetPath,
num_workers: suite.workers,
restart_server_per_task: suite.restartBrowserPerTask,
browseros: suite.browseros,
graders: suite.graders,
timeout_ms: suite.timeoutMs,
captcha: suite.captcha,
}
if (suite.agent.type === 'single' || suite.agent.type === 'tool-loop') {
// The legacy runner names the BrowserOS tool-loop agent "single".
return EvalConfigSchema.parse({
...base,
agent: {
type: 'single',
provider: variant.agent.provider,
model: variant.agent.model,
apiKey: variant.agent.apiKey,
baseUrl: variant.agent.baseUrl,
supportsImages: variant.agent.supportsImages,
},
})
}
const executorBackend = suite.agent.executorBackend ?? 'tool-loop'
const executor =
executorBackend === 'clado'
? {
provider: 'clado-action' as const,
model:
env.EVAL_EXECUTOR_MODEL ?? env.CLADO_ACTION_MODEL ?? 'clado-action',
apiKey: env.EVAL_EXECUTOR_API_KEY ?? env.CLADO_ACTION_API_KEY ?? '',
baseUrl:
env.EVAL_EXECUTOR_BASE_URL ??
env.CLADO_ACTION_BASE_URL ??
env.CLADO_ACTION_URL,
}
: {
provider: variant.agent.provider,
model: variant.agent.model,
apiKey: variant.agent.apiKey,
baseUrl: variant.agent.baseUrl,
}
return EvalConfigSchema.parse({
...base,
agent: {
type: 'orchestrator-executor',
orchestrator: {
provider: variant.agent.provider,
model: variant.agent.model,
apiKey: variant.agent.apiKey,
baseUrl: variant.agent.baseUrl,
},
executor,
},
})
}
/** Resolves config-backed or suite-backed CLI input into the run shape used by the runner. */
export async function resolveSuiteCommand(
options: SuiteCommandOptions,
): Promise<ResolvedSuiteCommand> {
const env = options.env ?? process.env
if (options.configPath) {
return {
kind: 'config',
...(await adaptEvalConfigFile(options.configPath, { env })),
}
}
if (!options.suitePath) {
throw new Error('suite requires --config or --suite')
}
const loaded = await loadSuite(options.suitePath)
const variant = resolveVariant({
variantId: options.variantId,
provider: options.provider,
model: options.model,
apiKey: options.apiKey,
baseUrl: options.baseUrl,
env,
})
return {
kind: 'suite',
suitePath: loaded.suitePath,
suite: loaded.suite,
variant,
datasetPath: loaded.datasetPath,
evalConfig: suiteToEvalConfig(
loaded.suite,
loaded.datasetPath,
variant,
env,
),
}
}
/** Runs the full suite loop: resolve input, execute tasks, then optionally publish the run. */
export async function runSuiteCommand(
options: SuiteCommandOptions,
deps: SuiteCommandDeps = {},
): Promise<void> {
const runEval = deps.runEval ?? defaultRunEval
const resolved = await resolveSuiteCommand(options)
const runOptions: RunEvalOptions =
resolved.kind === 'config'
? { configPath: resolved.configPath }
: {
configPath: resolved.suitePath,
dataPath: resolved.datasetPath,
config: resolved.evalConfig,
}
const result = await runEval(runOptions)
if (!options.publishTarget) return
const outputDir = result?.outputDir
if (!outputDir) {
throw new Error('publish requested but runner did not return an outputDir')
}
if (!deps.publishRun) {
throw new Error('publish requested before the publisher is configured')
}
await deps.publishRun({ runDir: outputDir, target: options.publishTarget })
}

View File

@@ -0,0 +1,70 @@
import { startDashboard } from '../dashboard/server'
import { runEval } from '../runs/eval-runner'
import { type EvalCliArgs, parseEvalCliArgs } from './args'
import { runGradeCommand } from './commands/grade'
import { publishRun, runPublishCommand } from './commands/publish'
import { runRunCommand } from './commands/run'
import { runSuiteCommand } from './commands/suite'
export function usage(): string {
return `
BrowserOS Eval
Usage:
bun run eval suite --config <config.json> [--publish r2]
bun run eval suite --suite <suite.json> --variant <id> [--publish r2]
bun run eval run --config <config.json>
bun run eval run --suite <suite.json> --variant <id>
bun run eval grade --run <results/run-dir>
bun run eval publish --run <results/run-dir> --target r2
bun run eval -c <config.json>
`
}
async function runLegacyCommand(args: EvalCliArgs): Promise<void> {
if (args.command !== 'legacy') return
if (args.help) {
console.log(usage())
return
}
if (args.configPath) {
await runEval({ configPath: args.configPath })
return
}
startDashboard({
tasks: [],
configName: '',
agentType: '',
outputDir: '',
configMode: true,
})
console.log(
'Dashboard running at http://localhost:9900 — configure and run from the UI',
)
await new Promise(() => {})
}
/** Dispatches the eval CLI while preserving the old config/dashboard entry points. */
export async function runCli(
argv: string[] = Bun.argv.slice(2),
): Promise<void> {
const args = parseEvalCliArgs(argv)
switch (args.command) {
case 'legacy':
await runLegacyCommand(args)
break
case 'suite':
await runSuiteCommand(args, { publishRun })
break
case 'run':
await runRunCommand(args)
break
case 'grade':
await runGradeCommand(args)
break
case 'publish':
await runPublishCommand(args)
break
}
}

View File

@@ -5,4 +5,5 @@
export const DEFAULT_TIMEOUT_MS = 30 * 60 * 1000 // 30 minutes
export const SCREENSHOT_TIMEOUT_MS = 65_000 // 65s — ensures we get extension's error (60s)
export const MAX_ACTIONS_PER_DELEGATION = 15
export const CLADO_REQUEST_TIMEOUT_MS = 120_000
// Cold start can take ~5 minutes per Clado; 6 minutes leaves headroom.
export const CLADO_REQUEST_TIMEOUT_MS = 360_000

View File

@@ -1,5 +1,5 @@
import { mkdir, readdir, readFile, stat } from 'node:fs/promises'
import { join, resolve } from 'node:path'
import { dirname, join, resolve, sep } from 'node:path'
import { Hono } from 'hono'
import { streamSSE } from 'hono/streaming'
import { ParallelExecutor } from '../runner/parallel-executor'
@@ -128,6 +128,35 @@ let dashboardConfigMode = false
const configsDir = join(import.meta.dir, '..', '..', 'configs')
const projectRoot = resolve(import.meta.dir, '..', '..', '..', '..')
async function listConfigFiles(dir: string, prefix = ''): Promise<string[]> {
const entries = await readdir(join(dir, prefix), { withFileTypes: true })
const files: string[] = []
for (const entry of entries) {
const relativePath = prefix ? join(prefix, entry.name) : entry.name
if (entry.isDirectory()) {
files.push(...(await listConfigFiles(dir, relativePath)))
} else if (entry.isFile() && entry.name.endsWith('.json')) {
files.push(relativePath.split(sep).join('/'))
}
}
return files.sort()
}
function resolveConfigPath(name: string): string | null {
if (!name.endsWith('.json')) return null
if (name.split('/').some((part) => !part || part === '.' || part === '..')) {
return null
}
const resolvedPath = resolve(configsDir, name)
const resolvedConfigsDir = resolve(configsDir)
const configRootPrefix = resolvedConfigsDir.endsWith(sep)
? resolvedConfigsDir
: `${resolvedConfigsDir}${sep}`
if (!resolvedPath.startsWith(configRootPrefix)) return null
return resolvedPath
}
// ============================================================================
// Hono App
// ============================================================================
@@ -339,21 +368,21 @@ app.get('/api/mode', (c) => {
// List saved config files
app.get('/api/configs', async (c) => {
try {
const files = await readdir(configsDir)
return c.json(files.filter((f) => f.endsWith('.json')))
return c.json(await listConfigFiles(configsDir))
} catch {
return c.json([])
}
})
// Read a specific config file
app.get('/api/config/:name', async (c) => {
const name = c.req.param('name')
if (name.includes('/') || name.includes('..')) {
app.get('/api/config/*', async (c) => {
const name = decodeURIComponent(c.req.path.slice('/api/config/'.length))
const configPath = resolveConfigPath(name)
if (!configPath) {
return c.json({ error: 'Invalid config name' }, 400)
}
try {
const content = await readFile(join(configsDir, name), 'utf-8')
const content = await readFile(configPath, 'utf-8')
return c.json(JSON.parse(content))
} catch {
return c.notFound()
@@ -382,8 +411,17 @@ app.post('/api/run', async (c) => {
const config = parseResult.data
// Resolve relative paths from configs/ dir (dataset dropdown values are relative to it)
const baseDir = configsDir
let baseDir = configsDir
if (body.configName) {
const configPath = resolveConfigPath(body.configName)
if (!configPath) {
return c.json({ error: 'Invalid config name' }, 400)
}
baseDir = dirname(configPath)
}
// Resolve relative paths from the loaded config location. Unsaved dashboard
// configs keep using apps/eval/configs as their base for dropdown values.
const datasetPath = resolve(
config.dataset.startsWith('/')
? config.dataset

View File

@@ -1,5 +1,12 @@
import { spawn } from 'node:child_process'
import { join } from 'node:path'
import {
writeGraderJsonArtifact,
writeGraderTextArtifact,
} from '../../grading/artifacts'
import {
type PythonEvaluatorResult,
runPythonJsonEvaluator,
} from '../../grading/python-evaluator'
import type { GraderResult } from '../../types'
import { callMcpTool } from '../../utils/mcp-client'
import type { Grader, GraderInput } from '../types'
@@ -7,12 +14,23 @@ import type { Grader, GraderInput } from '../types'
const EVAL_SCRIPT = join(
import.meta.dirname,
'..',
'..',
'..',
'scripts',
'python',
'agisdk-evaluate.py',
)
interface AgisdkEvaluatorInput {
task_id: string
env_state: Record<string, unknown>
model_response: string
}
interface AgisdkEvaluatorOutput {
reward: number
pass: boolean
message: string
per_criterion: unknown[]
}
export class AgisdkStateDiffGrader implements Grader {
name = 'agisdk_state_diff'
@@ -36,6 +54,16 @@ export class AgisdkStateDiffGrader implements Grader {
let envState: Record<string, unknown>
try {
envState = await this.fetchFinishState(origin, mcpEndpoint)
await writeGraderJsonArtifact(
input,
this.name,
'finish-state.json',
envState,
)
await writeGraderJsonArtifact(input, this.name, 'context.json', {
origin,
agisdk_task_id: taskId,
})
} catch (error) {
return {
score: 0,
@@ -46,10 +74,30 @@ export class AgisdkStateDiffGrader implements Grader {
}
try {
const result = await this.runPythonEvaluator(
taskId,
envState,
input.finalAnswer || '',
const evaluatorInput: AgisdkEvaluatorInput = {
task_id: taskId,
env_state: envState,
model_response: input.finalAnswer || '',
}
await writeGraderJsonArtifact(
input,
this.name,
'evaluator-input.json',
evaluatorInput,
)
const evaluation = await this.runPythonEvaluator(evaluatorInput)
const result = evaluation.output
await writeGraderJsonArtifact(
input,
this.name,
'evaluator-output.json',
result,
)
await writeGraderTextArtifact(
input,
this.name,
'stderr.txt',
evaluation.stderr,
)
return {
score: result.reward,
@@ -144,59 +192,12 @@ export class AgisdkStateDiffGrader implements Grader {
}
private runPythonEvaluator(
taskId: string,
envState: Record<string, unknown>,
modelResponse: string,
): Promise<{
reward: number
pass: boolean
message: string
per_criterion: unknown[]
}> {
return new Promise((resolve, reject) => {
const proc = spawn('python3', [EVAL_SCRIPT], {
stdio: ['pipe', 'pipe', 'pipe'],
})
const inputData = JSON.stringify({
task_id: taskId,
env_state: envState,
model_response: modelResponse,
})
let stdout = ''
let stderr = ''
proc.stdout.on('data', (data: Buffer) => {
stdout += data.toString()
})
proc.stderr.on('data', (data: Buffer) => {
stderr += data.toString()
})
proc.on('close', (code) => {
if (code !== 0) {
reject(
new Error(`Python evaluator exited with code ${code}: ${stderr}`),
)
return
}
try {
const result = JSON.parse(stdout.trim())
resolve(result)
} catch {
reject(new Error(`Failed to parse evaluator output: ${stdout}`))
}
})
proc.on('error', (err) => {
reject(new Error(`Failed to spawn Python evaluator: ${err.message}`))
})
proc.stdin.write(inputData)
proc.stdin.end()
evalInput: AgisdkEvaluatorInput,
): Promise<PythonEvaluatorResult<AgisdkEvaluatorOutput>> {
return runPythonJsonEvaluator<AgisdkEvaluatorOutput>({
scriptPath: EVAL_SCRIPT,
input: evalInput,
timeoutMs: 300_000,
})
}
}

View File

@@ -1,4 +1,12 @@
import { join, resolve } from 'node:path'
import {
writeGraderJsonArtifact,
writeGraderTextArtifact,
} from '../../grading/artifacts'
import {
type PythonEvaluatorResult,
runPythonJsonEvaluator,
} from '../../grading/python-evaluator'
import type { GraderResult } from '../../types'
import type { Grader, GraderInput } from '../types'
@@ -14,10 +22,7 @@ interface InfinityEvalOutput {
message: string
}
const EVAL_SCRIPT = resolve(
import.meta.dir,
'../../../scripts/infinity-evaluate.py',
)
const EVAL_SCRIPT = resolve(import.meta.dir, '../python/infinity-evaluate.py')
export class InfinityStateGrader implements Grader {
name = 'infinity_state'
@@ -66,7 +71,32 @@ export class InfinityStateGrader implements Grader {
}
try {
const result = await this.runPythonEvaluator(evalInput)
await writeGraderJsonArtifact(input, this.name, 'verifier.json', {
appName: parsed.appName,
taskId: parsed.taskId,
verifierPath,
appServerUrl,
})
await writeGraderJsonArtifact(
input,
this.name,
'evaluator-input.json',
evalInput,
)
const evaluation = await this.runPythonEvaluator(evalInput)
const result = evaluation.output
await writeGraderJsonArtifact(
input,
this.name,
'evaluator-output.json',
result,
)
await writeGraderTextArtifact(
input,
this.name,
'stderr.txt',
evaluation.stderr,
)
return {
score: result.pass ? 1 : 0,
pass: result.pass,
@@ -108,27 +138,11 @@ export class InfinityStateGrader implements Grader {
private async runPythonEvaluator(
evalInput: InfinityEvalInput,
): Promise<InfinityEvalOutput> {
const proc = Bun.spawn(['python3', EVAL_SCRIPT], {
stdin: 'pipe',
stdout: 'pipe',
stderr: 'pipe',
): Promise<PythonEvaluatorResult<InfinityEvalOutput>> {
return runPythonJsonEvaluator<InfinityEvalOutput>({
scriptPath: EVAL_SCRIPT,
input: evalInput,
timeoutMs: 300_000,
})
const inputJson = JSON.stringify(evalInput)
proc.stdin.write(inputJson)
proc.stdin.end()
const stdout = await new Response(proc.stdout).text()
const stderr = await new Response(proc.stderr).text()
const exitCode = await proc.exited
if (exitCode !== 0) {
throw new Error(
`Python evaluator exited with code ${exitCode}: ${stderr || stdout}`,
)
}
return JSON.parse(stdout.trim()) as InfinityEvalOutput
}
}

View File

@@ -1,6 +1,7 @@
import { readFile } from 'node:fs/promises'
import { join } from 'node:path'
import { query } from '@anthropic-ai/claude-agent-sdk'
import { writeGraderJsonArtifact } from '../../grading/artifacts'
import type { GraderResult } from '../../types'
import type { Grader, GraderInput } from '../types'
import {
@@ -63,6 +64,7 @@ export class PerformanceGrader implements Grader {
input.screenshotCount,
terminationReason,
)
await writeGraderJsonArtifact(input, this.name, 'metrics.json', metrics)
const systemPrompt = PERFORMANCE_SYSTEM_PROMPT.replace(
/\{screenshot_count\}/g,
@@ -82,6 +84,14 @@ export class PerformanceGrader implements Grader {
userPrompt,
input.outputDir,
)
if (response) {
await writeGraderJsonArtifact(
input,
this.name,
'agent-output.json',
response,
)
}
if (!response) {
return {
@@ -140,6 +150,7 @@ export class PerformanceGrader implements Grader {
`Perf grader: LLM returned ${returnedAxes.size}/${expectedAxes.size} axes, missing: ${missingAxes.join(', ')}`,
)
}
await writeGraderJsonArtifact(input, this.name, 'axes.json', axisResults)
return {
score: compositeScore / 100,

View File

@@ -1,51 +1,2 @@
import type { GraderResult } from '../types'
import { AgisdkStateDiffGrader } from './benchmark/agisdk-state-diff'
import { InfinityStateGrader } from './benchmark/infinity-state'
import { PerformanceGrader } from './performance/performance-grader'
import type { Grader, GraderInput } from './types'
export const PASS_FAIL_GRADER_ORDER = [
'agisdk_state_diff',
'infinity_state',
'performance_grader',
] as const
export function createGrader(name: string): Grader | null {
switch (name) {
case 'agisdk_state_diff':
return new AgisdkStateDiffGrader()
case 'infinity_state':
return new InfinityStateGrader()
case 'performance_grader':
return new PerformanceGrader()
default:
console.warn(`Unknown grader: ${name}`)
return null
}
}
export async function runGraders(
graderNames: string[],
input: GraderInput,
): Promise<Record<string, GraderResult>> {
const results: Record<string, GraderResult> = {}
for (const name of graderNames) {
const grader = createGrader(name)
if (!grader) continue
try {
console.log(` Running grader: ${name}`)
results[name] = await grader.grade(input)
} catch (error) {
results[name] = {
score: 0,
pass: false,
reasoning: `Error running grader: ${error}`,
}
}
}
return results
}
export { AgisdkStateDiffGrader, InfinityStateGrader, PerformanceGrader }
export * from '../grading/grader-registry'
export { runConfiguredGraders, runGraders } from '../grading/grader-runner'


@@ -1,21 +1 @@
import type { GraderResult, Message } from '../types'
export interface GraderInput {
task: {
query_id: string
query: string
dataset: string
}
messages: Message[]
screenshotCount: number
finalAnswer: string | null
expectedAnswer?: string | null
outputDir: string
mcpUrl?: string
infinityAppUrl?: string
}
export interface Grader {
name: string
grade(input: GraderInput): Promise<GraderResult>
}
export type { Grader, GraderInput } from '../grading/types'


@@ -0,0 +1,34 @@
import { mkdir, writeFile } from 'node:fs/promises'
import { join } from 'node:path'
import type { GraderInput } from './types'
function artifactDir(input: GraderInput, graderName: string): string {
return join(
input.taskArtifactDir || input.outputDir,
'grader-artifacts',
graderName,
)
}
/** Writes a JSON artifact for a grader under the task artifact directory. */
export async function writeGraderJsonArtifact(
input: GraderInput,
graderName: string,
filename: string,
value: unknown,
): Promise<void> {
const dir = artifactDir(input, graderName)
await mkdir(dir, { recursive: true })
await writeFile(join(dir, filename), JSON.stringify(value, null, 2))
}
export async function writeGraderTextArtifact(
input: GraderInput,
graderName: string,
filename: string,
value: string,
): Promise<void> {
const dir = artifactDir(input, graderName)
await mkdir(dir, { recursive: true })
await writeFile(join(dir, filename), value)
}
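The artifact helpers above key every path off `taskArtifactDir` with an `outputDir` fallback, nesting files under `grader-artifacts/<grader>/`. A standalone sketch of that path convention, with a throwaway temp directory and an invented sample input:

```typescript
import { mkdirSync, mkdtempSync, readFileSync, writeFileSync } from 'node:fs'
import { tmpdir } from 'node:os'
import { join } from 'node:path'

interface ArtifactInput {
  taskArtifactDir?: string
  outputDir: string
}

// Mirrors artifactDir(): prefer the per-task dir, fall back to outputDir.
function artifactDir(input: ArtifactInput, graderName: string): string {
  return join(
    input.taskArtifactDir || input.outputDir,
    'grader-artifacts',
    graderName,
  )
}

function writeJsonArtifact(
  input: ArtifactInput,
  graderName: string,
  filename: string,
  value: unknown,
): string {
  const dir = artifactDir(input, graderName)
  mkdirSync(dir, { recursive: true })
  const path = join(dir, filename)
  writeFileSync(path, JSON.stringify(value, null, 2))
  return path
}

const root = mkdtempSync(join(tmpdir(), 'grader-'))
const written = writeJsonArtifact(
  { outputDir: root },
  'performance_grader',
  'metrics.json',
  { score: 0.8 },
)
const roundTripped = JSON.parse(readFileSync(written, 'utf-8'))
```

The legacy `GraderInput` (which lacked `taskArtifactDir`) still satisfies this shape, which is why the `||` fallback exists.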


@@ -0,0 +1,26 @@
import { AgisdkStateDiffGrader } from '../graders/benchmark/agisdk-state-diff'
import { InfinityStateGrader } from '../graders/benchmark/infinity-state'
import { PerformanceGrader } from '../graders/performance/performance-grader'
import type { Grader } from './types'
export const PASS_FAIL_GRADER_ORDER = [
'agisdk_state_diff',
'infinity_state',
'performance_grader',
] as const
export function createGrader(name: string): Grader | null {
switch (name) {
case 'agisdk_state_diff':
return new AgisdkStateDiffGrader()
case 'infinity_state':
return new InfinityStateGrader()
case 'performance_grader':
return new PerformanceGrader()
default:
console.warn(`Unknown grader: ${name}`)
return null
}
}
export { AgisdkStateDiffGrader, InfinityStateGrader, PerformanceGrader }


@@ -0,0 +1,36 @@
import type { GraderResult } from '../types'
import { createGrader as defaultCreateGrader } from './grader-registry'
import type { Grader, GraderInput } from './types'
export interface GraderRunnerDeps {
createGrader?: (name: string) => Grader | null
}
/** Runs configured graders independently so one failure does not hide others. */
export async function runConfiguredGraders(
graderNames: string[],
input: GraderInput,
deps: GraderRunnerDeps = {},
): Promise<Record<string, GraderResult>> {
const create = deps.createGrader ?? defaultCreateGrader
const results: Record<string, GraderResult> = {}
for (const name of graderNames) {
const grader = create(name)
if (!grader) continue
try {
console.log(` Running grader: ${name}`)
results[name] = await grader.grade(input)
} catch (error) {
results[name] = {
score: 0,
pass: false,
reasoning: `Error running grader: ${error instanceof Error ? error.message : String(error)}`,
}
}
}
return results
}
export const runGraders = runConfiguredGraders
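`runConfiguredGraders` isolates failures per grader: a throwing grader becomes a zero-score entry rather than aborting the loop, and the injectable `createGrader` dep makes that behavior testable. A condensed sketch of the same isolation with invented stub graders:

```typescript
interface Result {
  score: number
  pass: boolean
  reasoning: string
}
interface StubGrader {
  name: string
  grade(): Promise<Result>
}

async function runAll(
  names: string[],
  create: (name: string) => StubGrader | null,
): Promise<Record<string, Result>> {
  const results: Record<string, Result> = {}
  for (const name of names) {
    const grader = create(name)
    if (!grader) continue
    try {
      results[name] = await grader.grade()
    } catch (error) {
      // One crashing grader is recorded, not propagated.
      results[name] = {
        score: 0,
        pass: false,
        reasoning: error instanceof Error ? error.message : String(error),
      }
    }
  }
  return results
}

const results = await runAll(['ok', 'boom'], (name) => ({
  name,
  grade: async () => {
    if (name === 'boom') throw new Error('grader crashed')
    return { score: 1, pass: true, reasoning: 'ok' }
  },
}))
```

With this shape, one flaky LLM-backed grader cannot hide the results of the deterministic ones that ran alongside it.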


@@ -0,0 +1,65 @@
export interface PythonEvaluatorOptions {
scriptPath: string
input: unknown
timeoutMs: number
}
export interface PythonEvaluatorResult<T> {
output: T
stdout: string
stderr: string
exitCode: number
}
/** Runs a Python evaluator that accepts stdin JSON and emits stdout JSON. */
export async function runPythonJsonEvaluator<T>(
options: PythonEvaluatorOptions,
): Promise<PythonEvaluatorResult<T>> {
const proc = Bun.spawn(['python3', options.scriptPath], {
stdin: 'pipe',
stdout: 'pipe',
stderr: 'pipe',
})
proc.stdin.write(JSON.stringify(options.input))
proc.stdin.end()
let timeoutHandle: ReturnType<typeof setTimeout> | undefined
const timeout = new Promise<never>((_, reject) => {
timeoutHandle = setTimeout(() => {
proc.kill('SIGKILL')
reject(
new Error(`Python evaluator timed out after ${options.timeoutMs}ms`),
)
}, options.timeoutMs)
})
const completed = (async (): Promise<PythonEvaluatorResult<T>> => {
const stdout = await new Response(proc.stdout).text()
const stderr = await new Response(proc.stderr).text()
const exitCode = await proc.exited
if (exitCode !== 0) {
throw new Error(
`Python evaluator exited with code ${exitCode}: ${stderr || stdout}`,
)
}
try {
return {
output: JSON.parse(stdout.trim()) as T,
stdout,
stderr,
exitCode,
}
} catch {
throw new Error(`Failed to parse Python evaluator output: ${stdout}`)
}
})()
try {
return await Promise.race([completed, timeout])
} finally {
clearTimeout(timeoutHandle)
}
}
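The evaluator contract above is runtime-agnostic: JSON on stdin, JSON on stdout, non-zero exit is an error, and a hard timeout kills the child. A Node-flavoured sketch of the same contract, substituting `node -e` for `python3` so it runs anywhere (the doubling script is invented for illustration):

```typescript
import { spawn } from 'node:child_process'

function runJsonEvaluator<T>(
  cmd: string,
  args: string[],
  input: unknown,
  timeoutMs: number,
): Promise<T> {
  return new Promise((resolve, reject) => {
    const proc = spawn(cmd, args)
    const timer = setTimeout(() => {
      proc.kill('SIGKILL')
      reject(new Error(`evaluator timed out after ${timeoutMs}ms`))
    }, timeoutMs)
    let stdout = ''
    let stderr = ''
    proc.stdout.on('data', (d) => { stdout += d })
    proc.stderr.on('data', (d) => { stderr += d })
    proc.on('close', (code) => {
      clearTimeout(timer)
      if (code !== 0) {
        return reject(new Error(`exited ${code}: ${stderr || stdout}`))
      }
      try {
        resolve(JSON.parse(stdout.trim()) as T)
      } catch {
        reject(new Error(`failed to parse evaluator output: ${stdout}`))
      }
    })
    proc.stdin.write(JSON.stringify(input))
    proc.stdin.end()
  })
}

// Child reads {"n": ...} from stdin and echoes {"doubled": n * 2}.
const script =
  'let b="";process.stdin.on("data",d=>b+=d).on("end",()=>{const i=JSON.parse(b);console.log(JSON.stringify({doubled:i.n*2}))})'
const out = await runJsonEvaluator<{ doubled: number }>(
  'node',
  ['-e', script],
  { n: 21 },
  5_000,
)
```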


@@ -0,0 +1,22 @@
import type { GraderResult, Message } from '../types'
export interface GraderInput {
task: {
query_id: string
query: string
dataset: string
}
messages: Message[]
screenshotCount: number
finalAnswer: string | null
expectedAnswer?: string | null
taskArtifactDir: string
outputDir: string
mcpUrl?: string
infinityAppUrl?: string
}
export interface Grader {
name: string
grade(input: GraderInput): Promise<GraderResult>
}


@@ -1,72 +1,10 @@
#!/usr/bin/env bun
import { parseArgs } from 'node:util'
import { runEval } from './runner/eval-runner'
import { runCli } from './cli'
const { values } = parseArgs({
args: Bun.argv.slice(2),
options: {
config: { type: 'string', short: 'c' },
help: { type: 'boolean', short: 'h' },
},
})
if (values.help) {
console.log(`
BrowserOS Eval
Usage:
bun run eval # Opens dashboard in config mode
bun run eval --config <config.json> # Runs eval with config file
Available agent types:
- single Single LLM agent driven by the BrowserOS tool loop
- orchestrator-executor High-level planner + visual/text executor
Available graders:
- performance_grader Multi-axis grader using Claude Agent SDK
- agisdk_state_diff AGI SDK / REAL Bench state-diff grader
- infinity_state WebArena-Infinity verifier-script grader
Preset configs in configs/:
- browseros-agent-weekly.json Weekly eval (single agent)
- browseros-oe-agent-weekly.json Weekly eval (orchestrator + LLM executor)
- browseros-oe-clado-weekly.json Weekly eval (orchestrator + Clado executor)
- agisdk-real-smoke.json AGI SDK smoke run
- infinity-hard-50.json WebArena-Infinity hard-50 set
- test-webvoyager.json WebVoyager test
- test-mind2web.json Mind2Web test
Examples:
bun run eval # Dashboard config mode
bun run eval -c configs/browseros-agent-weekly.json
bun run eval -c configs/test-webvoyager.json
`)
process.exit(0)
}
if (values.config) {
try {
await runEval({ configPath: values.config })
} catch (error) {
console.error(error instanceof Error ? error.message : String(error))
process.exit(1)
}
process.exit(0)
} else {
// No config — start dashboard in config mode, wait for user to configure and run
const { startDashboard } = await import('./dashboard/server')
startDashboard({
tasks: [],
configName: '',
agentType: '',
outputDir: '',
configMode: true,
})
console.log(
'Dashboard running at http://localhost:9900 — configure and run from the UI',
)
// Keep process alive until SIGINT
await new Promise(() => {})
try {
await runCli(Bun.argv.slice(2))
} catch (error) {
console.error(error instanceof Error ? error.message : String(error))
process.exit(1)
}


@@ -0,0 +1,41 @@
export interface R2UploadConfig {
accountId: string
accessKeyId: string
secretAccessKey: string
bucket: string
cdnBaseUrl: string
}
export interface R2ManifestTask {
queryId: string
query: string
startUrl: string
status: string
durationMs: number
screenshotCount: number
graderResults: Record<string, unknown>
}
export interface R2RunManifest {
runId: string
uploadedAt: string
agentConfig?: Record<string, unknown>
dataset?: string
summary?: {
passRate?: unknown
avgDurationMs?: unknown
}
tasks: R2ManifestTask[]
}
export interface R2PublishRunResult {
runId: string
uploadedFiles: number
viewerUrl: string
manifest: R2RunManifest
}
export interface R2PublishPathResult {
uploadedRuns: R2PublishRunResult[]
skippedRuns: string[]
}


@@ -0,0 +1,425 @@
import { readdir, readFile, stat } from 'node:fs/promises'
import { basename, dirname, extname, join } from 'node:path'
import {
GetObjectCommand,
PutObjectCommand,
S3Client,
} from '@aws-sdk/client-s3'
import type {
R2ManifestTask,
R2PublishPathResult,
R2PublishRunResult,
R2RunManifest,
R2UploadConfig,
} from './r2-manifest'
const DEFAULT_CONCURRENCY = 20
const CONTENT_TYPES: Record<string, string> = {
'.json': 'application/json',
'.jsonl': 'application/x-ndjson',
'.png': 'image/png',
'.html': 'text/html',
}
export interface R2Client {
send(command: unknown): Promise<unknown>
}
export interface R2PublisherOptions {
config: R2UploadConfig
client?: R2Client
viewerPath?: string
concurrency?: number
now?: () => Date
}
interface UploadJob {
key: string
filePath: string
contentType: string
}
interface TaskDirEntry {
taskId: string
taskPath: string
canonicalLayout: boolean
}
export function contentTypeForPath(filePath: string): string {
return CONTENT_TYPES[extname(filePath)] || 'application/octet-stream'
}
export function loadR2ConfigFromEnv(
env: Record<string, string | undefined> = process.env,
): R2UploadConfig {
const accountId = env.EVAL_R2_ACCOUNT_ID
const accessKeyId = env.EVAL_R2_ACCESS_KEY_ID
const secretAccessKey = env.EVAL_R2_SECRET_ACCESS_KEY
if (!accountId || !accessKeyId || !secretAccessKey) {
throw new Error(
'Missing required env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY',
)
}
return {
accountId,
accessKeyId,
secretAccessKey,
bucket: env.EVAL_R2_BUCKET || 'browseros-eval',
cdnBaseUrl: (
env.EVAL_R2_CDN_BASE_URL || 'https://eval.browseros.com'
).replace(/\/+$/, ''),
}
}
export function createR2Client(config: R2UploadConfig): S3Client {
return new S3Client({
region: 'auto',
endpoint: `https://${config.accountId}.r2.cloudflarestorage.com`,
credentials: {
accessKeyId: config.accessKeyId,
secretAccessKey: config.secretAccessKey,
},
})
}
async function collectFiles(dir: string): Promise<string[]> {
const files: string[] = []
const entries = await readdir(dir, { withFileTypes: true })
for (const entry of entries) {
const full = join(dir, entry.name)
if (entry.isDirectory()) {
files.push(...(await collectFiles(full)))
} else {
files.push(full)
}
}
return files
}
async function runPool<T>(
items: T[],
concurrency: number,
fn: (item: T) => Promise<void>,
): Promise<void> {
let i = 0
const workers = Array.from({ length: concurrency }, async () => {
while (i < items.length) {
const idx = i++
await fn(items[idx])
}
})
await Promise.all(workers)
}
async function hasMetadata(dir: string): Promise<boolean> {
const metaStat = await stat(join(dir, 'metadata.json')).catch(() => null)
return !!metaStat?.isFile()
}
async function findTaskDirs(runDir: string): Promise<TaskDirEntry[]> {
const entries = await readdir(runDir, { withFileTypes: true })
const legacyTasks: TaskDirEntry[] = []
for (const entry of entries) {
if (!entry.isDirectory() || entry.name === 'tasks') continue
const taskPath = join(runDir, entry.name)
if (await hasMetadata(taskPath)) {
legacyTasks.push({
taskId: entry.name,
taskPath,
canonicalLayout: false,
})
}
}
const tasksRoot = join(runDir, 'tasks')
const canonicalEntries = await readdir(tasksRoot, {
withFileTypes: true,
}).catch(() => [])
const canonicalTasks: TaskDirEntry[] = []
for (const entry of canonicalEntries) {
if (!entry.isDirectory()) continue
const taskPath = join(tasksRoot, entry.name)
if (await hasMetadata(taskPath)) {
canonicalTasks.push({
taskId: entry.name,
taskPath,
canonicalLayout: true,
})
}
}
return legacyTasks.length > 0 ? legacyTasks : canonicalTasks
}
async function isRunDir(dir: string): Promise<boolean> {
return (await findTaskDirs(dir)).length > 0
}
async function collectRunRootFiles(runDir: string): Promise<UploadJob[]> {
const entries = await readdir(runDir, { withFileTypes: true })
return entries
.filter((entry) => entry.isFile())
.map((entry) => {
const filePath = join(runDir, entry.name)
return {
key: entry.name,
filePath,
contentType: contentTypeForPath(filePath),
}
})
}
function statusFromMetadata(meta: Record<string, unknown>): string {
return meta.termination_reason === 'completed'
? 'completed'
: ((meta.termination_reason as string | undefined) ?? 'unknown')
}
function runIdForDir(runDir: string): string {
const timestamp = basename(runDir)
const configName = basename(dirname(runDir))
return `${configName}-${timestamp}`
}
/** Publishes eval artifacts in the viewer-compatible R2 layout. */
export class R2Publisher {
private readonly client: R2Client
private readonly config: R2UploadConfig
private readonly viewerPath: string
private readonly concurrency: number
private readonly now: () => Date
constructor(options: R2PublisherOptions) {
this.config = options.config
this.client = options.client ?? createR2Client(options.config)
this.viewerPath =
options.viewerPath ??
join(import.meta.dirname, '..', 'dashboard', 'viewer.html')
this.concurrency = options.concurrency ?? DEFAULT_CONCURRENCY
this.now = options.now ?? (() => new Date())
}
async isUploaded(runId: string): Promise<boolean> {
try {
await this.client.send(
new GetObjectCommand({
Bucket: this.config.bucket,
Key: `runs/${runId}/manifest.json`,
}),
)
return true
} catch {
return false
}
}
async publishPath(inputDir: string): Promise<R2PublishPathResult> {
const dirStat = await stat(inputDir).catch(() => null)
if (!dirStat?.isDirectory()) {
throw new Error(`Not a directory: ${inputDir}`)
}
if (await isRunDir(inputDir)) {
const result = await this.publishRun(inputDir, runIdForDir(inputDir))
return { uploadedRuns: [result], skippedRuns: [] }
}
const configName = basename(inputDir)
const entries = await readdir(inputDir, { withFileTypes: true })
const runDirs = entries
.filter((entry) => entry.isDirectory())
.map((entry) => entry.name)
.sort()
if (runDirs.length === 0) {
throw new Error('No run subdirectories found')
}
const uploadedRuns: R2PublishRunResult[] = []
const skippedRuns: string[] = []
for (const dir of runDirs) {
const runId = `${configName}-${dir}`
if (await this.isUploaded(runId)) {
skippedRuns.push(runId)
continue
}
uploadedRuns.push(await this.publishRun(join(inputDir, dir), runId))
}
return { uploadedRuns, skippedRuns }
}
async publishRun(
runDir: string,
runId: string = runIdForDir(runDir),
): Promise<R2PublishRunResult> {
const taskEntries = await findTaskDirs(runDir)
if (taskEntries.length === 0) {
throw new Error(`No task subdirectories in ${runId}`)
}
const manifestTasks: R2ManifestTask[] = []
const jobs: UploadJob[] = (await collectRunRootFiles(runDir)).map(
(job) => ({
...job,
key: `runs/${runId}/${job.key}`,
}),
)
let agentConfig: Record<string, unknown> | undefined
let dataset: string | undefined
for (const taskDirEntry of taskEntries) {
const { taskId, taskPath } = taskDirEntry
const meta = await this.readMetadata(taskPath)
if (!meta) continue
if (!agentConfig && meta.agent_config) {
agentConfig = meta.agent_config as Record<string, unknown>
}
if (!dataset && meta.dataset) dataset = meta.dataset as string
const files = await collectFiles(taskPath)
let screenshotCount = 0
for (const file of files) {
const relative = file.slice(taskPath.length + 1)
if (relative.startsWith('screenshots/') && extname(file) === '.png') {
screenshotCount++
}
jobs.push({
key: `runs/${runId}/${taskId}/${relative}`,
filePath: file,
contentType: contentTypeForPath(file),
})
if (taskDirEntry.canonicalLayout) {
jobs.push({
key: `runs/${runId}/tasks/${taskId}/${relative}`,
filePath: file,
contentType: contentTypeForPath(file),
})
}
}
manifestTasks.push({
queryId: (meta.query_id as string | undefined) || taskId,
query: (meta.query as string | undefined) || '',
startUrl: (meta.start_url as string | undefined) || '',
status: statusFromMetadata(meta),
durationMs: (meta.total_duration_ms as number | undefined) || 0,
screenshotCount:
(meta.screenshot_count as number | undefined) || screenshotCount,
graderResults:
(meta.grader_results as Record<string, unknown> | undefined) || {},
})
}
if (manifestTasks.length === 0) {
throw new Error(`No completed tasks in ${runId}`)
}
let uploaded = 0
await runPool(jobs, this.concurrency, async (job) => {
await this.uploadFile(job)
uploaded++
})
const manifest = await this.buildManifest(
runDir,
runId,
agentConfig,
dataset,
manifestTasks,
)
await this.uploadBuffer(
`runs/${runId}/manifest.json`,
Buffer.from(JSON.stringify(manifest, null, 2)),
'application/json',
)
await this.uploadBuffer(
'viewer.html',
await readFile(this.viewerPath),
'text/html',
)
return {
runId,
uploadedFiles: uploaded + 2,
viewerUrl: `${this.config.cdnBaseUrl}/viewer.html?run=${runId}`,
manifest,
}
}
private async readMetadata(
taskPath: string,
): Promise<Record<string, unknown> | null> {
try {
return JSON.parse(
await readFile(join(taskPath, 'metadata.json'), 'utf-8'),
) as Record<string, unknown>
} catch {
return null
}
}
private async buildManifest(
runDir: string,
runId: string,
agentConfig: Record<string, unknown> | undefined,
dataset: string | undefined,
tasks: R2ManifestTask[],
): Promise<R2RunManifest> {
let summaryData: Record<string, unknown> | undefined
try {
summaryData = JSON.parse(
await readFile(join(runDir, 'summary.json'), 'utf-8'),
) as Record<string, unknown>
} catch {}
return {
runId,
uploadedAt: this.now().toISOString(),
agentConfig,
dataset,
summary: summaryData
? {
passRate: summaryData.passRate,
avgDurationMs: summaryData.avgDurationMs,
}
: undefined,
tasks,
}
}
private async uploadFile(job: UploadJob): Promise<void> {
await this.uploadBuffer(
job.key,
await readFile(job.filePath),
job.contentType,
)
}
private async uploadBuffer(
key: string,
body: Buffer,
contentType: string,
): Promise<void> {
await this.client.send(
new PutObjectCommand({
Bucket: this.config.bucket,
Key: key,
Body: body,
ContentType: contentType,
}),
)
}
}
export async function publishPathToR2(
inputDir: string,
): Promise<R2PublishPathResult> {
const config = loadR2ConfigFromEnv()
return new R2Publisher({ config }).publishPath(inputDir)
}
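`loadR2ConfigFromEnv` applies two small normalizations worth noting: a default bucket name and trailing-slash trimming on the CDN base URL (so `${cdnBaseUrl}/viewer.html` never gets a double slash). A sketch of just that normalization, with env var names copied from the module and values invented:

```typescript
interface R2ConfigSlice {
  bucket: string
  cdnBaseUrl: string
}

function configFromEnv(env: Record<string, string | undefined>): R2ConfigSlice {
  return {
    bucket: env.EVAL_R2_BUCKET || 'browseros-eval',
    // Strip any run of trailing slashes before URLs are built from this base.
    cdnBaseUrl: (
      env.EVAL_R2_CDN_BASE_URL || 'https://eval.browseros.com'
    ).replace(/\/+$/, ''),
  }
}

const cfg = configFromEnv({ EVAL_R2_CDN_BASE_URL: 'https://cdn.example.com///' })
```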


@@ -14,8 +14,11 @@
*/
import {
closeSync,
existsSync,
mkdirSync,
mkdtempSync,
openSync,
readFileSync,
rmSync,
writeFileSync,
@@ -33,7 +36,17 @@ export interface EvalPorts {
const MAX_RESTART_ATTEMPTS = 3
const CDP_WAIT_TIMEOUT_MS = 30_000
const SERVER_HEALTH_TIMEOUT_MS = 30_000
// Bumped from 30s → 90s while debugging dev-CI startup. Dev's server module
// graph is ~108 files larger than main's; cold-cache module load on a CI
// runner can take much longer than the original 30s budget allowed.
const SERVER_HEALTH_TIMEOUT_MS = 90_000
// Where per-worker server stderr is written. Captured (rather than ignored)
// so eval-weekly.yml can upload these as workflow artifacts on failure for
// post-mortem debugging. Path is also referenced in the workflow's artifact
// upload step.
const SERVER_LOG_DIR =
process.env.BROWSEROS_SERVER_LOG_DIR || '/tmp/browseros-server-logs'
const MONOREPO_ROOT = join(
dirname(fileURLToPath(import.meta.url)),
@@ -53,6 +66,7 @@ export class BrowserOSAppManager {
private ports: EvalPorts
private chromeProc: Subprocess | null = null
private serverProc: Subprocess | null = null
private serverLogFd: number | null = null
private tempDir: string | null = null
private readonly workerIndex: number
private readonly loadExtensions: boolean
@@ -183,15 +197,36 @@ export class BrowserOSAppManager {
VITE_BROWSEROS_SERVER_PORT: String(server),
}
// Capture both stdout and stderr to a per-worker file so we can
// post-mortem startup hangs. The server uses pino which writes logs to
// stdout by default — capturing stderr alone misses everything. The
// eval-weekly workflow uploads /tmp/browseros-server-logs/ as a workflow
// artifact on failure.
// Open the per-worker log file under SERVER_LOG_DIR. If the directory
// can't be created or the file can't be opened (e.g. unwritable custom
// BROWSEROS_SERVER_LOG_DIR), fall back to /dev/null so spawn still works.
const logPath = join(SERVER_LOG_DIR, `server-W${this.workerIndex}.log`)
let logFd: number
try {
mkdirSync(SERVER_LOG_DIR, { recursive: true })
logFd = openSync(logPath, 'a')
} catch {
logFd = openSync('/dev/null', 'w')
}
this.serverLogFd = logFd
// `start:ci` skips `--watch` (no file-watcher overhead in CI). Falls back
// to the regular `start` script outside CI for the dev-watch experience.
const startScript = process.env.CI ? 'start:ci' : 'start'
this.serverProc = spawn({
cmd: ['bun', 'run', '--filter', '@browseros/server', 'start'],
cmd: ['bun', 'run', '--filter', '@browseros/server', startScript],
cwd: MONOREPO_ROOT,
stdout: 'ignore',
stderr: 'ignore',
stdout: logFd,
stderr: logFd,
env: serverEnv,
})
console.log(
` [W${this.workerIndex}] Server started (PID: ${this.serverProc.pid})`,
` [W${this.workerIndex}] Server started (PID: ${this.serverProc.pid}, logs → ${logPath})`,
)
// --- Wait for Server Health ---
@@ -244,6 +279,18 @@ export class BrowserOSAppManager {
await this.killProcess(this.serverProc)
this.serverProc = null
// Close the parent's copy of the server log fd. Child kept its own dup
// until it exited above, so closing here doesn't truncate any output.
// Without this we'd leak one fd per restart attempt across all workers.
if (this.serverLogFd !== null) {
try {
closeSync(this.serverLogFd)
} catch {
// already closed or invalid — ignore
}
this.serverLogFd = null
}
// Kill Chrome (graceful → force)
await this.killProcess(this.chromeProc)
this.chromeProc = null
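The log-file handling in the hunk above degrades to `/dev/null` rather than failing the spawn when `SERVER_LOG_DIR` is unwritable. That fallback shape, isolated (assumes a POSIX `/dev/null`; the unwritable path is chosen only to exercise the catch branch):

```typescript
import { closeSync, mkdirSync, openSync } from 'node:fs'
import { join } from 'node:path'

function openLogFd(logDir: string, name: string): number {
  try {
    mkdirSync(logDir, { recursive: true })
    return openSync(join(logDir, name), 'a')
  } catch {
    // Unwritable log dir: discard server output instead of aborting the spawn.
    return openSync('/dev/null', 'w')
  }
}

// /proc rejects mkdir on Linux, so this takes the /dev/null fallback.
const fd = openLogFd('/proc/none', 'server-W0.log')
closeSync(fd)
```

The parent must still `closeSync` its copy of the fd after the child exits, as the diff does, or each restart attempt leaks a descriptor.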


@@ -1,362 +1 @@
import { mkdir, writeFile } from 'node:fs/promises'
import { basename, dirname, join, resolve } from 'node:path'
import {
dashboardState,
setActiveExecutor,
startDashboard,
stopDashboard,
} from '../dashboard/server'
import type { ErrorSource, EvalConfig, Task } from '../types'
import {
printValidationResult,
validateConfig,
} from '../utils/config-validator'
import { ParallelExecutor } from './parallel-executor'
import {
getTaskSourceDescription,
loadTasks,
TaskLoadError,
} from './task-loader'
import type {
BatchSummary,
RunEvalOptions,
TaskResult,
TaskResultSummary,
TaskSource,
} from './types'
import { getPrimaryGraderResult, isSuccessfulResult } from './types'
// ============================================================================
// Main Entry Point
// ============================================================================
export async function runEval(options: RunEvalOptions): Promise<void> {
// Step 1: Validate configuration
const config = await loadAndValidateConfig(options.configPath)
// Step 2: Resolve paths relative to config location
const configDir = dirname(resolve(options.configPath))
const resolvedPaths = resolvePaths(options, config, configDir)
// Log configuration
console.log('Eval Configuration:')
console.log(` Config: ${options.configPath}`)
console.log(` Dataset: ${resolvedPaths.dataPath}`)
console.log(` Output: ${resolvedPaths.outputDir}`)
console.log(` Workers: ${config.num_workers}`)
console.log(` Agent: ${config.agent.type}`)
console.log()
// Step 3: Load tasks
const taskSource = resolveTaskSource(options, resolvedPaths.dataPath)
const { tasks } = await loadTasksWithLogging(taskSource)
// Step 4: Setup
await mkdir(resolvedPaths.outputDir, { recursive: true })
// Step 5: Start dashboard
startDashboard({
tasks,
configName: options.configPath,
agentType: config.agent.type,
outputDir: resolvedPaths.outputDir,
})
// Step 6: Execute tasks (parallel or sequential based on num_workers)
const results = await executeTasks(tasks, config, resolvedPaths.outputDir)
// Step 7: Summary
const summary = buildSummary(results)
await saveSummary(summary, resolvedPaths.outputDir)
printSummary(summary)
console.log(`\nResults saved to: ${resolvedPaths.outputDir}`)
stopDashboard()
}
// ============================================================================
// Configuration
// ============================================================================
async function loadAndValidateConfig(configPath: string) {
console.log('Validating configuration...')
const validationResult = await validateConfig(configPath)
printValidationResult(validationResult)
if (!validationResult.valid || !validationResult.config) {
throw new Error(
'Configuration validation failed. Fix the above errors and try again.',
)
}
return validationResult.config
}
interface ResolvedPaths {
dataPath: string
outputDir: string
}
function resolvePaths(
options: RunEvalOptions,
config: EvalConfig,
configDir: string,
): ResolvedPaths {
// Resolve dataset path: use options.dataPath if provided, otherwise resolve from config
const dataPath = options.dataPath
? options.dataPath
: config.dataset.startsWith('/')
? config.dataset
: resolve(configDir, config.dataset)
// Resolve output directory: results/{config-name}/{timestamp}/
// Config name derived from config filename (e.g., "browseros-agent-weekly.json" → "browseros-agent-weekly")
const configName = options.configPath
? basename(resolve(options.configPath), '.json')
: 'eval'
const timestamp = formatTimestamp(new Date())
const resultsBase = config.output_dir
? config.output_dir.startsWith('/')
? config.output_dir
: resolve(configDir, config.output_dir)
: resolve(configDir, '..', 'results')
const outputDir = join(resultsBase, configName, timestamp)
return { dataPath, outputDir }
}
function formatTimestamp(date: Date): string {
const y = date.getFullYear()
const m = String(date.getMonth() + 1).padStart(2, '0')
const d = String(date.getDate()).padStart(2, '0')
const h = String(date.getHours()).padStart(2, '0')
const min = String(date.getMinutes()).padStart(2, '0')
return `${y}-${m}-${d}-${h}${min}`
}
// ============================================================================
// Task Loading
// ============================================================================
function resolveTaskSource(
options: RunEvalOptions,
dataPath: string,
): TaskSource {
// If query is provided, use single task mode
if (options.query) {
return { type: 'single', query: options.query, startUrl: options.startUrl }
}
// Otherwise use file mode with the resolved dataPath
return { type: 'file', path: dataPath }
}
async function loadTasksWithLogging(
source: TaskSource,
): Promise<{ tasks: Awaited<ReturnType<typeof loadTasks>>['tasks'] }> {
console.log(`Loading tasks from ${getTaskSourceDescription(source)}...`)
try {
const result = await loadTasks(source)
console.log(`Loaded ${result.tasks.length} task(s)`)
return { tasks: result.tasks }
} catch (error) {
if (error instanceof TaskLoadError) {
throw new Error(`Failed to load tasks: ${error.message}`)
}
throw new Error(`Failed to load tasks: ${error}`)
}
}
// ============================================================================
// Task Execution
// ============================================================================
async function executeTasks(
tasks: Task[],
config: EvalConfig,
outputDir: string,
): Promise<TaskResult[]> {
console.log(`\n${'='.repeat(60)}`)
console.log('STARTING EVALUATION')
console.log(`${'='.repeat(60)}\n`)
const numWorkers = config.num_workers || 1
console.log(`Running with ${numWorkers} worker(s)`)
if (config.restart_server_per_task) {
console.log(`Server restart per task: enabled`)
}
console.log()
const executor = new ParallelExecutor({
numWorkers,
config,
outputDir,
restartServerPerTask: config.restart_server_per_task,
onEvent: (taskId, event) =>
dashboardState.broadcastStreamEvent(taskId, event),
})
// Register so dashboard stop button works for CLI runs too
setActiveExecutor(executor)
try {
return await executor.execute(tasks, (completed, total, task, result) => {
printTaskProgress(completed, total, task, result)
})
} finally {
setActiveExecutor(null)
}
}
function printTaskProgress(
completed: number,
total: number,
task: Task,
result: TaskResult,
): void {
const status =
result.status === 'completed'
? 'DONE'
: result.status === 'timeout'
? 'TIMEOUT'
: 'FAILED'
const duration =
result.durationMs > 0 ? ` (${(result.durationMs / 1000).toFixed(1)}s)` : ''
console.log(`[${completed}/${total}] ${task.query_id}: ${status}${duration}`)
if (result.status === 'failed') {
console.log(` ERROR: ${result.error.message}`)
} else if (isSuccessfulResult(result)) {
// Log agent errors (e.g., LLM API failures) even if task "completed"
if (result.agentResult.metadata.errors?.length) {
for (const err of result.agentResult.metadata.errors) {
console.log(` ERROR [${err.source}]: ${err.message}`)
}
}
for (const [name, gr] of Object.entries(result.graderResults)) {
const icon = gr.pass ? 'PASS' : 'FAIL'
console.log(` ${name}: ${icon}`)
}
}
}
// ============================================================================
// Summary
// ============================================================================
function buildSummary(results: TaskResult[]): BatchSummary {
// Track errors by source
const errorsBySource: Partial<Record<ErrorSource, number>> = {}
let totalWarnings = 0
const taskSummaries: TaskResultSummary[] = results.map((r) => {
let errorCount = 0
let warningCount = 0
let errorSources: ErrorSource[] | undefined
let failureReason: string | undefined
if (isSuccessfulResult(r)) {
// Count errors and warnings from agent metadata
errorCount = r.agentResult.metadata.errors?.length ?? 0
warningCount = r.agentResult.metadata.warnings?.length ?? 0
totalWarnings += warningCount
// Track error sources
if (r.agentResult.metadata.errors?.length) {
errorSources = r.agentResult.metadata.errors.map((e) => e.source)
for (const err of r.agentResult.metadata.errors) {
errorsBySource[err.source] = (errorsBySource[err.source] ?? 0) + 1
}
}
} else {
// Failed task
errorCount = 1
errorSources = [r.errorSource]
failureReason = r.error.message
errorsBySource[r.errorSource] = (errorsBySource[r.errorSource] ?? 0) + 1
}
return {
queryId: r.task.query_id,
status: r.status,
durationMs: r.durationMs,
graderResults: isSuccessfulResult(r)
? Object.fromEntries(
Object.entries(r.graderResults).map(([name, gr]) => [
name,
{ pass: gr.pass, score: gr.score },
]),
)
: undefined,
errorCount,
warningCount,
errorSources: errorSources?.length ? errorSources : undefined,
failureReason,
}
})
const completed = results.filter((r) => r.status === 'completed').length
const timeout = results.filter((r) => r.status === 'timeout').length
const failed = results.filter((r) => r.status === 'failed').length
// Calculate pass rate using primary grader (fallback order)
let totalGraded = 0
let totalPasses = 0
for (const result of results) {
if (isSuccessfulResult(result)) {
const primary = getPrimaryGraderResult(result.graderResults)
if (primary) {
totalGraded++
if (primary.pass) totalPasses++
}
}
}
const passRate = totalGraded > 0 ? totalPasses / totalGraded : 0
// Calculate average duration for non-failed tasks
const durations = results
.filter((r) => r.status !== 'failed')
.map((r) => r.durationMs)
const avgDurationMs =
durations.length > 0
? durations.reduce((a, b) => a + b, 0) / durations.length
: 0
return {
total: results.length,
completed,
failed,
timeout,
passRate,
avgDurationMs,
errorsBySource,
totalWarnings,
results: taskSummaries,
}
}
async function saveSummary(
summary: BatchSummary,
outputDir: string,
): Promise<void> {
await writeFile(
join(outputDir, 'summary.json'),
JSON.stringify(summary, null, 2),
)
}
function printSummary(summary: BatchSummary): void {
console.log('='.repeat(60))
console.log('EVALUATION COMPLETE')
console.log('='.repeat(60))
console.log(`Total: ${summary.total} tasks`)
console.log(` Completed: ${summary.completed}`)
console.log(` Timeout: ${summary.timeout}`)
console.log(` Failed: ${summary.failed}`)
console.log(` Pass Rate: ${(summary.passRate * 100).toFixed(1)}%`)
console.log(` Avg Duration: ${(summary.avgDurationMs / 1000).toFixed(1)}s`)
}
export { runEval } from '../runs/eval-runner'
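The pass-rate computation above counts only tasks whose primary grader produced a verdict. A minimal, self-contained sketch of that logic; `PRIMARY_ORDER` and the two helpers below are hypothetical stand-ins for `isSuccessfulResult` and `getPrimaryGraderResult`, which are defined elsewhere in the eval package:

```typescript
// Assumed shapes, mirroring the types used in the summary code above.
interface GraderResult { pass: boolean; score: number }
interface ResultLike { status: string; graderResults?: Record<string, GraderResult> }

// Hypothetical fallback order; the real order lives in getPrimaryGraderResult.
const PRIMARY_ORDER = ['llm_judge', 'exact_match']

function primaryGrader(results: Record<string, GraderResult>): GraderResult | undefined {
  for (const name of PRIMARY_ORDER) {
    if (results[name]) return results[name]
  }
  return Object.values(results)[0]
}

function passRate(results: ResultLike[]): number {
  let graded = 0
  let passes = 0
  for (const r of results) {
    if (r.status !== 'failed' && r.graderResults) {
      const primary = primaryGrader(r.graderResults)
      if (primary) {
        graded++
        if (primary.pass) passes++
      }
    }
  }
  return graded > 0 ? passes / graded : 0
}

console.log(passRate([
  { status: 'completed', graderResults: { exact_match: { pass: true, score: 1 } } },
  { status: 'completed', graderResults: { exact_match: { pass: false, score: 0 } } },
  { status: 'failed' },
])) // 0.5
```

Note that failed tasks are excluded from the denominator entirely, so a run with many crashes can still show a high pass rate; the separate `failed` count in the summary is what surfaces that.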


@@ -1,266 +1,5 @@
/**
* Parallel Executor
*
* Each worker gets its own isolated BrowserOS stack:
* - BrowserOSAppManager (Chrome + Server on unique ports)
* - TaskExecutor (uses that worker's server URL)
*
* Port allocation: Worker N → CDP=base+N, Server=base+N, Extension=base+N
*/
import type { EvalConfig, Task } from '../types'
import { BrowserOSAppManager, type EvalPorts } from './browseros-app-manager'
import { createTaskExecutor } from './task-executor'
import type { TaskResult } from './types'
// ============================================================================
// Types
// ============================================================================
export interface ParallelExecutorConfig {
numWorkers: number
config: EvalConfig
outputDir: string
restartServerPerTask?: boolean
onEvent?: (taskId: string, event: Record<string, unknown>) => void
}
export type ProgressCallback = (
completed: number,
total: number,
task: Task,
result: TaskResult,
) => void
// ============================================================================
// Task Queue (safe under single-threaded async: next() is fully synchronous, so the index increment can never interleave)
// ============================================================================
class TaskQueue {
private tasks: Task[]
private index: number = 0
private stopped: boolean = false
constructor(tasks: Task[]) {
this.tasks = [...tasks]
}
next(): Task | null {
if (this.stopped || this.index >= this.tasks.length) return null
return this.tasks[this.index++]
}
stop(): void {
this.stopped = true
}
}
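The reason the queue needs no locking is that `next()` contains no `await`: each call runs to completion before the event loop can schedule another worker. A self-contained sketch of two async workers draining one queue (the queue class is reproduced here, made generic, so the snippet runs on its own):

```typescript
// Same shape as the TaskQueue above, made generic for the demo.
class DemoQueue<T> {
  private index = 0
  private stopped = false
  constructor(private readonly tasks: T[]) {}
  next(): T | null {
    if (this.stopped || this.index >= this.tasks.length) return null
    return this.tasks[this.index++]
  }
  stop(): void {
    this.stopped = true
  }
}

async function worker(id: number, queue: DemoQueue<string>, log: string[]) {
  for (let task = queue.next(); task !== null; task = queue.next()) {
    await Promise.resolve() // yield to the event loop, as real task execution would
    log.push(`${id}:${task}`)
  }
}

// Two workers drain one queue; because next() is synchronous, no task is claimed twice.
async function demo(): Promise<string[]> {
  const queue = new DemoQueue(['a', 'b', 'c', 'd'])
  const log: string[] = []
  await Promise.all([worker(0, queue, log), worker(1, queue, log)])
  return log
}

demo().then((log) => console.log(log.length)) // 4
```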
// ============================================================================
// Parallel Executor
// ============================================================================
export class ParallelExecutor {
private readonly numWorkers: number
private readonly appManagers = new Map<number, BrowserOSAppManager>()
private completedCount: number = 0
// Results keyed by query_id; a plain Map suffices under single-threaded async
private readonly resultLock = new Map<string, TaskResult>()
private queue: TaskQueue | null = null
constructor(private readonly config: ParallelExecutorConfig) {
this.numWorkers = Math.max(1, config.numWorkers)
}
async stop(): Promise<void> {
console.log('\nStopping eval run...')
this.queue?.stop()
const kills = [...this.appManagers.values()].map((m) => m.killApp())
await Promise.allSettled(kills)
}
async execute(
tasks: Task[],
onProgress?: ProgressCallback,
): Promise<TaskResult[]> {
if (tasks.length === 0) return []
const cleanup = this.setupSignalHandlers()
const loadExtensions = this.config.config.browseros.load_extensions ?? false
// Patch NopeCHA API key before launching any workers
const captchaConfig = this.config.config.captcha
if (captchaConfig) {
const apiKey = process.env[captchaConfig.api_key_env]
if (apiKey) {
BrowserOSAppManager.patchNopechaApiKey(apiKey)
}
}
this.queue = new TaskQueue(tasks)
const totalTasks = tasks.length
try {
const queue = this.queue
// Launch N workers in parallel — each gets its own Chrome + Server
const workers = Array.from({ length: this.numWorkers }, (_, i) =>
this.runWorker(i, queue, totalTasks, loadExtensions, onProgress),
)
await Promise.all(workers)
// Return results in original task order
return tasks.map((task) => {
const result = this.resultLock.get(task.query_id)
if (!result) {
return {
status: 'failed' as const,
task,
error: new Error('Task result not found'),
errorSource: 'unknown' as const,
durationMs: 0,
}
}
return result
})
} finally {
cleanup()
}
}
private async runWorker(
workerIndex: number,
queue: TaskQueue,
totalTasks: number,
loadExtensions: boolean,
onProgress?: ProgressCallback,
): Promise<void> {
// Per-worker isolated ports
const basePorts: EvalPorts = {
cdp: this.config.config.browseros.base_cdp_port,
server: this.config.config.browseros.base_server_port,
extension: this.config.config.browseros.base_extension_port,
}
const headless = this.config.config.browseros.headless ?? false
const appManager = new BrowserOSAppManager(
workerIndex,
basePorts,
loadExtensions,
headless,
)
this.appManagers.set(workerIndex, appManager)
// Per-worker executor pointing to this worker's server
const workerConfig: typeof this.config.config = {
...this.config.config,
browseros: {
...this.config.config.browseros,
server_url: appManager.getServerUrl(),
},
}
const executor = createTaskExecutor(
workerConfig,
workerIndex,
this.config.outputDir,
this.config.onEvent,
)
try {
// Always start Chrome+Server once for this worker
console.log(`\n Worker ${workerIndex}: Starting BrowserOS stack...`)
await appManager.restart()
while (true) {
const task = queue.next()
if (!task) break
const taskStartTime = Date.now()
let result: TaskResult
try {
// Restart between tasks if configured
if (this.config.restartServerPerTask) {
console.log(`\n${'─'.repeat(60)}`)
console.log(` Worker ${workerIndex}: Task: ${task.query_id}`)
console.log(`${'─'.repeat(60)}`)
await appManager.restart()
}
this.config.onEvent?.(task.query_id, {
type: 'task-state',
taskId: task.query_id,
status: 'running',
})
result = await executor.execute(task)
console.log(
` Worker ${workerIndex}: ${task.query_id}: ${result.status}`,
)
} catch (error) {
console.error(
` Worker ${workerIndex}: ${task.query_id}: FAILED - ${error instanceof Error ? error.message : String(error)}`,
)
result = {
status: 'failed',
task,
error: error instanceof Error ? error : new Error(String(error)),
errorSource: 'unknown',
durationMs: Date.now() - taskStartTime,
}
}
this.resultLock.set(task.query_id, result)
this.completedCount++
// Emit task completion to dashboard
const stateEvent: Record<string, unknown> = {
type: 'task-state',
taskId: task.query_id,
status: result.status,
durationMs: result.durationMs,
}
if (result.status !== 'failed' && 'graderResults' in result) {
stateEvent.graderResults = Object.fromEntries(
Object.entries(result.graderResults).map(([name, gr]) => [
name,
{
pass: gr.pass,
score: gr.score,
reasoning: gr.reasoning,
details: gr.details,
},
]),
)
stateEvent.screenshotCount =
result.agentResult?.metadata?.total_steps ?? 0
}
this.config.onEvent?.(task.query_id, stateEvent)
onProgress?.(this.completedCount, totalTasks, task, result)
if (this.config.restartServerPerTask) {
await new Promise((resolve) => setTimeout(resolve, 2000))
}
}
} finally {
await appManager.killApp()
}
}
/**
* SIGINT/SIGTERM kills all Chrome + Server instances across all workers.
* Returns a cleanup function that removes the listeners after execute() completes.
*/
private setupSignalHandlers(): () => void {
const onSignal = async () => {
console.log('\nShutting down all workers...')
this.queue?.stop()
const kills = [...this.appManagers.values()].map((m) => m.killApp())
await Promise.allSettled(kills)
process.exit(0)
}
process.on('SIGINT', onSignal)
process.on('SIGTERM', onSignal)
return () => {
process.off('SIGINT', onSignal)
process.off('SIGTERM', onSignal)
}
}
}
export {
type ProgressCallback,
TaskWorkerPool as ParallelExecutor,
type TaskWorkerPoolConfig as ParallelExecutorConfig,
} from '../runs/task-worker-pool'
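The port allocation scheme described in the file header (Worker N offsets each base port by N) can be stated as a tiny pure function. The base port numbers below are illustrative, and the real offset logic presumably lives inside `BrowserOSAppManager`:

```typescript
interface EvalPorts { cdp: number; server: number; extension: number }

// Worker N shifts every base port by N, matching the header comment's scheme.
function portsForWorker(base: EvalPorts, workerIndex: number): EvalPorts {
  return {
    cdp: base.cdp + workerIndex,
    server: base.server + workerIndex,
    extension: base.extension + workerIndex,
  }
}

console.log(portsForWorker({ cdp: 9222, server: 8000, extension: 7000 }, 2))
// { cdp: 9224, server: 8002, extension: 7002 }
```

This keeps each worker's Chrome, Server, and Extension endpoints disjoint as long as the worker count stays below the gap between any two base ports.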


@@ -1,316 +1,6 @@
import { join } from 'node:path'
import { createAgent } from '../agents'
import type { AgentContext, AgentResult } from '../agents/types'
import { CaptureContext } from '../capture/context'
import {
hasExistingGraderResults,
TrajectorySaver,
} from '../capture/trajectory-saver'
import { runGraders } from '../graders/registry'
import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
import { callMcpTool } from '../utils/mcp-client'
import { InfinityAppManager } from './infinity-app-manager'
import type { TaskResult } from './types'
// ============================================================================
// Errors
// ============================================================================
export class TaskExecutionError extends Error {
public readonly errorSource: ErrorSource
constructor(
message: string,
public readonly task: Task,
public readonly phase:
| 'navigation'
| 'agent_execution'
| 'grading'
| 'cleanup',
public readonly cause?: Error,
) {
super(message)
this.name = 'TaskExecutionError'
this.errorSource = phase as ErrorSource
}
}
// ============================================================================
// Task Executor
// ============================================================================
export interface TaskExecutorDeps {
onEvent?: (taskId: string, event: Record<string, unknown>) => void
}
export class TaskExecutor {
constructor(
private readonly config: EvalConfig,
private readonly workerIndex: number,
private readonly outputDir: string,
private readonly deps: TaskExecutorDeps,
) {}
/**
* Resolve the initial page ID via list_pages MCP call.
* Called once per task on a fresh browser — there's exactly one page.
*/
private async resolveInitialPageId(mcpUrl: string): Promise<number> {
try {
const result = await callMcpTool(mcpUrl, 'list_pages', {})
if (!result.isError) {
const textContent = result.content?.find(
(c: { type: string }) => c.type === 'text',
)
const match = textContent?.text?.match(/^\s*(\d+)\./m)
if (match) return Number.parseInt(match[1], 10)
}
} catch {
// Fall through to default
}
// Fresh browser always has page 1
return 1
}
async execute(task: Task): Promise<TaskResult> {
const startTime = Date.now()
const mcpUrl = `${this.config.browseros.server_url}/mcp`
// Check if task already has grader results (resume capability)
const existing = await hasExistingGraderResults(
this.outputDir,
task.query_id,
)
if (existing.exists && existing.metadata) {
console.log(` Skipping: already has grader results`)
return {
status:
existing.metadata.termination_reason === 'timeout'
? 'timeout'
: 'completed',
task,
agentResult: {
metadata: existing.metadata,
messages: [],
finalAnswer: existing.metadata.final_answer,
},
graderResults: existing.metadata.grader_results,
durationMs: existing.metadata.total_duration_ms,
}
}
// Resolve page ID once — fresh browser has exactly one page
const pageId = await this.resolveInitialPageId(mcpUrl)
// For Infinity tasks, start a fresh app server per task
let infinityManager: InfinityAppManager | null = null
let actualStartUrl = task.start_url
if (task.dataset === 'webarena-infinity') {
const appName = (task.metadata?.additional as Record<string, unknown>)
?.app_name as string
const appBasePort =
((task.metadata?.additional as Record<string, unknown>)
?.app_base_port as number) || 8000
if (appName && process.env.WEBARENA_INFINITY_DIR) {
infinityManager = new InfinityAppManager(this.workerIndex, appBasePort)
try {
actualStartUrl = await infinityManager.startApp(appName)
console.log(
` Infinity app "${appName}" started on port ${infinityManager.getPort()}`,
)
} catch (error) {
throw new TaskExecutionError(
`Failed to start Infinity app: ${error instanceof Error ? error.message : String(error)}`,
task,
'navigation',
error instanceof Error ? error : undefined,
)
}
}
}
try {
// Phase 1: Set viewport + navigate to start URL
try {
await callMcpTool(mcpUrl, 'evaluate_script', {
page: pageId,
expression: 'window.resizeTo(1440, 900)',
})
} catch (vpError) {
console.warn(
` Viewport resize failed: ${vpError instanceof Error ? vpError.message : String(vpError)}`,
)
}
if (actualStartUrl && actualStartUrl !== 'about:blank') {
try {
await callMcpTool(mcpUrl, 'navigate_page', {
url: actualStartUrl,
page: pageId,
})
} catch (error) {
throw new TaskExecutionError(
`Failed to navigate to start URL: ${error instanceof Error ? error.message : String(error)}`,
task,
'navigation',
error instanceof Error ? error : undefined,
)
}
}
// Phase 2: Execute agent
const agentResult = await this.executeAgent(task, pageId)
// Phase 3: Run graders
const graderResults = await this.runGraders(
task,
agentResult,
infinityManager?.getUrl(),
)
const status =
agentResult.metadata.termination_reason === 'timeout'
? 'timeout'
: 'completed'
return {
status,
task,
agentResult,
graderResults,
durationMs: Date.now() - startTime,
}
} catch (error) {
const errorSource: ErrorSource =
error instanceof TaskExecutionError ? error.errorSource : 'unknown'
return {
status: 'failed',
task,
error: error instanceof Error ? error : new Error(String(error)),
errorSource,
durationMs: Date.now() - startTime,
}
} finally {
// Navigate to about:blank to clean up
try {
await callMcpTool(mcpUrl, 'navigate_page', {
url: 'about:blank',
page: pageId,
})
} catch {
// Ignore cleanup errors
}
// Stop Infinity app server if running
if (infinityManager) {
await infinityManager.stop().catch(() => {})
}
}
}
private async executeAgent(task: Task, pageId: number): Promise<AgentResult> {
try {
const { capture, taskOutputDir } = await CaptureContext.create({
serverUrl: this.config.browseros.server_url,
outputDir: this.outputDir,
taskId: task.query_id,
initialPageId: pageId,
onEvent: this.deps.onEvent,
})
const context: AgentContext = {
config: this.config,
task,
workerIndex: this.workerIndex,
initialPageId: pageId,
outputDir: this.outputDir,
taskOutputDir,
capture,
}
const agent = createAgent(context)
return await agent.execute()
} catch (error) {
if (error instanceof TaskExecutionError) {
throw error
}
throw new TaskExecutionError(
`Agent execution failed: ${error instanceof Error ? error.message : String(error)}`,
task,
'agent_execution',
error instanceof Error ? error : undefined,
)
}
}
private async runGraders(
task: Task,
agentResult: AgentResult,
infinityAppUrl?: string,
): Promise<Record<string, GraderResult>> {
const configGraders = this.config.graders ?? []
const taskGraders = task.graders ?? []
const graderNames = configGraders.length > 0 ? configGraders : taskGraders
if (graderNames.length === 0) {
return {}
}
try {
const graderResults = await runGraders(graderNames, {
task: {
query_id: task.query_id,
query: task.query,
dataset: task.dataset,
},
messages: agentResult.messages,
screenshotCount:
agentResult.metadata.screenshot_count ??
agentResult.metadata.total_steps,
finalAnswer: agentResult.finalAnswer,
expectedAnswer: (task.metadata?.additional as Record<string, unknown>)
?.answer as string | undefined,
outputDir: join(this.outputDir, task.query_id),
mcpUrl: `${this.config.browseros.server_url}/mcp`,
infinityAppUrl,
})
try {
const saver = new TrajectorySaver(this.outputDir, task.query_id)
await saver.updateGraderResults(graderResults)
} catch (saveError) {
console.warn(
` Failed to persist grader results: ${saveError instanceof Error ? saveError.message : String(saveError)}`,
)
}
return graderResults
} catch (error) {
console.warn(
` Grading failed: ${error instanceof Error ? error.message : String(error)}`,
)
return {
_error: {
score: 0,
pass: false,
reasoning: `Grading failed: ${error instanceof Error ? error.message : String(error)}`,
},
}
}
}
}
// ============================================================================
// Factory
// ============================================================================
export function createTaskExecutor(
config: EvalConfig,
workerIndex: number,
outputDir: string,
onEvent?: (taskId: string, event: Record<string, unknown>) => void,
): TaskExecutor {
return new TaskExecutor(config, workerIndex, outputDir, { onEvent })
}
export {
createTaskRunPipeline as createTaskExecutor,
TaskExecutionError,
TaskRunPipeline as TaskExecutor,
type TaskRunPipelineDeps as TaskExecutorDeps,
} from '../runs/task-run-pipeline'
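`resolveInitialPageId` above parses the page id out of the plain-text response from the `list_pages` MCP tool, taking the first line that starts with `N.`. A quick sketch of that parse in isolation; the sample response text here is an assumption about the tool's output format:

```typescript
// Hypothetical list_pages text output (the real tool's formatting may differ).
const sample = 'Open pages:\n 1. about:blank [selected]\n 2. https://example.com'

// Same pattern as resolveInitialPageId: the first "N." at a line start wins.
function parseFirstPageId(text: string): number {
  const match = text.match(/^\s*(\d+)\./m)
  return match ? Number.parseInt(match[1], 10) : 1 // a fresh browser defaults to page 1
}

console.log(parseFirstPageId(sample)) // 1
```

The `m` flag makes `^` match at every line start, which is what lets the pattern skip the "Open pages:" header line.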


@@ -8,12 +8,18 @@ import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
export interface RunEvalOptions {
configPath: string
config?: EvalConfig
dataPath?: string
query?: string
startUrl?: string
outputDir?: string
}
export interface RunEvalResult {
outputDir: string
summary: BatchSummary
}
// ============================================================================
// Task Loading
// ============================================================================


@@ -0,0 +1,46 @@
import { join } from 'node:path'
function timestamp(date: Date): string {
const y = date.getUTCFullYear()
const m = String(date.getUTCMonth() + 1).padStart(2, '0')
const d = String(date.getUTCDate()).padStart(2, '0')
const h = String(date.getUTCHours()).padStart(2, '0')
const min = String(date.getUTCMinutes()).padStart(2, '0')
return `${y}-${m}-${d}-${h}${min}`
}
function safeSegment(value: string): string {
return value
.toLowerCase()
.replace(/[^a-z0-9._-]+/g, '-')
.replace(/^-+|-+$/g, '')
}
/** Creates a path-safe run id from suite/config, variant, and time. */
export function createRunId(
suiteId: string,
variantId: string,
date = new Date(),
): string {
return `${safeSegment(suiteId)}__${safeSegment(variantId)}__${timestamp(date)}`
}
export function getRunPaths(baseDir: string, runId: string, taskId?: string) {
const runDir = join(baseDir, 'runs', runId)
const taskDir = taskId ? join(runDir, 'tasks', taskId) : undefined
return {
runDir,
runManifest: join(runDir, 'run.json'),
summary: join(runDir, 'summary.json'),
viewerManifest: join(runDir, 'viewer-manifest.json'),
uploadManifest: join(runDir, 'upload-manifest.json'),
taskDir,
attempt: taskDir ? join(taskDir, 'attempt.json') : undefined,
trace: taskDir ? join(taskDir, 'trace.jsonl') : undefined,
messages: taskDir ? join(taskDir, 'messages.jsonl') : undefined,
grades: taskDir ? join(taskDir, 'grades.json') : undefined,
graderArtifacts: taskDir ? join(taskDir, 'grader-artifacts') : undefined,
screenshots: taskDir ? join(taskDir, 'screenshots') : undefined,
}
}
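Given the helpers above, a run id is fully deterministic for a fixed clock: `suite__variant__YYYY-MM-DD-HHMM` in UTC, with each segment sanitized. A quick sketch (the suite and variant names are made up; `safeSegment` and `timestamp` are reproduced from above so the snippet is self-contained):

```typescript
function safeSegment(value: string): string {
  return value
    .toLowerCase()
    .replace(/[^a-z0-9._-]+/g, '-') // collapse each run of disallowed chars to one dash
    .replace(/^-+|-+$/g, '') // trim leading/trailing dashes
}

function timestamp(date: Date): string {
  const y = date.getUTCFullYear()
  const m = String(date.getUTCMonth() + 1).padStart(2, '0')
  const d = String(date.getUTCDate()).padStart(2, '0')
  const h = String(date.getUTCHours()).padStart(2, '0')
  const min = String(date.getUTCMinutes()).padStart(2, '0')
  return `${y}-${m}-${d}-${h}${min}`
}

// Hypothetical suite/variant names, pinned to a fixed UTC instant.
const runId = `${safeSegment('AGISDK Smoke')}__${safeSegment('GPT/4o')}__${timestamp(new Date(Date.UTC(2026, 3, 29, 22, 45)))}`
console.log(runId) // agisdk-smoke__gpt-4o__2026-04-29-2245
```

Because the sanitized segments can only contain `[a-z0-9._-]` and the separator is a double underscore, the run id splits back into its three parts unambiguously and is safe to use as a directory name under `runs/`.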

Some files were not shown because too many files have changed in this diff.