Compare commits

..

1 Commits

Author SHA1 Message Date
Nikhil Sonti
1f504702ec feat(build-tools): add cache:sync:dev for local tarball seeding
Seeds ~/.browseros-dev/cache/vm/ from ./dist/ without touching R2, so
devs can test the server against a freshly-built tarball before anything
is published to cdn.browseros.com. Hardcodes arm64 since all devs are on
Apple Silicon; refuses to run unless NODE_ENV=development; idempotent
(skips copy on sha256 match).

Also fixes the R2_BUCKET default in .env.sample from browseros-artifacts
to browseros to match the actual bucket.
2026-04-23 10:30:04 -07:00
401 changed files with 19153 additions and 33104 deletions

View File

@@ -30,9 +30,8 @@ jobs:
- name: Install BrowserOS
run: |
# Rolling stable channel — see https://cdn.browseros.com/download/BrowserOS.deb
wget -q -O BrowserOS.deb https://cdn.browseros.com/download/BrowserOS.deb
sudo dpkg -i BrowserOS.deb
wget -q https://github.com/browseros-ai/BrowserOS/releases/download/v0.44.0.1/BrowserOS_v0.44.0.1_amd64.deb
sudo dpkg -i BrowserOS_v0.44.0.1_amd64.deb
browseros --version || echo "BrowserOS installed at $(which browseros)"
- name: Install Bun
@@ -42,15 +41,7 @@ jobs:
- name: Install dependencies
working-directory: packages/browseros-agent
run: bun install --ignore-scripts
- name: Install Python eval dependencies
# agisdk pinned so silent upstream releases can't shift task definitions
# or grader behavior. Bump intentionally with a documented re-baseline.
run: pip install agisdk==0.3.5 requests
- name: Clone WebArena-Infinity
run: git clone --depth 1 https://github.com/web-arena-x/webarena-infinity.git /tmp/webarena-infinity
run: bun install --ignore-scripts && bun run build:agent-sdk
- name: Install xvfb
run: sudo apt-get update && sudo apt-get install -y xvfb
@@ -66,14 +57,9 @@ jobs:
working-directory: packages/browseros-agent/apps/eval
env:
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
BROWSEROS_BINARY: /usr/bin/browseros
WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
# OpenClaw container runtime is macOS-only; opt the Linux runner
# into the no-op stub so the server can boot and the eval can run.
BROWSEROS_SKIP_OPENCLAW: '1'
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
run: |
echo "Running eval with config: $EVAL_CONFIG"
@@ -95,8 +81,6 @@ jobs:
- name: Generate trend report
if: success()
timeout-minutes: 5
continue-on-error: true
working-directory: packages/browseros-agent
env:
EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
@@ -112,11 +96,3 @@ jobs:
with:
name: eval-report-${{ github.run_id }}
path: /tmp/eval-report.html
- name: Upload server stderr logs (for post-mortem on startup failures)
if: always()
uses: actions/upload-artifact@v4
with:
name: browseros-server-logs-${{ github.run_id }}
path: /tmp/browseros-server-logs/
if-no-files-found: ignore

View File

@@ -1,11 +1,168 @@
name: Release BrowserOS Agent SDK (disabled)
name: Release BrowserOS Agent SDK
on:
workflow_dispatch:
concurrency:
group: release-agent-sdk
cancel-in-progress: false
jobs:
disabled:
if: ${{ false }}
publish:
if: github.ref == 'refs/heads/main'
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write
defaults:
run:
working-directory: packages/browseros-agent/packages/agent-sdk
steps:
- run: echo "Agent SDK publishing is disabled."
- uses: actions/checkout@v6
with:
fetch-depth: 0
- uses: oven-sh/setup-bun@v2
- uses: actions/setup-node@v6
with:
node-version: "20"
registry-url: "https://registry.npmjs.org"
- name: Install dependencies
run: bun ci
working-directory: packages/browseros-agent
- name: Build
run: bun run build
- name: Test
run: bun test
- name: Get version
id: version
run: |
echo "version=$(node -p "require('./package.json').version")" >> "$GITHUB_OUTPUT"
echo "release_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
- name: Generate release notes
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
SDK_PATH="packages/browseros-agent/packages/agent-sdk"
CURRENT_TAG="agent-sdk-v${{ steps.version.outputs.version }}"
# Find the previous tag, excluding the current version's tag
# (which may already exist from a prior failed run)
PREV_TAG=$(git tag -l "agent-sdk-v*" --sort=-v:refname | grep -v "^${CURRENT_TAG}$" | head -n 1)
if [ -z "$PREV_TAG" ]; then
echo "Initial release" > /tmp/release-notes.md
else
# Get commits scoped to the SDK directory
COMMITS=$(git log "$PREV_TAG"..HEAD --pretty=format:"%H" -- "$SDK_PATH")
if [ -z "$COMMITS" ]; then
echo "No notable changes." > /tmp/release-notes.md
else
echo "## What's Changed" > /tmp/release-notes.md
echo "" >> /tmp/release-notes.md
# For each commit, find the associated PR and format with author
CONTRIBUTORS=""
while IFS= read -r SHA; do
# Get commit subject and author
SUBJECT=$(git log -1 --pretty=format:"%s" "$SHA")
AUTHOR=$(git log -1 --pretty=format:"%an" "$SHA")
GITHUB_USER=$(gh api "/repos/${{ github.repository }}/commits/${SHA}" --jq '.author.login // empty' 2>/dev/null)
# Find associated PR number
PR_NUM=$(gh api "/repos/${{ github.repository }}/commits/${SHA}/pulls" --jq '.[0].number // empty' 2>/dev/null)
# Format line: skip PR number if already in the commit subject
# (squash merges include "(#123)" in the subject automatically)
if [ -n "$PR_NUM" ] && ! echo "$SUBJECT" | grep -qF "(#${PR_NUM})"; then
echo "- ${SUBJECT} (#${PR_NUM})" >> /tmp/release-notes.md
else
echo "- ${SUBJECT}" >> /tmp/release-notes.md
fi
done <<< "$COMMITS"
fi
fi
working-directory: ${{ github.workspace }}
- name: Publish
run: npm publish --access public
env:
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
- name: Create GitHub release
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
TAG="agent-sdk-v${{ steps.version.outputs.version }}"
RELEASE_SHA="${{ steps.version.outputs.release_sha }}"
TITLE="BrowserOS Agent SDK - v${{ steps.version.outputs.version }}"
# Create or reuse tag (idempotent for re-runs)
if git rev-parse "$TAG" >/dev/null 2>&1; then
echo "Tag $TAG already exists, skipping tag creation"
else
git tag "$TAG" "$RELEASE_SHA"
fi
# Push tag (skip if already on remote)
if git ls-remote --tags origin "$TAG" | grep -q "$TAG"; then
echo "Tag $TAG already on remote, skipping push"
else
git push origin "$TAG"
fi
# Create or update release
if gh release view "$TAG" >/dev/null 2>&1; then
echo "Release $TAG already exists, updating"
gh release edit "$TAG" --title "$TITLE" --notes-file /tmp/release-notes.md
else
gh release create "$TAG" --title "$TITLE" --notes-file /tmp/release-notes.md
fi
working-directory: ${{ github.workspace }}
- name: Update CHANGELOG.md via PR
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
VERSION="${{ steps.version.outputs.version }}"
DATE=$(date -u +"%Y-%m-%d")
BRANCH="docs/agent-sdk-changelog-v${VERSION}"
CHANGELOG="packages/browseros-agent/packages/agent-sdk/CHANGELOG.md"
# Return to main before branching
git checkout main
# Use head/tail to safely insert without sed quoting issues
{
head -n 1 "$CHANGELOG"
echo ""
echo "## v${VERSION} (${DATE})"
echo ""
cat /tmp/release-notes.md
echo ""
tail -n +2 "$CHANGELOG"
} > /tmp/new-changelog.md
mv /tmp/new-changelog.md "$CHANGELOG"
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
git checkout -b "$BRANCH"
git add "$CHANGELOG"
git commit -m "docs: update agent-sdk changelog for v${VERSION}"
git push origin "$BRANCH"
gh pr create \
--title "docs: update agent-sdk changelog for v${VERSION}" \
--body "Auto-generated changelog update for BrowserOS Agent SDK v${VERSION}." \
--base main \
--head "$BRANCH"
gh pr merge "$BRANCH" --squash --auto || true
working-directory: ${{ github.workspace }}

View File

@@ -54,10 +54,10 @@ jobs:
command: (cd apps/server && bun run test:integration)
junit_path: test-results/server-integration.xml
needs_browser: true
- suite: server-lib
command: (cd apps/server && bun run test:lib)
junit_path: test-results/server-lib.xml
needs_browser: false
- suite: server-sdk
command: (cd apps/server && bun run test:sdk)
junit_path: test-results/server-sdk.xml
needs_browser: true
- suite: server-root
command: (cd apps/server && bun run test:root)
junit_path: test-results/server-root.xml
@@ -70,6 +70,10 @@ jobs:
command: bun run test:eval
junit_path: test-results/eval.xml
needs_browser: false
- suite: agent-sdk
command: bun run test:agent-sdk
junit_path: test-results/agent-sdk.xml
needs_browser: false
- suite: build
command: bun run test:build
junit_path: test-results/build.xml

View File

@@ -180,7 +180,6 @@ packages/*/dist
browseros-server
browseros-server.exe
browseros-server-*
tools/dogfood/browseros-dogfood
tools/dev/browseros-dev
log.txt

View File

@@ -1,6 +1,6 @@
# BrowserOS Agent
The agent platform powering [BrowserOS](https://github.com/browseros-ai/BrowserOS) — contains the MCP server, agent UI, CLI, and evaluation framework.
The agent platform powering [BrowserOS](https://github.com/browseros-ai/BrowserOS) — contains the MCP server, agent UI, CLI, evaluation framework, and SDK.
## Monorepo Structure
@@ -12,6 +12,7 @@ apps/
eval/ # Evaluation framework for benchmarking agents
packages/
agent-sdk/ # Node.js SDK (@browseros-ai/agent-sdk)
cdp-protocol/ # Type-safe Chrome DevTools Protocol bindings
shared/ # Shared constants (ports, timeouts, limits)
```
@@ -22,6 +23,7 @@ packages/
| `apps/agent` | Agent UI — Chrome extension for the chat interface |
| `apps/cli` | Go CLI — control BrowserOS from the terminal or AI coding agents |
| `apps/eval` | Benchmark framework — WebVoyager, Mind2Web evaluation |
| `packages/agent-sdk` | Node.js SDK for browser automation with natural language |
| `packages/cdp-protocol` | Auto-generated CDP type bindings used by the server |
| `packages/shared` | Shared constants used across packages |
@@ -73,20 +75,26 @@ packages/
### Setup
Requires [process-compose](https://github.com/F1bonacc1/process-compose):
```bash
brew install process-compose
```
```bash
# Copy environment files for each package
cp apps/server/.env.example apps/server/.env.development
cp apps/agent/.env.example apps/agent/.env.development
cp apps/server/.env.production.example apps/server/.env.production
# Install deps, generate agent code, and sync the VM cache
bun run dev:setup
# Start the full dev environment
bun run dev:watch
process-compose up
```
`dev:watch` exits when the VM cache manifest is missing, but setup stays in `dev:setup`.
The `process-compose up` command runs the following in order:
1. `bun install` — installs dependencies
2. `bun --cwd apps/agent codegen` — generates agent code
3. `bun --cwd apps/server start` and `bun --cwd apps/agent dev` — starts server and agent in parallel
### Environment Variables

View File

@@ -1,39 +0,0 @@
import type { Provider } from './chatComponentTypes'
export interface ProviderOptionGroup {
key: 'llm' | 'acp'
label: string
options: Provider[]
}
export function groupProviderOptions(
providers: Provider[],
): ProviderOptionGroup[] {
const llm = providers.filter((provider) => provider.kind !== 'acp')
const acp = providers.filter((provider) => provider.kind === 'acp')
return [
...(llm.length
? [{ key: 'llm' as const, label: 'AI Providers', options: llm }]
: []),
...(acp.length
? [{ key: 'acp' as const, label: 'ACP Models', options: acp }]
: []),
]
}
export function getProviderSearchValue(
provider: Provider,
groupLabel: string,
): string {
return [provider.id, provider.name, provider.type, groupLabel]
.filter(Boolean)
.join(' ')
}
export function getProviderSubtitle(provider: Provider): string | undefined {
if (provider.kind !== 'acp') return undefined
return provider.modelControl === 'best-effort'
? 'ACP model · best effort'
: 'ACP model'
}

View File

@@ -1,67 +0,0 @@
import { describe, expect, it } from 'bun:test'
import {
getProviderSearchValue,
getProviderSubtitle,
groupProviderOptions,
} from './ChatProviderSelector.helpers'
import type { Provider } from './chatComponentTypes'
const options: Provider[] = [
{ kind: 'llm', id: 'browseros', name: 'BrowserOS', type: 'browseros' },
{
kind: 'llm',
id: 'anthropic-sonnet',
name: 'Anthropic Sonnet',
type: 'anthropic',
},
{
kind: 'acp',
id: 'acp:claude:haiku:medium',
name: 'Claude Code Haiku',
type: 'acp',
modelControl: 'best-effort',
},
{
kind: 'acp',
id: 'acp:codex:gpt-5.5:medium',
name: 'Codex GPT-5.5',
type: 'acp',
modelControl: 'runtime-supported',
},
]
describe('groupProviderOptions', () => {
it('groups normal providers separately from ACP models', () => {
expect(groupProviderOptions(options)).toEqual([
{
key: 'llm',
label: 'AI Providers',
options: [options[0], options[1]],
},
{
key: 'acp',
label: 'ACP Models',
options: [options[2], options[3]],
},
])
})
})
describe('getProviderSearchValue', () => {
it('matches ACP group labels and item labels', () => {
expect(getProviderSearchValue(options[2], 'ACP Models')).toContain(
'ACP Models',
)
expect(getProviderSearchValue(options[2], 'ACP Models')).toContain(
'Claude Code Haiku',
)
})
})
describe('getProviderSubtitle', () => {
it('does not present best-effort ACP models as guaranteed routing', () => {
expect(getProviderSubtitle(options[2])).toBe('ACP model · best effort')
expect(getProviderSubtitle(options[3])).toBe('ACP model')
expect(getProviderSubtitle(options[0])).toBeUndefined()
})
})

View File

@@ -1,4 +1,4 @@
import { Bot, Check, Plus } from 'lucide-react'
import { Check, Plus } from 'lucide-react'
import type { FC, PropsWithChildren } from 'react'
import { useState } from 'react'
import {
@@ -17,11 +17,6 @@ import {
import { BrowserOSIcon, ProviderIcon } from '@/lib/llm-providers/providerIcons'
import type { ProviderType } from '@/lib/llm-providers/types'
import { cn } from '@/lib/utils'
import {
getProviderSearchValue,
getProviderSubtitle,
groupProviderOptions,
} from './ChatProviderSelector.helpers'
import type { Provider } from './chatComponentTypes'
interface ChatProviderSelectorProps {
@@ -34,55 +29,54 @@ export const ChatProviderSelector: FC<
PropsWithChildren<ChatProviderSelectorProps>
> = ({ children, providers, selectedProvider, onSelectProvider }) => {
const [open, setOpen] = useState(false)
const groups = groupProviderOptions(providers)
return (
<Popover open={open} onOpenChange={setOpen}>
<PopoverTrigger asChild>{children}</PopoverTrigger>
<PopoverContent side="bottom" align="start" className="w-64 p-0">
<PopoverContent side="bottom" align="start" className="w-48 p-0">
<Command>
<CommandInput placeholder="Search models..." className="h-9" />
<CommandInput placeholder="Search providers..." className="h-9" />
<CommandList>
<div className="my-2 px-2 font-semibold text-muted-foreground text-xs uppercase tracking-wide">
AI Provider
</div>
<CommandEmpty>No provider found</CommandEmpty>
{groups.map((group) => (
<CommandGroup key={group.key} heading={group.label}>
{group.options.map((provider) => {
const isSelected = selectedProvider.id === provider.id
const subtitle = getProviderSubtitle(provider)
return (
<CommandItem
key={provider.id}
value={getProviderSearchValue(provider, group.label)}
onSelect={() => {
onSelectProvider(provider)
setOpen(false)
}}
className={cn(
'flex w-full items-center gap-3 rounded-md p-2 transition-colors',
isSelected && 'bg-[var(--accent-orange)]/10',
<CommandGroup>
{providers.map((provider) => {
const isSelected = selectedProvider.id === provider.id
return (
<CommandItem
key={provider.id}
value={`${provider.id} ${provider.name}`}
onSelect={() => {
onSelectProvider(provider)
setOpen(false)
}}
className={cn(
'flex w-full items-center gap-3 rounded-md p-2 transition-colors',
isSelected && 'bg-[var(--accent-orange)]/10',
)}
>
<span className="text-muted-foreground">
{provider.type === 'browseros' ? (
<BrowserOSIcon size={18} />
) : (
<ProviderIcon
type={provider.type as ProviderType}
size={18}
/>
)}
>
<span className="text-muted-foreground">
<ProviderOptionIcon provider={provider} />
</span>
<span className="min-w-0 flex-1 text-left">
<span className="block truncate text-sm">
{provider.name}
</span>
{subtitle && (
<span className="block truncate text-muted-foreground text-xs">
{subtitle}
</span>
)}
</span>
{isSelected && (
<Check className="h-3.5 w-3.5 text-[var(--accent-orange)]" />
)}
</CommandItem>
)
})}
</CommandGroup>
))}
</span>
<span className="flex-1 text-left text-sm">
{provider.name}
</span>
{isSelected && (
<Check className="h-3.5 w-3.5 text-[var(--accent-orange)]" />
)}
</CommandItem>
)
})}
</CommandGroup>
<div className="border-border border-t p-1">
<button
type="button"
@@ -102,9 +96,3 @@ export const ChatProviderSelector: FC<
</Popover>
)
}
function ProviderOptionIcon({ provider }: { provider: Provider }) {
if (provider.kind === 'acp') return <Bot size={18} />
if (provider.type === 'browseros') return <BrowserOSIcon size={18} />
return <ProviderIcon type={provider.type as ProviderType} size={18} />
}

View File

@@ -1,11 +1,7 @@
import type { ProviderType } from '@/lib/llm-providers/types'
export type ChatProviderType = ProviderType | 'acp'
export interface Provider {
id: string
name: string
type: ChatProviderType
kind: 'llm' | 'acp'
modelControl?: 'runtime-supported' | 'best-effort'
type: ProviderType
}

View File

@@ -74,18 +74,6 @@ const primaryNavItems: NavItem[] = [
{ name: 'Settings', to: '/settings/ai', icon: Settings },
]
function isNavItemActive(item: NavItem, pathname: string): boolean {
if (item.to === '/settings/ai') {
return pathname.startsWith('/settings')
}
if (item.to === '/agents') {
return pathname === '/agents' || pathname.startsWith('/agents/')
}
return pathname === item.to
}
export const SidebarNavigation: FC<SidebarNavigationProps> = ({
expanded = true,
}) => {
@@ -102,7 +90,10 @@ export const SidebarNavigation: FC<SidebarNavigationProps> = ({
<nav className="space-y-1">
{filteredItems.map((item) => {
const Icon = item.icon
const isActive = isNavItemActive(item, location.pathname)
const isActive =
item.to === '/settings/ai'
? location.pathname.startsWith('/settings')
: location.pathname === item.to
const navItem = (
<NavLink

View File

@@ -113,22 +113,7 @@ export const App: FC = () => {
<Route path="connect-apps" element={<ConnectMCP />} />
<Route path="scheduled" element={<ScheduledTasksPage />} />
{alphaEnabled ? (
<>
<Route path="agents" element={<AgentsPage />} />
<Route element={<AgentCommandLayout />}>
<Route
path="agents/:agentId"
element={
<AgentCommandConversation
variant="page"
backPath="/agents"
agentPathPrefix="/agents"
createAgentPath="/agents"
/>
}
/>
</Route>
</>
<Route path="agents" element={<AgentsPage />} />
) : null}
{alphaEnabled ? (
<Route path="admin" element={<AdminDashboardPage />} />

View File

@@ -1,4 +1,4 @@
import { Bot, Loader2, Wrench } from 'lucide-react'
import { Bot } from 'lucide-react'
import type { FC } from 'react'
import type { AgentCardData } from '@/lib/agent-conversations/types'
import { cn } from '@/lib/utils'
@@ -32,11 +32,6 @@ function getStatusTone(status: AgentCardData['status']): string {
return 'bg-emerald-500'
}
function formatCost(usd: number): string {
if (usd < 0.005) return `$${usd.toFixed(4)}`
return `$${usd.toFixed(2)}`
}
export const AgentCardExpanded: FC<AgentCardProps> = ({
agent,
onClick,
@@ -86,26 +81,9 @@ export const AgentCardExpanded: FC<AgentCardProps> = ({
</p>
</div>
<div className="mt-4 space-y-1.5 text-muted-foreground text-xs">
<div className="flex items-center justify-between gap-3">
<span>{formatTimestamp(agent.lastMessageTimestamp)}</span>
{agent.costUsd ? (
<span className="tabular-nums opacity-70">
{formatCost(agent.costUsd)}
</span>
) : null}
</div>
{agent.status === 'working' && agent.currentTool ? (
<div className="flex items-center gap-1.5 text-[var(--accent-orange)]/70">
<Loader2 className="size-3 shrink-0 animate-spin" />
<span className="truncate">{agent.currentTool}</span>
</div>
) : agent.activitySummary ? (
<div className="flex items-center gap-1.5 text-muted-foreground/60">
<Wrench className="size-3 shrink-0" />
<span className="truncate">{agent.activitySummary}</span>
</div>
) : null}
<div className="mt-4 flex items-center justify-between gap-3 text-muted-foreground text-xs">
<span>{formatTimestamp(agent.lastMessageTimestamp)}</span>
<span>Open conversation</span>
</div>
</button>
)

View File

@@ -1,308 +1,190 @@
import { ArrowLeft, Bot, Home } from 'lucide-react'
import { type FC, useEffect, useMemo, useRef } from 'react'
import { Bot, Home, RotateCcw } from 'lucide-react'
import { type FC, useEffect, useRef } from 'react'
import { Navigate, useNavigate, useParams, useSearchParams } from 'react-router'
import { Button } from '@/components/ui/button'
import {
type AgentEntry,
getModelDisplayName,
} from '@/entrypoints/app/agents/useOpenClaw'
import type { AgentEntry } from '@/entrypoints/app/agents/useOpenClaw'
import { cn } from '@/lib/utils'
import { useAgentCommandData } from './agent-command-layout'
import { ClawChat } from './ClawChat'
import { ConversationInput } from './ConversationInput'
import {
buildChatHistoryFromClawMessages,
filterTurnsPersistedInHistory,
flattenHistoryPages,
} from './claw-chat-types'
import { ConversationMessage } from './ConversationMessage'
import { useAgentConversation } from './useAgentConversation'
import { useHarnessChatHistory } from './useHarnessChatHistory'
function StatusBadge({ status }: { status: string }) {
return (
<div className="inline-flex items-center gap-2 rounded-full border border-border/60 bg-card px-3 py-1 text-[11px] text-muted-foreground uppercase tracking-[0.18em]">
<span
className={cn(
'size-1.5 rounded-full',
status === 'Working on your request'
? 'bg-amber-500'
: status === 'Ready'
? 'bg-emerald-500'
: status === 'Offline'
? 'bg-muted-foreground/50'
: 'bg-[var(--accent-orange)]',
)}
/>
<span>{status}</span>
</div>
)
}
function AgentIdentity({
name,
meta,
className,
}: {
name: string
meta: string
className?: string
}) {
return (
<div className={cn('min-w-0', className)}>
<div className="truncate font-semibold text-[15px] leading-5">{name}</div>
<div className="truncate text-muted-foreground text-xs leading-5">
{meta}
</div>
</div>
)
}
function ConversationHeader({
agentName,
agentMeta,
status,
backLabel,
backTarget,
onGoHome,
onReset,
}: {
agentName: string
agentMeta: string
status: string
backLabel: string
backTarget: 'home' | 'page'
onGoHome: () => void
onReset: () => void
}) {
const BackIcon = backTarget === 'home' ? Home : ArrowLeft
return (
<div className="flex h-14 items-center justify-between gap-4 border-border/50 border-b px-5">
<div className="flex min-w-0 items-center gap-3">
<div className="overflow-hidden rounded-[1.5rem] border border-border/60 bg-card/95 shadow-sm backdrop-blur">
<div className="flex items-center justify-between gap-3 px-5 py-4">
<div className="flex min-w-0 items-center gap-3">
<Button
variant="ghost"
size="icon"
onClick={onGoHome}
className="rounded-xl"
title="Back to home"
>
<Home className="size-4" />
</Button>
<div className="flex size-11 shrink-0 items-center justify-center rounded-2xl bg-muted text-muted-foreground">
<Bot className="size-5" />
</div>
<div className="min-w-0">
<div className="truncate font-semibold text-sm">{agentName}</div>
<div className="truncate text-muted-foreground text-sm">
{status}
</div>
</div>
</div>
<Button
variant="ghost"
size="icon"
onClick={onGoHome}
className="size-8 rounded-xl lg:hidden"
title={backLabel}
size="sm"
onClick={onReset}
className="rounded-xl text-muted-foreground"
>
<BackIcon className="size-4" />
<RotateCcw className="mr-2 size-4" />
New conversation
</Button>
<div className="flex size-8 shrink-0 items-center justify-center rounded-xl bg-muted text-muted-foreground">
<Bot className="size-4" />
</div>
<AgentIdentity name={agentName} meta={agentMeta} />
</div>
<StatusBadge status={status} />
</div>
)
}
function AgentRailHeader({ onGoHome }: { onGoHome: () => void }) {
return (
<div className="hidden h-14 items-center border-border/50 border-r border-b bg-background/70 px-4 lg:flex">
<div className="flex min-w-0 items-center gap-3">
<Button
variant="ghost"
size="icon"
onClick={onGoHome}
className="size-8 rounded-xl"
title="Back to home"
>
<ArrowLeft className="size-4" />
</Button>
<div className="truncate font-semibold text-[15px] leading-5">
Agents
</div>
</div>
</div>
)
}
function AgentRailList({
activeAgentId,
agents,
onSelectAgent,
}: {
activeAgentId: string
agents: AgentEntry[]
onSelectAgent: (entry: AgentEntry) => void
}) {
function EmptyConversationState({ agentName }: { agentName: string }) {
return (
<aside className="hidden min-h-0 flex-col border-border/50 border-r bg-background/70 lg:flex">
<div className="styled-scrollbar min-h-0 flex-1 space-y-2 overflow-y-auto px-3 py-3">
{agents.map((entry) => {
const active = entry.agentId === activeAgentId
const modelName = getAgentEntryMeta(entry)
return (
<button
key={entry.agentId}
type="button"
onClick={() => onSelectAgent(entry)}
className={cn(
'w-full rounded-2xl border px-3 py-3 text-left transition-all',
active
? 'border-[var(--accent-orange)]/30 bg-[var(--accent-orange)]/8 shadow-sm'
: 'border-transparent bg-transparent hover:border-border/60 hover:bg-card',
)}
>
<div className="flex items-center gap-3">
<div
className={cn(
'flex size-9 items-center justify-center rounded-xl',
active
? 'bg-[var(--accent-orange)]/12 text-[var(--accent-orange)]'
: 'bg-muted text-muted-foreground',
)}
>
<Bot className="size-4" />
</div>
<AgentIdentity name={entry.name} meta={modelName} />
</div>
</button>
)
})}
<div className="flex min-h-full items-center justify-center py-10">
<div className="max-w-md rounded-[1.5rem] border border-border/60 bg-card/90 px-8 py-10 text-center shadow-sm backdrop-blur">
<div className="mx-auto flex size-14 items-center justify-center rounded-2xl bg-muted text-muted-foreground">
<Bot className="size-6" />
</div>
<h2 className="mt-4 font-semibold text-lg">{agentName}</h2>
<p className="mt-2 text-muted-foreground text-sm">
Send a message to start a focused conversation with this agent.
</p>
</div>
</aside>
</div>
)
}
function getAgentEntryMeta(agent: AgentEntry | undefined): string {
if (agent?.source === 'agent-harness') {
return getModelDisplayName(agent.model) ?? 'ACP agent'
}
return getModelDisplayName(agent?.model) ?? 'OpenClaw agent'
function getConversationStatusCopy(
status: string | undefined,
streaming: boolean,
): string {
if (streaming) return 'Working on your request'
if (status === 'running') return 'Ready for the next task'
if (status === 'starting') return 'Connecting to OpenClaw'
if (status === 'error') return 'OpenClaw needs attention'
if (status === 'stopped') return 'OpenClaw is offline'
return 'Open agent setup to continue'
}
function AgentConversationController({
agentId,
initialMessage,
onInitialMessageConsumed,
agents,
agentPathPrefix,
createAgentPath,
}: {
agentId: string
initialMessage: string | null
onInitialMessageConsumed: () => void
agents: AgentEntry[]
agentPathPrefix: string
createAgentPath: string
}) {
export const AgentCommandConversation: FC = () => {
const { agentId } = useParams<{ agentId: string }>()
const [searchParams, setSearchParams] = useSearchParams()
const navigate = useNavigate()
const initialMessageSentRef = useRef<string | null>(null)
const onInitialMessageConsumedRef = useRef(onInitialMessageConsumed)
const agent = agents.find((entry) => entry.agentId === agentId)
const agentName = agent?.name || agentId || 'Agent'
// Routing is now harness-only. Every OpenClaw agent has a harness
// record post the gateway → harness backfill, so the chat panel
// always talks to /agents/<id>/chat. The legacy ClawChat surface
// was deleted with the /claw/agents/:id/chat server route.
const harnessHistoryQuery = useHarnessChatHistory(agentId, Boolean(agent))
const historyMessages = useMemo(
() =>
flattenHistoryPages(
harnessHistoryQuery.data ? [harnessHistoryQuery.data] : [],
),
[harnessHistoryQuery.data],
)
const chatHistory = useMemo(
() => buildChatHistoryFromClawMessages(historyMessages),
[historyMessages],
)
const { turns, streaming, send } = useAgentConversation(agentId, {
runtime: 'agent-harness',
sessionKey: null,
history: chatHistory,
onComplete: () => {
void harnessHistoryQuery.refetch()
},
onSessionKeyChange: () => {},
})
const visibleTurns = useMemo(
() => filterTurnsPersistedInHistory(turns, historyMessages),
[historyMessages, turns],
)
onInitialMessageConsumedRef.current = onInitialMessageConsumed
const disabled = !agent
const historyReady =
harnessHistoryQuery.isFetched || harnessHistoryQuery.isError
const initialMessageKey = initialMessage
? `${agentId}:${initialMessage}`
: null
const error = harnessHistoryQuery.error ?? null
const sendRef = useRef(send)
sendRef.current = send
const scrollRef = useRef<HTMLDivElement>(null)
const initialQuerySent = useRef(false)
const { status, agents } = useAgentCommandData()
const shouldRedirectHome = !agentId
const resolvedAgentId = agentId ?? ''
const agent = agents.find((entry) => entry.agentId === resolvedAgentId)
const agentName = agent?.name || resolvedAgentId || 'Agent'
const { turns, streaming, loading, send, resetConversation } =
useAgentConversation(resolvedAgentId, agentName)
const lastTurn = turns[turns.length - 1]
const lastTurnPartCount = lastTurn?.parts.length ?? 0
useEffect(() => {
const query = initialMessage?.trim()
if (!initialMessageKey) {
initialMessageSentRef.current = null
return
}
if (shouldRedirectHome) return
const query = searchParams.get('q')
if (query && !initialQuerySent.current && !loading) {
initialQuerySent.current = true
setSearchParams({}, { replace: true })
void send(query)
}
}, [loading, searchParams, send, setSearchParams, shouldRedirectHome])
useEffect(() => {
if (
!query ||
initialMessageSentRef.current === initialMessageKey ||
disabled ||
!historyReady
shouldRedirectHome ||
(turns.length === 0 && lastTurnPartCount === 0 && !streaming)
) {
return
}
initialMessageSentRef.current = initialMessageKey
onInitialMessageConsumedRef.current()
void sendRef.current({ text: query })
}, [disabled, historyReady, initialMessage, initialMessageKey])
scrollRef.current?.scrollTo({
top: scrollRef.current.scrollHeight,
behavior: 'smooth',
})
}, [lastTurnPartCount, shouldRedirectHome, streaming, turns.length])
const handleSelectAgent = (entry: AgentEntry) => {
navigate(`${agentPathPrefix}/${entry.agentId}`)
if (shouldRedirectHome) {
return <Navigate to="/home" replace />
}
return (
<div className="flex min-h-0 flex-col overflow-hidden">
<ClawChat
agentName={agentName}
historyMessages={historyMessages}
turns={visibleTurns}
streaming={streaming}
isInitialLoading={harnessHistoryQuery.isLoading}
error={error}
hasNextPage={false}
isFetchingNextPage={false}
onFetchNextPage={() => {}}
onRetry={() => {
void harnessHistoryQuery.refetch()
}}
/>
const handleSelectAgent = (entry: AgentEntry) => {
navigate(`/home/agents/${entry.agentId}`)
}
<div className="border-border/50 border-t bg-background/88 px-4 py-3 backdrop-blur-md">
<div className="mx-auto max-w-3xl">
const statusCopy = getConversationStatusCopy(status?.status, streaming)
return (
<div className="absolute inset-0 overflow-hidden">
<div className="fade-in slide-in-from-bottom-5 mx-auto flex h-full w-full max-w-3xl animate-in flex-col gap-3 px-4 pt-4 pb-2 duration-300">
<ConversationHeader
agentName={agentName}
status={statusCopy}
onGoHome={() => navigate('/home')}
onReset={resetConversation}
/>
<main
ref={scrollRef}
className={cn(
'styled-scrollbar min-h-0 flex-1 overflow-y-auto overflow-x-hidden rounded-[1.5rem] border border-border/50 bg-card/85 px-5 py-5 shadow-sm',
'[&_[data-streamdown="code-block"]]:!max-w-full [&_[data-streamdown="table-wrapper"]]:!max-w-full [&_[data-streamdown="code-block"]]:overflow-x-auto [&_[data-streamdown="table-wrapper"]]:overflow-x-auto',
)}
>
{loading ? (
<div className="flex h-full items-center justify-center text-muted-foreground text-sm">
Loading conversation...
</div>
) : turns.length === 0 ? (
<EmptyConversationState agentName={agentName} />
) : (
<div className="w-full space-y-4">
{turns.map((turn, index) => (
<ConversationMessage
key={turn.id}
turn={turn}
streaming={streaming && index === turns.length - 1}
/>
))}
</div>
)}
</main>
<div className="w-full flex-shrink-0">
<ConversationInput
variant="conversation"
agents={agents}
selectedAgentId={agentId}
selectedAgentId={resolvedAgentId}
onSelectAgent={handleSelectAgent}
onSend={(input) => {
const attachments = input.attachments.map((a) => a.payload)
const attachmentPreviews = input.attachments.map((a) => ({
id: a.id,
kind: a.kind,
mediaType: a.mediaType,
name: a.name,
dataUrl: a.dataUrl,
}))
void send({ text: input.text, attachments, attachmentPreviews })
onSend={(text) => {
void send(text)
}}
onCreateAgent={() => navigate(createAgentPath)}
onCreateAgent={() => navigate('/agents')}
streaming={streaming}
disabled={disabled}
status="running"
attachmentsEnabled={true}
disabled={status?.status !== 'running'}
status={status?.status}
placeholder={`Message ${agentName}...`}
/>
</div>
@@ -310,79 +192,3 @@ function AgentConversationController({
</div>
)
}
interface AgentCommandConversationProps {
variant?: 'command' | 'page'
backPath?: string
agentPathPrefix?: string
createAgentPath?: string
}
export const AgentCommandConversation: FC<AgentCommandConversationProps> = ({
variant = 'command',
backPath = '/home',
agentPathPrefix = '/home/agents',
createAgentPath = '/agents',
}) => {
const { agentId } = useParams<{ agentId: string }>()
const [searchParams, setSearchParams] = useSearchParams()
const navigate = useNavigate()
const { agents } = useAgentCommandData()
const shouldRedirectHome = !agentId
const resolvedAgentId = agentId ?? ''
const agent = agents.find((entry) => entry.agentId === resolvedAgentId)
const agentName = agent?.name || resolvedAgentId || 'Agent'
const agentMeta = getAgentEntryMeta(agent)
const initialMessage = searchParams.get('q')
const isPageVariant = variant === 'page'
const backLabel = isPageVariant ? 'Back to agents' : 'Back to home'
if (shouldRedirectHome) {
return <Navigate to="/home" replace />
}
const handleSelectAgent = (entry: AgentEntry) => {
navigate(`${agentPathPrefix}/${entry.agentId}`)
}
// Every visible agent runs through the harness now, so per-agent
// runtime status doesn't gate chat the way OpenClaw's legacy
// gateway lifecycle did. Show "Ready" once the agent record is
// resolved from the rail, "Setup" otherwise.
const statusCopy = agent ? 'Ready' : 'Setup'
return (
<div className="absolute inset-0 overflow-hidden bg-background md:pl-[theme(spacing.14)]">
<div className="mx-auto grid h-full w-full max-w-[1480px] lg:grid-cols-[288px_minmax(0,1fr)] lg:grid-rows-[3.5rem_minmax(0,1fr)]">
<AgentRailHeader onGoHome={() => navigate(backPath)} />
<ConversationHeader
agentName={agentName}
agentMeta={agentMeta}
status={statusCopy}
backLabel={backLabel}
backTarget={isPageVariant ? 'page' : 'home'}
onGoHome={() => navigate(backPath)}
/>
<AgentRailList
activeAgentId={resolvedAgentId}
agents={agents}
onSelectAgent={handleSelectAgent}
/>
<AgentConversationController
key={resolvedAgentId}
agentId={resolvedAgentId}
agents={agents}
initialMessage={initialMessage}
onInitialMessageConsumed={() =>
setSearchParams({}, { replace: true })
}
agentPathPrefix={agentPathPrefix}
createAgentPath={createAgentPath}
/>
</div>
</div>
)
}

View File

@@ -1,87 +1,89 @@
import { Plus } from 'lucide-react'
import { ArrowRight } from 'lucide-react'
import { type FC, useEffect, useState } from 'react'
import { useNavigate } from 'react-router'
import { Button } from '@/components/ui/button'
import { Card, CardContent } from '@/components/ui/card'
import { Separator } from '@/components/ui/separator'
import type { AgentEntry } from '@/entrypoints/app/agents/useOpenClaw'
import { ImportDataHint } from '@/entrypoints/newtab/index/ImportDataHint'
import { NewTabBranding } from '@/entrypoints/newtab/index/NewTabBranding'
import { NewTabTip } from '@/entrypoints/newtab/index/NewTabTip'
import { ScheduleResults } from '@/entrypoints/newtab/index/ScheduleResults'
import { SignInHint } from '@/entrypoints/newtab/index/SignInHint'
import { TopSites } from '@/entrypoints/newtab/index/TopSites'
import { useActiveHint } from '@/entrypoints/newtab/index/useActiveHint'
import type { AgentCardData } from '@/lib/agent-conversations/types'
import { AgentCardDock } from './AgentCardDock'
import { useAgentCommandData } from './agent-command-layout'
import { ConversationInput } from './ConversationInput'
import { buildAgentCardData } from './useAgentCardData'
import { useAgentCardData } from './useAgentCardData'
function EmptyAgentsState({ onOpenAgents }: { onOpenAgents: () => void }) {
function AgentCommandSetupState({
onOpenAgents,
}: {
onOpenAgents: () => void
}) {
return (
<Card className="border-border/60 bg-card/90 shadow-sm">
<CardContent className="flex flex-col items-center gap-4 p-8 text-center">
<div className="flex size-12 items-center justify-center rounded-2xl bg-muted text-muted-foreground">
<Plus className="size-5" />
</div>
<div className="space-y-2">
<h2 className="font-semibold text-lg">No agents yet</h2>
<p className="max-w-md text-muted-foreground text-sm leading-6">
Create an agent to start using BrowserOS as an agent-first new tab.
</p>
</div>
<Button variant="outline" onClick={onOpenAgents} className="rounded-xl">
Create agent
<Card className="border-border/60 bg-card/85 shadow-sm">
<CardContent className="flex flex-col items-center gap-4 p-6 text-center">
<p className="max-w-xl text-muted-foreground text-sm">
Set up OpenClaw agents to turn your new tab into an agent command
center.
</p>
<Button onClick={onOpenAgents} className="gap-2">
Open Agent Setup
<ArrowRight className="size-4" />
</Button>
</CardContent>
</Card>
)
}
function RecentThreads({
activeAgentId,
agents,
onOpenAgents,
onSelectAgent,
}: {
activeAgentId?: string | null
agents: AgentCardData[]
onOpenAgents: () => void
onSelectAgent: (agentId: string) => void
}) {
if (agents.length === 0) return null
function EmptyAgentsState({ onOpenAgents }: { onOpenAgents: () => void }) {
return (
<section className="space-y-4">
<div className="flex items-center justify-between gap-4">
<div>
<h2 className="font-semibold text-base">Recent agents</h2>
<p className="text-muted-foreground text-sm">
Continue from where you left off.
</p>
</div>
<Button
variant="outline"
onClick={onOpenAgents}
className="rounded-xl"
size="sm"
>
Manage agents
<Card className="border-border/60 bg-card/85 shadow-sm">
<CardContent className="flex flex-col items-center gap-4 p-6 text-center">
<p className="max-w-xl text-muted-foreground text-sm">
OpenClaw is running, but you do not have any agents yet.
</p>
<Button variant="outline" onClick={onOpenAgents}>
Create your first agent
</Button>
</div>
<AgentCardDock
agents={agents}
activeAgentId={activeAgentId ?? undefined}
onSelectAgent={onSelectAgent}
onCreateAgent={onOpenAgents}
/>
</section>
</CardContent>
</Card>
)
}
function OpenClawUnavailableState({
onOpenAgents,
}: {
onOpenAgents: () => void
}) {
return (
<Card className="border-border/60 bg-card/85 shadow-sm">
<CardContent className="flex flex-col items-center gap-4 p-6 text-center">
<p className="max-w-xl text-muted-foreground text-sm">
OpenClaw is unavailable right now. Open the Agents page to restart the
gateway or review setup.
</p>
<Button onClick={onOpenAgents} className="gap-2">
Open Agent Setup
<ArrowRight className="size-4" />
</Button>
</CardContent>
</Card>
)
}
export const AgentCommandHome: FC = () => {
const navigate = useNavigate()
const activeHint = useActiveHint()
const { agents, status } = useAgentCommandData()
const { status, agents } = useAgentCommandData()
const [mounted, setMounted] = useState(false)
const [selectedAgentId, setSelectedAgentId] = useState<string | null>(null)
const cardData = buildAgentCardData(agents, status?.status, undefined)
const cardData = useAgentCardData(agents, status?.status)
useEffect(() => {
setMounted(true)
}, [])
useEffect(() => {
if (agents.length === 0) {
@@ -99,77 +101,78 @@ export const AgentCommandHome: FC = () => {
}
}, [agents, selectedAgentId])
const handleSend = (input: { text: string }) => {
const handleSend = (text: string) => {
if (!selectedAgentId) return
navigate(
`/home/agents/${selectedAgentId}?q=${encodeURIComponent(input.text)}`,
)
navigate(`/home/agents/${selectedAgentId}?q=${encodeURIComponent(text)}`)
}
const handleSelectAgent = (agent: AgentEntry) => {
setSelectedAgentId(agent.agentId)
}
const selectedAgent = agents.find(
(agent) => agent.agentId === selectedAgentId,
)
const selectedAgentReady = selectedAgent
? selectedAgent.source === 'agent-harness' || status?.status === 'running'
: false
const selectedAgentStatus =
selectedAgent?.source === 'agent-harness' ? 'running' : status?.status
const selectedCard =
cardData.find((agent) => agent.agentId === selectedAgentId) ?? cardData[0]
const openClawStatus = status?.status
const isSetup = openClawStatus != null && openClawStatus !== 'uninitialized'
const shouldShowUnavailableState =
openClawStatus != null &&
openClawStatus !== 'running' &&
openClawStatus !== 'uninitialized' &&
cardData.length === 0
return (
<div className="min-h-full px-4 py-6">
<div className="mx-auto flex w-full max-w-5xl flex-col gap-8">
{cardData.length > 0 ? (
<>
<div className="flex flex-col items-center gap-5 pt-[max(10vh,24px)] text-center">
<div className="space-y-3">
<h1 className="font-semibold text-[clamp(2rem,4vw,3.25rem)] leading-tight tracking-tight">
What should your agent work on next?
</h1>
<p className="mx-auto max-w-2xl text-muted-foreground text-sm leading-6">
Start with a task, continue a thread, or switch to another
agent without leaving the new tab.
</p>
</div>
<div className="pt-[max(25vh,16px)]">
<div className="relative w-full space-y-8 md:w-3xl">
<NewTabBranding />
<div className="w-full max-w-3xl">
<ConversationInput
variant="home"
agents={agents}
selectedAgentId={selectedAgentId}
onSelectAgent={handleSelectAgent}
onSend={handleSend}
onCreateAgent={() => navigate('/agents')}
streaming={false}
disabled={!selectedAgentReady}
status={selectedAgentStatus}
attachmentsEnabled={false}
placeholder={
selectedAgentReady
? `Ask ${selectedCard?.name ?? 'your agent'} to handle a task...`
: 'Agent runtime is not running...'
}
/>
</div>
</div>
<ConversationInput
variant="home"
agents={agents}
selectedAgentId={selectedAgentId}
onSelectAgent={handleSelectAgent}
onSend={handleSend}
onCreateAgent={() => navigate('/agents')}
streaming={false}
disabled={status?.status !== 'running'}
status={status?.status}
placeholder={
status?.status === 'running'
? undefined
: 'OpenClaw is not running...'
}
/>
<Separator />
{mounted ? <NewTabTip /> : null}
<RecentThreads
activeAgentId={selectedAgentId}
agents={cardData}
{isSetup ? (
shouldShowUnavailableState ? (
<OpenClawUnavailableState
onOpenAgents={() => navigate('/agents')}
onSelectAgent={(agentId) => navigate(`/home/agents/${agentId}`)}
/>
</>
) : cardData.length > 0 ? (
<section className="space-y-3">
<div className="flex items-center justify-between">
<div>
<h2 className="font-semibold text-base">Agents</h2>
<p className="text-muted-foreground text-sm">
Pick up where your agents left off.
</p>
</div>
</div>
<AgentCardDock
agents={cardData}
activeAgentId={selectedAgentId ?? undefined}
onSelectAgent={(agentId) => navigate(`/home/agents/${agentId}`)}
onCreateAgent={() => navigate('/agents')}
/>
</section>
) : (
<EmptyAgentsState onOpenAgents={() => navigate('/agents')} />
)
) : (
<EmptyAgentsState onOpenAgents={() => navigate('/agents')} />
<AgentCommandSetupState onOpenAgents={() => navigate('/agents')} />
)}
{mounted ? <TopSites /> : null}
{mounted ? <ScheduleResults /> : null}
</div>
{activeHint === 'signin' ? <SignInHint /> : null}

View File

@@ -1,172 +0,0 @@
import { Bot, Loader2, RefreshCw } from 'lucide-react'
import { type FC, useEffect, useRef } from 'react'
import {
Conversation,
ConversationContent,
ConversationScrollButton,
} from '@/components/ai-elements/conversation'
import type { AgentConversationTurn } from '@/lib/agent-conversations/types'
import { cn } from '@/lib/utils'
import { ClawChatMessage } from './ClawChatMessage'
import { ConversationMessage } from './ConversationMessage'
import type { ClawChatMessage as ClawChatMessageModel } from './claw-chat-types'
interface ClawChatProps {
agentName: string
historyMessages: ClawChatMessageModel[]
turns: AgentConversationTurn[]
streaming: boolean
isInitialLoading: boolean
error: Error | null
hasNextPage: boolean
isFetchingNextPage: boolean
onFetchNextPage: () => void
onRetry: () => void
className?: string
}
function EmptyConversationState({ agentName }: { agentName: string }) {
return (
<div className="flex h-full items-center justify-center px-6 py-12">
<div className="max-w-md text-center">
<div className="mx-auto flex size-14 items-center justify-center rounded-3xl bg-muted text-muted-foreground">
<Bot className="size-6" />
</div>
<h2 className="mt-5 font-semibold text-xl">{agentName}</h2>
<p className="mt-2 text-muted-foreground text-sm leading-6">
Ask {agentName} to start a task.
</p>
</div>
</div>
)
}
function LoadingConversationState() {
return (
<div className="flex h-full items-center justify-center gap-2 text-muted-foreground text-sm">
<Loader2 className="size-4 animate-spin" />
Loading conversation...
</div>
)
}
function ConversationErrorState({
message,
onRetry,
}: {
message: string
onRetry: () => void
}) {
return (
<div className="flex h-full items-center justify-center px-6 py-12">
<div className="max-w-md rounded-2xl border border-border/60 bg-card px-5 py-4 text-center shadow-sm">
<p className="text-sm">{message}</p>
<button
type="button"
onClick={onRetry}
className="mt-3 inline-flex items-center gap-2 rounded-lg border border-border/60 px-3 py-1.5 font-medium text-muted-foreground text-xs transition-colors hover:bg-accent hover:text-foreground"
>
<RefreshCw className="size-3.5" />
Retry
</button>
</div>
</div>
)
}
export const ClawChat: FC<ClawChatProps> = ({
agentName,
historyMessages,
turns,
streaming,
isInitialLoading,
error,
hasNextPage,
isFetchingNextPage,
onFetchNextPage,
onRetry,
className,
}) => {
const topSentinelRef = useRef<HTMLDivElement>(null)
const onFetchNextPageRef = useRef(onFetchNextPage)
onFetchNextPageRef.current = onFetchNextPage
const hasMessages = historyMessages.length > 0 || turns.length > 0
useEffect(() => {
const sentinel = topSentinelRef.current
if (!sentinel) return
const observer = new IntersectionObserver(
(entries) => {
const [entry] = entries
if (!entry?.isIntersecting || !hasNextPage || isFetchingNextPage) {
return
}
onFetchNextPageRef.current()
},
{
root: null,
rootMargin: '160px 0px 0px 0px',
threshold: 0,
},
)
observer.observe(sentinel)
return () => observer.disconnect()
}, [hasNextPage, isFetchingNextPage])
return (
<div
className={cn('flex min-h-0 flex-1 flex-col overflow-hidden', className)}
>
<Conversation
className={cn(
'bg-background',
'[&_[data-streamdown="code-block"]]:!w-full [&_[data-streamdown="code-block"]]:!max-w-full [&_[data-streamdown="table-wrapper"]]:!w-full [&_[data-streamdown="table-wrapper"]]:!max-w-full [&_[data-streamdown="code-block"]]:overflow-x-auto [&_[data-streamdown="table-wrapper"]]:overflow-x-auto',
)}
>
<ConversationContent className="min-h-full px-5 py-5">
{isInitialLoading ? (
<LoadingConversationState />
) : error && !hasMessages ? (
<ConversationErrorState message={error.message} onRetry={onRetry} />
) : !hasMessages ? (
<EmptyConversationState agentName={agentName} />
) : (
<div className="mx-auto flex w-full max-w-3xl flex-col gap-3">
<div ref={topSentinelRef} aria-hidden="true" className="h-px" />
{isFetchingNextPage ? (
<div className="flex justify-center py-2 text-muted-foreground text-xs">
<Loader2 className="mr-2 size-3.5 animate-spin" />
Loading older messages...
</div>
) : null}
{!hasNextPage && historyMessages.length > 0 ? (
<div className="py-1 text-center text-muted-foreground text-xs">
Start of conversation
</div>
) : null}
{historyMessages.map((message) => (
<ClawChatMessage key={message.id} message={message} />
))}
{turns.map((turn, index) => (
<ConversationMessage
key={turn.id}
turn={turn}
streaming={streaming && index === turns.length - 1}
/>
))}
{error ? (
<div className="rounded-xl border border-border/60 bg-card px-4 py-3 text-muted-foreground text-sm">
{error.message}
</div>
) : null}
</div>
)}
</ConversationContent>
<ConversationScrollButton />
</Conversation>
</div>
)
}

View File

@@ -1,248 +0,0 @@
import { CheckCircle2, Copy, Loader2, Wrench, XCircle } from 'lucide-react'
import { type FC, useCallback, useMemo } from 'react'
import {
Message,
MessageAction,
MessageActions,
MessageAttachment,
MessageAttachments,
MessageContent,
MessageResponse,
MessageToolbar,
} from '@/components/ai-elements/message'
import {
Reasoning,
ReasoningContent,
ReasoningTrigger,
} from '@/components/ai-elements/reasoning'
import {
Task,
TaskContent,
TaskItem,
TaskTrigger,
} from '@/components/ai-elements/task'
import { cn } from '@/lib/utils'
import type {
ClawChatMessagePart,
ClawChatMessage as ClawChatMessageType,
} from './claw-chat-types'
function formatCost(usd: number): string {
if (usd < 0.005) return `$${usd.toFixed(4)}`
return `$${usd.toFixed(2)}`
}
type ToolCallPart = Extract<ClawChatMessagePart, { type: 'tool-call' }>
type AttachmentPart = Extract<ClawChatMessagePart, { type: 'attachment' }>
interface RenderEntry {
kind: 'text' | 'reasoning' | 'meta' | 'task' | 'attachments'
partIndex: number
part?: ClawChatMessagePart
tools?: ToolCallPart[]
attachments?: AttachmentPart[]
}
/**
* Build a render plan that groups all tool-call parts into a single Task
* collapsible and all attachment parts into a single attachment strip at
* their respective first-appearance positions. Other parts render in place.
*/
function buildRenderEntries(parts: ClawChatMessagePart[]): RenderEntry[] {
const entries: RenderEntry[] = []
const tools: ToolCallPart[] = []
const attachments: AttachmentPart[] = []
let taskInserted = false
let attachmentsInserted = false
parts.forEach((part, partIndex) => {
if (part.type === 'tool-call') {
tools.push(part)
if (!taskInserted) {
entries.push({ kind: 'task', partIndex, tools })
taskInserted = true
}
} else if (part.type === 'attachment') {
attachments.push(part)
if (!attachmentsInserted) {
entries.push({ kind: 'attachments', partIndex, attachments })
attachmentsInserted = true
}
} else if (part.type === 'text') {
entries.push({ kind: 'text', partIndex, part })
} else if (part.type === 'reasoning') {
entries.push({ kind: 'reasoning', partIndex, part })
} else if (part.type === 'meta') {
entries.push({ kind: 'meta', partIndex, part })
}
})
return entries
}
function ToolStatusIcon({ status }: { status: ToolCallPart['status'] }) {
if (status === 'running' || status === 'pending') {
return (
<Loader2 className="size-3.5 shrink-0 animate-spin text-muted-foreground" />
)
}
if (status === 'completed') {
return <CheckCircle2 className="size-3.5 shrink-0 text-green-500" />
}
return <XCircle className="size-3.5 shrink-0 text-destructive" />
}
interface ClawChatMessageProps {
message: ClawChatMessageType
}
export const ClawChatMessage: FC<ClawChatMessageProps> = ({ message }) => {
const messageText = message.parts
.filter((p) => p.type === 'text')
.map((p) => p.text)
.join('\n')
const handleCopy = useCallback(() => {
if (messageText) navigator.clipboard.writeText(messageText)
}, [messageText])
const entries = useMemo(
() => buildRenderEntries(message.parts),
[message.parts],
)
return (
<Message
from={message.role}
className="max-w-full group-[.is-user]:max-w-[80%]"
>
<MessageContent className="max-w-full overflow-hidden group-[.is-assistant]:w-full group-[.is-user]:max-w-full">
{entries.map((entry) => {
const key = `${message.id}-entry-${entry.partIndex}`
if (entry.kind === 'attachments' && entry.attachments) {
return (
<MessageAttachments key={key}>
{entry.attachments.map((attachment, idx) => (
<MessageAttachment
// biome-ignore lint/suspicious/noArrayIndexKey: attachment order is stable within a finalized message
key={`${attachment.kind}-${idx}`}
data={{
type: 'file',
url: attachment.dataUrl ?? '',
mediaType: attachment.mediaType,
filename: attachment.name,
}}
/>
))}
</MessageAttachments>
)
}
if (entry.kind === 'text' && entry.part?.type === 'text') {
return (
<MessageResponse
key={key}
// Historical messages are finalized — render immediately.
// Streamdown's default "streaming" mode uses an idle-callback
// debounce (300ms / 500ms idle) that paints empty content
// first, which made history flash blank tool collapsibles
// before text on every load.
mode="static"
parseIncompleteMarkdown={false}
className={cn(
'max-w-full overflow-hidden break-words',
'[&_[data-streamdown="code-block"]]:!w-full [&_[data-streamdown="code-block"]]:!max-w-full [&_[data-streamdown="code-block"]]:overflow-x-auto',
'[&_[data-streamdown="table-wrapper"]]:!w-full [&_[data-streamdown="table-wrapper"]]:!max-w-full [&_[data-streamdown="table-wrapper"]]:overflow-x-auto',
'[&_table]:w-max [&_table]:min-w-full',
)}
>
{entry.part.text}
</MessageResponse>
)
}
if (entry.kind === 'reasoning' && entry.part?.type === 'reasoning') {
return (
<Reasoning
key={key}
className="w-full"
defaultOpen={false}
duration={entry.part.duration}
>
<ReasoningTrigger />
<ReasoningContent>{entry.part.text}</ReasoningContent>
</Reasoning>
)
}
if (entry.kind === 'meta' && entry.part?.type === 'meta') {
return (
<div key={key} className="text-muted-foreground text-xs">
{entry.part.label}: {entry.part.value}
</div>
)
}
if (entry.kind === 'task' && entry.tools) {
const tools = entry.tools
const errorCount = tools.filter((t) => t.status === 'failed').length
const taskTitle = `Agent activity (${tools.length} ${tools.length === 1 ? 'action' : 'actions'}${errorCount > 0 ? `, ${errorCount} failed` : ''})`
return (
<Task key={key} defaultOpen={false}>
<TaskTrigger title={taskTitle} TriggerIcon={Wrench} />
<TaskContent>
{tools.map((tool, idx) => (
<TaskItem
// biome-ignore lint/suspicious/noArrayIndexKey: tool order is stable within a finalized historical message
key={`${tool.name}-${tool.status}-${idx}`}
className="flex items-center gap-2"
>
<ToolStatusIcon status={tool.status} />
<span className="text-foreground text-xs">
{tool.label}
</span>
{tool.subject ? (
<span className="ml-1.5 truncate text-muted-foreground/70 text-xs">
· {tool.subject}
</span>
) : null}
{tool.error ? (
<span className="ml-2 truncate text-destructive text-xs">
{tool.error}
</span>
) : null}
{tool.durationMs != null ? (
<span className="ml-auto text-muted-foreground/60 text-xs tabular-nums">
{(tool.durationMs / 1000).toFixed(1)}s
</span>
) : null}
</TaskItem>
))}
</TaskContent>
</Task>
)
}
return null
})}
{message.role === 'assistant' && messageText ? (
<MessageToolbar>
<MessageActions>
<MessageAction tooltip="Copy" onClick={handleCopy}>
<Copy className="size-3.5" />
</MessageAction>
</MessageActions>
{message.costUsd ? (
<span className="text-[11px] text-muted-foreground/50 tabular-nums">
{formatCost(message.costUsd)}
</span>
) : null}
</MessageToolbar>
) : null}
</MessageContent>
</Message>
)
}

View File

@@ -2,33 +2,20 @@ import {
ArrowRight,
Bot,
ChevronDown,
FileText,
Folder,
Layers,
Loader2,
Mic,
Paperclip,
Square,
X,
} from 'lucide-react'
import {
type DragEvent,
type FC,
type ReactNode,
useEffect,
useLayoutEffect,
useRef,
useState,
} from 'react'
import { type FC, type ReactNode, useEffect, useState } from 'react'
import { AppSelector } from '@/components/elements/AppSelector'
import { TabPickerPopover } from '@/components/elements/tab-picker-popover'
import { WorkspaceSelector } from '@/components/elements/workspace-selector'
import { Button } from '@/components/ui/button'
import { Textarea } from '@/components/ui/textarea'
import type { AgentEntry } from '@/entrypoints/app/agents/useOpenClaw'
import { McpServerIcon } from '@/entrypoints/app/connect-mcp/McpServerIcon'
import { useGetUserMCPIntegrations } from '@/entrypoints/app/connect-mcp/useGetUserMCPIntegrations'
import { type StagedAttachment, stageAttachments } from '@/lib/attachments'
import { Feature } from '@/lib/browseros/capabilities'
import { useCapabilities } from '@/lib/browseros/useCapabilities'
import { useMcpServers } from '@/lib/mcp/mcpServerStorage'
@@ -37,22 +24,16 @@ import { useVoiceInput } from '@/lib/voice/useVoiceInput'
import { useWorkspace } from '@/lib/workspace/use-workspace'
import { AgentSelector } from './AgentSelector'
export interface ConversationInputSendInput {
text: string
attachments: StagedAttachment[]
}
interface ConversationInputProps {
agents: AgentEntry[]
selectedAgentId: string | null
onSelectAgent: (agent: AgentEntry) => void
onSend: (input: ConversationInputSendInput) => void
onSend: (text: string) => void
onCreateAgent?: () => void
streaming: boolean
disabled?: boolean
status?: string
placeholder?: string
attachmentsEnabled?: boolean
variant?: 'home' | 'conversation'
}
@@ -142,9 +123,6 @@ function ContextControls({
onToggleTab,
showAgentSelector,
status,
onAttachClick,
attachDisabled,
attachmentsEnabled,
}: {
agents: AgentEntry[]
onCreateAgent?: () => void
@@ -154,9 +132,6 @@ function ContextControls({
onToggleTab: (tab: chrome.tabs.Tab) => void
showAgentSelector: boolean
status?: string
onAttachClick: () => void
attachDisabled: boolean
attachmentsEnabled: boolean
}) {
const { supports } = useCapabilities()
const { selectedFolder } = useWorkspace()
@@ -171,7 +146,7 @@ function ContextControls({
})
return (
<div className="flex items-center justify-between border-border/40 border-t px-4 py-2.5">
<div className="flex items-center justify-between border-border/50 border-t px-5 py-3">
<div className="flex items-center gap-1">
{showAgentSelector ? (
<AgentSelector
@@ -216,20 +191,6 @@ function ContextControls({
<span>Tabs</span>
</Button>
</TabPickerPopover>
<Button
type="button"
variant="ghost"
onClick={onAttachClick}
disabled={attachDisabled || !attachmentsEnabled}
title="Attach files"
className={cn(
'flex items-center gap-2 rounded-lg px-3 py-1.5 font-medium text-sm transition-all',
'bg-transparent text-muted-foreground hover:bg-accent hover:text-accent-foreground',
)}
>
<Paperclip className="h-4 w-4" />
<span>Attach</span>
</Button>
</div>
{supports(Feature.MANAGED_MCP_SUPPORT) ? (
@@ -273,7 +234,7 @@ function ContextControls({
function HomeShell({ children }: { children: ReactNode }) {
return (
<div className="overflow-hidden rounded-[1.55rem] border border-border/60 bg-card/95 shadow-sm">
<div className="overflow-hidden rounded-[1.5rem] border border-border/60 bg-card/95 shadow-sm backdrop-blur">
{children}
</div>
)
@@ -281,7 +242,7 @@ function HomeShell({ children }: { children: ReactNode }) {
function ConversationShell({ children }: { children: ReactNode }) {
return (
<div className="overflow-hidden rounded-[1.35rem] border border-border/50 bg-background/95 shadow-[0_10px_30px_rgba(15,23,42,0.06)] backdrop-blur-md">
<div className="overflow-hidden rounded-[1.5rem] border border-border/60 bg-card/95 shadow-sm backdrop-blur">
{children}
</div>
)
@@ -297,63 +258,14 @@ export const ConversationInput: FC<ConversationInputProps> = ({
disabled,
status,
placeholder,
attachmentsEnabled = true,
variant = 'conversation',
}) => {
const [input, setInput] = useState('')
const [selectedTabs, setSelectedTabs] = useState<chrome.tabs.Tab[]>([])
const [isExpandedDraft, setIsExpandedDraft] = useState(false)
const [attachments, setAttachments] = useState<StagedAttachment[]>([])
const [attachmentError, setAttachmentError] = useState<string | null>(null)
const [isStaging, setIsStaging] = useState(false)
const [isDragOver, setIsDragOver] = useState(false)
const fileInputRef = useRef<HTMLInputElement>(null)
const voice = useVoiceInput()
const textareaRef = useRef<HTMLTextAreaElement>(null)
const selectedAgent = agents.find(
(agent) => agent.agentId === selectedAgentId,
)
const isConversation = variant === 'conversation'
const stageFiles = async (files: File[]) => {
if (files.length === 0) return
if (!attachmentsEnabled) {
setAttachmentError('Attachments are not supported for this agent yet.')
return
}
setIsStaging(true)
setAttachmentError(null)
try {
const result = await stageAttachments(files, attachments.length)
if (result.staged.length > 0) {
setAttachments((prev) => [...prev, ...result.staged])
}
if (result.errors.length > 0) {
setAttachmentError(result.errors.map((e) => e.message).join(' \u2022 '))
}
} finally {
setIsStaging(false)
}
}
const removeAttachment = (id: string) => {
setAttachments((prev) => prev.filter((a) => a.id !== id))
setAttachmentError(null)
}
useLayoutEffect(() => {
const element = textareaRef.current
if (!element) return
const maxHeight = isConversation ? 176 : 100
const collapsedHeight = isConversation ? 56 : 72
element.style.height = '0px'
const nextHeight = Math.min(element.scrollHeight, maxHeight)
element.style.height = `${nextHeight}px`
element.style.overflowY =
element.scrollHeight > maxHeight ? 'auto' : 'hidden'
setIsExpandedDraft(nextHeight > collapsedHeight)
})
useEffect(() => {
if (voice.transcript && !voice.isTranscribing) {
@@ -362,12 +274,6 @@ export const ConversationInput: FC<ConversationInputProps> = ({
}
}, [voice.transcript, voice.isTranscribing, voice])
useEffect(() => {
if (attachmentsEnabled) return
setAttachments([])
setAttachmentError(null)
}, [attachmentsEnabled])
const toggleTab = (tab: chrome.tabs.Tab) => {
setSelectedTabs((prev) => {
const isSelected = prev.some((selected) => selected.id === tab.id)
@@ -378,70 +284,11 @@ export const ConversationInput: FC<ConversationInputProps> = ({
})
}
const hasContent = input.trim().length > 0 || attachments.length > 0
const handleSend = () => {
const text = input.trim()
if (disabled || isStaging || streaming) return
if (!text && attachments.length === 0) return
onSend({ text, attachments })
if (!text || streaming || disabled) return
onSend(text)
setInput('')
setAttachments([])
setAttachmentError(null)
}
const handlePaste = (event: React.ClipboardEvent<HTMLTextAreaElement>) => {
const items = event.clipboardData?.items
if (!items) return
const files: File[] = []
for (const item of items) {
if (item.kind === 'file') {
const file = item.getAsFile()
if (file) files.push(file)
}
}
if (files.length > 0) {
event.preventDefault()
void stageFiles(files)
}
}
const handleDrop = (event: DragEvent<HTMLDivElement>) => {
event.preventDefault()
setIsDragOver(false)
const files = Array.from(event.dataTransfer?.files ?? [])
if (files.length > 0) {
void stageFiles(files)
}
}
const handleDragOver = (event: DragEvent<HTMLDivElement>) => {
if (!event.dataTransfer?.types.includes('Files')) return
event.preventDefault()
setIsDragOver(true)
}
const handleDragLeave = (event: DragEvent<HTMLDivElement>) => {
if (event.currentTarget.contains(event.relatedTarget as Node | null)) {
return
}
setIsDragOver(false)
}
const openFilePicker = () => {
if (!attachmentsEnabled) {
setAttachmentError('Attachments are not supported for this agent yet.')
return
}
fileInputRef.current?.click()
}
const handleFileInputChange = (
event: React.ChangeEvent<HTMLInputElement>,
) => {
const files = Array.from(event.target.files ?? [])
event.target.value = ''
if (files.length > 0) void stageFiles(files)
}
const shell = variant === 'home' ? HomeShell : ConversationShell
@@ -449,190 +296,62 @@ export const ConversationInput: FC<ConversationInputProps> = ({
return (
<Shell>
<section
// Drag/drop on a region isn't a click affordance — wrap the
// composer in a labeled <section> so the a11y rule is satisfied
// without misrepresenting the surface as interactive.
aria-label="Message composer"
className={cn('relative', isDragOver && 'ring-2 ring-primary/60')}
onDragOver={handleDragOver}
onDragLeave={handleDragLeave}
onDrop={handleDrop}
>
<div className="flex items-center gap-3 px-5 py-4">
<BotInputIcon variant={variant} />
<input
ref={fileInputRef}
type="file"
multiple
accept="image/png,image/jpeg,image/webp,image/gif,text/*,application/json"
className="hidden"
onChange={handleFileInputChange}
/>
{attachments.length > 0 || attachmentError ? (
<AttachmentStrip
attachments={attachments}
onRemove={removeAttachment}
error={attachmentError}
/>
) : null}
<div
className={cn(
'flex gap-3',
variant === 'home' ? 'px-4 py-3' : 'px-4 py-3',
isExpandedDraft ? 'items-end' : 'items-center',
)}
>
<BotInputIcon variant={variant} />
<div className="flex-1">
<Textarea
ref={textareaRef}
value={input}
onChange={(event) => setInput(event.currentTarget.value)}
onKeyDown={(event) => {
if (event.key === 'Enter' && !event.shiftKey) {
event.preventDefault()
handleSend()
}
}}
onPaste={handlePaste}
rows={1}
placeholder={
voice.isTranscribing
? 'Transcribing...'
: (placeholder ??
`Message ${selectedAgent?.name ?? 'agent'}...`)
}
disabled={disabled || voice.isTranscribing}
className={cn(
'resize-none border-none bg-transparent px-0 text-[15px] shadow-none focus-visible:ring-0',
'[field-sizing:fixed]',
variant === 'home'
? 'min-h-[40px] py-2 leading-6'
: 'min-h-[40px] py-2 leading-6',
'placeholder:text-muted-foreground/80',
)}
/>
</div>
<VoiceButton
isRecording={voice.isRecording}
isTranscribing={voice.isTranscribing}
onStart={() => {
void voice.startRecording()
}}
onStop={() => {
void voice.stopRecording()
}}
/>
<InputActionButton
disabled={
!hasContent ||
isStaging ||
!!disabled ||
voice.isRecording ||
voice.isTranscribing ||
streaming
type="text"
value={input}
onChange={(event) => setInput(event.currentTarget.value)}
onKeyDown={(event) => {
if (event.key === 'Enter') {
event.preventDefault()
handleSend()
}
onClick={handleSend}
// Spinner stays the user-facing "agent is busy" hint; with the
// queue active we still spin while a turn is in flight.
streaming={streaming}
/>
</div>
{voice.error ? (
<div className="px-5 pb-2 text-destructive text-xs">
{voice.error}
</div>
) : null}
<ContextControls
agents={agents}
onCreateAgent={onCreateAgent}
onSelectAgent={onSelectAgent}
selectedAgentId={selectedAgentId}
selectedTabs={selectedTabs}
onToggleTab={toggleTab}
showAgentSelector={variant === 'home'}
status={status}
onAttachClick={openFilePicker}
attachDisabled={attachments.length >= 10 || isStaging || !!disabled}
attachmentsEnabled={attachmentsEnabled}
}}
placeholder={
voice.isTranscribing
? 'Transcribing...'
: (placeholder ?? `Message ${selectedAgent?.name ?? 'agent'}...`)
}
disabled={disabled || voice.isTranscribing}
className="flex-1 border-none bg-transparent text-base text-foreground outline-none placeholder:text-muted-foreground disabled:opacity-60"
/>
{isDragOver ? (
<div className="pointer-events-none absolute inset-0 flex items-center justify-center rounded-[inherit] bg-background/80 font-medium text-foreground text-sm backdrop-blur-sm">
Drop files to attach
</div>
) : null}
</section>
</Shell>
)
}
function AttachmentStrip({
attachments,
onRemove,
error,
}: {
attachments: StagedAttachment[]
onRemove: (id: string) => void
error: string | null
}) {
return (
<div className="border-border/40 border-b px-4 pt-3 pb-2">
{attachments.length > 0 ? (
<div className="flex flex-wrap gap-2">
{attachments.map((attachment) => (
<AttachmentChip
key={attachment.id}
attachment={attachment}
onRemove={() => onRemove(attachment.id)}
/>
))}
</div>
) : null}
{error ? (
<div className="mt-2 text-destructive text-xs">{error}</div>
) : null}
</div>
)
}
function AttachmentChip({
attachment,
onRemove,
}: {
attachment: StagedAttachment
onRemove: () => void
}) {
if (attachment.kind === 'image' && attachment.dataUrl) {
return (
<div className="group relative size-16 overflow-hidden rounded-md border border-border/60">
<img
src={attachment.dataUrl}
alt={attachment.name}
className="size-full object-cover"
<VoiceButton
isRecording={voice.isRecording}
isTranscribing={voice.isTranscribing}
onStart={() => {
void voice.startRecording()
}}
onStop={() => {
void voice.stopRecording()
}}
/>
<InputActionButton
disabled={
!input.trim() ||
streaming ||
!!disabled ||
voice.isRecording ||
voice.isTranscribing
}
onClick={handleSend}
streaming={streaming}
/>
<button
type="button"
onClick={onRemove}
className="absolute top-1 right-1 inline-flex size-5 items-center justify-center rounded-full bg-background/80 text-muted-foreground opacity-0 transition-opacity hover:text-foreground group-hover:opacity-100"
aria-label={`Remove ${attachment.name}`}
>
<X className="size-3" />
</button>
</div>
)
}
return (
<div className="group flex max-w-[220px] items-center gap-2 rounded-md border border-border/60 bg-background/60 px-2 py-1.5">
<FileText className="size-4 shrink-0 text-muted-foreground" />
<span className="truncate text-xs">{attachment.name}</span>
<button
type="button"
onClick={onRemove}
className="ml-1 inline-flex size-4 items-center justify-center text-muted-foreground hover:text-foreground"
aria-label={`Remove ${attachment.name}`}
>
<X className="size-3" />
</button>
</div>
{voice.error ? (
<div className="px-5 pb-2 text-destructive text-xs">{voice.error}</div>
) : null}
<ContextControls
agents={agents}
onCreateAgent={onCreateAgent}
onSelectAgent={onSelectAgent}
selectedAgentId={selectedAgentId}
selectedTabs={selectedTabs}
onToggleTab={toggleTab}
showAgentSelector={variant === 'home'}
status={status}
/>
</Shell>
)
}
@@ -642,8 +361,8 @@ function BotInputIcon({ variant }: { variant: 'home' | 'conversation' }) {
className={cn(
'flex items-center justify-center text-[var(--accent-orange)]',
variant === 'home'
? 'h-8 w-8 rounded-lg bg-[var(--accent-orange)]/10'
: 'h-8 w-8 rounded-lg bg-[var(--accent-orange)]/10',
? 'h-10 w-10 rounded-xl bg-[var(--accent-orange)]/10'
: 'h-9 w-9 rounded-xl bg-[var(--accent-orange)]/12',
)}
>
<Bot className="h-4 w-4" />

View File

@@ -1,9 +1,7 @@
import { Bot, CheckCircle2, Loader2, Wrench, XCircle } from 'lucide-react'
import { type FC, useMemo } from 'react'
import { Bot, CheckCircle2, Loader2, XCircle } from 'lucide-react'
import type { FC } from 'react'
import {
Message,
MessageAttachment,
MessageAttachments,
MessageContent,
MessageResponse,
} from '@/components/ai-elements/message'
@@ -12,191 +10,96 @@ import {
ReasoningContent,
ReasoningTrigger,
} from '@/components/ai-elements/reasoning'
import {
Task,
TaskContent,
TaskItem,
TaskTrigger,
} from '@/components/ai-elements/task'
import type {
AgentConversationTurn,
ToolEntry,
} from '@/lib/agent-conversations/types'
import type { AgentConversationTurn } from '@/lib/agent-conversations/types'
interface ConversationMessageProps {
turn: AgentConversationTurn
streaming: boolean
}
interface RenderEntry {
kind: 'thinking' | 'text' | 'task'
partIndex: number
text?: string
done?: boolean
tools?: ToolEntry[]
}
/**
* Build the render plan for an assistant turn:
* - thinking and text parts render in place
* - all tool-batch parts collapse into a single Task entry at their first
* appearance position, with tools listed in arrival order
*/
function buildRenderEntries(turn: AgentConversationTurn): RenderEntry[] {
const entries: RenderEntry[] = []
const aggregatedTools: ToolEntry[] = []
let taskInserted = false
turn.parts.forEach((part, partIndex) => {
if (part.kind === 'thinking') {
entries.push({
kind: 'thinking',
partIndex,
text: part.text,
done: part.done,
})
} else if (part.kind === 'text') {
entries.push({ kind: 'text', partIndex, text: part.text })
} else if (part.kind === 'tool-batch') {
aggregatedTools.push(...part.tools)
if (!taskInserted) {
entries.push({
kind: 'task',
partIndex,
tools: aggregatedTools,
})
taskInserted = true
}
}
})
return entries
}
function ToolStatusIcon({ status }: { status: ToolEntry['status'] }) {
if (status === 'running') {
return (
<Loader2 className="size-3.5 shrink-0 animate-spin text-muted-foreground" />
)
}
if (status === 'completed') {
return <CheckCircle2 className="size-3.5 shrink-0 text-green-500" />
}
return <XCircle className="size-3.5 shrink-0 text-destructive" />
}
export const ConversationMessage: FC<ConversationMessageProps> = ({
turn,
streaming,
}) => {
const entries = useMemo(() => buildRenderEntries(turn), [turn])
}) => (
<div className="space-y-3">
<Message from="user">
<MessageContent>
<pre className="whitespace-pre-wrap font-sans text-sm">
{turn.userText}
</pre>
</MessageContent>
</Message>
return (
<div className="space-y-3">
<Message from="user">
{turn.parts.length > 0 && (
<Message from="assistant">
<MessageContent>
{turn.userAttachments && turn.userAttachments.length > 0 && (
<MessageAttachments>
{turn.userAttachments.map((attachment) => (
<MessageAttachment
key={attachment.id}
data={{
type: 'file',
url: attachment.dataUrl ?? '',
mediaType: attachment.mediaType,
filename: attachment.name,
}}
/>
))}
</MessageAttachments>
)}
{turn.userText && (
<pre className="whitespace-pre-wrap font-sans text-sm">
{turn.userText}
</pre>
)}
</MessageContent>
</Message>
{turn.parts.map((part, i) => {
const key = `${turn.id}-part-${i}`
{entries.length > 0 && (
<Message from="assistant">
<MessageContent>
{entries.map((entry) => {
const key = `${turn.id}-entry-${entry.partIndex}`
if (entry.kind === 'thinking') {
switch (part.kind) {
case 'thinking':
return (
<Reasoning
key={key}
className="w-full"
isStreaming={!entry.done}
defaultOpen={!entry.done}
isStreaming={!part.done}
defaultOpen={!part.done}
>
<ReasoningTrigger />
<ReasoningContent>{entry.text ?? ''}</ReasoningContent>
<ReasoningContent>{part.text}</ReasoningContent>
</Reasoning>
)
}
if (entry.kind === 'text') {
case 'tool-batch':
return (
<MessageResponse key={key}>
{entry.text ?? ''}
</MessageResponse>
)
}
const tools = entry.tools ?? []
const allDone = tools.every((t) => t.status !== 'running')
const taskTitle = allDone
? `Agent activity (${tools.length} ${tools.length === 1 ? 'action' : 'actions'})`
: `Working… (${tools.length} ${tools.length === 1 ? 'action' : 'actions'})`
return (
<Task key={key} defaultOpen={!turn.done}>
<TaskTrigger title={taskTitle} TriggerIcon={Wrench} />
<TaskContent>
{tools.map((tool) => (
<TaskItem
<div key={key} className="w-full space-y-1">
{part.tools.map((tool) => (
<div
key={tool.id}
className="flex items-center gap-2"
className="flex items-center gap-2 rounded-md border px-3 py-2 text-sm"
>
<ToolStatusIcon status={tool.status} />
<span className="text-foreground text-xs">
{tool.label}
</span>
{tool.subject ? (
<span className="ml-1.5 truncate text-muted-foreground/70 text-xs">
· {tool.subject}
</span>
) : null}
{tool.status === 'running' && (
<Loader2 className="size-3.5 animate-spin text-muted-foreground" />
)}
{tool.status === 'completed' && (
<CheckCircle2 className="size-3.5 text-green-500" />
)}
{tool.status === 'error' && (
<XCircle className="size-3.5 text-destructive" />
)}
<span className="font-mono text-xs">{tool.name}</span>
{tool.durationMs != null && (
<span className="ml-auto text-muted-foreground/60 text-xs tabular-nums">
<span className="ml-auto text-muted-foreground text-xs">
{(tool.durationMs / 1000).toFixed(1)}s
</span>
)}
</TaskItem>
</div>
))}
</TaskContent>
</Task>
)
})}
</MessageContent>
</Message>
)}
</div>
)
{!turn.done && turn.parts.length === 0 && streaming && (
<div className="flex gap-2">
<div className="flex size-7 shrink-0 items-center justify-center rounded-full bg-[var(--accent-orange)] text-white">
<Bot className="size-3.5" />
</div>
<div className="flex items-center gap-1 rounded-xl rounded-tl-none border border-border/50 bg-card px-3 py-2.5 shadow-sm">
<span className="size-1.5 animate-bounce rounded-full bg-[var(--accent-orange)] [animation-delay:-0.3s]" />
<span className="size-1.5 animate-bounce rounded-full bg-[var(--accent-orange)] [animation-delay:-0.15s]" />
<span className="size-1.5 animate-bounce rounded-full bg-[var(--accent-orange)]" />
</div>
case 'text':
return <MessageResponse key={key}>{part.text}</MessageResponse>
default:
return null
}
})}
</MessageContent>
</Message>
)}
{!turn.done && turn.parts.length === 0 && streaming && (
<div className="flex gap-2">
<div className="flex size-7 shrink-0 items-center justify-center rounded-full bg-[var(--accent-orange)] text-white">
<Bot className="size-3.5" />
</div>
)}
</div>
)
}
<div className="flex items-center gap-1 rounded-xl rounded-tl-none border border-border/50 bg-card px-3 py-2.5 shadow-sm">
<span className="size-1.5 animate-bounce rounded-full bg-[var(--accent-orange)] [animation-delay:-0.3s]" />
<span className="size-1.5 animate-bounce rounded-full bg-[var(--accent-orange)] [animation-delay:-0.15s]" />
<span className="size-1.5 animate-bounce rounded-full bg-[var(--accent-orange)]" />
</div>
</div>
)}
</div>
)

View File

@@ -1,11 +1,8 @@
import type { FC } from 'react'
import { Outlet, useOutletContext } from 'react-router'
import { useHarnessAgents } from '@/entrypoints/app/agents/useAgents'
import type {
AgentEntry,
OpenClawStatus,
} from '@/entrypoints/app/agents/useOpenClaw'
import {
type AgentEntry,
type OpenClawStatus,
useOpenClawAgents,
useOpenClawStatus,
} from '@/entrypoints/app/agents/useOpenClaw'
@@ -19,32 +16,16 @@ interface AgentCommandContextValue {
export const AgentCommandLayout: FC = () => {
const { status, loading: statusLoading } = useOpenClawStatus(5000)
const openClawEnabled =
status?.status === 'running' && status.controlPlaneStatus === 'connected'
const { agents: openClawAgents, loading: openClawAgentsLoading } =
useOpenClawAgents(openClawEnabled)
const { agents: harnessAgents, loading: harnessAgentsLoading } =
useHarnessAgents()
const visibleOpenClawAgents = openClawEnabled ? openClawAgents : []
// Dual-created OpenClaw agents appear in both `/claw/agents` (gateway
// record) and `/agents` (harness record) under the same id. Prefer the
// harness entry so the chat panel can route through the harness path
// and the rail doesn't show duplicates.
const harnessAgentIds = new Set(harnessAgents.map((entry) => entry.agentId))
const dedupedOpenClawAgents = visibleOpenClawAgents.filter(
(entry) => !harnessAgentIds.has(entry.agentId),
const { agents, loading: agentsLoading } = useOpenClawAgents(
status?.status === 'running' && status.controlPlaneStatus === 'connected',
)
const agents = [...dedupedOpenClawAgents, ...harnessAgents]
return (
<Outlet
context={
{
agents,
agentsLoading:
harnessAgentsLoading ||
statusLoading ||
(openClawEnabled && openClawAgentsLoading),
agentsLoading,
status,
statusLoading,
} satisfies AgentCommandContextValue

View File

@@ -1,12 +0,0 @@
import { describe, expect, it } from 'bun:test'
import { mapAgentHarnessToolStatus } from './agent-stream-events'
describe('mapAgentHarnessToolStatus', () => {
it('normalizes ACP tool statuses for the chat renderer', () => {
expect(mapAgentHarnessToolStatus('running')).toBe('running')
expect(mapAgentHarnessToolStatus('completed')).toBe('completed')
expect(mapAgentHarnessToolStatus('failed')).toBe('error')
expect(mapAgentHarnessToolStatus('incomplete')).toBe('running')
expect(mapAgentHarnessToolStatus(undefined)).toBe('running')
})
})

View File

@@ -1,19 +0,0 @@
import type { ToolEntry } from '@/lib/agent-conversations/types'
export function mapAgentHarnessToolStatus(
status: string | undefined,
): ToolEntry['status'] {
if (!status) return 'running'
const normalized = status.toLowerCase()
if (['error', 'failed', 'failure', 'denied'].includes(normalized)) {
return 'error'
}
if (
['complete', 'completed', 'done', 'success', 'succeeded'].includes(
normalized,
)
) {
return 'completed'
}
return 'running'
}

View File

@@ -1,183 +0,0 @@
import { describe, expect, it } from 'bun:test'
import type { AgentConversationTurn } from '@/lib/agent-conversations/types'
import {
type AgentHistoryPageResponse,
type BrowserOSChatHistoryItem,
buildChatHistoryFromClawMessages,
filterTurnsPersistedInHistory,
flattenHistoryPages,
mapHistoryItemToClawMessage,
} from './claw-chat-types'
function historyItem(
overrides: Partial<BrowserOSChatHistoryItem>,
): BrowserOSChatHistoryItem {
return {
id: 'session-1:0',
role: 'user',
text: 'Hello',
timestamp: 1000,
messageSeq: 0,
sessionKey: 'session-1',
source: 'user-chat',
...overrides,
}
}
function page(items: BrowserOSChatHistoryItem[]): AgentHistoryPageResponse {
return {
agentId: 'main',
sessionKey: 'session-1',
session: null,
items,
page: {
hasMore: false,
limit: 50,
},
}
}
describe('claw-chat-types', () => {
it('maps backend history items into text-first ClawChat messages', () => {
const message = mapHistoryItemToClawMessage(
historyItem({
id: 'session-1:1',
role: 'assistant',
text: 'Hi there',
messageSeq: 1,
}),
)
expect(message).toEqual({
id: 'session-1:1',
role: 'assistant',
sessionKey: 'session-1',
timestamp: 1000,
source: 'user-chat',
messageSeq: 1,
status: 'historical',
parts: [{ type: 'text', text: 'Hi there' }],
})
})
it('flattens paginated history into oldest-to-newest render order', () => {
const messages = flattenHistoryPages([
page([
historyItem({
id: 'session-1:2',
role: 'user',
text: 'newer',
timestamp: 3000,
messageSeq: 2,
}),
]),
page([
historyItem({
id: 'session-1:0',
role: 'user',
text: 'older',
timestamp: 1000,
messageSeq: 0,
}),
historyItem({
id: 'session-1:1',
role: 'assistant',
text: 'middle',
timestamp: 2000,
messageSeq: 1,
}),
]),
])
expect(messages.map((message) => message.id)).toEqual([
'session-1:0',
'session-1:1',
'session-1:2',
])
})
it('builds OpenClaw chat history from text message parts only', () => {
const history = buildChatHistoryFromClawMessages([
{
id: 'user-1',
role: 'user',
sessionKey: 'session-1',
parts: [{ type: 'text', text: ' User request ' }],
},
{
id: 'assistant-1',
role: 'assistant',
sessionKey: 'session-1',
parts: [
{ type: 'reasoning', text: 'private reasoning' },
{ type: 'text', text: 'Assistant answer' },
],
},
])
expect(history).toEqual([
{ role: 'user', content: 'User request' },
{ role: 'assistant', content: 'Assistant answer' },
])
})
it('hides completed live turns once harness history contains the same turn', () => {
const turn: AgentConversationTurn = {
id: 'live-turn',
userText: 'hello',
parts: [{ kind: 'text', text: 'hi there' }],
done: true,
timestamp: 1_000,
}
const visible = filterTurnsPersistedInHistory(
[turn],
[
{
id: 'history-user',
role: 'user',
sessionKey: 'main',
timestamp: 1_050,
status: 'historical',
parts: [{ type: 'text', text: 'hello' }],
},
{
id: 'history-assistant',
role: 'assistant',
sessionKey: 'main',
timestamp: 1_100,
status: 'historical',
parts: [{ type: 'text', text: 'hi there' }],
},
],
)
expect(visible).toEqual([])
})
it('keeps completed live turns until matching assistant history arrives', () => {
const turn: AgentConversationTurn = {
id: 'live-turn',
userText: 'hello',
parts: [{ kind: 'text', text: 'hi there' }],
done: true,
timestamp: 1_000,
}
const visible = filterTurnsPersistedInHistory(
[turn],
[
{
id: 'history-user',
role: 'user',
sessionKey: 'main',
timestamp: 1_050,
status: 'historical',
parts: [{ type: 'text', text: 'hello' }],
},
],
)
expect(visible).toEqual([turn])
})
})

View File

@@ -1,287 +0,0 @@
import type { OpenClawChatHistoryMessage } from '@/entrypoints/app/agents/useOpenClaw'
import type { AgentConversationTurn } from '@/lib/agent-conversations/types'
export type ClawChatRole = 'user' | 'assistant'
export type ClawChatSource = 'user-chat' | 'cron' | 'hook' | 'channel' | 'other'
export interface BrowserOSOpenClawSession {
key: string
updatedAt: number
sessionId: string
agentId: string
kind: string
source: ClawChatSource
status?: string
totalTokens?: number
model?: string
modelProvider?: string
}
export interface BrowserOSChatHistoryToolCall {
toolCallId?: string
toolName: string
label: string
subject?: string
status: 'pending' | 'running' | 'completed' | 'failed'
input?: unknown
output?: unknown
error?: string
durationMs?: number
}
export interface BrowserOSChatHistoryReasoning {
text: string
durationMs?: number
}
export interface BrowserOSChatHistoryAttachment {
kind: 'image' | 'file'
mediaType: string
// Images carry a `data:` URL so we can render directly without any
// additional fetch; files (text/PDF) currently round-trip via inline
// text in the message body and do not populate this field in v1.
dataUrl?: string
name?: string
}
export interface BrowserOSChatHistoryItem {
id: string
role: ClawChatRole
text: string
timestamp?: number
messageSeq: number
sessionKey: string
source: ClawChatSource
costUsd?: number
tokensIn?: number
tokensOut?: number
toolCalls?: BrowserOSChatHistoryToolCall[]
reasoning?: BrowserOSChatHistoryReasoning
attachments?: BrowserOSChatHistoryAttachment[]
}
export interface AgentHistoryPageResponse {
agentId: string
sessionKey: string | null
session: BrowserOSOpenClawSession | null
items: BrowserOSChatHistoryItem[]
page: {
cursor?: string
hasMore: boolean
limit: number
}
}
export type ClawChatMessageStatus =
| 'historical'
| 'sending'
| 'streaming'
| 'error'
export type ClawChatMessagePart =
| { type: 'text'; text: string }
| { type: 'reasoning'; text: string; duration?: number }
| {
type: 'tool-call'
name: string
label: string
subject?: string
status: 'pending' | 'running' | 'completed' | 'failed'
input?: unknown
output?: unknown
error?: string
durationMs?: number
}
| {
type: 'attachment'
kind: 'image' | 'file'
mediaType: string
dataUrl?: string
name?: string
}
| { type: 'meta'; label: string; value: string }
export interface ClawChatMessage {
id: string
role: ClawChatRole
sessionKey: string
timestamp?: number
source?: ClawChatSource
messageSeq?: number
status?: ClawChatMessageStatus
parts: ClawChatMessagePart[]
costUsd?: number
tokensIn?: number
tokensOut?: number
}
export function mapHistoryItemToClawMessage(
item: BrowserOSChatHistoryItem,
): ClawChatMessage {
const parts: ClawChatMessagePart[] = []
// Attachments first — they belong above the text in user messages and
// never appear on assistant messages today (assistant images come back
// through tool results, which render via the Task collapsible).
if (item.attachments && item.attachments.length > 0) {
for (const attachment of item.attachments) {
parts.push({
type: 'attachment',
kind: attachment.kind,
mediaType: attachment.mediaType,
dataUrl: attachment.dataUrl,
name: attachment.name,
})
}
}
// Reasoning, then tool calls, then text — the chronological order the
// agent produced them (think → act → answer).
if (item.reasoning && item.reasoning.text.trim().length > 0) {
// 0ms means thinking and the final answer were emitted in the same JSONL
// line (no tool calls between them) — there's no real elapsed wall-clock,
// so fall through to the "Thinking" trigger instead of "Thought for 0
// seconds" / streaming shimmer. Real multi-line turns floor at 1s.
const durationMs = item.reasoning.durationMs ?? 0
const duration =
durationMs > 0 ? Math.max(1, Math.round(durationMs / 1000)) : undefined
parts.push({
type: 'reasoning',
text: item.reasoning.text,
duration,
})
}
if (item.toolCalls && item.toolCalls.length > 0) {
for (const tc of item.toolCalls) {
parts.push({
type: 'tool-call',
name: tc.toolName,
label: tc.label,
subject: tc.subject,
status: tc.status,
input: tc.input,
output: tc.output,
error: tc.error,
durationMs: tc.durationMs,
})
}
}
// Only emit a text part when there's actual content. User messages with
// only attachments and no caption shouldn't render an empty bubble.
if (item.text.trim().length > 0) {
parts.push({ type: 'text', text: item.text })
}
return {
id: item.id,
role: item.role,
sessionKey: item.sessionKey,
timestamp: item.timestamp,
source: item.source,
messageSeq: item.messageSeq,
status: 'historical',
parts,
costUsd: item.costUsd,
tokensIn: item.tokensIn,
tokensOut: item.tokensOut,
}
}
export function flattenHistoryPages(
pages: AgentHistoryPageResponse[],
): ClawChatMessage[] {
return pages
.flatMap((page) => page.items)
.sort((a, b) => {
if (a.timestamp != null && b.timestamp != null) {
return a.timestamp - b.timestamp
}
return a.messageSeq - b.messageSeq
})
.map(mapHistoryItemToClawMessage)
}
export function buildChatHistoryFromClawMessages(
messages: ClawChatMessage[],
): OpenClawChatHistoryMessage[] {
return messages
.map((message) => {
const content = message.parts
.filter((part): part is { type: 'text'; text: string } => {
return part.type === 'text' && part.text.trim().length > 0
})
.map((part) => part.text.trim())
.join('\n\n')
return content ? { role: message.role, content } : null
})
.filter((message): message is OpenClawChatHistoryMessage =>
Boolean(message),
)
}
const TURN_HISTORY_MATCH_WINDOW_MS = 5_000
export function filterTurnsPersistedInHistory(
turns: AgentConversationTurn[],
historyMessages: ClawChatMessage[],
): AgentConversationTurn[] {
return turns.filter(
(turn) => !isTurnPersistedInHistory(turn, historyMessages),
)
}
function isTurnPersistedInHistory(
turn: AgentConversationTurn,
historyMessages: ClawChatMessage[],
): boolean {
if (!turn.done) return false
const assistantText = getTurnAssistantText(turn)
if (!assistantText) return false
const minTimestamp = turn.timestamp - TURN_HISTORY_MATCH_WINDOW_MS
const userText = turn.userText.trim()
const userPersisted =
!userText ||
historyMessages.some(
(message) =>
message.role === 'user' &&
isHistoryMessageAfter(message, minTimestamp) &&
getClawMessageText(message) === userText,
)
const assistantPersisted = historyMessages.some(
(message) =>
message.role === 'assistant' &&
isHistoryMessageAfter(message, minTimestamp) &&
getClawMessageText(message) === assistantText,
)
return userPersisted && assistantPersisted
}
function isHistoryMessageAfter(
message: ClawChatMessage,
minTimestamp: number,
): boolean {
return message.timestamp == null || message.timestamp >= minTimestamp
}
function getTurnAssistantText(turn: AgentConversationTurn): string {
return turn.parts
.filter((part) => part.kind === 'text')
.map((part) => part.text)
.join('')
.trim()
}
function getClawMessageText(message: ClawChatMessage): string {
return message.parts
.filter((part) => part.type === 'text')
.map((part) => part.text)
.join('')
.trim()
}

View File

@@ -1,71 +0,0 @@
import { buildToolLabel } from '../../../lib/tool-labels'
import type { HarnessAgentHistoryPage } from '../agents/agent-harness-types'
import type {
AgentHistoryPageResponse,
BrowserOSChatHistoryItem,
BrowserOSChatHistoryToolCall,
} from './claw-chat-types'
export function mapHarnessHistoryPage(
page: HarnessAgentHistoryPage,
): AgentHistoryPageResponse {
const items: BrowserOSChatHistoryItem[] = page.items.map((item, index) => {
const toolCalls = item.toolCalls?.map(
(tool): BrowserOSChatHistoryToolCall => {
const input = asRecord(tool.input)
const { label, subject } = buildToolLabel(tool.toolName, input)
return {
toolName: tool.toolName,
label,
status: tool.status,
...(tool.toolCallId ? { toolCallId: tool.toolCallId } : {}),
...(subject ? { subject } : {}),
...(tool.input !== undefined ? { input: tool.input } : {}),
...(tool.output !== undefined ? { output: tool.output } : {}),
...(tool.error ? { error: tool.error } : {}),
...(tool.durationMs != null ? { durationMs: tool.durationMs } : {}),
}
},
)
return {
id: item.id,
role: item.role,
text: item.text,
timestamp: item.createdAt,
messageSeq: index + 1,
sessionKey: 'main',
source: 'user-chat',
...(item.reasoning ? { reasoning: item.reasoning } : {}),
...(toolCalls && toolCalls.length > 0 ? { toolCalls } : {}),
}
})
const updatedAt =
page.items.length > 0
? Math.max(...page.items.map((item) => item.createdAt))
: Date.now()
return {
agentId: page.agentId,
sessionKey: 'main',
session: {
key: 'main',
updatedAt,
sessionId: 'main',
agentId: page.agentId,
kind: 'agent-harness',
source: 'user-chat',
},
items,
page: {
hasMore: false,
limit: items.length,
},
}
}
function asRecord(value: unknown): Record<string, unknown> | undefined {
return value && typeof value === 'object' && !Array.isArray(value)
? (value as Record<string, unknown>)
: undefined
}

View File

@@ -1,53 +1,69 @@
import { useEffect, useState } from 'react'
import {
type AgentEntry,
getModelDisplayName,
type OpenClawStatus,
} from '@/entrypoints/app/agents/useOpenClaw'
import { getLatestConversation } from '@/lib/agent-conversations/storage'
import type { AgentCardData } from '@/lib/agent-conversations/types'
import type { AgentOverview } from './useAgentDashboard'
function resolveAgentStatus(
gatewayStatus: OpenClawStatus['status'] | undefined,
liveStatus: AgentOverview['status'] | undefined,
function getAgentStatusTone(
status: OpenClawStatus['status'] | undefined,
): AgentCardData['status'] {
// Gateway-level errors take precedence
if (gatewayStatus === 'error') return 'error'
if (gatewayStatus === 'starting') return 'working'
// Per-agent live status from the WS observer
if (liveStatus === 'working') return 'working'
if (liveStatus === 'error') return 'error'
if (status === 'error') return 'error'
if (status === 'starting') return 'working'
return 'idle'
}
/**
* Build agent card display data by merging the raw agent entries from
* the gateway with enriched overview data from the dashboard API.
*
* Pure function — no hooks, no IndexedDB, no async.
*/
export function buildAgentCardData(
async function getAgentCardData(
agent: AgentEntry,
status: OpenClawStatus['status'] | undefined,
): Promise<AgentCardData> {
const conversation = await getLatestConversation(agent.agentId)
const lastTurn = conversation?.turns[conversation.turns.length - 1]
const lastTextPart = lastTurn?.parts.findLast((part) => part.kind === 'text')
return {
agentId: agent.agentId,
name: agent.name,
model: getModelDisplayName(agent.model),
status: getAgentStatusTone(status),
lastMessage:
lastTextPart?.kind === 'text'
? lastTextPart.text.slice(0, 120)
: undefined,
lastMessageTimestamp: lastTurn?.timestamp,
}
}
export function useAgentCardData(
agents: AgentEntry[],
status: OpenClawStatus['status'] | undefined,
dashboard: AgentOverview[] | undefined,
): AgentCardData[] {
return agents.map((agent) => {
const overview = dashboard?.find((d) => d.agentId === agent.agentId)
) {
const [cardData, setCardData] = useState<AgentCardData[]>([])
return {
agentId: agent.agentId,
name: agent.name,
model: getModelDisplayName(agent.model),
status:
agent.source === 'agent-harness'
? 'idle'
: resolveAgentStatus(status, overview?.status),
lastMessage: overview?.latestMessage?.slice(0, 200) ?? undefined,
lastMessageTimestamp: overview?.latestMessageAt ?? undefined,
activitySummary: overview?.activitySummary ?? undefined,
currentTool: overview?.currentTool ?? undefined,
costUsd: overview?.totalCostUsd ?? undefined,
useEffect(() => {
let active = true
const loadCardData = async () => {
const nextCardData = await Promise.all(
agents.map((agent) => getAgentCardData(agent, status)),
)
if (active) {
setCardData(nextCardData)
}
}
})
if (agents.length > 0) {
void loadCardData()
} else {
setCardData([])
}
return () => {
active = false
}
}, [agents, status])
return cardData
}

View File

@@ -1,77 +1,52 @@
import { useEffect, useRef, useState } from 'react'
import {
type AgentHarnessStreamEvent,
attachToHarnessTurn,
cancelHarnessTurn,
chatWithHarnessAgent,
fetchActiveHarnessTurn,
} from '@/entrypoints/app/agents/useAgents'
import type { OpenClawChatHistoryMessage } from '@/entrypoints/app/agents/useOpenClaw'
buildChatHistoryFromTurns,
chatWithAgent,
type OpenClawStreamEvent,
} from '@/entrypoints/app/agents/useOpenClaw'
import {
getLatestConversation,
saveConversation,
} from '@/lib/agent-conversations/storage'
import type {
AgentConversation,
AgentConversationTurn,
AssistantPart,
ToolEntry,
UserAttachmentPreview,
} from '@/lib/agent-conversations/types'
import type { ServerAttachmentPayload } from '@/lib/attachments'
import { consumeSSEStream } from '@/lib/sse'
import { buildToolLabel } from '@/lib/tool-labels'
import { mapAgentHarnessToolStatus } from './agent-stream-events'
export interface SendInput {
text: string
attachments?: ServerAttachmentPayload[]
// Optional preview metadata used to render the optimistic user turn.
// Built by the composer at staging time; the server only sees the
// payload array.
attachmentPreviews?: UserAttachmentPreview[]
}
interface UseAgentConversationOptions {
// The hook always speaks to the harness chat path now; the OpenClaw
// legacy /claw/agents/:id/chat surface was removed in Step 12. The
// option remains for forward-compatibility.
runtime?: 'agent-harness'
sessionKey?: string | null
history?: OpenClawChatHistoryMessage[]
onComplete?: () => void
onSessionKeyChange?: (sessionKey: string) => void
}
export function useAgentConversation(
agentId: string,
options: UseAgentConversationOptions = {},
) {
export function useAgentConversation(agentId: string, agentName: string) {
const [turns, setTurns] = useState<AgentConversationTurn[]>([])
const [streaming, setStreaming] = useState(false)
const sessionKeyRef = useRef(options.sessionKey ?? '')
const historyRef = useRef<OpenClawChatHistoryMessage[]>(options.history ?? [])
const [loading, setLoading] = useState(true)
const sessionKeyRef = useRef('')
const textAccRef = useRef('')
const thinkAccRef = useRef('')
const streamAbortRef = useRef<AbortController | null>(null)
const onCompleteRef = useRef(options.onComplete)
const onSessionKeyChangeRef = useRef(options.onSessionKeyChange)
// Per-turn resume bookkeeping. `turnId` is captured from the response
// header; `lastSeq` advances with every SSE event so a reconnect can
// resume via Last-Event-ID.
const turnIdRef = useRef<string | null>(null)
const lastSeqRef = useRef<number | null>(null)
useEffect(() => {
sessionKeyRef.current = options.sessionKey ?? ''
}, [options.sessionKey])
useEffect(() => {
historyRef.current = options.history ?? []
}, [options.history])
useEffect(() => {
onCompleteRef.current = options.onComplete
}, [options.onComplete])
useEffect(() => {
onSessionKeyChangeRef.current = options.onSessionKeyChange
}, [options.onSessionKeyChange])
let active = true
getLatestConversation(agentId)
.then((conv) => {
if (!active) return
if (conv) {
setTurns(conv.turns)
sessionKeyRef.current = conv.sessionKey
} else {
sessionKeyRef.current = crypto.randomUUID()
}
setLoading(false)
})
.catch(() => {
if (active) {
sessionKeyRef.current = crypto.randomUUID()
setLoading(false)
}
})
return () => {
active = false
}
}, [agentId])
useEffect(() => {
return () => {
@@ -79,11 +54,17 @@ export function useAgentConversation(
}
}, [])
// Indirection for the resume effect below: lets it call the latest
// event handler without re-subscribing on every render.
const processEventRef = useRef<(event: AgentHarnessStreamEvent) => void>(
() => {},
)
const persistTurns = (updatedTurns: AgentConversationTurn[]) => {
const conv: AgentConversation = {
agentId,
agentName,
sessionKey: sessionKeyRef.current,
turns: updatedTurns,
createdAt: updatedTurns[0]?.timestamp ?? Date.now(),
updatedAt: Date.now(),
}
saveConversation(conv).catch(() => {})
}
const updateCurrentTurnParts = (
updater: (parts: AssistantPart[]) => AssistantPart[],
@@ -95,210 +76,123 @@ export function useAgentConversation(
})
}
const appendTextDelta = (delta: string) => {
textAccRef.current += delta
const text = textAccRef.current
updateCurrentTurnParts((parts) => {
const last = parts[parts.length - 1]
if (last?.kind === 'text') {
return [...parts.slice(0, -1), { ...last, text }]
}
return [...parts, { kind: 'text', text }]
})
}
const appendThinkingDelta = (delta: string) => {
thinkAccRef.current += delta
const text = thinkAccRef.current
updateCurrentTurnParts((parts) => {
const idx = parts.findIndex((p) => p.kind === 'thinking' && !p.done)
if (idx >= 0) {
return [
...parts.slice(0, idx),
{ ...parts[idx], text, done: false },
...parts.slice(idx + 1),
]
}
return [...parts, { kind: 'thinking', text, done: false }]
})
}
const appendErrorText = (message: string) => {
updateCurrentTurnParts((parts) => [
...parts,
{ kind: 'text', text: `Error: ${message}` },
])
}
const markCurrentTurnDone = () => {
updateCurrentTurnParts((parts) =>
parts.map((part) =>
part.kind === 'thinking' ? { ...part, done: true } : part,
),
)
setTurns((prev) => {
const last = prev[prev.length - 1]
if (!last) return prev
return [...prev.slice(0, -1), { ...last, done: true }]
})
}
const upsertAgentHarnessTool = (event: AgentHarnessStreamEvent) => {
if (event.type !== 'tool_call') return
const rawName = event.title || event.rawType || 'tool call'
const { label, subject } = buildToolLabel(
rawName,
event.text ? { description: event.text } : undefined,
)
const tool: ToolEntry = {
id: event.id ?? crypto.randomUUID(),
name: rawName,
label,
subject,
status: mapAgentHarnessToolStatus(event.status),
}
updateCurrentTurnParts((parts) => {
for (let i = parts.length - 1; i >= 0; i--) {
const part = parts[i]
if (
part.kind === 'tool-batch' &&
part.tools.some((existing) => existing.id === tool.id)
) {
const tools = part.tools.map((existing) =>
existing.id === tool.id ? { ...existing, ...tool } : existing,
)
return [
...parts.slice(0, i),
{ ...part, tools },
...parts.slice(i + 1),
]
}
}
const last = parts[parts.length - 1]
if (last?.kind === 'tool-batch') {
return [
...parts.slice(0, -1),
{ ...last, tools: [...last.tools, tool] },
]
}
return [...parts, { kind: 'tool-batch', tools: [tool] }]
})
}
const processAgentHarnessStreamEvent = (event: AgentHarnessStreamEvent) => {
const processStreamEvent = (event: OpenClawStreamEvent) => {
switch (event.type) {
case 'text_delta':
if (event.stream === 'thought') {
appendThinkingDelta(event.text)
} else {
appendTextDelta(event.text)
}
break
case 'tool_call':
upsertAgentHarnessTool(event)
break
case 'done':
markCurrentTurnDone()
break
case 'error':
appendErrorText(event.message)
break
case 'status':
break
}
}
processEventRef.current = processAgentHarnessStreamEvent
// On mount (and whenever the agent changes), check whether the
// server has an in-flight turn for this agent and reattach to it.
// This is what makes the chat resilient across tab close/reopen,
// refresh, and navigation: the runtime call kept running on the
// server while we were away. Effect only depends on `agentId` —
// the event handler is read off a ref so this doesn't re-subscribe
// every render.
useEffect(() => {
let cancelled = false
const abortController = new AbortController()
const attemptResume = async () => {
try {
const active = await fetchActiveHarnessTurn(agentId)
if (cancelled || !active || active.status !== 'running') return
if (streamAbortRef.current) return // a fresh send already in flight
// Stage a placeholder turn so the streamed events have a row
// to render into. We don't have the user message text on
// resume; the assistant turn is what we're catching up on.
setTurns((prev) => [
...prev,
{
id: crypto.randomUUID(),
userText: '',
parts: [],
done: false,
timestamp: active.startedAt,
},
])
textAccRef.current = ''
thinkAccRef.current = ''
turnIdRef.current = active.turnId
lastSeqRef.current = null
streamAbortRef.current = abortController
setStreaming(true)
const response = await attachToHarnessTurn(agentId, {
turnId: active.turnId,
signal: abortController.signal,
})
if (!response.ok) return
await consumeSSEStream<AgentHarnessStreamEvent>(
response,
(event, meta) => {
if (typeof meta.seq === 'number') lastSeqRef.current = meta.seq
processEventRef.current(event)
},
abortController.signal,
)
} catch {
// Resume is best-effort; transient errors fall back to the
// user starting a new turn manually.
} finally {
if (!cancelled) {
if (streamAbortRef.current === abortController) {
streamAbortRef.current = null
case 'text-delta': {
const delta = (event.data.text as string) ?? ''
textAccRef.current += delta
const text = textAccRef.current
updateCurrentTurnParts((parts) => {
const last = parts[parts.length - 1]
if (last?.kind === 'text') {
return [...parts.slice(0, -1), { ...last, text }]
}
turnIdRef.current = null
lastSeqRef.current = null
setStreaming(false)
return [...parts, { kind: 'text', text }]
})
break
}
case 'thinking': {
const delta = (event.data.text as string) ?? ''
thinkAccRef.current += delta
const text = thinkAccRef.current
updateCurrentTurnParts((parts) => {
const idx = parts.findIndex((p) => p.kind === 'thinking' && !p.done)
if (idx >= 0) {
return [
...parts.slice(0, idx),
{ ...parts[idx], text, done: false },
...parts.slice(idx + 1),
]
}
return [...parts, { kind: 'thinking', text, done: false }]
})
break
}
case 'tool-start': {
const tool = {
id: (event.data.toolCallId as string) ?? crypto.randomUUID(),
name: (event.data.toolName as string) ?? 'unknown',
status: 'running' as const,
}
updateCurrentTurnParts((parts) => {
const last = parts[parts.length - 1]
if (last?.kind === 'tool-batch') {
return [
...parts.slice(0, -1),
{ ...last, tools: [...last.tools, tool] },
]
}
return [...parts, { kind: 'tool-batch', tools: [tool] }]
})
break
}
case 'tool-end': {
const toolId = event.data.toolCallId as string
const toolStatus: 'completed' | 'error' =
(event.data.status as string) === 'error' ? 'error' : 'completed'
const durationMs = event.data.durationMs as number | undefined
updateCurrentTurnParts((parts) => {
for (let i = parts.length - 1; i >= 0; i--) {
const part = parts[i]
if (
part.kind === 'tool-batch' &&
part.tools.some((t) => t.id === toolId)
) {
const updatedTools = part.tools.map((t) =>
t.id === toolId ? { ...t, status: toolStatus, durationMs } : t,
)
return [
...parts.slice(0, i),
{ ...part, tools: updatedTools },
...parts.slice(i + 1),
]
}
}
return parts
})
break
}
case 'done': {
updateCurrentTurnParts((parts) =>
parts.map((part) =>
part.kind === 'thinking' ? { ...part, done: true } : part,
),
)
setTurns((prev) => {
const last = prev[prev.length - 1]
if (!last) return prev
const updated = [...prev.slice(0, -1), { ...last, done: true }]
persistTurns(updated)
return updated
})
break
}
case 'error': {
const msg =
(event.data.message as string) ??
(event.data.error as string) ??
'Unknown error'
updateCurrentTurnParts((parts) => [
...parts,
{ kind: 'text', text: `Error: ${msg}` },
])
break
}
}
}
void attemptResume()
return () => {
cancelled = true
abortController.abort()
}
}, [agentId])
const send = async (input: string | SendInput) => {
const normalized: SendInput =
typeof input === 'string' ? { text: input } : input
const trimmed = normalized.text.trim()
const attachments = normalized.attachments ?? []
if (streaming) return
if (!trimmed && attachments.length === 0) return
const send = async (text: string) => {
if (!text.trim() || streaming) return
const history = buildChatHistoryFromTurns(turns)
const turn: AgentConversationTurn = {
id: crypto.randomUUID(),
userText: trimmed,
userAttachments:
normalized.attachmentPreviews &&
normalized.attachmentPreviews.length > 0
? normalized.attachmentPreviews
: undefined,
userText: text.trim(),
parts: [],
done: false,
timestamp: Date.now(),
@@ -311,37 +205,13 @@ export function useAgentConversation(
streamAbortRef.current = abortController
try {
let response = await chatWithHarnessAgent(
const response = await chatWithAgent(
agentId,
trimmed,
text.trim(),
sessionKeyRef.current,
history,
abortController.signal,
attachments,
)
// 409 means the server already has an active turn for this
// agent (e.g. a previous tab kicked one off and we're a fresh
// mount that missed the resume window). Attach to it instead of
// double-sending.
if (response.status === 409) {
const body = (await response.json()) as { turnId?: string }
if (body.turnId) {
response = await attachToHarnessTurn(agentId, {
turnId: body.turnId,
signal: abortController.signal,
})
}
}
const responseSessionKey =
response.headers.get('X-Session-Key') ??
response.headers.get('X-Session-Id')
if (responseSessionKey) {
sessionKeyRef.current = responseSessionKey
onSessionKeyChangeRef.current?.(responseSessionKey)
}
const responseTurnId = response.headers.get('X-Turn-Id')
if (responseTurnId) {
turnIdRef.current = responseTurnId
lastSeqRef.current = null
}
if (!response.ok) {
const err = await response.text()
updateCurrentTurnParts((parts) => [
@@ -350,12 +220,9 @@ export function useAgentConversation(
])
return
}
await consumeSSEStream<AgentHarnessStreamEvent>(
await consumeSSEStream(
response,
(event, meta) => {
if (typeof meta.seq === 'number') lastSeqRef.current = meta.seq
processAgentHarnessStreamEvent(event)
},
processStreamEvent,
abortController.signal,
)
} catch (err) {
@@ -369,45 +236,24 @@ export function useAgentConversation(
if (streamAbortRef.current === abortController) {
streamAbortRef.current = null
}
turnIdRef.current = null
lastSeqRef.current = null
onCompleteRef.current?.()
setStreaming(false)
}
}
/**
* Stop button. The fetch abort only detaches *this* SSE subscriber
* now — the underlying turn would otherwise keep running on the
* server. So we explicitly cancel via the new endpoint, then unwind
* the local stream.
*/
const stop = async () => {
const turnId = turnIdRef.current ?? undefined
const resetConversation = () => {
streamAbortRef.current?.abort()
streamAbortRef.current = null
try {
await cancelHarnessTurn(agentId, {
turnId,
reason: 'user pressed stop',
})
} catch {
// Best-effort — UI already aborted.
}
}
const resetConversation = () => {
void stop()
setTurns([])
setStreaming(false)
sessionKeyRef.current = crypto.randomUUID()
}
return {
turns,
streaming,
loading,
sessionKey: sessionKeyRef.current,
send,
stop,
resetConversation,
}
}

View File

@@ -1,95 +0,0 @@
import { useQuery, useQueryClient } from '@tanstack/react-query'
import { useEffect } from 'react'
import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
export interface AgentOverview {
agentId: string
status: 'working' | 'idle' | 'error' | 'unknown'
latestMessage: string | null
latestMessageAt: number | null
activitySummary: string | null
currentTool: string | null
totalCostUsd: number
sessionCount: number
}
export interface DashboardResponse {
agents: AgentOverview[]
summary: {
totalAgents: number
totalCostUsd: number
}
}
interface StatusEvent {
agentId: string
status: AgentOverview['status']
currentTool: string | null
error: string | null
timestamp: number
}
const DASHBOARD_QUERY_KEY = ['claw', 'dashboard']
export function useAgentDashboard(enabled: boolean) {
const { baseUrl, isLoading: urlLoading } = useAgentServerUrl()
const queryClient = useQueryClient()
const ready = enabled && Boolean(baseUrl) && !urlLoading
// Initial data load + periodic refresh as fallback
const query = useQuery<DashboardResponse>({
queryKey: [...DASHBOARD_QUERY_KEY, baseUrl],
queryFn: async () => {
const url = new URL('/claw/dashboard', baseUrl as string)
const response = await fetch(url.toString())
if (!response.ok) throw new Error('Failed to fetch dashboard')
return response.json()
},
enabled: ready,
})
// SSE subscription for real-time status patches
useEffect(() => {
if (!ready || !baseUrl) return
const streamUrl = new URL('/claw/dashboard/stream', baseUrl)
const eventSource = new EventSource(streamUrl.toString())
eventSource.addEventListener('snapshot', (event) => {
try {
const dashboard = JSON.parse(event.data) as DashboardResponse
queryClient.setQueryData([...DASHBOARD_QUERY_KEY, baseUrl], dashboard)
} catch {}
})
eventSource.addEventListener('status', (event) => {
try {
const status = JSON.parse(event.data) as StatusEvent
queryClient.setQueryData<DashboardResponse>(
[...DASHBOARD_QUERY_KEY, baseUrl],
(prev) => {
if (!prev) return prev
return {
...prev,
agents: prev.agents.map((agent) =>
agent.agentId === status.agentId
? {
...agent,
status: status.status,
currentTool: status.currentTool,
}
: agent,
),
}
},
)
} catch {}
})
return () => {
eventSource.close()
}
}, [ready, baseUrl, queryClient])
return query
}

View File

@@ -1,55 +0,0 @@
import { describe, expect, it } from 'bun:test'
import { mapHarnessHistoryPage } from './harness-history-mapper'
describe('mapHarnessHistoryPage', () => {
it('maps rich harness history into chat history items', () => {
const page = mapHarnessHistoryPage({
agentId: 'agent-1',
sessionId: 'main',
items: [
{
id: 'agent:agent-1:main:1',
agentId: 'agent-1',
sessionId: 'main',
role: 'assistant',
text: 'Done.',
createdAt: 1000,
reasoning: { text: 'checking state' },
toolCalls: [
{
toolCallId: 'tool-1',
toolName: 'read_file',
status: 'completed',
input: { path: 'src/index.ts' },
output: 'file contents',
},
],
},
],
})
expect(page.items).toEqual([
{
id: 'agent:agent-1:main:1',
role: 'assistant',
text: 'Done.',
timestamp: 1000,
messageSeq: 1,
sessionKey: 'main',
source: 'user-chat',
reasoning: { text: 'checking state' },
toolCalls: [
{
toolCallId: 'tool-1',
toolName: 'read_file',
label: 'Read file',
subject: 'index.ts',
status: 'completed',
input: { path: 'src/index.ts' },
output: 'file contents',
},
],
},
])
})
})

View File

@@ -1,29 +0,0 @@
import { useQuery } from '@tanstack/react-query'
import { fetchHarnessAgentHistory } from '@/entrypoints/app/agents/useAgents'
import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
import type { AgentHistoryPageResponse } from './claw-chat-types'
import { mapHarnessHistoryPage } from './harness-history-mapper'
const HISTORY_QUERY_KEY = 'harness-agent-history'
export function useHarnessChatHistory(agentId: string, enabled = true) {
const {
baseUrl,
isLoading: urlLoading,
error: urlError,
} = useAgentServerUrl()
const query = useQuery<AgentHistoryPageResponse, Error>({
queryKey: [HISTORY_QUERY_KEY, baseUrl, agentId, 'main'],
queryFn: async () => {
return mapHarnessHistoryPage(await fetchHarnessAgentHistory(agentId))
},
enabled: Boolean(baseUrl) && !urlLoading && enabled && Boolean(agentId),
})
return {
...query,
error: query.error ?? urlError,
isLoading: query.isLoading || urlLoading,
}
}

View File

@@ -1,42 +0,0 @@
import { Bot, Cpu, Sparkles } from 'lucide-react'
import type { FC } from 'react'
import type { HarnessAgentAdapter } from './agent-harness-types'
/**
* Single icon component for any adapter the agent rail can render.
* Falls back to a generic bot when the adapter is unknown so future
* adapters land without a code change at the call site.
*/
interface AdapterIconProps {
adapter: HarnessAgentAdapter | 'unknown'
className?: string
}
export const AdapterIcon: FC<AdapterIconProps> = ({ adapter, className }) => {
switch (adapter) {
case 'claude':
// Claude Code — text-based agent, sparkles to evoke the "AI assistant" feel.
return <Sparkles className={className} aria-label="Claude Code" />
case 'codex':
// Codex — code-leaning, CPU mark.
return <Cpu className={className} aria-label="Codex" />
case 'openclaw':
// OpenClaw — bot/automation framing.
return <Bot className={className} aria-label="OpenClaw" />
default:
return <Bot className={className} aria-label="Agent" />
}
}
export function adapterLabel(adapter: HarnessAgentAdapter | 'unknown'): string {
switch (adapter) {
case 'claude':
return 'Claude Code'
case 'codex':
return 'Codex'
case 'openclaw':
return 'OpenClaw'
default:
return 'Agent'
}
}

View File

@@ -0,0 +1,399 @@
import {
ArrowLeft,
Bot,
CheckCircle2,
Loader2,
Send,
XCircle,
} from 'lucide-react'
import { type FC, useEffect, useRef, useState } from 'react'
import {
Message,
MessageContent,
MessageResponse,
} from '@/components/ai-elements/message'
import {
Reasoning,
ReasoningContent,
ReasoningTrigger,
} from '@/components/ai-elements/reasoning'
import { Button } from '@/components/ui/button'
import { Textarea } from '@/components/ui/textarea'
import { consumeSSEStream } from '@/lib/sse'
import {
buildChatHistoryFromTurns,
chatWithAgent,
type OpenClawStreamEvent,
} from './useOpenClaw'
interface ToolEntry {
id: string
name: string
status: 'running' | 'completed' | 'error'
durationMs?: number
}
type AssistantPart =
| { kind: 'thinking'; text: string; done: boolean }
| { kind: 'tool-batch'; tools: ToolEntry[] }
| { kind: 'text'; text: string }
interface ChatTurn {
id: string
userText: string
parts: AssistantPart[]
done: boolean
}
interface AgentChatProps {
agentId: string
agentName: string
onBack: () => void
}
export const AgentChat: FC<AgentChatProps> = ({
agentId,
agentName,
onBack,
}) => {
const [turns, setTurns] = useState<ChatTurn[]>([])
const [input, setInput] = useState('')
const [streaming, setStreaming] = useState(false)
const scrollRef = useRef<HTMLDivElement>(null)
const sessionKeyRef = useRef(crypto.randomUUID())
const streamAbortRef = useRef<AbortController | null>(null)
const textAccRef = useRef('')
const thinkAccRef = useRef('')
const scrollToBottom = () => {
scrollRef.current?.scrollTo(0, scrollRef.current.scrollHeight)
}
// biome-ignore lint/correctness/useExhaustiveDependencies: scroll on every turns change
useEffect(() => {
scrollToBottom()
}, [turns])
useEffect(() => {
return () => {
streamAbortRef.current?.abort()
}
}, [])
const updateCurrentTurnParts = (
updater: (parts: AssistantPart[]) => AssistantPart[],
) => {
setTurns((prev) => {
const last = prev[prev.length - 1]
if (!last) return prev
return [...prev.slice(0, -1), { ...last, parts: updater(last.parts) }]
})
}
const processStreamEvent = (event: OpenClawStreamEvent) => {
switch (event.type) {
case 'text-delta': {
const delta = (event.data.text as string) ?? ''
textAccRef.current += delta
const text = textAccRef.current
updateCurrentTurnParts((parts) => {
const last = parts[parts.length - 1]
if (last?.kind === 'text') {
return [...parts.slice(0, -1), { ...last, text }]
}
return [...parts, { kind: 'text', text }]
})
break
}
case 'thinking': {
const delta = (event.data.text as string) ?? ''
thinkAccRef.current += delta
const text = thinkAccRef.current
updateCurrentTurnParts((parts) => {
const idx = parts.findIndex((p) => p.kind === 'thinking' && !p.done)
if (idx >= 0) {
return [
...parts.slice(0, idx),
{ ...parts[idx], text, done: false },
...parts.slice(idx + 1),
]
}
return [...parts, { kind: 'thinking', text, done: false }]
})
break
}
case 'tool-start': {
const tool: ToolEntry = {
id: (event.data.toolCallId as string) ?? crypto.randomUUID(),
name: (event.data.toolName as string) ?? 'unknown',
status: 'running',
}
updateCurrentTurnParts((parts) => {
const last = parts[parts.length - 1]
if (last?.kind === 'tool-batch') {
return [
...parts.slice(0, -1),
{ ...last, tools: [...last.tools, tool] },
]
}
return [...parts, { kind: 'tool-batch', tools: [tool] }]
})
break
}
case 'tool-end': {
const toolId = event.data.toolCallId as string
const status =
(event.data.status as string) === 'error' ? 'error' : 'completed'
const durationMs = event.data.durationMs as number | undefined
updateCurrentTurnParts((parts) => {
for (let i = parts.length - 1; i >= 0; i--) {
const part = parts[i]
if (
part.kind === 'tool-batch' &&
part.tools.some((t) => t.id === toolId)
) {
const updatedTools = part.tools.map((t) =>
t.id === toolId
? {
...t,
status: status as ToolEntry['status'],
durationMs,
}
: t,
)
return [
...parts.slice(0, i),
{ ...part, tools: updatedTools },
...parts.slice(i + 1),
]
}
}
return parts
})
break
}
case 'done': {
updateCurrentTurnParts((parts) =>
parts.map((part) =>
part.kind === 'thinking' ? { ...part, done: true } : part,
),
)
setTurns((prev) => {
const last = prev[prev.length - 1]
if (!last) return prev
return [...prev.slice(0, -1), { ...last, done: true }]
})
break
}
case 'error': {
const msg =
(event.data.message as string) ??
(event.data.error as string) ??
'Unknown error'
updateCurrentTurnParts((parts) => [
...parts,
{ kind: 'text', text: `Error: ${msg}` },
])
break
}
}
}
const handleSend = async () => {
const text = input.trim()
if (!text || streaming) return
const history = buildChatHistoryFromTurns(turns)
const turn: ChatTurn = {
id: crypto.randomUUID(),
userText: text,
parts: [],
done: false,
}
setTurns((prev) => [...prev, turn])
setInput('')
setStreaming(true)
textAccRef.current = ''
thinkAccRef.current = ''
const abortController = new AbortController()
streamAbortRef.current = abortController
try {
const response = await chatWithAgent(
agentId,
text,
sessionKeyRef.current,
history,
abortController.signal,
)
if (!response.ok) {
const err = await response.text()
updateCurrentTurnParts((parts) => [
...parts,
{ kind: 'text', text: `Error: ${err}` },
])
return
}
await consumeSSEStream(
response,
processStreamEvent,
abortController.signal,
)
} catch (err) {
if (abortController.signal.aborted) return
const msg = err instanceof Error ? err.message : String(err)
updateCurrentTurnParts((parts) => [
...parts,
{ kind: 'text', text: `Error: ${msg}` },
])
} finally {
if (streamAbortRef.current === abortController) {
streamAbortRef.current = null
}
setStreaming(false)
}
}
return (
<div className="flex h-[calc(100vh-4rem)] flex-col">
<div className="flex items-center gap-2 border-b px-4 py-3">
<Button variant="ghost" size="icon" onClick={onBack}>
<ArrowLeft className="size-4" />
</Button>
<h2 className="font-semibold text-lg">{agentName}</h2>
</div>
<div ref={scrollRef} className="flex-1 space-y-4 overflow-y-auto p-4">
{turns.map((turn) => (
<div key={turn.id} className="space-y-3">
{/* User message */}
<Message from="user">
<MessageContent>
<pre className="whitespace-pre-wrap font-sans text-sm">
{turn.userText}
</pre>
</MessageContent>
</Message>
{/* Assistant response — all parts grouped */}
{turn.parts.length > 0 && (
<Message from="assistant">
<MessageContent>
{turn.parts.map((part, i) => {
const key = `${turn.id}-part-${i}`
switch (part.kind) {
case 'thinking':
return (
<Reasoning
key={key}
className="w-full"
isStreaming={!part.done}
defaultOpen={!part.done}
>
<ReasoningTrigger />
<ReasoningContent>{part.text}</ReasoningContent>
</Reasoning>
)
case 'tool-batch':
return (
<div key={key} className="w-full space-y-1">
{part.tools.map((tool) => (
<div
key={tool.id}
className="flex items-center gap-2 rounded-md border px-3 py-2 text-sm"
>
{tool.status === 'running' && (
<Loader2 className="size-3.5 animate-spin text-muted-foreground" />
)}
{tool.status === 'completed' && (
<CheckCircle2 className="size-3.5 text-green-500" />
)}
{tool.status === 'error' && (
<XCircle className="size-3.5 text-destructive" />
)}
<span className="font-mono text-xs">
{tool.name}
</span>
{tool.durationMs != null && (
<span className="ml-auto text-muted-foreground text-xs">
{(tool.durationMs / 1000).toFixed(1)}s
</span>
)}
</div>
))}
</div>
)
case 'text':
return (
<MessageResponse key={key}>
{part.text}
</MessageResponse>
)
default:
return null
}
})}
</MessageContent>
</Message>
)}
{/* Streaming indicator when waiting for first part */}
{!turn.done && turn.parts.length === 0 && streaming && (
<div className="flex gap-2">
<div className="flex h-7 w-7 shrink-0 items-center justify-center rounded-full bg-[var(--accent-orange)] text-white">
<Bot className="h-3.5 w-3.5" />
</div>
<div className="flex items-center gap-1 rounded-xl rounded-tl-none border border-border/50 bg-card px-3 py-2.5 shadow-sm">
<span className="h-1.5 w-1.5 animate-bounce rounded-full bg-[var(--accent-orange)] [animation-delay:-0.3s]" />
<span className="h-1.5 w-1.5 animate-bounce rounded-full bg-[var(--accent-orange)] [animation-delay:-0.15s]" />
<span className="h-1.5 w-1.5 animate-bounce rounded-full bg-[var(--accent-orange)]" />
</div>
</div>
)}
</div>
))}
</div>
<div className="border-t p-4">
<div className="flex gap-2">
<Textarea
value={input}
onChange={(e) => setInput(e.target.value)}
onKeyDown={(e) => {
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault()
handleSend()
}
}}
placeholder="Send a message..."
className="min-h-[44px] resize-none"
rows={1}
/>
<Button
onClick={handleSend}
disabled={!input.trim() || streaming}
size="icon"
>
{streaming ? (
<Loader2 className="size-4 animate-spin" />
) : (
<Send className="size-4" />
)}
</Button>
</div>
</div>
</div>
)
}

View File

@@ -1,108 +0,0 @@
import { Loader2 } from 'lucide-react'
import { type FC, useMemo } from 'react'
import { AgentRowCard } from './AgentRowCard'
import { AgentsEmptyState } from './AgentsEmptyState'
import type { HarnessAgent, HarnessAgentAdapter } from './agent-harness-types'
import type { AgentListItem } from './agents-page-types'
import type { AgentLiveness } from './LivenessDot'
interface AgentListProps {
agents: AgentListItem[]
/**
* Optional per-agent activity metadata. Keyed by `agentId`. Missing
* entries fall back to status='unknown' / lastUsedAt=null and the
* row renders an "unknown" dot. The server will populate this once
* the activity tracker ships; the page works without it.
*/
activity?: Record<
string,
{ status: AgentLiveness; lastUsedAt: number | null }
>
/**
* Lookup table from harness agent id → adapter + reasoning effort,
* sourced from `useHarnessAgents`. Lets the row card render the
* correct adapter icon and chips for harness agents (legacy
* /claw/agents entries fall back to inferring from `runtimeLabel`).
*/
harnessAgentLookup?: Map<string, HarnessAgent>
loading: boolean
deletingAgentKey: string | null
onCreateAgent: () => void
onDeleteAgent: (agent: AgentListItem) => void
}
export const AgentList: FC<AgentListProps> = ({
agents,
activity,
harnessAgentLookup,
loading,
deletingAgentKey,
onCreateAgent,
onDeleteAgent,
}) => {
// Sort by recency: most recently used first; never-used agents drop
// to the bottom in id-stable order so the list doesn't reshuffle on
// every refresh. The pinned exception is the gateway's `main` agent
// when it's never been touched — keep it at the top so a fresh
// install has an obvious starting point.
const ordered = useMemo(() => {
const withScore = agents.map((agent) => {
const lastUsedAt = activity?.[agent.agentId]?.lastUsedAt ?? null
return { agent, lastUsedAt }
})
return withScore
.sort((a, b) => {
const aPinned = a.agent.agentId === 'main' && a.lastUsedAt === null
const bPinned = b.agent.agentId === 'main' && b.lastUsedAt === null
if (aPinned && !bPinned) return -1
if (!aPinned && bPinned) return 1
const aValue = a.lastUsedAt ?? -Infinity
const bValue = b.lastUsedAt ?? -Infinity
if (aValue !== bValue) return bValue - aValue
return a.agent.agentId.localeCompare(b.agent.agentId)
})
.map((entry) => entry.agent)
}, [activity, agents])
if (loading && agents.length === 0) {
return (
<div className="flex h-36 items-center justify-center rounded-xl border border-border border-dashed bg-card/50">
<Loader2 className="size-5 animate-spin text-muted-foreground" />
</div>
)
}
if (agents.length === 0) {
return <AgentsEmptyState onCreateAgent={onCreateAgent} />
}
return (
<div className="grid gap-3">
{ordered.map((agent) => {
const harness = harnessAgentLookup?.get(agent.agentId)
const adapter: HarnessAgentAdapter | undefined =
harness?.adapter ?? inferAdapterFromLabel(agent.runtimeLabel)
return (
<AgentRowCard
key={agent.key}
agent={agent}
status={activity?.[agent.agentId]?.status}
lastUsedAt={activity?.[agent.agentId]?.lastUsedAt}
adapter={adapter}
reasoningEffort={harness?.reasoningEffort ?? null}
onDelete={onDeleteAgent}
deleting={deletingAgentKey === agent.key}
/>
)
})}
</div>
)
}
function inferAdapterFromLabel(label: string): HarnessAgentAdapter | undefined {
const lower = label?.toLowerCase()
if (lower === 'claude code') return 'claude'
if (lower === 'codex') return 'codex'
if (lower === 'openclaw') return 'openclaw'
return undefined
}

View File

@@ -1,270 +0,0 @@
import {
Copy,
Loader2,
MessageSquare,
MoreHorizontal,
Pencil,
RotateCcw,
Trash2,
} from 'lucide-react'
import type { FC } from 'react'
import { useNavigate } from 'react-router'
import { toast } from 'sonner'
import { Badge } from '@/components/ui/badge'
import { Button } from '@/components/ui/button'
import {
DropdownMenu,
DropdownMenuContent,
DropdownMenuItem,
DropdownMenuSeparator,
DropdownMenuTrigger,
} from '@/components/ui/dropdown-menu'
import {
Tooltip,
TooltipContent,
TooltipProvider,
TooltipTrigger,
} from '@/components/ui/tooltip'
import { cn } from '@/lib/utils'
import { AdapterIcon, adapterLabel } from './AdapterIcon'
import {
canDelete as canDeleteAgent,
canRename as canRenameAgent,
displayName,
formatRelativeTime,
workspaceLabel,
} from './agent-display.helpers'
import type { HarnessAgentAdapter } from './agent-harness-types'
import type { AgentListItem } from './agents-page-types'
import { type AgentLiveness, LivenessDot } from './LivenessDot'
interface AgentRowCardProps {
agent: AgentListItem
/**
* Per-agent extras the listing surface provides on top of the
* minimal `AgentListItem` shape. `lastUsedAt` survives server
* restart (sourced from acpx session record); `status` is in-memory
* server-side.
*/
status?: AgentLiveness
lastUsedAt?: number | null
/** Adapter the agent belongs to. Drives icon + label. */
adapter?: HarnessAgentAdapter
/** Reasoning effort chip (claude/codex/openclaw catalog). */
reasoningEffort?: string | null
/** Modeled directly off the inbound delete handler so the parent owns the dialog. */
onDelete: (agent: AgentListItem) => void
/** Whether THIS agent is mid-delete; renders a spinner in place of the trash icon. */
deleting?: boolean
}
export const AgentRowCard: FC<AgentRowCardProps> = ({
agent,
status = 'unknown',
lastUsedAt,
adapter,
reasoningEffort,
onDelete,
deleting,
}) => {
const navigate = useNavigate()
const adapterId = adapter ?? inferAdapterFromListItem(agent)
const workspace = workspaceLabel(agent)
const lastUsedLabel = formatRelativeTime(lastUsedAt ?? null)
const allowDelete = canDeleteAgent(agent)
const allowRename = canRenameAgent(agent)
const handleChat = () => navigate(`/agents/${agent.agentId}`)
const handleCopyId = async () => {
try {
await navigator.clipboard.writeText(agent.agentId)
toast.success('Agent id copied')
} catch {
toast.error('Could not copy agent id')
}
}
return (
<div
className={cn(
'group rounded-xl border border-border bg-card p-4 shadow-sm transition-all',
'hover:border-[var(--accent-orange)]/50 hover:shadow-sm',
)}
>
<div className="flex items-start gap-4">
{/* Adapter tile + liveness dot in the corner. */}
<div className="relative shrink-0">
<div className="flex h-12 w-12 items-center justify-center rounded-xl bg-muted text-muted-foreground">
<AdapterIcon adapter={adapterId} className="h-6 w-6" />
</div>
<LivenessDot
status={status}
detail={livenessDetail(status, lastUsedAt)}
className="absolute -right-0.5 -bottom-0.5"
/>
</div>
<div className="min-w-0 flex-1">
<div className="mb-1 flex items-center gap-2">
<span className="truncate font-semibold">{displayName(agent)}</span>
{status === 'working' && (
<Badge
variant="secondary"
className="bg-amber-50 text-amber-900 hover:bg-amber-50"
>
Working
</Badge>
)}
{status === 'asleep' && (
<Badge variant="outline" className="text-muted-foreground">
Asleep
</Badge>
)}
{status === 'error' && (
<Badge variant="destructive">Attention</Badge>
)}
</div>
<div className="mb-2 flex flex-wrap items-center gap-1.5 text-xs">
<Badge variant="secondary" className="font-normal">
{adapterLabel(adapterId)}
</Badge>
{agent.modelLabel && agent.modelLabel !== 'default' && (
<Badge variant="outline" className="font-normal">
{agent.modelLabel}
</Badge>
)}
{reasoningEffort && reasoningEffort !== 'medium' && (
<Badge variant="outline" className="font-normal">
{reasoningEffort}
</Badge>
)}
</div>
<div className="flex flex-wrap items-center gap-2 text-muted-foreground text-xs">
<span>Last used {lastUsedLabel}</span>
{workspace && (
<>
<span aria-hidden></span>
<span className="truncate font-mono" title={workspace}>
{workspace}
</span>
</>
)}
</div>
</div>
<div className="flex shrink-0 items-center gap-2">
<Button variant="outline" size="sm" onClick={handleChat}>
<MessageSquare className="mr-1.5 h-3 w-3" />
Chat
</Button>
<DropdownMenu>
<DropdownMenuTrigger asChild>
<Button
variant="ghost"
size="icon"
aria-label={`More actions for ${displayName(agent)}`}
className="h-8 w-8"
>
<MoreHorizontal className="h-4 w-4" />
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent align="end" className="w-44">
<DropdownMenuItem onSelect={() => void handleCopyId()}>
<Copy className="mr-2 h-3.5 w-3.5" />
Copy id
</DropdownMenuItem>
<RenameMenuItem disabled={!allowRename} />
<ResetHistoryMenuItem />
<DropdownMenuSeparator />
<DropdownMenuItem
onSelect={() => onDelete(agent)}
disabled={!allowDelete || deleting}
className="text-destructive focus:text-destructive"
>
{deleting ? (
<Loader2 className="mr-2 h-3.5 w-3.5 animate-spin" />
) : (
<Trash2 className="mr-2 h-3.5 w-3.5" />
)}
Delete
</DropdownMenuItem>
</DropdownMenuContent>
</DropdownMenu>
</div>
</div>
</div>
)
}
const RenameMenuItem: FC<{ disabled: boolean }> = ({ disabled }) => {
const item = (
<DropdownMenuItem disabled className="text-muted-foreground">
<Pencil className="mr-2 h-3.5 w-3.5" />
Rename
</DropdownMenuItem>
)
if (!disabled) return item
// Disabled but with a hint so users know it's coming, not broken.
return (
<TooltipProvider delayDuration={300}>
<Tooltip>
<TooltipTrigger asChild>
<span className="block w-full">{item}</span>
</TooltipTrigger>
<TooltipContent side="left" className="text-xs">
Rename coming soon
</TooltipContent>
</Tooltip>
</TooltipProvider>
)
}
const ResetHistoryMenuItem: FC = () => {
const item = (
<DropdownMenuItem disabled className="text-muted-foreground">
<RotateCcw className="mr-2 h-3.5 w-3.5" />
Reset history
</DropdownMenuItem>
)
return (
<TooltipProvider delayDuration={300}>
<Tooltip>
<TooltipTrigger asChild>
<span className="block w-full">{item}</span>
</TooltipTrigger>
<TooltipContent side="left" className="text-xs">
Reset history coming soon
</TooltipContent>
</Tooltip>
</TooltipProvider>
)
}
function inferAdapterFromListItem(
agent: AgentListItem,
): HarnessAgentAdapter | 'unknown' {
const label = agent.runtimeLabel?.toLowerCase()
if (label?.includes('claude')) return 'claude'
if (label?.includes('codex')) return 'codex'
if (label?.includes('openclaw')) return 'openclaw'
return 'unknown'
}
function livenessDetail(
status: AgentLiveness,
lastUsedAt: number | null | undefined,
): string | undefined {
if (lastUsedAt == null) return undefined
const diffMin = Math.floor((Date.now() - lastUsedAt) / 60_000)
if (status === 'idle') return `Idle for ${Math.max(0, diffMin)} min`
if (status === 'asleep') {
if (diffMin < 60) return `Asleep — quiet for ${diffMin} min`
const hr = Math.floor(diffMin / 60)
return `Asleep — quiet for ${hr} hr`
}
if (status === 'working') return 'Working on a turn'
if (status === 'error') return 'Attention — last turn failed'
return undefined
}

View File

@@ -5,16 +5,14 @@ import {
import { FitAddon } from '@xterm/addon-fit'
import { WebLinksAddon } from '@xterm/addon-web-links'
import { Terminal } from '@xterm/xterm'
import { ArrowLeft, Check, Copy } from 'lucide-react'
import { type FC, useEffect, useRef, useState } from 'react'
import { ArrowLeft } from 'lucide-react'
import { type FC, useEffect, useRef } from 'react'
import '@xterm/xterm/css/xterm.css'
import { Button } from '@/components/ui/button'
import { getAgentServerUrl } from '@/lib/browseros/helpers'
interface AgentTerminalProps {
onBack: () => void
initialCommand?: string
onSessionExit?: () => void
}
type TerminalServerMessage =
@@ -38,22 +36,26 @@ function resolveCssColor(variableName: string): string {
return color
}
function withAlpha(color: string, alpha: number): string {
const channels = color.match(/[\d.]+/g)
if (!channels || channels.length < 3) return color
const [red, green, blue] = channels
return `rgb(${red} ${green} ${blue} / ${alpha})`
}
function createTerminalTheme() {
const isDark = document.documentElement.classList.contains('dark')
const background = resolveCssColor('--background')
const foreground = resolveCssColor('--foreground')
const muted = resolveCssColor('--muted-foreground')
const accent = resolveCssColor('--accent-orange')
return {
background,
foreground,
cursor: foreground,
cursorAccent: background,
// Solid terminal-standard selection colors. Deriving from a CSS var
// with alpha composed against the background produced near-white
// rectangles on light mode, making selection invisible.
selectionBackground: isDark ? '#3a4463' : '#b4d4f4',
selectionInactiveBackground: isDark ? '#2b3348' : '#d9e5f3',
selectionBackground: withAlpha(accent, isDark ? 0.3 : 0.2),
selectionForeground: foreground,
black: isDark ? '#16131a' : '#1f1b22',
red: isDark ? '#ef8c7c' : '#c25544',
@@ -116,38 +118,8 @@ function parseTerminalMessage(data: unknown): TerminalServerMessage | null {
return null
}
export const AgentTerminal: FC<AgentTerminalProps> = ({
onBack,
initialCommand,
onSessionExit,
}) => {
export const AgentTerminal: FC<AgentTerminalProps> = ({ onBack }) => {
const containerRef = useRef<HTMLDivElement>(null)
const terminalRef = useRef<Terminal | null>(null)
// Refs keep the mount-once effect from tearing down the PTY when the
// parent re-renders with new inline callbacks.
const initialCommandRef = useRef(initialCommand)
const onSessionExitRef = useRef(onSessionExit)
initialCommandRef.current = initialCommand
onSessionExitRef.current = onSessionExit
const [copied, setCopied] = useState(false)
// Copy the current xterm selection to the browser clipboard. No-op
// if nothing is selected — users who want the whole buffer can
// Cmd+A first. Uses the browser clipboard, not the container's, so
// it works even when the running TUI has mouse tracking enabled
// (Opt+drag forces a selection regardless, see terminal config).
const handleCopy = async (): Promise<void> => {
const text = terminalRef.current?.getSelection()
if (!text) return
try {
await navigator.clipboard.writeText(text)
setCopied(true)
window.setTimeout(() => setCopied(false), 1500)
} catch {
// clipboard permission denied or unavailable — swallow, user will retry
}
}
useEffect(() => {
if (!containerRef.current) return
@@ -160,34 +132,6 @@ export const AgentTerminal: FC<AgentTerminalProps> = ({
lineHeight: 1.25,
scrollback: 8000,
theme: createTerminalTheme(),
// Opt+click+drag forces a native text selection even when the
// running TUI has mouse-tracking enabled (xterm would otherwise
// forward every click to the app and selection wouldn't work).
macOptionClickForcesSelection: true,
})
terminalRef.current = terminal
// Cmd+A → select all, Cmd+C → copy selection via the browser
// clipboard. Return false so xterm doesn't also forward the keys
// to the running program.
terminal.attachCustomKeyEventHandler((event) => {
if (event.type !== 'keydown') return true
const isMac = navigator.platform.toUpperCase().includes('MAC')
const mod = isMac ? event.metaKey : event.ctrlKey
if (!mod) return true
const key = event.key.toLowerCase()
if (key === 'a') {
terminal.selectAll()
return false
}
if (key === 'c') {
const sel = terminal.getSelection()
if (sel) {
void navigator.clipboard.writeText(sel)
return false
}
}
return true
})
const fitAddon = new FitAddon()
@@ -195,12 +139,6 @@ export const AgentTerminal: FC<AgentTerminalProps> = ({
terminal.loadAddon(new WebLinksAddon())
terminal.open(containerRef.current)
// React 18 StrictMode double-invokes effects in dev. Everything
// async inside this effect is scoped to an AbortController; the
// cleanup aborts it and any pending awaits bail out, so we never
// leak a second live WebSocket or duplicate xterm listeners.
const ac = new AbortController()
const cleanups: Array<() => void> = []
let ws: WebSocket | null = null
let sawExit = false
@@ -221,28 +159,17 @@ export const AgentTerminal: FC<AgentTerminalProps> = ({
sendMessage({ type: 'resize', cols, rows })
}
const connect = async (): Promise<void> => {
const connect = async () => {
const baseUrl = await getAgentServerUrl()
if (ac.signal.aborted) return
const wsUrl = new URL('/terminal/ws', baseUrl)
wsUrl.protocol = wsUrl.protocol === 'https:' ? 'wss:' : 'ws:'
ws = new WebSocket(wsUrl)
// If the effect was cleaned up between the await above and now,
// close the socket we just opened and bail.
if (ac.signal.aborted) {
ws.close()
ws = null
return
}
cleanups.push(() => ws?.close())
ws.onopen = () => {
fitAddon.fit()
terminal.focus()
sendResize()
const cmd = initialCommandRef.current
if (cmd) sendMessage({ type: 'input', data: `${cmd}\n` })
}
ws.onmessage = (event) => {
@@ -258,7 +185,6 @@ export const AgentTerminal: FC<AgentTerminalProps> = ({
terminal.write(
`\r\n\x1b[90m[session ended with exit ${message.exitCode}]\x1b[0m\r\n`,
)
onSessionExitRef.current?.()
}
}
@@ -274,41 +200,49 @@ export const AgentTerminal: FC<AgentTerminalProps> = ({
const inputDisposable = terminal.onData((data) => {
sendMessage({ type: 'input', data })
})
const resizeDisposable = terminal.onResize(({ cols, rows }) => {
sendResize(cols, rows)
})
cleanups.push(() => inputDisposable.dispose())
cleanups.push(() => resizeDisposable.dispose())
return () => {
inputDisposable.dispose()
resizeDisposable.dispose()
}
}
void connect()
let disposeSocketBindings: (() => void) | undefined
void connect().then((disposeBindings) => {
disposeSocketBindings = disposeBindings
})
const resizeObserver = new ResizeObserver(() => {
fitAddon.fit()
sendResize()
})
resizeObserver.observe(containerRef.current)
cleanups.push(() => resizeObserver.disconnect())
const themeObserver = new MutationObserver(() => applyTheme())
const themeObserver = new MutationObserver(() => {
applyTheme()
})
themeObserver.observe(document.documentElement, {
attributes: true,
attributeFilter: ['class'],
})
cleanups.push(() => themeObserver.disconnect())
return () => {
ac.abort()
for (const dispose of cleanups) dispose()
resizeObserver.disconnect()
themeObserver.disconnect()
disposeSocketBindings?.()
ws?.close()
terminal.dispose()
terminalRef.current = null
}
}, [])
return (
<div className="flex h-[calc(100dvh-10rem)] min-h-[32rem] w-full flex-col py-2 sm:min-h-[42rem] sm:py-4">
<div className="flex min-h-0 flex-1 flex-col overflow-hidden rounded-xl border border-border bg-card shadow-sm">
<div className="flex items-center justify-between gap-3 border-border border-b px-4 py-3 sm:px-6">
<div className="flex items-center gap-3 border-border border-b px-4 py-3 sm:px-6">
<div className="flex min-w-0 items-center gap-3">
<Button variant="ghost" size="icon" onClick={onBack}>
<ArrowLeft className="size-4" />
@@ -322,14 +256,6 @@ export const AgentTerminal: FC<AgentTerminalProps> = ({
</div>
</div>
</div>
<Button variant="outline" size="sm" onClick={handleCopy}>
{copied ? (
<Check className="mr-1 size-3.5" />
) : (
<Copy className="mr-1 size-3.5" />
)}
{copied ? 'Copied' : 'Copy'}
</Button>
</div>
<div className="min-h-0 flex-1 p-4 sm:p-6">
@@ -343,7 +269,7 @@ export const AgentTerminal: FC<AgentTerminalProps> = ({
</div>
</div>
<div className="min-h-0 flex-1 cursor-text px-4 py-4 sm:px-5 sm:py-5">
<div className="min-h-0 flex-1 px-4 py-4 sm:px-5 sm:py-5">
<div ref={containerRef} className="h-full w-full" />
</div>
</div>

View File

@@ -1,32 +0,0 @@
import { Bot, Plus } from 'lucide-react'
import type { FC } from 'react'
import { Button } from '@/components/ui/button'
interface AgentsEmptyStateProps {
onCreateAgent: () => void
}
export const AgentsEmptyState: FC<AgentsEmptyStateProps> = ({
onCreateAgent,
}) => {
return (
<div className="rounded-xl border border-border border-dashed bg-card/50 p-12 text-center">
<div className="mx-auto mb-4 flex h-12 w-12 items-center justify-center rounded-xl bg-[var(--accent-orange)]/10">
<Bot className="h-6 w-6 text-[var(--accent-orange)]" />
</div>
<h3 className="mb-1 font-semibold">No agents yet</h3>
<p className="mx-auto mb-4 max-w-sm text-muted-foreground text-sm">
Spin up an OpenClaw, Claude Code, or Codex agent to chat with, schedule,
or run in the background.
</p>
<Button
onClick={onCreateAgent}
variant="outline"
className="border-[var(--accent-orange)] bg-[var(--accent-orange)]/10 text-[var(--accent-orange)] hover:bg-[var(--accent-orange)]/20 hover:text-[var(--accent-orange)]"
>
<Plus className="mr-1.5 h-4 w-4" />
Create your first agent
</Button>
</div>
)
}

View File

@@ -1,41 +0,0 @@
import { Bot, Plus } from 'lucide-react'
import type { FC } from 'react'
import { Button } from '@/components/ui/button'
interface AgentsHeaderProps {
onCreateAgent: () => void
}
/**
* Mirrors the visual shape of `SoulHeader` and `ScheduledTasksHeader`
* so the page reads as part of the same family. Loose lifecycle
* controls that used to sit next to the title moved into
* `GatewayStatusBar` — they're OpenClaw-specific and don't apply to
* Claude/Codex agents.
*/
export const AgentsHeader: FC<AgentsHeaderProps> = ({ onCreateAgent }) => {
return (
<div className="rounded-xl border border-border bg-card p-6 shadow-sm transition-all hover:shadow-md">
<div className="flex items-start gap-4">
<div className="flex h-12 w-12 shrink-0 items-center justify-center rounded-xl bg-[var(--accent-orange)]/10">
<Bot className="h-6 w-6 text-[var(--accent-orange)]" />
</div>
<div className="flex-1">
<h2 className="mb-1 font-semibold text-xl">Agents</h2>
<p className="text-muted-foreground text-sm">
OpenClaw, Claude Code, and Codex agents chat, schedule, and run
them in the background.
</p>
</div>
<Button
onClick={onCreateAgent}
className="border-[var(--accent-orange)] bg-[var(--accent-orange)]/10 text-[var(--accent-orange)] hover:bg-[var(--accent-orange)]/20 hover:text-[var(--accent-orange)]"
variant="outline"
>
<Plus className="mr-1.5 h-4 w-4" />
New Agent
</Button>
</div>
</div>
)
}

View File

@@ -1,206 +0,0 @@
import { Loader2, RotateCcw, Terminal } from 'lucide-react'
import type { FC, ReactNode } from 'react'
import { Badge } from '@/components/ui/badge'
import { Button } from '@/components/ui/button'
import { Separator } from '@/components/ui/separator'
import {
Tooltip,
TooltipContent,
TooltipProvider,
TooltipTrigger,
} from '@/components/ui/tooltip'
import { cn } from '@/lib/utils'
import type { OpenClawStatus } from './useOpenClaw'
interface GatewayStatusBarProps {
status: OpenClawStatus | null
/** Disabled while a gateway lifecycle mutation is mid-flight. */
actionInProgress: boolean
onOpenTerminal: () => void
onRestart: () => void
}
/**
* Compact one-line status bar for the OpenClaw gateway. Renders the
* lifecycle pills (Running / Control plane connected) plus a Terminal
* escape hatch and a Restart Gateway action. Lives between the page
* header and the agent list when at least one OpenClaw agent is in
* the merged list; collapses to nothing for Claude/Codex-only setups.
*
* Status is sourced from `GET /agents`'s `gateway` field — the agents
* page no longer polls `/claw/status` directly. One endpoint, one
* 5s interval, no duplicate state.
*/
export const GatewayStatusBar: FC<GatewayStatusBarProps> = ({
status,
actionInProgress,
onOpenTerminal,
onRestart,
}) => {
if (!status) return null
const runningPill = pillForRuntimeStatus(status.status)
const controlPlanePill = pillForControlPlane(status.controlPlaneStatus)
return (
<div className="rounded-xl border border-border bg-card px-4 py-3 shadow-sm">
<div className="flex items-center gap-3 text-sm">
<span className="font-medium text-muted-foreground">
OpenClaw gateway
</span>
<Badge
variant={runningPill.variant}
className={cn('gap-1.5', runningPill.className)}
>
<span
className={cn(
'inline-block h-1.5 w-1.5 rounded-full',
runningPill.dot,
)}
/>
{runningPill.label}
</Badge>
<Badge
variant={controlPlanePill.variant}
className={cn('gap-1.5', controlPlanePill.className)}
>
<span
className={cn(
'inline-block h-1.5 w-1.5 rounded-full',
controlPlanePill.dot,
)}
/>
{controlPlanePill.label}
</Badge>
<Separator orientation="vertical" className="h-4" />
<WithTooltip label="Open a shell into the OpenClaw gateway container for raw CLI access (config edits, session inspection).">
<Button variant="ghost" size="sm" onClick={onOpenTerminal}>
<Terminal className="mr-1.5 h-3.5 w-3.5" />
Terminal
</Button>
</WithTooltip>
<WithTooltip label="Restart the OpenClaw gateway. Useful when the gateway is stuck or after editing provider config.">
<Button
variant="ghost"
size="sm"
onClick={onRestart}
disabled={actionInProgress}
className="ml-auto"
>
{actionInProgress ? (
<Loader2 className="mr-1.5 h-3.5 w-3.5 animate-spin" />
) : (
<RotateCcw className="mr-1.5 h-3.5 w-3.5" />
)}
Restart Gateway
</Button>
</WithTooltip>
</div>
</div>
)
}
const WithTooltip: FC<{ label: string; children: ReactNode }> = ({
label,
children,
}) => (
<TooltipProvider delayDuration={250}>
<Tooltip>
<TooltipTrigger asChild>{children}</TooltipTrigger>
<TooltipContent side="bottom" className="max-w-xs text-xs">
{label}
</TooltipContent>
</Tooltip>
</TooltipProvider>
)
type PillKind = {
variant: 'default' | 'secondary' | 'outline' | 'destructive'
label: string
dot: string
className?: string
}
function pillForRuntimeStatus(status: OpenClawStatus['status']): PillKind {
switch (status) {
case 'running':
return {
variant: 'secondary',
label: 'Running',
dot: 'bg-emerald-500',
className: 'bg-emerald-50 text-emerald-900 hover:bg-emerald-50',
}
case 'starting':
return {
variant: 'secondary',
label: 'Starting',
dot: 'bg-amber-500 animate-pulse',
className: 'bg-amber-50 text-amber-900 hover:bg-amber-50',
}
case 'stopped':
return {
variant: 'outline',
label: 'Stopped',
dot: 'bg-muted-foreground/40',
}
case 'error':
return {
variant: 'destructive',
label: 'Error',
dot: 'bg-destructive-foreground',
}
default:
return {
variant: 'outline',
label: 'Unknown',
dot: 'bg-muted-foreground/40',
}
}
}
function pillForControlPlane(
status: OpenClawStatus['controlPlaneStatus'],
): PillKind {
switch (status) {
case 'connected':
return {
variant: 'secondary',
label: 'Control plane connected',
dot: 'bg-emerald-500',
className: 'bg-emerald-50 text-emerald-900 hover:bg-emerald-50',
}
case 'connecting':
return {
variant: 'secondary',
label: 'Connecting',
dot: 'bg-amber-500 animate-pulse',
className: 'bg-amber-50 text-amber-900 hover:bg-amber-50',
}
case 'reconnecting':
return {
variant: 'secondary',
label: 'Reconnecting',
dot: 'bg-amber-500 animate-pulse',
className: 'bg-amber-50 text-amber-900 hover:bg-amber-50',
}
case 'recovering':
return {
variant: 'secondary',
label: 'Recovering',
dot: 'bg-amber-500 animate-pulse',
className: 'bg-amber-50 text-amber-900 hover:bg-amber-50',
}
case 'failed':
return {
variant: 'destructive',
label: 'Needs attention',
dot: 'bg-destructive-foreground',
}
default:
return {
variant: 'outline',
label: 'Disconnected',
dot: 'bg-muted-foreground/40',
}
}
}

View File

@@ -1,83 +0,0 @@
import type { FC } from 'react'
import {
Tooltip,
TooltipContent,
TooltipProvider,
TooltipTrigger,
} from '@/components/ui/tooltip'
import { cn } from '@/lib/utils'
export type AgentLiveness = 'working' | 'idle' | 'asleep' | 'error' | 'unknown'
interface LivenessDotProps {
status: AgentLiveness
/**
* Optional human-friendly secondary line, e.g. "Idle for 4 min" or
* "Asleep — no activity for 22 min". When absent the tooltip just
* reads the status label.
*/
detail?: string
className?: string
}
const VARIANT: Record<
AgentLiveness,
{ dot: string; ring: string; label: string }
> = {
working: {
// Animated amber pulse + soft halo so the eye catches an active
// agent in a long list without the dot screaming for attention.
dot: 'bg-amber-500 animate-pulse',
ring: 'ring-2 ring-amber-200',
label: 'Working on a turn',
},
idle: {
dot: 'bg-emerald-500',
ring: 'ring-2 ring-emerald-100',
label: 'Idle',
},
asleep: {
dot: 'bg-muted-foreground/40',
ring: 'ring-2 ring-muted',
label: 'Asleep',
},
error: {
dot: 'bg-destructive',
ring: 'ring-2 ring-destructive/30',
label: 'Attention',
},
unknown: {
dot: 'bg-muted-foreground/30',
ring: 'ring-2 ring-muted',
label: 'Status unknown',
},
}
export const LivenessDot: FC<LivenessDotProps> = ({
status,
detail,
className,
}) => {
const variant = VARIANT[status]
return (
<TooltipProvider delayDuration={150}>
<Tooltip>
<TooltipTrigger asChild>
<span
role="img"
aria-label={detail ?? variant.label}
className={cn(
'inline-block h-3 w-3 rounded-full',
variant.dot,
variant.ring,
className,
)}
/>
</TooltipTrigger>
<TooltipContent side="right" className="text-xs">
{detail ?? variant.label}
</TooltipContent>
</Tooltip>
</TooltipProvider>
)
}

View File

@@ -1,260 +0,0 @@
import { AlertCircle, Loader2 } from 'lucide-react'
import type { FC } from 'react'
import { Alert, AlertDescription, AlertTitle } from '@/components/ui/alert'
import { Button } from '@/components/ui/button'
import {
Dialog,
DialogContent,
DialogFooter,
DialogHeader,
DialogTitle,
} from '@/components/ui/dialog'
import { Input } from '@/components/ui/input'
import { Label } from '@/components/ui/label'
import {
Select,
SelectContent,
SelectItem,
SelectTrigger,
SelectValue,
} from '@/components/ui/select'
import type {
HarnessAdapterDescriptor,
HarnessAgentAdapter,
} from './agent-harness-types'
import type { CreateAgentRuntime, ProviderOption } from './agents-page-types'
import { ProviderSelector } from './OpenClawControls'
import {
type OpenClawCliProvider,
type OpenClawCliProviderAuthStatus,
OpenClawCliProviderStatusPanel,
} from './openclaw-cli-providers'
interface NewAgentDialogProps {
adapters: HarnessAdapterDescriptor[]
canManageOpenClaw: boolean
createError: string | null
createRuntime: CreateAgentRuntime
creating: boolean
defaultProviderId: string
harnessAdapterId: HarnessAgentAdapter
harnessModelId: string
harnessReasoningEffort: string
name: string
open: boolean
providers: ProviderOption[]
selectedCliProvider: OpenClawCliProvider | undefined
selectedProviderId: string
cliAuthError: Error | null
cliAuthLoading: boolean
cliAuthStatus: OpenClawCliProviderAuthStatus | undefined
onConnectCliProvider: () => void
onCreate: () => void
onOpenChange: (open: boolean) => void
onRuntimeChange: (runtime: CreateAgentRuntime) => void
onHarnessAdapterChange: (adapter: HarnessAgentAdapter) => void
onHarnessModelChange: (modelId: string) => void
onHarnessReasoningChange: (reasoningEffort: string) => void
onNameChange: (name: string) => void
onProviderChange: (providerId: string) => void
}
export const NewAgentDialog: FC<NewAgentDialogProps> = ({
adapters,
canManageOpenClaw,
createError,
createRuntime,
creating,
defaultProviderId,
harnessAdapterId,
harnessModelId,
harnessReasoningEffort,
name,
open,
providers,
selectedCliProvider,
selectedProviderId,
cliAuthError,
cliAuthLoading,
cliAuthStatus,
onConnectCliProvider,
onCreate,
onOpenChange,
onRuntimeChange,
onHarnessAdapterChange,
onHarnessModelChange,
onHarnessReasoningChange,
onNameChange,
onProviderChange,
}) => {
const selectedHarnessAdapter =
adapters.find((adapter) => adapter.id === harnessAdapterId) ?? adapters[0]
const isHarnessRuntime = createRuntime !== 'openclaw'
const openClawBlocked = createRuntime === 'openclaw' && !canManageOpenClaw
const cliBlocked =
createRuntime === 'openclaw' &&
!!selectedCliProvider &&
!cliAuthStatus?.loggedIn
const canCreate =
Boolean(name.trim()) &&
!creating &&
!openClawBlocked &&
!cliBlocked &&
(createRuntime === 'openclaw'
? providers.length > 0
: Boolean(selectedHarnessAdapter))
return (
<Dialog open={open} onOpenChange={onOpenChange}>
<DialogContent>
<DialogHeader>
<DialogTitle>New Agent</DialogTitle>
</DialogHeader>
<div className="grid gap-4 py-2">
{createError ? (
<Alert variant="destructive">
<AlertCircle className="size-4" />
<AlertTitle>Create failed</AlertTitle>
<AlertDescription>{createError}</AlertDescription>
</Alert>
) : null}
<div className="grid gap-2">
<Label htmlFor="agent-name">Name</Label>
<Input
id="agent-name"
value={name}
onChange={(event) => onNameChange(event.target.value)}
placeholder={
createRuntime === 'openclaw' ? 'research-agent' : 'Review bot'
}
onKeyDown={(event) => {
if (event.key === 'Enter' && canCreate) onCreate()
}}
/>
</div>
<div className="grid gap-2">
<Label htmlFor="agent-runtime">Adapter</Label>
<Select
value={createRuntime}
onValueChange={(value) => {
if (
value === 'openclaw' ||
value === 'claude' ||
value === 'codex'
) {
onRuntimeChange(value)
if (value !== 'openclaw') onHarnessAdapterChange(value)
}
}}
>
<SelectTrigger id="agent-runtime">
<SelectValue />
</SelectTrigger>
<SelectContent>
{adapters.map((adapter) => (
<SelectItem key={adapter.id} value={adapter.id}>
{adapter.name}
</SelectItem>
))}
</SelectContent>
</Select>
</div>
{createRuntime === 'openclaw' ? (
<>
{openClawBlocked ? (
<Alert>
<AlertCircle className="size-4" />
<AlertTitle>OpenClaw is not ready</AlertTitle>
<AlertDescription>
Start or set up the OpenClaw gateway before creating an
OpenClaw agent.
</AlertDescription>
</Alert>
) : null}
<ProviderSelector
providers={providers}
defaultProviderId={defaultProviderId}
selectedId={selectedProviderId}
onSelect={onProviderChange}
hideApiKeyHint={!!selectedCliProvider}
/>
{selectedCliProvider ? (
<OpenClawCliProviderStatusPanel
provider={selectedCliProvider}
status={cliAuthStatus}
loading={cliAuthLoading}
fetchError={cliAuthError}
onConnect={onConnectCliProvider}
/>
) : null}
</>
) : null}
{isHarnessRuntime ? (
<>
<div className="grid gap-2">
<Label htmlFor="harness-model">Model</Label>
<Select
value={harnessModelId}
onValueChange={onHarnessModelChange}
>
<SelectTrigger id="harness-model">
<SelectValue />
</SelectTrigger>
<SelectContent>
{(selectedHarnessAdapter?.models ?? []).map((model) => (
<SelectItem key={model.id} value={model.id}>
{model.label}
</SelectItem>
))}
</SelectContent>
</Select>
</div>
<div className="grid gap-2">
<Label htmlFor="harness-effort">Reasoning</Label>
<Select
value={harnessReasoningEffort}
onValueChange={onHarnessReasoningChange}
>
<SelectTrigger id="harness-effort">
<SelectValue />
</SelectTrigger>
<SelectContent>
{(selectedHarnessAdapter?.reasoningEfforts ?? []).map(
(effort) => (
<SelectItem key={effort.id} value={effort.id}>
{effort.label}
</SelectItem>
),
)}
</SelectContent>
</Select>
</div>
</>
) : null}
</div>
<DialogFooter>
<Button
variant="outline"
onClick={() => onOpenChange(false)}
disabled={creating}
>
Cancel
</Button>
<Button disabled={!canCreate} onClick={onCreate}>
{creating ? <Loader2 className="mr-2 size-4 animate-spin" /> : null}
Create
</Button>
</DialogFooter>
</DialogContent>
</Dialog>
)
}

View File

@@ -1,387 +0,0 @@
import {
AlertCircle,
Cpu,
Loader2,
Plus,
RefreshCw,
ShieldAlert,
Square,
TerminalSquare,
WifiOff,
Wrench,
} from 'lucide-react'
import type { FC } from 'react'
import { Alert, AlertDescription, AlertTitle } from '@/components/ui/alert'
import { Badge } from '@/components/ui/badge'
import { Button } from '@/components/ui/button'
import { Card, CardContent } from '@/components/ui/card'
import { Label } from '@/components/ui/label'
import {
Select,
SelectContent,
SelectItem,
SelectTrigger,
SelectValue,
} from '@/components/ui/select'
import type { ProviderOption } from './agents-page-types'
import {
CONTROL_PLANE_COPY,
FALLBACK_CONTROL_PLANE_COPY,
} from './agents-page-types'
import type { getControlPlaneCopy } from './agents-page-utils'
import type { OpenClawStatus } from './useOpenClaw'
const StatusBadge: FC<{ status: OpenClawStatus['status'] }> = ({ status }) => {
const variants: Record<
OpenClawStatus['status'],
{
variant: 'default' | 'secondary' | 'outline' | 'destructive'
label: string
}
> = {
running: { variant: 'default', label: 'Running' },
starting: { variant: 'secondary', label: 'Starting...' },
stopped: { variant: 'outline', label: 'Stopped' },
error: { variant: 'destructive', label: 'Error' },
uninitialized: { variant: 'outline', label: 'Not Set Up' },
}
const current = variants[status] ?? {
variant: 'outline' as const,
label: 'Unknown',
}
return <Badge variant={current.variant}>{current.label}</Badge>
}
const ControlPlaneBadge: FC<{
status: OpenClawStatus['controlPlaneStatus']
}> = ({ status }) => {
const current = CONTROL_PLANE_COPY[status] ?? FALLBACK_CONTROL_PLANE_COPY
return <Badge variant={current.badgeVariant}>{current.badgeLabel}</Badge>
}
interface ProviderSelectorProps {
providers: ProviderOption[]
defaultProviderId: string
selectedId: string
onSelect: (id: string) => void
hideApiKeyHint?: boolean
}
export const ProviderSelector: FC<ProviderSelectorProps> = ({
providers,
defaultProviderId,
selectedId,
onSelect,
hideApiKeyHint,
}) => {
if (providers.length === 0) {
return (
<div className="space-y-2">
<p className="font-medium text-sm">LLM Provider</p>
<p className="text-muted-foreground text-sm">
No compatible LLM providers configured.{' '}
<a href="#/settings/ai" className="underline">
Add one in AI settings
</a>{' '}
first.
</p>
</div>
)
}
return (
<div className="space-y-2">
<Label htmlFor="provider-select">LLM Provider</Label>
<Select value={selectedId} onValueChange={onSelect}>
<SelectTrigger id="provider-select">
<SelectValue placeholder="Select a provider" />
</SelectTrigger>
<SelectContent>
{providers.map((provider) => (
<SelectItem key={provider.id} value={provider.id}>
{provider.name} - {provider.modelId}
{provider.id === defaultProviderId ? ' (default)' : ''}
</SelectItem>
))}
</SelectContent>
</Select>
{!hideApiKeyHint && (
<p className="text-muted-foreground text-xs">
Uses your existing API key from BrowserOS settings. The key is passed
to the container and never leaves your machine.
</p>
)}
</div>
)
}
interface AgentsPageHeaderProps {
actionInProgress: boolean
controlPlaneBusy: boolean
reconnecting: boolean
status: OpenClawStatus | null
onCreateAgent: () => void
onOpenTerminal: () => void
onReconnect: () => void
onRefresh: () => void
onRestart: () => void
onStop: () => void
}
export const AgentsPageHeader: FC<AgentsPageHeaderProps> = ({
actionInProgress,
controlPlaneBusy,
reconnecting,
status,
onCreateAgent,
onOpenTerminal,
onReconnect,
onRefresh,
onRestart,
onStop,
}) => (
<div className="flex flex-wrap items-center justify-between gap-3">
<div>
<h1 className="font-semibold text-2xl tracking-normal">Agents</h1>
<p className="text-muted-foreground text-sm">
OpenClaw, Claude Code, and Codex agents
</p>
</div>
<div className="flex flex-wrap items-center gap-2">
{status ? (
<>
<StatusBadge status={status.status} />
{status.status !== 'uninitialized' && (
<ControlPlaneBadge status={status.controlPlaneStatus} />
)}
</>
) : null}
{status?.status === 'running' &&
status.controlPlaneStatus !== 'connected' ? (
<Button
variant="outline"
onClick={onReconnect}
disabled={actionInProgress || controlPlaneBusy}
>
{reconnecting ? (
<Loader2 className="mr-2 size-4 animate-spin" />
) : (
<RefreshCw className="mr-2 size-4" />
)}
Retry Connection
</Button>
) : null}
{status?.status === 'running' ? (
<>
<Button
variant="ghost"
size="icon"
onClick={onRestart}
disabled={actionInProgress}
title="Restart gateway"
>
<RefreshCw className="size-4" />
</Button>
<Button
variant="ghost"
size="icon"
onClick={onStop}
disabled={actionInProgress}
title="Stop gateway"
>
<Square className="size-4" />
</Button>
<Button variant="outline" onClick={onOpenTerminal}>
<TerminalSquare className="mr-2 size-4" />
Terminal
</Button>
</>
) : null}
<Button variant="ghost" size="icon" onClick={onRefresh} title="Refresh">
<RefreshCw className="size-4" />
</Button>
<Button onClick={onCreateAgent}>
<Plus className="mr-2 size-4" />
New Agent
</Button>
</div>
</div>
)
export function LifecycleAlert({ message }: { message: string }) {
return (
<Alert>
<Loader2 className="size-4 animate-spin" />
<AlertTitle>{message}</AlertTitle>
</Alert>
)
}
export function InlineErrorAlert({
message,
onDismiss,
}: {
message: string
onDismiss: () => void
}) {
return (
<Alert variant="destructive">
<AlertCircle className="size-4" />
<AlertTitle>Agent action failed</AlertTitle>
<AlertDescription>
<p>{message}</p>
<div className="mt-2">
<Button variant="outline" size="sm" onClick={onDismiss}>
Dismiss
</Button>
</div>
</AlertDescription>
</Alert>
)
}
interface ControlPlaneAlertProps {
actionInProgress: boolean
controlPlaneBusy: boolean
controlPlaneCopy: ReturnType<typeof getControlPlaneCopy>
reconnecting: boolean
recoveryDetail: string | null
status: OpenClawStatus
onReconnect: () => void
onRestart: () => void
}
export const ControlPlaneAlert: FC<ControlPlaneAlertProps> = ({
actionInProgress,
controlPlaneBusy,
controlPlaneCopy,
reconnecting,
recoveryDetail,
status,
onReconnect,
onRestart,
}) => (
<Alert
variant={status.controlPlaneStatus === 'failed' ? 'destructive' : 'default'}
>
{status.controlPlaneStatus === 'failed' ? (
<ShieldAlert className="size-4" />
) : status.controlPlaneStatus === 'recovering' ? (
<Wrench className="size-4" />
) : (
<WifiOff className="size-4" />
)}
<AlertTitle>{controlPlaneCopy.title}</AlertTitle>
<AlertDescription>
<p>{controlPlaneCopy.description}</p>
{recoveryDetail ? <p>{recoveryDetail}</p> : null}
<div className="mt-2 flex flex-wrap gap-2">
<Button
variant="outline"
size="sm"
onClick={onReconnect}
disabled={actionInProgress || controlPlaneBusy}
>
{reconnecting ? (
<Loader2 className="mr-2 size-4 animate-spin" />
) : (
<RefreshCw className="mr-2 size-4" />
)}
Retry Connection
</Button>
<Button
variant="outline"
size="sm"
onClick={onRestart}
disabled={actionInProgress}
>
Restart Gateway
</Button>
</div>
</AlertDescription>
</Alert>
)
interface GatewayStateCardsProps {
actionInProgress: boolean
status: OpenClawStatus | null
onOpenSetup: () => void
onRestart: () => void
onStart: () => void
}
export const GatewayStateCards: FC<GatewayStateCardsProps> = ({
actionInProgress,
status,
onOpenSetup,
onRestart,
onStart,
}) => (
<>
{status?.status === 'uninitialized' ? (
<Card>
<CardContent className="flex flex-col items-center gap-4 py-12">
<Cpu className="size-12 text-muted-foreground" />
<div className="text-center">
<h3 className="font-semibold text-lg">Set Up OpenClaw</h3>
<p className="text-muted-foreground text-sm">
{status.podmanAvailable
? 'Create a local BrowserOS VM to run autonomous agents with full tool access.'
: 'BrowserOS VM runtime is unavailable on this system.'}
</p>
</div>
{status.podmanAvailable ? (
<Button onClick={onOpenSetup}>Set Up Now</Button>
) : null}
</CardContent>
</Card>
) : null}
{status?.status === 'stopped' ? (
<Card>
<CardContent className="flex flex-col items-center gap-4 py-12">
<Cpu className="size-12 text-muted-foreground" />
<div className="text-center">
<h3 className="font-semibold text-lg">Gateway Stopped</h3>
<p className="text-muted-foreground text-sm">
The OpenClaw gateway is not running.
</p>
</div>
<Button onClick={onStart} disabled={actionInProgress}>
Start Gateway
</Button>
</CardContent>
</Card>
) : null}
{status?.status === 'error' ? (
<Card className="border-destructive">
<CardContent className="flex flex-col items-center gap-4 py-12">
<AlertCircle className="size-12 text-destructive" />
<div className="text-center">
<h3 className="font-semibold text-lg">Gateway Error</h3>
<p className="text-muted-foreground text-sm">
{status.error ?? status.lastGatewayError}
</p>
</div>
<div className="flex gap-2">
<Button onClick={onStart} disabled={actionInProgress}>
Start Gateway
</Button>
<Button
variant="outline"
onClick={onRestart}
disabled={actionInProgress}
>
Restart Gateway
</Button>
</div>
</CardContent>
</Card>
) : null}
</>
)

View File

@@ -1,76 +0,0 @@
import { Loader2 } from 'lucide-react'
import type { FC } from 'react'
import { Button } from '@/components/ui/button'
import {
Dialog,
DialogContent,
DialogHeader,
DialogTitle,
} from '@/components/ui/dialog'
import type { ProviderOption } from './agents-page-types'
import { ProviderSelector } from './OpenClawControls'
import type { OpenClawCliProvider } from './openclaw-cli-providers'
interface SetupOpenClawDialogProps {
defaultProviderId: string
open: boolean
providers: ProviderOption[]
selectedProviderId: string
selectedCliProvider: OpenClawCliProvider | undefined
settingUp: boolean
onOpenChange: (open: boolean) => void
onProviderChange: (providerId: string) => void
onSetup: () => void
}
export const SetupOpenClawDialog: FC<SetupOpenClawDialogProps> = ({
defaultProviderId,
open,
providers,
selectedProviderId,
selectedCliProvider,
settingUp,
onOpenChange,
onProviderChange,
onSetup,
}) => (
<Dialog open={open} onOpenChange={onOpenChange}>
<DialogContent>
<DialogHeader>
<DialogTitle>Set Up OpenClaw</DialogTitle>
</DialogHeader>
<div className="space-y-4 py-2">
<ProviderSelector
providers={providers}
defaultProviderId={defaultProviderId}
selectedId={selectedProviderId}
onSelect={onProviderChange}
hideApiKeyHint={!!selectedCliProvider}
/>
{selectedCliProvider ? (
<p className="rounded-md border border-border bg-muted/30 px-3 py-2 text-muted-foreground text-xs">
{selectedCliProvider.description}. Clicking{' '}
<span className="font-medium">Set Up &amp; Start</span> starts the
gateway and opens a terminal to sign in.
</p>
) : null}
<Button
onClick={onSetup}
disabled={settingUp || providers.length === 0}
className="w-full"
>
{settingUp ? (
<>
<Loader2 className="mr-2 size-4 animate-spin" />
Setting up...
</>
) : (
'Set Up & Start'
)}
</Button>
</div>
</DialogContent>
</Dialog>
)

View File

@@ -1,4 +0,0 @@
export function buildAgentApiUrl(baseUrl: string, path: string): string {
const normalizedPath = path === '/' ? '' : path
return `${baseUrl}/agents${normalizedPath}`
}

View File

@@ -1,84 +0,0 @@
import type { AgentListItem } from './agents-page-types'
/**
* Display rules for the redesigned agent rows. Pure helpers — no React,
* no API calls — so they're trivial to unit-test and the row card stays
* focused on layout.
*/
const UUID_PATTERN =
/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i
const OC_UUID_PATTERN =
/^oc-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i
/**
* The agent rail used to render whatever the gateway returned for `name`.
* Post-migration that's frequently the agent's UUID — readable to nobody.
* Prefer the explicit `name` when it differs meaningfully from the id;
* otherwise fall back to a short prefix users can recognize on second
* glance.
*/
export function displayName(agent: AgentListItem): string {
const name = agent.name?.trim()
const id = agent.agentId
if (!name || name === id) {
if (OC_UUID_PATTERN.test(id)) return id.slice(0, 11) // "oc-XXXXXXXX"
if (UUID_PATTERN.test(id)) return id.slice(0, 8)
return id
}
return name
}
export function canDelete(agent: AgentListItem): boolean {
// The gateway's protected `main` agent must not be deletable. The
// server enforces this too, but disabling the menu item avoids users
// hitting an opaque 400.
if (agent.agentId === 'main') return false
return agent.canDelete
}
/**
* Rename will be wired to a future `PATCH /agents/:id` endpoint. The
* legacy `/claw/agents` create flow named the agent on the gateway via
* the `name` field but the field isn't editable post-create today.
*/
export function canRename(_agent: AgentListItem): boolean {
return false
}
/**
* The detail line carries the agent's workspace path. The `detail`
* field on AgentListItem already holds it for OpenClaw entries
* (`/home/node/.openclaw/workspace-...`); for harness agents it's the
* synthetic `<adapter>:main` marker that's not informative — hide it.
*/
export function workspaceLabel(agent: AgentListItem): string | null {
if (!agent.detail) return null
if (/^(claude|codex|openclaw):main$/.test(agent.detail)) return null
return agent.detail
}
const ONE_MINUTE = 60_000
const ONE_HOUR = 60 * ONE_MINUTE
const ONE_DAY = 24 * ONE_HOUR
/**
* Lightweight relative-time formatter. We don't want to drag in
* `dayjs/relativeTime` just for a few labels.
*/
export function formatRelativeTime(epochMs: number | null): string {
if (epochMs === null || !Number.isFinite(epochMs)) return 'never'
const diff = Math.max(0, Date.now() - epochMs)
if (diff < ONE_MINUTE) return 'just now'
if (diff < ONE_HOUR) {
const m = Math.floor(diff / ONE_MINUTE)
return `${m} min ago`
}
if (diff < ONE_DAY) {
const h = Math.floor(diff / ONE_HOUR)
return h === 1 ? '1 hr ago' : `${h} hr ago`
}
const d = Math.floor(diff / ONE_DAY)
return d === 1 ? '1 day ago' : `${d} days ago`
}

View File

@@ -1,118 +0,0 @@
import type { AgentEntry } from './useOpenClaw'
export type HarnessAgentAdapter = 'claude' | 'codex' | 'openclaw'
export type AgentHarnessStreamEvent =
| {
type: 'text_delta'
text: string
stream: 'output' | 'thought'
rawType?: string
}
| {
type: 'tool_call'
text: string
title: string
id?: string
status?: string
rawType?: string
}
| {
type: 'status'
text: string
rawType?: string
}
| {
type: 'done'
text?: string
stopReason?: string
}
| {
type: 'error'
message: string
code?: string
}
export type HarnessAgentLiveness = 'working' | 'idle' | 'asleep' | 'error'
export interface HarnessAgent {
id: string
name: string
adapter: HarnessAgentAdapter
modelId?: string
reasoningEffort?: string
permissionMode: 'approve-all'
sessionKey: string
createdAt: number
updatedAt: number
/**
* Server-derived liveness state. When the listing endpoint hasn't
* been enriched yet (older deployments) this is undefined and the UI
* falls back to `unknown`.
*/
status?: HarnessAgentLiveness
/**
* Wall-clock ms of the last persisted turn. `null` for never-used
* agents. Drives the recency sort and the "Last used X min ago" copy.
*/
lastUsedAt?: number | null
}
export interface HarnessAdapterDescriptor {
id: HarnessAgentAdapter
name: string
defaultModelId: string
defaultReasoningEffort: string
modelControl: 'runtime-supported' | 'best-effort'
models: Array<{ id: string; label: string; recommended?: boolean }>
reasoningEfforts: Array<{ id: string; label: string; recommended?: boolean }>
}
export interface CreateHarnessAgentInput {
name: string
adapter: HarnessAgentAdapter
modelId?: string
reasoningEffort?: string
}
export interface HarnessHistoryReasoning {
text: string
durationMs?: number
}
export interface HarnessHistoryToolCall {
toolCallId?: string
toolName: string
status: 'pending' | 'running' | 'completed' | 'failed'
input?: unknown
output?: unknown
error?: string
durationMs?: number
}
export interface HarnessHistoryEntry {
id: string
agentId: string
sessionId: 'main'
role: 'user' | 'assistant'
text: string
createdAt: number
reasoning?: HarnessHistoryReasoning
toolCalls?: HarnessHistoryToolCall[]
}
export interface HarnessAgentHistoryPage {
agentId: string
sessionId: 'main'
items: HarnessHistoryEntry[]
}
export function mapHarnessAgentToEntry(agent: HarnessAgent): AgentEntry {
return {
agentId: agent.id,
name: agent.name,
workspace: `${agent.adapter}:main`,
model: agent.modelId,
source: 'agent-harness',
}
}

View File

@@ -1,172 +0,0 @@
import type { NavigateFunction } from 'react-router'
import {
AGENT_CREATED_EVENT,
AGENT_DELETED_EVENT,
} from '@/lib/constants/analyticsEvents'
import { track } from '@/lib/metrics/track'
import type { HarnessAgent, HarnessAgentAdapter } from './agent-harness-types'
import type {
AgentListItem,
CreateAgentRuntime,
ProviderOption,
} from './agents-page-types'
import { findOpenClawCliProviderById } from './openclaw-cli-providers'
import type {
AgentEntry,
OpenClawAgentMutationInput,
OpenClawSetupInput,
} from './useOpenClaw'
export interface AgentPageActionInput {
createProviderId: string
createRuntime: CreateAgentRuntime
harnessModelId: string
harnessReasoningEffort: string
navigate: NavigateFunction
newName: string
selectableOpenClawProviders: ProviderOption[]
setupProviderId: string
createHarnessAgent: (input: {
name: string
adapter: HarnessAgentAdapter
modelId?: string
reasoningEffort?: string
}) => Promise<HarnessAgent>
createOpenClawAgent: (
input: OpenClawAgentMutationInput,
) => Promise<{ agent: AgentEntry }>
deleteHarnessAgent: (agentId: string) => Promise<unknown>
deleteOpenClawAgent: (agentId: string) => Promise<unknown>
setCliAuthModalOpen: (open: boolean) => void
setCreateError: (error: string | null) => void
setCreateOpen: (open: boolean) => void
setDeletingAgentKey: (key: string | null) => void
setNewName: (name: string) => void
setPageError: (error: string | null) => void
setSetupOpen: (open: boolean) => void
setupOpenClaw: (input: OpenClawSetupInput) => Promise<unknown>
}
export function createAgentPageActions(input: AgentPageActionInput) {
const runWithPageErrorHandling = async (fn: () => Promise<unknown>) => {
input.setPageError(null)
try {
await fn()
} catch (err) {
input.setPageError(err instanceof Error ? err.message : String(err))
}
}
const handleSetup = async () => {
const option = input.selectableOpenClawProviders.find(
(item) => item.id === input.setupProviderId,
)
const isCli = !!option && !!findOpenClawCliProviderById(option.type)
const llmOption = !isCli && option ? option : undefined
await runWithPageErrorHandling(async () => {
await input.setupOpenClaw({
providerType: option?.type,
providerName: isCli ? undefined : option?.name,
baseUrl: llmOption?.baseUrl,
apiKey: llmOption?.apiKey,
modelId: option?.modelId,
})
input.setSetupOpen(false)
if (isCli) input.setCliAuthModalOpen(true)
})
}
const handleOpenClawCreate = async () => {
if (!input.newName.trim()) return
const option = input.selectableOpenClawProviders.find(
(item) => item.id === input.createProviderId,
)
const normalizedName = input.newName
.trim()
.toLowerCase()
.replace(/\s+/g, '-')
const isCli = !!option && !!findOpenClawCliProviderById(option.type)
const llmOption = !isCli && option ? option : undefined
input.setCreateError(null)
try {
const result = await input.createOpenClawAgent({
name: normalizedName,
providerType: option?.type,
providerName: isCli ? undefined : option?.name,
baseUrl: llmOption?.baseUrl,
apiKey: llmOption?.apiKey,
modelId: option?.modelId,
})
input.setCreateOpen(false)
input.setNewName('')
track(AGENT_CREATED_EVENT, {
runtime: 'openclaw',
provider_type: option?.type,
})
input.navigate(`/agents/${result.agent.agentId}`)
} catch (err) {
input.setCreateError(err instanceof Error ? err.message : String(err))
}
}
const handleHarnessCreate = async () => {
if (!input.newName.trim()) return
input.setCreateError(null)
try {
const agent = await input.createHarnessAgent({
name: input.newName.trim(),
adapter: input.createRuntime as HarnessAgentAdapter,
modelId: input.harnessModelId || undefined,
reasoningEffort: input.harnessReasoningEffort || undefined,
})
input.setCreateOpen(false)
input.setNewName('')
track(AGENT_CREATED_EVENT, {
runtime: input.createRuntime,
model_id: input.harnessModelId || undefined,
reasoning_effort: input.harnessReasoningEffort || undefined,
})
input.navigate(`/agents/${agent.id}`)
} catch (err) {
input.setCreateError(err instanceof Error ? err.message : String(err))
}
}
const handleCreate = () => {
const createByRuntime: Record<CreateAgentRuntime, () => Promise<void>> = {
openclaw: handleOpenClawCreate,
claude: handleHarnessCreate,
codex: handleHarnessCreate,
}
void createByRuntime[input.createRuntime]()
}
const handleDelete = async (agent: AgentListItem) => {
input.setDeletingAgentKey(agent.key)
await runWithPageErrorHandling(async () => {
const deleteBySource: Record<
AgentListItem['source'],
(agentId: string) => Promise<unknown>
> = {
openclaw: (agentId) => input.deleteOpenClawAgent(agentId),
'agent-harness': (agentId) => input.deleteHarnessAgent(agentId),
}
await deleteBySource[agent.source](agent.agentId)
track(AGENT_DELETED_EVENT, {
runtime: agent.source,
agent_id: agent.agentId,
})
})
input.setDeletingAgentKey(null)
}
return {
handleCreate,
handleDelete,
handleSetup,
runWithPageErrorHandling,
}
}

View File

@@ -1,173 +0,0 @@
import { type Dispatch, type SetStateAction, useEffect, useMemo } from 'react'
import type { LlmProviderConfig } from '@/lib/llm-providers/types'
import type {
HarnessAdapterDescriptor,
HarnessAgentAdapter,
} from './agent-harness-types'
import type { CreateAgentRuntime } from './agents-page-types'
import { toProviderOptions } from './agents-page-utils'
import {
buildOpenClawCliProviderOptions,
findOpenClawCliProviderById,
useOpenClawCliProviderAuthStatus,
} from './openclaw-cli-providers'
export function useDefaultAgentName(
createOpen: boolean,
setNewName: Dispatch<SetStateAction<string>>,
): void {
useEffect(() => {
if (!createOpen) return
setNewName((current) => current || 'agent')
}, [createOpen, setNewName])
}
export function useHarnessAgentDefaults(input: {
adapters: HarnessAdapterDescriptor[]
createOpen: boolean
harnessAdapterId: HarnessAgentAdapter
setHarnessAdapterId: Dispatch<SetStateAction<HarnessAgentAdapter>>
setHarnessModelId: Dispatch<SetStateAction<string>>
setHarnessReasoningEffort: Dispatch<SetStateAction<string>>
}): void {
const {
adapters,
createOpen,
harnessAdapterId,
setHarnessAdapterId,
setHarnessModelId,
setHarnessReasoningEffort,
} = input
useEffect(() => {
if (!createOpen) return
const adapter =
adapters.find((entry) => entry.id === harnessAdapterId) ?? adapters[0]
if (!adapter) return
setHarnessAdapterId(adapter.id)
setHarnessModelId((current) => current || adapter.defaultModelId)
setHarnessReasoningEffort(
(current) => current || adapter.defaultReasoningEffort,
)
}, [
adapters,
createOpen,
harnessAdapterId,
setHarnessAdapterId,
setHarnessModelId,
setHarnessReasoningEffort,
])
}
export function useOpenClawProviderSelection(input: {
providers: LlmProviderConfig[]
defaultProviderId: string
createOpen: boolean
createRuntime: CreateAgentRuntime
createProviderId: string
setCreateProviderId: Dispatch<SetStateAction<string>>
setupOpen: boolean
setupProviderId: string
setSetupProviderId: Dispatch<SetStateAction<string>>
cliAuthModalOpen: boolean
setCliAuthModalOpen: Dispatch<SetStateAction<boolean>>
}) {
const {
providers,
defaultProviderId,
createOpen,
createRuntime,
createProviderId,
setCreateProviderId,
setupOpen,
setupProviderId,
setSetupProviderId,
cliAuthModalOpen,
setCliAuthModalOpen,
} = input
const cliProviderOptions = useMemo(
() => buildOpenClawCliProviderOptions(),
[],
)
const selectableOpenClawProviders = useMemo(
() => toProviderOptions(providers, cliProviderOptions),
[providers, cliProviderOptions],
)
useEffect(() => {
if (selectableOpenClawProviders.length === 0) return
const fallbackId =
selectableOpenClawProviders.find(
(provider) => provider.id === defaultProviderId,
)?.id ?? selectableOpenClawProviders[0].id
if (createOpen && !createProviderId) {
setCreateProviderId(fallbackId)
}
}, [
createOpen,
createProviderId,
defaultProviderId,
selectableOpenClawProviders,
setCreateProviderId,
])
useEffect(() => {
if (selectableOpenClawProviders.length === 0) return
const fallbackId =
selectableOpenClawProviders.find(
(provider) => provider.id === defaultProviderId,
)?.id ?? selectableOpenClawProviders[0].id
if (setupOpen && !setupProviderId) {
setSetupProviderId(fallbackId)
}
}, [
defaultProviderId,
selectableOpenClawProviders,
setSetupProviderId,
setupOpen,
setupProviderId,
])
const selectedCreateOption = selectableOpenClawProviders.find(
(provider) => provider.id === createProviderId,
)
const selectedCliProvider = selectedCreateOption
? findOpenClawCliProviderById(selectedCreateOption.type)
: undefined
const selectedSetupOption = selectableOpenClawProviders.find(
(provider) => provider.id === setupProviderId,
)
const selectedSetupCliProvider = selectedSetupOption
? findOpenClawCliProviderById(selectedSetupOption.type)
: undefined
const activeCliProvider =
(setupOpen && selectedSetupCliProvider) ||
(createOpen && createRuntime === 'openclaw' && selectedCliProvider) ||
undefined
const {
data: cliAuthStatus,
isLoading: cliAuthLoading,
error: cliAuthError,
} = useOpenClawCliProviderAuthStatus(
activeCliProvider?.id ?? '',
!!activeCliProvider,
)
useEffect(() => {
if (cliAuthModalOpen && cliAuthStatus?.loggedIn) {
setCliAuthModalOpen(false)
}
}, [cliAuthModalOpen, cliAuthStatus?.loggedIn, setCliAuthModalOpen])
return {
selectableOpenClawProviders,
selectedCliProvider,
selectedSetupCliProvider,
authTerminalProvider: selectedSetupCliProvider ?? selectedCliProvider,
cliAuthStatus,
cliAuthLoading,
cliAuthError,
}
}

View File

@@ -1,119 +0,0 @@
import type { HarnessAgentAdapter } from './agent-harness-types'
import type { GatewayLifecycleAction, OpenClawStatus } from './useOpenClaw'
export type CreateAgentRuntime = 'openclaw' | HarnessAgentAdapter
export interface ProviderOption {
id: string
type: string
name: string
modelId: string
baseUrl?: string
apiKey?: string
}
export interface AgentListItem {
key: string
agentId: string
name: string
source: 'openclaw' | 'agent-harness'
runtimeLabel: string
modelLabel: string
detail: string
canChat: boolean
canDelete: boolean
}
export interface GatewayUiState {
canManageAgents: boolean
controlPlaneDegraded: boolean
controlPlaneBusy: boolean
}
export const DEFAULT_HARNESS_ADAPTER: HarnessAgentAdapter = 'claude'
export const DEFAULT_CREATE_RUNTIME: CreateAgentRuntime = 'openclaw'
export const LIFECYCLE_BANNER_COPY: Record<GatewayLifecycleAction, string> = {
setup: 'Setting up OpenClaw...',
start: 'Starting gateway...',
stop: 'Stopping gateway...',
restart: 'Restarting gateway...',
reconnect: 'Restoring gateway connection...',
}
export const CONTROL_PLANE_COPY: Record<
OpenClawStatus['controlPlaneStatus'],
{
badgeVariant: 'default' | 'secondary' | 'outline' | 'destructive'
badgeLabel: string
title: string
description: string
}
> = {
connected: {
badgeVariant: 'default',
badgeLabel: 'Control Plane Ready',
title: 'Gateway Connected',
description: 'OpenClaw can create, manage, and chat with agents normally.',
},
connecting: {
badgeVariant: 'secondary',
badgeLabel: 'Connecting',
title: 'Connecting to Gateway',
description:
'BrowserOS is establishing the OpenClaw control channel for agent operations.',
},
reconnecting: {
badgeVariant: 'secondary',
badgeLabel: 'Reconnecting',
title: 'Reconnecting Control Plane',
description:
'The gateway process is up, but BrowserOS is restoring the control channel.',
},
recovering: {
badgeVariant: 'secondary',
badgeLabel: 'Recovering',
title: 'Recovering Gateway Connection',
description:
'BrowserOS detected a control-plane fault and is trying a safe recovery path.',
},
disconnected: {
badgeVariant: 'outline',
badgeLabel: 'Disconnected',
title: 'Gateway Disconnected',
description: 'The gateway process is not available to BrowserOS right now.',
},
failed: {
badgeVariant: 'destructive',
badgeLabel: 'Needs Attention',
title: 'Gateway Recovery Failed',
description:
'BrowserOS could not restore the OpenClaw control channel automatically.',
},
}
export const FALLBACK_CONTROL_PLANE_COPY = {
badgeVariant: 'outline' as const,
badgeLabel: 'Unknown',
title: 'Gateway State Unknown',
description:
'BrowserOS received a gateway status it does not recognize yet. Refreshing or reconnecting should restore a known state.',
}
export const RECOVERY_REASON_COPY: Record<
NonNullable<OpenClawStatus['lastRecoveryReason']>,
string
> = {
transient_disconnect:
'The control channel dropped briefly and BrowserOS is retrying it.',
signature_expired:
'The gateway rejected the signed device handshake because its clock drifted.',
pairing_required:
'The gateway asked BrowserOS to approve its local device identity again.',
token_mismatch:
'BrowserOS had to reload the gateway token before reconnecting.',
container_not_ready:
'The OpenClaw gateway process is not ready yet, so control-plane recovery cannot start.',
unknown:
'BrowserOS hit an unexpected gateway error and could not classify it cleanly.',
}

View File

@@ -1,167 +0,0 @@
import type { LlmProviderConfig } from '@/lib/llm-providers/types'
import type { HarnessAgent, HarnessAgentAdapter } from './agent-harness-types'
import {
type AgentListItem,
CONTROL_PLANE_COPY,
FALLBACK_CONTROL_PLANE_COPY,
type GatewayUiState,
LIFECYCLE_BANNER_COPY,
type ProviderOption,
RECOVERY_REASON_COPY,
} from './agents-page-types'
import { getOpenClawSupportedProviders } from './openclaw-supported-providers'
import {
type AgentEntry,
type GatewayLifecycleAction,
getModelDisplayName,
type OpenClawStatus,
} from './useOpenClaw'
export function getControlPlaneCopy(
status: OpenClawStatus['controlPlaneStatus'],
) {
return CONTROL_PLANE_COPY[status] ?? FALLBACK_CONTROL_PLANE_COPY
}
export function getRecoveryDetail(status: OpenClawStatus): string | null {
if (!status.lastRecoveryReason && !status.lastGatewayError) return null
const detail = status.lastRecoveryReason
? RECOVERY_REASON_COPY[status.lastRecoveryReason]
: null
if (status.lastGatewayError && detail) {
return `${detail} Latest gateway error: ${status.lastGatewayError}`
}
return status.lastGatewayError ?? detail
}
export function formatHarnessAdapter(adapter: HarnessAgentAdapter): string {
return adapter === 'claude' ? 'Claude Code' : 'Codex'
}
export function toProviderOptions(
providers: LlmProviderConfig[],
cliProviders: ProviderOption[],
): ProviderOption[] {
return [...getOpenClawSupportedProviders(providers), ...cliProviders]
}
export function toOpenClawListItem(
agent: AgentEntry,
canManageAgents: boolean,
): AgentListItem {
return {
key: `openclaw:${agent.agentId}`,
agentId: agent.agentId,
name: agent.name,
source: 'openclaw',
runtimeLabel: 'OpenClaw',
modelLabel: getModelDisplayName(agent.model) ?? 'default',
detail: agent.workspace,
canChat: canManageAgents,
canDelete: canManageAgents && agent.agentId !== 'main',
}
}
export function toHarnessListItem(agent: HarnessAgent): AgentListItem {
return {
key: `agent-harness:${agent.id}`,
agentId: agent.id,
name: agent.name,
source: 'agent-harness',
runtimeLabel: formatHarnessAdapter(agent.adapter),
modelLabel: agent.modelId ?? 'default',
detail: `${agent.adapter}:main`,
canChat: true,
canDelete: true,
}
}
export function getGatewayUiState(
status: OpenClawStatus | null,
): GatewayUiState {
if (!status) {
return {
canManageAgents: false,
controlPlaneDegraded: false,
controlPlaneBusy: false,
}
}
const controlPlaneBusy =
status.controlPlaneStatus === 'connecting' ||
status.controlPlaneStatus === 'reconnecting' ||
status.controlPlaneStatus === 'recovering'
return {
canManageAgents:
status.status === 'running' && status.controlPlaneStatus === 'connected',
controlPlaneBusy,
controlPlaneDegraded:
status.status === 'running' && status.controlPlaneStatus !== 'connected',
}
}
export function getLifecycleBanner(
action: GatewayLifecycleAction | null,
): string | null {
return action ? LIFECYCLE_BANNER_COPY[action] : null
}
export function canManageOpenClawAgents(
state: GatewayUiState,
lifecyclePending: boolean,
): boolean {
return state.canManageAgents && !lifecyclePending
}
export function shouldShowControlPlaneDegraded(
state: GatewayUiState,
lifecyclePending: boolean,
): boolean {
return state.controlPlaneDegraded && !lifecyclePending
}
export function getControlPlaneCopyForStatus(status: OpenClawStatus | null) {
return status
? getControlPlaneCopy(status.controlPlaneStatus)
: FALLBACK_CONTROL_PLANE_COPY
}
export function getVisibleOpenClawAgents(
enabled: boolean,
agents: AgentEntry[],
): AgentEntry[] {
return enabled ? agents : []
}
export function getAgentsLoading(input: {
adaptersLoading: boolean
harnessAgentsLoading: boolean
openClawAgentsLoading: boolean
}): boolean {
return (
input.adaptersLoading ||
input.harnessAgentsLoading ||
input.openClawAgentsLoading
)
}
export function getInlineError(input: {
lifecyclePending: boolean
pageError: string | null
openClawAgentsError: Error | null
adaptersError: Error | null
harnessAgentsError: Error | null
}): string | null {
if (input.lifecyclePending) return null
return (
input.pageError ??
input.openClawAgentsError?.message ??
input.adaptersError?.message ??
input.harnessAgentsError?.message ??
null
)
}

View File

@@ -1,185 +0,0 @@
import { useQuery } from '@tanstack/react-query'
import { CheckCircle2, Loader2, Terminal, TriangleAlert } from 'lucide-react'
import type { FC } from 'react'
import { Button } from '@/components/ui/button'
import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
export interface OpenClawCliProvider {
id: string
displayName: string
description: string
models: readonly string[]
authLoginCommand: string
}
export interface OpenClawCliProviderAuthStatus {
installed: boolean
loggedIn: boolean
accountLabel?: string
subscriptionLabel?: string
error?: string
}
export interface OpenClawCliProviderOption {
id: string
type: string
name: string
modelId: string
}
const CLAUDE_CLI_PROVIDER: OpenClawCliProvider = {
id: 'claude-cli',
displayName: 'Anthropic Claude CLI',
description: 'Uses your Claude.ai subscription via the Claude Code CLI',
models: ['claude-sonnet-4-6', 'claude-opus-4-6', 'claude-haiku-4-5'],
authLoginCommand: 'claude /login',
}
export const OPENCLAW_CLI_PROVIDERS: readonly OpenClawCliProvider[] = [
CLAUDE_CLI_PROVIDER,
]
export function findOpenClawCliProviderById(
id: string,
): OpenClawCliProvider | undefined {
return OPENCLAW_CLI_PROVIDERS.find((provider) => provider.id === id)
}
export function buildOpenClawCliProviderOptions(): OpenClawCliProviderOption[] {
return OPENCLAW_CLI_PROVIDERS.flatMap((provider) =>
provider.models.map((modelId) => ({
id: `${provider.id}/${modelId}`,
type: provider.id,
name: provider.displayName,
modelId,
})),
)
}
async function fetchCliProviderAuthStatus(
baseUrl: string,
providerId: string,
): Promise<OpenClawCliProviderAuthStatus> {
const res = await fetch(`${baseUrl}/claw/providers/${providerId}/auth-status`)
if (!res.ok) {
let message = `Auth status request failed (${res.status})`
try {
const body = (await res.json()) as { error?: string }
if (body.error) message = body.error
} catch {}
throw new Error(message)
}
return res.json() as Promise<OpenClawCliProviderAuthStatus>
}
export function useOpenClawCliProviderAuthStatus(
providerId: string,
enabled: boolean,
) {
const { baseUrl, isLoading: urlLoading } = useAgentServerUrl()
return useQuery<OpenClawCliProviderAuthStatus, Error>({
queryKey: ['openclaw-cli-auth', baseUrl, providerId],
queryFn: () => fetchCliProviderAuthStatus(baseUrl as string, providerId),
enabled: !!baseUrl && !urlLoading && enabled,
refetchInterval: enabled ? 2000 : false,
})
}
interface OpenClawCliProviderStatusPanelProps {
provider: OpenClawCliProvider
status: OpenClawCliProviderAuthStatus | undefined
loading: boolean
fetchError: Error | null
onConnect: () => void
}
export const OpenClawCliProviderStatusPanel: FC<
OpenClawCliProviderStatusPanelProps
> = ({ provider, status, loading, fetchError, onConnect }) => {
// Initial fetch (no data yet).
if (loading && !status) {
return (
<div className="flex items-center gap-2 rounded-md border border-border bg-muted/30 px-3 py-2 text-sm">
<Loader2 className="size-4 animate-spin text-muted-foreground" />
<span className="text-muted-foreground">
Checking {provider.displayName} status
</span>
</div>
)
}
if (fetchError) {
return (
<div className="flex items-start gap-2 rounded-md border border-destructive/30 bg-destructive/5 px-3 py-2 text-sm">
<TriangleAlert className="mt-0.5 size-4 text-destructive" />
<div>
<div className="font-medium text-destructive">
Could not read {provider.displayName} status
</div>
<div className="text-muted-foreground text-xs">
{fetchError.message}
</div>
</div>
</div>
)
}
if (!status) return null
// Install failed or binary missing.
if (!status.installed) {
return (
<div className="flex items-start gap-2 rounded-md border border-amber-500/40 bg-amber-500/5 px-3 py-2 text-sm">
<TriangleAlert className="mt-0.5 size-4 text-amber-600" />
<div>
<div className="font-medium">
{provider.displayName} not installed
</div>
<div className="text-muted-foreground text-xs">
The gateway will try to install it on the next restart. If this
persists, check your network and the gateway logs.
</div>
</div>
</div>
)
}
// Happy path.
if (status.loggedIn) {
const identityBits = [
status.accountLabel,
status.subscriptionLabel ? `(${status.subscriptionLabel})` : null,
].filter(Boolean)
const identity = identityBits.length > 0 ? identityBits.join(' ') : 'Ready'
return (
<div className="flex items-center gap-2 rounded-md border border-emerald-500/40 bg-emerald-500/5 px-3 py-2 text-sm">
<CheckCircle2 className="size-4 text-emerald-600" />
<div className="min-w-0 flex-1">
<div className="font-medium">Connected to {provider.displayName}</div>
<div className="truncate text-muted-foreground text-xs">
{identity}
</div>
</div>
</div>
)
}
// Installed but not logged in.
return (
<div className="flex flex-col gap-2 rounded-md border border-border bg-muted/30 px-3 py-3 text-sm">
<div>
<div className="font-medium">{provider.displayName} not set up</div>
<div className="text-muted-foreground text-xs">
{provider.description}
</div>
{status.error && (
<div className="mt-1 text-destructive text-xs">{status.error}</div>
)}
</div>
<Button size="sm" variant="outline" onClick={onConnect} className="w-fit">
<Terminal className="mr-1 size-4" />
Connect {provider.displayName}
</Button>
</div>
)
}

View File

@@ -1,38 +0,0 @@
import { describe, expect, it } from 'bun:test'
import { buildAgentApiUrl } from './agent-api-url'
import { mapHarnessAgentToEntry } from './agent-harness-types'
describe('mapHarnessAgentToEntry', () => {
it('maps created harness agents into chat-compatible entries', () => {
expect(
mapHarnessAgentToEntry({
id: 'agent-1',
name: 'Review bot',
adapter: 'codex',
modelId: 'gpt-5.5',
reasoningEffort: 'medium',
permissionMode: 'approve-all',
sessionKey: 'agent:agent-1:main',
createdAt: 1000,
updatedAt: 1000,
}),
).toEqual({
agentId: 'agent-1',
name: 'Review bot',
workspace: 'codex:main',
model: 'gpt-5.5',
source: 'agent-harness',
})
})
})
describe('buildAgentApiUrl', () => {
it('does not add a trailing slash for the harness root route', () => {
expect(buildAgentApiUrl('http://127.0.0.1:9105', '/')).toBe(
'http://127.0.0.1:9105/agents',
)
expect(buildAgentApiUrl('http://127.0.0.1:9105', '/adapters')).toBe(
'http://127.0.0.1:9105/agents/adapters',
)
})
})

View File

@@ -1,262 +0,0 @@
import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
import { getAgentServerUrl } from '@/lib/browseros/helpers'
import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
import { buildAgentApiUrl } from './agent-api-url'
import {
type AgentHarnessStreamEvent,
type CreateHarnessAgentInput,
type HarnessAdapterDescriptor,
type HarnessAgent,
type HarnessAgentHistoryPage,
mapHarnessAgentToEntry,
} from './agent-harness-types'
import type { OpenClawStatus } from './useOpenClaw'
/**
* Combined response shape of `GET /agents`. The page polls this once
* and consumes both fields, replacing the dedicated `/claw/status`
* poll the previous design carried.
*/
interface HarnessAgentsResponse {
agents: HarnessAgent[]
gateway: OpenClawStatus | null
}
export type { AgentHarnessStreamEvent }
const AGENT_QUERY_KEYS = {
adapters: 'agent-harness-adapters',
agents: 'agent-harness-agents',
} as const
async function agentsFetch<T>(
baseUrl: string,
path: string,
init?: RequestInit,
): Promise<T> {
const res = await fetch(buildAgentApiUrl(baseUrl, path), init)
if (!res.ok) {
let message = `Request failed with status ${res.status}`
try {
const body = (await res.json()) as { error?: string }
if (body.error) message = body.error
} catch {}
throw new Error(message)
}
return res.json() as Promise<T>
}
export function useAgentAdapters(enabled = true) {
const {
baseUrl,
isLoading: urlLoading,
error: urlError,
} = useAgentServerUrl()
const query = useQuery<HarnessAdapterDescriptor[], Error>({
queryKey: [AGENT_QUERY_KEYS.adapters, baseUrl],
queryFn: async () => {
const data = await agentsFetch<{ adapters: HarnessAdapterDescriptor[] }>(
baseUrl as string,
'/adapters',
)
return data.adapters ?? []
},
enabled: Boolean(baseUrl) && !urlLoading && enabled,
})
return {
adapters: query.data ?? [],
loading: query.isLoading || urlLoading,
error: query.error ?? urlError,
refetch: query.refetch,
}
}
export function useHarnessAgents(enabled = true) {
const {
baseUrl,
isLoading: urlLoading,
error: urlError,
} = useAgentServerUrl()
const query = useQuery<HarnessAgentsResponse, Error>({
queryKey: [AGENT_QUERY_KEYS.agents, baseUrl],
queryFn: async () => {
const data = await agentsFetch<HarnessAgentsResponse>(
baseUrl as string,
'/',
)
return {
agents: data.agents ?? [],
gateway: data.gateway ?? null,
}
},
enabled: Boolean(baseUrl) && !urlLoading && enabled,
// Poll every 5s so the per-agent liveness state (working / idle /
// asleep / error) and last-used timestamps stay fresh without a
// websocket. `refetchIntervalInBackground: false` lets a hidden
// tab go quiet — react-query's default, made explicit.
refetchInterval: 5_000,
refetchIntervalInBackground: false,
})
return {
agents: (query.data?.agents ?? []).map(mapHarnessAgentToEntry),
harnessAgents: query.data?.agents ?? [],
gateway: query.data?.gateway ?? null,
loading: query.isLoading || urlLoading,
error: query.error ?? urlError,
refetch: query.refetch,
}
}
export function useCreateHarnessAgent() {
const { baseUrl, isLoading: urlLoading } = useAgentServerUrl()
const queryClient = useQueryClient()
return useMutation({
mutationFn: async (input: CreateHarnessAgentInput) => {
if (!baseUrl || urlLoading) {
throw new Error('BrowserOS agent server URL is not ready')
}
const data = await agentsFetch<{ agent: HarnessAgent }>(baseUrl, '/', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(input),
})
return data.agent
},
onSuccess: async () => {
await queryClient.invalidateQueries({
queryKey: [AGENT_QUERY_KEYS.agents],
})
},
})
}
export function useDeleteHarnessAgent() {
const { baseUrl, isLoading: urlLoading } = useAgentServerUrl()
const queryClient = useQueryClient()
return useMutation({
mutationFn: async (agentId: string) => {
if (!baseUrl || urlLoading) {
throw new Error('BrowserOS agent server URL is not ready')
}
return agentsFetch<{ success: boolean }>(
baseUrl,
`/${encodeURIComponent(agentId)}`,
{ method: 'DELETE' },
)
},
onSuccess: async () => {
await queryClient.invalidateQueries({
queryKey: [AGENT_QUERY_KEYS.agents],
})
},
})
}
export async function chatWithHarnessAgent(
agentId: string,
message: string,
signal?: AbortSignal,
attachments?: ReadonlyArray<unknown>,
): Promise<Response> {
const baseUrl = await getAgentServerUrl()
return fetch(`${baseUrl}/agents/${encodeURIComponent(agentId)}/chat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
message,
...(attachments && attachments.length > 0 ? { attachments } : {}),
}),
signal,
})
}
/**
* Subscribe to an existing turn (the server's `ActiveTurnRegistry`
* decoupled the turn lifecycle from POST /chat). `lastSeq` lets the
* client resume after a disconnect — the server replays buffered
* frames with seq > lastSeq, then tails new ones.
*/
export async function attachToHarnessTurn(
agentId: string,
options: { turnId?: string; lastSeq?: number; signal?: AbortSignal } = {},
): Promise<Response> {
const baseUrl = await getAgentServerUrl()
const url = new URL(
`${baseUrl}/agents/${encodeURIComponent(agentId)}/chat/stream`,
)
if (options.turnId) url.searchParams.set('turnId', options.turnId)
const headers: Record<string, string> = {}
if (typeof options.lastSeq === 'number') {
headers['Last-Event-ID'] = String(options.lastSeq)
}
return fetch(url.toString(), { signal: options.signal, headers })
}
export interface HarnessActiveTurnInfo {
turnId: string
agentId: string
sessionId: 'main'
status: 'running' | 'done' | 'error' | 'cancelled'
lastSeq: number
startedAt: number
endedAt?: number
}
/**
* Discover an in-flight turn for an agent. Used on chat mount so the
* UI reattaches instead of starting a new turn after a tab/refresh.
*/
export async function fetchActiveHarnessTurn(
agentId: string,
): Promise<HarnessActiveTurnInfo | null> {
const baseUrl = await getAgentServerUrl()
const response = await fetch(
`${baseUrl}/agents/${encodeURIComponent(agentId)}/chat/active`,
)
if (!response.ok) return null
const body = (await response.json()) as {
active: HarnessActiveTurnInfo | null
}
return body.active
}
/**
* Stop button. Hits the explicit cancel endpoint instead of just
* aborting the fetch (which now only detaches *this* subscriber from
* the buffer; the underlying turn would otherwise keep running).
*/
export async function cancelHarnessTurn(
agentId: string,
options: { turnId?: string; reason?: string } = {},
): Promise<{ cancelled: boolean }> {
const baseUrl = await getAgentServerUrl()
const response = await fetch(
`${baseUrl}/agents/${encodeURIComponent(agentId)}/chat/cancel`,
{
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
...(options.turnId ? { turnId: options.turnId } : {}),
...(options.reason ? { reason: options.reason } : {}),
}),
},
)
if (!response.ok) return { cancelled: false }
return (await response.json()) as { cancelled: boolean }
}
export async function fetchHarnessAgentHistory(
agentId: string,
): Promise<HarnessAgentHistoryPage> {
const baseUrl = await getAgentServerUrl()
return agentsFetch<HarnessAgentHistoryPage>(
baseUrl,
`/${encodeURIComponent(agentId)}/sessions/main/history`,
)
}

View File

@@ -1,4 +1,5 @@
import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
import { getAgentServerUrl } from '@/lib/browseros/helpers'
import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
export interface AgentEntry {
@@ -6,7 +7,6 @@ export interface AgentEntry {
name: string
workspace: string
model?: unknown
source?: 'openclaw' | 'agent-harness'
}
export interface OpenClawStatus {
@@ -41,7 +41,6 @@ export interface OpenClawAgentMutationInput {
baseUrl?: string
apiKey?: string
modelId?: string
supportsImages?: boolean
}
export interface OpenClawSetupInput {
@@ -50,10 +49,6 @@ export interface OpenClawSetupInput {
baseUrl?: string
apiKey?: string
modelId?: string
// Mirrors LlmProviderConfig.supportsImages — pass-through so the gateway
// can declare the model's input modalities correctly when persisting the
// custom-provider config.
supportsImages?: boolean
}
export function getModelDisplayName(model: unknown): string | undefined {
@@ -64,8 +59,14 @@ export function getModelDisplayName(model: unknown): string | undefined {
export const OPENCLAW_QUERY_KEYS = {
status: 'openclaw-status',
agents: 'openclaw-agents',
podmanOverrides: 'openclaw-podman-overrides',
} as const
export interface PodmanOverrides {
podmanPath: string | null
effectivePodmanPath: string
}
export type GatewayLifecycleAction =
| 'setup'
| 'start'
@@ -98,10 +99,7 @@ async function fetchOpenClawStatus(baseUrl: string): Promise<OpenClawStatus> {
async function fetchOpenClawAgents(baseUrl: string): Promise<AgentEntry[]> {
const data = await clawFetch<{ agents: AgentEntry[] }>(baseUrl, '/agents')
return (data.agents ?? []).map((agent) => ({
...agent,
source: 'openclaw',
}))
return data.agents ?? []
}
async function invalidateOpenClawQueries(
@@ -264,6 +262,50 @@ export function useOpenClawMutations() {
}
}
export function usePodmanOverrides() {
const {
baseUrl,
isLoading: urlLoading,
error: urlError,
} = useAgentServerUrl()
const queryClient = useQueryClient()
const query = useQuery<PodmanOverrides, Error>({
queryKey: [OPENCLAW_QUERY_KEYS.podmanOverrides, baseUrl],
queryFn: () =>
clawFetch<PodmanOverrides>(baseUrl as string, '/podman-overrides'),
enabled: !!baseUrl && !urlLoading,
})
const saveMutation = useMutation({
mutationFn: async (podmanPath: string | null) =>
clawFetch<PodmanOverrides>(baseUrl as string, '/podman-overrides', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ podmanPath }),
}),
onSuccess: async () => {
await Promise.all([
queryClient.invalidateQueries({
queryKey: [OPENCLAW_QUERY_KEYS.podmanOverrides],
}),
queryClient.invalidateQueries({
queryKey: [OPENCLAW_QUERY_KEYS.status],
}),
])
},
})
return {
overrides: query.data ?? null,
loading: query.isLoading || urlLoading,
error: (query.error ?? urlError) as Error | null,
saving: saveMutation.isPending,
saveOverrides: (podmanPath: string) => saveMutation.mutateAsync(podmanPath),
clearOverrides: () => saveMutation.mutateAsync(null),
}
}
export interface OpenClawStreamEvent {
type:
| 'text-delta'
@@ -318,3 +360,19 @@ export function buildChatHistoryFromTurns(
return messages
}
export async function chatWithAgent(
agentId: string,
message: string,
sessionKey?: string,
history: OpenClawChatHistoryMessage[] = [],
signal?: AbortSignal,
): Promise<Response> {
const baseUrl = await getAgentServerUrl()
return fetch(`${baseUrl}/claw/agents/${agentId}/chat`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ message, sessionKey, history }),
signal,
})
}

View File

@@ -164,17 +164,9 @@ export const NewScheduledTaskDialog: FC<NewScheduledTaskDialogProps> = ({
const resolvedProvider: Provider | null = (() => {
const id = selectedProviderId ?? defaultProviderId
const found = providers.find((p) => p.id === id)
if (found) {
return {
kind: 'llm' as const,
id: found.id,
name: found.name,
type: found.type,
}
}
if (found) return { id: found.id, name: found.name, type: found.type }
if (providers[0])
return {
kind: 'llm' as const,
id: providers[0].id,
name: providers[0].name,
type: providers[0].type,
@@ -183,7 +175,6 @@ export const NewScheduledTaskDialog: FC<NewScheduledTaskDialogProps> = ({
})()
const providerOptions: Provider[] = providers.map((p) => ({
kind: 'llm',
id: p.id,
name: p.name,
type: p.type,

View File

@@ -18,8 +18,8 @@ describe('route-utils', () => {
expect(shouldUseChatSession('/home/chat')).toBe(true)
})
it('hides the focus grid on full-screen routes', () => {
expect(shouldHideFocusGrid('/home')).toBe(true)
it('keeps the focus grid on home while hiding it on dedicated full-screen routes', () => {
expect(shouldHideFocusGrid('/home')).toBe(false)
expect(shouldHideFocusGrid('/home/agents/main')).toBe(true)
expect(shouldHideFocusGrid('/home/chat')).toBe(true)
expect(shouldHideFocusGrid('/home/skills')).toBe(true)

View File

@@ -1,5 +1,4 @@
const HIDE_FOCUS_GRID_PATHS = new Set([
'/home',
'/home/soul',
'/home/memory',
'/home/skills',

View File

@@ -1,4 +1,4 @@
import { Bot, Github, History, Plus, SettingsIcon } from 'lucide-react'
import { Github, History, Plus, SettingsIcon } from 'lucide-react'
import type { FC } from 'react'
import { Link, useLocation, useNavigate } from 'react-router'
import { ChatProviderSelector } from '@/components/chat/ChatProviderSelector'
@@ -64,9 +64,7 @@ export const ChatHeader: FC<ChatHeaderProps> = ({
className="group relative inline-flex cursor-pointer items-center gap-2 rounded-lg p-2 text-muted-foreground transition-colors hover:bg-muted/50 hover:text-foreground data-[state=open]:bg-accent"
title="Change AI Provider"
>
{selectedProvider.kind === 'acp' ? (
<Bot className="h-[18px] w-[18px]" />
) : selectedProvider.type === 'browseros' ? (
{selectedProvider.type === 'browseros' ? (
<BrowserOSIcon size={18} />
) : (
<ProviderIcon

View File

@@ -1,200 +0,0 @@
import { describe, expect, it } from 'bun:test'
import type { HarnessAdapterDescriptor } from '@/entrypoints/app/agents/agent-harness-types'
import type { LlmProviderConfig } from '@/lib/llm-providers/types'
import {
buildSidepanelChatTargets,
persistSidepanelChatTargetSelection,
resolveSidepanelChatTarget,
type SidepanelChatTargetSelection,
toLlmProviderConfig,
} from './sidepanel-chat-targets'
const timestamp = 1000
const providers: LlmProviderConfig[] = [
{
id: 'browseros',
type: 'browseros',
name: 'BrowserOS',
baseUrl: 'https://api.browseros.com/v1',
modelId: 'browseros-auto',
supportsImages: true,
contextWindow: 200000,
temperature: 0.2,
createdAt: timestamp,
updatedAt: timestamp,
},
{
id: 'anthropic-sonnet',
type: 'anthropic',
name: 'Anthropic Sonnet',
modelId: 'claude-sonnet-4-6',
apiKey: 'sk-ant',
supportsImages: true,
contextWindow: 200000,
temperature: 0.2,
createdAt: timestamp,
updatedAt: timestamp,
},
]
const adapters: HarnessAdapterDescriptor[] = [
{
id: 'claude',
name: 'Claude Code',
defaultModelId: 'haiku',
defaultReasoningEffort: 'medium',
modelControl: 'best-effort',
models: [
{ id: 'sonnet', label: 'Sonnet' },
{ id: 'haiku', label: 'Haiku', recommended: true },
],
reasoningEfforts: [
{ id: 'medium', label: 'Medium', recommended: true },
{ id: 'high', label: 'High' },
],
},
{
id: 'codex',
name: 'Codex',
defaultModelId: 'gpt-5.5',
defaultReasoningEffort: 'medium',
modelControl: 'runtime-supported',
models: [{ id: 'gpt-5.5', label: 'GPT-5.5', recommended: true }],
reasoningEfforts: [{ id: 'medium', label: 'Medium', recommended: true }],
},
{
id: 'openclaw',
name: 'OpenClaw',
defaultModelId: 'default',
defaultReasoningEffort: 'medium',
modelControl: 'best-effort',
models: [],
reasoningEfforts: [
{ id: 'medium', label: 'Medium', recommended: true },
{ id: 'high', label: 'High' },
],
},
]
describe('buildSidepanelChatTargets', () => {
it('returns LLM targets plus one ACP target per adapter model', () => {
const targets = buildSidepanelChatTargets({ providers, adapters })
expect(targets.map((target) => target.id)).toEqual([
'browseros',
'anthropic-sonnet',
'acp:claude:sonnet:medium',
'acp:claude:haiku:medium',
'acp:codex:gpt-5.5:medium',
'acp:openclaw:default:medium',
])
})
it('emits a single default ACP target for adapters with no per-session model picker', () => {
const targets = buildSidepanelChatTargets({ providers, adapters })
const openclaw = targets.find(
(target) => target.id === 'acp:openclaw:default:medium',
)
expect(openclaw).toMatchObject({
kind: 'acp',
adapter: 'openclaw',
adapterName: 'OpenClaw',
modelId: 'default',
modelLabel: 'default',
// Without a model picker, the target name is just the adapter
// name — the user picks the adapter, not a model under it.
name: 'OpenClaw',
modelControl: 'best-effort',
reasoningEffort: 'medium',
})
})
it('preserves ACP model-control and recommendation metadata', () => {
const targets = buildSidepanelChatTargets({ providers, adapters })
const haiku = targets.find(
(target) => target.id === 'acp:claude:haiku:medium',
)
expect(haiku).toMatchObject({
kind: 'acp',
adapter: 'claude',
modelId: 'haiku',
modelControl: 'best-effort',
recommended: true,
reasoningEffort: 'medium',
})
})
it('still returns LLM targets when ACP adapters are unavailable', () => {
expect(buildSidepanelChatTargets({ providers, adapters: [] })).toEqual([
{
kind: 'llm',
id: 'browseros',
name: 'BrowserOS',
type: 'browseros',
provider: providers[0],
},
{
kind: 'llm',
id: 'anthropic-sonnet',
name: 'Anthropic Sonnet',
type: 'anthropic',
provider: providers[1],
},
])
})
})
describe('resolveSidepanelChatTarget', () => {
it('resolves selected LLM targets back to their provider config', () => {
const targets = buildSidepanelChatTargets({ providers, adapters })
const resolved = resolveSidepanelChatTarget({
targets,
defaultProviderId: 'browseros',
selection: { kind: 'llm', id: 'anthropic-sonnet' },
})
expect(resolved?.kind).toBe('llm')
expect(toLlmProviderConfig(resolved)?.modelId).toBe('claude-sonnet-4-6')
})
it('falls back to the current default LLM provider when a persisted ACP target is stale', () => {
const targets = buildSidepanelChatTargets({ providers, adapters: [] })
expect(
resolveSidepanelChatTarget({
targets,
defaultProviderId: 'anthropic-sonnet',
selection: { kind: 'acp', id: 'acp:claude:haiku:medium' },
}),
).toMatchObject({
kind: 'llm',
id: 'anthropic-sonnet',
})
})
})
describe('persistSidepanelChatTargetSelection', () => {
it('stores only target identity and does not mutate LLM provider arrays', async () => {
let savedSelection: SidepanelChatTargetSelection | null = null
const originalProviders = providers.map((provider) => ({ ...provider }))
const targets = buildSidepanelChatTargets({ providers, adapters })
const target = targets.find(
(candidate) => candidate.id === 'acp:codex:gpt-5.5:medium',
)
await persistSidepanelChatTargetSelection(target, {
setValue: async (value) => {
savedSelection = value
},
})
expect(savedSelection as SidepanelChatTargetSelection | null).toEqual({
kind: 'acp',
id: 'acp:codex:gpt-5.5:medium',
})
expect(providers).toEqual(originalProviders)
})
})

View File

@@ -1,195 +0,0 @@
import type {
HarnessAdapterDescriptor,
HarnessAgentAdapter,
} from '@/entrypoints/app/agents/agent-harness-types'
import type { LlmProviderConfig, ProviderType } from '@/lib/llm-providers/types'
export type SidepanelTargetKind = 'llm' | 'acp'
export type SidepanelChatTarget =
| {
kind: 'llm'
id: string
name: string
type: ProviderType
provider: LlmProviderConfig
}
| {
kind: 'acp'
id: string
name: string
type: 'acp'
adapter: HarnessAgentAdapter
adapterName: string
modelId: string
modelLabel: string
modelControl: HarnessAdapterDescriptor['modelControl']
recommended?: boolean
reasoningEffort: string
reasoningEffortLabel?: string
}
export type SidepanelChatTargetSelection = Pick<
SidepanelChatTarget,
'kind' | 'id'
>
interface BuildSidepanelChatTargetsInput {
providers: LlmProviderConfig[]
adapters: HarnessAdapterDescriptor[]
}
interface ResolveSidepanelChatTargetInput {
targets: SidepanelChatTarget[]
defaultProviderId: string
selection?: SidepanelChatTargetSelection | null
}
interface SidepanelChatTargetSelectionWriter {
setValue(value: SidepanelChatTargetSelection | null): Promise<void>
}
interface SidepanelChatTargetSelectionReader {
getValue(): Promise<SidepanelChatTargetSelection | null>
}
type SidepanelChatTargetSelectionStore = SidepanelChatTargetSelectionReader &
SidepanelChatTargetSelectionWriter
let sidepanelChatTargetSelectionStorage:
| SidepanelChatTargetSelectionStore
| undefined
export function buildSidepanelChatTargets({
providers,
adapters,
}: BuildSidepanelChatTargetsInput): SidepanelChatTarget[] {
return [
...providers.map(toLlmTarget),
...adapters.flatMap(toAcpTargetsForAdapter),
]
}
function toAcpTargetsForAdapter(
adapter: HarnessAdapterDescriptor,
): SidepanelChatTarget[] {
const reasoning = adapter.reasoningEfforts.find(
(effort) => effort.id === adapter.defaultReasoningEffort,
)
const reasoningEffort =
reasoning?.id ?? adapter.defaultReasoningEffort ?? 'medium'
// Adapters with no per-session model picker (e.g. OpenClaw, whose
// model lives on the gateway-side agent record) still need exactly
// one sidepanel target so the user can pick the adapter at all.
if (adapter.models.length === 0) {
return [
{
kind: 'acp',
id: buildAcpTargetId(
adapter.id,
adapter.defaultModelId,
reasoningEffort,
),
name: adapter.name,
type: 'acp',
adapter: adapter.id,
adapterName: adapter.name,
modelId: adapter.defaultModelId,
modelLabel: 'default',
modelControl: adapter.modelControl,
reasoningEffort,
reasoningEffortLabel: reasoning?.label,
},
]
}
return adapter.models.map((model) => ({
kind: 'acp' as const,
id: buildAcpTargetId(adapter.id, model.id, reasoningEffort),
name: `${adapter.name} ${model.label}`,
type: 'acp' as const,
adapter: adapter.id,
adapterName: adapter.name,
modelId: model.id,
modelLabel: model.label,
modelControl: adapter.modelControl,
recommended: model.recommended,
reasoningEffort,
reasoningEffortLabel: reasoning?.label,
}))
}
export function resolveSidepanelChatTarget({
targets,
defaultProviderId,
selection,
}: ResolveSidepanelChatTargetInput): SidepanelChatTarget | undefined {
if (selection) {
const selected = targets.find(
(target) => target.kind === selection.kind && target.id === selection.id,
)
if (selected) return selected
}
return (
targets.find(
(target) => target.kind === 'llm' && target.id === defaultProviderId,
) ?? targets.find((target) => target.kind === 'llm')
)
}
export function toLlmProviderConfig(
target: SidepanelChatTarget | undefined,
): LlmProviderConfig | undefined {
return target?.kind === 'llm' ? target.provider : undefined
}
export async function persistSidepanelChatTargetSelection(
target: SidepanelChatTarget | undefined,
store?: SidepanelChatTargetSelectionWriter,
): Promise<void> {
const targetStore = store ?? (await getSidepanelChatTargetSelectionStorage())
await targetStore.setValue(
target ? { kind: target.kind, id: target.id } : null,
)
}
export async function loadSidepanelChatTargetSelection(
store?: SidepanelChatTargetSelectionReader,
): Promise<SidepanelChatTargetSelection | null> {
const targetStore = store ?? (await getSidepanelChatTargetSelectionStorage())
return targetStore.getValue()
}
function toLlmTarget(provider: LlmProviderConfig): SidepanelChatTarget {
return {
kind: 'llm',
id: provider.id,
name: provider.name,
type: provider.type,
provider,
}
}
export function buildAcpTargetId(
adapter: HarnessAgentAdapter,
modelId: string,
reasoningEffort: string,
): string {
return `acp:${adapter}:${modelId}:${reasoningEffort}`
}
async function getSidepanelChatTargetSelectionStorage(): Promise<SidepanelChatTargetSelectionStore> {
if (sidepanelChatTargetSelectionStorage) {
return sidepanelChatTargetSelectionStorage
}
const { storage } = await import('@wxt-dev/storage')
sidepanelChatTargetSelectionStorage =
storage.defineItem<SidepanelChatTargetSelection | null>(
'local:sidepanel-chat-target-selection',
{ fallback: null },
)
return sidepanelChatTargetSelectionStorage
}

View File

@@ -1,18 +1,9 @@
import { useCallback, useEffect, useMemo, useRef, useState } from 'react'
import { useEffect, useRef } from 'react'
import useDeepCompareEffect from 'use-deep-compare-effect'
import { useAgentAdapters } from '@/entrypoints/app/agents/useAgents'
import type { LlmProviderConfig } from '@/lib/llm-providers/types'
import { useLlmProviders } from '@/lib/llm-providers/useLlmProviders'
import { type McpServer, useMcpServers } from '@/lib/mcp/mcpServerStorage'
import { usePersonalization } from '@/lib/personalization/personalizationStorage'
import {
buildSidepanelChatTargets,
loadSidepanelChatTargetSelection,
persistSidepanelChatTargetSelection,
resolveSidepanelChatTarget,
type SidepanelChatTarget,
type SidepanelChatTargetSelection,
} from './sidepanel-chat-targets'
const constructMcpServers = (servers: McpServer[]) => {
return servers
@@ -32,51 +23,14 @@ const constructCustomServers = (servers: McpServer[]) => {
export const useChatRefs = () => {
const { servers: mcpServers } = useMcpServers()
const {
providers: llmProviders,
selectedProvider: selectedLlmProvider,
setDefaultProvider,
isLoading: isLoadingProviders,
} = useLlmProviders()
const { adapters, loading: isLoadingAdapters } = useAgentAdapters()
const { personalization } = usePersonalization()
const [targetSelection, setTargetSelection] =
useState<SidepanelChatTargetSelection | null>(null)
useEffect(() => {
let cancelled = false
loadSidepanelChatTargetSelection().then((selection) => {
if (!cancelled) setTargetSelection(selection)
})
return () => {
cancelled = true
}
}, [])
const chatTargets = useMemo(
() =>
buildSidepanelChatTargets({
providers: llmProviders,
adapters,
}),
[llmProviders, adapters],
)
const selectedChatTarget = useMemo(
() =>
resolveSidepanelChatTarget({
targets: chatTargets,
defaultProviderId: selectedLlmProvider?.id ?? llmProviders[0]?.id ?? '',
selection: targetSelection,
}),
[chatTargets, llmProviders, selectedLlmProvider, targetSelection],
)
const selectedLlmProviderRef = useRef<LlmProviderConfig | null>(
selectedLlmProvider,
)
const selectedChatTargetRef = useRef<SidepanelChatTarget | undefined>(
selectedChatTarget,
)
const enabledMcpServersRef = useRef(constructMcpServers(mcpServers))
const enabledCustomServersRef = useRef(constructCustomServers(mcpServers))
const personalizationRef = useRef(personalization)
@@ -87,35 +41,16 @@ export const useChatRefs = () => {
enabledCustomServersRef.current = constructCustomServers(mcpServers)
}, [selectedLlmProvider, mcpServers])
useEffect(() => {
selectedChatTargetRef.current = selectedChatTarget
}, [selectedChatTarget])
useEffect(() => {
personalizationRef.current = personalization
}, [personalization])
const selectChatTarget = useCallback(
async (target: SidepanelChatTarget | undefined) => {
selectedChatTargetRef.current = target
setTargetSelection(target ? { kind: target.kind, id: target.id } : null)
await persistSidepanelChatTargetSelection(target)
},
[],
)
return {
selectedLlmProviderRef,
selectedChatTargetRef,
enabledMcpServersRef,
enabledCustomServersRef,
personalizationRef,
llmProviders,
setDefaultProvider,
chatTargets,
selectedChatTarget,
selectChatTarget,
selectedLlmProvider,
isLoadingProviders: isLoadingProviders || isLoadingAdapters,
isLoadingProviders,
}
}

View File

@@ -1,153 +0,0 @@
import { describe, expect, it } from 'bun:test'
import type { LlmProviderConfig } from '@/lib/llm-providers/types'
import type { ChatMode } from './chatTypes'
import type { SidepanelChatTarget } from './sidepanel-chat-targets'
import { buildSidepanelPreparedSendMessagesRequest } from './useChatSessionRequest'
const conversationId = '00000000-0000-4000-8000-000000000001'
describe('buildSidepanelPreparedSendMessagesRequest', () => {
it('keeps LLM targets on the existing /chat request body', () => {
const request = buildSidepanelPreparedSendMessagesRequest({
agentServerUrl: 'http://127.0.0.1:5151',
target: llmTarget,
fallbackProvider,
message: 'Summarize this page',
...commonRequestInput(),
})
expect(request.api).toBe('http://127.0.0.1:5151/chat')
expect(request.body).toMatchObject({
message: 'Summarize this page',
conversationId,
provider: 'browseros',
providerType: 'browseros',
providerName: 'BrowserOS',
model: 'gpt-5',
mode: 'agent',
browserContext: {
activeTab: { id: 10, url: 'https://example.com', title: 'Example' },
enabledMcpServers: ['slack'],
},
userSystemPrompt: 'Be concise',
userWorkingDir: '/tmp/work',
previousConversation: [{ role: 'assistant', content: 'Prior answer' }],
selectedText: 'selected text',
selectedTextSource: {
url: 'https://example.com',
title: 'Example',
},
})
})
it('sends ACP targets to the sidepanel ACP route with explicit target fields', () => {
const request = buildSidepanelPreparedSendMessagesRequest({
agentServerUrl: 'http://127.0.0.1:5151',
target: acpTarget,
fallbackProvider,
message: 'Inspect the current tab',
approvalResponses: [
{ approvalId: 'approval-1', approved: true, reason: 'ok' },
],
...commonRequestInput(),
})
expect(request.api).toBe('http://127.0.0.1:5151/agents/sidepanel/chat')
expect(request.body).toEqual({
conversationId,
adapter: 'codex',
modelId: 'gpt-5.5',
reasoningEffort: 'medium',
message: 'Inspect the current tab',
browserContext: {
activeTab: { id: 10, url: 'https://example.com', title: 'Example' },
enabledMcpServers: ['slack'],
},
userSystemPrompt: 'Be concise',
userWorkingDir: '/tmp/work',
selectedText: 'selected text',
selectedTextSource: {
url: 'https://example.com',
title: 'Example',
},
})
})
it('keeps tool approval retry payloads scoped to LLM chat', () => {
const request = buildSidepanelPreparedSendMessagesRequest({
agentServerUrl: 'http://127.0.0.1:5151',
target: llmTarget,
fallbackProvider,
approvalResponses: [
{ approvalId: 'approval-1', approved: false, reason: 'no' },
],
...commonRequestInput(),
})
expect(request.api).toBe('http://127.0.0.1:5151/chat')
expect(request.body).toMatchObject({
message: '',
toolApprovalResponses: [
{ approvalId: 'approval-1', approved: false, reason: 'no' },
],
})
})
})
function commonRequestInput() {
return {
conversationId,
mode: 'agent' as ChatMode,
browserContext: {
activeTab: { id: 10, url: 'https://example.com', title: 'Example' },
enabledMcpServers: ['slack'],
},
userSystemPrompt: 'Be concise',
userWorkingDir: '/tmp/work',
previousConversation: [
{ role: 'assistant' as const, content: 'Prior answer' },
],
declinedApps: ['gmail'],
aclRules: [{ id: 'rule-1', sitePattern: '*://*/*', enabled: true }],
selectedText: 'selected text',
selectedTextSource: {
url: 'https://example.com',
title: 'Example',
},
toolApprovalConfig: { categories: { navigation: true } },
}
}
const fallbackProvider: LlmProviderConfig = {
id: 'browseros',
type: 'browseros',
name: 'BrowserOS',
modelId: 'gpt-5',
supportsImages: true,
contextWindow: 128000,
temperature: 0.7,
createdAt: 1000,
updatedAt: 1000,
}
const llmTarget: SidepanelChatTarget = {
kind: 'llm',
id: fallbackProvider.id,
name: fallbackProvider.name,
type: fallbackProvider.type,
provider: fallbackProvider,
}
const acpTarget: SidepanelChatTarget = {
kind: 'acp',
id: 'acp:codex:gpt-5.5:medium',
name: 'Codex GPT-5.5',
type: 'acp',
adapter: 'codex',
adapterName: 'Codex',
modelId: 'gpt-5.5',
modelLabel: 'GPT-5.5',
modelControl: 'best-effort',
reasoningEffort: 'medium',
reasoningEffortLabel: 'Medium',
}

View File

@@ -26,14 +26,15 @@ import { useInvalidateCredits } from '@/lib/credits/useCredits'
import { declinedAppsStorage } from '@/lib/declined-apps/storage'
import { useGraphqlQuery } from '@/lib/graphql/useGraphqlQuery'
import { createDefaultBrowserOSProvider } from '@/lib/llm-providers/storage'
import type {
ApprovalResponseData,
ChatRequestBrowserContext,
import { useLlmProviders } from '@/lib/llm-providers/useLlmProviders'
import {
type ApprovalResponseData,
buildChatRequestBody,
type ChatRequestBrowserContext,
} from '@/lib/messaging/server/buildChatRequestBody'
import { track } from '@/lib/metrics/track'
import { searchActionsStorage } from '@/lib/search-actions/searchActionsStorage'
import { selectedTextStorage } from '@/lib/selected-text/selectedTextStorage'
import { sentry } from '@/lib/sentry/sentry'
import { stopAgentStorage } from '@/lib/stop-agent/stop-agent-storage'
import {
type ApprovalResponse,
@@ -51,12 +52,7 @@ import {
import { selectedWorkspaceStorage } from '@/lib/workspace/workspace-storage'
import type { ChatMode } from './chatTypes'
import { GetConversationWithMessagesDocument } from './graphql/chatSessionDocument'
import { toLlmProviderConfig } from './sidepanel-chat-targets'
import { useChatRefs } from './useChatRefs'
import {
buildSidepanelPreparedSendMessagesRequest,
toProviderOption,
} from './useChatSessionRequest'
import { useExecutionHistoryTracker } from './useExecutionHistoryTracker'
import { useNotifyActiveTab } from './useNotifyActiveTab'
import { useRemoteConversationSave } from './useRemoteConversationSave'
@@ -190,19 +186,16 @@ const buildRequestBrowserContext = ({
export const useChatSession = (options?: ChatSessionOptions) => {
const {
selectedLlmProviderRef,
selectedChatTargetRef,
enabledMcpServersRef,
enabledCustomServersRef,
personalizationRef,
setDefaultProvider,
chatTargets,
selectedChatTarget,
selectChatTarget,
selectedLlmProvider,
isLoadingProviders,
} = useChatRefs()
const invalidateCredits = useInvalidateCredits()
const { providers: llmProviders, setDefaultProvider } = useLlmProviders()
const {
baseUrl: agentServerUrl,
isLoading: isLoadingAgentUrl,
@@ -225,7 +218,11 @@ export const useChatSession = (options?: ChatSessionOptions) => {
agentUrlRef.current = agentServerUrl
}, [agentServerUrl])
const providers: Provider[] = chatTargets.map(toProviderOption)
const providers: Provider[] = llmProviders.map((p) => ({
id: p.id,
name: p.name,
type: p.type,
}))
const [mode, setMode] = useState<ChatMode>('agent')
const [textToAction, setTextToAction] = useState<Map<string, ChatAction>>(
@@ -327,8 +324,15 @@ export const useChatSession = (options?: ChatSessionOptions) => {
textToActionRef.current = textToAction
}, [mode, textToAction])
const selectedProvider = selectedChatTarget
? toProviderOption(selectedChatTarget)
const selectedProvider = selectedLlmProvider
? {
id: selectedLlmProvider.id,
name: selectedLlmProvider.name,
type:
selectedLlmProvider.id === 'browseros'
? ('browseros' as const)
: selectedLlmProvider.type,
}
: providers[0]
const {
@@ -342,8 +346,7 @@ export const useChatSession = (options?: ChatSessionOptions) => {
} = useChat({
transport: new DefaultChatTransport({
prepareSendMessagesRequest: async ({ messages }) => {
const target = selectedChatTargetRef.current
const fallbackProvider =
const provider =
selectedLlmProviderRef.current ?? createDefaultBrowserOSProvider()
const activeTabsList = await chrome.tabs.query({
active: true,
@@ -392,46 +395,51 @@ export const useChatSession = (options?: ChatSessionOptions) => {
personalizationRef.current,
)
const commonRequest = {
conversationId: conversationIdRef.current,
mode: currentMode,
browserContext: requestBrowserContext,
userSystemPrompt,
userWorkingDir: workingDirRef.current,
previousConversation,
declinedApps,
aclRules: enabledAclRules,
toolApprovalConfig: approvalConfig,
}
const approvalResponses =
target?.kind === 'acp' ? null : extractApprovalResponses(messages)
const approvalResponses = extractApprovalResponses(messages)
if (approvalResponses) {
return buildSidepanelPreparedSendMessagesRequest({
agentServerUrl: agentUrlRef.current ?? undefined,
target,
fallbackProvider,
...commonRequest,
approvalResponses,
})
return {
api: `${agentUrlRef.current}/chat`,
body: buildChatRequestBody({
conversationId: conversationIdRef.current,
provider,
mode: currentMode,
browserContext: requestBrowserContext,
userSystemPrompt,
userWorkingDir: workingDirRef.current,
previousConversation,
declinedApps,
aclRules: enabledAclRules,
toolApprovalConfig: approvalConfig,
toolApprovalResponses: approvalResponses,
}),
}
}
const message = getLastMessageText(messages)
const result = buildSidepanelPreparedSendMessagesRequest({
agentServerUrl: agentUrlRef.current ?? undefined,
target,
fallbackProvider,
message,
...commonRequest,
selectedText: activeTabSelection?.text,
selectedTextSource: activeTabSelection
? {
url: activeTabSelection.url,
title: activeTabSelection.title,
}
: undefined,
})
const result = {
api: `${agentUrlRef.current}/chat`,
body: buildChatRequestBody({
message,
conversationId: conversationIdRef.current,
provider,
mode: currentMode,
browserContext: requestBrowserContext,
userSystemPrompt,
userWorkingDir: workingDirRef.current,
previousConversation,
declinedApps,
aclRules: enabledAclRules,
selectedText: activeTabSelection?.text,
selectedTextSource: activeTabSelection
? {
url: activeTabSelection.url,
title: activeTabSelection.title,
}
: undefined,
toolApprovalConfig: approvalConfig,
}),
}
// Track which tab's selection was sent so we can clear it on success
pendingSelectionTabKeyRef.current =
@@ -443,7 +451,7 @@ export const useChatSession = (options?: ChatSessionOptions) => {
sendAutomaticallyWhen: () => {
if (approvalJustRespondedRef.current) {
approvalJustRespondedRef.current = false
return selectedChatTargetRef.current?.kind !== 'acp'
return true
}
return false
},
@@ -678,15 +686,10 @@ export const useChatSession = (options?: ChatSessionOptions) => {
}, [dispatchMessage, isIntegrationsSynced])
const sendMessage = (params: { text: string; action?: ChatAction }) => {
const target = selectedChatTargetRef.current
const llmTargetProvider = toLlmProviderConfig(target)
track(MESSAGE_SENT_EVENT, {
mode,
provider_type: target?.kind === 'acp' ? 'acp' : llmTargetProvider?.type,
model:
target?.kind === 'acp'
? target.modelId
: llmTargetProvider?.modelId || selectedLlmProvider?.modelId,
provider_type: selectedLlmProvider?.type,
model: selectedLlmProvider?.modelId,
})
if (!isIntegrationsSyncedRef.current) {
@@ -738,52 +741,14 @@ export const useChatSession = (options?: ChatSessionOptions) => {
addToolApprovalResponse(params)
}
const resetConversationState = () => {
stop()
void finishExecutionTask({ isAbort: true })
setConversationId(crypto.randomUUID())
setMessages([])
setTextToAction(new Map())
setLiked({})
setDisliked({})
setRestoredConversationId(null)
resetRemoteConversation()
}
const handleSelectProvider = (provider: Provider) => {
const target = chatTargets.find(
(candidate) =>
candidate.id === provider.id && candidate.kind === provider.kind,
)
if (!target) return
const previousTarget = selectedChatTargetRef.current
const fullProvider = llmProviders.find((p) => p.id === provider.id)
track(PROVIDER_SELECTED_EVENT, {
provider_id: target.id,
provider_type: target.kind === 'acp' ? 'acp' : target.type,
model_id:
target.kind === 'acp' ? target.modelId : target.provider.modelId,
provider_id: provider.id,
provider_type: provider.type,
model_id: fullProvider?.modelId,
})
void selectChatTarget(target).catch((error) => {
sentry.captureException(error, {
extra: {
message: 'Failed to persist sidepanel chat target selection',
targetId: target.id,
targetKind: target.kind,
},
})
})
if (target.kind === 'llm') setDefaultProvider(target.provider.id)
if (
previousTarget &&
(previousTarget.kind !== target.kind ||
previousTarget.id !== target.id) &&
messagesRef.current.length > 0
) {
resetConversationState()
}
setDefaultProvider(provider.id)
}
const getActionForMessage = (message: UIMessage) => {
@@ -797,7 +762,15 @@ export const useChatSession = (options?: ChatSessionOptions) => {
const resetConversation = () => {
track(CONVERSATION_RESET_EVENT, { message_count: messages.length })
resetConversationState()
stop()
void finishExecutionTask({ isAbort: true })
setConversationId(crypto.randomUUID())
setMessages([])
setTextToAction(new Map())
setLiked({})
setDisliked({})
setRestoredConversationId(null)
resetRemoteConversation()
}
const isRestoringConversation =

View File

@@ -1,76 +0,0 @@
import type { Provider } from '../../../components/chat/chatComponentTypes'
import type { LlmProviderConfig } from '../../../lib/llm-providers/types'
import {
type ApprovalResponseData,
buildChatRequestBody,
} from '../../../lib/messaging/server/buildChatRequestBody'
import {
type SidepanelChatTarget,
toLlmProviderConfig,
} from './sidepanel-chat-targets'
type LlmChatRequestBodyInput = Parameters<typeof buildChatRequestBody>[0]
type CommonSidepanelRequestInput = Omit<
LlmChatRequestBodyInput,
'provider' | 'message' | 'toolApprovalResponses' | 'isScheduledTask'
>
interface BuildSidepanelPreparedSendMessagesRequestInput
extends CommonSidepanelRequestInput {
agentServerUrl: string | undefined
target: SidepanelChatTarget | undefined
fallbackProvider: LlmProviderConfig
message?: string
approvalResponses?: ApprovalResponseData[] | null
}
export function buildSidepanelPreparedSendMessagesRequest({
agentServerUrl,
target,
fallbackProvider,
message,
approvalResponses,
...common
}: BuildSidepanelPreparedSendMessagesRequestInput) {
if (target?.kind === 'acp') {
// ACP session history is owned by AcpxRuntime through sessionKey, so LLM-only
// resume and approval fields are intentionally not forwarded.
return {
api: `${agentServerUrl}/agents/sidepanel/chat`,
body: {
conversationId: common.conversationId,
adapter: target.adapter,
modelId: target.modelId,
reasoningEffort: target.reasoningEffort,
message: message ?? '',
browserContext: common.browserContext,
userSystemPrompt: common.userSystemPrompt,
userWorkingDir: common.userWorkingDir,
selectedText: common.selectedText,
selectedTextSource: common.selectedTextSource,
},
}
}
const provider = toLlmProviderConfig(target) ?? fallbackProvider
return {
api: `${agentServerUrl}/chat`,
body: buildChatRequestBody({
...common,
provider,
message,
toolApprovalResponses: approvalResponses ?? undefined,
}),
}
}
export function toProviderOption(target: SidepanelChatTarget): Provider {
return {
id: target.id,
name: target.name,
type: target.type,
kind: target.kind,
modelControl: target.kind === 'acp' ? target.modelControl : undefined,
}
}

View File

@@ -12,8 +12,6 @@ export interface AssistantThinkingPart {
export interface ToolEntry {
id: string
name: string
label: string
subject?: string
status: 'running' | 'completed' | 'error'
durationMs?: number
}
@@ -28,24 +26,9 @@ export type AssistantPart =
| AssistantThinkingPart
| AssistantToolBatchPart
/**
* Attachments rendered alongside the user's text on the optimistic turn
* — populated when the composer staged any images/files. The dataUrl is
* the same one the server received; we keep it in memory only for the
* lifetime of the live turn (history reload re-fetches via the JSONL).
*/
export interface UserAttachmentPreview {
id: string
kind: 'image' | 'file'
mediaType: string
name: string
dataUrl?: string
}
export interface AgentConversationTurn {
id: string
userText: string
userAttachments?: UserAttachmentPreview[]
parts: AssistantPart[]
done: boolean
timestamp: number
@@ -67,7 +50,4 @@ export interface AgentCardData {
status: 'idle' | 'working' | 'error'
lastMessage?: string
lastMessageTimestamp?: number
activitySummary?: string
currentTool?: string
costUsd?: number
}

View File

@@ -1,369 +0,0 @@
/**
* Composer attachment helpers — validation, image compression, and the
* client-side payload shape sent to /agents/:id/chat.
*
* Image attachments travel as `data:` URLs (base64) so the gateway, which
* runs on 127.0.0.1 over Lima virtiofs, can ingest them as standard
* OpenAI-style content blocks. Non-image text-shaped files are read into
* memory and travel as their extracted text body — the server inlines
* them as a fenced `<attachment>` block on the user message.
*/
export const MAX_ATTACHMENTS_PER_MESSAGE = 10
export const MAX_IMAGE_BYTES = 5 * 1024 * 1024 // 5 MB after compression
export const MAX_FILE_TEXT_BYTES = 1 * 1024 * 1024 // 1 MB extracted text
export const IMAGE_LONG_EDGE_CAP = 2048
export const ALLOWED_IMAGE_MEDIA_TYPES = [
'image/png',
'image/jpeg',
'image/jpg',
'image/webp',
'image/gif',
] as const
export const ALLOWED_FILE_MEDIA_TYPE_PREFIXES = [
'text/',
'application/json',
] as const
export type ServerImageAttachment = {
kind: 'image'
mediaType: string
dataUrl: string
name?: string
}
export type ServerFileAttachment = {
kind: 'file'
mediaType: string
name: string
text: string
}
export type ServerAttachmentPayload =
| ServerImageAttachment
| ServerFileAttachment
/** UI-side representation: what the composer needs to render a chip. */
export interface StagedAttachment {
id: string
kind: 'image' | 'file'
mediaType: string
name: string
// Set for images so the chip thumbnail can render directly. For files
// we don't need a preview yet, but the field exists for v2 PDF previews.
dataUrl?: string
// Pre-computed payload for the server. Built once at staging time so
// re-renders don't re-encode large blobs.
payload: ServerAttachmentPayload
}
export type AttachmentValidationError =
| { code: 'too_many'; message: string }
| { code: 'unsupported_type'; message: string; mediaType: string }
| { code: 'too_large'; message: string }
| { code: 'read_failed'; message: string }
export type StageAttachmentResult =
| { ok: true; attachment: StagedAttachment }
| { ok: false; error: AttachmentValidationError }
function isImageMediaType(mediaType: string): boolean {
return (ALLOWED_IMAGE_MEDIA_TYPES as readonly string[]).includes(mediaType)
}
function isAllowedFileMediaType(mediaType: string): boolean {
return ALLOWED_FILE_MEDIA_TYPE_PREFIXES.some((prefix) =>
mediaType.startsWith(prefix),
)
}
/** Build a unique id without depending on `crypto.randomUUID` outside DOM. */
function makeId(): string {
if (typeof crypto !== 'undefined' && crypto.randomUUID) {
return crypto.randomUUID()
}
return `att-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`
}
/**
* Read a `File` and produce the staged-attachment shape — validate type,
* compress if it's a large image, and pre-build the server payload.
*/
export async function stageAttachment(
file: File,
): Promise<StageAttachmentResult> {
const mediaType = file.type || 'application/octet-stream'
if (isImageMediaType(mediaType)) {
try {
const compressed = await compressImageIfNeeded(file)
const dataUrl = await readAsDataUrl(compressed)
// Rough byte ceiling — `data:image/png;base64,...` doubles size with
// base64. Reject early so we never POST something the route will 400.
if (dataUrl.length > MAX_IMAGE_BYTES * 2) {
return {
ok: false,
error: {
code: 'too_large',
message: `Image "${file.name}" is too large (max ${humanBytes(
MAX_IMAGE_BYTES,
)}).`,
},
}
}
return {
ok: true,
attachment: {
id: makeId(),
kind: 'image',
mediaType,
name: file.name || 'image',
dataUrl,
payload: {
kind: 'image',
mediaType,
dataUrl,
name: file.name || undefined,
},
},
}
} catch (err) {
return {
ok: false,
error: {
code: 'read_failed',
message:
err instanceof Error
? err.message
: `Failed to read image "${file.name}".`,
},
}
}
}
if (isAllowedFileMediaType(mediaType)) {
let text: string
try {
text = await file.text()
} catch (err) {
return {
ok: false,
error: {
code: 'read_failed',
message:
err instanceof Error
? err.message
: `Failed to read file "${file.name}".`,
},
}
}
if (text.length > MAX_FILE_TEXT_BYTES) {
return {
ok: false,
error: {
code: 'too_large',
message: `File "${file.name}" is too large (max ${humanBytes(
MAX_FILE_TEXT_BYTES,
)}).`,
},
}
}
return {
ok: true,
attachment: {
id: makeId(),
kind: 'file',
mediaType,
name: file.name || 'attachment',
payload: {
kind: 'file',
mediaType,
name: file.name || 'attachment',
text,
},
},
}
}
return {
ok: false,
error: {
code: 'unsupported_type',
message: `Unsupported attachment type: ${mediaType || 'unknown'}`,
mediaType,
},
}
}
/**
* Stage multiple files at once, enforcing the per-message cap. The result
* partitions successful stages and any errors so the caller can show
* granular toasts.
*/
export async function stageAttachments(
files: File[],
alreadyStaged: number,
): Promise<{
staged: StagedAttachment[]
errors: AttachmentValidationError[]
}> {
const remainingSlots = Math.max(
0,
MAX_ATTACHMENTS_PER_MESSAGE - alreadyStaged,
)
const staged: StagedAttachment[] = []
const errors: AttachmentValidationError[] = []
if (remainingSlots === 0 && files.length > 0) {
errors.push({
code: 'too_many',
message: `At most ${MAX_ATTACHMENTS_PER_MESSAGE} attachments per message.`,
})
return { staged, errors }
}
const overflow = files.length - remainingSlots
if (overflow > 0) {
errors.push({
code: 'too_many',
message: `Only the first ${remainingSlots} of ${files.length} files were attached (max ${MAX_ATTACHMENTS_PER_MESSAGE}).`,
})
}
for (const file of files.slice(0, remainingSlots)) {
const result = await stageAttachment(file)
if (result.ok) {
staged.push(result.attachment)
} else {
errors.push(result.error)
}
}
return { staged, errors }
}
/**
* Resize images that are oversized to a sane long-edge cap. JPEG/WebP
* source files are re-encoded to JPEG; PNGs/GIFs that are already small
* are passed through untouched.
*/
export async function compressImageIfNeeded(file: File): Promise<Blob> {
// Cheap path: small files don't need any transform.
if (file.size <= 1.5 * 1024 * 1024) return file
const bitmap = await blobToImageBitmap(file)
const { width, height } = bitmap
const longEdge = Math.max(width, height)
if (longEdge <= IMAGE_LONG_EDGE_CAP && file.size <= MAX_IMAGE_BYTES) {
bitmap.close?.()
return file
}
const scale = Math.min(1, IMAGE_LONG_EDGE_CAP / longEdge)
const targetWidth = Math.max(1, Math.round(width * scale))
const targetHeight = Math.max(1, Math.round(height * scale))
const canvas =
typeof OffscreenCanvas !== 'undefined'
? new OffscreenCanvas(targetWidth, targetHeight)
: Object.assign(document.createElement('canvas'), {
width: targetWidth,
height: targetHeight,
})
const ctx = canvas.getContext('2d') as
| CanvasRenderingContext2D
| OffscreenCanvasRenderingContext2D
| null
if (!ctx) {
bitmap.close?.()
return file
}
ctx.drawImage(bitmap, 0, 0, targetWidth, targetHeight)
bitmap.close?.()
const outputType = 'image/jpeg'
if (canvas instanceof HTMLCanvasElement) {
return await new Promise<Blob>((resolve, reject) => {
canvas.toBlob(
(blob) => {
if (blob) resolve(blob)
else reject(new Error('Image compression failed.'))
},
outputType,
0.85,
)
})
}
return await (canvas as OffscreenCanvas).convertToBlob({
type: outputType,
quality: 0.85,
})
}
async function blobToImageBitmap(blob: Blob): Promise<ImageBitmap> {
if (typeof createImageBitmap === 'function') {
return createImageBitmap(blob)
}
// Fallback: load via an Image element and use the canvas decode path.
const url = URL.createObjectURL(blob)
try {
const img = await new Promise<HTMLImageElement>((resolve, reject) => {
const el = new Image()
el.onload = () => resolve(el)
el.onerror = () =>
reject(new Error('Failed to decode image for compression.'))
el.src = url
})
const canvas = document.createElement('canvas')
canvas.width = img.naturalWidth
canvas.height = img.naturalHeight
const ctx = canvas.getContext('2d')
if (!ctx) throw new Error('Canvas 2D context unavailable.')
ctx.drawImage(img, 0, 0)
const blobOut = await new Promise<Blob | null>((resolve) =>
canvas.toBlob(resolve, 'image/png'),
)
if (!blobOut) throw new Error('Canvas toBlob returned null.')
return await createImageBitmap(blobOut)
} finally {
URL.revokeObjectURL(url)
}
}
async function readAsDataUrl(blob: Blob): Promise<string> {
if ('arrayBuffer' in blob && typeof FileReader === 'undefined') {
const buffer = await blob.arrayBuffer()
const base64 = arrayBufferToBase64(buffer)
const type = blob.type || 'application/octet-stream'
return `data:${type};base64,${base64}`
}
return await new Promise<string>((resolve, reject) => {
const reader = new FileReader()
reader.onload = () => resolve(reader.result as string)
reader.onerror = () =>
reject(reader.error ?? new Error('FileReader failed to read blob.'))
reader.readAsDataURL(blob)
})
}
function arrayBufferToBase64(buffer: ArrayBuffer): string {
const bytes = new Uint8Array(buffer)
let binary = ''
const chunkSize = 0x8000
for (let i = 0; i < bytes.byteLength; i += chunkSize) {
binary += String.fromCharCode.apply(
null,
Array.from(bytes.subarray(i, Math.min(i + chunkSize, bytes.byteLength))),
)
}
return btoa(binary)
}
function humanBytes(bytes: number): string {
if (bytes >= 1024 * 1024) return `${(bytes / 1024 / 1024).toFixed(0)} MB`
if (bytes >= 1024) return `${(bytes / 1024).toFixed(0)} KB`
return `${bytes} B`
}

View File

@@ -75,12 +75,6 @@ export const MCP_EXTERNAL_ACCESS_DISABLED_EVENT =
/** @public */
export const MCP_SERVER_RESTARTED_EVENT = 'settings.mcp_server.restarted'
/** @public */
export const AGENT_CREATED_EVENT = 'agents.agent.created'
/** @public */
export const AGENT_DELETED_EVENT = 'agents.agent.deleted'
/** @public */
export const NEW_SCHEDULED_TASK_CREATED_EVENT =
'settings.scheduled_task.created'

View File

@@ -2,75 +2,29 @@ function isAbortError(error: unknown): boolean {
return error instanceof DOMException && error.name === 'AbortError'
}
export interface ParsedSSEEvent<T> {
data: T
/** Numeric `id:` line on the same SSE event, if any. */
seq?: number
}
export function parseSSELines<T>(buffer: string): {
events: ParsedSSEEvent<T>[]
events: T[]
remainder: string
} {
// SSE events are separated by blank lines. Buffer lines until we hit
// a blank, then assemble each event. Lines we recognise: `id: <n>`
// and `data: <payload>`. Everything else is ignored.
const events: ParsedSSEEvent<T>[] = []
const lines = buffer.split('\n')
// Find the last blank-line boundary; everything after it is the
// remainder (next event partially received).
let lastBoundary = -1
for (let i = lines.length - 1; i >= 0; i--) {
if (lines[i] === '') {
lastBoundary = i
break
}
}
const completeLines = lastBoundary >= 0 ? lines.slice(0, lastBoundary) : []
const remainder =
lastBoundary >= 0 ? lines.slice(lastBoundary + 1).join('\n') : buffer
const remainder = lines.pop() ?? ''
const events: T[] = []
let currentSeq: number | undefined
let currentData: string | null = null
const flush = () => {
if (currentData != null && currentData !== '[DONE]') {
try {
events.push({
data: JSON.parse(currentData) as T,
seq: currentSeq,
})
} catch {
// ignore
}
}
currentSeq = undefined
currentData = null
for (const line of lines) {
if (!line.startsWith('data: ')) continue
const payload = line.slice(6)
if (payload === '[DONE]') continue
try {
events.push(JSON.parse(payload) as T)
} catch {}
}
for (const line of completeLines) {
if (line === '') {
flush()
continue
}
if (line.startsWith('id: ')) {
const n = Number.parseInt(line.slice(4).trim(), 10)
if (Number.isFinite(n)) currentSeq = n
continue
}
if (line.startsWith('data: ')) {
currentData = line.slice(6)
}
}
// Catch a complete trailing event with no terminating blank line —
// shouldn't happen in well-formed SSE, but be tolerant.
flush()
return { events, remainder }
}
export async function consumeSSEStream<T>(
response: Response,
onEvent: (event: T, meta: { seq?: number }) => void,
onEvent: (event: T) => void,
signal?: AbortSignal,
): Promise<void> {
const reader = response.body?.getReader()
@@ -95,7 +49,7 @@ export async function consumeSSEStream<T>(
buffer = remainder
for (const event of events) {
onEvent(event.data, { seq: event.seq })
onEvent(event)
}
}
} catch (error) {
@@ -110,7 +64,7 @@ export async function consumeSSEStream<T>(
if (buffer) {
const { events } = parseSSELines<T>(buffer)
for (const event of events) {
onEvent(event.data, { seq: event.seq })
onEvent(event)
}
}
}

View File

@@ -1,325 +0,0 @@
/**
* @license
* Copyright 2025 BrowserOS
* SPDX-License-Identifier: AGPL-3.0-or-later
*
* Maps raw tool names + arguments to human-readable activity labels for
* the chat UI activity view. The MCP ToolRegistry is the source of truth
* for tool *existence*; this file is the editorial layer that turns
* snake_case identifiers into agent-speak verbs.
*/
const VERB_OVERRIDES: Record<string, string> = {
// Navigation
navigate_page: 'Navigated to',
new_page: 'Opened tab',
new_hidden_page: 'Opened tab',
show_page: 'Showed tab',
close_page: 'Closed tab',
list_pages: 'Listed open tabs',
get_active_page: 'Got active tab',
move_page: 'Moved tab',
group_tabs: 'Grouped tabs',
// Page reading
take_snapshot: 'Captured page snapshot',
take_enhanced_snapshot: 'Captured detailed snapshot',
get_page_content: 'Read page content',
get_page_links: 'Extracted page links',
get_dom: 'Read page DOM',
search_dom: 'Searched page DOM',
take_screenshot: 'Took screenshot',
// Input
click: 'Clicked',
click_at: 'Clicked at coordinates',
hover: 'Hovered',
hover_at: 'Hovered at coordinates',
type_at: 'Typed at coordinates',
drag_at: 'Dragged',
focus: 'Focused element',
fill: 'Filled field',
clear: 'Cleared field',
check: 'Checked box',
uncheck: 'Unchecked box',
press_key: 'Pressed key',
upload_file: 'Uploaded file',
// Console / scripts
evaluate_script: 'Ran script',
get_console_logs: 'Read console logs',
// History / bookmarks
search_history: 'Searched history',
get_recent_history: 'Read recent history',
delete_history_url: 'Deleted history entry',
delete_history_range: 'Deleted history range',
get_bookmarks: 'Listed bookmarks',
create_bookmark: 'Created bookmark',
remove_bookmark: 'Removed bookmark',
update_bookmark: 'Updated bookmark',
move_bookmark: 'Moved bookmark',
search_bookmarks: 'Searched bookmarks',
// Filesystem (sandboxed)
read_file: 'Read file',
write_file: 'Wrote file',
find_files: 'Searched files',
// Memory
read_soul: 'Read soul memory',
read_core: 'Read core memory',
write_memory: 'Wrote memory',
search_memory: 'Searched memory',
update_soul: 'Updated soul memory',
update_core: 'Updated core memory',
// Web
web_search: 'Searched the web',
web_fetch: 'Fetched URL',
// Klavis / external apps (Strata)
connector_mcp_servers: 'Listed connected apps',
discover_server_categories_or_actions: 'Browsed available actions',
get_category_actions: 'Listed actions',
get_action_details: 'Looked up action',
execute_action: 'Ran external action',
search_documentation: 'Searched docs',
handle_auth_failure: 'Handled auth issue',
// Suggestions
suggest_schedule: 'Suggested schedule',
suggest_app_connection: 'Suggested app connect',
// BrowserOS info
browseros_info: 'Read BrowserOS info',
// Windows
list_windows: 'Listed windows',
focus_window: 'Focused window',
close_window: 'Closed window',
create_window: 'Created window',
}
// ──────────────────────────────────────────────────────────────────────
// Helpers
// ──────────────────────────────────────────────────────────────────────
function asString(value: unknown): string | undefined {
return typeof value === 'string' && value.length > 0 ? value : undefined
}
function stringField(
input: Record<string, unknown>,
...keys: string[]
): string | undefined {
for (const k of keys) {
const v = asString(input[k])
if (v) return v
}
return undefined
}
function truncate(text: string | undefined, max: number): string | undefined {
if (!text) return undefined
return text.length > max ? `${text.slice(0, max - 1)}` : text
}
function quote(value: string | undefined): string | undefined {
if (!value) return undefined
return `"${truncate(value, 60)}"`
}
function basename(path: string | undefined): string | undefined {
if (!path) return undefined
const parts = path.split(/[/\\]/).filter(Boolean)
return parts[parts.length - 1] ?? path
}
function formatUrl(value: unknown): string | undefined {
const url = asString(value)
if (!url) return undefined
try {
const parsed = new URL(url)
const host = parsed.host
const path = parsed.pathname === '/' ? '' : parsed.pathname
const display = path && path.length > 0 ? `${host}${path}` : host
return truncate(display, 60)
} catch {
return truncate(url, 60)
}
}
function coords(x: unknown, y: unknown): string | undefined {
if (typeof x === 'number' && typeof y === 'number') {
return `${Math.round(x)}, ${Math.round(y)}`
}
return undefined
}
// ──────────────────────────────────────────────────────────────────────
// Subject extractors
// ──────────────────────────────────────────────────────────────────────
type SubjectExtractor = (input: Record<string, unknown>) => string | undefined
const SUBJECT_EXTRACTORS: Record<string, SubjectExtractor> = {
// URL-bearing tools
new_page: (i) => formatUrl(i.url),
new_hidden_page: (i) => formatUrl(i.url),
navigate_page: (i) => {
const action = asString(i.action)
if (action === 'back') return 'back'
if (action === 'forward') return 'forward'
if (action === 'reload') return 'reload'
return formatUrl(i.url)
},
web_fetch: (i) => formatUrl(i.url),
// Search queries
web_search: (i) => quote(stringField(i, 'query', 'q')),
search_history: (i) => quote(stringField(i, 'query', 'text')),
search_bookmarks: (i) => quote(stringField(i, 'query', 'text')),
search_memory: (i) => quote(stringField(i, 'query', 'q')),
search_dom: (i) => quote(stringField(i, 'query', 'selector')),
search_documentation: (i) => quote(stringField(i, 'query', 'q')),
find_files: (i) => quote(stringField(i, 'pattern', 'query')),
// Element interactions
click: (i) => stringField(i, 'element'),
hover: (i) => stringField(i, 'element'),
focus: (i) => stringField(i, 'element'),
clear: (i) => stringField(i, 'element'),
check: (i) => stringField(i, 'element'),
uncheck: (i) => stringField(i, 'element'),
fill: (i) => {
const target = stringField(i, 'element')
const text = stringField(i, 'text')
if (target && text) return `${target}: ${truncate(text, 40)}`
return target ?? truncate(text, 40)
},
press_key: (i) => stringField(i, 'key'),
// Coordinate-based input
click_at: (i) => coords(i.x, i.y),
hover_at: (i) => coords(i.x, i.y),
type_at: (i) => {
const at = coords(i.x, i.y)
const text = stringField(i, 'text')
if (at && text) return `${at}: ${truncate(text, 40)}`
return at ?? truncate(text, 40)
},
drag_at: (i) => {
const from = coords(i.fromX, i.fromY)
const to = coords(i.toX, i.toY)
if (from && to) return `${from}${to}`
return from ?? to
},
// Tab management
show_page: (i) => {
const page = i.page
return typeof page === 'number' ? `tab ${page}` : asString(page)
},
close_page: (i) => {
const page = i.page
return typeof page === 'number' ? `tab ${page}` : asString(page)
},
move_page: (i) => {
const page = i.page
return typeof page === 'number' ? `tab ${page}` : asString(page)
},
// Page reads (take_snapshot, take_enhanced_snapshot, get_page_content,
// get_page_links, get_dom, take_screenshot) intentionally omit a
// subject — the only argument is a numeric page ID that's internal
// to the agent and meaningless to the user ("tab 4" tells them nothing).
// The verb alone communicates what happened.
// External actions via Strata
execute_action: (i) => {
const server = stringField(i, 'server_name')
const action = stringField(i, 'action_name')
if (server && action) return `${server} · ${action}`
return action ?? server
},
get_category_actions: (i) => stringField(i, 'category_name', 'server_name'),
get_action_details: (i) => stringField(i, 'action_name'),
discover_server_categories_or_actions: (i) =>
stringField(i, 'server_name', 'category_name'),
connector_mcp_servers: (i) => stringField(i, 'server_name'),
// Filesystem
read_file: (i) => basename(stringField(i, 'path')),
write_file: (i) => basename(stringField(i, 'path')),
// Memory writes — show first chars of content
write_memory: (i) => truncate(stringField(i, 'content', 'text'), 40),
update_soul: (i) => truncate(stringField(i, 'content'), 40),
update_core: (i) => truncate(stringField(i, 'content'), 40),
// Bookmarks
create_bookmark: (i) => stringField(i, 'title') ?? formatUrl(i.url),
remove_bookmark: (i) => stringField(i, 'id', 'title'),
update_bookmark: (i) => stringField(i, 'id', 'title'),
move_bookmark: (i) => stringField(i, 'id', 'title'),
// History
delete_history_url: (i) => formatUrl(i.url),
}
// ──────────────────────────────────────────────────────────────────────
// Public API
// ──────────────────────────────────────────────────────────────────────
export interface ToolLabelResult {
label: string
subject?: string
}
/**
* Strip MCP namespace prefixes (e.g. "browseros__", "mcp_") to find the
* canonical tool name used in the override maps.
*/
function canonicalName(rawName: string): string {
return rawName.replace(/^browseros__/, '').replace(/^mcp_/, '')
}
/**
* Convert a snake_case tool name into Sentence-case English as a fallback
* when no curated override exists.
*/
function humanizeToolName(rawName: string): string {
const stripped = canonicalName(rawName)
const words = stripped.split(/[_-]/).filter((w) => w.length > 0)
if (words.length === 0) return rawName
const first = words[0]!
return [
first.charAt(0).toUpperCase() + first.slice(1),
...words.slice(1),
].join(' ')
}
/**
* Build a human-readable label and subject string for a tool call,
* suitable for rendering in the chat activity view.
*/
export function buildToolLabel(
rawName: string,
input?: Record<string, unknown>,
): ToolLabelResult {
const canonical = canonicalName(rawName)
const label =
VERB_OVERRIDES[canonical] ??
VERB_OVERRIDES[rawName] ??
humanizeToolName(rawName)
const extractor = Object.hasOwn(SUBJECT_EXTRACTORS, canonical)
? SUBJECT_EXTRACTORS[canonical]
: Object.hasOwn(SUBJECT_EXTRACTORS, rawName)
? SUBJECT_EXTRACTORS[rawName]
: undefined
const subject = extractor && input ? extractor(input) : undefined
return { label, subject }
}

View File

@@ -8,7 +8,6 @@ const chromiumArgs = [
'--show-component-extension-options',
'--disable-browseros-server',
'--disable-browseros-extensions',
'--browseros-dock-icon=dev',
]
if (env.BROWSEROS_CDP_PORT) {

View File

@@ -0,0 +1,875 @@
# Eval System - Production Grade Design Doc
## Current State Analysis
### What's Working Well
1. **Zod validation** - Already exists in `config-validator.ts`, reuses `LLMConfigSchema` from `@browseros/shared`
2. **Grader registry pattern** - `createGrader()` factory works well, easy to add new graders
3. **AgentEvaluator interface** - Clean interface: `execute() → AgentResult`
4. **Discriminated unions** - Messages, agent types use proper TypeScript patterns
5. **Capture utilities** - `ScreenshotCapture`, `MessageLogger`, `TrajectorySaver` are modular
### Key Problems
**1. No Agent Registry/Factory**
Agent creation is hardcoded if-else in `task-executor.ts`:
```typescript
// Current approach - not scalable
if (this.config.agent.type === 'single') {
const evaluator = new SingleAgentEvaluator(...)
} else if (this.config.agent.type === 'orchestrator-executor') {
const evaluator = new OrchestratorExecutorEvaluator(...)
}
// Adding new agent = modify this file
```
**2. Heavy Server Dependency**
Imports from `@browseros/server`:
- `GeminiAgent` - Core agent (necessary)
- `ToolExecutionHooks` - Hook interface
- `ResolvedAgentConfig` - Agent config type
- `AgentExecutionError` - Error type
- `VercelAIContentGenerator` - Provider adapter
- Gateway client functions
**3. Scattered Types**
- `src/types.ts` - Main types
- `agents/types.ts` - Agent interface
- `agents/orchestrator-executor/types.ts` - Orchestrator types
- `runner/types.ts` - Runner types
- `graders/types.ts` - Grader types
**4. Duplicated Capture Logic**
Both agent evaluators duplicate:
- Initialize ScreenshotCapture
- Initialize MessageLogger
- Set up tool hooks
- Handle timeouts
- Collect errors/warnings
**5. No Unified Utils**
Hooks, screenshot capture, message logging code is copy-pasted per agent type.
---
## Design Goals
1. **Easy to add new agents** - Register new agent type, implement interface, done
2. **Shared capture infrastructure** - All agents use same screenshot/logging utils
3. **Type-safe with Zod** - Config validation at entry point
4. **Minimal server coupling** - Only import what's necessary
5. **Clear folder structure** - Types where they belong
6. **Production patterns** - Factory, registry, composition
---
## Proposed Architecture
### Folder Structure
```
eval/src/
├── index.ts # Entry point, CLI
├── types/
│ ├── index.ts # Re-exports all types
│ ├── config.ts # EvalConfig, AgentConfig (Zod schemas + types)
│ ├── task.ts # Task, TaskMetadata
│ ├── message.ts # Message discriminated union
│ ├── result.ts # AgentResult, GraderResult
│ └── errors.ts # ErrorSource, TaskError, EvalWarning
├── agents/
│ ├── index.ts # Re-exports + auto-registration
│ ├── registry.ts # Agent registry + factory
│ ├── types.ts # AgentEvaluator interface, AgentContext
│ ├── single/
│ │ └── index.ts # SingleAgentEvaluator
│ └── orchestrator-executor/
│ ├── index.ts # OrchestratorExecutorEvaluator
│ ├── types.ts # Orchestrator-specific types only
│ ├── orchestrator.ts
│ ├── orchestrator-agent.ts
│ ├── orchestrator-tools.ts
│ ├── executor.ts
│ └── executor-store.ts
├── capture/
│ ├── index.ts # Re-exports
│ ├── types.ts # CaptureContext interface
│ ├── context.ts # CaptureContext class (bundles all capture)
│ ├── hooks.ts # createCaptureHooks() utility
│ ├── screenshot.ts # ScreenshotCapture
│ ├── message-logger.ts # MessageLogger
│ ├── trajectory-saver.ts # TrajectorySaver
│ └── window-manager.ts # WindowManager
├── graders/
│ ├── index.ts # Re-exports
│ ├── registry.ts # Grader registry (existing pattern)
│ ├── types.ts # Grader interface
│ ├── benchmark/
│ │ ├── webvoyager.ts
│ │ └── mind2web.ts
│ └── fara/
│ ├── alignment.ts
│ ├── rubric.ts
│ ├── multimodal.ts
│ └── combined.ts
├── runner/
│ ├── index.ts # runEval() main entry
│ ├── types.ts # RunEvalOptions, TaskResult, BatchSummary
│ ├── task-loader.ts
│ ├── task-executor.ts
│ └── parallel-executor.ts
└── utils/
├── env.ts # resolveEnvValue() helper
└── validation.ts # Config validation logic
```
---
## Key Components
### 1. Type System (`types/`)
**`types/config.ts`** - Zod schemas + inferred types:
```typescript
import { LLMConfigSchema, LLMProviderSchema } from '@browseros/shared/schemas/llm'
import { z } from 'zod'
// Single agent config
export const SingleAgentConfigSchema = LLMConfigSchema.extend({
type: z.literal('single'),
})
export type SingleAgentConfig = z.infer<typeof SingleAgentConfigSchema>
// Orchestrator-executor config
export const OrchestratorExecutorConfigSchema = z.object({
type: z.literal('orchestrator-executor'),
orchestrator: LLMConfigSchema.extend({
maxTurns: z.number().int().min(1).optional(),
}),
executor: LLMConfigSchema.extend({
maxStepsPerDelegation: z.number().int().min(1).optional(),
}),
})
export type OrchestratorExecutorConfig = z.infer<typeof OrchestratorExecutorConfigSchema>
// Discriminated union
export const AgentConfigSchema = z.discriminatedUnion('type', [
SingleAgentConfigSchema,
OrchestratorExecutorConfigSchema,
])
export type AgentConfig = z.infer<typeof AgentConfigSchema>
// Full eval config
export const EvalConfigSchema = z.object({
agent: AgentConfigSchema,
dataset: z.string().min(1),
output_dir: z.string().optional(),
num_workers: z.number().int().min(1).max(20).default(1),
browseros: z.object({
server_url: z.string().url(),
}),
grader_model: z.string().optional(),
grader_api_key_env: z.string().optional(),
grader_base_url: z.string().url().optional(),
timeout_ms: z.number().int().min(30000).max(3600000).optional(),
})
export type EvalConfig = z.infer<typeof EvalConfigSchema>
```
**`types/message.ts`** - Message types:
```typescript
import { z } from 'zod'
const BaseMessageSchema = z.object({
timestamp: z.string().datetime(),
})
export const UserMessageSchema = BaseMessageSchema.extend({
type: z.literal('user'),
content: z.string(),
})
export const AssistantMessageSchema = BaseMessageSchema.extend({
type: z.literal('assistant'),
content: z.string(),
})
export const ToolCallMessageSchema = BaseMessageSchema.extend({
type: z.literal('tool_call'),
tool: z.string(),
toolCallId: z.string(),
params: z.record(z.unknown()),
})
export const ToolResultMessageSchema = BaseMessageSchema.extend({
type: z.literal('tool_result'),
toolCallId: z.string(),
result: z.unknown(),
isError: z.boolean(),
screenshot: z.number().optional(),
})
export const ErrorMessageSchema = BaseMessageSchema.extend({
type: z.literal('error'),
content: z.string(),
errorCode: z.string().optional(),
})
// Orchestrator-specific messages
export const DelegationMessageSchema = BaseMessageSchema.extend({
type: z.literal('delegation'),
instruction: z.string(),
executorId: z.string(),
maxSteps: z.number().optional(),
})
export const DelegationResultMessageSchema = BaseMessageSchema.extend({
type: z.literal('delegation_result'),
executorId: z.string(),
summary: z.string(),
status: z.enum(['done', 'blocked', 'max_steps']),
stepsUsed: z.number(),
currentUrl: z.string().optional(),
})
export const MessageSchema = z.discriminatedUnion('type', [
UserMessageSchema,
AssistantMessageSchema,
ToolCallMessageSchema,
ToolResultMessageSchema,
ErrorMessageSchema,
DelegationMessageSchema,
DelegationResultMessageSchema,
])
export type Message = z.infer<typeof MessageSchema>
export type UserMessage = z.infer<typeof UserMessageSchema>
export type AssistantMessage = z.infer<typeof AssistantMessageSchema>
export type ToolCallMessage = z.infer<typeof ToolCallMessageSchema>
export type ToolResultMessage = z.infer<typeof ToolResultMessageSchema>
export type ErrorMessage = z.infer<typeof ErrorMessageSchema>
export type DelegationMessage = z.infer<typeof DelegationMessageSchema>
export type DelegationResultMessage = z.infer<typeof DelegationResultMessageSchema>
// Type guards
export const isToolCallMessage = (m: Message): m is ToolCallMessage => m.type === 'tool_call'
export const isDelegationMessage = (m: Message): m is DelegationMessage => m.type === 'delegation'
// ... etc
```
---
### 2. Agent Registry (`agents/registry.ts`)
```typescript
import type { AgentContext, AgentEvaluator } from './types'
type AgentFactory = (context: AgentContext) => AgentEvaluator
const registry = new Map<string, AgentFactory>()
/**
* Register an agent type
*/
export function registerAgent(type: string, factory: AgentFactory): void {
if (registry.has(type)) {
throw new Error(`Agent type "${type}" already registered`)
}
registry.set(type, factory)
}
/**
* Create agent evaluator from context
*/
export function createAgent(context: AgentContext): AgentEvaluator {
const factory = registry.get(context.config.agent.type)
if (!factory) {
const available = Array.from(registry.keys()).join(', ')
throw new Error(
`Unknown agent type: "${context.config.agent.type}". Available: ${available}`
)
}
return factory(context)
}
/**
* Get all registered agent types
*/
export function getRegisteredAgentTypes(): string[] {
return Array.from(registry.keys())
}
```
**`agents/index.ts`** - Auto-registration:
```typescript
import { registerAgent } from './registry'
import { SingleAgentEvaluator } from './single'
import { OrchestratorExecutorEvaluator } from './orchestrator-executor'
// Auto-register built-in agents
registerAgent('single', (ctx) => new SingleAgentEvaluator(ctx))
registerAgent('orchestrator-executor', (ctx) => new OrchestratorExecutorEvaluator(ctx))
// Re-exports
export { createAgent, registerAgent, getRegisteredAgentTypes } from './registry'
export type { AgentContext, AgentEvaluator, AgentResult } from './types'
```
---
### 3. Agent Context (`agents/types.ts`)
```typescript
import type { CaptureContext } from '../capture/types'
import type { EvalConfig, Task, TaskMetadata, Message } from '../types'
/**
* All dependencies an agent needs - passed to factory
*/
export interface AgentContext {
// Config
config: EvalConfig
task: Task
// Browser window
windowId: number
tabId: number
// Output
outputDir: string // Root output dir
taskOutputDir: string // Task-specific: outputDir/query_id/
// Capture infrastructure (pre-initialized)
capture: CaptureContext
}
/**
* Result returned by agent execution
*/
export interface AgentResult {
metadata: TaskMetadata
messages: Message[]
finalAnswer: string | null
}
/**
* Interface all agent evaluators must implement
*/
export interface AgentEvaluator {
/**
* Execute the agent on the task
*/
execute(): Promise<AgentResult>
}
```
---
### 4. Capture Context (`capture/context.ts`)
Bundle all capture utilities:
```typescript
import { randomUUID } from 'node:crypto'
import type { ToolExecutionHooks, ToolExecutionResult } from '@browseros/server/agent'
import type { Message, TaskError, EvalWarning, ErrorSource } from '../types'
import { MessageLogger } from './message-logger'
import { ScreenshotCapture } from './screenshot'
import { TrajectorySaver } from './trajectory-saver'
export interface CaptureContextConfig {
serverUrl: string
outputDir: string
taskId: string
tabId: number
windowId: number
}
/**
* Unified capture context - bundles screenshot, message logging, errors/warnings
*/
export class CaptureContext {
readonly screenshot: ScreenshotCapture
readonly messageLogger: MessageLogger
readonly trajectorySaver: TrajectorySaver
private errors: TaskError[] = []
private warnings: EvalWarning[] = []
private currentToolCallId: string | null = null
private readonly tabId: number
private readonly windowId: number
constructor(private config: CaptureContextConfig) {
this.tabId = config.tabId
this.windowId = config.windowId
this.trajectorySaver = new TrajectorySaver(config.outputDir, config.taskId)
}
/**
* Initialize - must be called before use
*/
async init(): Promise<string> {
const taskOutputDir = await this.trajectorySaver.init()
this.screenshot = new ScreenshotCapture(this.config.serverUrl, taskOutputDir)
await this.screenshot.init()
this.messageLogger = new MessageLogger(taskOutputDir)
return taskOutputDir
}
/**
* Create tool execution hooks for GeminiAgent
*/
createToolHooks(): ToolExecutionHooks {
return {
onBeforeToolCall: async (toolName: string, args: unknown) => {
try {
this.currentToolCallId = randomUUID()
await this.messageLogger.logToolCall(
toolName,
this.currentToolCallId,
args as Record<string, unknown>
)
} catch (err) {
this.addWarning('message_logging', `Failed to log tool call ${toolName}: ${err}`)
}
},
onAfterToolCall: async (toolName: string, result: ToolExecutionResult) => {
let screenshotNum = 0
// Capture screenshot
try {
screenshotNum = await this.screenshot.capture(this.tabId, this.windowId)
} catch (err) {
this.addWarning('screenshot', `Screenshot after ${toolName} failed: ${err}`)
screenshotNum = this.screenshot.getCount()
}
// Log tool errors
if (result.isError) {
this.addWarning('mcp_tool', `Tool ${toolName} error: ${result.errorMessage}`)
}
// Log result
if (this.currentToolCallId) {
try {
await this.messageLogger.logToolResult(
this.currentToolCallId,
result.isError ? { error: result.errorMessage } : result.parts,
result.isError,
screenshotNum
)
} catch (err) {
this.addWarning('message_logging', `Failed to log tool result: ${err}`)
}
}
this.currentToolCallId = null
},
}
}
// Error/warning collection
addError(source: ErrorSource, message: string, details?: Record<string, unknown>): void {
this.errors.push({ source, message, timestamp: new Date().toISOString(), details })
}
addWarning(source: ErrorSource, message: string): void {
this.warnings.push({ source, message, timestamp: new Date().toISOString() })
console.warn(`[${source}] ${message}`)
}
getErrors(): TaskError[] { return [...this.errors] }
getWarnings(): EvalWarning[] { return [...this.warnings] }
getMessages(): Message[] { return this.messageLogger.getMessages() }
getScreenshotCount(): number { return this.screenshot.getCount() }
getLastAssistantMessage(): string | null { return this.messageLogger.getLastAssistantMessage() }
// Delegation logging (for orchestrator-executor)
async logDelegation(instruction: string, executorId: string, maxSteps?: number): Promise<void> {
await this.messageLogger.logDelegation(instruction, executorId, maxSteps)
}
async logDelegationResult(
executorId: string,
summary: string,
status: 'done' | 'blocked' | 'max_steps',
stepsUsed: number,
currentUrl?: string
): Promise<void> {
await this.messageLogger.logDelegationResult(executorId, summary, status, stepsUsed, currentUrl)
}
}
```
---
### 5. Single Agent Evaluator (`agents/single/index.ts`)
Clean implementation using context:
```typescript
import { randomUUID } from 'node:crypto'
import { GeminiAgent } from '@browseros/server/agent'
import { AgentExecutionError } from '@browseros/server/agent/errors'
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import { MCPServerConfig } from '@google/gemini-cli-core'
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
import type { SingleAgentConfig, TaskMetadata } from '../../types'
import { resolveEnvValue } from '../../utils/env'
const DEFAULT_TIMEOUT_MS = 15 * 60 * 1000
export class SingleAgentEvaluator implements AgentEvaluator {
constructor(private ctx: AgentContext) {}
async execute(): Promise<AgentResult> {
const startTime = Date.now()
const { config, task, capture } = this.ctx
const agentConfig = config.agent as SingleAgentConfig
const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS
// Log initial user message
await capture.messageLogger.logUser(task.query)
// Set up timeout
const abortController = new AbortController()
const timeoutHandle = setTimeout(() => abortController.abort(), timeoutMs)
// Create agent
const resolvedConfig: ResolvedAgentConfig = {
conversationId: randomUUID(),
provider: agentConfig.provider,
model: agentConfig.model ?? 'gemini-2.0-flash',
apiKey: resolveEnvValue(agentConfig.apiKey),
baseUrl: agentConfig.baseUrl,
sessionExecutionDir: '/tmp/browseros-eval',
evalMode: true,
}
const mcpServers = {
'browseros-mcp': new MCPServerConfig(
undefined, undefined, undefined, undefined, undefined,
`${config.browseros.server_url}/mcp`,
{ Accept: 'application/json, text/event-stream', 'X-BrowserOS-Source': 'eval' },
undefined, undefined, true
),
}
const agent = await GeminiAgent.create(resolvedConfig, mcpServers)
// Set capture hooks
agent.setToolHooks(capture.createToolHooks())
// Create mock stream to capture assistant messages
let lastAssistantMessage = ''
const mockStream = {
write: async (data: string) => {
if (data.includes('"type":"text-delta"')) {
const match = data.match(/"delta":"((?:[^"\\]|\\.)*)"/)
if (match) lastAssistantMessage += JSON.parse(`"${match[1]}"`)
} else if (data.includes('"type":"finish"')) {
if (lastAssistantMessage) {
await capture.messageLogger.logAssistant(lastAssistantMessage)
lastAssistantMessage = ''
}
}
},
}
// Execute
let terminationReason: TaskMetadata['termination_reason'] = 'completed'
try {
await agent.execute(
task.query,
mockStream as Parameters<typeof agent.execute>[1],
abortController.signal,
{ windowId: this.ctx.windowId, activeTab: { id: this.ctx.tabId, url: task.start_url } }
)
} catch (err) {
const error = err instanceof Error ? err : new Error(String(err))
if (abortController.signal.aborted) {
terminationReason = 'timeout'
capture.addError('agent_execution', `Task timed out after ${timeoutMs / 1000}s`)
} else {
terminationReason = 'error'
const msg = err instanceof AgentExecutionError && err.originalError
? `${error.message}: ${err.originalError.message}`
: error.message
capture.addError('agent_execution', msg, { stack: error.stack })
}
await capture.messageLogger.logError(error.message)
} finally {
clearTimeout(timeoutHandle)
}
// Build metadata
const metadata: TaskMetadata = {
query_id: task.query_id,
dataset: task.dataset,
query: task.query,
started_at: new Date(startTime).toISOString(),
completed_at: new Date().toISOString(),
total_duration_ms: Date.now() - startTime,
total_steps: capture.getScreenshotCount(),
termination_reason: terminationReason,
final_answer: capture.getLastAssistantMessage(),
errors: capture.getErrors(),
warnings: capture.getWarnings(),
agent_config: { type: 'single', model: resolvedConfig.model },
grader_results: {},
}
await capture.trajectorySaver.saveMetadata(metadata)
return {
metadata,
messages: capture.getMessages(),
finalAnswer: metadata.final_answer,
}
}
}
```
---
### 6. Task Executor (`runner/task-executor.ts`)
Uses agent registry:
```typescript
import { createAgent } from '../agents'
import type { AgentContext } from '../agents/types'
import { CaptureContext } from '../capture/context'
import type { EvalConfig, Task } from '../types'
import type { WindowManager } from '../capture/window-manager'
export class TaskExecutor {
constructor(
private config: EvalConfig,
private outputDir: string,
private windowManager: WindowManager,
private graderOptions: GraderOptions | null,
) {}
async execute(task: Task): Promise<TaskResult> {
const startTime = Date.now()
let window: { windowId: number; tabId: number } | null = null
try {
// Create window
window = await this.windowManager.createWindow(task.query_id, task.start_url)
// Initialize capture context
const capture = new CaptureContext({
serverUrl: this.config.browseros.server_url,
outputDir: this.outputDir,
taskId: task.query_id,
tabId: window.tabId,
windowId: window.windowId,
})
const taskOutputDir = await capture.init()
// Build agent context
const context: AgentContext = {
config: this.config,
task,
windowId: window.windowId,
tabId: window.tabId,
outputDir: this.outputDir,
taskOutputDir,
capture,
}
// Create and execute agent (via registry)
const agent = createAgent(context)
const agentResult = await agent.execute()
// Run graders
const graderResults = await this.runGraders(task, agentResult)
return {
status: agentResult.metadata.termination_reason === 'timeout' ? 'timeout' : 'completed',
task,
agentResult,
graderResults,
durationMs: Date.now() - startTime,
}
} catch (error) {
return {
status: 'failed',
task,
error: error instanceof Error ? error : new Error(String(error)),
errorSource: 'unknown',
durationMs: Date.now() - startTime,
}
} finally {
if (window) {
await this.windowManager.closeWindow(task.query_id)
}
}
}
}
```
---
## Server Dependencies
### What We MUST Import from Server
These are necessary - `GeminiAgent` IS the agent:
```typescript
// Core agent
import { GeminiAgent, type ToolExecutionHooks, type ToolExecutionResult } from '@browseros/server/agent'
import { AgentExecutionError } from '@browseros/server/agent/errors'
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
// Provider adapter (for orchestrator-agent)
import { VercelAIContentGenerator } from '@browseros/server/agent/provider-adapter'
// Gateway client (for browseros provider only)
import { fetchBrowserOSConfig, getLLMConfigFromProvider } from '@browseros/server/lib/clients/gateway'
```
### What Could Move to Shared (Future)
If we want to decouple more:
```typescript
// These types could be in @browseros/shared
export interface ToolExecutionHooks { ... }
export interface ToolExecutionResult { ... }
export interface ResolvedAgentConfig { ... }
```
But for now, importing from server is fine - eval is tightly coupled to server anyway.
---
## Import Guidelines
```typescript
// Shared package - schemas, constants
import { LLMConfigSchema, LLMProviderSchema, LLM_PROVIDERS } from '@browseros/shared/schemas/llm'
import { TIMEOUTS } from '@browseros/shared/constants/timeouts'
import { AGENT_LIMITS } from '@browseros/shared/constants/limits'
import type { BrowserContext } from '@browseros/shared/schemas/browser-context'
// Server - only agent-related imports
import { GeminiAgent, type ToolExecutionHooks } from '@browseros/server/agent'
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
// Internal eval types - from types/ folder
import type { EvalConfig, Task, Message, AgentResult } from '../types'
import type { AgentContext, AgentEvaluator } from '../agents/types'
```
---
## Adding a New Agent Type
1. Create folder: `agents/my-new-agent/`
2. Implement `AgentEvaluator` interface:
```typescript
// agents/my-new-agent/index.ts
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
export class MyNewAgentEvaluator implements AgentEvaluator {
constructor(private ctx: AgentContext) {}
async execute(): Promise<AgentResult> {
const { config, task, capture } = this.ctx
// Use capture.createToolHooks() for screenshot/logging
// Use capture.messageLogger for messages
// Use capture.addError/addWarning for errors
// Return AgentResult
}
}
```
3. Register in `agents/index.ts`:
```typescript
import { MyNewAgentEvaluator } from './my-new-agent'
registerAgent('my-new-agent', (ctx) => new MyNewAgentEvaluator(ctx))
```
4. Add config schema in `types/config.ts`:
```typescript
export const MyNewAgentConfigSchema = z.object({
type: z.literal('my-new-agent'),
// ... specific fields
})
export const AgentConfigSchema = z.discriminatedUnion('type', [
SingleAgentConfigSchema,
OrchestratorExecutorConfigSchema,
MyNewAgentConfigSchema, // Add here
])
```
Done - no changes to runner code needed.
---
## Implementation Order
1. **Phase 1: Types** (~1 hour)
- Create `types/` folder with proper structure
- Move/consolidate all types
- Add Zod schemas for messages
2. **Phase 2: Capture Context** (~1 hour)
- Create `CaptureContext` class
- Add delegation message methods
- Create `createToolHooks()` utility
3. **Phase 3: Agent Registry** (~30 min)
- Create `registry.ts`
- Create `AgentContext` interface
- Update exports
4. **Phase 4: Refactor Single Agent** (~1 hour)
- Use `AgentContext`
- Use `CaptureContext`
- Clean up code
5. **Phase 5: Refactor Orchestrator-Executor** (~2 hours)
- Use `AgentContext`
- Integrate `CaptureContext`
- Wire up hooks properly
6. **Phase 6: Update Runner** (~30 min)
- Use `createAgent()` instead of if-else
- Initialize `CaptureContext` in executor
7. **Phase 7: Testing** (~1 hour)
- Run single-agent eval
- Run orchestrator-executor eval
- Verify screenshots/messages captured
---
## Summary
| Before | After |
|--------|-------|
| If-else agent creation | Registry + factory pattern |
| Duplicated capture code | Shared `CaptureContext` |
| Scattered types | Organized `types/` folder |
| Copy-paste hooks | `createToolHooks()` utility |
| Tight coupling | Clear interfaces |
| Hard to add agents | Register + implement |

View File

@@ -0,0 +1,431 @@
# Implementation Phases - Parallel Execution Plan
## Dependency Graph
```
Phase 1: Types (4 parallel subagents)
├──────────────────┬──────────────────┐
▼ ▼ │
Phase 2: Capture Phase 3: Agent │
(2 parallel) Registry │
│ (1 subagent) │
│ │ │
└────────┬─────────┘ │
▼ │
Phase 4: Agent Refactors │
(2 parallel - after 2+3) │
│ │
▼ │
Phase 5: Runner Update │
(1 subagent - after 4) │
│ │
▼ │
Phase 6: Cleanup & Test ◄─────────────────┘
(1 subagent)
```
---
## Phase 1: Types (4 Parallel Subagents)
No dependencies - can all run simultaneously.
### Subagent 1A: Config Types
```
Create /apps/eval/src/types/config.ts
Requirements:
1. Import LLMConfigSchema, LLMProviderSchema from @browseros/shared/schemas/llm
2. Import z from zod
Create Zod schemas:
- SingleAgentConfigSchema = LLMConfigSchema.extend({ type: z.literal('single') })
- OrchestratorExecutorConfigSchema with orchestrator + executor nested configs
- AgentConfigSchema = z.discriminatedUnion('type', [...])
- EvalConfigSchema with all fields (agent, dataset, output_dir, num_workers, browseros, grader_*, timeout_ms)
Export both schemas and inferred types (z.infer<>)
Reference: Current implementation in /apps/eval/src/utils/config-validator.ts (lines 1-42)
```
### Subagent 1B: Message Types
```
Create /apps/eval/src/types/message.ts
Requirements:
1. Use Zod for all schemas
2. Create BaseMessageSchema with timestamp field
Create schemas for:
- UserMessageSchema (type: 'user', content)
- AssistantMessageSchema (type: 'assistant', content)
- ToolCallMessageSchema (type: 'tool_call', tool, toolCallId, params)
- ToolResultMessageSchema (type: 'tool_result', toolCallId, result, isError, screenshot?)
- ErrorMessageSchema (type: 'error', content, errorCode?)
- DelegationMessageSchema (type: 'delegation', instruction, executorId, maxSteps?)
- DelegationResultMessageSchema (type: 'delegation_result', executorId, summary, status, stepsUsed, currentUrl?)
Create MessageSchema = z.discriminatedUnion('type', [...all schemas])
Export schemas, types, and type guards (isToolCallMessage, isDelegationMessage, etc.)
Reference: Current types in /apps/eval/src/types.ts (lines 62-127)
```
### Subagent 1C: Task & Result Types
```
Create /apps/eval/src/types/task.ts
Requirements:
1. Use Zod schemas with inferred types
Create:
- TaskMetadataSchema (original_task_id, website?, category?, additional?)
- TaskSchema (query_id, dataset, query, graders[], start_url?, setup_script?, metadata)
Export schemas and types.
---
Create /apps/eval/src/types/result.ts
Create:
- GraderResultSchema (score, pass, reasoning, details?)
- TaskMetadataSchema (query_id, dataset, query, started_at, completed_at, total_duration_ms, total_steps, termination_reason, final_answer, errors, warnings, agent_config, grader_results)
- AgentResultSchema (metadata, messages, finalAnswer)
Export schemas and types.
Reference: Current types in /apps/eval/src/types.ts (lines 6-20, 156-182)
```
### Subagent 1D: Error Types + Index
```
Create /apps/eval/src/types/errors.ts
Create:
- ErrorSourceSchema = z.enum(['window_creation', 'agent_execution', 'mcp_tool', 'screenshot', 'grader', 'message_logging', 'cleanup', 'unknown'])
- TaskErrorSchema (source, message, timestamp, details?)
- EvalWarningSchema (source, message, timestamp)
Export schemas and types.
---
Create /apps/eval/src/types/index.ts
Re-export everything from:
- ./config
- ./message
- ./task
- ./result
- ./errors
This becomes the single import point: import { EvalConfig, Message, Task } from '../types'
Reference: Current types in /apps/eval/src/types.ts (lines 129-154)
```
---
## Phase 2: Capture Infrastructure (2 Parallel Subagents)
**Depends on:** Phase 1 (types)
### Subagent 2A: CaptureContext Class
```
Create /apps/eval/src/capture/types.ts
Define interface:
- CaptureContextConfig { serverUrl, outputDir, taskId, tabId, windowId }
---
Create /apps/eval/src/capture/context.ts
Requirements:
1. Import ToolExecutionHooks, ToolExecutionResult from @browseros/server/agent
2. Import types from ../types
3. Import existing ScreenshotCapture, MessageLogger, TrajectorySaver
Implement CaptureContext class:
- Constructor takes CaptureContextConfig
- async init() - initializes screenshot, messageLogger, trajectorySaver, returns taskOutputDir
- createToolHooks(): ToolExecutionHooks - returns hooks for GeminiAgent
- addError(source, message, details?)
- addWarning(source, message)
- getErrors(), getWarnings(), getMessages(), getScreenshotCount(), getLastAssistantMessage()
- logDelegation(instruction, executorId, maxSteps?)
- logDelegationResult(executorId, summary, status, stepsUsed, currentUrl?)
Reference implementation details in DESIGN_DOC.md section "4. Capture Context"
Update /apps/eval/src/capture/index.ts to export CaptureContext
```
### Subagent 2B: MessageLogger Extensions
```
Update /apps/eval/src/capture/message-logger.ts
Add two new methods:
1. logDelegation(instruction: string, executorId: string, maxSteps?: number): Promise<void>
- Creates DelegationMessage with type: 'delegation'
- Appends to messages
2. logDelegationResult(executorId: string, summary: string, status: 'done' | 'blocked' | 'max_steps', stepsUsed: number, currentUrl?: string): Promise<void>
- Creates DelegationResultMessage with type: 'delegation_result'
- Appends to messages
Import DelegationMessage, DelegationResultMessage from ../types
Reference: Current MessageLogger in /apps/eval/src/capture/message-logger.ts
```
---
## Phase 3: Agent Registry (1 Subagent)
**Depends on:** Phase 1 (types)
**Can run parallel with:** Phase 2
### Subagent 3A: Agent Registry + Types
```
Create /apps/eval/src/agents/types.ts
Define:
- AgentContext interface:
{
config: EvalConfig
task: Task
windowId: number
tabId: number
outputDir: string
taskOutputDir: string
capture: CaptureContext
}
- AgentResult interface (re-export from ../types or define here)
- AgentEvaluator interface { execute(): Promise<AgentResult> }
---
Create /apps/eval/src/agents/registry.ts
Implement:
- type AgentFactory = (context: AgentContext) => AgentEvaluator
- const registry = new Map<string, AgentFactory>()
- registerAgent(type: string, factory: AgentFactory): void
- createAgent(context: AgentContext): AgentEvaluator
- getRegisteredAgentTypes(): string[]
Reference: DESIGN_DOC.md section "2. Agent Registry"
---
Update /apps/eval/src/agents/index.ts
- Import registerAgent from ./registry
- Import SingleAgentEvaluator (will be updated later)
- Import OrchestratorExecutorEvaluator (will be updated later)
- Call registerAgent for both
- Re-export createAgent, registerAgent, getRegisteredAgentTypes
- Re-export types
Note: Registration calls will fail initially until agents are refactored.
That's OK - add TODO comments for now.
```
---
## Phase 4: Agent Refactors (2 Parallel Subagents)
**Depends on:** Phase 2 + Phase 3
### Subagent 4A: Single Agent Refactor
```
Refactor /apps/eval/src/agents/single-agent.ts
Changes:
1. Change constructor to accept AgentContext instead of individual params:
constructor(private ctx: AgentContext) {}
2. Use ctx.capture instead of creating ScreenshotCapture/MessageLogger:
- Remove local ScreenshotCapture initialization
- Remove local MessageLogger initialization
- Remove local hooks setup
- Use ctx.capture.createToolHooks() for GeminiAgent hooks
- Use ctx.capture.messageLogger.logUser/logAssistant
- Use ctx.capture.addError/addWarning
- Use ctx.capture.getMessages(), getScreenshotCount(), etc.
3. Build metadata using capture methods
4. Remove TrajectorySaver init (done in CaptureContext)
5. Keep the core agent execution logic (GeminiAgent.create, agent.execute)
Reference:
- Current implementation: /apps/eval/src/agents/single-agent.ts
- Target implementation: DESIGN_DOC.md section "5. Single Agent Evaluator"
```
### Subagent 4B: Orchestrator-Executor Refactor
```
Refactor /apps/eval/src/agents/orchestrator-executor/index.ts
Changes:
1. Change OrchestratorExecutorEvaluator constructor to accept AgentContext:
constructor(private ctx: AgentContext) {}
2. Initialize capture from context (already done in runner)
3. Add hook integration:
- Create executor hooks that use ctx.capture.createToolHooks()
- Wire hooks through Orchestrator → ExecutorStore → Executor
- Call ctx.capture.logDelegation() when orchestrator delegates
- Call ctx.capture.logDelegationResult() when executor returns
4. Update return to include messages:
return {
metadata,
messages: ctx.capture.getMessages(), // Now populated!
finalAnswer,
}
Also update supporting files if needed:
- orchestrator.ts - add setExecutorHooks() method
- executor.ts - accept external hooks via setObservationHooks()
- executor-store.ts - pass hooks to new executors
Reference:
- Current: /apps/eval/src/agents/orchestrator-executor/index.ts
- Target: DESIGN_DOC.md and previous IMPLEMENTATION_PLAN.md
```
---
## Phase 5: Runner Update (1 Subagent)
**Depends on:** Phase 4
### Subagent 5A: Task Executor Update
```
Update /apps/eval/src/runner/task-executor.ts
Changes:
1. Import createAgent from ../agents instead of individual evaluators
2. Import CaptureContext from ../capture
3. In execute() method:
- Create CaptureContext and call init()
- Build AgentContext with all required fields
- Use createAgent(context) instead of if-else switch
- Remove the if (config.agent.type === 'single') / else if blocks
4. Remove direct imports of SingleAgentEvaluator, OrchestratorExecutorEvaluator
Before:
```typescript
if (this.config.agent.type === 'single') {
const evaluator = new SingleAgentEvaluator(this.config, task, window.windowId, ...)
} else if (this.config.agent.type === 'orchestrator-executor') {
const evaluator = new OrchestratorExecutorEvaluator(this.config, task, ...)
}
```
After:
```typescript
const capture = new CaptureContext({ serverUrl, outputDir, taskId, tabId, windowId })
const taskOutputDir = await capture.init()
const context: AgentContext = {
config: this.config,
task,
windowId: window.windowId,
tabId: window.tabId,
outputDir: this.outputDir,
taskOutputDir,
capture,
}
const agent = createAgent(context)
const agentResult = await agent.execute()
```
Reference:
- Current: /apps/eval/src/runner/task-executor.ts (lines 143-186)
- Target: DESIGN_DOC.md section "6. Task Executor"
```
---
## Phase 6: Cleanup & Test (1 Subagent)
**Depends on:** Phase 5
### Subagent 6A: Cleanup Old Files + Verify
```
Tasks:
1. Delete old /apps/eval/src/types.ts (replaced by types/ folder)
2. Update all imports across the codebase:
- Change: import { EvalConfig, Task, Message } from '../types'
- Keep same (types/index.ts re-exports everything)
3. Update /apps/eval/src/utils/config-validator.ts:
- Import schemas from ../types/config instead of defining locally
- Remove duplicate schema definitions
4. Verify no TypeScript errors:
- Run: cd apps/eval && bun run typecheck
5. Test single-agent eval:
- Run: cd apps/eval && bun run eval -c configs/webvoyager-test.json
- Verify screenshots captured
- Verify messages.jsonl populated
6. Test orchestrator-executor eval:
- Run: cd apps/eval && bun run eval -c configs/orchestrator-executor-test.json
- Verify screenshots captured
- Verify messages.jsonl has delegation messages
- Verify graders pass (no "no_screenshots" error)
Report any issues found.
```
---
## Execution Summary
| Phase | Subagents | Can Parallelize? | Dependencies |
|-------|-----------|------------------|--------------|
| 1 | 4 (1A, 1B, 1C, 1D) | Yes - all parallel | None |
| 2 | 2 (2A, 2B) | Yes - both parallel | Phase 1 |
| 3 | 1 (3A) | Yes - parallel with Phase 2 | Phase 1 |
| 4 | 2 (4A, 4B) | Yes - both parallel | Phase 2 + 3 |
| 5 | 1 (5A) | No | Phase 4 |
| 6 | 1 (6A) | No | Phase 5 |
**Total: 11 subagent tasks**
**Parallel execution timeline:**
```
Time →
─────────────────────────────────────────────────────────────────
Phase 1: [1A] [1B] [1C] [1D] (4 parallel)
─────────────────
Phase 2: [2A] [2B] (2 parallel)
Phase 3: [3A] (parallel with Phase 2)
───────────
Phase 4: [4A] [4B] (2 parallel)
──────────
Phase 5: [5A]
────
Phase 6: [6A]
────
```
**Maximum parallelism: 4 subagents** (Phase 1)

View File

@@ -0,0 +1,888 @@
# Eval System - Production Grade Implementation Plan
## Overview
This plan outlines the changes needed to make the eval system production-grade with uniform agent observation across all agent patterns (single-agent, orchestrator-executor, future patterns).
**Goal:** All agent evaluators produce consistent `AgentResult` with screenshots, message traces, and verifiable action sequences.
---
## Phase 1: Type System Extensions
### 1.1 Add New Message Types
**File:** `src/types.ts`
Add delegation-specific message types for orchestrator pattern:
```typescript
// After ErrorMessage definition (~line 99)
export interface DelegationMessage extends BaseMessage {
type: 'delegation'
instruction: string
executorId: string
maxSteps?: number
}
export interface DelegationResultMessage extends BaseMessage {
type: 'delegation_result'
executorId: string
summary: string
status: 'done' | 'blocked' | 'max_steps'
stepsUsed: number
currentUrl?: string
}
// Update Message union (~line 101)
export type Message =
| UserMessage
| AssistantMessage
| ToolCallMessage
| ToolResultMessage
| ErrorMessage
| DelegationMessage // NEW
| DelegationResultMessage // NEW
// Add type guards
export function isDelegationMessage(msg: Message): msg is DelegationMessage {
return msg.type === 'delegation'
}
export function isDelegationResultMessage(msg: Message): msg is DelegationResultMessage {
return msg.type === 'delegation_result'
}
```
### 1.2 Add Orchestrator Hook Types
**File:** `src/agents/orchestrator-executor/types.ts`
```typescript
// Add after existing types
export interface OrchestratorHooks {
onDelegation?: (instruction: string, executorId: string, maxSteps?: number) => Promise<void>
onDelegationResult?: (result: ExecutorResult) => Promise<void>
onTurnStart?: (turn: number) => Promise<void>
onTurnComplete?: (turn: number) => Promise<void>
onComplete?: (answer: string) => Promise<void>
onFailed?: (reason: string) => Promise<void>
}
export interface ExecutorObservationHooks {
onBeforeToolCall?: (toolName: string, args: unknown) => Promise<string> // returns toolCallId
onAfterToolCall?: (toolName: string, toolCallId: string, result: unknown, isError: boolean) => Promise<void>
}
```
---
## Phase 2: Unified Capture Infrastructure
### 2.1 Create EvalCapture Class
**File:** `src/capture/eval-capture.ts` (NEW)
```typescript
/**
* EvalCapture - Unified capture infrastructure for all agent evaluators
*
* Combines screenshot capture, message logging, and provides hooks for
* both single-agent and orchestrator-executor patterns.
*/
import { randomUUID } from 'node:crypto'
import type {
AssistantMessage,
DelegationMessage,
DelegationResultMessage,
ErrorMessage,
Message,
ToolCallMessage,
ToolResultMessage,
UserMessage,
} from '../types'
import { MessageLogger } from './message-logger'
import { ScreenshotCapture } from './screenshot'
export interface EvalCaptureConfig {
serverUrl: string
outputDir: string
tabId: number
windowId: number
}
export class EvalCapture {
private screenshotCapture: ScreenshotCapture
private messageLogger: MessageLogger
private tabId: number
private windowId: number
private currentToolCallId: string | null = null
constructor(config: EvalCaptureConfig) {
this.screenshotCapture = new ScreenshotCapture(config.serverUrl, config.outputDir)
this.messageLogger = new MessageLogger(config.outputDir)
this.tabId = config.tabId
this.windowId = config.windowId
}
async init(): Promise<void> {
await this.screenshotCapture.init()
}
// ============================================================================
// Screenshot Capture
// ============================================================================
async captureScreenshot(): Promise<number> {
return this.screenshotCapture.capture(this.tabId, this.windowId)
}
getScreenshotCount(): number {
return this.screenshotCapture.getCount()
}
// ============================================================================
// Message Logging - Basic Types
// ============================================================================
async logUser(content: string): Promise<void> {
await this.messageLogger.logUser(content)
}
async logAssistant(content: string): Promise<void> {
await this.messageLogger.logAssistant(content)
}
async logError(content: string, errorCode?: string): Promise<void> {
await this.messageLogger.logError(content, errorCode)
}
// ============================================================================
// Tool Call Logging (for single-agent and executor)
// ============================================================================
async logToolCall(tool: string, params: Record<string, unknown>): Promise<string> {
const toolCallId = randomUUID()
this.currentToolCallId = toolCallId
await this.messageLogger.logToolCall(tool, toolCallId, params)
return toolCallId
}
async logToolResult(
toolCallId: string,
result: unknown,
isError: boolean,
screenshot?: number,
): Promise<void> {
await this.messageLogger.logToolResult(toolCallId, result, isError, screenshot)
this.currentToolCallId = null
}
getCurrentToolCallId(): string | null {
return this.currentToolCallId
}
// ============================================================================
// Delegation Logging (for orchestrator-executor)
// ============================================================================
async logDelegation(
instruction: string,
executorId: string,
maxSteps?: number,
): Promise<void> {
const message: DelegationMessage = {
type: 'delegation',
timestamp: new Date().toISOString(),
instruction,
executorId,
...(maxSteps !== undefined && { maxSteps }),
}
// Extend MessageLogger to handle this, or append directly
await this.appendMessage(message)
}
async logDelegationResult(
executorId: string,
summary: string,
status: 'done' | 'blocked' | 'max_steps',
stepsUsed: number,
currentUrl?: string,
): Promise<void> {
const message: DelegationResultMessage = {
type: 'delegation_result',
timestamp: new Date().toISOString(),
executorId,
summary,
status,
stepsUsed,
...(currentUrl && { currentUrl }),
}
await this.appendMessage(message)
}
// ============================================================================
// Helpers
// ============================================================================
private async appendMessage(message: Message): Promise<void> {
// Access internal messages array and file
// This requires either extending MessageLogger or using a shared approach
const messages = this.messageLogger.getMessages()
messages.push(message)
// Write to file - MessageLogger needs extension for this
}
getMessages(): Message[] {
return this.messageLogger.getMessages()
}
getLastAssistantMessage(): string | null {
return this.messageLogger.getLastAssistantMessage()
}
}
```
### 2.2 Extend MessageLogger for New Types
**File:** `src/capture/message-logger.ts`
Add methods for delegation messages:
```typescript
// Add after logError method
async logDelegation(
instruction: string,
executorId: string,
maxSteps?: number,
): Promise<void> {
const message: DelegationMessage = {
type: 'delegation',
timestamp: new Date().toISOString(),
instruction,
executorId,
...(maxSteps !== undefined && { maxSteps }),
}
await this.append(message)
}
async logDelegationResult(
executorId: string,
summary: string,
status: 'done' | 'blocked' | 'max_steps',
stepsUsed: number,
currentUrl?: string,
): Promise<void> {
const message: DelegationResultMessage = {
type: 'delegation_result',
timestamp: new Date().toISOString(),
executorId,
summary,
status,
stepsUsed,
...(currentUrl && { currentUrl }),
}
await this.append(message)
}
```
---
## Phase 3: Executor Hook Integration
### 3.1 Modify Executor to Accept External Hooks
**File:** `src/agents/orchestrator-executor/executor.ts`
```typescript
// Add import
import type { ExecutorObservationHooks } from './types'
export class Executor {
private agent: GeminiAgent | null = null
private stepsUsed = 0
private currentUrl = ''
private config: ExecutorConfig
private serverUrl: string
private windowId: number
private tabId: number
private observationHooks?: ExecutorObservationHooks // NEW
// ... existing constructor ...
/**
* Set external observation hooks for capture integration
*/
setObservationHooks(hooks: ExecutorObservationHooks): void {
this.observationHooks = hooks
}
async execute(
instruction: string,
maxSteps?: number,
signal?: AbortSignal,
): Promise<Omit<ExecutorResult, 'executorId'>> {
// ... existing setup ...
// Track steps via hooks - MODIFIED to include external observation
let stepsThisRun = 0
const hooks: ToolExecutionHooks = {
onBeforeToolCall: async (toolName: string, args: unknown) => {
// Call external hook if set (for logging)
if (this.observationHooks?.onBeforeToolCall) {
await this.observationHooks.onBeforeToolCall(toolName, args)
}
},
onAfterToolCall: async (toolName: string, result: ToolExecutionResult) => {
stepsThisRun++
this.stepsUsed++
// Call external hook if set (for screenshot capture and logging)
if (this.observationHooks?.onAfterToolCall) {
const toolCallId = 'current' // Will be tracked by EvalCapture
await this.observationHooks.onAfterToolCall(
toolName,
toolCallId,
result.parts,
result.isError,
)
}
},
}
this.agent.setToolHooks(hooks)
// ... rest of execute method ...
}
}
```
### 3.2 Pass Hooks Through ExecutorStore
**File:** `src/agents/orchestrator-executor/executor-store.ts`
```typescript
import type { ExecutorObservationHooks } from './types'
export class ExecutorStore {
private executors = new Map<string, Executor>()
private observationHooks?: ExecutorObservationHooks // NEW
/**
* Set observation hooks that will be applied to all executors
*/
setObservationHooks(hooks: ExecutorObservationHooks): void {
this.observationHooks = hooks
// Apply to existing executors
for (const executor of this.executors.values()) {
executor.setObservationHooks(hooks)
}
}
getOrCreate(
id: string,
config: ExecutorConfig,
serverUrl: string,
windowId: number,
tabId: number,
): Executor {
if (!this.executors.has(id)) {
const executor = new Executor(config, serverUrl, windowId, tabId)
// Apply observation hooks to new executor
if (this.observationHooks) {
executor.setObservationHooks(this.observationHooks)
}
this.executors.set(id, executor)
}
return this.executors.get(id)!
}
// ... rest unchanged ...
}
```
---
## Phase 4: Orchestrator Hook Integration
### 4.1 Add Hooks to OrchestratorAgent
**File:** `src/agents/orchestrator-executor/orchestrator-agent.ts`
```typescript
import type { ExecutorObservationHooks, OrchestratorHooks } from './types'
export class OrchestratorAgent {
private orchestratorHooks?: OrchestratorHooks // NEW
private constructor(
private client: GeminiClient,
private geminiConfig: GeminiConfig,
private state: OrchestratorState,
private executorStore: ExecutorStore,
private maxTurns: number,
) {}
/**
* Set orchestrator-level hooks for delegation tracking
*/
setHooks(hooks: OrchestratorHooks): void {
this.orchestratorHooks = hooks
}
/**
* Set executor observation hooks (passed through to ExecutorStore)
*/
setExecutorObservationHooks(hooks: ExecutorObservationHooks): void {
this.executorStore.setObservationHooks(hooks)
}
/**
* Get hooks for tool context (used by orchestrator-tools.ts)
*/
getOrchestratorHooks(): OrchestratorHooks | undefined {
return this.orchestratorHooks
}
async run(taskQuery: string): Promise<OrchestratorAgentResult> {
let currentParts: Part[] = [{ text: taskQuery }]
let turns = 0
while (
!this.state.isComplete &&
!this.state.isFailed &&
turns < this.maxTurns
) {
turns++
// Fire turn start hook
await this.orchestratorHooks?.onTurnStart?.(turns)
// ... existing turn logic ...
// Fire turn complete hook
await this.orchestratorHooks?.onTurnComplete?.(turns)
}
// Fire completion hooks
if (this.state.isComplete && this.state.finalAnswer) {
await this.orchestratorHooks?.onComplete?.(this.state.finalAnswer)
} else if (this.state.isFailed && this.state.failureReason) {
await this.orchestratorHooks?.onFailed?.(this.state.failureReason)
}
return {
success: this.state.isComplete,
answer: this.state.finalAnswer,
reason: this.state.failureReason,
delegationCount: this.state.delegationCount,
totalExecutorSteps: this.state.totalExecutorSteps,
turns,
}
}
// ... rest unchanged ...
}
```
### 4.2 Fire Hooks in Orchestrator Tools
**File:** `src/agents/orchestrator-executor/orchestrator-tools.ts`
Modify the delegate tool handler to fire hooks:
```typescript
// In createOrchestratorTools function, modify the delegate tool handler
// Inside the delegate tool's handler:
handler: async (args) => {
const { instruction, executorId, maxSteps } = args as DelegateParams
// Fire delegation hook BEFORE execution
const hooks = context.getOrchestratorHooks?.()
const actualExecutorId = executorId ?? randomUUID()
await hooks?.onDelegation?.(instruction, actualExecutorId, maxSteps)
// Get or create executor
const executor = context.executorStore.getOrCreate(
actualExecutorId,
context.executorConfig,
context.serverUrl,
context.windowId,
context.tabId,
)
// Execute
const result = await executor.execute(instruction, maxSteps)
// Update state
context.state.delegationCount++
context.state.totalExecutorSteps += result.stepsUsed
// Fire delegation result hook AFTER execution
await hooks?.onDelegationResult?.({
...result,
executorId: actualExecutorId,
})
// Return result to orchestrator
return {
executorId: actualExecutorId,
...result,
}
}
```
---
## Phase 5: Update OrchestratorExecutorEvaluator
### 5.1 Full Integration
**File:** `src/agents/orchestrator-executor/index.ts`
```typescript
import { ScreenshotCapture } from '../../capture/screenshot'
import { MessageLogger } from '../../capture/message-logger'
import { TrajectorySaver } from '../../capture/trajectory-saver'
import type { ExecutorObservationHooks, OrchestratorHooks } from './types'
export class OrchestratorExecutorEvaluator implements AgentEvaluator {
constructor(
private config: EvalConfig,
private task: Task,
private windowId: number,
private tabId: number,
private outputDir: string,
) {}
async execute(): Promise<AgentResult> {
const startTime = Date.now()
const timeoutMs = this.config.timeout_ms ?? DEFAULT_TIMEOUT_MS
const errors: TaskError[] = []
const warnings: EvalWarning[] = []
const addError = (source: TaskError['source'], message: string, details?: Record<string, unknown>) => {
errors.push({ source, message, timestamp: new Date().toISOString(), details })
}
const addWarning = (source: EvalWarning['source'], message: string) => {
warnings.push({ source, message, timestamp: new Date().toISOString() })
console.warn(`[${source}] ${message}`)
}
// Initialize trajectory saver
const saver = new TrajectorySaver(this.outputDir, this.task.query_id)
const taskOutputDir = await saver.init()
// NEW: Initialize capture infrastructure (same as single-agent)
const screenshotCapture = new ScreenshotCapture(
this.config.browseros.server_url,
taskOutputDir,
)
await screenshotCapture.init()
const messageLogger = new MessageLogger(taskOutputDir)
// Log initial user message
await messageLogger.logUser(this.task.query)
// Validate config type
if (this.config.agent.type !== 'orchestrator-executor') {
throw new Error('OrchestratorExecutorEvaluator requires orchestrator-executor config')
}
const agentConfig = this.config.agent as OrchestratorExecutorConfig
const { orchestrator: orchestratorConfig, executor: executorConfig } =
resolveAgentConfig(agentConfig)
// Create orchestrator
const orchestrator = new Orchestrator(
orchestratorConfig,
executorConfig,
this.config.browseros.server_url,
this.windowId,
this.tabId,
)
// NEW: Set up executor observation hooks (for tool call/result capture)
let currentToolCallId: string | null = null
const executorHooks: ExecutorObservationHooks = {
onBeforeToolCall: async (toolName: string, args: unknown) => {
try {
currentToolCallId = randomUUID()
await messageLogger.logToolCall(toolName, currentToolCallId, args as Record<string, unknown>)
} catch (err) {
addWarning('message_logging', `Failed to log tool call ${toolName}: ${err instanceof Error ? err.message : String(err)}`)
}
return currentToolCallId
},
onAfterToolCall: async (toolName: string, _toolCallId: string, result: unknown, isError: boolean) => {
let screenshotNum = 0
// Capture screenshot after tool execution
try {
screenshotNum = await screenshotCapture.capture(this.tabId, this.windowId)
} catch (err) {
addWarning('screenshot', `Screenshot after ${toolName} failed: ${err instanceof Error ? err.message : String(err)}`)
screenshotNum = screenshotCapture.getCount()
}
// Log tool errors
if (isError) {
addWarning('mcp_tool', `Tool ${toolName} returned error`)
}
if (!currentToolCallId) {
addWarning('message_logging', 'Tool result without matching tool call')
return
}
try {
await messageLogger.logToolResult(currentToolCallId, result, isError, screenshotNum)
} catch (err) {
addWarning('message_logging', `Failed to log tool result: ${err instanceof Error ? err.message : String(err)}`)
}
currentToolCallId = null
},
}
// NEW: Set up orchestrator hooks (for delegation tracking)
const orchestratorHooks: OrchestratorHooks = {
onDelegation: async (instruction: string, executorId: string, maxSteps?: number) => {
try {
await messageLogger.logDelegation(instruction, executorId, maxSteps)
} catch (err) {
addWarning('message_logging', `Failed to log delegation: ${err instanceof Error ? err.message : String(err)}`)
}
},
onDelegationResult: async (result) => {
try {
await messageLogger.logDelegationResult(
result.executorId,
result.summary,
result.status,
result.stepsUsed,
result.currentUrl,
)
} catch (err) {
addWarning('message_logging', `Failed to log delegation result: ${err instanceof Error ? err.message : String(err)}`)
}
},
}
// Apply hooks to orchestrator
orchestrator.setHooks(orchestratorHooks)
orchestrator.setExecutorObservationHooks(executorHooks)
// Set up timeout
const abortController = new AbortController()
const timeoutHandle = setTimeout(() => {
abortController.abort()
}, timeoutMs)
let terminationReason: 'completed' | 'max_steps' | 'error' | 'timeout' = 'completed'
let finalAnswer: string | null = null
let orchestratorResult: Awaited<ReturnType<typeof orchestrator.run>> | null = null
try {
const runPromise = orchestrator.run(this.task.query)
orchestratorResult = await Promise.race([
runPromise,
new Promise<never>((_, reject) => {
abortController.signal.addEventListener('abort', () => {
reject(new Error('Timeout'))
})
}),
])
if (orchestratorResult.success) {
finalAnswer = orchestratorResult.answer
terminationReason = 'completed'
// Log final assistant message
if (finalAnswer) {
await messageLogger.logAssistant(finalAnswer)
}
} else {
terminationReason = 'error'
addError('agent_execution', orchestratorResult.reason ?? 'Unknown failure')
await messageLogger.logError(orchestratorResult.reason ?? 'Unknown failure')
}
} catch (err) {
const error = err instanceof Error ? err : new Error(String(err))
if (error.message === 'Timeout' || abortController.signal.aborted) {
terminationReason = 'timeout'
addError('agent_execution', `Task timed out after ${timeoutMs / 1000}s`)
} else {
terminationReason = 'error'
addError('agent_execution', error.message, { stack: error.stack })
}
await messageLogger.logError(error.message)
} finally {
clearTimeout(timeoutHandle)
orchestrator.getExecutorStore().clear()
}
const endTime = Date.now()
// Create metadata
const metadata: TaskMetadata = {
query_id: this.task.query_id,
dataset: this.task.dataset,
query: this.task.query,
started_at: new Date(startTime).toISOString(),
completed_at: new Date(endTime).toISOString(),
total_duration_ms: endTime - startTime,
total_steps: screenshotCapture.getCount(), // Now accurate
termination_reason: terminationReason,
final_answer: finalAnswer,
errors,
warnings,
agent_config: {
type: 'orchestrator-executor',
model: `${orchestratorConfig.model} / ${executorConfig.model}`,
},
grader_results: {},
}
await saver.saveMetadata(metadata)
return {
metadata,
messages: messageLogger.getMessages(), // NOW POPULATED
finalAnswer,
}
}
}
```
---
## Phase 6: Orchestrator Class Updates
### 6.1 Add Hook Passthrough Methods
**File:** `src/agents/orchestrator-executor/orchestrator.ts`
```typescript
import type { ExecutorObservationHooks, OrchestratorHooks } from './types'
export class Orchestrator {
private agent: OrchestratorAgent | null = null
private executorStore: ExecutorStore
private pendingOrchestratorHooks?: OrchestratorHooks
private pendingExecutorHooks?: ExecutorObservationHooks
constructor(
private orchestratorConfig: OrchestratorConfig,
private executorConfig: ExecutorConfig,
private serverUrl: string,
private windowId: number,
private tabId: number,
) {
this.executorStore = new ExecutorStore()
}
/**
* Set orchestrator-level hooks (must be called before run())
*/
setHooks(hooks: OrchestratorHooks): void {
this.pendingOrchestratorHooks = hooks
if (this.agent) {
this.agent.setHooks(hooks)
}
}
/**
* Set executor observation hooks (must be called before run())
*/
setExecutorObservationHooks(hooks: ExecutorObservationHooks): void {
this.pendingExecutorHooks = hooks
this.executorStore.setObservationHooks(hooks)
if (this.agent) {
this.agent.setExecutorObservationHooks(hooks)
}
}
async run(taskQuery: string): Promise<OrchestratorAgentResult> {
this.agent = await OrchestratorAgent.create(
this.orchestratorConfig,
this.executorConfig,
this.serverUrl,
this.windowId,
this.tabId,
)
// Apply pending hooks
if (this.pendingOrchestratorHooks) {
this.agent.setHooks(this.pendingOrchestratorHooks)
}
if (this.pendingExecutorHooks) {
this.agent.setExecutorObservationHooks(this.pendingExecutorHooks)
}
const result = await this.agent.run(taskQuery)
this.executorStore = this.agent.getExecutorStore()
return result
}
getExecutorStore(): ExecutorStore {
return this.agent?.getExecutorStore() ?? this.executorStore
}
}
```
---
## Implementation Order
1. **Phase 1** - Type extensions (types.ts) - 30 min
2. **Phase 2** - MessageLogger extensions - 30 min
3. **Phase 3** - Executor hook integration - 1 hour
4. **Phase 4** - OrchestratorAgent hooks - 1 hour
5. **Phase 5** - OrchestratorExecutorEvaluator update - 1.5 hours
6. **Phase 6** - Orchestrator passthrough - 30 min
7. **Testing** - End-to-end verification - 1 hour
**Total estimated time:** ~6 hours
---
## Testing Checklist
- [ ] Single-agent eval still works (regression test)
- [ ] Orchestrator-executor produces screenshots in output folder
- [ ] Orchestrator-executor produces messages.jsonl with:
- [ ] user message
- [ ] delegation messages
- [ ] tool_call messages (from executor)
- [ ] tool_result messages with screenshot numbers
- [ ] delegation_result messages
- [ ] assistant message (final answer)
- [ ] Graders pass with orchestrator-executor (no "no_screenshots" error)
- [ ] metadata.json has accurate `total_steps` count
- [ ] Error/warning capture works for both patterns
---
## Future Considerations
1. **New Agent Patterns:** Any new agent type just needs to:
- Accept hooks in constructor or via setter
- Fire hooks at appropriate points
- Use shared capture infrastructure
2. **Grader Updates:** May need to update graders to understand delegation messages
3. **Parallel Executors:** If orchestrator delegates to multiple executors in parallel, need to handle concurrent screenshot capture
4. **Memory/Performance:** Screenshot capture creates MCP connection per capture - consider connection pooling for high-volume evals

View File

@@ -2,99 +2,155 @@
[![License: AGPL v3](https://img.shields.io/badge/License-AGPL%20v3-blue.svg)](../../../../LICENSE)
Evaluation framework for BrowserOS browser automation agents. Runs tasks from standard datasets ([WebVoyager](https://arxiv.org/abs/2401.13919), [Mind2Web](https://arxiv.org/abs/2306.06070), AGI SDK / REAL Bench, WebArena-Infinity, WebBench), captures trajectories with screenshots, and grades results automatically.
Evaluation framework for benchmarking BrowserOS browser automation agents. Runs tasks from standard datasets ([WebVoyager](https://arxiv.org/abs/2401.13919), [Mind2Web](https://arxiv.org/abs/2306.06070)), captures trajectories with screenshots, and grades results automatically.
## Prerequisites
- **BrowserOS binary** at `/Applications/BrowserOS.app` (macOS) or `BROWSEROS_BINARY` pointing at it
- **BrowserOS binary** installed at `/Applications/BrowserOS.app` (macOS)
- **Bun** runtime
- **API keys** for your LLM provider (and `CLAUDE_CODE_OAUTH_TOKEN` if you use `performance_grader`)
- **API keys** for your chosen LLM provider and grader model
## Quick Start
### 1. Set up environment
```bash
cd apps/eval
# Edit .env.development with your keys, then:
```
Edit `.env.development` and add your API keys:
```bash
# Pick ONE provider for the orchestrator (whichever you have access to)
OPENAI_API_KEY=sk-xxxxx
ANTHROPIC_API_KEY=sk-ant-xxxxx
FIREWORKS_API_KEY=fw_xxxxx
GOOGLE_API_KEY=AIza-xxxxx
# For grading results (OpenRouter recommended — gives access to many models)
OPENROUTER_API_KEY=sk-or-v1-xxxxx
```
### 2. Launch the dashboard
```bash
bun run eval
```
Opens the eval dashboard at `http://localhost:9900` in config mode. From there: load a preset, edit settings, click **Run**.
Opens the **Eval Dashboard** at `http://localhost:9900` in config mode.
### CLI mode
### 3. Configure and run
From the dashboard:
1. **Load a preset** — select from the dropdown or click **Load File** to import a config JSON
2. **Edit settings** — change agent type, provider, model, API keys, dataset, workers, timeouts
3. **Save Config** — export your configuration for reuse
4. **Click Run** — starts the evaluation with live progress
### Alternative: Run from CLI
```bash
bun run eval -c configs/browseros-agent-weekly.json
bun run eval -c configs/orchestrator-executor-clado-test.json
```
Runs immediately. Dashboard still available at `http://localhost:9900` for live progress.
## Agent types
## Agent Types
| Type | Description |
|------|-------------|
| `single` | Single LLM agent driven by the BrowserOS tool loop (CDP) |
| `orchestrator-executor` | High-level orchestrator + per-step executor (LLM or Clado visual model) |
### Orchestrator-Executor with Clado
### Single agent
The recommended architecture for visual model evals. Two tiers:
```json
{
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "moonshotai/kimi-k2.5",
"apiKey": "OPENROUTER_API_KEY",
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
}
}
```
- **Orchestrator** — An LLM that plans and issues high-level instructions
- **Executor** — The **Clado Action** visual model that takes screenshots and predicts click/type/scroll coordinates
### Orchestrator-Executor
The orchestrator works with **any LLM provider**. Pick whichever you have access to:
The orchestrator works with any LLM provider. The executor can be another LLM, or the **Clado action** visual model that takes screenshots and predicts click/type/scroll coordinates.
#### OpenAI orchestrator
```json
{
"agent": {
"type": "orchestrator-executor",
"orchestrator": {
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1"
"provider": "openai",
"model": "gpt-4o",
"apiKey": "OPENAI_API_KEY"
},
"executor": {
"provider": "clado-action",
"model": "Qwen3.5-35B-A3B-action-000159-merged",
"model": "qwen3-vl-30b-a3b-instruct",
"apiKey": "",
"baseUrl": "https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run"
"baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run"
}
}
},
"dataset": "../data/webvoyager_e2e_test.jsonl",
"output_dir": "../results/oe-clado-openai",
"num_workers": 3,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"headless": true
},
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1200000
}
```
## Graders
| Name | Description |
|------|-------------|
| `performance_grader` | Multi-axis grader running on Claude Agent SDK (uses its own credentials via `CLAUDE_CODE_OAUTH_TOKEN`) |
| `agisdk_state_diff` | AGI SDK / REAL Bench environment state-diff grader (deterministic) |
| `infinity_state` | WebArena-Infinity verifier-script grader (deterministic) |
Set `graders` in your config to override the per-task `graders` field from the dataset:
#### Anthropic orchestrator
```json
"graders": ["performance_grader"]
"orchestrator": {
"provider": "anthropic",
"model": "claude-sonnet-4-20250514",
"apiKey": "ANTHROPIC_API_KEY"
}
```
## Configuration reference
#### Google orchestrator
```json
"orchestrator": {
"provider": "google",
"model": "gemini-2.0-flash",
"apiKey": "GOOGLE_API_KEY"
}
```
#### Fireworks orchestrator (OpenAI-compatible)
```json
"orchestrator": {
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1"
}
```
The executor config stays the same across all orchestrator providers — it always uses the Clado action model.
### Other Agent Types
| Type | Description | Example config |
|------|-------------|----------------|
| `single` | Single LLM agent via Gemini CLI + MCP | `webvoyager-test.json` |
| `tool-loop` | AI SDK tool loop, connects via CDP | `tool-loop-test.json` |
| `gemini-computer-use` | Google native computer use API | `gemini-computer-use.json` |
| `yutori-navigator` | Yutori N1 visual model | `yutori-navigator.json` |
## Configuration Reference
### API keys
The `apiKey` field supports two formats:
- **Env var name**: `"OPENAI_API_KEY"` — resolved from `.env.development` at runtime
- **Direct value**: `"sk-xxxxx"` — used as-is (not recommended)
- **Direct value**: `"sk-xxxxx"` — used as-is (not recommended, prefer env vars)
### Supported providers
@@ -104,7 +160,7 @@ The `apiKey` field supports two formats:
| Anthropic | `anthropic` | No |
| Google | `google` | No |
| Azure OpenAI | `azure` | Yes |
| AWS Bedrock | `bedrock` | No |
| AWS Bedrock | `bedrock` | No (uses `region`, `accessKeyId`, `secretAccessKey`) |
| OpenRouter | `openrouter` | No |
| Fireworks, Together, etc. | `openai-compatible` | Yes |
| Ollama | `ollama` | No |
@@ -123,27 +179,34 @@ The `apiKey` field supports two formats:
}
```
Each worker gets its own Chrome instance. Worker N uses `base_port + N` for CDP and server ports.
Each worker gets its own Chrome instance. Worker N uses `base_port + N` for CDP and server ports. `base_extension_port` is still reserved as a legacy BrowserOS launch argument for compatibility with Chromium builds that still pass it.
### Execution settings
| Field | Description | Default |
|-------|-------------|---------|
| `num_workers` | Parallel workers (each gets its own Chrome) | `1` |
| `timeout_ms` | Per-task timeout in ms | `1800000` (30 min) |
| `timeout_ms` | Per-task timeout in ms | `900000` (15 min) |
| `restart_server_per_task` | Restart Chrome between tasks (cleaner state, slower) | `false` |
### Grading
Results are auto-graded after each task. The grader uses an LLM judge.
| Field | Description |
|-------|-------------|
| `grader_model` | Model for grading (e.g., `openai/gpt-4.1`) |
| `grader_api_key_env` | Env var name for grader API key |
| `grader_base_url` | API endpoint (e.g., `https://openrouter.ai/api/v1`) |
## Datasets
| File | Tasks | Description |
|------|-------|-------------|
| `webvoyager_e2e_test.jsonl` | 10 | WebVoyager test subset (quick smoke test) |
| `webvoyager.jsonl` | 643 | Full WebVoyager benchmark |
| `mind2web.jsonl` | 300 | Online-Mind2Web |
| `webbench-{0,1,2}of4-50.jsonl` | 50 each | WebBench shards (50-task subsets) |
| `agisdk-real.jsonl` | 40 | AGI SDK / REAL Bench (action-only tasks) |
| `webarena-infinity-hard-50.jsonl` | 50 | WebArena-Infinity hard set |
| `browsecomp-medium-hard-50.jsonl` | 50 | BrowseComp medium-hard |
| `browsecomp-very-hard-50.jsonl` | 50 | BrowseComp very-hard |
| `mind2web_e2e_test.jsonl` | 10 | Mind2Web test subset |
| `mind2web.jsonl` | 300 | Full Mind2Web benchmark |
Task format (JSONL, one per line):
@@ -152,7 +215,7 @@ Task format (JSONL, one per line):
"query_id": "Amazon--0",
"dataset": "webvoyager",
"query": "Search an Xbox Wireless controller with green color and rated above 4 stars.",
"graders": ["performance_grader"],
"graders": ["webvoyager_grader", "fara_combined"],
"start_url": "https://www.amazon.com/",
"metadata": { "original_task_id": "Amazon--0", "website": "Amazon" }
}
@@ -164,25 +227,24 @@ Results are saved to `output_dir`:
```
results/
browseros-agent-weekly/
2026-04-29-1430/
Amazon--0/
metadata.json # Task result, timing, grader scores
messages.jsonl # Full message log
screenshots/
001.png # Step-by-step screenshots
002.png
summary.json # Aggregate pass rates
oe-clado-openai/
Amazon--0/
metadata.json # Task result, timing, grader scores
messages.jsonl # Full message log
screenshots/
001.png # Step-by-step screenshots
002.png
summary.json # Aggregate pass rates
```
## Troubleshooting
**BrowserOS not found**: Expects `/Applications/BrowserOS.app/Contents/MacOS/BrowserOS`. Set `BROWSEROS_BINARY` to override.
**BrowserOS not found**: Expects `/Applications/BrowserOS.app/Contents/MacOS/BrowserOS`. Make sure it's installed.
**Port conflicts**: Each worker uses `base_port + workerIndex`. 3 workers on base 9110 → ports 9110, 9111, 9112. Stop other BrowserOS instances first.
**API key not resolving**: If your config has `"apiKey": "OPENAI_API_KEY"`, ensure the env var is set in `.env.development`.
**Tasks timing out**: Increase `timeout_ms`. Default is 30 minutes.
**Tasks timing out**: Increase `timeout_ms`. Default is 15 minutes; complex tasks may need 20+ minutes.
**Headless vs headed**: Set `"headless": false` to watch Chrome in real time.
**Headless vs headed**: Set `"headless": false` to watch Chrome in real-time. Useful for debugging.

View File

@@ -0,0 +1,18 @@
{
"agent": {
"type": "single",
"provider": "openrouter",
"model": "openai/gpt-4o",
"apiKey": "OPENROUTER_API_KEY"
},
"dataset": "data/webvoyager_e2e_test.jsonl",
"output_dir": "results",
"num_workers": 5,
"browseros": {
"server_url": "http://127.0.0.1:9110"
},
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 300000
}

View File

@@ -1,26 +0,0 @@
{
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1",
"supportsImages": true
},
"dataset": "../data/agisdk-real.jsonl",
"num_workers": 4,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["agisdk_state_diff"],
"timeout_ms": 1800000
}

View File

@@ -2,9 +2,9 @@
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "moonshotai/kimi-k2.5",
"apiKey": "OPENROUTER_API_KEY",
"baseUrl": "https://openrouter.ai/api/v1",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1",
"supportsImages": true
},
"dataset": "../data/webbench-2of4-50.jsonl",
@@ -22,5 +22,8 @@
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1800000
}

View File

@@ -29,5 +29,8 @@
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1800000
}

View File

@@ -9,12 +9,12 @@
},
"executor": {
"provider": "clado-action",
"model": "Qwen3.5-35B-A3B-action-000159-merged",
"model": "qwen3-vl-30b-a3b-instruct",
"apiKey": "",
"baseUrl": "https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run"
"baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run"
}
},
"dataset": "../data/agisdk-real.jsonl",
"dataset": "../data/webbench-2of4-50.jsonl",
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {
@@ -23,11 +23,14 @@
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": true
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["agisdk_state_diff"],
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1800000
}

View File

@@ -1,26 +0,0 @@
{
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "moonshotai/kimi-k2.5",
"apiKey": "OPENROUTER_API_KEY",
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
},
"dataset": "../data/webarena-infinity-hard-50.jsonl",
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["infinity_state"],
"timeout_ms": 1800000
}

View File

@@ -0,0 +1,30 @@
{
"agent": {
"type": "gemini-computer-use",
"apiKey": "GOOGLE_AI_API_KEY",
"screenSize": {
"width": 1440,
"height": 900
},
"turnLimit": 100
},
"dataset": "../data/test-set.jsonl",
"num_workers": 1,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1200000
}

View File

@@ -20,5 +20,8 @@
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 300000
}

View File

@@ -22,5 +22,8 @@
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1200000
}

View File

@@ -0,0 +1,30 @@
{
"agent": {
"type": "yutori-navigator",
"apiKey": "YUTORI_API_KEY",
"screenSize": {
"width": 1280,
"height": 800
},
"turnLimit": 100
},
"dataset": "../data/test-set.jsonl",
"num_workers": 1,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1200000
}

View File

@@ -1,36 +0,0 @@
{"query_id": "agisdk-dashdish-10", "dataset": "agisdk-real", "query": "Place an order from \"Souvla\" for a \"Medium Classic Cheeseburger\" and a \"Small Bacon Double Cheeseburger\" with \"Standard Delivery\" as the method with the default charged options.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-10", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
{"query_id": "agisdk-fly-unified-5", "dataset": "agisdk-real", "query": "Find me the cheapest fare for a flight from Orlando to Milwaukee on December 5th, 2024 and book it.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/30, Security Code: 420 Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-5", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "United Airlines"}}}
{"query_id": "agisdk-udriver-10", "dataset": "agisdk-real", "query": "Order me a ride for 4pm, I'll be at the de Young muesum headed to the Waterbar, fanciest option possible please.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-10", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
{"query_id": "agisdk-udriver-9", "dataset": "agisdk-real", "query": "Book me a ride from the thai restaurant I last took a ride to for later today at 2pm, I'll be at 333 Apartments on Fremont", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-9", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-9", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Uber"}}}
{"query_id": "agisdk-topwork-4", "dataset": "agisdk-real", "query": "Create a job post for a UI/UX Designer with expertise in Figma, Sketch, and Adobe Creative Suite, including project details, timeline, and required skills (Wireframing, Prototyping, Responsive Design).", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-4", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
{"query_id": "agisdk-gocalendar-4", "dataset": "agisdk-real", "query": "Change the \"Team Check-In\" event on July 18, 2024, name to \"Project Kickoff\" and update the location to \"Zoom\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-4", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
{"query_id": "agisdk-staynb-6", "dataset": "agisdk-real", "query": "Find and book the stay with the best value for money (cheapest stay with the best reviews) for 1 day. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-6", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-6", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
{"query_id": "agisdk-udriver-11", "dataset": "agisdk-real", "query": "I need to go from Pacific Catch on Chestnut back home to 333 Fremont now. If the fancy version is within ten dollars of the regular one, book that.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-11", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-11", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
{"query_id": "agisdk-networkin-5", "dataset": "agisdk-real", "query": "Send a connection request to John Smith.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-5", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-5", "challenge_type": "action", "difficulty": "easy", "similar_to": "LinkedIn"}}}
{"query_id": "agisdk-zilloft-6", "dataset": "agisdk-real", "query": "Select a property listed in San Francisco as \"Condos\" within a price range under $300,000 and request a tour for tomorrow at 4:00 PM. Use these contact details: Name: Sarah Brown, Email: sarahbrown@example.com, Phone: 555-987-6543.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-6", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "Zillow"}}}
{"query_id": "agisdk-topwork-2", "dataset": "agisdk-real", "query": "Create a job posting for a Backend Developer specializing in Python, Django, and Flask to develop a high-performance web application. Include project details such as required skills (PostgreSQL, Docker, AWS, CI/CD), estimated project timeline, and budget.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-2", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-2", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
{"query_id": "agisdk-gocalendar-3", "dataset": "agisdk-real", "query": "Delete the event titled \"Breakfast Meeting with Client\" scheduled for July 19, 2024", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-3", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "Google Calendar"}}}
{"query_id": "agisdk-topwork-3", "dataset": "agisdk-real", "query": "Create a job listing for a Full-Stack Developer with expertise in Java, Spring Boot, and Angular, outlining the project scope, estimated duration, and required skills (MySQL, Docker, Kubernetes, and Jenkins). The ideal candidate should have experience in enterprise-level applications and building scalable microservices. After creating the job post, please describe what you included in the job listing.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-3", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-3", "challenge_type": "retrieval", "difficulty": "medium", "similar_to": "Upwork"}}}
{"query_id": "agisdk-dashdish-7", "dataset": "agisdk-real", "query": "Select \"Express Delivery\" for an order from \"DragonEats\" of \"Mushroom Swiss Burger\" and complete the checkout with the pre-loaded Visa card.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-7", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-7", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
{"query_id": "agisdk-networkin-3", "dataset": "agisdk-real", "query": "Write a post inviting users to a networking event, including details about the event's purpose, date, and target audience.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-3", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-3", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
{"query_id": "agisdk-gomail-7", "dataset": "agisdk-real", "query": "Delete the email with the subject \"New Leadership Articles You Can't Miss\" from the Inbox.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-7", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-7", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Gmail"}}}
{"query_id": "agisdk-opendining-8", "dataset": "agisdk-real", "query": "Identify and book the restaurant with the lowest rating. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-8", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-8", "challenge_type": "retrieval-action", "difficulty": "easy", "similar_to": "OpenTable"}}}
{"query_id": "agisdk-udriver-1", "dataset": "agisdk-real", "query": "Book a ride from Fitness Urbano to Pacific Cafe", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-1", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-1", "challenge_type": "action", "difficulty": "easy", "similar_to": "Uber"}}}
{"query_id": "agisdk-staynb-2", "dataset": "agisdk-real", "query": "Click on one of the stays displayed on the homepage and book it for a family of 4 (2 adults and 2 children). For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-2", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Airbnb"}}}
{"query_id": "agisdk-opendining-10", "dataset": "agisdk-real", "query": "Check the menus of all restaurants for vegetarian options and make a reservation at the one with the most vegetarian choices. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-10", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-10", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "OpenTable"}}}
{"query_id": "agisdk-opendining-4", "dataset": "agisdk-real", "query": "Use the search bar to search for a restaurant on September 2nd at 4:30 PM for 7 people, using \"Japanese\" as the search term, and book the first result. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-4", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-4", "challenge_type": "action", "difficulty": "hard", "similar_to": "OpenTable"}}}
{"query_id": "agisdk-dashdish-4", "dataset": "agisdk-real", "query": "Schedule a delivery order from \"Taco Bell\" adding a \"Classic Cheeseburger\" large size for later and add the note \"Leave at the front door\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-4", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Doordash"}}}
{"query_id": "agisdk-networkin-1", "dataset": "agisdk-real", "query": "Create a new text post for the feed with a professional update about AI trends in 2025, mentioning three key advancements and their impact on the job market.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-1", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
{"query_id": "agisdk-dashdish-5", "dataset": "agisdk-real", "query": "Add three \"Loaded Bacon Cheese Fries\" to the shopping cart from \"Man vs. Fries\". Proceed to checkout and select \"Pickup\" as the delivery method.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-5", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Doordash"}}}
{"query_id": "agisdk-opendining-5", "dataset": "agisdk-real", "query": "Scroll through the homepage carousel until \"Ocean Breeze\" is visible, select the second available time slot, and complete the reservation. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-5", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-5", "challenge_type": "action", "difficulty": "medium", "similar_to": "OpenTable"}}}
{"query_id": "agisdk-gocalendar-1", "dataset": "agisdk-real", "query": "Create a new event titled \"Team Meeting\" on July 19, 2024, from 2 PM to 2:30 PM, and include \"Conference Room A\" as the location", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-1", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
{"query_id": "agisdk-gomail-5", "dataset": "agisdk-real", "query": "Schedule an email to jane.doe@example.com with the subject \"Weekly Update\" to be sent next Monday at 9:00 AM.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-5", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Gmail"}}}
{"query_id": "agisdk-staynb-4", "dataset": "agisdk-real", "query": "Book a stay for 2 children with 1 adult. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-4", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Airbnb"}}}
{"query_id": "agisdk-dashdish-2", "dataset": "agisdk-real", "query": "Add a \"Medium Pepperoni Pizza\" from the restaurant \"Papa Johns Pizza\" to the shopping cart and purchase it.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-2", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Doordash"}}}
{"query_id": "agisdk-staynb-8", "dataset": "agisdk-real", "query": "Scroll through the homepage and book the last stay located in Paris.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-8", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-8", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
{"query_id": "agisdk-gomail-2", "dataset": "agisdk-real", "query": "Mark the first email in the Inbox as \"read\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-2", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Gmail"}}}
{"query_id": "agisdk-networkin-10", "dataset": "agisdk-real", "query": "Generate a polite follow-up message for a previous unanswered chat, starting with \"Following up on\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-10", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-10", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
{"query_id": "agisdk-gomail-3", "dataset": "agisdk-real", "query": "Compose a new email to jonathan.smith@example.com with the subject \"Meeting Notes\" and body \"Please find the meeting notes attached.\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-3", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "Gmail"}}}
{"query_id": "agisdk-udriver-6", "dataset": "agisdk-real", "query": "Me and 4 friends need a ride from the Palace Hotel to dinner at Osha Thai leaving now", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-6", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-6", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
{"query_id": "agisdk-zilloft-3", "dataset": "agisdk-real", "query": "Find a home in San Diego priced under $150,000 with at least 2 bedrooms and request a tour. Use these details: Contact Name: John Doe, Email: johndoe@example.com, Phone: 555-123-4567, Tour Time: 2:00 PM, Tour Date: First available.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-3", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-3", "challenge_type": "retrieval-action", "difficulty": "easy", "similar_to": "Zillow"}}}
{"query_id": "agisdk-fly-unified-6", "dataset": "agisdk-real", "query": "Reserve me a seat for the flight from Austin to Pittsburgh departing on December 11th, 2024 at 8:00 in Basic Economy.\nPassenger: Alice Brown\nDate of Birth: 05/20/1992\nSex: Female\nSeat Selection: Yes (Aisle seat)\nPayment: Credit Card (378342143523967), Exp: 09/27, security code: 332 Address: 789 Pine St, Los Angeles, CA, 90012, USA, Phone: 555-456-7890, Email: alicebrown@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-6", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "United Airlines"}}}

View File

@@ -0,0 +1,5 @@
{"query_id": "CoordClick--1", "dataset": "coordinate-click", "query": "Click on circle A located at the top-left corner of the page.", "graders": ["webvoyager_grader"], "start_url": "http://localhost:3100", "metadata": {"original_task_id": "CoordClick--1", "website": "eval-target", "category": "coordinate-prediction", "additional": {"ground_truth": "Circle A is clicked and shows data-clicked=true", "answer_type": "golden"}}}
{"query_id": "CoordClick--2", "dataset": "coordinate-click", "query": "Click on circle B located at the top-right corner of the page.", "graders": ["webvoyager_grader"], "start_url": "http://localhost:3100", "metadata": {"original_task_id": "CoordClick--2", "website": "eval-target", "category": "coordinate-prediction", "additional": {"ground_truth": "Circle B is clicked and shows data-clicked=true", "answer_type": "golden"}}}
{"query_id": "CoordClick--3", "dataset": "coordinate-click", "query": "Click on circle C located at the bottom-left corner of the page.", "graders": ["webvoyager_grader"], "start_url": "http://localhost:3100", "metadata": {"original_task_id": "CoordClick--3", "website": "eval-target", "category": "coordinate-prediction", "additional": {"ground_truth": "Circle C is clicked and shows data-clicked=true", "answer_type": "golden"}}}
{"query_id": "CoordClick--4", "dataset": "coordinate-click", "query": "Click on circle D located at the bottom-right corner of the page.", "graders": ["webvoyager_grader"], "start_url": "http://localhost:3100", "metadata": {"original_task_id": "CoordClick--4", "website": "eval-target", "category": "coordinate-prediction", "additional": {"ground_truth": "Circle D is clicked and shows data-clicked=true", "answer_type": "golden"}}}
{"query_id": "CoordClick--5", "dataset": "coordinate-click", "query": "Click on all four circles A, B, C, and D on the page.", "graders": ["webvoyager_grader"], "start_url": "http://localhost:3100", "metadata": {"original_task_id": "CoordClick--5", "website": "eval-target", "category": "coordinate-prediction", "additional": {"ground_truth": "All four circles are clicked and page shows ALL TARGETS HIT", "answer_type": "golden"}}}

View File

@@ -1,50 +0,0 @@
{"query_id": "infinity-elation-prescriptions-task_h69", "dataset": "webarena-infinity", "query": "Approve all pending refill requests except for any medication that is involved in a major drug-drug interaction with another of the patient's active medications. Deny those with the reason 'Drug interaction \u2014 needs provider review before renewal'.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h69", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h69.py", "app_base_port": 8020}}}
{"query_id": "infinity-elation-clinical-records-task_h52", "dataset": "webarena-infinity", "query": "Add the document tag 'Provider-Reviewed' to every visit note template that was created by the current logged-in provider. Do not modify templates created by other providers.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h52", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h52.py", "app_base_port": 8000}}}
{"query_id": "infinity-gmail-accounts-and-contacts-task_h44", "dataset": "webarena-infinity", "query": "Your sister's husband is one of your contacts. Find him, star his entry, and add the Friends label.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h44", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h44.py", "app_base_port": 8070}}}
{"query_id": "infinity-gmail-task_h2", "dataset": "webarena-infinity", "query": "Update the Datadog alerts filter to also archive matching emails and forward them to priya.sharma@cloudnine.dev instead of nate.patel@devops.tools.", "graders": ["infinity_state"], "start_url": "http://localhost:8060", "metadata": {"original_task_id": "gmail-task_h2", "website": "gmail", "category": "webarena-infinity", "additional": {"app_name": "gmail", "difficulty": "hard", "verifier_path": "real-tasks/task_h2.py", "app_base_port": 8060}}}
{"query_id": "infinity-gitlab-plan-and-track-task_h58", "dataset": "webarena-infinity", "query": "The Performance Initiative epic has two child epics. For the child epic with more open issues, set the weight of every issue in it to 13. For the other child epic, close all its open issues.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h58", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h58.py", "app_base_port": 8050}}}
{"query_id": "infinity-figma-slides-task_h46", "dataset": "webarena-infinity", "query": "There are two slides with tables in the deck. Lock the table that compares competitors, and change the font size to 16 on the table that tracks quarterly feature adoption.", "graders": ["infinity_state"], "start_url": "http://localhost:8030", "metadata": {"original_task_id": "figma-slides-task_h46", "website": "figma-slides", "category": "webarena-infinity", "additional": {"app_name": "figma-slides", "difficulty": "hard", "verifier_path": "real-tasks/task_h46.py", "app_base_port": 8030}}}
{"query_id": "infinity-elation-prescriptions-task_h50", "dataset": "webarena-infinity", "query": "Deny the pending refill for the patient's cholesterol medication because his lipid panel is overdue. Then deny the Lisinopril refill as well \u2014 he needs a follow-up blood pressure check first.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h50", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h50.py", "app_base_port": 8020}}}
{"query_id": "infinity-elation-prescriptions-task_h19", "dataset": "webarena-infinity", "query": "Discontinue the Omeprazole and prescribe Famotidine 20mg tablet twice daily as a replacement for GERD \u2014 qty 60, 3 refills, send to CVS #4521.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h19", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h19.py", "app_base_port": 8020}}}
{"query_id": "infinity-paypal-my-wallet-task_h25", "dataset": "webarena-infinity", "query": "Convert all of my Australian dollars to euros.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h25", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h25.py", "app_base_port": 8100}}}
{"query_id": "infinity-elation-clinical-records-task_h66", "dataset": "webarena-infinity", "query": "Create a new template called 'Anxiety Management' with HPI and Assessment sections, and billing code 99213 with description 'Office visit, established, low complexity'. Then create a visit note for Emily Nakamura using that new template and the Telehealth category, add a Psychological Status block to the note, and sign it.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h66", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h66.py", "app_base_port": 8000}}}
{"query_id": "infinity-elation-clinical-records-task_h62", "dataset": "webarena-infinity", "query": "Look up which template is assigned to the COVID Vaccine appointment type. Remove all its existing document tags and replace them with the single tag 'COVID-Protocol'. Then also assign that same template to the Urgent Same-Day appointment type.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h62", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h62.py", "app_base_port": 8000}}}
{"query_id": "infinity-elation-prescriptions-task_h32", "dataset": "webarena-infinity", "query": "The patient has a medication that's being dispensed as written (brand name only). Discontinue that prescription and replace it with a new one \u2014 same medication, same sig, same pharmacy \u2014 but allow generic substitution this time. Qty 30, 3 refills, 30 days supply.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h32", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h32.py", "app_base_port": 8020}}}
{"query_id": "infinity-gitlab-plan-and-track-task_h48", "dataset": "webarena-infinity", "query": "Add the 'breaking-change' label to every open issue in the API v3 Migration epic and remove any existing workflow-scoped labels from those issues.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h48", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h48.py", "app_base_port": 8050}}}
{"query_id": "infinity-gitlab-plan-and-track-task_h77", "dataset": "webarena-infinity", "query": "Rename the 'UX' label to 'user-experience', change its type to 'group', and then add it to every open issue in the Frontend Modernization epic that doesn't already have it.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h77", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h77.py", "app_base_port": 8050}}}
{"query_id": "infinity-xero-invoicing-task_h15", "dataset": "webarena-infinity", "query": "Create a new invoice for Summit Health Group for an annual software license and 12 months of support with a 10% discount on support.", "graders": ["infinity_state"], "start_url": "http://localhost:8120", "metadata": {"original_task_id": "xero-invoicing-task_h15", "website": "xero-invoicing", "category": "webarena-infinity", "additional": {"app_name": "xero-invoicing", "difficulty": "hard", "verifier_path": "real-tasks/task_h15.py", "app_base_port": 8120}}}
{"query_id": "infinity-elation-clinical-records-task_h55", "dataset": "webarena-infinity", "query": "Resolve every problem across all patients in the system that currently has a status of Controlled.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h55", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h55.py", "app_base_port": 8000}}}
{"query_id": "infinity-gitlab-plan-and-track-task_h8", "dataset": "webarena-infinity", "query": "Create a confidential issue titled 'Emergency security patch' with priority::critical and the 'security' label, assigned to James O'Brien and Oliver Schmidt, with weight 2 in the Security Hardening milestone.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h8", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h8.py", "app_base_port": 8050}}}
{"query_id": "infinity-paypal-my-wallet-task_h20", "dataset": "webarena-infinity", "query": "Make a $200 payment on PayPal Credit and change autopay to pay the full balance.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h20", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h20.py", "app_base_port": 8100}}}
{"query_id": "infinity-gitlab-plan-and-track-task_h52", "dataset": "webarena-infinity", "query": "Create a new board called 'Performance Tracker' with lists for the priority::critical, priority::high, and priority::medium labels. Then add the 'priority::high' label to every open issue in the v4.1 milestone that has the 'performance' label.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h52", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h52.py", "app_base_port": 8050}}}
{"query_id": "infinity-paypal-my-wallet-task_h80", "dataset": "webarena-infinity", "query": "Save all available Food & Drink offers, buy a $25 DoorDash gift card for yourself, and switch currency conversion to use my card issuer.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h80", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h80.py", "app_base_port": 8100}}}
{"query_id": "infinity-gmail-accounts-and-contacts-task_h50", "dataset": "webarena-infinity", "query": "Add the Emergency label to every contact who is currently listed as a delegate (active, pending, or expired). Then remove all delegates whose status is not 'active'.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h50", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h50.py", "app_base_port": 8070}}}
{"query_id": "infinity-elation-clinical-records-task_h14", "dataset": "webarena-infinity", "query": "Add the tag 'Flu-Season' to every patient whose primary provider is Dr. Sarah Chen.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h14", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h14.py", "app_base_port": 8000}}}
{"query_id": "infinity-figma-text-and-typography-task_h7", "dataset": "webarena-infinity", "query": "Remove all list formatting from every layer.", "graders": ["infinity_state"], "start_url": "http://localhost:8040", "metadata": {"original_task_id": "figma-text-and-typography-task_h7", "website": "figma-text-and-typography", "category": "webarena-infinity", "additional": {"app_name": "figma-text-and-typography", "difficulty": "hard", "verifier_path": "real-tasks/task_h7.py", "app_base_port": 8040}}}
{"query_id": "infinity-paypal-my-wallet-task_h26", "dataset": "webarena-infinity", "query": "Send a $50 Amazon gift card to sarah.chen@email.com with 'Thank you!' as the message, and save the Amazon cashback offer.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h26", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h26.py", "app_base_port": 8100}}}
{"query_id": "infinity-handshake-career-exploration-task_h97", "dataset": "webarena-infinity", "query": "Find the single most helpful answer across all Q&A questions and mark it helpful. Then find the most-viewed question and submit your own answer to it.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h97", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h97.py", "app_base_port": 8080}}}
{"query_id": "infinity-figma-slides-task_h79", "dataset": "webarena-infinity", "query": "In the adoption table, find the feature with the highest Target Q4 percentage. In the competitive table, change DesignCraft's entry for that same feature to 'Market Leader'. Then update that feature's Target Q4 to '95%'.", "graders": ["infinity_state"], "start_url": "http://localhost:8030", "metadata": {"original_task_id": "figma-slides-task_h79", "website": "figma-slides", "category": "webarena-infinity", "additional": {"app_name": "figma-slides", "difficulty": "hard", "verifier_path": "real-tasks/task_h79.py", "app_base_port": 8030}}}
{"query_id": "infinity-gitlab-plan-and-track-task_h41", "dataset": "webarena-infinity", "query": "For every open issue in the v4.2 - Security Hardening milestone: if it is already confidential, set its health status to 'at risk'. If it is not confidential, make it confidential and set its health status to 'needs attention'.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h41", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h41.py", "app_base_port": 8050}}}
{"query_id": "infinity-handshake-career-exploration-task_h90", "dataset": "webarena-infinity", "query": "A student in the feed mentioned attending the NSBE conference. That student also answered a Q&A question about diversity programs in tech. Submit your own answer to that same question sharing your experience, then bookmark that student's feed post.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h90", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h90.py", "app_base_port": 8080}}}
{"query_id": "infinity-elation-prescriptions-task_h30", "dataset": "webarena-infinity", "query": "The patient has three temporary medications. Discontinue the corticosteroid taper and the penicillin antibiotic \u2014 the patient completed both courses. Move the remaining temporary medication to permanent Rx.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h30", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h30.py", "app_base_port": 8020}}}
{"query_id": "infinity-linear-account-settings-task_h19", "dataset": "webarena-infinity", "query": "Turn off all desktop application settings: open in desktop app, notification badge, and spell check.", "graders": ["infinity_state"], "start_url": "http://localhost:8090", "metadata": {"original_task_id": "linear-account-settings-task_h19", "website": "linear-account-settings", "category": "webarena-infinity", "additional": {"app_name": "linear-account-settings", "difficulty": "hard", "verifier_path": "real-tasks/task_h19.py", "app_base_port": 8090}}}
{"query_id": "infinity-elation-prescriptions-task_h39", "dataset": "webarena-infinity", "query": "Change the default pharmacy to Express Scripts Mail Pharmacy for mail-order prescriptions. Then document that the patient takes Magnesium Citrate 400mg tablet as an OTC supplement \u2014 once daily at bedtime, 30-day supply.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h39", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h39.py", "app_base_port": 8020}}}
{"query_id": "infinity-handshake-career-exploration-task_h136", "dataset": "webarena-infinity", "query": "Your earliest completed appointment was a specific type. Schedule a follow-up appointment of the same category and type with the same staff member, for March 28, 2026 at 9:00 AM, in person.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h136", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h136.py", "app_base_port": 8080}}}
{"query_id": "infinity-handshake-career-exploration-task_h105", "dataset": "webarena-infinity", "query": "Find the second-most-viewed question in Q&A. It has two answers \u2014 mark the one with fewer helpful votes as helpful.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h105", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h105.py", "app_base_port": 8080}}}
{"query_id": "infinity-gmail-accounts-and-contacts-task_h22", "dataset": "webarena-infinity", "query": "The Engineering Manager at TechCorp is listed as one of your delegates. Remove her delegation and unstar her contact.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h22", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h22.py", "app_base_port": 8070}}}
{"query_id": "infinity-elation-patient-communication-task_h9", "dataset": "webarena-infinity", "query": "Acknowledge all unacknowledged reminders in the system.", "graders": ["infinity_state"], "start_url": "http://localhost:8010", "metadata": {"original_task_id": "elation-patient-communication-task_h9", "website": "elation-patient-communication", "category": "webarena-infinity", "additional": {"app_name": "elation-patient-communication", "difficulty": "hard", "verifier_path": "real-tasks/task_h9.py", "app_base_port": 8010}}}
{"query_id": "infinity-superhuman-general-task_h1", "dataset": "webarena-infinity", "query": "Label the FinancePlus partnership email and the QuantumLab prototype email as 'Clients'.", "graders": ["infinity_state"], "start_url": "http://localhost:8110", "metadata": {"original_task_id": "superhuman-general-task_h1", "website": "superhuman-general", "category": "webarena-infinity", "additional": {"app_name": "superhuman-general", "difficulty": "hard", "verifier_path": "real-tasks/task_h1.py", "app_base_port": 8110}}}
{"query_id": "infinity-xero-invoicing-task_h79", "dataset": "webarena-infinity", "query": "Change the invoice prefix to 'AUS-' and the next number to 100, then create a new invoice for CloudNine Analytics for 8 hours of UI/UX design work.", "graders": ["infinity_state"], "start_url": "http://localhost:8120", "metadata": {"original_task_id": "xero-invoicing-task_h79", "website": "xero-invoicing", "category": "webarena-infinity", "additional": {"app_name": "xero-invoicing", "difficulty": "hard", "verifier_path": "real-tasks/task_h79.py", "app_base_port": 8120}}}
{"query_id": "infinity-figma-slides-task_h16", "dataset": "webarena-infinity", "query": "Enable slide numbers on every slide using the 'with total' format and change the aspect ratio to 4:3.", "graders": ["infinity_state"], "start_url": "http://localhost:8030", "metadata": {"original_task_id": "figma-slides-task_h16", "website": "figma-slides", "category": "webarena-infinity", "additional": {"app_name": "figma-slides", "difficulty": "hard", "verifier_path": "real-tasks/task_h16.py", "app_base_port": 8030}}}
{"query_id": "infinity-linear-account-settings-task_h16", "dataset": "webarena-infinity", "query": "Revoke all API keys that have an expiration date.", "graders": ["infinity_state"], "start_url": "http://localhost:8090", "metadata": {"original_task_id": "linear-account-settings-task_h16", "website": "linear-account-settings", "category": "webarena-infinity", "additional": {"app_name": "linear-account-settings", "difficulty": "hard", "verifier_path": "real-tasks/task_h16.py", "app_base_port": 8090}}}
{"query_id": "infinity-elation-prescriptions-task_h2", "dataset": "webarena-infinity", "query": "Prescribe Buspirone 10mg for the patient's anxiety \u2014 once daily in the morning, qty 30, 5 refills. Send it to the same pharmacy that fills his Sertraline.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h2", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h2.py", "app_base_port": 8020}}}
{"query_id": "infinity-handshake-career-exploration-task_h1", "dataset": "webarena-infinity", "query": "Follow all consulting firms on Handshake.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h1", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h1.py", "app_base_port": 8080}}}
{"query_id": "infinity-handshake-career-exploration-task_h141", "dataset": "webarena-infinity", "query": "Some of your saved jobs are from employers you haven't followed yet. Find and follow each of those employers.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h141", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h141.py", "app_base_port": 8080}}}
{"query_id": "infinity-figma-text-and-typography-task_h74", "dataset": "webarena-infinity", "query": "Set the spelling language to Japanese, the big nudge amount to 50, and the default horizontal alignment to right.", "graders": ["infinity_state"], "start_url": "http://localhost:8040", "metadata": {"original_task_id": "figma-text-and-typography-task_h74", "website": "figma-text-and-typography", "category": "webarena-infinity", "additional": {"app_name": "figma-text-and-typography", "difficulty": "hard", "verifier_path": "real-tasks/task_h74.py", "app_base_port": 8040}}}
{"query_id": "infinity-elation-patient-communication-task_h63", "dataset": "webarena-infinity", "query": "Check the visit summaries to find the patient whose BNP level improved. Reply to their most recent message confirming they can resume light activity, then update their emergency contact's phone number to (650) 555-0001.", "graders": ["infinity_state"], "start_url": "http://localhost:8010", "metadata": {"original_task_id": "elation-patient-communication-task_h63", "website": "elation-patient-communication", "category": "webarena-infinity", "additional": {"app_name": "elation-patient-communication", "difficulty": "hard", "verifier_path": "real-tasks/task_h63.py", "app_base_port": 8010}}}
{"query_id": "infinity-elation-patient-communication-task_h14", "dataset": "webarena-infinity", "query": "Change Dr. Torres's notification timeframe to 'Do not notify me' and remove Dr. Torres from Dr. Chen's General Question routing.", "graders": ["infinity_state"], "start_url": "http://localhost:8010", "metadata": {"original_task_id": "elation-patient-communication-task_h14", "website": "elation-patient-communication", "category": "webarena-infinity", "additional": {"app_name": "elation-patient-communication", "difficulty": "hard", "verifier_path": "real-tasks/task_h14.py", "app_base_port": 8010}}}
{"query_id": "infinity-gitlab-plan-and-track-task_h67", "dataset": "webarena-infinity", "query": "Delete all time entries from the GraphQL gateway issue, add a single new entry of 16 hours with summary 'Complete rewrite estimate', and set its time estimate to 40 hours.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h67", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h67.py", "app_base_port": 8050}}}
{"query_id": "infinity-gmail-accounts-and-contacts-task_h73", "dataset": "webarena-infinity", "query": "Among the individual people in your other contacts (those with a first and last name), find the one who was saved most recently. Move them to your main contacts, set their company to 'Salesforce', job title to 'Account Executive', and add the Work label.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h73", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h73.py", "app_base_port": 8070}}}
{"query_id": "infinity-elation-prescriptions-task_h4", "dataset": "webarena-infinity", "query": "Run a medication reconciliation and mark the Calcium+D3 supplement for discontinuation during the review.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h4", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h4.py", "app_base_port": 8020}}}
{"query_id": "infinity-elation-prescriptions-task_h47", "dataset": "webarena-infinity", "query": "The patient's SSRI is currently dispensed at a different pharmacy than most of his other medications. Prescribe a refill of the same SSRI at the same dose and sig, but send it to CVS #4521 instead \u2014 qty 30, 5 refills, 30 days supply.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h47", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h47.py", "app_base_port": 8020}}}
{"query_id": "infinity-paypal-my-wallet-task_h89", "dataset": "webarena-infinity", "query": "If your USD PayPal balance is above $2,500, convert $500 to Japanese Yen. If it is $2,500 or below, first add $500 from your Chase bank account, then convert $500 to JPY. Either way, set the debit card cash back category to Fuel.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h89", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h89.py", "app_base_port": 8100}}}

View File

@@ -0,0 +1,147 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Coordinate Click Test</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
width: 100vw;
height: 100vh;
overflow: hidden;
background: #1a1a2e;
font-family: system-ui, -apple-system, sans-serif;
}
.circle {
position: fixed;
border-radius: 50%;
background: #e94560;
display: flex;
align-items: center;
justify-content: center;
color: #fff;
font-weight: 700;
cursor: pointer;
user-select: none;
transition: background 0.2s, transform 0.15s;
clip-path: circle(50%);
}
.circle:hover { transform: scale(1.08); }
.circle[data-clicked="true"] {
background: #0f3460;
pointer-events: none;
}
/* A — top-left area, large */
.circle-a {
width: 80px;
height: 80px;
font-size: 24px;
top: 15%;
left: 10%;
}
/* B — right side, upper-middle, medium */
.circle-b {
width: 50px;
height: 50px;
font-size: 18px;
top: 30%;
right: 18%;
}
/* C — center-left, lower area, small */
.circle-c {
width: 30px;
height: 30px;
font-size: 13px;
bottom: 25%;
left: 35%;
}
/* D — bottom-right area, very small */
.circle-d {
width: 16px;
height: 16px;
font-size: 9px;
bottom: 12%;
right: 30%;
}
#status {
position: fixed;
top: 50%;
left: 50%;
transform: translate(-50%, -50%);
color: #eee;
font-size: 16px;
text-align: center;
pointer-events: none;
}
#status .count {
font-size: 48px;
font-weight: 700;
color: #0f3460;
}
.success-flash {
animation: flash 0.4s ease-out;
}
@keyframes flash {
0% { background: #16c79a; transform: scale(1.3); }
100% { background: #0f3460; transform: scale(1); }
}
</style>
</head>
<body>
<div id="circle-a" class="circle circle-a" data-target="A" data-clicked="false">A</div>
<div id="circle-b" class="circle circle-b" data-target="B" data-clicked="false">B</div>
<div id="circle-c" class="circle circle-c" data-target="C" data-clicked="false">C</div>
<div id="circle-d" class="circle circle-d" data-target="D" data-clicked="false">D</div>
<div id="status">
<div class="count" id="clicked-count">0</div>
<div>of 4 targets clicked</div>
</div>
<script>
const circles = document.querySelectorAll('.circle')
const countEl = document.getElementById('clicked-count')
let clicked = 0
circles.forEach(circle => {
circle.addEventListener('click', (e) => {
if (circle.dataset.clicked === 'true') return
const rect = circle.getBoundingClientRect()
const centerX = rect.left + rect.width / 2
const centerY = rect.top + rect.height / 2
const radius = rect.width / 2
const dx = e.clientX - centerX
const dy = e.clientY - centerY
if (dx * dx + dy * dy > radius * radius) return
circle.dataset.clicked = 'true'
circle.classList.add('success-flash')
clicked++
countEl.textContent = clicked
if (clicked === 4) {
document.getElementById('status').innerHTML =
'<div class="count" style="color:#16c79a">ALL TARGETS HIT</div>' +
'<div>4 of 4 targets clicked</div>'
document.body.dataset.allClicked = 'true'
}
})
})
</script>
</body>
</html>

View File

@@ -0,0 +1,16 @@
const server = Bun.serve({
port: 3100,
async fetch(req) {
const url = new URL(req.url)
const path = url.pathname === '/' ? '/index.html' : url.pathname
const file = Bun.file(import.meta.dir + path)
if (await file.exists()) {
return new Response(file)
}
return new Response('Not Found', { status: 404 })
},
})
console.log(`Coordinate click test running at http://localhost:${server.port}`)

View File

@@ -1,184 +0,0 @@
#!/usr/bin/env python3
"""
AGI SDK evaluation helper for BrowserOS eval framework.
Reads JSON from stdin with task_id and env_state, runs the agisdk
evaluator, and outputs the result as JSON to stdout.
Input format:
{"task_id": "dashdish-1", "env_state": {...}, "model_response": ""}
Output format:
{"reward": 0.0, "pass": false, "message": "...", "per_criterion": [...]}
Lenient string matching is enabled by default: a failed criterion where
expected_value is a clean substring of actual_value (both strings) is
re-marked as a softened pass. This handles AGISDK tasks where the model
adds harmless decoration to a title or note (e.g. topwork-3, topwork-4).
Set AGISDK_STRICT_STRINGS=1 to disable and recover the strict score.
"""
import json
import os
import sys
_STRICT = os.environ.get("AGISDK_STRICT_STRINGS", "").lower() in ("1", "true", "yes")
def _soft_string_match(detail: object) -> bool:
"""Return True iff `detail` is `{actual_value, expected_value}` with both
strings and a non-empty `expected_value` that is contained in `actual_value`
(case-insensitive). Otherwise False — the criterion stays failed.
"""
if not isinstance(detail, dict):
return False
actual = detail.get("actual_value")
expected = detail.get("expected_value")
if not isinstance(actual, str) or not isinstance(expected, str):
return False
expected_stripped = expected.strip()
if not expected_stripped:
return False
return expected_stripped.lower() in actual.lower()
def main():
data = json.loads(sys.stdin.read())
task_id = data["task_id"]
env_state = data["env_state"]
model_response = data.get("model_response", "")
try:
from agisdk.REAL.browsergym.webclones.evaluate import WebCloneEvaluator
from agisdk.REAL.browsergym.webclones.task_config import TaskConfig
except ImportError:
print(
json.dumps(
{
"reward": 0,
"pass": False,
"message": "agisdk package not installed. Run: pip install agisdk",
"per_criterion": [],
}
)
)
sys.exit(0)
try:
# Redirect stdout to stderr during evaluation — agisdk's rich logger
# prints directly to stdout, which would corrupt our JSON output
real_stdout = sys.stdout
sys.stdout = sys.stderr
tc = TaskConfig(task_id)
evaluator = WebCloneEvaluator(tc)
reward_val, _done, message, info = evaluator.evaluate(
env_state=env_state, model_response=model_response
)
sys.stdout = real_stdout
reward_val = float(reward_val) if reward_val is not None else 0.0
results = info.get("results", [])
# `info["results"]` aligns 1:1 with `tc.task.evals` — zip them so we can
# surface the human-readable description and JMESPath query alongside
# the pass/fail. Without this the only feedback was a stringified dict.
evals = list(getattr(tc.task, "evals", []))
per_criterion = []
softened_count = 0
for idx, r in enumerate(results):
passed = bool(r[0])
detail = r[1] if len(r) > 1 else {}
ev = evals[idx] if idx < len(evals) else None
actual_value = expected_value = None
if isinstance(detail, dict):
actual_value = detail.get("actual_value")
expected_value = detail.get("expected_value")
entry: dict = {
"passed": passed,
"description": getattr(ev, "description", "") or "",
"query": getattr(ev, "query", "") or "",
"expected_value": expected_value,
"actual_value": actual_value,
}
if not _STRICT and not passed and _soft_string_match(detail):
entry["passed"] = True
entry["softened"] = True
softened_count += 1
per_criterion.append(entry)
# Recompute pass/reward after softening: if every criterion now passes,
# the task counts as a soft pass.
all_pass = all(c["passed"] for c in per_criterion) and bool(per_criterion)
if all_pass and reward_val != 1.0:
reward_val = 1.0
# Build a useful message: list every criterion with a pass/fail icon
# so the viewer's grader pill shows the full check-list, not just
# failures. This becomes the `reasoning` shown in the viewer.
if not per_criterion:
# Defensive: agisdk returned no criteria — fall back to its message.
out_message = str(message)
else:
failures = [c for c in per_criterion if not c["passed"]]
if all_pass:
header = (
f"All {len(per_criterion)} criteria passed"
+ (
f" ({softened_count} softened)."
if softened_count
else "."
)
)
else:
header = (
f"{len(failures)} of {len(per_criterion)} criteria failed:"
)
lines = []
for c in per_criterion:
icon = "" if c["passed"] else ""
desc = c["description"] or c["query"] or "<unknown>"
soft = " (softened)" if c.get("softened") else ""
if c["passed"]:
lines.append(f"{icon} {desc}{soft}")
else:
exp_s = repr(c["expected_value"])
act_s = repr(c["actual_value"])
lines.append(
f"{icon} {desc}: expected {exp_s}, got {act_s}"
)
out_message = header + "\n" + "\n".join(lines)
print(
json.dumps(
{
"reward": reward_val,
"pass": reward_val == 1.0,
"message": out_message,
"per_criterion": per_criterion,
}
)
)
except Exception as e:
sys.stdout = real_stdout if "real_stdout" in dir() else sys.__stdout__
print(
json.dumps(
{
"reward": 0,
"pass": False,
"message": f"Evaluation error: {str(e)}",
"per_criterion": [],
}
)
)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,93 @@
"""
Analyze how many WebBench tasks require authentication across ALL buckets.
Usage: python3 apps/eval/scripts/analyze-webbench-auth.py
"""
import json
import re
from collections import defaultdict
# Login/auth indicators in task text
AUTH_KEYWORDS = [
"log in", "login", "sign in", "signin", "sign up", "signup",
"your account", "your profile", "your wishlist", "your order",
"your cart", "your dashboard", "your settings", "your subscription",
"your inbox", "your message", "your review", "your playlist",
"your favorites", "your saved", "your history", "your list",
"your address", "your payment", "your booking", "your reservation",
"my account", "my profile", "my wishlist", "my order", "my cart",
"my dashboard", "my settings", "my subscription", "my inbox",
"my message", "my review", "my playlist", "my favorites",
"my saved", "my history", "my list", "my address", "my payment",
"my booking", "my reservation", "my bag",
"send a message", "post a comment", "write a review", "submit a review",
"leave a review", "publish", "upload a", "create a playlist",
"add to cart", "add to bag", "add to wishlist", "add to favorites",
"save to", "bookmark", "subscribe", "unsubscribe",
"delete your", "remove your", "delete my", "remove my",
"edit your", "edit my", "update your", "update my",
"change your", "change my", "modify your", "modify my",
]
# Categories that almost always need auth
WRITE_CATEGORIES = {"CREATE", "UPDATE", "DELETE"}
def needs_auth(task_text, category):
task_lower = task_text.lower()
# Check keywords
for kw in AUTH_KEYWORDS:
if kw in task_lower:
return True, f"keyword: '{kw}'"
# WRITE tasks that don't match keywords but still likely need auth
# (be conservative — some CREATE tasks like "create a search filter" don't need login)
return False, ""
# Load all datasets
for bucket in [0, 1, 2]:
full_path = f"apps/eval/data/webbench-{bucket}of4.jsonl"
tasks = []
with open(full_path) as f:
for line in f:
tasks.append(json.loads(line))
auth_tasks = []
no_auth_tasks = []
for t in tasks:
needs, reason = needs_auth(t["query"], t["metadata"]["category"])
if needs:
auth_tasks.append((t, reason))
else:
no_auth_tasks.append(t)
print(f"{'=' * 60}")
print(f"BUCKET {bucket}/4: {len(tasks)} total")
print(f" Needs auth: {len(auth_tasks)} ({len(auth_tasks)/len(tasks)*100:.0f}%)")
print(f" No auth: {len(no_auth_tasks)} ({len(no_auth_tasks)/len(tasks)*100:.0f}%)")
# Breakdown of no-auth tasks
cats = defaultdict(int)
diffs = defaultdict(int)
domains = set()
for t in no_auth_tasks:
cats[t["metadata"]["category"]] += 1
diffs[t["metadata"]["additional"]["difficulty"]] += 1
domains.add(t["metadata"]["website"])
cat_str = ", ".join(f"{c}({n})" for c, n in sorted(cats.items(), key=lambda x: -x[1]))
diff_str = ", ".join(f"{d}({n})" for d, n in sorted(diffs.items(), key=lambda x: -x[1]))
print(f" No-auth breakdown:")
print(f" categories: {cat_str}")
print(f" difficulty: {diff_str}")
print(f" websites: {len(domains)}")
# Sample no-auth tasks
print(f"\n Sample no-auth tasks:")
for t in no_auth_tasks[:8]:
print(f" [{t['metadata']['additional']['webbench_id']}] [{t['metadata']['category']}] {t['metadata']['website']}")
print(f" {t['query'][:150]}")
# Sample auth tasks (to verify detection)
print(f"\n Sample auth tasks (verify detection):")
for t, reason in auth_tasks[:5]:
print(f" [{t['metadata']['additional']['webbench_id']}] [{t['metadata']['category']}] {t['metadata']['website']} ({reason})")
print(f" {t['query'][:150]}")
print()

View File

@@ -0,0 +1,214 @@
"""
Analyze WebBench results across ALL 8 agents to stratify tasks by pass count.
Usage: python3 apps/eval/scripts/analyze-webbench.py
"""
import csv
import os
from collections import defaultdict
DATA_DIR = "apps/eval/data/webbench"
AGENTS = [
{"file": "anthropicfinal.csv", "eval_col": "Anthropic_Eval", "name": "Anthropic CUA"},
{"file": "skyvern2.0final.csv", "eval_col": "Skyvern2.0Eval", "name": "Skyvern 2.0"},
{"file": "skyvern2.0browserbasefinal.csv", "eval_col": "Browserbase_SkyvernEval", "name": "Skyvern BB"},
{"file": "openaicuafinal.csv", "eval_col": "CUAEval", "name": "OpenAI CUA"},
{"file": "browserusefinal.csv", "eval_col": "BUEval", "name": "BrowserUse"},
{"file": "convergencehitlfinal.csv", "eval_col": "convergence_hitl_eval", "name": "Convergence"},
{"file": "operatorhitlfinal.csv", "eval_col": "operator_hitl_eval", "name": "Operator"},
{"file": "rtrvrfinal.csv", "eval_col": "Human Label", "name": "RTRVR"},
]
def load_agent(agent):
path = os.path.join(DATA_DIR, agent["file"])
results = {}
with open(path, newline="", encoding="utf-8") as f:
reader = csv.DictReader(f)
for row in reader:
try:
task_id = int(row["ID"])
except (ValueError, KeyError):
continue
eval_val = row.get(agent["eval_col"], "")
results[task_id] = {
"eval": eval_val,
"difficulty": row.get("Difficulty", ""),
"category": row.get("Category", ""),
"task": row.get("Task", ""),
"url": row.get("Starting URL", ""),
}
return results
# Load all agents
print("Loading agents...")
agent_results = {}
for agent in AGENTS:
data = load_agent(agent)
agent_results[agent["name"]] = data
print(f" {agent['name']}: {len(data)} tasks")
# ─── INDIVIDUAL AGENT STATS ──────────────────────────────────────────
print("\n" + "=" * 70)
print("INDIVIDUAL AGENT PASS RATES")
print("=" * 70)
for agent in AGENTS:
name = agent["name"]
data = agent_results[name]
total = len(data)
passed = sum(1 for r in data.values() if r["eval"] and "success" in r["eval"].lower())
easy_total = sum(1 for r in data.values() if r["difficulty"] == "easy")
easy_pass = sum(1 for r in data.values() if r["difficulty"] == "easy" and r["eval"] and "success" in r["eval"].lower())
hard_total = sum(1 for r in data.values() if r["difficulty"] == "hard")
hard_pass = sum(1 for r in data.values() if r["difficulty"] == "hard" and r["eval"] and "success" in r["eval"].lower())
print(f"\n{name}: {passed}/{total} = {passed/total*100:.1f}%")
if easy_total:
print(f" easy: {easy_pass}/{easy_total} = {easy_pass/easy_total*100:.1f}%")
if hard_total:
print(f" hard: {hard_pass}/{hard_total} = {hard_pass/hard_total*100:.1f}%")
# ─── FULL-COVERAGE AGENTS (2452 tasks each) ──────────────────────────
# Anthropic CUA, Skyvern 2.0, Skyvern BB, OpenAI CUA
full_agents = ["Anthropic CUA", "Skyvern 2.0", "Skyvern BB", "OpenAI CUA"]
print("\n" + "=" * 70)
print(f"4 FULL-COVERAGE AGENTS: {', '.join(full_agents)}")
print("(each has ~2452 tasks)")
print("=" * 70)
# Collect IDs present in ALL 4 full agents
all_ids = None
for name in full_agents:
ids = set(agent_results[name].keys())
all_ids = ids if all_ids is None else all_ids & ids
print(f"Tasks in intersection: {len(all_ids)}")
by_pass = defaultdict(list)
for tid in sorted(all_ids):
pass_count = 0
info = {}
agent_evals = {}
for name in full_agents:
r = agent_results[name][tid]
is_success = "success" in r["eval"].lower() if r["eval"] else False
if is_success:
pass_count += 1
agent_evals[name] = "PASS" if is_success else "FAIL"
if not info:
info = r
by_pass[pass_count].append({
"id": tid, "pass_count": pass_count,
"difficulty": info["difficulty"], "category": info["category"],
"task": info["task"], "url": info["url"], "agents": agent_evals,
})
for pc in range(5):
tasks = by_pass[pc]
label = {0: "0/4 (ALL FAIL)", 4: "4/4 (ALL PASS)"}.get(pc, f"{pc}/4")
easy = sum(1 for t in tasks if t["difficulty"] == "easy")
hard = sum(1 for t in tasks if t["difficulty"] == "hard")
cats = defaultdict(int)
for t in tasks:
cats[t["category"]] += 1
urls = len(set(t["url"] for t in tasks))
cat_str = ", ".join(f"{c}({n})" for c, n in sorted(cats.items(), key=lambda x: -x[1]))
print(f"\n{label}: {len(tasks)} tasks")
print(f" easy: {easy}, hard: {hard}")
print(f" categories: {cat_str}")
print(f" unique websites: {urls}")
# ─── NOW ALSO CHECK: how many 0/4 tasks require login? ───────────────
print("\n" + "=" * 70)
print("0/4 TASKS: LOGIN vs NO-LOGIN breakdown")
print("=" * 70)
login_keywords = ["log in", "login", "sign in", "signin", "your account", "your profile",
"your wishlist", "your order", "your cart", "your dashboard", "your settings",
"your subscription", "your inbox", "your message", "your review",
"send a message", "post a comment", "write a review", "submit a",
"publish", "upload"]
zero_pass = by_pass[0]
login_tasks = []
no_login_tasks = []
for t in zero_pass:
task_lower = t["task"].lower()
needs_login = any(kw in task_lower for kw in login_keywords)
if needs_login:
login_tasks.append(t)
else:
no_login_tasks.append(t)
print(f" Likely needs login: {len(login_tasks)}")
print(f" Possibly no login: {len(no_login_tasks)}")
print(f"\n No-login 0/4 tasks by category:")
cats = defaultdict(int)
for t in no_login_tasks:
cats[t["category"]] += 1
cat_str = ", ".join(f"{c}({n})" for c, n in sorted(cats.items(), key=lambda x: -x[1]))
print(f" {cat_str}")
print(f"\n Sample no-login 0/4 tasks:")
for t in no_login_tasks[:10]:
print(f" [{t['id']}] [{t['difficulty']}] [{t['category']}] {t['url']}")
print(f" {t['task'][:180]}")
# ─── ALSO INCLUDE THE HITL AGENTS (smaller overlap) ──────────────────
hitl_agents = ["Convergence", "Operator", "RTRVR"]
print("\n" + "=" * 70)
print(f"HITL AGENTS: {', '.join(hitl_agents)}")
print("=" * 70)
for name in hitl_agents:
data = agent_results[name]
total = len(data)
passed = sum(1 for r in data.values() if r["eval"] and "success" in r["eval"].lower())
print(f" {name}: {passed}/{total} = {passed/total*100:.1f}%")
# See how HITL agents do on the same tasks as the 4 full agents
hitl_ids = None
for name in hitl_agents:
ids = set(agent_results[name].keys())
hitl_ids = ids if hitl_ids is None else hitl_ids & ids
common_hitl = all_ids & hitl_ids if hitl_ids else set()
print(f"\n Tasks in common (all 7 agents): {len(common_hitl)}")
if common_hitl:
by_pass_7 = defaultdict(list)
all_7 = full_agents + hitl_agents
for tid in sorted(common_hitl):
pass_count = 0
info = {}
for name in all_7:
r = agent_results[name].get(tid)
if r:
is_success = "success" in r["eval"].lower() if r["eval"] else False
if is_success:
pass_count += 1
if not info:
info = r
by_pass_7[pass_count].append({"id": tid, **info})
print("\n 7-AGENT PASS COUNT (on common subset):")
for pc in range(8):
if by_pass_7[pc]:
print(f" {pc}/7: {len(by_pass_7[pc])} tasks")
# ─── SUMMARY TABLE ───────────────────────────────────────────────────
print("\n" + "=" * 70)
print("SUMMARY FOR DATASET BUILDING")
print("=" * 70)
print(f"""
Pool sizes (4 full-coverage agents):
0/4 (all fail): {len(by_pass[0]):>4} (login-required: ~{len(login_tasks)}, no-login: ~{len(no_login_tasks)})
1/4: {len(by_pass[1]):>4}
2/4: {len(by_pass[2]):>4}
3/4: {len(by_pass[3]):>4}
4/4 (all pass): {len(by_pass[4]):>4}
─────────────────────
Total: {sum(len(v) for v in by_pass.values()):>4}
""")

View File

@@ -0,0 +1,233 @@
/**
* Analyze WebBench results across 4 agents to stratify tasks by pass count.
* Usage: bun apps/eval/scripts/analyze-webbench.ts
*/
import { parse } from 'csv-parse/sync'
const dataDir = 'apps/eval/data/webbench'
interface AgentConfig {
file: string
evalCol: string
name: string
}
const agents: AgentConfig[] = [
{ file: 'anthropicfinal.csv', evalCol: 'Anthropic_Eval', name: 'Anthropic' },
{ file: 'skyvern2.0final.csv', evalCol: 'Skyvern2.0Eval', name: 'Skyvern' },
{ file: 'openaicuafinal.csv', evalCol: 'CUAEval', name: 'OpenAI CUA' },
{ file: 'browserusefinal.csv', evalCol: 'BUEval', name: 'BrowserUse' },
]
type Row = Record<string, string>
// Parse each agent's results
const agentResults = new Map<
string,
Map<
number,
{
eval: string
difficulty: string
category: string
task: string
url: string
}
>
>()
for (const agent of agents) {
const text = await Bun.file(`${dataDir}/${agent.file}`).text()
const rows: Row[] = parse(text, {
columns: true,
skip_empty_lines: true,
relax_column_count: true,
})
const results = new Map<
number,
{
eval: string
difficulty: string
category: string
task: string
url: string
}
>()
for (const row of rows) {
const id = parseInt(row.ID, 10)
if (Number.isNaN(id)) continue
results.set(id, {
eval: row[agent.evalCol] || '',
difficulty: row.Difficulty || '',
category: row.Category || '',
task: row.Task || '',
url: row['Starting URL'] || '',
})
}
agentResults.set(agent.name, results)
console.log(`${agent.name}: ${results.size} tasks loaded`)
}
// Find common task IDs (present in all 4 agents)
const allIds = new Set<number>()
for (const [, results] of agentResults) {
for (const id of results.keys()) allIds.add(id)
}
// Build pass count per task
interface TaskStats {
id: number
passCount: number
difficulty: string
category: string
task: string
url: string
agents: Record<string, string>
}
const taskStats: TaskStats[] = []
const _fullAgentNames = agents.map((a) => a.name)
for (const id of allIds) {
let passCount = 0
let _presentCount = 0
const agentEvals: Record<string, string> = {}
let difficulty = ''
let category = ''
let task = ''
let url = ''
for (const agent of agents) {
const result = agentResults.get(agent.name)?.get(id)
if (result) {
_presentCount++
const isSuccess = result.eval?.toLowerCase().includes('success')
if (isSuccess) passCount++
agentEvals[agent.name] = isSuccess ? 'PASS' : 'FAIL'
if (!difficulty) difficulty = result.difficulty
if (!category) category = result.category
if (!task) task = result.task
if (!url) url = result.url
} else {
agentEvals[agent.name] = 'N/A'
}
}
taskStats.push({
id,
passCount,
difficulty,
category,
task,
url,
agents: agentEvals,
})
}
// Group by pass count
const byPassCount: Record<number, TaskStats[]> = {
0: [],
1: [],
2: [],
3: [],
4: [],
}
for (const t of taskStats) {
byPassCount[t.passCount].push(t)
}
console.log('\n═══════════════════════════════════════════════════')
console.log('TASKS BY PASS COUNT (how many agents succeeded)')
console.log('═══════════════════════════════════════════════════\n')
for (let pc = 0; pc <= 4; pc++) {
const tasks = byPassCount[pc]
const label =
pc === 0 ? '0/4 (ALL FAIL)' : pc === 4 ? '4/4 (ALL PASS)' : `${pc}/4`
console.log(`${label}: ${tasks.length} tasks`)
// Breakdown by difficulty
const easy = tasks.filter((t) => t.difficulty === 'easy').length
const hard = tasks.filter((t) => t.difficulty === 'hard').length
console.log(` easy: ${easy}, hard: ${hard}`)
// Breakdown by category
const byCat: Record<string, number> = {}
for (const t of tasks) {
byCat[t.category] = (byCat[t.category] || 0) + 1
}
console.log(
` categories: ${Object.entries(byCat)
.sort((a, b) => b[1] - a[1])
.map(([c, n]) => `${c}(${n})`)
.join(', ')}`,
)
console.log()
}
// Now handle BrowserUse only having 658 tasks — let's also do a 3-agent view (Anthropic, Skyvern, OpenAI)
console.log('\n═══════════════════════════════════════════════════')
console.log('3-AGENT VIEW (Anthropic + Skyvern + OpenAI CUA)')
console.log('(BrowserUse only has 658 tasks, so this is more complete)')
console.log('═══════════════════════════════════════════════════\n')
const threeAgents = ['Anthropic', 'Skyvern', 'OpenAI CUA']
const byPassCount3: Record<number, TaskStats[]> = { 0: [], 1: [], 2: [], 3: [] }
for (const t of taskStats) {
let pc3 = 0
let allPresent = true
for (const a of threeAgents) {
if (t.agents[a] === 'N/A') {
allPresent = false
break
}
if (t.agents[a] === 'PASS') pc3++
}
if (!allPresent) continue
if (!byPassCount3[pc3]) byPassCount3[pc3] = []
byPassCount3[pc3].push(t)
}
let total3 = 0
for (let pc = 0; pc <= 3; pc++) {
const tasks = byPassCount3[pc]
total3 += tasks.length
const label =
pc === 0 ? '0/3 (ALL FAIL)' : pc === 3 ? '3/3 (ALL PASS)' : `${pc}/3`
console.log(`${label}: ${tasks.length} tasks`)
const easy = tasks.filter((t) => t.difficulty === 'easy').length
const hard = tasks.filter((t) => t.difficulty === 'hard').length
console.log(` easy: ${easy}, hard: ${hard}`)
const byCat: Record<string, number> = {}
for (const t of tasks) {
byCat[t.category] = (byCat[t.category] || 0) + 1
}
console.log(
` categories: ${Object.entries(byCat)
.sort((a, b) => b[1] - a[1])
.map(([c, n]) => `${c}(${n})`)
.join(', ')}`,
)
// Show unique websites count
const uniqueUrls = new Set(tasks.map((t) => t.url))
console.log(` unique websites: ${uniqueUrls.size}`)
console.log()
}
console.log(`Total tasks in 3-agent intersection: ${total3}`)
// Quick sample of 0/3 tasks (hardest)
console.log('\n── Sample 0/3 (all fail) tasks ──')
byPassCount3[0].slice(0, 5).forEach((t) => {
console.log(` [${t.id}] [${t.difficulty}] [${t.category}] ${t.url}`)
console.log(` ${t.task.slice(0, 150)}`)
})
console.log('\n── Sample 1/3 tasks ──')
byPassCount3[1].slice(0, 5).forEach((t) => {
console.log(` [${t.id}] [${t.difficulty}] [${t.category}] ${t.url}`)
console.log(` ${t.task.slice(0, 150)}`)
})

View File

@@ -0,0 +1,340 @@
#!/usr/bin/env bun
/**
* Annotate Screenshots with Tool Coordinates
*
* Reads messages.jsonl from an eval run and annotates screenshots with
* coordinate markers showing where browser actions (click, fill, hover, drag)
* actually landed.
*
* Coordinates are in CSS pixels (returned by tool outputs). They're mapped to
* screenshot pixels using: screenshot_xy = css_xy × devicePixelRatio
*
* Usage:
* bun run apps/eval/scripts/annotate-screenshots.ts <results-folder> [--dpr=2]
*
* Options:
* --dpr=N devicePixelRatio (default: 2). Use the value from take_screenshot output.
*
* Output:
* Creates an 'annotated' folder inside the screenshots directory.
*/
import {
copyFileSync,
existsSync,
mkdirSync,
readdirSync,
readFileSync,
} from 'node:fs'
import { basename, join } from 'node:path'
import sharp from 'sharp'
interface ActionInfo {
screenshotNum: number
toolName: string
cssX: number
cssY: number
// For drag: second coordinate
cssX2?: number
cssY2?: number
}
const COORDINATE_TOOLS = new Set([
'click',
'click_at',
'fill',
'hover',
'hover_at',
'type_at',
'drag',
'drag_at',
])
/**
* Parse CSS coordinates from tool output text.
*
* Formats returned by tools:
* "Clicked [47] at (125, 42)"
* "Typed 5 characters into [12] at (300, 150)"
* "Hovered over [31] at (200, 88)"
* "Clicked at (125, 42)"
* "Hovered at (125, 42)"
* "Typed 10 chars at (125, 42)"
* "Dragged [10] (50, 100) → [20] (400, 300)"
* "Dragged from (50, 100) to (400, 300)"
*/
function parseCoordinates(
toolName: string,
output: unknown,
): { x: number; y: number; x2?: number; y2?: number } | null {
const text = extractText(output)
if (!text) return null
// Drag with two coordinate pairs: "(x1, y1) → ... (x2, y2)" or "from (x1, y1) to (x2, y2)"
if (toolName === 'drag' || toolName === 'drag_at') {
const dragMatch = text.match(
/\((\d+),\s*(\d+)\).*?(?:→|to)\s*.*?\((\d+),\s*(\d+)\)/,
)
if (dragMatch) {
return {
x: Number(dragMatch[1]),
y: Number(dragMatch[2]),
x2: Number(dragMatch[3]),
y2: Number(dragMatch[4]),
}
}
}
// Single coordinate: "at (x, y)" or just "(x, y)"
const singleMatch = text.match(/\((\d+),\s*(\d+)\)/)
if (singleMatch) {
return { x: Number(singleMatch[1]), y: Number(singleMatch[2]) }
}
return null
}
function extractText(output: unknown): string | null {
if (typeof output === 'string') return output
if (Array.isArray(output)) {
for (const item of output) {
if (item?.type === 'text' && typeof item.text === 'string')
return item.text
}
}
if (output && typeof output === 'object' && 'text' in output) {
return String((output as Record<string, unknown>).text)
}
return null
}
/**
* Parse messages.jsonl to extract actions with coordinates
*/
function parseMessages(messagesPath: string): ActionInfo[] {
const content = readFileSync(messagesPath, 'utf-8')
const lines = content.trim().split('\n')
const messages = lines.map((line) => JSON.parse(line))
const actions: ActionInfo[] = []
const pendingTools = new Map<
string,
{ toolName: string; screenshotNum: number }
>()
let screenshotNum = 0
for (const msg of messages) {
if (msg.type === 'tool-input-available') {
pendingTools.set(msg.toolCallId, {
toolName: msg.toolName,
screenshotNum: -1,
})
}
if (msg.type === 'tool-output-available') {
screenshotNum++
const pending = pendingTools.get(msg.toolCallId)
if (!pending) continue
if (!COORDINATE_TOOLS.has(pending.toolName)) {
pendingTools.delete(msg.toolCallId)
continue
}
const coords = parseCoordinates(pending.toolName, msg.output)
if (coords) {
actions.push({
screenshotNum,
toolName: pending.toolName,
cssX: coords.x,
cssY: coords.y,
cssX2: coords.x2,
cssY2: coords.y2,
})
}
pendingTools.delete(msg.toolCallId)
}
}
return actions
}
async function annotateScreenshot(
inputPath: string,
outputPath: string,
action: ActionInfo | null,
dpr: number,
): Promise<void> {
if (!action) {
copyFileSync(inputPath, outputPath)
return
}
const image = sharp(inputPath)
const metadata = await image.metadata()
// biome-ignore lint/style/noNonNullAssertion: sharp metadata always has dimensions for valid images
const imgWidth = metadata.width!
// biome-ignore lint/style/noNonNullAssertion: sharp metadata always has dimensions for valid images
const imgHeight = metadata.height!
const sx = Math.round(action.cssX * dpr)
const sy = Math.round(action.cssY * dpr)
let markersSvg = ''
// Primary marker (red crosshair)
markersSvg += `
<circle cx="${sx}" cy="${sy}" r="25" fill="none" stroke="red" stroke-width="4"/>
<circle cx="${sx}" cy="${sy}" r="6" fill="red" fill-opacity="0.6"/>
<line x1="${sx - 40}" y1="${sy}" x2="${sx - 10}" y2="${sy}" stroke="red" stroke-width="3"/>
<line x1="${sx + 10}" y1="${sy}" x2="${sx + 40}" y2="${sy}" stroke="red" stroke-width="3"/>
<line x1="${sx}" y1="${sy - 40}" x2="${sx}" y2="${sy - 10}" stroke="red" stroke-width="3"/>
<line x1="${sx}" y1="${sy + 10}" x2="${sx}" y2="${sy + 40}" stroke="red" stroke-width="3"/>
`
// Drag target marker (orange)
if (action.cssX2 !== undefined && action.cssY2 !== undefined) {
const sx2 = Math.round(action.cssX2 * dpr)
const sy2 = Math.round(action.cssY2 * dpr)
markersSvg += `
<circle cx="${sx2}" cy="${sy2}" r="25" fill="none" stroke="orange" stroke-width="4"/>
<circle cx="${sx2}" cy="${sy2}" r="6" fill="orange" fill-opacity="0.6"/>
<line x1="${sx}" y1="${sy}" x2="${sx2}" y2="${sy2}" stroke="orange" stroke-width="2" stroke-dasharray="8,4"/>
`
}
// Info box
const label2 =
action.cssX2 !== undefined
? ` → (${action.cssX2}, ${action.cssY2}) css`
: ''
const infoText = `${action.toolName}: (${action.cssX}, ${action.cssY}) css × ${dpr} dpr = (${sx}, ${sy}) px${label2}`
markersSvg += `
<rect x="10" y="10" width="${Math.min(infoText.length * 8 + 20, imgWidth - 20)}" height="50" fill="rgba(0,0,0,0.9)" rx="5"/>
<text x="20" y="30" fill="red" font-family="monospace" font-size="14" font-weight="bold">
Screenshot ${action.screenshotNum}: AFTER ${action.toolName}
</text>
<text x="20" y="50" fill="white" font-family="monospace" font-size="12">
${infoText}
</text>
`
const svg = `<svg width="${imgWidth}" height="${imgHeight}">${markersSvg}</svg>`
await image
.composite([{ input: Buffer.from(svg), top: 0, left: 0 }])
.toFile(outputPath)
}
async function main() {
const args = process.argv.slice(2)
const flags = args.filter((a) => a.startsWith('--'))
const positional = args.filter((a) => !a.startsWith('--'))
if (positional.length === 0) {
console.log(
'Usage: bun run apps/eval/scripts/annotate-screenshots.ts <results-folder> [--dpr=2]',
)
console.log('')
console.log('Example:')
console.log(
' bun run apps/eval/scripts/annotate-screenshots.ts apps/eval/results/single/Amazon--3',
)
process.exit(1)
}
const dprFlag = flags.find((f) => f.startsWith('--dpr='))
let dpr = dprFlag ? Number(dprFlag.split('=')[1]) : 0
// Try reading DPR from metadata.json if not explicitly provided
if (!dpr) {
const metadataPath = join(positional[0], 'metadata.json')
if (existsSync(metadataPath)) {
const meta = JSON.parse(readFileSync(metadataPath, 'utf-8'))
dpr = meta.device_pixel_ratio ?? 0
if (dpr) console.log(`Read devicePixelRatio=${dpr} from metadata.json`)
}
}
if (!dpr) {
console.error(
'Error: devicePixelRatio not found in metadata.json. Provide --dpr=N flag.',
)
process.exit(1)
}
const resultsFolder = positional[0]
const messagesPath = join(resultsFolder, 'messages.jsonl')
const screenshotsDir = join(resultsFolder, 'screenshots')
const annotatedDir = join(screenshotsDir, 'annotated')
if (!existsSync(messagesPath)) {
console.error(`Error: messages.jsonl not found at ${messagesPath}`)
process.exit(1)
}
if (!existsSync(screenshotsDir)) {
console.error(`Error: screenshots directory not found at ${screenshotsDir}`)
process.exit(1)
}
mkdirSync(annotatedDir, { recursive: true })
console.log(`devicePixelRatio: ${dpr}`)
console.log('Parsing messages.jsonl...')
const actions = parseMessages(messagesPath)
console.log(`Found ${actions.length} actions with coordinates:`)
for (const action of actions) {
const dragInfo =
action.cssX2 !== undefined ? ` → (${action.cssX2}, ${action.cssY2})` : ''
console.log(
` Screenshot ${action.screenshotNum}: ${action.toolName} at (${action.cssX}, ${action.cssY})${dragInfo} css → (${Math.round(action.cssX * dpr)}, ${Math.round(action.cssY * dpr)}) px`,
)
}
console.log('')
const screenshots = readdirSync(screenshotsDir)
.filter((f) => f.endsWith('.png') && !f.includes('annotated'))
.sort((a, b) => {
const numA = parseInt(basename(a, '.png'), 10)
const numB = parseInt(basename(b, '.png'), 10)
return numA - numB
})
console.log(`Found ${screenshots.length} screenshots`)
const firstMeta = await sharp(join(screenshotsDir, screenshots[0])).metadata()
console.log(`Screenshot dimensions: ${firstMeta.width} x ${firstMeta.height}`)
console.log('')
const actionByScreenshot = new Map<number, ActionInfo>()
for (const action of actions) {
actionByScreenshot.set(action.screenshotNum, action)
}
console.log('Annotating screenshots...')
for (const ss of screenshots) {
const ssNum = parseInt(basename(ss, '.png'), 10)
const inputPath = join(screenshotsDir, ss)
const outputPath = join(annotatedDir, `${ssNum}_annotated.png`)
const action = actionByScreenshot.get(ssNum) || null
if (action) {
console.log(` ${ss} → annotated (${action.toolName})`)
} else {
console.log(` ${ss} → copied (no coordinates)`)
}
await annotateScreenshot(inputPath, outputPath, action, dpr)
}
console.log('')
console.log(`Done! Annotated screenshots saved to: ${annotatedDir}`)
}
main().catch((err) => {
console.error('Error:', err)
process.exit(1)
})

View File

@@ -1,207 +0,0 @@
#!/usr/bin/env python3
"""
Build JSONL dataset for AGI SDK / REAL Bench evaluation.
Reads task definitions from the agisdk package, filters to feasible
action-only tasks (excludes llm_boolean evaluators), and outputs JSONL
to stdout in the BrowserOS eval framework format.
Usage:
python scripts/build-agisdk-dataset.py > data/agisdk-real.jsonl
"""
import json
import re
import sys
from datetime import date
# evals-omnizon.vercel.app was DMCA-takedown'd by Vercel (HTTP 451). Every task
# on that site fails grading with "Failed to fetch /finish endpoint".
EXCLUDED_WEBSITES = {"omnizon"}
# Tasks where either the task itself is invalid (data rot, eval site broken)
# or the grader penalizes correct work. We do NOT exclude tasks where the
# agent system genuinely fails (e.g. broken MCP tools) — those are real
# capability gaps the team needs to see in the score.
#
# Each entry below was confirmed via head-to-head deep-dive on the 2026-04-28
# K2.5 + Opus 4.6 runs; see plans/audits/.
EXCLUDED_TASKS = {
# evals-topwork.vercel.app throws Minified React error #185
# ("Maximum update depth exceeded") on every form submit; the page renders
# "Application error: a client-side exception has occurred" instead of
# saving the job post. Eval site is broken.
"topwork-1",
# Hardcodes `Exp: 12/25` in both the goal text and a jmespath grader
# criterion (`paymentInfo.expDate`). Freshening the goal alone leaves the
# grader expecting the original (now-expired) value; freshening both would
# require monkey-patching agisdk's TaskConfig at runtime. Unsolvable
# without two-sided patching.
"fly-unified-2",
# Goal says "Dec 18 2024 at 10:00", but the live eval site only has 2025
# inventory and no 10:00 slot at all. Both K2.5 and Opus successfully
# booked the closest flight; neither could match the grader's expected
# timestamp. Data rot.
"fly-unified-9",
# Eval site stores selected flight times as bare-UTC wall time
# (`T08:00:00.000Z`) but the grader expects them shifted by 8h
# (`T16:00:00.000Z` = 8 AM PST). Opus 4.6 completed the booking
# correctly and was penalized only on the timestamp criteria.
# Eval-site TZ-storage bug.
"fly-unified-4",
# Goal says "Clear all emails from GitHub in the inbox" but the third
# grader criterion expects exactly 1 update. Both models correctly
# interpreted "all" and were penalized for it. Grader contradicts goal.
"gomail-8",
# Goal says "Choose a random person you haven't connected with" but the
# grader hardcodes `profilesDiff.updated."4".connectionGrade`. Both models
# picked someone other than profile id 4 (correctly random) and were
# penalized. Grader contradicts goal.
"networkin-6",
# Eval site's `searchHistoryDiff` doesn't record search queries submitted
# via the autocomplete + Enter path. Opus 4.6 completed the entire task
# correctly (sent connection request + message to a Stanford alumna) but
# the grader's first criterion (search history contains "stanford") was
# never triggered server-side. Eval-site bug.
"networkin-9",
# Goal text instructs "move event to July 19, 10 AM" but the grader expects
# `eventsDiff.updated.*.start == "2024-07-18T17:00:00Z"` (= July 18, 10 AM
# PDT — same day, 1 hour shift). Goal contradicts grader: following the
# goal yields July 19 timestamps; satisfying the grader requires ignoring
# the explicit "to July 19" instruction. Confirmed via 8-trial deep-dive:
# never passed even after the Phase 2 HTML5 dnd dispatch fix made the drag
# actually populate `eventsDiff.updated` (now produces July 19 values, but
# grader rejects them).
"gocalendar-7",
# Grader hardcodes literal year strings `'Oct 13 2025'` / `'Oct 23 2025'`
# in checkin/checkout criteria. Today is 2026, and the staynb date picker
# interprets bare "Oct 13" as the most recent past instance — currently
# 2024, not 2025. Even a perfectly-acting agent cannot produce a booking
# whose persisted date contains "2025". Confirmed via 8 trials, 0 passes.
"staynb-5",
# Goal says "maximum number of guests supported"; grader expects the very
# specific string "32 Guests, 16 Infants" — which requires the agent to
# know that (a) Adults+Children sum into the displayed "Guests" count,
# (b) Infants render separately, (c) Pets are excluded, (d) per-category
# cap is 16 despite no UI affordance signalling it. None of this is in
# the prompt. 8 trials, 0 passes; even Opus 4.6 stopped at 16 (one
# category maxed). Task is under-specified relative to grader expectation.
"staynb-9",
# Grader requires `contains(booking.date, '2024-07-20')` but the eval-site
# date picker is a React-controlled textbox that the agent's `fill` tool
# frequently no-ops on. 3 of 8 trials passed (when fill happened to stick),
# 5 failed with `actual_value: False` (booking persisted with the eval-site
# default search date, not Jul 20). Effectively a coin-flip task that
# exercises tool-fidelity flakiness rather than agent capability —
# contributes noise, not signal. Excluding for eval reliability.
"opendining-3",
}
# Far-future replacement used by `freshen_goal_dates` when a task's hardcoded
# credit-card expiration is in the past (or expires within the next 6 months).
_FRESH_EXP = "Exp: 12/30"
_EXP_PATTERN = re.compile(r"Exp:\s*(\d{2})/(\d{2})\b")
def freshen_goal_dates(goal: str) -> str:
"""Roll any `Exp: MM/YY` date forward when it's within 6 months of today.
Several AGISDK tasks (e.g., fly-unified-{2,5,12}) hardcode credit-card
expirations like `Exp: 12/25`. The eval-site checkout forms reject expired
cards; once the wall clock passes the hardcoded date, those tasks become
unsolvable. Two-digit years are interpreted as 20YY.
"""
today_yyyymm = date.today().year * 12 + date.today().month
def replace(match: re.Match[str]) -> str:
mo, yr = int(match.group(1)), int(match.group(2))
exp_yyyymm = (2000 + yr) * 12 + mo
if exp_yyyymm <= today_yyyymm + 6:
return _FRESH_EXP
return match.group(0)
return _EXP_PATTERN.sub(replace, goal)
def has_llm_eval(task: dict) -> bool:
return any(e.get("type") == "llm_boolean" for e in task.get("evals", []))
def main():
try:
from agisdk.REAL.tasks import all_tasks
except ImportError:
print(
"Error: agisdk package not installed. Run: pip install agisdk",
file=sys.stderr,
)
sys.exit(1)
count = 0
skipped_infeasible = 0
skipped_llm = 0
skipped_excluded = 0
skipped_tasks = 0
freshened = 0
for task in all_tasks:
if not task.get("possible", True):
skipped_infeasible += 1
continue
if has_llm_eval(task):
skipped_llm += 1
continue
task_id = task["id"]
if task_id in EXCLUDED_TASKS:
skipped_tasks += 1
continue
website = task.get("website", {})
if website.get("id") in EXCLUDED_WEBSITES:
skipped_excluded += 1
continue
original_goal = task.get("goal", "")
goal = freshen_goal_dates(original_goal)
if goal != original_goal:
freshened += 1
start_url = website.get("url", "")
if not start_url or not goal:
print(f"Warning: Skipping {task_id} — missing url or goal", file=sys.stderr)
continue
entry = {
"query_id": f"agisdk-{task_id}",
"dataset": "agisdk-real",
"query": goal,
"graders": ["agisdk_state_diff"],
"start_url": start_url,
"metadata": {
"original_task_id": task_id,
"website": website.get("name", ""),
"category": "agisdk-real",
"additional": {
"agisdk_task_id": task_id,
"challenge_type": task.get("challengeType", "action"),
"difficulty": task.get("difficulty", "unknown"),
"similar_to": website.get("similarTo", ""),
},
},
}
print(json.dumps(entry))
count += 1
print(
f"Generated {count} tasks (skipped {skipped_infeasible} infeasible, "
f"{skipped_llm} llm_boolean, {skipped_excluded} excluded sites, "
f"{skipped_tasks} excluded tasks; freshened {freshened} expired card dates)",
file=sys.stderr,
)
if __name__ == "__main__":
main()

View File

@@ -1,118 +0,0 @@
#!/usr/bin/env python3
"""
Dataset generator for WebArena-Infinity benchmark.
Reads real-tasks.json from each app directory and outputs JSONL
in the eval framework's TaskSchema format.
Usage:
python build-infinity-dataset.py --apps-dir /path/to/webarena-infinity/apps
python build-infinity-dataset.py --apps-dir /path/to/apps --apps gmail linear --difficulty medium
"""
import argparse
import json
import os
import sys
def load_tasks(app_dir: str) -> list[dict]:
tasks_file = os.path.join(app_dir, "real-tasks.json")
if not os.path.exists(tasks_file):
print(f"Warning: No real-tasks.json found in {app_dir}", file=sys.stderr)
return []
with open(tasks_file) as f:
return json.load(f)
def build_task_entry(
app_name: str,
task: dict,
base_port: int,
) -> dict:
task_id = task.get("id", task.get("task_id", "unknown"))
difficulty = task.get("difficulty", "unknown")
query = task.get("query", task.get("instruction", task.get("task", "")))
verifier_path = task.get(
"verify",
task.get("verifier_path", f"real-tasks/{task_id}.py"),
)
return {
"query_id": f"infinity-{app_name}-{task_id}",
"dataset": "webarena-infinity",
"query": query,
"graders": ["infinity_state"],
"start_url": f"http://localhost:{base_port}",
"setup_script": f"POST http://localhost:{base_port}/api/reset",
"metadata": {
"original_task_id": f"{app_name}-{task_id}",
"website": app_name,
"category": "webarena-infinity",
"additional": {
"app_name": app_name,
"difficulty": difficulty,
"verifier_path": verifier_path,
"app_base_port": base_port,
},
},
}
def main():
parser = argparse.ArgumentParser(
description="Generate JSONL dataset from WebArena-Infinity apps"
)
parser.add_argument(
"--apps-dir",
required=True,
help="Path to webarena-infinity/apps/ directory",
)
parser.add_argument(
"--apps",
nargs="*",
default=None,
help="Filter to specific app names (default: all)",
)
parser.add_argument(
"--difficulty",
choices=["easy", "medium", "hard"],
default=None,
help="Filter by difficulty tier",
)
parser.add_argument(
"--base-port",
type=int,
default=8000,
help="Starting port number for apps (default: 8000)",
)
args = parser.parse_args()
if not os.path.isdir(args.apps_dir):
print(f"Error: {args.apps_dir} is not a directory", file=sys.stderr)
sys.exit(1)
app_dirs = sorted(os.listdir(args.apps_dir))
if args.apps:
app_dirs = [d for d in app_dirs if d in args.apps]
port = args.base_port
for app_name in app_dirs:
app_path = os.path.join(args.apps_dir, app_name)
if not os.path.isdir(app_path):
continue
tasks = load_tasks(app_path)
for task in tasks:
difficulty = task.get("difficulty", "unknown")
if args.difficulty and difficulty != args.difficulty:
continue
entry = build_task_entry(app_name, task, port)
print(json.dumps(entry))
port += 1
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,249 @@
/**
* Long-running stress test to simulate eval behavior
* Run with: bun apps/eval/scripts/debug-long-run.ts
*/
import { Client } from '@modelcontextprotocol/sdk/client/index.js'
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js'
const SERVER_URL = 'http://127.0.0.1:9110'
const MCP_URL = `${SERVER_URL}/mcp`
// Simulate 60 turns like the failing task had
const NUM_TURNS = 60
const SCREENSHOT_EVERY_N_TURNS = 1
async function checkBrowserReady(): Promise<boolean> {
try {
const res = await fetch(`${SERVER_URL}/health`, {
signal: AbortSignal.timeout(5000),
})
if (!res.ok) return false
const data = (await res.json()) as { cdpConnected?: boolean }
return data.cdpConnected === true
} catch {
return false
}
}
async function callMcpTool(
name: string,
args: Record<string, unknown> = {},
timeoutMs: number = 65000,
): Promise<{ success: boolean; error?: string; duration: number }> {
const start = Date.now()
const client = new Client({ name: 'long-run-test', version: '1.0.0' })
const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))
try {
await client.connect(transport)
const toolPromise = client.callTool({ name, arguments: args })
const timeoutPromise = new Promise<never>((_, reject) =>
setTimeout(
() => reject(new Error(`Timeout after ${timeoutMs}ms`)),
timeoutMs,
),
)
const result = await Promise.race([toolPromise, timeoutPromise])
const duration = Date.now() - start
const res = result as Record<string, unknown>
if (res.isError) {
const content = res.content as
| Array<{ type: string; text?: string }>
| undefined
const errorText =
content?.find((c) => c.type === 'text')?.text || 'Unknown error'
return { success: false, error: errorText, duration }
}
return { success: true, duration }
} catch (error) {
return {
success: false,
error: error instanceof Error ? error.message : String(error),
duration: Date.now() - start,
}
} finally {
try {
await transport.close()
} catch {}
}
}
async function main() {
console.log('='.repeat(60))
console.log('Long-Running Stress Test (simulating eval)')
console.log('='.repeat(60))
console.log(
`Simulating ${NUM_TURNS} turns with screenshots every ${SCREENSHOT_EVERY_N_TURNS} turn(s)`,
)
console.log()
// Create window
console.log('Creating window...')
let windowId = 0
let tabId = 0
const client = new Client({ name: 'long-run-test', version: '1.0.0' })
const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))
try {
await client.connect(transport)
const result = await client.callTool({
name: 'browser_create_window',
arguments: { url: 'https://example.com', focused: false },
})
// Try structured content first
const createRes = result as Record<string, unknown>
const structured = createRes.structuredContent as
| Record<string, number>
| undefined
windowId = structured?.windowId ?? 0
tabId = structured?.tabId ?? 0
// Fall back to parsing text
if (!windowId || !tabId) {
const content = createRes.content as
| Array<{ type: string; text?: string }>
| undefined
const text = content?.find((c) => c.type === 'text')?.text || ''
const windowMatch = text.match(/window\s+(\d+)/i)
const tabMatch =
text.match(/Tab ID:\s*(\d+)/i) || text.match(/tab\s+(\d+)/i)
if (windowMatch) windowId = parseInt(windowMatch[1], 10)
if (tabMatch) tabId = parseInt(tabMatch[1], 10)
}
} finally {
try {
await transport.close()
} catch {}
}
if (!windowId || !tabId) {
console.log('❌ Could not determine window/tab IDs')
console.log('Trying to get from list tabs...')
// Try listing tabs
const client2 = new Client({ name: 'long-run-test', version: '1.0.0' })
const transport2 = new StreamableHTTPClientTransport(new URL(MCP_URL))
try {
await client2.connect(transport2)
const tabs = await client2.callTool({
name: 'browser_list_tabs',
arguments: {},
})
console.log('Tabs response:', JSON.stringify(tabs, null, 2))
} finally {
try {
await transport2.close()
} catch {}
}
return
}
console.log(`Window: ${windowId}, Tab: ${tabId}`)
console.log()
await new Promise((r) => setTimeout(r, 2000))
// Stats
let screenshotSuccess = 0
let screenshotFail = 0
let toolSuccess = 0
let toolFail = 0
let browserDisconnects = 0
const startTime = Date.now()
// Simulate turns
for (let turn = 1; turn <= NUM_TURNS; turn++) {
const _turnStart = Date.now()
// Random tool calls to simulate agent behavior
const tools = [
{
name: 'browser_get_interactive_elements',
args: { tabId, windowId, simplified: true },
},
{ name: 'browser_list_tabs', args: { windowId } },
{ name: 'browser_get_active_tab', args: { windowId } },
]
// Pick a random tool
const tool = tools[Math.floor(Math.random() * tools.length)]
const toolRes = await callMcpTool(tool.name, tool.args, 30000)
if (toolRes.success) {
toolSuccess++
} else {
toolFail++
console.log(` Turn ${turn}: ❌ ${tool.name} failed: ${toolRes.error}`)
}
// Screenshot every N turns
if (turn % SCREENSHOT_EVERY_N_TURNS === 0) {
const ssRes = await callMcpTool(
'browser_get_screenshot',
{ tabId, windowId, size: 'small' },
65000,
)
if (ssRes.success) {
screenshotSuccess++
} else {
screenshotFail++
console.log(` Turn ${turn}: ❌ Screenshot failed: ${ssRes.error}`)
}
}
// Check browser status
const browserReady = await checkBrowserReady()
if (!browserReady) {
browserDisconnects++
console.log(` Turn ${turn}: ⚠️ Browser became unavailable!`)
}
// Progress
if (turn % 10 === 0) {
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1)
console.log(
`Turn ${turn}/${NUM_TURNS} - Screenshots: ${screenshotSuccess}/${turn}, Tools: ${toolSuccess}/${turn}, Disconnects: ${browserDisconnects}, Elapsed: ${elapsed}s`,
)
}
// Small delay between turns
await new Promise((r) => setTimeout(r, 200))
}
// Cleanup
console.log('\nClosing window...')
await callMcpTool('browser_close_window', { windowId })
// Summary
const totalTime = ((Date.now() - startTime) / 1000).toFixed(1)
console.log(`\n${'='.repeat(60)}`)
console.log('SUMMARY')
console.log('='.repeat(60))
console.log(`Total time: ${totalTime}s`)
console.log(
`Screenshots: ${screenshotSuccess}/${NUM_TURNS} (${((screenshotSuccess / NUM_TURNS) * 100).toFixed(1)}%)`,
)
console.log(
`Tool calls: ${toolSuccess}/${NUM_TURNS} (${((toolSuccess / NUM_TURNS) * 100).toFixed(1)}%)`,
)
console.log(`Browser disconnects: ${browserDisconnects}`)
if (screenshotFail > 0 || toolFail > 0 || browserDisconnects > 0) {
console.log('\n⚠ Issues detected during long run!')
} else {
console.log('\n✅ All operations completed successfully!')
}
}
main().catch(console.error)

View File

@@ -0,0 +1,307 @@
/**
* Debug script to test MCP server stability
* Run with: bun apps/eval/scripts/debug-mcp.ts
*/
import { Client } from '@modelcontextprotocol/sdk/client/index.js'
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js'
const SERVER_URL = 'http://127.0.0.1:9110'
const MCP_URL = `${SERVER_URL}/mcp`
interface TestResult {
test: string
success: boolean
duration: number
error?: string
}
const results: TestResult[] = []
async function checkHealth(): Promise<boolean> {
try {
const res = await fetch(`${SERVER_URL}/health`, {
signal: AbortSignal.timeout(5000),
})
return res.ok
} catch {
return false
}
}
async function checkBrowserReady(): Promise<boolean> {
try {
const res = await fetch(`${SERVER_URL}/health`, {
signal: AbortSignal.timeout(5000),
})
if (!res.ok) return false
const data = (await res.json()) as { cdpConnected?: boolean }
return data.cdpConnected === true
} catch {
return false
}
}
async function callMcpTool(
name: string,
args: Record<string, unknown> = {},
timeoutMs: number = 30000,
): Promise<{
success: boolean
result?: unknown
error?: string
duration: number
}> {
const start = Date.now()
const client = new Client({ name: 'debug-script', version: '1.0.0' })
const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))
try {
await client.connect(transport)
const toolPromise = client.callTool({ name, arguments: args })
const timeoutPromise = new Promise<never>((_, reject) =>
setTimeout(
() => reject(new Error(`Timeout after ${timeoutMs}ms`)),
timeoutMs,
),
)
const result = await Promise.race([toolPromise, timeoutPromise])
const duration = Date.now() - start
if ((result as any).isError) {
const errorText =
(result as any).content?.find((c: any) => c.type === 'text')?.text ||
'Unknown error'
return { success: false, error: errorText, duration }
}
return { success: true, result, duration }
} catch (error) {
return {
success: false,
error: error instanceof Error ? error.message : String(error),
duration: Date.now() - start,
}
} finally {
try {
await transport.close()
} catch {}
}
}
async function runTest(name: string, fn: () => Promise<void>): Promise<void> {
const start = Date.now()
try {
await fn()
results.push({ test: name, success: true, duration: Date.now() - start })
console.log(`${name} (${Date.now() - start}ms)`)
} catch (error) {
const errorMsg = error instanceof Error ? error.message : String(error)
results.push({
test: name,
success: false,
duration: Date.now() - start,
error: errorMsg,
})
console.log(`${name}: ${errorMsg} (${Date.now() - start}ms)`)
}
}
async function main() {
console.log('='.repeat(60))
console.log('MCP Server Debug Script')
console.log('='.repeat(60))
console.log(`Server URL: ${SERVER_URL}`)
console.log()
// Phase 1: Basic connectivity
console.log('\n--- Phase 1: Basic Connectivity ---\n')
await runTest('Health check', async () => {
const healthy = await checkHealth()
if (!healthy) throw new Error('Server not healthy')
})
await runTest('Browser status', async () => {
const connected = await checkBrowserReady()
if (!connected) throw new Error('Browser not ready')
})
// Phase 2: List tools
console.log('\n--- Phase 2: List Tools ---\n')
let tools: string[] = []
await runTest('List MCP tools', async () => {
const client = new Client({ name: 'debug-script', version: '1.0.0' })
const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))
try {
await client.connect(transport)
const result = await client.listTools()
tools = result.tools.map((t) => t.name)
console.log(` Found ${tools.length} tools`)
} finally {
try {
await transport.close()
} catch {}
}
})
// Phase 3: Create window and test tools
console.log('\n--- Phase 3: Window & Screenshot Tests ---\n')
let windowId: number | null = null
let tabId: number | null = null
await runTest('Create window', async () => {
const res = await callMcpTool('browser_create_window', {
url: 'https://example.com',
focused: false,
})
if (!res.success) throw new Error(res.error)
const structured = (res.result as any)?.structuredContent
windowId = structured?.windowId
tabId = structured?.tabId
if (!windowId || !tabId) {
// Try parsing from text
const text =
(res.result as any)?.content?.find((c: any) => c.type === 'text')
?.text || ''
const windowMatch = text.match(/window\s+(\d+)/i)
const tabMatch = text.match(/tab\s+(?:ID:\s*)?(\d+)/i)
if (windowMatch) windowId = parseInt(windowMatch[1], 10)
if (tabMatch) tabId = parseInt(tabMatch[1], 10)
}
if (!windowId || !tabId) throw new Error('Could not get windowId/tabId')
console.log(` Window: ${windowId}, Tab: ${tabId}`)
})
// Wait for page to load
await new Promise((r) => setTimeout(r, 2000))
// Phase 4: Screenshot stress test
console.log('\n--- Phase 4: Screenshot Stress Test (10 screenshots) ---\n')
let screenshotSuccesses = 0
let screenshotFailures = 0
for (let i = 1; i <= 10; i++) {
const res = await callMcpTool(
'browser_get_screenshot',
{
tabId,
windowId,
size: 'small',
},
65000,
)
if (res.success) {
screenshotSuccesses++
console.log(` Screenshot ${i}: ✅ (${res.duration}ms)`)
} else {
screenshotFailures++
console.log(` Screenshot ${i}: ❌ ${res.error} (${res.duration}ms)`)
}
// Check browser status between screenshots
const extConnected = await checkBrowserReady()
if (!extConnected) {
console.log(` ⚠️ Browser became unavailable after screenshot ${i}!`)
}
// Small delay between screenshots
await new Promise((r) => setTimeout(r, 500))
}
console.log(
`\n Screenshot results: ${screenshotSuccesses}/10 success, ${screenshotFailures}/10 failed`,
)
// Phase 5: Other tool tests
console.log('\n--- Phase 5: Other Tool Tests ---\n')
await runTest('Get active tab', async () => {
const res = await callMcpTool('browser_get_active_tab', { windowId })
if (!res.success) throw new Error(res.error)
})
await runTest('List tabs', async () => {
const res = await callMcpTool('browser_list_tabs', { windowId })
if (!res.success) throw new Error(res.error)
})
await runTest('Get interactive elements', async () => {
const res = await callMcpTool('browser_get_interactive_elements', {
tabId,
windowId,
simplified: true,
})
if (!res.success) throw new Error(res.error)
})
await runTest('Navigate', async () => {
const res = await callMcpTool('browser_navigate', {
url: 'https://google.com',
tabId,
windowId,
})
if (!res.success) throw new Error(res.error)
})
await new Promise((r) => setTimeout(r, 2000))
await runTest('Get content snapshot', async () => {
const res = await callMcpTool('browser_get_content', { tabId, windowId })
if (!res.success) throw new Error(res.error)
})
// Phase 6: Cleanup
console.log('\n--- Phase 6: Cleanup ---\n')
if (windowId) {
await runTest('Close window', async () => {
const res = await callMcpTool('browser_close_window', { windowId })
if (!res.success) throw new Error(res.error)
})
}
// Final browser readiness check
await runTest('Final browser status', async () => {
const connected = await checkBrowserReady()
if (!connected) throw new Error('Browser not ready')
})
// Summary
console.log(`\n${'='.repeat(60)}`)
console.log('SUMMARY')
console.log('='.repeat(60))
const passed = results.filter((r) => r.success).length
const failed = results.filter((r) => !r.success).length
const avgDuration =
results.reduce((a, b) => a + b.duration, 0) / results.length
console.log(`Total tests: ${results.length}`)
console.log(`Passed: ${passed}`)
console.log(`Failed: ${failed}`)
console.log(`Avg duration: ${avgDuration.toFixed(0)}ms`)
console.log(
`Screenshot success rate: ${screenshotSuccesses}/10 (${screenshotSuccesses * 10}%)`,
)
if (failed > 0) {
console.log('\nFailed tests:')
for (const r of results.filter((r) => !r.success)) {
console.log(` - ${r.test}: ${r.error}`)
}
}
console.log()
}
main().catch(console.error)

View File

@@ -1,82 +0,0 @@
#!/usr/bin/env python3
"""
Evaluation helper for WebArena-Infinity verifier scripts.
Reads JSON from stdin with app_server_url, verifier_path, and task_id.
Runs the verifier against the app server and outputs a JSON result.
Verifiers have the signature: verify(server_url: str) -> tuple[bool, str]
They fetch /api/state internally and return (passed, message).
Usage:
echo '{"app_server_url": "http://localhost:8000", "verifier_path": "/path/to/verify.py"}' | python infinity-evaluate.py
"""
import importlib.util
import json
import sys
import traceback
def load_verifier(verifier_path: str):
spec = importlib.util.spec_from_file_location("verifier", verifier_path)
if spec is None or spec.loader is None:
raise ImportError(f"Cannot load verifier from {verifier_path}")
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module
def main():
try:
data = json.loads(sys.stdin.read())
except json.JSONDecodeError as e:
print(json.dumps({"pass": False, "reward": 0.0, "message": f"Invalid JSON input: {e}"}))
sys.exit(1)
server_url = data.get("app_server_url", "")
verifier_path = data.get("verifier_path", "")
if not server_url or not verifier_path:
print(json.dumps({
"pass": False,
"reward": 0.0,
"message": "Missing app_server_url or verifier_path",
}))
sys.exit(1)
try:
verifier = load_verifier(verifier_path)
fn = getattr(verifier, "verify", None)
if not callable(fn):
raise AttributeError(
f"Verifier has no verify() function. "
f"Available: {[a for a in dir(verifier) if not a.startswith('_')]}"
)
# Verifiers take server_url and fetch state internally
result = fn(server_url)
# Return is tuple[bool, str]
if isinstance(result, tuple) and len(result) >= 2:
passed, message = result[0], str(result[1])
else:
passed, message = bool(result), str(result)
except Exception as e:
print(json.dumps({
"pass": False,
"reward": 0.0,
"message": f"Verifier error: {e}\n{traceback.format_exc()}",
}))
sys.exit(1)
print(json.dumps({
"pass": passed,
"reward": 1.0 if passed else 0.0,
"message": message,
}))
if __name__ == "__main__":
main()

View File

@@ -1,73 +1,34 @@
/**
* Smoke-test for the Clado BrowserOS Action endpoint.
*
* Health-checks the model, then runs a generate call and prints every
* field the new contract documents (action, coordinates, text, key,
* direction, scroll/drag fields, wait, end+final_answer, thinking,
* parse_error, raw_response).
* Test script for Clado API endpoints (grounding + action models)
*
* Usage:
* bun apps/eval/scripts/test-clado-api.ts [screenshot-path]
*
* If no screenshot path is given, captures one over MCP from a
* running BrowserOS server (default http://127.0.0.1:9110, override
* with BROWSEROS_URL).
*
* Cold start can take ~5 minutes; the script waits up to 6.
* If no screenshot provided, captures one from a running BrowserOS server.
*/
import { readFile } from 'node:fs/promises'
import { resolve } from 'node:path'
const ACTION_URL =
'https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run'
'https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run'
const ACTION_HEALTH_URL =
'https://clado-ai--clado-browseros-action-000159-merged-actionmod-5e5033.modal.run'
'https://clado-ai--clado-browseros-action-actionmodel-health.modal.run'
const GROUNDING_URL =
'https://clado-ai--clado-browseros-grounding-groundingmodel-generate.modal.run'
const GROUNDING_HEALTH_URL =
'https://clado-ai--clado-browseros-grounding-groundingmodel-health.modal.run'
const COLD_START_BUDGET_MS = 360_000 // 6 min — Clado cold start is ~5 min
const COLD_START_WARN_MS = 30_000
interface CladoResponse {
action?: string | null
thinking?: string | null
raw_response?: string
parse_error?: string | null
inference_time_seconds?: number
x?: number
y?: number
text?: string
key?: string
direction?: string
amount?: number
startX?: number
startY?: number
endX?: number
endY?: number
time?: number
final_answer?: string | null
}
async function checkHealth(): Promise<boolean> {
console.log(`\n--- Action model health ---`)
console.log(` URL: ${ACTION_HEALTH_URL}`)
console.log(
` Note: cold start can take ~5 min; waiting up to ${COLD_START_BUDGET_MS / 1000}s.`,
)
async function checkHealth(name: string, url: string): Promise<boolean> {
console.log(`\n--- ${name} health check ---`)
console.log(` URL: ${url}`)
const start = performance.now()
const warn = setTimeout(() => {
console.log(
` ...still waiting (${COLD_START_WARN_MS / 1000}s in) — model is likely cold-starting on Modal.`,
)
}, COLD_START_WARN_MS)
try {
const resp = await fetch(ACTION_HEALTH_URL, {
signal: AbortSignal.timeout(COLD_START_BUDGET_MS),
})
const resp = await fetch(url, { signal: AbortSignal.timeout(30_000) })
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
const body = await resp.text()
console.log(` Status: ${resp.status} (${elapsed}s)`)
console.log(` Body: ${body.slice(0, 400)}`)
console.log(` Body: ${body.slice(0, 200)}`)
return resp.ok
} catch (err) {
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
@@ -75,34 +36,63 @@ async function checkHealth(): Promise<boolean> {
` FAILED (${elapsed}s): ${err instanceof Error ? err.message : err}`,
)
return false
} finally {
clearTimeout(warn)
}
}
async function generate(
label: string,
async function testGenerate(
name: string,
url: string,
payload: Record<string, unknown>,
): Promise<CladoResponse | null> {
console.log(`\n--- ${label} ---`)
console.log(` URL: ${ACTION_URL}`)
): Promise<Record<string, unknown> | null> {
console.log(`\n--- ${name} generate ---`)
console.log(` URL: ${url}`)
console.log(` Instruction: ${payload.instruction}`)
console.log(
` Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`,
` Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`,
)
if (payload.history && payload.history !== 'None') {
console.log(` History: ${payload.history}`)
}
if (payload.history) console.log(` History: ${payload.history}`)
const start = performance.now()
let resp: Response
try {
resp = await fetch(ACTION_URL, {
const resp = await fetch(url, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload),
signal: AbortSignal.timeout(COLD_START_BUDGET_MS),
signal: AbortSignal.timeout(120_000),
})
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
if (!resp.ok) {
const body = await resp.text()
console.log(` FAILED: HTTP ${resp.status} (${elapsed}s)`)
console.log(` Body: ${body.slice(0, 400)}`)
return null
}
const result = (await resp.json()) as Record<string, unknown>
console.log(` Status: ${resp.status} (${elapsed}s)`)
console.log(` Action: ${result.action}`)
if (result.x !== null && result.x !== undefined)
console.log(` Coordinates: (${result.x}, ${result.y})`)
if (result.text)
console.log(` Text: ${(result.text as string).slice(0, 100)}`)
if (result.key) console.log(` Key: ${result.key}`)
if (result.inference_time_seconds)
console.log(` Inference: ${result.inference_time_seconds}s`)
// Show thinking if present
const raw = result.raw_response as string | undefined
if (raw) {
const thinkMatch = raw.match(/<thinking>([\s\S]*?)<\/thinking>/)
if (thinkMatch) {
const thinking = thinkMatch[1].trim()
console.log(
` Thinking: ${thinking.slice(0, 200)}${thinking.length > 200 ? '...' : ''}`,
)
}
}
return result
} catch (err) {
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
console.log(
@@ -110,50 +100,6 @@ async function generate(
)
return null
}
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
if (!resp.ok) {
const body = await resp.text()
console.log(` HTTP ${resp.status} ${resp.statusText} (${elapsed}s)`)
console.log(` Body: ${body.slice(0, 400)}`)
return null
}
const result = (await resp.json()) as CladoResponse
console.log(` HTTP ${resp.status} (${elapsed}s)`)
console.log(` action: ${result.action ?? 'null'}`)
if (result.parse_error) {
console.log(` parse_error: ${result.parse_error}`)
}
if (result.thinking) {
const trimmed = result.thinking.replace(/\s+/g, ' ').trim()
console.log(
` thinking: ${trimmed.slice(0, 240)}${trimmed.length > 240 ? '…' : ''}`,
)
}
if (typeof result.x === 'number' || typeof result.y === 'number') {
console.log(` x, y: ${result.x}, ${result.y}`)
}
if (typeof result.text === 'string')
console.log(` text: ${result.text.slice(0, 120)}`)
if (typeof result.key === 'string')
console.log(` key: ${result.key}`)
if (typeof result.direction === 'string')
console.log(` direction: ${result.direction}`)
if (typeof result.amount === 'number')
console.log(` amount: ${result.amount}`)
if (typeof result.startX === 'number' || typeof result.endX === 'number') {
console.log(
` drag: (${result.startX}, ${result.startY}) → (${result.endX}, ${result.endY})`,
)
}
if (typeof result.time === 'number')
console.log(` time: ${result.time}s`)
if (result.final_answer)
console.log(` final_answer: ${result.final_answer.slice(0, 240)}`)
if (typeof result.inference_time_seconds === 'number')
console.log(` inference_time_seconds: ${result.inference_time_seconds}`)
return result
}
async function loadScreenshot(path?: string): Promise<string> {
@@ -164,9 +110,10 @@ async function loadScreenshot(path?: string): Promise<string> {
return data.toString('base64')
}
// Try to capture from a running BrowserOS server
const serverUrl = process.env.BROWSEROS_URL || 'http://127.0.0.1:9110'
console.log(
`No screenshot path provided. Capturing from ${serverUrl} via MCP...`,
`No screenshot path provided. Trying to capture from ${serverUrl}...`,
)
const { Client } = await import('@modelcontextprotocol/sdk/client/index.js')
@@ -187,101 +134,82 @@ async function loadScreenshot(path?: string): Promise<string> {
arguments: { format: 'png', page: 1 },
})) as { content: Array<{ type: string; data?: string }> }
const image = result.content?.find((c) => c.type === 'image')
if (!image?.data)
throw new Error('No image data in take_screenshot response')
const imageContent = result.content?.find((c) => c.type === 'image')
if (!imageContent?.data)
throw new Error('No image data in screenshot response')
console.log(
`Captured screenshot (${(image.data.length / 1024).toFixed(0)} KB base64)`,
`Captured screenshot (${(imageContent.data.length / 1024).toFixed(0)} KB base64)`,
)
return image.data
return imageContent.data
} finally {
try {
await transport.close()
} catch {
/* ignore */
}
} catch {}
}
}
function summarize(history: CladoResponse[]): string {
if (history.length === 0) return 'None'
return history
.map((h) => {
switch (h.action) {
case 'click':
case 'double_click':
case 'right_click':
case 'hover':
return `${h.action}(${h.x}, ${h.y})`
case 'type':
return `type(${JSON.stringify(h.text ?? '')})`
case 'press_key':
return `press_key(${JSON.stringify(h.key ?? '')})`
case 'scroll':
return `scroll(${h.direction ?? 'down'})`
case 'drag':
return `drag(${h.startX},${h.startY} -> ${h.endX},${h.endY})`
case 'wait':
return `wait(${h.time ?? 1}s)`
case 'end':
return 'end()'
default:
return h.action ?? 'invalid'
}
})
.join(' -> ')
}
async function main() {
console.log('=== Clado action endpoint smoke test ===')
const screenshotPath = process.argv[2]
const healthy = await checkHealth()
if (!healthy) {
console.log('\nHealth check failed. Exiting.')
console.log('=== Clado API Test ===\n')
// Health checks (parallel)
const [actionHealthy, groundingHealthy] = await Promise.all([
checkHealth('Action Model', ACTION_HEALTH_URL),
checkHealth('Grounding Model', GROUNDING_HEALTH_URL),
])
if (!actionHealthy && !groundingHealthy) {
console.log('\nBoth endpoints are down. Exiting.')
process.exit(1)
}
// Load screenshot
let imageBase64: string
try {
imageBase64 = await loadScreenshot(process.argv[2])
imageBase64 = await loadScreenshot(screenshotPath)
} catch (err) {
console.log(
`\nFailed to load screenshot: ${err instanceof Error ? err.message : err}`,
)
console.log(
'Pass a path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
'Provide a screenshot path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
)
process.exit(1)
}
const history: CladoResponse[] = []
const instruction = 'Click on the search button or search bar'
// Step 1: open task — let the model decide what to do.
const step1 = await generate('Step 1: cold task', {
instruction: 'Find the search bar and click it',
image_base64: imageBase64,
history: 'None',
})
if (step1?.action) history.push(step1)
// Step 2: continuation with history, asks for typing.
if (step1?.action) {
const step2 = await generate('Step 2: with history', {
instruction: 'Type "hello world" into the search bar',
// Test grounding model
if (groundingHealthy) {
await testGenerate('Grounding Model', GROUNDING_URL, {
instruction,
image_base64: imageBase64,
history: summarize(history),
})
if (step2?.action) history.push(step2)
} else {
console.log('\nSkipping grounding model (unhealthy)')
}
// Step 3: ask for end with a final answer to exercise that field.
await generate('Step 3: ask for end+final_answer', {
instruction:
'You have completed the task. Reply with end() and final_answer="done".',
image_base64: imageBase64,
history: summarize(history),
})
// Test action model (no history)
if (actionHealthy) {
const result = await testGenerate('Action Model (step 1)', ACTION_URL, {
instruction,
image_base64: imageBase64,
history: 'None',
})
// Test action model with history (simulate multi-turn)
if (result && result.action === 'click') {
await testGenerate('Action Model (step 2, with history)', ACTION_URL, {
instruction: 'Type "hello world" in the search bar',
image_base64: imageBase64,
history: `click(${result.x}, ${result.y})`,
})
}
} else {
console.log('\nSkipping action model (unhealthy)')
}
console.log('\n=== Done ===')
}

View File

@@ -0,0 +1,647 @@
/**
* Test script to validate failure scenario handling
* Run with: bun apps/eval/scripts/test-failure-scenarios.ts
*
* This script simulates various failure scenarios and shows the recovery flow.
* Run each scenario individually to see how the system handles it.
*/
import { dirname, join } from 'node:path'
import { fileURLToPath } from 'node:url'
import { type Subprocess, spawn, spawnSync } from 'bun'
// Ports from config.dev.json - must match BrowserOS server_config.json
const EVAL_PORTS = {
cdp: 9005,
server: 9105, // http_mcp in config.dev.json
} as const
const MONOREPO_ROOT = join(dirname(fileURLToPath(import.meta.url)), '../../..')
// ============================================================================
// Utility Functions (copied from parallel-executor for testing)
// ============================================================================
function log(category: string, message: string): void {
const timestamp = new Date().toISOString().split('T')[1].slice(0, 12)
console.log(`[${timestamp}] [${category}] ${message}`)
}
function killPort(port: number): void {
log('UTIL', `Killing processes on port ${port}`)
spawnSync({
cmd: ['sh', '-c', `lsof -ti:${port} | xargs kill -9 2>/dev/null || true`],
})
}
function isBrowserOSAppRunning(): boolean {
const result = spawnSync({
cmd: ['sh', '-c', 'pgrep -f "BrowserOS" 2>/dev/null || true'],
})
const output = result.stdout?.toString().trim() ?? ''
return output.length > 0
}
async function killBrowserOSApp(): Promise<void> {
log('BROWSEROS', 'Killing BrowserOS application...')
spawnSync({
cmd: ['sh', '-c', 'pkill -9 -f "BrowserOS" 2>/dev/null || true'],
})
killPort(EVAL_PORTS.cdp)
for (let i = 0; i < 10; i++) {
if (!isBrowserOSAppRunning()) {
log('BROWSEROS', 'Application killed')
return
}
await sleep(500)
}
log('BROWSEROS', 'Warning: Application may not have fully terminated')
}
async function launchBrowserOSApp(): Promise<boolean> {
log(
'BROWSEROS',
`Launching BrowserOS (server disabled, CDP=${EVAL_PORTS.cdp})...`,
)
spawnSync({
cmd: [
'open',
'-a',
'BrowserOS',
'--args',
'--disable-browseros-server',
`--browseros-cdp-port=${EVAL_PORTS.cdp}`,
],
})
for (let i = 0; i < 30; i++) {
await sleep(1000)
if (isBrowserOSAppRunning()) {
log(
'BROWSEROS',
'Application launched, waiting for initialization (8s)...',
)
await sleep(8000)
return true
}
}
log('BROWSEROS', 'Failed to launch application')
return false
}
async function waitForPortFree(
port: number,
maxAttempts = 30,
): Promise<boolean> {
for (let i = 0; i < maxAttempts; i++) {
const result = spawnSync({
cmd: ['sh', '-c', `lsof -ti:${port} 2>/dev/null`],
})
if (!result.stdout || result.stdout.toString().trim() === '') {
return true
}
await sleep(500)
}
return false
}
async function waitForServerHealth(
port: number,
maxAttempts = 60,
): Promise<boolean> {
for (let i = 0; i < maxAttempts; i++) {
try {
const res = await fetch(`http://127.0.0.1:${port}/health`, {
signal: AbortSignal.timeout(1000),
})
if (res.ok) return true
} catch {
/* not ready */
}
await sleep(500)
}
return false
}
async function waitForBrowserReady(
port: number,
maxAttempts = 60,
): Promise<boolean> {
let connectedCount = 0
for (let i = 0; i < maxAttempts; i++) {
try {
const res = await fetch(`http://127.0.0.1:${port}/health`, {
signal: AbortSignal.timeout(2000),
})
if (res.ok) {
const data = (await res.json()) as { cdpConnected?: boolean }
if (data.cdpConnected) {
connectedCount++
if (connectedCount >= 3) return true
} else {
connectedCount = 0
}
}
} catch {
connectedCount = 0
}
await sleep(500)
}
return false
}
async function checkBrowserReady(port: number): Promise<boolean> {
try {
const res = await fetch(`http://127.0.0.1:${port}/health`, {
signal: AbortSignal.timeout(3000),
})
if (res.ok) {
const data = (await res.json()) as { cdpConnected?: boolean }
return data.cdpConnected === true
}
} catch {
/* failed */
}
return false
}
function sleep(ms: number): Promise<void> {
return new Promise((r) => setTimeout(r, ms))
}
let serverProc: Subprocess | null = null
async function startServer(): Promise<Subprocess> {
log('SERVER', 'Cleaning up ports...')
killPort(EVAL_PORTS.server)
await waitForPortFree(EVAL_PORTS.server, 30)
log('SERVER', 'Starting server process...')
const proc = spawn({
cmd: [
'bun',
'apps/server/src/index.ts',
'--server-port',
String(EVAL_PORTS.server),
'--cdp-port',
String(EVAL_PORTS.cdp),
],
cwd: MONOREPO_ROOT,
stdout: 'pipe',
stderr: 'pipe',
env: { ...process.env, NODE_ENV: 'development' },
})
serverProc = proc
log('SERVER', `Server started with PID ${proc.pid}`)
return proc
}
async function stopServer(proc: Subprocess): Promise<void> {
log('SERVER', 'Stopping server...')
try {
proc.kill('SIGKILL')
await Promise.race([proc.exited, sleep(5000)])
} catch {
/* ignore */
}
serverProc = null
log('SERVER', 'Server stopped')
}
// ============================================================================
// Scenario Tests
// ============================================================================
async function scenario1_AppNotRunningAtStart(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 1: BrowserOS App Not Running at Start')
console.log('='.repeat(70))
console.log(
'Expected: Detect missing app → Launch app → Wait for init → Continue\n',
)
// Kill the app first
await killBrowserOSApp()
await sleep(2000)
// Now check what happens
log('CHECK', `Is BrowserOS running? ${isBrowserOSAppRunning()}`)
if (!isBrowserOSAppRunning()) {
log('FLOW', '→ App not running, attempting to launch...')
const launched = await launchBrowserOSApp()
if (launched) {
log('FLOW', '→ App launched successfully')
log('CHECK', `Is BrowserOS running now? ${isBrowserOSAppRunning()}`)
} else {
log('FLOW', '→ FAILED to launch app')
log(
'RESULT',
'Task would FAIL with: "BrowserOS application is not running"',
)
return
}
}
log('RESULT', 'SUCCESS - App is now running, can proceed with server start')
}
async function scenario2_BrowserNotReady(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 2: Browser Does Not Become Ready Within 30 Seconds')
console.log('='.repeat(70))
console.log(
'Expected: Wait 30s → Restart BrowserOS app → Retry → Success or fail after 3 attempts\n',
)
// Make sure app is running first
if (!isBrowserOSAppRunning()) {
log('SETUP', 'Launching BrowserOS for test...')
await launchBrowserOSApp()
}
const MAX_RETRIES = 3
let browserOSRestartAttempted = false
for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
log('ATTEMPT', `Server start attempt ${attempt}/${MAX_RETRIES}`)
try {
const proc = await startServer()
log('WAIT', 'Waiting for server health...')
const healthy = await waitForServerHealth(EVAL_PORTS.server, 30)
if (!healthy) {
throw new Error('Server health check failed')
}
log('HEALTH', 'Server health OK')
log('WAIT', 'Waiting for browser readiness (30s timeout)...')
const browserReady = await waitForBrowserReady(EVAL_PORTS.server, 60)
if (!browserReady) {
log('TIMEOUT', 'Browser did not become ready within 30 seconds')
await stopServer(proc)
if (!browserOSRestartAttempted) {
log('RECOVERY', '→ Restarting BrowserOS application...')
await killBrowserOSApp()
await sleep(2000)
const restarted = await launchBrowserOSApp()
browserOSRestartAttempted = true
if (restarted) {
log('RECOVERY', '→ BrowserOS restarted, will retry server')
continue
} else {
log('RECOVERY', '→ FAILED to restart BrowserOS')
}
}
throw new Error('Browser did not become ready')
}
log('CONNECTED', 'Browser ready!')
await stopServer(proc)
log('RESULT', 'SUCCESS - Would proceed with task execution')
return
} catch (error) {
log('ERROR', `Attempt ${attempt} failed: ${error}`)
if (attempt === MAX_RETRIES) {
log('RESULT', 'FAILURE - All retries exhausted, task would fail')
}
}
await sleep(5000)
}
}
async function scenario3_ServerCrashesMidTask(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 3: Server Process Crashes Mid-Task')
console.log('='.repeat(70))
console.log(
'Expected: Task fails → Clean up ports → Next task restarts fresh\n',
)
if (!isBrowserOSAppRunning()) {
log('SETUP', 'Launching BrowserOS for test...')
await launchBrowserOSApp()
}
const proc = await startServer()
log('WAIT', 'Waiting for server to be ready...')
const healthy = await waitForServerHealth(EVAL_PORTS.server, 30)
if (!healthy) {
log('SETUP', 'Server failed to become healthy')
return
}
const browserReady = await waitForBrowserReady(EVAL_PORTS.server, 60)
if (!browserReady) {
log('SETUP', 'Browser did not become ready')
await stopServer(proc)
return
}
log('READY', 'Server and browser ready')
log('SIMULATE', 'Simulating server crash by killing the process...')
// Kill the server to simulate crash
proc.kill('SIGKILL')
await sleep(1000)
// Check what we see now
log('CHECK', 'Checking server health after crash...')
const stillHealthy = await waitForServerHealth(EVAL_PORTS.server, 5)
log('CHECK', `Server health: ${stillHealthy ? 'OK' : 'FAILED'}`)
log('CHECK', 'Checking browser readiness...')
const stillConnected = await checkBrowserReady(EVAL_PORTS.server)
log('CHECK', `Browser ready: ${stillConnected}`)
if (!stillHealthy || !stillConnected) {
log('DETECTED', '→ Infrastructure failure detected!')
log(
'RECOVERY',
'→ In real flow: Would clean up ports and restart for next task',
)
killPort(EVAL_PORTS.server)
log('CLEANUP', 'Ports cleaned')
log('RESULT', 'Task would FAIL, but next task gets clean environment')
}
}
async function scenario4_ToolTimeout(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 4: Tool Execution Timeout')
console.log('='.repeat(70))
console.log(
'Expected: Tool times out → Error contains "timeout" → Classified as infra error → Clean restart\n',
)
// Simulate what happens when we get a timeout error
const errorMessage = 'MCP tool call timed out after 65000ms'
log('ERROR', `Received error: "${errorMessage}"`)
const isInfraError =
errorMessage.includes('BrowserOS') ||
errorMessage.includes('server') ||
errorMessage.includes('not connected') ||
errorMessage.includes('timed out') ||
errorMessage.includes('timeout')
log('CLASSIFY', `Is infrastructure error? ${isInfraError}`)
if (isInfraError) {
log('FLOW', '→ Error classified as infrastructure failure')
log('FLOW', '→ Would kill ports for clean next-task state')
log('FLOW', '→ killPort(9110)')
log('FLOW', '→ killPort(9310)')
log('RESULT', 'Task FAILS, but ports cleaned for next task')
} else {
log('FLOW', '→ Error classified as task-specific failure')
log('RESULT', 'Task FAILS, environment not reset')
}
}
async function scenario5_BrowserUnavailableMidTask(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 5: Browser Becomes Unavailable Mid-Task (App Crashes)')
console.log('='.repeat(70))
console.log(
'Expected: Tool call fails → "not connected" error → Kill app → Restart for next task\n',
)
if (!isBrowserOSAppRunning()) {
log('SETUP', 'Launching BrowserOS for test...')
await launchBrowserOSApp()
}
const proc = await startServer()
log('WAIT', 'Waiting for server to be ready...')
await waitForServerHealth(EVAL_PORTS.server, 30)
await waitForBrowserReady(EVAL_PORTS.server, 60)
log('READY', 'Server and browser ready')
log('SIMULATE', 'Simulating BrowserOS crash by killing the app...')
await killBrowserOSApp()
await sleep(2000)
// Check browser status
log('CHECK', 'Checking browser readiness after app crash...')
const stillConnected = await checkBrowserReady(EVAL_PORTS.server)
log('CHECK', `Browser ready: ${stillConnected}`)
if (!stillConnected) {
log('DETECTED', '→ Browser became unavailable!')
const errorMessage = 'BrowserOS helper service not connected'
log('ERROR', `Tool call would fail with: "${errorMessage}"`)
const isInfraError = errorMessage.includes('not connected')
log('CLASSIFY', `Is infrastructure error? ${isInfraError}`)
if (isInfraError) {
log('RECOVERY', '→ Cleaning up for next task...')
await stopServer(proc)
killPort(EVAL_PORTS.server)
log('RECOVERY', '→ Next task would check if BrowserOS is running...')
const appRunning = isBrowserOSAppRunning()
log('CHECK', `BrowserOS running: ${appRunning}`)
if (!appRunning) {
log('RECOVERY', '→ Would launch BrowserOS app')
await launchBrowserOSApp()
}
log('RESULT', 'Current task FAILS, next task gets fresh environment')
}
} else {
await stopServer(proc)
}
}
async function scenario6_GracefulShutdown(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 6: Graceful Shutdown (Ctrl+C)')
console.log('='.repeat(70))
console.log('Expected: SIGINT received → Kill server → Clean ports → Exit\n')
log('INFO', 'In real flow, signal handlers are registered at startup:')
log('CODE', ' process.on("SIGINT", cleanup)')
log('CODE', ' process.on("SIGTERM", cleanup)')
log('CODE', ' process.on("uncaughtException", cleanup)')
log('FLOW', 'When Ctrl+C is pressed:')
log('FLOW', ' 1. isShuttingDown = true (prevent duplicate cleanup)')
log('FLOW', ' 2. Kill server process if running')
log('FLOW', ' 3. Kill processes on ports 9110, 9310')
log('FLOW', ' 4. Exit with code 0')
log('RESULT', 'Clean shutdown, no orphaned processes')
}
async function scenario7_ConsecutiveFailures(): Promise<void> {
console.log(`\n${'='.repeat(70)}`)
console.log('SCENARIO 7: Consecutive Task Failures')
console.log('='.repeat(70))
console.log(
'Expected: Each failed task cleans up → Next task gets fresh start\n',
)
const tasks = ['task-1', 'task-2', 'task-3']
for (const taskId of tasks) {
log('TASK', `=== Starting ${taskId} ===`)
// Check if app is running
log('CHECK', `BrowserOS running: ${isBrowserOSAppRunning()}`)
if (!isBrowserOSAppRunning()) {
log('FLOW', '→ Would launch BrowserOS')
}
// Simulate infrastructure check before task
log('FLOW', '→ Start server')
log('FLOW', '→ Wait for health')
log('FLOW', '→ Wait for browser readiness')
// Simulate task failure
const failureReason =
taskId === 'task-1'
? 'Browser did not become ready'
: taskId === 'task-2'
? 'Tool timed out after 65000ms'
: 'BrowserOS helper service not connected'
log('ERROR', `Task failed: ${failureReason}`)
const isInfraError =
failureReason.includes('timeout') ||
failureReason.includes('not connected')
if (isInfraError) {
log('CLEANUP', '→ Detected infra error, cleaning ports')
log('CLEANUP', '→ killPort(9110)')
}
log('CLEANUP', '→ Stop server')
log('CLEANUP', '→ Wait 2s before next task')
console.log()
}
log('RESULT', 'Each task failure is isolated, next task starts clean')
}
// ============================================================================
// Main Menu
// ============================================================================
async function main() {
console.log('='.repeat(70))
console.log('Failure Scenario Test Suite')
console.log('='.repeat(70))
console.log(`Server Port: ${EVAL_PORTS.server}`)
console.log(`CDP Port: ${EVAL_PORTS.cdp}`)
console.log()
const scenarios = [
{
num: 1,
name: 'BrowserOS App Not Running at Start',
fn: scenario1_AppNotRunningAtStart,
},
{
num: 2,
name: 'Browser Does Not Become Ready (30s timeout)',
fn: scenario2_BrowserNotReady,
},
{
num: 3,
name: 'Server Process Crashes Mid-Task',
fn: scenario3_ServerCrashesMidTask,
},
{
num: 4,
name: 'Tool Execution Timeout (simulated)',
fn: scenario4_ToolTimeout,
},
{
num: 5,
name: 'Browser Becomes Unavailable Mid-Task (App Crash)',
fn: scenario5_BrowserUnavailableMidTask,
},
{
num: 6,
name: 'Graceful Shutdown (explanation)',
fn: scenario6_GracefulShutdown,
},
{
num: 7,
name: 'Consecutive Task Failures (simulated)',
fn: scenario7_ConsecutiveFailures,
},
]
console.log('Available scenarios:')
for (const s of scenarios) {
console.log(` ${s.num}. ${s.name}`)
}
console.log(' all. Run all scenarios')
console.log()
const arg = process.argv[2]
if (!arg) {
console.log(
'Usage: bun apps/eval/scripts/test-failure-scenarios.ts <scenario-number|all>',
)
console.log('Example: bun apps/eval/scripts/test-failure-scenarios.ts 1')
console.log('Example: bun apps/eval/scripts/test-failure-scenarios.ts all')
process.exit(0)
}
// Setup cleanup handler
const cleanup = async () => {
console.log('\n[CLEANUP] Cleaning up...')
if (serverProc) {
try {
serverProc.kill('SIGKILL')
} catch {}
}
killPort(EVAL_PORTS.server)
process.exit(0)
}
process.on('SIGINT', cleanup)
if (arg === 'all') {
for (const s of scenarios) {
await s.fn()
await sleep(3000)
}
} else {
const num = parseInt(arg, 10)
const scenario = scenarios.find((s) => s.num === num)
if (!scenario) {
console.log(`Unknown scenario: ${arg}`)
process.exit(1)
}
await scenario.fn()
}
// Cleanup
if (serverProc) {
await stopServer(serverProc)
}
console.log(`\n${'='.repeat(70)}`)
console.log('Test completed')
console.log('='.repeat(70))
}
main().catch(console.error)

Some files were not shown because too many files have changed in this diff Show More