mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 23:53:25 +00:00
Compare commits
33 Commits
fix/tests
...
fix/eval-4
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6ee306236e | ||
|
|
0afc59cda1 | ||
|
|
eb8faa931a | ||
|
|
be70170313 | ||
|
|
0661197f5b | ||
|
|
c4e7824266 | ||
|
|
22f71a36c5 | ||
|
|
d49986d0b3 | ||
|
|
acdd394585 | ||
|
|
219fdf1e28 | ||
|
|
014f71d227 | ||
|
|
876dea4d56 | ||
|
|
fca7d4cbcb | ||
|
|
e1bfadb075 | ||
|
|
aa0d9b96ef | ||
|
|
1c9604b5fa | ||
|
|
685266a1d8 | ||
|
|
561f2baf97 | ||
|
|
df0f45dd29 | ||
|
|
edfc5c751c | ||
|
|
471256f31c | ||
|
|
4c90ca696b | ||
|
|
f2ac87d7c3 | ||
|
|
231bd6821d | ||
|
|
a228c278c6 | ||
|
|
e2ec1991cf | ||
|
|
0c84547e8f | ||
|
|
2ff5c12840 | ||
|
|
d87422eea1 | ||
|
|
1946ca0cf8 | ||
|
|
754f7d0e1d | ||
|
|
85bb3f7b42 | ||
|
|
cb32b8191d |
40
.github/workflows/eval-weekly.yml
vendored
40
.github/workflows/eval-weekly.yml
vendored
@@ -14,7 +14,7 @@ on:
|
||||
config:
|
||||
description: 'Eval config file (relative to apps/eval/)'
|
||||
required: false
|
||||
default: 'configs/browseros-agent-weekly.json'
|
||||
default: 'configs/legacy/browseros-agent-weekly.json'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -42,10 +42,12 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
working-directory: packages/browseros-agent
|
||||
run: bun install --ignore-scripts && bun run build:agent-sdk
|
||||
run: bun install --ignore-scripts
|
||||
|
||||
- name: Install Python eval dependencies
|
||||
run: pip install agisdk requests
|
||||
# agisdk pinned so silent upstream releases can't shift task definitions
|
||||
# or grader behavior. Bump intentionally with a documented re-baseline.
|
||||
run: pip install agisdk==0.3.5 requests
|
||||
|
||||
- name: Clone WebArena-Infinity
|
||||
run: git clone --depth 1 https://github.com/web-arena-x/webarena-infinity.git /tmp/webarena-infinity
|
||||
@@ -60,33 +62,27 @@ jobs:
|
||||
curl -sL -o /tmp/nopecha.zip https://github.com/NopeCHALLC/nopecha-extension/releases/latest/download/chromium_automation.zip
|
||||
unzip -qo /tmp/nopecha.zip -d extensions/nopecha
|
||||
|
||||
- name: Run eval
|
||||
- name: Run eval and publish to R2
|
||||
working-directory: packages/browseros-agent/apps/eval
|
||||
env:
|
||||
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
|
||||
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
|
||||
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
|
||||
NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
|
||||
BROWSEROS_BINARY: /usr/bin/browseros
|
||||
WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
|
||||
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
|
||||
run: |
|
||||
echo "Running eval with config: $EVAL_CONFIG"
|
||||
xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts -c "$EVAL_CONFIG"
|
||||
|
||||
- name: Upload runs to R2
|
||||
if: success()
|
||||
working-directory: packages/browseros-agent/apps/eval
|
||||
env:
|
||||
EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
|
||||
EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
|
||||
EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
|
||||
EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
|
||||
EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
|
||||
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
|
||||
BROWSEROS_BINARY: /usr/bin/browseros
|
||||
WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
|
||||
# OpenClaw container runtime is macOS-only; opt the Linux runner
|
||||
# into the no-op stub so the server can boot and the eval can run.
|
||||
BROWSEROS_SKIP_OPENCLAW: '1'
|
||||
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
|
||||
run: |
|
||||
CONFIG_NAME=$(basename "$EVAL_CONFIG" .json)
|
||||
bun scripts/upload-run.ts "results/$CONFIG_NAME"
|
||||
echo "Running eval with config: $EVAL_CONFIG"
|
||||
xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts suite --config "$EVAL_CONFIG" --publish r2
|
||||
|
||||
- name: Generate trend report
|
||||
if: success()
|
||||
@@ -107,3 +103,11 @@ jobs:
|
||||
with:
|
||||
name: eval-report-${{ github.run_id }}
|
||||
path: /tmp/eval-report.html
|
||||
|
||||
- name: Upload server stderr logs (for post-mortem on startup failures)
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: browseros-server-logs-${{ github.run_id }}
|
||||
path: /tmp/browseros-server-logs/
|
||||
if-no-files-found: ignore
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
name: build-agent
|
||||
name: Publish VM Agent Cache
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@@ -16,7 +16,7 @@ on:
|
||||
pull_request:
|
||||
paths:
|
||||
- "packages/browseros-agent/packages/build-tools/**"
|
||||
- ".github/workflows/build-agent.yml"
|
||||
- ".github/workflows/publish-vm-agent-cache.yml"
|
||||
|
||||
env:
|
||||
BUN_VERSION: "1.3.6"
|
||||
@@ -48,6 +48,8 @@ jobs:
|
||||
include:
|
||||
- arch: arm64
|
||||
runner: ubuntu-24.04-arm
|
||||
- arch: x64
|
||||
runner: ubuntu-24.04
|
||||
runs-on: ${{ matrix.runner }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@@ -74,7 +76,15 @@ jobs:
|
||||
|
||||
smoke:
|
||||
needs: build
|
||||
runs-on: ubuntu-24.04-arm
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- arch: arm64
|
||||
runner: ubuntu-24.04-arm
|
||||
- arch: x64
|
||||
runner: ubuntu-24.04
|
||||
runs-on: ${{ matrix.runner }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: oven-sh/setup-bun@v2
|
||||
@@ -82,7 +92,7 @@ jobs:
|
||||
bun-version: ${{ env.BUN_VERSION }}
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: tarball-${{ inputs.agent || 'openclaw' }}-arm64
|
||||
name: tarball-${{ inputs.agent || 'openclaw' }}-${{ matrix.arch }}
|
||||
path: dist/images
|
||||
- name: Install podman
|
||||
run: |
|
||||
@@ -96,12 +106,12 @@ jobs:
|
||||
AGENT: ${{ inputs.agent || 'openclaw' }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
tarball="$(find "$GITHUB_WORKSPACE/dist/images" -name "${AGENT}-*-arm64.tar.gz" -print -quit)"
|
||||
tarball="$(find "$GITHUB_WORKSPACE/dist/images" -name "${AGENT}-*-${{ matrix.arch }}.tar.gz" -print -quit)"
|
||||
if [ -z "$tarball" ]; then
|
||||
echo "missing arm64 tarball artifact for ${AGENT}" >&2
|
||||
echo "missing ${{ matrix.arch }} tarball artifact for ${AGENT}" >&2
|
||||
exit 1
|
||||
fi
|
||||
bun run smoke:tarball -- --agent "$AGENT" --arch arm64 --tarball "$tarball"
|
||||
bun run smoke:tarball -- --agent "$AGENT" --arch "${{ matrix.arch }}" --tarball "$tarball"
|
||||
|
||||
publish:
|
||||
needs: [build, smoke]
|
||||
165
.github/workflows/release-agent-sdk.yml
vendored
165
.github/workflows/release-agent-sdk.yml
vendored
@@ -1,168 +1,11 @@
|
||||
name: Release BrowserOS Agent SDK
|
||||
name: Release BrowserOS Agent SDK (disabled)
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: release-agent-sdk
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
if: github.ref == 'refs/heads/main'
|
||||
disabled:
|
||||
if: ${{ false }}
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write
|
||||
pull-requests: write
|
||||
defaults:
|
||||
run:
|
||||
working-directory: packages/browseros-agent/packages/agent-sdk
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- uses: oven-sh/setup-bun@v2
|
||||
|
||||
- uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "20"
|
||||
registry-url: "https://registry.npmjs.org"
|
||||
|
||||
- name: Install dependencies
|
||||
run: bun ci
|
||||
working-directory: packages/browseros-agent
|
||||
|
||||
- name: Build
|
||||
run: bun run build
|
||||
|
||||
- name: Test
|
||||
run: bun test
|
||||
|
||||
- name: Get version
|
||||
id: version
|
||||
run: |
|
||||
echo "version=$(node -p "require('./package.json').version")" >> "$GITHUB_OUTPUT"
|
||||
echo "release_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Generate release notes
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
SDK_PATH="packages/browseros-agent/packages/agent-sdk"
|
||||
CURRENT_TAG="agent-sdk-v${{ steps.version.outputs.version }}"
|
||||
# Find the previous tag, excluding the current version's tag
|
||||
# (which may already exist from a prior failed run)
|
||||
PREV_TAG=$(git tag -l "agent-sdk-v*" --sort=-v:refname | grep -v "^${CURRENT_TAG}$" | head -n 1)
|
||||
|
||||
if [ -z "$PREV_TAG" ]; then
|
||||
echo "Initial release" > /tmp/release-notes.md
|
||||
else
|
||||
# Get commits scoped to the SDK directory
|
||||
COMMITS=$(git log "$PREV_TAG"..HEAD --pretty=format:"%H" -- "$SDK_PATH")
|
||||
|
||||
if [ -z "$COMMITS" ]; then
|
||||
echo "No notable changes." > /tmp/release-notes.md
|
||||
else
|
||||
echo "## What's Changed" > /tmp/release-notes.md
|
||||
echo "" >> /tmp/release-notes.md
|
||||
|
||||
# For each commit, find the associated PR and format with author
|
||||
CONTRIBUTORS=""
|
||||
while IFS= read -r SHA; do
|
||||
# Get commit subject and author
|
||||
SUBJECT=$(git log -1 --pretty=format:"%s" "$SHA")
|
||||
AUTHOR=$(git log -1 --pretty=format:"%an" "$SHA")
|
||||
GITHUB_USER=$(gh api "/repos/${{ github.repository }}/commits/${SHA}" --jq '.author.login // empty' 2>/dev/null)
|
||||
|
||||
# Find associated PR number
|
||||
PR_NUM=$(gh api "/repos/${{ github.repository }}/commits/${SHA}/pulls" --jq '.[0].number // empty' 2>/dev/null)
|
||||
|
||||
# Format line: skip PR number if already in the commit subject
|
||||
# (squash merges include "(#123)" in the subject automatically)
|
||||
if [ -n "$PR_NUM" ] && ! echo "$SUBJECT" | grep -qF "(#${PR_NUM})"; then
|
||||
echo "- ${SUBJECT} (#${PR_NUM})" >> /tmp/release-notes.md
|
||||
else
|
||||
echo "- ${SUBJECT}" >> /tmp/release-notes.md
|
||||
fi
|
||||
done <<< "$COMMITS"
|
||||
fi
|
||||
fi
|
||||
working-directory: ${{ github.workspace }}
|
||||
|
||||
- name: Publish
|
||||
run: npm publish --access public
|
||||
env:
|
||||
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
|
||||
|
||||
- name: Create GitHub release
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
TAG="agent-sdk-v${{ steps.version.outputs.version }}"
|
||||
RELEASE_SHA="${{ steps.version.outputs.release_sha }}"
|
||||
TITLE="BrowserOS Agent SDK - v${{ steps.version.outputs.version }}"
|
||||
|
||||
# Create or reuse tag (idempotent for re-runs)
|
||||
if git rev-parse "$TAG" >/dev/null 2>&1; then
|
||||
echo "Tag $TAG already exists, skipping tag creation"
|
||||
else
|
||||
git tag "$TAG" "$RELEASE_SHA"
|
||||
fi
|
||||
|
||||
# Push tag (skip if already on remote)
|
||||
if git ls-remote --tags origin "$TAG" | grep -q "$TAG"; then
|
||||
echo "Tag $TAG already on remote, skipping push"
|
||||
else
|
||||
git push origin "$TAG"
|
||||
fi
|
||||
|
||||
# Create or update release
|
||||
if gh release view "$TAG" >/dev/null 2>&1; then
|
||||
echo "Release $TAG already exists, updating"
|
||||
gh release edit "$TAG" --title "$TITLE" --notes-file /tmp/release-notes.md
|
||||
else
|
||||
gh release create "$TAG" --title "$TITLE" --notes-file /tmp/release-notes.md
|
||||
fi
|
||||
working-directory: ${{ github.workspace }}
|
||||
|
||||
- name: Update CHANGELOG.md via PR
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
VERSION="${{ steps.version.outputs.version }}"
|
||||
DATE=$(date -u +"%Y-%m-%d")
|
||||
BRANCH="docs/agent-sdk-changelog-v${VERSION}"
|
||||
CHANGELOG="packages/browseros-agent/packages/agent-sdk/CHANGELOG.md"
|
||||
|
||||
# Return to main before branching
|
||||
git checkout main
|
||||
|
||||
# Use head/tail to safely insert without sed quoting issues
|
||||
{
|
||||
head -n 1 "$CHANGELOG"
|
||||
echo ""
|
||||
echo "## v${VERSION} (${DATE})"
|
||||
echo ""
|
||||
cat /tmp/release-notes.md
|
||||
echo ""
|
||||
tail -n +2 "$CHANGELOG"
|
||||
} > /tmp/new-changelog.md
|
||||
mv /tmp/new-changelog.md "$CHANGELOG"
|
||||
|
||||
git config user.name "github-actions[bot]"
|
||||
git config user.email "github-actions[bot]@users.noreply.github.com"
|
||||
git checkout -b "$BRANCH"
|
||||
git add "$CHANGELOG"
|
||||
git commit -m "docs: update agent-sdk changelog for v${VERSION}"
|
||||
git push origin "$BRANCH"
|
||||
|
||||
gh pr create \
|
||||
--title "docs: update agent-sdk changelog for v${VERSION}" \
|
||||
--body "Auto-generated changelog update for BrowserOS Agent SDK v${VERSION}." \
|
||||
--base main \
|
||||
--head "$BRANCH"
|
||||
|
||||
gh pr merge "$BRANCH" --squash --auto || true
|
||||
working-directory: ${{ github.workspace }}
|
||||
- run: echo "Agent SDK publishing is disabled."
|
||||
|
||||
12
.github/workflows/test.yml
vendored
12
.github/workflows/test.yml
vendored
@@ -54,10 +54,10 @@ jobs:
|
||||
command: (cd apps/server && bun run test:integration)
|
||||
junit_path: test-results/server-integration.xml
|
||||
needs_browser: true
|
||||
- suite: server-sdk
|
||||
command: (cd apps/server && bun run test:sdk)
|
||||
junit_path: test-results/server-sdk.xml
|
||||
needs_browser: true
|
||||
- suite: server-lib
|
||||
command: (cd apps/server && bun run test:lib)
|
||||
junit_path: test-results/server-lib.xml
|
||||
needs_browser: false
|
||||
- suite: server-root
|
||||
command: (cd apps/server && bun run test:root)
|
||||
junit_path: test-results/server-root.xml
|
||||
@@ -70,10 +70,6 @@ jobs:
|
||||
command: bun run test:eval
|
||||
junit_path: test-results/eval.xml
|
||||
needs_browser: false
|
||||
- suite: agent-sdk
|
||||
command: bun run test:agent-sdk
|
||||
junit_path: test-results/agent-sdk.xml
|
||||
needs_browser: false
|
||||
- suite: build
|
||||
command: bun run test:build
|
||||
junit_path: test-results/build.xml
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# BrowserOS Agent
|
||||
|
||||
The agent platform powering [BrowserOS](https://github.com/browseros-ai/BrowserOS) — contains the MCP server, agent UI, CLI, evaluation framework, and SDK.
|
||||
The agent platform powering [BrowserOS](https://github.com/browseros-ai/BrowserOS) — contains the MCP server, agent UI, CLI, and evaluation framework.
|
||||
|
||||
## Monorepo Structure
|
||||
|
||||
@@ -12,7 +12,6 @@ apps/
|
||||
eval/ # Evaluation framework for benchmarking agents
|
||||
|
||||
packages/
|
||||
agent-sdk/ # Node.js SDK (@browseros-ai/agent-sdk)
|
||||
cdp-protocol/ # Type-safe Chrome DevTools Protocol bindings
|
||||
shared/ # Shared constants (ports, timeouts, limits)
|
||||
```
|
||||
@@ -23,7 +22,6 @@ packages/
|
||||
| `apps/agent` | Agent UI — Chrome extension for the chat interface |
|
||||
| `apps/cli` | Go CLI — control BrowserOS from the terminal or AI coding agents |
|
||||
| `apps/eval` | Benchmark framework — WebVoyager, Mind2Web evaluation |
|
||||
| `packages/agent-sdk` | Node.js SDK for browser automation with natural language |
|
||||
| `packages/cdp-protocol` | Auto-generated CDP type bindings used by the server |
|
||||
| `packages/shared` | Shared constants used across packages |
|
||||
|
||||
|
||||
@@ -0,0 +1,50 @@
|
||||
import type { Provider } from './chatComponentTypes'
|
||||
|
||||
export interface ProviderOptionGroup {
|
||||
key: 'llm' | 'acp'
|
||||
label: string
|
||||
options: Provider[]
|
||||
}
|
||||
|
||||
export function groupProviderOptions(
|
||||
providers: Provider[],
|
||||
): ProviderOptionGroup[] {
|
||||
const llm = providers.filter((provider) => provider.kind !== 'acp')
|
||||
const acp = providers.filter((provider) => provider.kind === 'acp')
|
||||
|
||||
return [
|
||||
...(llm.length
|
||||
? [{ key: 'llm' as const, label: 'AI Providers', options: llm }]
|
||||
: []),
|
||||
...(acp.length
|
||||
? [{ key: 'acp' as const, label: 'Agents', options: acp }]
|
||||
: []),
|
||||
]
|
||||
}
|
||||
|
||||
export function getProviderSearchValue(
|
||||
provider: Provider,
|
||||
groupLabel: string,
|
||||
): string {
|
||||
return [
|
||||
provider.id,
|
||||
provider.name,
|
||||
provider.type,
|
||||
groupLabel,
|
||||
provider.adapterName,
|
||||
provider.modelLabel,
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join(' ')
|
||||
}
|
||||
|
||||
export function getProviderSubtitle(provider: Provider): string | undefined {
|
||||
if (provider.kind !== 'acp') return undefined
|
||||
return [
|
||||
provider.adapterName,
|
||||
provider.modelLabel,
|
||||
provider.modelControl === 'best-effort' ? 'best effort' : undefined,
|
||||
]
|
||||
.filter(Boolean)
|
||||
.join(' · ')
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import {
|
||||
getProviderSearchValue,
|
||||
getProviderSubtitle,
|
||||
groupProviderOptions,
|
||||
} from './ChatProviderSelector.helpers'
|
||||
import type { Provider } from './chatComponentTypes'
|
||||
|
||||
const options: Provider[] = [
|
||||
{ kind: 'llm', id: 'browseros', name: 'BrowserOS', type: 'browseros' },
|
||||
{
|
||||
kind: 'llm',
|
||||
id: 'anthropic-sonnet',
|
||||
name: 'Anthropic Sonnet',
|
||||
type: 'anthropic',
|
||||
},
|
||||
{
|
||||
kind: 'acp',
|
||||
id: 'agent-claude-review',
|
||||
name: 'Review Bot',
|
||||
type: 'acp',
|
||||
adapterName: 'Claude Code',
|
||||
modelLabel: 'Haiku',
|
||||
modelControl: 'best-effort',
|
||||
},
|
||||
{
|
||||
kind: 'acp',
|
||||
id: 'agent-codex-browser',
|
||||
name: 'Browser Driver',
|
||||
type: 'acp',
|
||||
adapterName: 'Codex',
|
||||
modelLabel: 'GPT-5.5',
|
||||
modelControl: 'runtime-supported',
|
||||
},
|
||||
]
|
||||
|
||||
describe('groupProviderOptions', () => {
|
||||
it('groups normal providers separately from created agents', () => {
|
||||
expect(groupProviderOptions(options)).toEqual([
|
||||
{
|
||||
key: 'llm',
|
||||
label: 'AI Providers',
|
||||
options: [options[0], options[1]],
|
||||
},
|
||||
{
|
||||
key: 'acp',
|
||||
label: 'Agents',
|
||||
options: [options[2], options[3]],
|
||||
},
|
||||
])
|
||||
})
|
||||
})
|
||||
|
||||
describe('getProviderSearchValue', () => {
|
||||
it('matches created-agent group labels and item labels', () => {
|
||||
expect(getProviderSearchValue(options[2], 'Agents')).toContain('Agents')
|
||||
expect(getProviderSearchValue(options[2], 'Agents')).toContain('Review Bot')
|
||||
expect(getProviderSearchValue(options[2], 'Agents')).toContain(
|
||||
'Claude Code',
|
||||
)
|
||||
})
|
||||
})
|
||||
|
||||
describe('getProviderSubtitle', () => {
|
||||
it('describes created-agent runtime context without model-target copy', () => {
|
||||
expect(getProviderSubtitle(options[2])).toBe(
|
||||
'Claude Code · Haiku · best effort',
|
||||
)
|
||||
expect(getProviderSubtitle(options[3])).toBe('Codex · GPT-5.5')
|
||||
expect(getProviderSubtitle(options[0])).toBeUndefined()
|
||||
})
|
||||
})
|
||||
@@ -1,4 +1,4 @@
|
||||
import { Check, Plus } from 'lucide-react'
|
||||
import { Bot, Check, Plus } from 'lucide-react'
|
||||
import type { FC, PropsWithChildren } from 'react'
|
||||
import { useState } from 'react'
|
||||
import {
|
||||
@@ -17,6 +17,11 @@ import {
|
||||
import { BrowserOSIcon, ProviderIcon } from '@/lib/llm-providers/providerIcons'
|
||||
import type { ProviderType } from '@/lib/llm-providers/types'
|
||||
import { cn } from '@/lib/utils'
|
||||
import {
|
||||
getProviderSearchValue,
|
||||
getProviderSubtitle,
|
||||
groupProviderOptions,
|
||||
} from './ChatProviderSelector.helpers'
|
||||
import type { Provider } from './chatComponentTypes'
|
||||
|
||||
interface ChatProviderSelectorProps {
|
||||
@@ -29,54 +34,58 @@ export const ChatProviderSelector: FC<
|
||||
PropsWithChildren<ChatProviderSelectorProps>
|
||||
> = ({ children, providers, selectedProvider, onSelectProvider }) => {
|
||||
const [open, setOpen] = useState(false)
|
||||
const groups = groupProviderOptions(providers)
|
||||
|
||||
return (
|
||||
<Popover open={open} onOpenChange={setOpen}>
|
||||
<PopoverTrigger asChild>{children}</PopoverTrigger>
|
||||
<PopoverContent side="bottom" align="start" className="w-48 p-0">
|
||||
<PopoverContent side="bottom" align="start" className="w-64 p-0">
|
||||
<Command>
|
||||
<CommandInput placeholder="Search providers..." className="h-9" />
|
||||
<CommandInput
|
||||
placeholder="Search providers or agents..."
|
||||
className="h-9"
|
||||
/>
|
||||
<CommandList>
|
||||
<div className="my-2 px-2 font-semibold text-muted-foreground text-xs uppercase tracking-wide">
|
||||
AI Provider
|
||||
</div>
|
||||
<CommandEmpty>No provider found</CommandEmpty>
|
||||
<CommandGroup>
|
||||
{providers.map((provider) => {
|
||||
const isSelected = selectedProvider.id === provider.id
|
||||
return (
|
||||
<CommandItem
|
||||
key={provider.id}
|
||||
value={`${provider.id} ${provider.name}`}
|
||||
onSelect={() => {
|
||||
onSelectProvider(provider)
|
||||
setOpen(false)
|
||||
}}
|
||||
className={cn(
|
||||
'flex w-full items-center gap-3 rounded-md p-2 transition-colors',
|
||||
isSelected && 'bg-[var(--accent-orange)]/10',
|
||||
)}
|
||||
>
|
||||
<span className="text-muted-foreground">
|
||||
{provider.type === 'browseros' ? (
|
||||
<BrowserOSIcon size={18} />
|
||||
) : (
|
||||
<ProviderIcon
|
||||
type={provider.type as ProviderType}
|
||||
size={18}
|
||||
/>
|
||||
{groups.map((group) => (
|
||||
<CommandGroup key={group.key} heading={group.label}>
|
||||
{group.options.map((provider) => {
|
||||
const isSelected = selectedProvider.id === provider.id
|
||||
const subtitle = getProviderSubtitle(provider)
|
||||
return (
|
||||
<CommandItem
|
||||
key={provider.id}
|
||||
value={getProviderSearchValue(provider, group.label)}
|
||||
onSelect={() => {
|
||||
onSelectProvider(provider)
|
||||
setOpen(false)
|
||||
}}
|
||||
className={cn(
|
||||
'flex w-full items-center gap-3 rounded-md p-2 transition-colors',
|
||||
isSelected && 'bg-[var(--accent-orange)]/10',
|
||||
)}
|
||||
</span>
|
||||
<span className="flex-1 text-left text-sm">
|
||||
{provider.name}
|
||||
</span>
|
||||
{isSelected && (
|
||||
<Check className="h-3.5 w-3.5 text-[var(--accent-orange)]" />
|
||||
)}
|
||||
</CommandItem>
|
||||
)
|
||||
})}
|
||||
</CommandGroup>
|
||||
>
|
||||
<span className="text-muted-foreground">
|
||||
<ProviderOptionIcon provider={provider} />
|
||||
</span>
|
||||
<span className="min-w-0 flex-1 text-left">
|
||||
<span className="block truncate text-sm">
|
||||
{provider.name}
|
||||
</span>
|
||||
{subtitle && (
|
||||
<span className="block truncate text-muted-foreground text-xs">
|
||||
{subtitle}
|
||||
</span>
|
||||
)}
|
||||
</span>
|
||||
{isSelected && (
|
||||
<Check className="h-3.5 w-3.5 text-[var(--accent-orange)]" />
|
||||
)}
|
||||
</CommandItem>
|
||||
)
|
||||
})}
|
||||
</CommandGroup>
|
||||
))}
|
||||
<div className="border-border border-t p-1">
|
||||
<button
|
||||
type="button"
|
||||
@@ -96,3 +105,9 @@ export const ChatProviderSelector: FC<
|
||||
</Popover>
|
||||
)
|
||||
}
|
||||
|
||||
function ProviderOptionIcon({ provider }: { provider: Provider }) {
|
||||
if (provider.kind === 'acp') return <Bot size={18} />
|
||||
if (provider.type === 'browseros') return <BrowserOSIcon size={18} />
|
||||
return <ProviderIcon type={provider.type as ProviderType} size={18} />
|
||||
}
|
||||
|
||||
@@ -1,7 +1,14 @@
|
||||
import type { ProviderType } from '@/lib/llm-providers/types'
|
||||
|
||||
export type ChatProviderType = ProviderType | 'acp'
|
||||
|
||||
export interface Provider {
|
||||
id: string
|
||||
name: string
|
||||
type: ProviderType
|
||||
type: ChatProviderType
|
||||
kind: 'llm' | 'acp'
|
||||
agentId?: string
|
||||
adapterName?: string
|
||||
modelLabel?: string
|
||||
modelControl?: 'runtime-supported' | 'best-effort'
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { ArrowLeft, Bot, Home } from 'lucide-react'
|
||||
import { type FC, useEffect, useMemo, useRef, useState } from 'react'
|
||||
import { type FC, useEffect, useMemo, useRef } from 'react'
|
||||
import { Navigate, useNavigate, useParams, useSearchParams } from 'react-router'
|
||||
import { Button } from '@/components/ui/button'
|
||||
import {
|
||||
@@ -16,9 +16,7 @@ import {
|
||||
flattenHistoryPages,
|
||||
} from './claw-chat-types'
|
||||
import { useAgentConversation } from './useAgentConversation'
|
||||
import { useClawChatHistory } from './useClawChatHistory'
|
||||
import { useHarnessChatHistory } from './useHarnessChatHistory'
|
||||
import { useOutboundQueue } from './useOutboundQueue'
|
||||
|
||||
function StatusBadge({ status }: { status: string }) {
|
||||
return (
|
||||
@@ -176,19 +174,10 @@ function getAgentEntryMeta(agent: AgentEntry | undefined): string {
|
||||
return getModelDisplayName(agent?.model) ?? 'OpenClaw agent'
|
||||
}
|
||||
|
||||
function getConversationStatusCopy(status: string | undefined): string {
|
||||
if (status === 'running') return 'Ready'
|
||||
if (status === 'starting') return 'Connecting'
|
||||
if (status === 'error') return 'Attention'
|
||||
if (status === 'stopped') return 'Offline'
|
||||
return 'Setup'
|
||||
}
|
||||
|
||||
function AgentConversationController({
|
||||
agentId,
|
||||
initialMessage,
|
||||
onInitialMessageConsumed,
|
||||
status,
|
||||
agents,
|
||||
agentPathPrefix,
|
||||
createAgentPath,
|
||||
@@ -196,7 +185,6 @@ function AgentConversationController({
|
||||
agentId: string
|
||||
initialMessage: string | null
|
||||
onInitialMessageConsumed: () => void
|
||||
status: ReturnType<typeof useAgentCommandData>['status']
|
||||
agents: AgentEntry[]
|
||||
agentPathPrefix: string
|
||||
createAgentPath: string
|
||||
@@ -204,121 +192,49 @@ function AgentConversationController({
|
||||
const navigate = useNavigate()
|
||||
const initialMessageSentRef = useRef<string | null>(null)
|
||||
const onInitialMessageConsumedRef = useRef(onInitialMessageConsumed)
|
||||
const [streamSessionKey, setStreamSessionKey] = useState<string | null>(null)
|
||||
const agent = agents.find((entry) => entry.agentId === agentId)
|
||||
const agentName = agent?.name || agentId || 'Agent'
|
||||
const isAgentHarnessAgent = agent?.source === 'agent-harness'
|
||||
const clawHistoryQuery = useClawChatHistory({
|
||||
agentId,
|
||||
sessionKey: streamSessionKey,
|
||||
enabled: Boolean(agent) && !isAgentHarnessAgent,
|
||||
})
|
||||
const harnessHistoryQuery = useHarnessChatHistory(
|
||||
agentId,
|
||||
Boolean(agent) && isAgentHarnessAgent,
|
||||
)
|
||||
// Routing is now harness-only. Every OpenClaw agent has a harness
|
||||
// record post the gateway → harness backfill, so the chat panel
|
||||
// always talks to /agents/<id>/chat. The legacy ClawChat surface
|
||||
// was deleted with the /claw/agents/:id/chat server route.
|
||||
const harnessHistoryQuery = useHarnessChatHistory(agentId, Boolean(agent))
|
||||
|
||||
const historyMessages = useMemo(
|
||||
() =>
|
||||
flattenHistoryPages(
|
||||
isAgentHarnessAgent
|
||||
? harnessHistoryQuery.data
|
||||
? [harnessHistoryQuery.data]
|
||||
: []
|
||||
: (clawHistoryQuery.data?.pages ?? []),
|
||||
harnessHistoryQuery.data ? [harnessHistoryQuery.data] : [],
|
||||
),
|
||||
[
|
||||
clawHistoryQuery.data?.pages,
|
||||
harnessHistoryQuery.data,
|
||||
isAgentHarnessAgent,
|
||||
],
|
||||
[harnessHistoryQuery.data],
|
||||
)
|
||||
const chatHistory = useMemo(
|
||||
() => buildChatHistoryFromClawMessages(historyMessages),
|
||||
[historyMessages],
|
||||
)
|
||||
const resolvedSessionKey =
|
||||
streamSessionKey ??
|
||||
(isAgentHarnessAgent
|
||||
? null
|
||||
: (clawHistoryQuery.data?.pages?.[0]?.sessionKey ?? null))
|
||||
|
||||
const { turns, streaming, send } = useAgentConversation(agentId, {
|
||||
runtime: isAgentHarnessAgent ? 'agent-harness' : 'openclaw',
|
||||
sessionKey: resolvedSessionKey,
|
||||
runtime: 'agent-harness',
|
||||
sessionKey: null,
|
||||
history: chatHistory,
|
||||
onComplete: () => {
|
||||
if (isAgentHarnessAgent) {
|
||||
void harnessHistoryQuery.refetch()
|
||||
}
|
||||
},
|
||||
onSessionKeyChange: (sessionKey) => {
|
||||
setStreamSessionKey(sessionKey)
|
||||
void harnessHistoryQuery.refetch()
|
||||
},
|
||||
onSessionKeyChange: () => {},
|
||||
})
|
||||
const visibleTurns = useMemo(
|
||||
() =>
|
||||
isAgentHarnessAgent
|
||||
? filterTurnsPersistedInHistory(turns, historyMessages)
|
||||
: turns,
|
||||
[historyMessages, isAgentHarnessAgent, turns],
|
||||
() => filterTurnsPersistedInHistory(turns, historyMessages),
|
||||
[historyMessages, turns],
|
||||
)
|
||||
const outboundQueue = useOutboundQueue({
|
||||
agentId,
|
||||
sessionKey: resolvedSessionKey,
|
||||
enabled: Boolean(agent) && !isAgentHarnessAgent,
|
||||
})
|
||||
onInitialMessageConsumedRef.current = onInitialMessageConsumed
|
||||
|
||||
// Refetch history whenever a server-dispatched queue item completes.
|
||||
// The server worker streams the queued turn into OpenClaw directly, so
|
||||
// the client never observes the live tokens — we only see the new
|
||||
// assistant turn once the JSONL is updated. Watching the queue for
|
||||
// any 'sending' item dropping out is the cleanest "turn finalized"
|
||||
// signal we have without exposing per-turn SSE.
|
||||
const previousSendingIdsRef = useRef<Set<string>>(new Set())
|
||||
useEffect(() => {
|
||||
if (isAgentHarnessAgent) return
|
||||
const currentSending = new Set(
|
||||
outboundQueue.queue
|
||||
.filter((item) => item.status === 'sending')
|
||||
.map((item) => item.id),
|
||||
)
|
||||
const dropped = [...previousSendingIdsRef.current].filter(
|
||||
(id) => !currentSending.has(id),
|
||||
)
|
||||
previousSendingIdsRef.current = currentSending
|
||||
if (dropped.length > 0) {
|
||||
void clawHistoryQuery.refetch()
|
||||
}
|
||||
}, [clawHistoryQuery, isAgentHarnessAgent, outboundQueue.queue])
|
||||
|
||||
const disabled =
|
||||
!agent || (!isAgentHarnessAgent && status?.status !== 'running')
|
||||
// Two-part gate: cover both "still fetching" AND "just got enabled but
|
||||
// hasn't started fetching yet". When `enabled` flips true (baseUrl
|
||||
// resolves), there's a render frame where React Query reports
|
||||
// isLoading=false but hasn't run the queryFn yet — `isFetched` is still
|
||||
// false. Without this we render EmptyState during that one frame.
|
||||
const isInitialLoading =
|
||||
!isAgentHarnessAgent &&
|
||||
(clawHistoryQuery.isLoading ||
|
||||
(!clawHistoryQuery.isFetched && !clawHistoryQuery.isError))
|
||||
|
||||
const disabled = !agent
|
||||
const historyReady =
|
||||
(isAgentHarnessAgent &&
|
||||
(harnessHistoryQuery.isFetched || harnessHistoryQuery.isError)) ||
|
||||
(!isAgentHarnessAgent &&
|
||||
(clawHistoryQuery.isFetched || clawHistoryQuery.isError))
|
||||
harnessHistoryQuery.isFetched || harnessHistoryQuery.isError
|
||||
const initialMessageKey = initialMessage
|
||||
? `${agentId}:${initialMessage}`
|
||||
: null
|
||||
const error = isAgentHarnessAgent
|
||||
? (harnessHistoryQuery.error ?? null)
|
||||
: (clawHistoryQuery.error ?? null)
|
||||
const error = harnessHistoryQuery.error ?? null
|
||||
|
||||
const enqueueRef = useRef(outboundQueue.enqueue)
|
||||
enqueueRef.current = outboundQueue.enqueue
|
||||
const sendRef = useRef(send)
|
||||
sendRef.current = send
|
||||
|
||||
@@ -340,18 +256,8 @@ function AgentConversationController({
|
||||
|
||||
initialMessageSentRef.current = initialMessageKey
|
||||
onInitialMessageConsumedRef.current()
|
||||
if (isAgentHarnessAgent) {
|
||||
void sendRef.current({ text: query })
|
||||
} else {
|
||||
enqueueRef.current({ text: query })
|
||||
}
|
||||
}, [
|
||||
disabled,
|
||||
historyReady,
|
||||
initialMessage,
|
||||
initialMessageKey,
|
||||
isAgentHarnessAgent,
|
||||
])
|
||||
void sendRef.current({ text: query })
|
||||
}, [disabled, historyReady, initialMessage, initialMessageKey])
|
||||
|
||||
const handleSelectAgent = (entry: AgentEntry) => {
|
||||
navigate(`${agentPathPrefix}/${entry.agentId}`)
|
||||
@@ -364,27 +270,13 @@ function AgentConversationController({
|
||||
historyMessages={historyMessages}
|
||||
turns={visibleTurns}
|
||||
streaming={streaming}
|
||||
isInitialLoading={
|
||||
isAgentHarnessAgent ? harnessHistoryQuery.isLoading : isInitialLoading
|
||||
}
|
||||
isInitialLoading={harnessHistoryQuery.isLoading}
|
||||
error={error}
|
||||
hasNextPage={
|
||||
isAgentHarnessAgent ? false : Boolean(clawHistoryQuery.hasNextPage)
|
||||
}
|
||||
isFetchingNextPage={
|
||||
isAgentHarnessAgent ? false : clawHistoryQuery.isFetchingNextPage
|
||||
}
|
||||
onFetchNextPage={() => {
|
||||
if (!isAgentHarnessAgent) {
|
||||
void clawHistoryQuery.fetchNextPage()
|
||||
}
|
||||
}}
|
||||
hasNextPage={false}
|
||||
isFetchingNextPage={false}
|
||||
onFetchNextPage={() => {}}
|
||||
onRetry={() => {
|
||||
if (isAgentHarnessAgent) {
|
||||
void harnessHistoryQuery.refetch()
|
||||
} else {
|
||||
void clawHistoryQuery.refetch()
|
||||
}
|
||||
void harnessHistoryQuery.refetch()
|
||||
}}
|
||||
/>
|
||||
|
||||
@@ -404,32 +296,14 @@ function AgentConversationController({
|
||||
name: a.name,
|
||||
dataUrl: a.dataUrl,
|
||||
}))
|
||||
if (isAgentHarnessAgent) {
|
||||
void send({ text: input.text, attachments, attachmentPreviews })
|
||||
} else {
|
||||
outboundQueue.enqueue({
|
||||
text: input.text,
|
||||
attachments,
|
||||
attachmentPreviews,
|
||||
history: chatHistory,
|
||||
})
|
||||
}
|
||||
void send({ text: input.text, attachments, attachmentPreviews })
|
||||
}}
|
||||
onCreateAgent={() => navigate(createAgentPath)}
|
||||
streaming={streaming}
|
||||
disabled={disabled}
|
||||
status={isAgentHarnessAgent ? 'running' : status?.status}
|
||||
attachmentsEnabled={!isAgentHarnessAgent}
|
||||
status="running"
|
||||
attachmentsEnabled={true}
|
||||
placeholder={`Message ${agentName}...`}
|
||||
outboundQueue={
|
||||
isAgentHarnessAgent ? undefined : outboundQueue.queue
|
||||
}
|
||||
onCancelQueued={
|
||||
isAgentHarnessAgent ? undefined : outboundQueue.cancel
|
||||
}
|
||||
onRetryQueued={
|
||||
isAgentHarnessAgent ? undefined : outboundQueue.retry
|
||||
}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
@@ -453,7 +327,7 @@ export const AgentCommandConversation: FC<AgentCommandConversationProps> = ({
|
||||
const { agentId } = useParams<{ agentId: string }>()
|
||||
const [searchParams, setSearchParams] = useSearchParams()
|
||||
const navigate = useNavigate()
|
||||
const { status, agents } = useAgentCommandData()
|
||||
const { agents } = useAgentCommandData()
|
||||
const shouldRedirectHome = !agentId
|
||||
const resolvedAgentId = agentId ?? ''
|
||||
const agent = agents.find((entry) => entry.agentId === resolvedAgentId)
|
||||
@@ -471,10 +345,11 @@ export const AgentCommandConversation: FC<AgentCommandConversationProps> = ({
|
||||
navigate(`${agentPathPrefix}/${entry.agentId}`)
|
||||
}
|
||||
|
||||
const statusCopy =
|
||||
agent?.source === 'agent-harness'
|
||||
? 'Ready'
|
||||
: getConversationStatusCopy(status?.status)
|
||||
// Every visible agent runs through the harness now, so per-agent
|
||||
// runtime status doesn't gate chat the way OpenClaw's legacy
|
||||
// gateway lifecycle did. Show "Ready" once the agent record is
|
||||
// resolved from the rail, "Setup" otherwise.
|
||||
const statusCopy = agent ? 'Ready' : 'Setup'
|
||||
|
||||
return (
|
||||
<div className="absolute inset-0 overflow-hidden bg-background md:pl-[theme(spacing.14)]">
|
||||
@@ -500,7 +375,6 @@ export const AgentCommandConversation: FC<AgentCommandConversationProps> = ({
|
||||
key={resolvedAgentId}
|
||||
agentId={resolvedAgentId}
|
||||
agents={agents}
|
||||
status={status}
|
||||
initialMessage={initialMessage}
|
||||
onInitialMessageConsumed={() =>
|
||||
setSearchParams({}, { replace: true })
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import {
|
||||
AlertTriangle,
|
||||
ArrowRight,
|
||||
Bot,
|
||||
ChevronDown,
|
||||
@@ -9,7 +8,6 @@ import {
|
||||
Loader2,
|
||||
Mic,
|
||||
Paperclip,
|
||||
RefreshCw,
|
||||
Square,
|
||||
X,
|
||||
} from 'lucide-react'
|
||||
@@ -38,7 +36,6 @@ import { cn } from '@/lib/utils'
|
||||
import { useVoiceInput } from '@/lib/voice/useVoiceInput'
|
||||
import { useWorkspace } from '@/lib/workspace/use-workspace'
|
||||
import { AgentSelector } from './AgentSelector'
|
||||
import type { OutboundMessage } from './useOutboundQueue'
|
||||
|
||||
export interface ConversationInputSendInput {
|
||||
text: string
|
||||
@@ -57,15 +54,6 @@ interface ConversationInputProps {
|
||||
placeholder?: string
|
||||
attachmentsEnabled?: boolean
|
||||
variant?: 'home' | 'conversation'
|
||||
// Outbound queue: when present, the composer renders the queue strip
|
||||
// above the textarea and lets the user keep sending while a previous
|
||||
// turn is in flight. Optional so non-conversation variants (the home
|
||||
// page) can opt out — the queue only makes sense in the conversation
|
||||
// page where each enqueued message will eventually be delivered to the
|
||||
// active agent.
|
||||
outboundQueue?: OutboundMessage[]
|
||||
onCancelQueued?: (id: string) => void
|
||||
onRetryQueued?: (id: string) => void
|
||||
}
|
||||
|
||||
function InputActionButton({
|
||||
@@ -311,9 +299,6 @@ export const ConversationInput: FC<ConversationInputProps> = ({
|
||||
placeholder,
|
||||
attachmentsEnabled = true,
|
||||
variant = 'conversation',
|
||||
outboundQueue,
|
||||
onCancelQueued,
|
||||
onRetryQueued,
|
||||
}) => {
|
||||
const [input, setInput] = useState('')
|
||||
const [selectedTabs, setSelectedTabs] = useState<chrome.tabs.Tab[]>([])
|
||||
@@ -394,15 +379,10 @@ export const ConversationInput: FC<ConversationInputProps> = ({
|
||||
}
|
||||
|
||||
const hasContent = input.trim().length > 0 || attachments.length > 0
|
||||
const queueEnabled = outboundQueue !== undefined
|
||||
|
||||
const handleSend = () => {
|
||||
const text = input.trim()
|
||||
// The outbound queue accepts new messages while streaming; legacy
|
||||
// direct-send callers (e.g., the home composer) keep the original
|
||||
// streaming-blocks-send semantic.
|
||||
if (disabled || isStaging) return
|
||||
if (!queueEnabled && streaming) return
|
||||
if (disabled || isStaging || streaming) return
|
||||
if (!text && attachments.length === 0) return
|
||||
onSend({ text, attachments })
|
||||
setInput('')
|
||||
@@ -494,13 +474,6 @@ export const ConversationInput: FC<ConversationInputProps> = ({
|
||||
error={attachmentError}
|
||||
/>
|
||||
) : null}
|
||||
{queueEnabled && outboundQueue && outboundQueue.length > 0 ? (
|
||||
<OutboundQueueStrip
|
||||
messages={outboundQueue}
|
||||
onCancel={onCancelQueued}
|
||||
onRetry={onRetryQueued}
|
||||
/>
|
||||
) : null}
|
||||
<div
|
||||
className={cn(
|
||||
'flex gap-3',
|
||||
@@ -556,10 +529,7 @@ export const ConversationInput: FC<ConversationInputProps> = ({
|
||||
!!disabled ||
|
||||
voice.isRecording ||
|
||||
voice.isTranscribing ||
|
||||
// Only block on `streaming` for the legacy direct-send path
|
||||
// (no queue). With the queue active the press always
|
||||
// succeeds — it just enqueues instead of dispatching.
|
||||
(!queueEnabled && streaming)
|
||||
streaming
|
||||
}
|
||||
onClick={handleSend}
|
||||
// Spinner stays the user-facing "agent is busy" hint; with the
|
||||
@@ -595,117 +565,6 @@ export const ConversationInput: FC<ConversationInputProps> = ({
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Renders the list of outbound messages above the composer, one
 * `OutboundQueueItem` row per message, keyed by the message id.
 * The optional cancel/retry callbacks are forwarded to every row unchanged.
 */
function OutboundQueueStrip(props: {
  messages: OutboundMessage[]
  onCancel?: (id: string) => void
  onRetry?: (id: string) => void
}) {
  const { messages, onCancel, onRetry } = props

  const rows = messages.map((entry) => (
    <OutboundQueueItem
      key={entry.id}
      message={entry}
      onCancel={onCancel}
      onRetry={onRetry}
    />
  ))

  return (
    <div className="border-border/40 border-b px-4 pt-3 pb-2">
      <ul className="flex flex-col gap-1">{rows}</ul>
    </div>
  )
}
|
||||
|
||||
/**
 * A single row in the outbound queue strip.
 *
 * Shows a status icon, a one-line preview of the message text (or a
 * placeholder when the message carries only attachments), an attachment
 * count badge, and the controls that apply to the row's status:
 * - `queued`: a cancel button (only when `onCancel` is provided)
 * - `failed`: the error text plus retry / discard buttons (each only when
 *   its callback is provided)
 */
function OutboundQueueItem(props: {
  message: OutboundMessage
  onCancel?: (id: string) => void
  onRetry?: (id: string) => void
}) {
  const { message, onCancel, onRetry } = props
  const previewText = message.text.trim() || '(attachments only)'
  const attachmentCount = message.attachmentPreviews.length

  // Paperclip + count, rendered only when at least one attachment exists.
  const attachmentBadge =
    attachmentCount > 0 ? (
      <span className="inline-flex items-center gap-1 text-muted-foreground/70">
        <Paperclip className="size-3" />
        <span className="tabular-nums">{attachmentCount}</span>
      </span>
    ) : null

  // Cancel is offered only while the message is still waiting in the queue.
  const cancelQueuedButton =
    message.status === 'queued' && onCancel ? (
      <button
        type="button"
        onClick={() => onCancel(message.id)}
        className="ml-1 inline-flex size-5 items-center justify-center rounded-full text-muted-foreground hover:bg-accent hover:text-foreground"
        aria-label="Cancel queued message"
        title="Cancel"
      >
        <X className="size-3" />
      </button>
    ) : null

  const retryButton = onRetry ? (
    <button
      type="button"
      onClick={() => onRetry(message.id)}
      className="inline-flex size-5 items-center justify-center rounded-full hover:bg-accent hover:text-foreground"
      aria-label="Retry failed message"
      title="Retry"
    >
      <RefreshCw className="size-3" />
    </button>
  ) : null

  const discardButton = onCancel ? (
    <button
      type="button"
      onClick={() => onCancel(message.id)}
      className="inline-flex size-5 items-center justify-center rounded-full hover:bg-accent hover:text-foreground"
      aria-label="Discard failed message"
      title="Discard"
    >
      <X className="size-3" />
    </button>
  ) : null

  // Error text (full text available via the title tooltip) with its actions.
  const failureControls =
    message.status === 'failed' ? (
      <span className="ml-1 inline-flex items-center gap-2 text-destructive">
        <span className="max-w-[160px] truncate" title={message.error}>
          {message.error ?? 'Failed'}
        </span>
        {retryButton}
        {discardButton}
      </span>
    ) : null

  return (
    <li className="flex items-center gap-2 rounded-md px-2 py-1 text-xs">
      <OutboundQueueStatusIcon status={message.status} />
      <span className="min-w-0 flex-1 truncate text-muted-foreground">
        {previewText}
      </span>
      {attachmentBadge}
      {cancelQueuedButton}
      {failureControls}
    </li>
  )
}
|
||||
|
||||
/**
 * Leading status glyph for a queue row: a spinner while sending, a warning
 * triangle when failed, and a muted dot for every other status.
 */
function OutboundQueueStatusIcon(props: { status: OutboundMessage['status'] }) {
  switch (props.status) {
    case 'sending':
      return (
        <Loader2 className="size-3.5 shrink-0 animate-spin text-muted-foreground" />
      )
    case 'failed':
      return <AlertTriangle className="size-3.5 shrink-0 text-destructive" />
    default:
      return (
        <span className="inline-block size-2 shrink-0 rounded-full bg-muted-foreground/40" />
      )
  }
}
|
||||
|
||||
function AttachmentStrip({
|
||||
attachments,
|
||||
onRemove,
|
||||
|
||||
@@ -26,7 +26,15 @@ export const AgentCommandLayout: FC = () => {
|
||||
const { agents: harnessAgents, loading: harnessAgentsLoading } =
|
||||
useHarnessAgents()
|
||||
const visibleOpenClawAgents = openClawEnabled ? openClawAgents : []
|
||||
const agents = [...visibleOpenClawAgents, ...harnessAgents]
|
||||
// Dual-created OpenClaw agents appear in both `/claw/agents` (gateway
|
||||
// record) and `/agents` (harness record) under the same id. Prefer the
|
||||
// harness entry so the chat panel can route through the harness path
|
||||
// and the rail doesn't show duplicates.
|
||||
const harnessAgentIds = new Set(harnessAgents.map((entry) => entry.agentId))
|
||||
const dedupedOpenClawAgents = visibleOpenClawAgents.filter(
|
||||
(entry) => !harnessAgentIds.has(entry.agentId),
|
||||
)
|
||||
const agents = [...dedupedOpenClawAgents, ...harnessAgents]
|
||||
|
||||
return (
|
||||
<Outlet
|
||||
|
||||
@@ -23,9 +23,9 @@ export interface BrowserOSChatHistoryToolCall {
|
||||
toolName: string
|
||||
label: string
|
||||
subject?: string
|
||||
status: 'completed' | 'failed'
|
||||
input?: Record<string, unknown>
|
||||
output?: string
|
||||
status: 'pending' | 'running' | 'completed' | 'failed'
|
||||
input?: unknown
|
||||
output?: unknown
|
||||
error?: string
|
||||
durationMs?: number
|
||||
}
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
import { buildToolLabel } from '../../../lib/tool-labels'
|
||||
import type { HarnessAgentHistoryPage } from '../agents/agent-harness-types'
|
||||
import type {
|
||||
AgentHistoryPageResponse,
|
||||
BrowserOSChatHistoryItem,
|
||||
BrowserOSChatHistoryToolCall,
|
||||
} from './claw-chat-types'
|
||||
|
||||
/**
 * Converts a harness history page into the chat-history page shape used by
 * the conversation UI.
 *
 * Each harness item becomes one chat item. `messageSeq` is the item's
 * 1-based position within the page, and the session key/source are fixed to
 * `'main'` / `'user-chat'`. Tool calls are decorated with a label (and a
 * subject when one is produced) from `buildToolLabel`; optional fields are
 * copied only when present so absent values never appear as explicit
 * `undefined` keys.
 *
 * @param page - History page as returned by the harness.
 * @returns A single, non-paginated page (`hasMore: false`) whose session
 *   `updatedAt` is the newest item's `createdAt`, or the current time when
 *   the page has no items.
 */
export function mapHarnessHistoryPage(
  page: HarnessAgentHistoryPage,
): AgentHistoryPageResponse {
  const items: BrowserOSChatHistoryItem[] = page.items.map((item, index) => {
    const toolCalls = item.toolCalls?.map(
      (tool): BrowserOSChatHistoryToolCall => {
        // Labels are built from object-shaped input only; the raw input is
        // still forwarded untouched below.
        const input = asRecord(tool.input)
        const { label, subject } = buildToolLabel(tool.toolName, input)
        return {
          toolName: tool.toolName,
          label,
          status: tool.status,
          ...(tool.toolCallId ? { toolCallId: tool.toolCallId } : {}),
          ...(subject ? { subject } : {}),
          ...(tool.input !== undefined ? { input: tool.input } : {}),
          ...(tool.output !== undefined ? { output: tool.output } : {}),
          ...(tool.error ? { error: tool.error } : {}),
          ...(tool.durationMs != null ? { durationMs: tool.durationMs } : {}),
        }
      },
    )

    return {
      id: item.id,
      role: item.role,
      text: item.text,
      timestamp: item.createdAt,
      messageSeq: index + 1,
      sessionKey: 'main',
      source: 'user-chat',
      ...(item.reasoning ? { reasoning: item.reasoning } : {}),
      ...(toolCalls && toolCalls.length > 0 ? { toolCalls } : {}),
    }
  })

  // Fold instead of `Math.max(...array)`: spreading passes one call argument
  // per item and throws RangeError once the history is large enough.
  const updatedAt =
    page.items.length > 0
      ? page.items.reduce(
          (latest, item) => Math.max(latest, item.createdAt),
          Number.NEGATIVE_INFINITY,
        )
      : Date.now()

  return {
    agentId: page.agentId,
    sessionKey: 'main',
    session: {
      key: 'main',
      updatedAt,
      sessionId: 'main',
      agentId: page.agentId,
      kind: 'agent-harness',
      source: 'user-chat',
    },
    items,
    page: {
      hasMore: false,
      limit: items.length,
    },
  }
}
|
||||
|
||||
/**
 * Narrows an unknown value to a plain string-keyed record.
 *
 * @returns The same value when it is a non-null, non-array object;
 *   `undefined` for primitives, `null`, and arrays.
 */
function asRecord(value: unknown): Record<string, unknown> | undefined {
  if (!value || typeof value !== 'object' || Array.isArray(value)) {
    return undefined
  }
  return value as Record<string, unknown>
}
|
||||
@@ -1,13 +1,12 @@
|
||||
import { useEffect, useRef, useState } from 'react'
|
||||
import {
|
||||
type AgentHarnessStreamEvent,
|
||||
attachToHarnessTurn,
|
||||
cancelHarnessTurn,
|
||||
chatWithHarnessAgent,
|
||||
fetchActiveHarnessTurn,
|
||||
} from '@/entrypoints/app/agents/useAgents'
|
||||
import {
|
||||
chatWithAgent,
|
||||
type OpenClawChatHistoryMessage,
|
||||
type OpenClawStreamEvent,
|
||||
} from '@/entrypoints/app/agents/useOpenClaw'
|
||||
import type { OpenClawChatHistoryMessage } from '@/entrypoints/app/agents/useOpenClaw'
|
||||
import type {
|
||||
AgentConversationTurn,
|
||||
AssistantPart,
|
||||
@@ -29,7 +28,10 @@ export interface SendInput {
|
||||
}
|
||||
|
||||
interface UseAgentConversationOptions {
|
||||
runtime?: 'openclaw' | 'agent-harness'
|
||||
// The hook always speaks to the harness chat path now; the OpenClaw
|
||||
// legacy /claw/agents/:id/chat surface was removed in Step 12. The
|
||||
// option remains for forward-compatibility.
|
||||
runtime?: 'agent-harness'
|
||||
sessionKey?: string | null
|
||||
history?: OpenClawChatHistoryMessage[]
|
||||
onComplete?: () => void
|
||||
@@ -49,6 +51,11 @@ export function useAgentConversation(
|
||||
const streamAbortRef = useRef<AbortController | null>(null)
|
||||
const onCompleteRef = useRef(options.onComplete)
|
||||
const onSessionKeyChangeRef = useRef(options.onSessionKeyChange)
|
||||
// Per-turn resume bookkeeping. `turnId` is captured from the response
|
||||
// header; `lastSeq` advances with every SSE event so a reconnect can
|
||||
// resume via Last-Event-ID.
|
||||
const turnIdRef = useRef<string | null>(null)
|
||||
const lastSeqRef = useRef<number | null>(null)
|
||||
|
||||
useEffect(() => {
|
||||
sessionKeyRef.current = options.sessionKey ?? ''
|
||||
@@ -72,6 +79,12 @@ export function useAgentConversation(
|
||||
}
|
||||
}, [])
|
||||
|
||||
// Indirection for the resume effect below: lets it call the latest
|
||||
// event handler without re-subscribing on every render.
|
||||
const processEventRef = useRef<(event: AgentHarnessStreamEvent) => void>(
|
||||
() => {},
|
||||
)
|
||||
|
||||
const updateCurrentTurnParts = (
|
||||
updater: (parts: AssistantPart[]) => AssistantPart[],
|
||||
) => {
|
||||
@@ -82,85 +95,6 @@ export function useAgentConversation(
|
||||
})
|
||||
}
|
||||
|
||||
// Applies one OpenClaw stream event to the turn currently being rendered.
// Text and thinking deltas are appended, tool start/end events maintain the
// trailing tool batches, `done` closes the turn, and `error` appends the
// error text. Any other event type is ignored.
const processStreamEvent = (event: OpenClawStreamEvent) => {
  if (event.type === 'text-delta') {
    appendTextDelta((event.data.text as string) ?? '')
    return
  }

  if (event.type === 'thinking') {
    appendThinkingDelta((event.data.text as string) ?? '')
    return
  }

  if (event.type === 'tool-start') {
    const toolName = (event.data.toolName as string) ?? 'unknown'
    const toolArgs = event.data.args as Record<string, unknown> | undefined
    const { label, subject } = buildToolLabel(toolName, toolArgs)
    const startedTool = {
      id: (event.data.toolCallId as string) ?? crypto.randomUUID(),
      name: toolName,
      label,
      subject,
      status: 'running' as const,
    }
    updateCurrentTurnParts((parts) => {
      const tail = parts[parts.length - 1]
      // Consecutive tool calls share one batch; anything else opens a new one.
      if (tail?.kind !== 'tool-batch') {
        return [...parts, { kind: 'tool-batch', tools: [startedTool] }]
      }
      return [
        ...parts.slice(0, -1),
        { ...tail, tools: [...tail.tools, startedTool] },
      ]
    })
    return
  }

  if (event.type === 'tool-end') {
    const finishedId = event.data.toolCallId as string
    const toolStatus: 'completed' | 'error' =
      (event.data.status as string) === 'error' ? 'error' : 'completed'
    const durationMs = event.data.durationMs as number | undefined
    updateCurrentTurnParts((parts) => {
      // Walk backwards so the most recent batch holding this tool wins.
      let at = parts.length - 1
      while (at >= 0) {
        const candidate = parts[at]
        if (
          candidate.kind === 'tool-batch' &&
          candidate.tools.some((t) => t.id === finishedId)
        ) {
          const tools = candidate.tools.map((t) =>
            t.id === finishedId ? { ...t, status: toolStatus, durationMs } : t,
          )
          return [
            ...parts.slice(0, at),
            { ...candidate, tools },
            ...parts.slice(at + 1),
          ]
        }
        at -= 1
      }
      return parts
    })
    return
  }

  if (event.type === 'done') {
    markCurrentTurnDone()
    return
  }

  if (event.type === 'error') {
    const msg =
      (event.data.message as string) ??
      (event.data.error as string) ??
      'Unknown error'
    appendErrorText(msg)
  }
}
|
||||
|
||||
const appendTextDelta = (delta: string) => {
|
||||
textAccRef.current += delta
|
||||
const text = textAccRef.current
|
||||
@@ -275,6 +209,79 @@ export function useAgentConversation(
|
||||
break
|
||||
}
|
||||
}
|
||||
processEventRef.current = processAgentHarnessStreamEvent
|
||||
|
||||
// On mount (and whenever the agent changes), check whether the server has
// an in-flight turn for this agent and reattach to it. This is what makes
// the chat resilient across tab close/reopen, refresh, and navigation: the
// runtime call kept running on the server while we were away. The effect
// depends only on `agentId` — the event handler is read off a ref so this
// doesn't re-subscribe every render.
useEffect(() => {
  let cancelled = false
  const abortController = new AbortController()

  const attemptResume = async () => {
    // Set once this attempt has staged a placeholder turn and claimed the
    // shared stream slot. Until then there is nothing of ours to unwind,
    // and the shared refs/state may belong to a concurrent `send`.
    let ownsStream = false
    try {
      const active = await fetchActiveHarnessTurn(agentId)
      if (cancelled || !active || active.status !== 'running') return
      if (streamAbortRef.current) return // a fresh send already in flight

      ownsStream = true

      // Stage a placeholder turn so the streamed events have a row to
      // render into. We don't have the user message text on resume; the
      // assistant turn is what we're catching up on.
      setTurns((prev) => [
        ...prev,
        {
          id: crypto.randomUUID(),
          userText: '',
          parts: [],
          done: false,
          timestamp: active.startedAt,
        },
      ])
      textAccRef.current = ''
      thinkAccRef.current = ''
      turnIdRef.current = active.turnId
      lastSeqRef.current = null
      streamAbortRef.current = abortController
      setStreaming(true)

      const response = await attachToHarnessTurn(agentId, {
        turnId: active.turnId,
        signal: abortController.signal,
      })
      if (!response.ok) return
      await consumeSSEStream<AgentHarnessStreamEvent>(
        response,
        (event, meta) => {
          if (typeof meta.seq === 'number') lastSeqRef.current = meta.seq
          processEventRef.current(event)
        },
        abortController.signal,
      )
    } catch {
      // Resume is best-effort; transient errors fall back to the user
      // starting a new turn manually.
    } finally {
      if (ownsStream) {
        const slot = streamAbortRef.current
        // A different controller in the slot means another stream took
        // over; its bookkeeping must be left alone.
        const superseded = slot !== null && slot !== abortController
        if (!superseded) {
          // Release the slot even when cancelled, so an aborted controller
          // never lingers and blocks a later resume attempt.
          streamAbortRef.current = null
          turnIdRef.current = null
          lastSeqRef.current = null
          // Skip the state update once the effect has been torn down.
          if (!cancelled) setStreaming(false)
        }
      }
    }
  }

  void attemptResume()
  return () => {
    cancelled = true
    abortController.abort()
  }
}, [agentId])
|
||||
|
||||
const send = async (input: string | SendInput) => {
|
||||
const normalized: SendInput =
|
||||
@@ -304,17 +311,25 @@ export function useAgentConversation(
|
||||
streamAbortRef.current = abortController
|
||||
|
||||
try {
|
||||
const response =
|
||||
options.runtime === 'agent-harness'
|
||||
? await chatWithHarnessAgent(agentId, trimmed, abortController.signal)
|
||||
: await chatWithAgent(
|
||||
agentId,
|
||||
trimmed,
|
||||
sessionKeyRef.current || undefined,
|
||||
historyRef.current,
|
||||
abortController.signal,
|
||||
attachments,
|
||||
)
|
||||
let response = await chatWithHarnessAgent(
|
||||
agentId,
|
||||
trimmed,
|
||||
abortController.signal,
|
||||
attachments,
|
||||
)
|
||||
// 409 means the server already has an active turn for this
|
||||
// agent (e.g. a previous tab kicked one off and we're a fresh
|
||||
// mount that missed the resume window). Attach to it instead of
|
||||
// double-sending.
|
||||
if (response.status === 409) {
|
||||
const body = (await response.json()) as { turnId?: string }
|
||||
if (body.turnId) {
|
||||
response = await attachToHarnessTurn(agentId, {
|
||||
turnId: body.turnId,
|
||||
signal: abortController.signal,
|
||||
})
|
||||
}
|
||||
}
|
||||
const responseSessionKey =
|
||||
response.headers.get('X-Session-Key') ??
|
||||
response.headers.get('X-Session-Id')
|
||||
@@ -322,6 +337,11 @@ export function useAgentConversation(
|
||||
sessionKeyRef.current = responseSessionKey
|
||||
onSessionKeyChangeRef.current?.(responseSessionKey)
|
||||
}
|
||||
const responseTurnId = response.headers.get('X-Turn-Id')
|
||||
if (responseTurnId) {
|
||||
turnIdRef.current = responseTurnId
|
||||
lastSeqRef.current = null
|
||||
}
|
||||
if (!response.ok) {
|
||||
const err = await response.text()
|
||||
updateCurrentTurnParts((parts) => [
|
||||
@@ -330,19 +350,14 @@ export function useAgentConversation(
|
||||
])
|
||||
return
|
||||
}
|
||||
if (options.runtime === 'agent-harness') {
|
||||
await consumeSSEStream<AgentHarnessStreamEvent>(
|
||||
response,
|
||||
processAgentHarnessStreamEvent,
|
||||
abortController.signal,
|
||||
)
|
||||
} else {
|
||||
await consumeSSEStream<OpenClawStreamEvent>(
|
||||
response,
|
||||
processStreamEvent,
|
||||
abortController.signal,
|
||||
)
|
||||
}
|
||||
await consumeSSEStream<AgentHarnessStreamEvent>(
|
||||
response,
|
||||
(event, meta) => {
|
||||
if (typeof meta.seq === 'number') lastSeqRef.current = meta.seq
|
||||
processAgentHarnessStreamEvent(event)
|
||||
},
|
||||
abortController.signal,
|
||||
)
|
||||
} catch (err) {
|
||||
if (abortController.signal.aborted) return
|
||||
const msg = err instanceof Error ? err.message : String(err)
|
||||
@@ -354,14 +369,35 @@ export function useAgentConversation(
|
||||
if (streamAbortRef.current === abortController) {
|
||||
streamAbortRef.current = null
|
||||
}
|
||||
turnIdRef.current = null
|
||||
lastSeqRef.current = null
|
||||
onCompleteRef.current?.()
|
||||
setStreaming(false)
|
||||
}
|
||||
}
|
||||
|
||||
const resetConversation = () => {
|
||||
/**
 * Stop button handler.
 *
 * Aborting the fetch only detaches this SSE subscriber; the turn itself
 * would otherwise keep running on the server. So the local stream is
 * aborted first and the turn is then cancelled explicitly through the
 * cancel endpoint. A failed cancel request is ignored because the UI has
 * already unwound by that point.
 */
const stop = async () => {
  // Capture the turn id up front, before the stream teardown runs.
  const activeTurnId = turnIdRef.current ?? undefined

  streamAbortRef.current?.abort()
  streamAbortRef.current = null

  const cancelRequest = {
    turnId: activeTurnId,
    reason: 'user pressed stop',
  }
  try {
    await cancelHarnessTurn(agentId, cancelRequest)
  } catch {
    // Best-effort — UI already aborted.
  }
}

/**
 * Clears the conversation: fires `stop` without awaiting it, then drops
 * all turns and clears the streaming flag.
 */
const resetConversation = () => {
  void stop()
  setTurns([])
  setStreaming(false)
}
|
||||
@@ -371,6 +407,7 @@ export function useAgentConversation(
|
||||
streaming,
|
||||
sessionKey: sessionKeyRef.current,
|
||||
send,
|
||||
stop,
|
||||
resetConversation,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,71 +0,0 @@
|
||||
import { useInfiniteQuery } from '@tanstack/react-query'
|
||||
import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
|
||||
import type { AgentHistoryPageResponse } from './claw-chat-types'
|
||||
|
||||
const HISTORY_QUERY_KEY = 'claw-agent-history'
|
||||
|
||||
/**
 * GETs `url` and returns the parsed JSON body.
 *
 * @param url - Absolute URL to request.
 * @returns The response body parsed as JSON, typed as `T` (not validated).
 * @throws Error when the response status is not OK. The message is the
 *   body's `error` field when that is a non-empty string, otherwise
 *   `Request failed with status <code>`.
 */
async function fetchClawJson<T>(url: string): Promise<T> {
  const response = await fetch(url)

  if (!response.ok) {
    let message = `Request failed with status ${response.status}`
    try {
      const body: unknown = await response.json()
      const detail =
        body !== null && typeof body === 'object'
          ? (body as { error?: unknown }).error
          : undefined
      // Only a real string is usable as a message; an object here would
      // otherwise surface as "[object Object]".
      if (typeof detail === 'string' && detail) message = detail
    } catch {
      // Error body was not JSON; keep the status-based message.
    }
    throw new Error(message)
  }

  return response.json() as Promise<T>
}
|
||||
|
||||
/**
 * Resolves a path under the `/claw` prefix against the server base URL.
 * `path` is appended verbatim after `/claw`, so it should start with `/`.
 */
function buildClawUrl(baseUrl: string, path: string): URL {
  const clawPath = `/claw${path}`
  return new URL(clawPath, baseUrl)
}
|
||||
|
||||
/**
 * Infinite query over an agent's chat history served under
 * `/claw/agents/:agentId/history`.
 *
 * Pages are requested with `limit`, the optional `sessionKey`, and the
 * cursor of the previous page. The query stays disabled until the server
 * base URL has resolved and an agent id is available.
 *
 * @returns The infinite-query result, with `error` and `isLoading` widened
 *   to also reflect the base-URL lookup.
 */
export function useClawChatHistory({
  agentId,
  sessionKey,
  enabled = true,
  limit = 50,
}: {
  agentId: string
  // null lets the server resolve the most recent user-chat session for the
  // agent — avoids an extra /session round-trip and the race that came with it.
  sessionKey: string | null
  enabled?: boolean
  limit?: number
}) {
  const serverUrl = useAgentServerUrl()
  const baseUrl = serverUrl.baseUrl
  const urlLoading = serverUrl.isLoading
  const urlError = serverUrl.error

  // Only fetch once the base URL is known and there is an agent to ask about.
  const canFetch =
    enabled && Boolean(baseUrl) && !urlLoading && Boolean(agentId)

  const query = useInfiniteQuery<AgentHistoryPageResponse, Error>({
    queryKey: [HISTORY_QUERY_KEY, baseUrl, agentId, sessionKey],
    initialPageParam: undefined as string | undefined,
    queryFn: async ({ pageParam }) => {
      const target = buildClawUrl(
        baseUrl as string,
        `/agents/${agentId}/history`,
      )
      const params = target.searchParams
      params.set('limit', String(limit))
      if (sessionKey) {
        params.set('sessionKey', sessionKey)
      }
      if (typeof pageParam === 'string' && pageParam) {
        params.set('cursor', pageParam)
      }
      return fetchClawJson<AgentHistoryPageResponse>(target.toString())
    },
    getNextPageParam: (lastPage) => {
      const { hasMore, cursor } = lastPage.page
      return hasMore ? cursor : undefined
    },
    enabled: canFetch,
  })

  return {
    ...query,
    error: query.error ?? urlError,
    isLoading: query.isLoading || urlLoading,
  }
}
|
||||
@@ -0,0 +1,55 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import { mapHarnessHistoryPage } from './harness-history-mapper'
|
||||
|
||||
describe('mapHarnessHistoryPage', () => {
  it('maps rich harness history into chat history items', () => {
    // One assistant message carrying reasoning and one completed tool call.
    const harnessPage = {
      agentId: 'agent-1',
      sessionId: 'main',
      items: [
        {
          id: 'agent:agent-1:main:1',
          agentId: 'agent-1',
          sessionId: 'main',
          role: 'assistant',
          text: 'Done.',
          createdAt: 1000,
          reasoning: { text: 'checking state' },
          toolCalls: [
            {
              toolCallId: 'tool-1',
              toolName: 'read_file',
              status: 'completed',
              input: { path: 'src/index.ts' },
              output: 'file contents',
            },
          ],
        },
      ],
    }

    // Same message in chat-history shape; label and subject are derived
    // from the tool name and its input path.
    const expectedItems = [
      {
        id: 'agent:agent-1:main:1',
        role: 'assistant',
        text: 'Done.',
        timestamp: 1000,
        messageSeq: 1,
        sessionKey: 'main',
        source: 'user-chat',
        reasoning: { text: 'checking state' },
        toolCalls: [
          {
            toolCallId: 'tool-1',
            toolName: 'read_file',
            label: 'Read file',
            subject: 'index.ts',
            status: 'completed',
            input: { path: 'src/index.ts' },
            output: 'file contents',
          },
        ],
      },
    ]

    const page = mapHarnessHistoryPage(harnessPage)

    expect(page.items).toEqual(expectedItems)
  })
})
|
||||
@@ -1,11 +1,8 @@
|
||||
import { useQuery } from '@tanstack/react-query'
|
||||
import type { HarnessAgentHistoryPage } from '@/entrypoints/app/agents/agent-harness-types'
|
||||
import { fetchHarnessAgentHistory } from '@/entrypoints/app/agents/useAgents'
|
||||
import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
|
||||
import type {
|
||||
AgentHistoryPageResponse,
|
||||
BrowserOSChatHistoryItem,
|
||||
} from './claw-chat-types'
|
||||
import type { AgentHistoryPageResponse } from './claw-chat-types'
|
||||
import { mapHarnessHistoryPage } from './harness-history-mapper'
|
||||
|
||||
const HISTORY_QUERY_KEY = 'harness-agent-history'
|
||||
|
||||
@@ -30,39 +27,3 @@ export function useHarnessChatHistory(agentId: string, enabled = true) {
|
||||
isLoading: query.isLoading || urlLoading,
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Converts a harness history page into the chat-history page shape.
 *
 * Every item is mapped one-to-one; `messageSeq` is the item's 1-based
 * position within the page, and the session key/source are fixed to
 * `'main'` / `'user-chat'`.
 *
 * @param page - History page as returned by the harness.
 * @returns A single, non-paginated page (`hasMore: false`) whose session
 *   `updatedAt` is the newest item's `createdAt`, or the current time when
 *   the page has no items.
 */
function mapHarnessHistoryPage(
  page: HarnessAgentHistoryPage,
): AgentHistoryPageResponse {
  const items: BrowserOSChatHistoryItem[] = page.items.map((item, index) => ({
    id: item.id,
    role: item.role,
    text: item.text,
    timestamp: item.createdAt,
    messageSeq: index + 1,
    sessionKey: 'main',
    source: 'user-chat',
  }))

  // Fold instead of `Math.max(...array)`: spreading passes one call argument
  // per item and throws RangeError once the history is large enough.
  const updatedAt =
    page.items.length > 0
      ? page.items.reduce(
          (latest, item) => Math.max(latest, item.createdAt),
          Number.NEGATIVE_INFINITY,
        )
      : Date.now()

  return {
    agentId: page.agentId,
    sessionKey: 'main',
    session: {
      key: 'main',
      updatedAt,
      sessionId: 'main',
      agentId: page.agentId,
      kind: 'agent-harness',
      source: 'user-chat',
    },
    items,
    page: {
      hasMore: false,
      limit: items.length,
    },
  }
}
|
||||
|
||||
@@ -1,271 +0,0 @@
|
||||
import { useCallback, useEffect, useRef, useState } from 'react'
|
||||
import type { OpenClawChatHistoryMessage } from '@/entrypoints/app/agents/useOpenClaw'
|
||||
import type { UserAttachmentPreview } from '@/lib/agent-conversations/types'
|
||||
import type { ServerAttachmentPayload } from '@/lib/attachments'
|
||||
import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
|
||||
|
||||
/** Client-side lifecycle of a queued outbound message. */
export type OutboundMessageStatus = 'queued' | 'sending' | 'failed'

/** One row of the outbound queue as exposed to the UI. */
export interface OutboundMessage {
  /** Queue id; generated on the client and sent to the server with the POST. */
  id: string
  /** Message text. */
  text: string
  /** Attachment payloads; empty on rows rebuilt from a server snapshot. */
  attachments: ServerAttachmentPayload[]
  /** Previews held locally so the UI can keep rendering attachment chips. */
  attachmentPreviews: UserAttachmentPreview[]
  /** Current lifecycle state. */
  status: OutboundMessageStatus
  /** Failure description, when there is one. */
  error?: string
  /** Creation timestamp (`Date.now()` locally, or the server-reported value). */
  createdAt: number
}
|
||||
|
||||
/** Arguments accepted by `OutboundQueueApi.enqueue`. */
export interface OutboundQueueEnqueueInput {
  /** Raw message text; the queue trims it before sending. */
  text: string
  /** Attachment payloads to send with the message. */
  attachments?: ServerAttachmentPayload[]
  /** Previews for the attachments, kept on the client for display. */
  attachmentPreviews?: UserAttachmentPreview[]
  /** Chat history forwarded as-is in the enqueue request body. */
  history?: OpenClawChatHistoryMessage[]
}
|
||||
|
||||
/** Value returned by `useOutboundQueue`. */
export interface OutboundQueueApi {
  /** Current queue rows. */
  queue: OutboundMessage[]
  /** Adds a message to the queue and posts it to the server. */
  enqueue(input: OutboundQueueEnqueueInput): void
  /** Cancels the row with the given queue id. */
  cancel(id: string): void
  /** Retries the row with the given queue id. */
  retry(id: string): void
}
|
||||
|
||||
/** Options for `useOutboundQueue`. */
interface UseOutboundQueueOptions {
  /** Agent whose queue is tracked; the hook stays idle while this is empty. */
  agentId: string | null | undefined
  /** Session key sent along with each enqueue request, when set. */
  sessionKey?: string | null
  /** Turns the hook on or off; treated as `true` when omitted. */
  enabled?: boolean
}
|
||||
|
||||
/** Shape of one queue entry inside a queue-stream snapshot event. */
interface ServerQueuedItem {
  /** Queue id. */
  id: string
  /** Server-side state; `'dispatching'` is surfaced to the UI as `'sending'`. */
  status: 'queued' | 'dispatching' | 'failed'
  /** Message text. */
  message: string
  /** Attachment metadata only; no attachment contents. */
  attachmentsPreview: {
    kind: 'image' | 'file'
    mediaType: string
    name?: string
  }[]
  /** Failure description, when present. */
  error?: string
  /** Creation timestamp reported by the server. */
  createdAt: number
}
|
||||
|
||||
/**
 * Generates a queue id.
 *
 * Uses `crypto.randomUUID()` when the runtime provides it; otherwise
 * combines the current time and a random number, both in base 36.
 */
function makeId(): string {
  const webCrypto = typeof crypto === 'undefined' ? undefined : crypto
  if (webCrypto?.randomUUID) {
    return webCrypto.randomUUID()
  }
  const timePart = Date.now().toString(36)
  const randomPart = Math.random().toString(36).slice(2, 10)
  return `${timePart}-${randomPart}`
}
|
||||
|
||||
/**
 * Server-backed outbound message queue. The browser is purely a
 * projection of server state — closing the tab is safe because the queue
 * keeps draining server-side via the OutboundQueueService.
 *
 * Single id-keyed list: the client generates the queue id and hands it
 * to the server in the POST body, so the optimistic row and the SSE
 * snapshot reconcile on the same key from frame zero — there is no
 * window in which the message renders twice.
 *
 * All per-queue state (rows, server-seen ids, attachment previews) is
 * reset whenever the stream target (`baseUrl`, `agentId`, `enabled`)
 * changes, so rows belonging to one agent never show up under another.
 *
 * @param options - Agent id, optional session key and enabled flag.
 * @returns The current queue plus `enqueue`, `cancel` and `retry` actions.
 */
export function useOutboundQueue(
  options: UseOutboundQueueOptions,
): OutboundQueueApi {
  const { agentId, enabled = true, sessionKey } = options
  const { baseUrl } = useAgentServerUrl()
  // Mirror the latest session key into a ref so `enqueue` always sends the
  // current value without having to be re-created when it changes.
  const sessionKeyRef = useRef<string | null | undefined>(sessionKey)
  sessionKeyRef.current = sessionKey

  const [items, setItems] = useState<OutboundMessage[]>([])
  // Track which ids the server has confirmed seeing in any SSE snapshot.
  // We use this to know whether a missing-from-snapshot id is "drained
  // by the server" (drop it) or "still in flight client-side" (keep
  // showing the optimistic row).
  const everSeenByServerRef = useRef<Set<string>>(new Set())
  // Local-only attachment previews, keyed by queue id. Data URLs never
  // leave the browser — the SSE feed only carries metadata, so we hold
  // them here so the chip strip keeps rendering after server takeover.
  const previewMapRef = useRef<Map<string, UserAttachmentPreview[]>>(new Map())

  useEffect(() => {
    // Start every (re)subscription from a clean slate. Without this, rows
    // and seen-ids of the previously selected agent would be merged into
    // the first snapshot of the new one.
    setItems((prev) => (prev.length === 0 ? prev : []))
    everSeenByServerRef.current = new Set()
    previewMapRef.current = new Map()
    if (!enabled || !baseUrl || !agentId) return

    let cancelled = false
    const url = `${baseUrl}/claw/agents/${encodeURIComponent(agentId)}/queue/stream`
    const source = new EventSource(url)
    source.onmessage = (event) => {
      if (cancelled) return
      try {
        const parsed: unknown = JSON.parse(event.data)
        const rawItems = (parsed as { items?: unknown } | null)?.items
        // Not a snapshot we understand — ignore; next snapshot will recover.
        if (!Array.isArray(rawItems)) return
        const snapshot = rawItems as ServerQueuedItem[]

        const seen = everSeenByServerRef.current
        const previews = previewMapRef.current
        const snapshotIds = new Set(snapshot.map((item) => item.id))
        for (const id of snapshotIds) seen.add(id)

        // Server saw these before and they're gone now — drained. Their
        // previews are released here, outside the state updater, so the
        // updater below stays free of side effects.
        for (const id of previews.keys()) {
          if (!snapshotIds.has(id) && seen.has(id)) previews.delete(id)
        }

        const next: OutboundMessage[] = snapshot.map((item) => ({
          id: item.id,
          text: item.message,
          attachments: [],
          attachmentPreviews: previews.get(item.id) ?? [],
          status: serverStatusToClient(item.status),
          error: item.error,
          createdAt: item.createdAt,
        }))

        setItems((prev) => {
          // Carry forward any optimistic / failed entries the server
          // doesn't know about yet (POST in flight) or that failed
          // locally with an error the user should still see.
          const carried = prev.filter((local) => {
            if (snapshotIds.has(local.id)) return false
            if (everSeenByServerRef.current.has(local.id)) return false
            return local.status !== 'failed' || Boolean(local.error)
          })
          return [...carried, ...next]
        })
      } catch {
        // Malformed event — ignore; next snapshot will recover.
      }
    }
    source.onerror = () => {
      // Auto-reconnects; nothing to do here.
    }
    return () => {
      cancelled = true
      source.close()
    }
  }, [baseUrl, agentId, enabled])

  // Flags a local row as failed and releases its cached previews (the row
  // itself keeps the previews it was created with).
  const markFailed = useCallback((id: string, error: string) => {
    previewMapRef.current.delete(id)
    setItems((prev) =>
      prev.map(
        (item): OutboundMessage =>
          item.id === id ? { ...item, status: 'failed', error } : item,
      ),
    )
  }, [])

  const enqueue = useCallback(
    (input: OutboundQueueEnqueueInput) => {
      if (!enabled || !baseUrl || !agentId) return
      const trimmed = input.text.trim()
      const attachments = input.attachments ?? []
      // Nothing to send.
      if (!trimmed && attachments.length === 0) return

      const id = makeId()
      const previews = input.attachmentPreviews ?? []
      previewMapRef.current.set(id, previews)
      // Optimistic row; the SSE snapshot later replaces it under the same id.
      setItems((prev) => [
        ...prev,
        {
          id,
          text: trimmed,
          attachments,
          attachmentPreviews: previews,
          status: 'queued',
          createdAt: Date.now(),
        },
      ])

      void (async () => {
        try {
          const response = await fetch(
            `${baseUrl}/claw/agents/${encodeURIComponent(agentId)}/queue`,
            {
              method: 'POST',
              headers: { 'Content-Type': 'application/json' },
              body: JSON.stringify({
                id,
                message: trimmed,
                attachments: attachments.length > 0 ? attachments : undefined,
                sessionKey: sessionKeyRef.current ?? undefined,
                history: input.history,
              }),
            },
          )
          if (!response.ok) {
            const text = await response.text().catch(() => '')
            markFailed(
              id,
              text || `Failed to enqueue (status ${response.status})`,
            )
          }
        } catch (err) {
          // Only mark as failed if the SSE snapshot hasn't already
          // taken ownership of the entry (i.e. the request actually
          // reached the server).
          if (everSeenByServerRef.current.has(id)) return
          markFailed(
            id,
            err instanceof Error ? err.message : 'Failed to enqueue message',
          )
        }
      })()
    },
    [baseUrl, agentId, enabled, markFailed],
  )

  const cancel = useCallback(
    (id: string) => {
      // If the server has never seen this id, just drop it locally.
      if (!everSeenByServerRef.current.has(id)) {
        previewMapRef.current.delete(id)
        setItems((prev) => prev.filter((item) => item.id !== id))
        return
      }
      if (!enabled || !baseUrl || !agentId) return
      // Server-owned row: ask the server to delete it. The local list is
      // not touched here; errors from the request are ignored.
      void fetch(
        `${baseUrl}/claw/agents/${encodeURIComponent(agentId)}/queue/${encodeURIComponent(id)}`,
        { method: 'DELETE' },
      ).catch(() => {})
    },
    [baseUrl, agentId, enabled],
  )

  const retry = useCallback(
    (id: string) => {
      if (!everSeenByServerRef.current.has(id)) {
        // Optimistic-only entry, never made it to the server. Reset
        // status so the user can press Send again.
        // NOTE(review): nothing in this hook re-posts the row after the
        // reset — confirm the caller is expected to enqueue again.
        setItems((prev) =>
          prev.map(
            (item): OutboundMessage =>
              item.id === id
                ? { ...item, status: 'queued', error: undefined }
                : item,
          ),
        )
        return
      }
      if (!enabled || !baseUrl || !agentId) return
      // Server-owned row: ask the server to retry it; errors from the
      // request are ignored.
      void fetch(
        `${baseUrl}/claw/agents/${encodeURIComponent(agentId)}/queue/${encodeURIComponent(id)}/retry`,
        { method: 'POST' },
      ).catch(() => {})
    },
    [baseUrl, agentId, enabled],
  )

  return { queue: items, enqueue, cancel, retry }
}
|
||||
|
||||
/**
 * Translates a server queue status into the status shown by the client:
 * `'dispatching'` becomes `'sending'`, `'failed'` stays `'failed'`, and
 * anything else is reported as `'queued'`.
 */
function serverStatusToClient(
  status: ServerQueuedItem['status'],
): OutboundMessageStatus {
  switch (status) {
    case 'dispatching':
      return 'sending'
    case 'failed':
      return 'failed'
    default:
      return 'queued'
  }
}
|
||||
@@ -0,0 +1,42 @@
|
||||
import { Bot, Cpu, Sparkles } from 'lucide-react'
|
||||
import type { FC } from 'react'
|
||||
import type { HarnessAgentAdapter } from './agent-harness-types'
|
||||
|
||||
/** Props for `AdapterIcon`. */
interface AdapterIconProps {
  /** Adapter to draw; `'unknown'` (or any unrecognised value) gets the generic icon. */
  adapter: HarnessAgentAdapter | 'unknown'
  /** Class name forwarded to the rendered icon. */
  className?: string
}

/**
 * Single icon component for any adapter the agent rail can render.
 * Falls back to a generic bot when the adapter is unknown so future
 * adapters land without a code change at the call site.
 */
export const AdapterIcon: FC<AdapterIconProps> = ({ adapter, className }) => {
  // Claude Code — text-based agent, sparkles to evoke the "AI assistant" feel.
  if (adapter === 'claude') {
    return <Sparkles className={className} aria-label="Claude Code" />
  }
  // Codex — code-leaning, CPU mark.
  if (adapter === 'codex') {
    return <Cpu className={className} aria-label="Codex" />
  }
  // OpenClaw — bot/automation framing.
  if (adapter === 'openclaw') {
    return <Bot className={className} aria-label="OpenClaw" />
  }
  // Anything else: generic bot.
  return <Bot className={className} aria-label="Agent" />
}
|
||||
|
||||
/**
 * Human-readable name for an adapter.
 *
 * @param adapter - Adapter id, or `'unknown'`.
 * @returns `'Claude Code'`, `'Codex'` or `'OpenClaw'` for the matching
 *   adapter, and `'Agent'` for anything else.
 */
export function adapterLabel(adapter: HarnessAgentAdapter | 'unknown'): string {
  if (adapter === 'claude') return 'Claude Code'
  if (adapter === 'codex') return 'Codex'
  if (adapter === 'openclaw') return 'OpenClaw'
  return 'Agent'
}
|
||||
@@ -1,117 +1,108 @@
|
||||
import { Bot, Cpu, Loader2, MessageSquare, Plus, Trash2 } from 'lucide-react'
|
||||
import type { FC } from 'react'
|
||||
import { Badge } from '@/components/ui/badge'
|
||||
import { Button } from '@/components/ui/button'
|
||||
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'
|
||||
import { Loader2 } from 'lucide-react'
|
||||
import { type FC, useMemo } from 'react'
|
||||
import { AgentRowCard } from './AgentRowCard'
|
||||
import { AgentsEmptyState } from './AgentsEmptyState'
|
||||
import type { HarnessAgent, HarnessAgentAdapter } from './agent-harness-types'
|
||||
import type { AgentListItem } from './agents-page-types'
|
||||
import type { AgentLiveness } from './LivenessDot'
|
||||
|
||||
/** Props for `AgentList`. */
interface AgentListProps {
  /** Agents to list. */
  agents: AgentListItem[]
  /**
   * Optional per-agent activity metadata. Keyed by `agentId`. Missing
   * entries fall back to status='unknown' / lastUsedAt=null and the
   * row renders an "unknown" dot. The server will populate this once
   * the activity tracker ships; the page works without it.
   */
  activity?: Record<string, { status: AgentLiveness; lastUsedAt: number | null }>
  /**
   * Lookup table from harness agent id → adapter + reasoning effort,
   * sourced from `useHarnessAgents`. Lets the row card render the
   * correct adapter icon and chips for harness agents (legacy
   * /claw/agents entries fall back to inferring from `runtimeLabel`).
   */
  harnessAgentLookup?: Map<string, HarnessAgent>
  /** Whether the agent list is still loading. */
  loading: boolean
  /** `key` of the agent currently being deleted, or `null` when none is. */
  deletingAgentKey: string | null
  /** Handler for opening a chat with an agent. */
  onChatAgent: (agent: AgentListItem) => void
  /** Handler for starting agent creation. */
  onCreateAgent: () => void
  /** Handler for deleting an agent. */
  onDeleteAgent: (agent: AgentListItem) => void
}
|
||||
|
||||
export const AgentList: FC<AgentListProps> = ({
|
||||
agents,
|
||||
activity,
|
||||
harnessAgentLookup,
|
||||
loading,
|
||||
deletingAgentKey,
|
||||
onChatAgent,
|
||||
onCreateAgent,
|
||||
onDeleteAgent,
|
||||
}) => {
|
||||
// Sort by recency: most recently used first; never-used agents drop
|
||||
// to the bottom in id-stable order so the list doesn't reshuffle on
|
||||
// every refresh. The pinned exception is the gateway's `main` agent
|
||||
// when it's never been touched — keep it at the top so a fresh
|
||||
// install has an obvious starting point.
|
||||
const ordered = useMemo(() => {
|
||||
const withScore = agents.map((agent) => {
|
||||
const lastUsedAt = activity?.[agent.agentId]?.lastUsedAt ?? null
|
||||
return { agent, lastUsedAt }
|
||||
})
|
||||
return withScore
|
||||
.sort((a, b) => {
|
||||
const aPinned = a.agent.agentId === 'main' && a.lastUsedAt === null
|
||||
const bPinned = b.agent.agentId === 'main' && b.lastUsedAt === null
|
||||
if (aPinned && !bPinned) return -1
|
||||
if (!aPinned && bPinned) return 1
|
||||
const aValue = a.lastUsedAt ?? -Infinity
|
||||
const bValue = b.lastUsedAt ?? -Infinity
|
||||
if (aValue !== bValue) return bValue - aValue
|
||||
return a.agent.agentId.localeCompare(b.agent.agentId)
|
||||
})
|
||||
.map((entry) => entry.agent)
|
||||
}, [activity, agents])
|
||||
|
||||
if (loading && agents.length === 0) {
|
||||
return (
|
||||
<div className="flex h-36 items-center justify-center rounded-lg border border-border/70">
|
||||
<div className="flex h-36 items-center justify-center rounded-xl border border-border border-dashed bg-card/50">
|
||||
<Loader2 className="size-5 animate-spin text-muted-foreground" />
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
if (agents.length === 0) {
|
||||
return (
|
||||
<Card>
|
||||
<CardContent className="flex h-48 flex-col items-center justify-center gap-4 text-center">
|
||||
<div className="flex size-10 items-center justify-center rounded-lg bg-muted text-muted-foreground">
|
||||
<Bot className="size-5" />
|
||||
</div>
|
||||
<div className="space-y-1">
|
||||
<h2 className="font-medium text-base">No agents</h2>
|
||||
<p className="text-muted-foreground text-sm">
|
||||
Create an OpenClaw, Claude Code, or Codex agent.
|
||||
</p>
|
||||
</div>
|
||||
<Button variant="outline" onClick={onCreateAgent}>
|
||||
<Plus className="mr-2 size-4" />
|
||||
New Agent
|
||||
</Button>
|
||||
</CardContent>
|
||||
</Card>
|
||||
)
|
||||
return <AgentsEmptyState onCreateAgent={onCreateAgent} />
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="grid gap-3">
|
||||
{agents.map((agent) => (
|
||||
<Card key={agent.key} className="rounded-lg border-border/70">
|
||||
<CardHeader className="flex flex-row items-center justify-between gap-4 py-3">
|
||||
<div className="flex min-w-0 items-center gap-3">
|
||||
<div className="flex size-10 shrink-0 items-center justify-center rounded-lg bg-muted text-muted-foreground">
|
||||
{agent.source === 'openclaw' ? (
|
||||
<Cpu className="size-5" />
|
||||
) : (
|
||||
<Bot className="size-5" />
|
||||
)}
|
||||
</div>
|
||||
<div className="min-w-0">
|
||||
<CardTitle className="truncate text-base">
|
||||
{agent.name}
|
||||
</CardTitle>
|
||||
<div className="mt-1 flex flex-wrap items-center gap-2 text-muted-foreground text-xs">
|
||||
<Badge variant="outline" className="rounded-md">
|
||||
{agent.runtimeLabel}
|
||||
</Badge>
|
||||
<span>{agent.modelLabel}</span>
|
||||
<Badge variant="outline" className="rounded-md">
|
||||
main
|
||||
</Badge>
|
||||
</div>
|
||||
<p className="mt-1 truncate font-mono text-muted-foreground text-xs">
|
||||
{agent.detail}
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
<div className="flex shrink-0 items-center gap-1">
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="sm"
|
||||
onClick={() => onChatAgent(agent)}
|
||||
disabled={!agent.canChat}
|
||||
>
|
||||
<MessageSquare className="mr-1 size-4" />
|
||||
Chat
|
||||
</Button>
|
||||
{agent.canDelete ? (
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
title="Delete agent"
|
||||
onClick={() => onDeleteAgent(agent)}
|
||||
disabled={deletingAgentKey === agent.key}
|
||||
>
|
||||
{deletingAgentKey === agent.key ? (
|
||||
<Loader2 className="size-4 animate-spin" />
|
||||
) : (
|
||||
<Trash2 className="size-4 text-destructive" />
|
||||
)}
|
||||
</Button>
|
||||
) : null}
|
||||
</div>
|
||||
</CardHeader>
|
||||
</Card>
|
||||
))}
|
||||
{ordered.map((agent) => {
|
||||
const harness = harnessAgentLookup?.get(agent.agentId)
|
||||
const adapter: HarnessAgentAdapter | undefined =
|
||||
harness?.adapter ?? inferAdapterFromLabel(agent.runtimeLabel)
|
||||
return (
|
||||
<AgentRowCard
|
||||
key={agent.key}
|
||||
agent={agent}
|
||||
status={activity?.[agent.agentId]?.status}
|
||||
lastUsedAt={activity?.[agent.agentId]?.lastUsedAt}
|
||||
adapter={adapter}
|
||||
reasoningEffort={harness?.reasoningEffort ?? null}
|
||||
onDelete={onDeleteAgent}
|
||||
deleting={deletingAgentKey === agent.key}
|
||||
/>
|
||||
)
|
||||
})}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Maps a runtime label to its adapter by exact, case-insensitive match.
 *
 * @param label - Runtime label such as `'Claude Code'`.
 * @returns The matching adapter, or `undefined` when the label is missing
 *   or not recognised.
 */
function inferAdapterFromLabel(label: string): HarnessAgentAdapter | undefined {
  switch (label?.toLowerCase()) {
    case 'claude code':
      return 'claude'
    case 'codex':
      return 'codex'
    case 'openclaw':
      return 'openclaw'
    default:
      return undefined
  }
}
|
||||
|
||||
@@ -0,0 +1,270 @@
|
||||
import {
|
||||
Copy,
|
||||
Loader2,
|
||||
MessageSquare,
|
||||
MoreHorizontal,
|
||||
Pencil,
|
||||
RotateCcw,
|
||||
Trash2,
|
||||
} from 'lucide-react'
|
||||
import type { FC } from 'react'
|
||||
import { useNavigate } from 'react-router'
|
||||
import { toast } from 'sonner'
|
||||
import { Badge } from '@/components/ui/badge'
|
||||
import { Button } from '@/components/ui/button'
|
||||
import {
|
||||
DropdownMenu,
|
||||
DropdownMenuContent,
|
||||
DropdownMenuItem,
|
||||
DropdownMenuSeparator,
|
||||
DropdownMenuTrigger,
|
||||
} from '@/components/ui/dropdown-menu'
|
||||
import {
|
||||
Tooltip,
|
||||
TooltipContent,
|
||||
TooltipProvider,
|
||||
TooltipTrigger,
|
||||
} from '@/components/ui/tooltip'
|
||||
import { cn } from '@/lib/utils'
|
||||
import { AdapterIcon, adapterLabel } from './AdapterIcon'
|
||||
import {
|
||||
canDelete as canDeleteAgent,
|
||||
canRename as canRenameAgent,
|
||||
displayName,
|
||||
formatRelativeTime,
|
||||
workspaceLabel,
|
||||
} from './agent-display.helpers'
|
||||
import type { HarnessAgentAdapter } from './agent-harness-types'
|
||||
import type { AgentListItem } from './agents-page-types'
|
||||
import { type AgentLiveness, LivenessDot } from './LivenessDot'
|
||||
|
||||
/** Props for `AgentRowCard`. */
interface AgentRowCardProps {
  /** Agent rendered by this card. */
  agent: AgentListItem
  /**
   * Per-agent extras the listing surface provides on top of the
   * minimal `AgentListItem` shape. `lastUsedAt` survives server
   * restart (sourced from acpx session record); `status` is in-memory
   * server-side.
   */
  status?: AgentLiveness
  /** When the agent was last used; `null`/omitted when not known. */
  lastUsedAt?: number | null
  /** Adapter the agent belongs to. Drives icon + label. */
  adapter?: HarnessAgentAdapter
  /** Reasoning effort chip (claude/codex/openclaw catalog). */
  reasoningEffort?: string | null
  /** Modeled directly off the inbound delete handler so the parent owns the dialog. */
  onDelete: (agent: AgentListItem) => void
  /** Whether THIS agent is mid-delete; renders a spinner in place of the trash icon. */
  deleting?: boolean
}
|
||||
|
||||
/**
 * One row of the agents list: adapter tile with a liveness dot, name and
 * status badge, adapter / model / reasoning chips, last-used line, a Chat
 * button and an overflow menu (copy id, rename, reset history, delete).
 *
 * `status` defaults to `'unknown'`. When `adapter` is not supplied it is
 * inferred from the agent's runtime label.
 */
export const AgentRowCard: FC<AgentRowCardProps> = ({
  agent,
  status = 'unknown',
  lastUsedAt,
  adapter,
  reasoningEffort,
  onDelete,
  deleting,
}) => {
  const navigate = useNavigate()
  const adapterId = adapter ?? inferAdapterFromListItem(agent)
  const name = displayName(agent)
  const workspace = workspaceLabel(agent)
  const lastUsedLabel = formatRelativeTime(lastUsedAt ?? null)
  const allowDelete = canDeleteAgent(agent)
  const allowRename = canRenameAgent(agent)

  // Encode the id so characters such as `/`, `?` or `#` cannot change the
  // route that is navigated to.
  const handleChat = () =>
    navigate(`/agents/${encodeURIComponent(agent.agentId)}`)
  const handleCopyId = async () => {
    try {
      await navigator.clipboard.writeText(agent.agentId)
      toast.success('Agent id copied')
    } catch {
      toast.error('Could not copy agent id')
    }
  }

  return (
    <div
      className={cn(
        'group rounded-xl border border-border bg-card p-4 shadow-sm transition-all',
        'hover:border-[var(--accent-orange)]/50 hover:shadow-sm',
      )}
    >
      <div className="flex items-start gap-4">
        {/* Adapter tile + liveness dot in the corner. */}
        <div className="relative shrink-0">
          <div className="flex h-12 w-12 items-center justify-center rounded-xl bg-muted text-muted-foreground">
            <AdapterIcon adapter={adapterId} className="h-6 w-6" />
          </div>
          <LivenessDot
            status={status}
            detail={livenessDetail(status, lastUsedAt)}
            className="absolute -right-0.5 -bottom-0.5"
          />
        </div>

        <div className="min-w-0 flex-1">
          <div className="mb-1 flex items-center gap-2">
            <span className="truncate font-semibold">{name}</span>
            {status === 'working' && (
              <Badge
                variant="secondary"
                className="bg-amber-50 text-amber-900 hover:bg-amber-50"
              >
                Working
              </Badge>
            )}
            {status === 'asleep' && (
              <Badge variant="outline" className="text-muted-foreground">
                Asleep
              </Badge>
            )}
            {status === 'error' && (
              <Badge variant="destructive">Attention</Badge>
            )}
          </div>

          {/* The 'default' model and 'medium' reasoning effort get no chip. */}
          <div className="mb-2 flex flex-wrap items-center gap-1.5 text-xs">
            <Badge variant="secondary" className="font-normal">
              {adapterLabel(adapterId)}
            </Badge>
            {agent.modelLabel && agent.modelLabel !== 'default' && (
              <Badge variant="outline" className="font-normal">
                {agent.modelLabel}
              </Badge>
            )}
            {reasoningEffort && reasoningEffort !== 'medium' && (
              <Badge variant="outline" className="font-normal">
                {reasoningEffort}
              </Badge>
            )}
          </div>

          <div className="flex flex-wrap items-center gap-2 text-muted-foreground text-xs">
            <span>Last used {lastUsedLabel}</span>
            {workspace && (
              <>
                <span aria-hidden>•</span>
                <span className="truncate font-mono" title={workspace}>
                  {workspace}
                </span>
              </>
            )}
          </div>
        </div>

        <div className="flex shrink-0 items-center gap-2">
          <Button variant="outline" size="sm" onClick={handleChat}>
            <MessageSquare className="mr-1.5 h-3 w-3" />
            Chat
          </Button>
          <DropdownMenu>
            <DropdownMenuTrigger asChild>
              <Button
                variant="ghost"
                size="icon"
                aria-label={`More actions for ${name}`}
                className="h-8 w-8"
              >
                <MoreHorizontal className="h-4 w-4" />
              </Button>
            </DropdownMenuTrigger>
            <DropdownMenuContent align="end" className="w-44">
              <DropdownMenuItem onSelect={() => void handleCopyId()}>
                <Copy className="mr-2 h-3.5 w-3.5" />
                Copy id
              </DropdownMenuItem>
              <RenameMenuItem disabled={!allowRename} />
              <ResetHistoryMenuItem />
              <DropdownMenuSeparator />
              <DropdownMenuItem
                onSelect={() => onDelete(agent)}
                disabled={!allowDelete || deleting}
                className="text-destructive focus:text-destructive"
              >
                {deleting ? (
                  <Loader2 className="mr-2 h-3.5 w-3.5 animate-spin" />
                ) : (
                  <Trash2 className="mr-2 h-3.5 w-3.5" />
                )}
                Delete
              </DropdownMenuItem>
            </DropdownMenuContent>
          </DropdownMenu>
        </div>
      </div>
    </div>
  )
}
|
||||
|
||||
/**
 * "Rename" entry of the row menu. The entry itself is always rendered
 * disabled; when the `disabled` prop is true it is additionally wrapped
 * in a tooltip reading "Rename coming soon".
 */
const RenameMenuItem: FC<{ disabled: boolean }> = ({ disabled }) => {
  const entry = (
    <DropdownMenuItem disabled className="text-muted-foreground">
      <Pencil className="mr-2 h-3.5 w-3.5" />
      Rename
    </DropdownMenuItem>
  )

  // Disabled but with a hint so users know it's coming, not broken.
  return disabled ? (
    <TooltipProvider delayDuration={300}>
      <Tooltip>
        <TooltipTrigger asChild>
          <span className="block w-full">{entry}</span>
        </TooltipTrigger>
        <TooltipContent side="left" className="text-xs">
          Rename coming soon
        </TooltipContent>
      </Tooltip>
    </TooltipProvider>
  ) : (
    entry
  )
}
|
||||
|
||||
/**
 * "Reset history" entry of the row menu: a disabled menu item wrapped in
 * a tooltip reading "Reset history coming soon".
 */
const ResetHistoryMenuItem: FC = () => (
  <TooltipProvider delayDuration={300}>
    <Tooltip>
      <TooltipTrigger asChild>
        <span className="block w-full">
          <DropdownMenuItem disabled className="text-muted-foreground">
            <RotateCcw className="mr-2 h-3.5 w-3.5" />
            Reset history
          </DropdownMenuItem>
        </span>
      </TooltipTrigger>
      <TooltipContent side="left" className="text-xs">
        Reset history coming soon
      </TooltipContent>
    </Tooltip>
  </TooltipProvider>
)
|
||||
|
||||
/**
 * Guesses the adapter from an agent's runtime label.
 *
 * The label is lower-cased and checked for the substrings `claude`,
 * `codex` and `openclaw`, in that order; the first hit wins.
 *
 * @param agent - List item whose `runtimeLabel` is inspected.
 * @returns The matching adapter, or `'unknown'` when the label is missing
 *   or contains none of the names.
 */
function inferAdapterFromListItem(
  agent: AgentListItem,
): HarnessAgentAdapter | 'unknown' {
  const label = agent.runtimeLabel?.toLowerCase() ?? ''
  for (const candidate of ['claude', 'codex', 'openclaw'] as const) {
    if (label.includes(candidate)) return candidate
  }
  return 'unknown'
}
|
||||
|
||||
/**
 * Builds the detail text shown alongside an agent's liveness status.
 *
 * - `working` / `error`: fixed messages that do not depend on
 *   `lastUsedAt`, so they are returned even when no timestamp is known.
 * - `idle` / `asleep`: how long the agent has been quiet, in whole
 *   minutes (or whole hours for `asleep` from 60 minutes on). A timestamp
 *   in the future is treated as zero minutes.
 *
 * @param status - Liveness status of the agent.
 * @param lastUsedAt - Epoch milliseconds of last use, if known.
 * @returns The detail text, or `undefined` when there is nothing to show.
 */
function livenessDetail(
  status: AgentLiveness,
  lastUsedAt: number | null | undefined,
): string | undefined {
  if (status === 'working') return 'Working on a turn'
  if (status === 'error') return 'Attention — last turn failed'

  // The remaining messages report elapsed time, so they need a timestamp.
  if (lastUsedAt == null) return undefined
  const quietMinutes = Math.max(
    0,
    Math.floor((Date.now() - lastUsedAt) / 60_000),
  )

  if (status === 'idle') return `Idle for ${quietMinutes} min`
  if (status === 'asleep') {
    if (quietMinutes < 60) return `Asleep — quiet for ${quietMinutes} min`
    return `Asleep — quiet for ${Math.floor(quietMinutes / 60)} hr`
  }
  return undefined
}
|
||||
@@ -0,0 +1,32 @@
|
||||
import { Bot, Plus } from 'lucide-react'
|
||||
import type { FC } from 'react'
|
||||
import { Button } from '@/components/ui/button'
|
||||
|
||||
/** Props for `AgentsEmptyState`. */
interface AgentsEmptyStateProps {
  /** Invoked when the "Create your first agent" button is clicked. */
  onCreateAgent: () => void
}

/**
 * Placeholder card for an empty agent list: icon, short explanation and
 * a button that calls `onCreateAgent`.
 */
export const AgentsEmptyState: FC<AgentsEmptyStateProps> = ({
  onCreateAgent,
}) => (
  <div className="rounded-xl border border-border border-dashed bg-card/50 p-12 text-center">
    <div className="mx-auto mb-4 flex h-12 w-12 items-center justify-center rounded-xl bg-[var(--accent-orange)]/10">
      <Bot className="h-6 w-6 text-[var(--accent-orange)]" />
    </div>
    <h3 className="mb-1 font-semibold">No agents yet</h3>
    <p className="mx-auto mb-4 max-w-sm text-muted-foreground text-sm">
      Spin up an OpenClaw, Claude Code, or Codex agent to chat with, schedule,
      or run in the background.
    </p>
    <Button
      variant="outline"
      className="border-[var(--accent-orange)] bg-[var(--accent-orange)]/10 text-[var(--accent-orange)] hover:bg-[var(--accent-orange)]/20 hover:text-[var(--accent-orange)]"
      onClick={onCreateAgent}
    >
      <Plus className="mr-1.5 h-4 w-4" />
      Create your first agent
    </Button>
  </div>
)
|
||||
@@ -0,0 +1,41 @@
|
||||
import { Bot, Plus } from 'lucide-react'
|
||||
import type { FC } from 'react'
|
||||
import { Button } from '@/components/ui/button'
|
||||
|
||||
/** Props for `AgentsHeader`. */
interface AgentsHeaderProps {
  /** Invoked when the "New Agent" button is clicked. */
  onCreateAgent: () => void
}

/**
 * Mirrors the visual shape of `SoulHeader` and `ScheduledTasksHeader`
 * so the page reads as part of the same family. Loose lifecycle
 * controls that used to sit next to the title moved into
 * `GatewayStatusBar` — they're OpenClaw-specific and don't apply to
 * Claude/Codex agents.
 */
export const AgentsHeader: FC<AgentsHeaderProps> = ({ onCreateAgent }) => (
  <div className="rounded-xl border border-border bg-card p-6 shadow-sm transition-all hover:shadow-md">
    <div className="flex items-start gap-4">
      <div className="flex h-12 w-12 shrink-0 items-center justify-center rounded-xl bg-[var(--accent-orange)]/10">
        <Bot className="h-6 w-6 text-[var(--accent-orange)]" />
      </div>
      <div className="flex-1">
        <h2 className="mb-1 font-semibold text-xl">Agents</h2>
        <p className="text-muted-foreground text-sm">
          OpenClaw, Claude Code, and Codex agents — chat, schedule, and run
          them in the background.
        </p>
      </div>
      <Button
        variant="outline"
        className="border-[var(--accent-orange)] bg-[var(--accent-orange)]/10 text-[var(--accent-orange)] hover:bg-[var(--accent-orange)]/20 hover:text-[var(--accent-orange)]"
        onClick={onCreateAgent}
      >
        <Plus className="mr-1.5 h-4 w-4" />
        New Agent
      </Button>
    </div>
  </div>
)
|
||||
@@ -3,8 +3,9 @@ import { type FC, useMemo, useState } from 'react'
|
||||
import { useNavigate } from 'react-router'
|
||||
import { useLlmProviders } from '@/lib/llm-providers/useLlmProviders'
|
||||
import { AgentList } from './AgentList'
|
||||
import { AgentsHeader } from './AgentsHeader'
|
||||
import { AgentTerminal } from './AgentTerminal'
|
||||
import type { HarnessAgentAdapter } from './agent-harness-types'
|
||||
import type { HarnessAgent, HarnessAgentAdapter } from './agent-harness-types'
|
||||
import { createAgentPageActions } from './agents-page-actions'
|
||||
import {
|
||||
useDefaultAgentName,
|
||||
@@ -29,9 +30,9 @@ import {
|
||||
toHarnessListItem,
|
||||
toOpenClawListItem,
|
||||
} from './agents-page-utils'
|
||||
import { GatewayStatusBar } from './GatewayStatusBar'
|
||||
import { NewAgentDialog } from './NewAgentDialog'
|
||||
import {
|
||||
AgentsPageHeader,
|
||||
ControlPlaneAlert,
|
||||
GatewayStateCards,
|
||||
InlineErrorAlert,
|
||||
@@ -44,42 +45,35 @@ import {
|
||||
useDeleteHarnessAgent,
|
||||
useHarnessAgents,
|
||||
} from './useAgents'
|
||||
import {
|
||||
useOpenClawAgents,
|
||||
useOpenClawMutations,
|
||||
useOpenClawStatus,
|
||||
} from './useOpenClaw'
|
||||
import { useOpenClawAgents, useOpenClawMutations } from './useOpenClaw'
|
||||
|
||||
export const AgentsPage: FC = () => {
|
||||
const navigate = useNavigate()
|
||||
const {
|
||||
status,
|
||||
loading: statusLoading,
|
||||
error: statusError,
|
||||
refetch: refetchStatus,
|
||||
} = useOpenClawStatus()
|
||||
const { providers, defaultProviderId } = useLlmProviders()
|
||||
const {
|
||||
adapters,
|
||||
loading: adaptersLoading,
|
||||
error: adaptersError,
|
||||
refetch: refetchAdapters,
|
||||
} = useAgentAdapters()
|
||||
|
||||
// The harness listing now carries the gateway lifecycle snapshot
|
||||
// alongside the agents — one polling source for everything the
|
||||
// agents page renders. The legacy `/claw/status` poll is dead from
|
||||
// this surface; the chat-panel layout still uses it for now.
|
||||
const {
|
||||
harnessAgents,
|
||||
gateway: status,
|
||||
loading: harnessAgentsLoading,
|
||||
error: harnessAgentsError,
|
||||
} = useHarnessAgents()
|
||||
|
||||
const openClawAgentsEnabled =
|
||||
status?.status === 'running' && status.controlPlaneStatus === 'connected'
|
||||
const {
|
||||
agents: openClawAgents,
|
||||
loading: openClawAgentsLoading,
|
||||
error: openClawAgentsError,
|
||||
refetch: refetchOpenClawAgents,
|
||||
} = useOpenClawAgents(openClawAgentsEnabled)
|
||||
const {
|
||||
harnessAgents,
|
||||
loading: harnessAgentsLoading,
|
||||
error: harnessAgentsError,
|
||||
refetch: refetchHarnessAgents,
|
||||
} = useHarnessAgents()
|
||||
const createHarnessAgent = useCreateHarnessAgent()
|
||||
const deleteHarnessAgent = useDeleteHarnessAgent()
|
||||
const {
|
||||
@@ -87,7 +81,6 @@ export const AgentsPage: FC = () => {
|
||||
createAgent: createOpenClawAgent,
|
||||
deleteAgent: deleteOpenClawAgent,
|
||||
startOpenClaw,
|
||||
stopOpenClaw,
|
||||
restartOpenClaw,
|
||||
reconnectOpenClaw,
|
||||
actionInProgress,
|
||||
@@ -158,42 +151,68 @@ export const AgentsPage: FC = () => {
|
||||
openClawAgentsEnabled,
|
||||
openClawAgents,
|
||||
)
|
||||
const agentListItems = useMemo(
|
||||
() => [
|
||||
...visibleOpenClawAgents.map((agent) =>
|
||||
const agentListItems = useMemo(() => {
|
||||
// Dual-created OpenClaw agents (and the backfilled `main`/orphans
|
||||
// post Step 9) live in both `/claw/agents` and `/agents` under the
|
||||
// same id. Prefer the harness entry — it carries adapter/model/
|
||||
// reasoning/lastUsedAt/status that the chat path actually uses —
|
||||
// and drop the legacy duplicate so the rail doesn't show every
|
||||
// OpenClaw agent twice.
|
||||
const harnessIds = new Set(harnessAgents.map((agent) => agent.id))
|
||||
const dedupedOpenClawAgents = visibleOpenClawAgents.filter(
|
||||
(agent) => !harnessIds.has(agent.agentId),
|
||||
)
|
||||
return [
|
||||
...dedupedOpenClawAgents.map((agent) =>
|
||||
toOpenClawListItem(agent, openClawManageable),
|
||||
),
|
||||
...harnessAgents.map(toHarnessListItem),
|
||||
],
|
||||
[harnessAgents, openClawManageable, visibleOpenClawAgents],
|
||||
)
|
||||
]
|
||||
}, [harnessAgents, openClawManageable, visibleOpenClawAgents])
|
||||
// Lookup map so AgentList can render adapter chips, reasoning, etc.
|
||||
// Computed up here to keep all hooks above the early returns below.
|
||||
const harnessAgentLookup = useMemo(() => {
|
||||
const map = new Map<string, HarnessAgent>()
|
||||
for (const agent of harnessAgents) map.set(agent.id, agent)
|
||||
return map
|
||||
}, [harnessAgents])
|
||||
// Activity map keyed by agentId. Sourced from the harness listing's
|
||||
// server-side enrichment (`status` + `lastUsedAt`). Legacy gateway
|
||||
// agents that don't have a harness record yet (rare post-backfill)
|
||||
// simply miss from the map and render with the default `unknown`
|
||||
// dot until reconciliation picks them up.
|
||||
const agentActivity = useMemo(() => {
|
||||
const map: Record<
|
||||
string,
|
||||
{
|
||||
status: 'working' | 'idle' | 'asleep' | 'error'
|
||||
lastUsedAt: number | null
|
||||
}
|
||||
> = {}
|
||||
for (const agent of harnessAgents) {
|
||||
if (!agent.status) continue
|
||||
map[agent.id] = {
|
||||
status: agent.status,
|
||||
lastUsedAt: agent.lastUsedAt ?? null,
|
||||
}
|
||||
}
|
||||
return map
|
||||
}, [harnessAgents])
|
||||
const inlineError = getInlineError({
|
||||
lifecyclePending,
|
||||
pageError,
|
||||
statusError,
|
||||
openClawAgentsError,
|
||||
adaptersError,
|
||||
harnessAgentsError,
|
||||
})
|
||||
const agentsLoading = getAgentsLoading({
|
||||
statusLoading,
|
||||
adaptersLoading,
|
||||
harnessAgentsLoading,
|
||||
openClawAgentsEnabled,
|
||||
openClawAgentsLoading,
|
||||
})
|
||||
const creatingAgent = creatingOpenClawAgent || createHarnessAgent.isPending
|
||||
const deletingAgent = deletingOpenClawAgent || deleteHarnessAgent.isPending
|
||||
|
||||
const refreshAll = async () => {
|
||||
await Promise.all([
|
||||
refetchStatus(),
|
||||
refetchAdapters(),
|
||||
refetchHarnessAgents(),
|
||||
openClawAgentsEnabled ? refetchOpenClawAgents() : Promise.resolve(),
|
||||
])
|
||||
}
|
||||
|
||||
const handleHarnessAdapterChange = (adapter: HarnessAgentAdapter) => {
|
||||
const descriptor = adapters.find((entry) => entry.id === adapter)
|
||||
setHarnessAdapterId(adapter)
|
||||
@@ -239,7 +258,9 @@ export const AgentsPage: FC = () => {
|
||||
)
|
||||
}
|
||||
|
||||
if (statusLoading && !status) {
|
||||
// First-paint loader: until the harness listing has resolved at
|
||||
// least once we don't know which adapters / agents to render.
|
||||
if (harnessAgentsLoading && !status) {
|
||||
return (
|
||||
<div className="flex items-center justify-center py-20">
|
||||
<Loader2 className="size-6 animate-spin text-muted-foreground" />
|
||||
@@ -255,27 +276,18 @@ export const AgentsPage: FC = () => {
|
||||
const recoveryDetail = status ? getRecoveryDetail(status) : null
|
||||
const controlPlaneCopy = getControlPlaneCopyForStatus(status)
|
||||
|
||||
// Bar only makes sense when the gateway is meaningfully alive AND
|
||||
// there's at least one OpenClaw agent in the merged list. Hide it
|
||||
// for Claude/Codex-only setups so the page stays uncluttered.
|
||||
const showGatewayStatusBar =
|
||||
status?.status === 'running' &&
|
||||
(visibleOpenClawAgents.length > 0 ||
|
||||
harnessAgents.some((agent) => agent.adapter === 'openclaw'))
|
||||
|
||||
return (
|
||||
<div className="min-h-full bg-background px-6 py-8">
|
||||
<div className="mx-auto flex w-full max-w-5xl flex-col gap-6">
|
||||
<AgentsPageHeader
|
||||
actionInProgress={actionInProgress}
|
||||
controlPlaneBusy={gatewayUiState.controlPlaneBusy}
|
||||
reconnecting={reconnecting}
|
||||
status={status}
|
||||
onCreateAgent={() => setCreateOpen(true)}
|
||||
onOpenTerminal={() => setShowTerminal(true)}
|
||||
onReconnect={() => {
|
||||
void runWithPageErrorHandling(reconnectOpenClaw)
|
||||
}}
|
||||
onRefresh={() => void refreshAll()}
|
||||
onRestart={() => {
|
||||
void runWithPageErrorHandling(restartOpenClaw)
|
||||
}}
|
||||
onStop={() => {
|
||||
void runWithPageErrorHandling(stopOpenClaw)
|
||||
}}
|
||||
/>
|
||||
<div className="fade-in slide-in-from-bottom-5 mx-auto flex w-full max-w-5xl animate-in flex-col gap-6 duration-500">
|
||||
<AgentsHeader onCreateAgent={() => setCreateOpen(true)} />
|
||||
|
||||
{lifecycleBanner ? <LifecycleAlert message={lifecycleBanner} /> : null}
|
||||
|
||||
@@ -315,11 +327,23 @@ export const AgentsPage: FC = () => {
|
||||
}}
|
||||
/>
|
||||
|
||||
{showGatewayStatusBar ? (
|
||||
<GatewayStatusBar
|
||||
status={status}
|
||||
actionInProgress={actionInProgress}
|
||||
onOpenTerminal={() => setShowTerminal(true)}
|
||||
onRestart={() => {
|
||||
void runWithPageErrorHandling(restartOpenClaw)
|
||||
}}
|
||||
/>
|
||||
) : null}
|
||||
|
||||
<AgentList
|
||||
agents={agentListItems}
|
||||
activity={agentActivity}
|
||||
harnessAgentLookup={harnessAgentLookup}
|
||||
loading={agentsLoading}
|
||||
deletingAgentKey={deletingAgent ? deletingAgentKey : null}
|
||||
onChatAgent={(agent) => navigate(`/agents/${agent.agentId}`)}
|
||||
onCreateAgent={() => setCreateOpen(true)}
|
||||
onDeleteAgent={(agent) => {
|
||||
void handleDelete(agent)
|
||||
|
||||
@@ -0,0 +1,206 @@
|
||||
import { Loader2, RotateCcw, Terminal } from 'lucide-react'
|
||||
import type { FC, ReactNode } from 'react'
|
||||
import { Badge } from '@/components/ui/badge'
|
||||
import { Button } from '@/components/ui/button'
|
||||
import { Separator } from '@/components/ui/separator'
|
||||
import {
|
||||
Tooltip,
|
||||
TooltipContent,
|
||||
TooltipProvider,
|
||||
TooltipTrigger,
|
||||
} from '@/components/ui/tooltip'
|
||||
import { cn } from '@/lib/utils'
|
||||
import type { OpenClawStatus } from './useOpenClaw'
|
||||
|
||||
interface GatewayStatusBarProps {
|
||||
status: OpenClawStatus | null
|
||||
/** Disabled while a gateway lifecycle mutation is mid-flight. */
|
||||
actionInProgress: boolean
|
||||
onOpenTerminal: () => void
|
||||
onRestart: () => void
|
||||
}
|
||||
|
||||
/**
 * One-line status strip for the OpenClaw gateway.
 *
 * Shows two lifecycle pills (runtime status and control-plane status),
 * a Terminal button and a Restart Gateway button. Renders nothing when
 * `status` is missing; the caller decides whether the bar is relevant
 * for the current agent list.
 *
 * The `status` value is whatever the caller passes in — the agents page
 * feeds it from the `gateway` field of the harness listing rather than
 * from a dedicated `/claw/status` poll.
 */
export const GatewayStatusBar: FC<GatewayStatusBarProps> = (props) => {
  const { status, actionInProgress, onOpenTerminal, onRestart } = props

  if (!status) return null

  // Both pills share identical markup; only the PillKind differs.
  const renderPill = (pill: PillKind) => (
    <Badge variant={pill.variant} className={cn('gap-1.5', pill.className)}>
      <span
        className={cn('inline-block h-1.5 w-1.5 rounded-full', pill.dot)}
      />
      {pill.label}
    </Badge>
  )

  // The restart button swaps its icon for a spinner while a lifecycle
  // mutation is in flight (the button is disabled at the same time).
  const restartIcon = actionInProgress ? (
    <Loader2 className="mr-1.5 h-3.5 w-3.5 animate-spin" />
  ) : (
    <RotateCcw className="mr-1.5 h-3.5 w-3.5" />
  )

  return (
    <div className="rounded-xl border border-border bg-card px-4 py-3 shadow-sm">
      <div className="flex items-center gap-3 text-sm">
        <span className="font-medium text-muted-foreground">
          OpenClaw gateway
        </span>
        {renderPill(pillForRuntimeStatus(status.status))}
        {renderPill(pillForControlPlane(status.controlPlaneStatus))}
        <Separator orientation="vertical" className="h-4" />
        <WithTooltip label="Open a shell into the OpenClaw gateway container for raw CLI access (config edits, session inspection).">
          <Button variant="ghost" size="sm" onClick={onOpenTerminal}>
            <Terminal className="mr-1.5 h-3.5 w-3.5" />
            Terminal
          </Button>
        </WithTooltip>
        <WithTooltip label="Restart the OpenClaw gateway. Useful when the gateway is stuck or after editing provider config.">
          <Button
            variant="ghost"
            size="sm"
            onClick={onRestart}
            disabled={actionInProgress}
            className="ml-auto"
          >
            {restartIcon}
            Restart Gateway
          </Button>
        </WithTooltip>
      </div>
    </div>
  )
}
|
||||
|
||||
/**
 * Wraps a single child in a tooltip that opens below it after a 250ms
 * delay and shows `label` as small, width-capped text.
 */
const WithTooltip: FC<{ label: string; children: ReactNode }> = (props) => {
  const { label, children } = props
  return (
    <TooltipProvider delayDuration={250}>
      <Tooltip>
        <TooltipTrigger asChild>{children}</TooltipTrigger>
        <TooltipContent side="bottom" className="max-w-xs text-xs">
          {label}
        </TooltipContent>
      </Tooltip>
    </TooltipProvider>
  )
}
|
||||
|
||||
type PillKind = {
|
||||
variant: 'default' | 'secondary' | 'outline' | 'destructive'
|
||||
label: string
|
||||
dot: string
|
||||
className?: string
|
||||
}
|
||||
|
||||
/**
 * Maps the gateway runtime status to the badge variant, label, dot
 * colour and optional extra classes used by the status bar.
 *
 * Any value other than `running`, `starting`, `stopped` or `error`
 * renders as a muted "Unknown" pill.
 */
function pillForRuntimeStatus(status: OpenClawStatus['status']): PillKind {
  if (status === 'running') {
    return {
      variant: 'secondary',
      label: 'Running',
      dot: 'bg-emerald-500',
      className: 'bg-emerald-50 text-emerald-900 hover:bg-emerald-50',
    }
  }

  if (status === 'starting') {
    return {
      variant: 'secondary',
      label: 'Starting',
      dot: 'bg-amber-500 animate-pulse',
      className: 'bg-amber-50 text-amber-900 hover:bg-amber-50',
    }
  }

  if (status === 'error') {
    return {
      variant: 'destructive',
      label: 'Error',
      dot: 'bg-destructive-foreground',
    }
  }

  // `stopped` and every unrecognised value share the muted outline look;
  // only the label differs.
  const mutedLabel = status === 'stopped' ? 'Stopped' : 'Unknown'
  return {
    variant: 'outline',
    label: mutedLabel,
    dot: 'bg-muted-foreground/40',
  }
}
|
||||
|
||||
/**
 * Maps the control-plane connection status to the badge variant, label,
 * dot colour and optional extra classes used by the status bar.
 *
 * `connecting`, `reconnecting` and `recovering` all render as a pulsing
 * amber pill with their own label. Any value not handled explicitly
 * renders as a muted "Disconnected" pill.
 */
function pillForControlPlane(
  status: OpenClawStatus['controlPlaneStatus'],
): PillKind {
  // Shared look for every in-progress connection state.
  const transitional = (label: string): PillKind => ({
    variant: 'secondary',
    label,
    dot: 'bg-amber-500 animate-pulse',
    className: 'bg-amber-50 text-amber-900 hover:bg-amber-50',
  })

  if (status === 'connected') {
    return {
      variant: 'secondary',
      label: 'Control plane connected',
      dot: 'bg-emerald-500',
      className: 'bg-emerald-50 text-emerald-900 hover:bg-emerald-50',
    }
  }
  if (status === 'connecting') return transitional('Connecting')
  if (status === 'reconnecting') return transitional('Reconnecting')
  if (status === 'recovering') return transitional('Recovering')
  if (status === 'failed') {
    return {
      variant: 'destructive',
      label: 'Needs attention',
      dot: 'bg-destructive-foreground',
    }
  }

  return {
    variant: 'outline',
    label: 'Disconnected',
    dot: 'bg-muted-foreground/40',
  }
}
|
||||
@@ -0,0 +1,83 @@
|
||||
import type { FC } from 'react'
|
||||
import {
|
||||
Tooltip,
|
||||
TooltipContent,
|
||||
TooltipProvider,
|
||||
TooltipTrigger,
|
||||
} from '@/components/ui/tooltip'
|
||||
import { cn } from '@/lib/utils'
|
||||
|
||||
export type AgentLiveness = 'working' | 'idle' | 'asleep' | 'error' | 'unknown'
|
||||
|
||||
interface LivenessDotProps {
|
||||
status: AgentLiveness
|
||||
/**
|
||||
* Optional human-friendly secondary line, e.g. "Idle for 4 min" or
|
||||
* "Asleep — no activity for 22 min". When absent the tooltip just
|
||||
* reads the status label.
|
||||
*/
|
||||
detail?: string
|
||||
className?: string
|
||||
}
|
||||
|
||||
const VARIANT: Record<
|
||||
AgentLiveness,
|
||||
{ dot: string; ring: string; label: string }
|
||||
> = {
|
||||
working: {
|
||||
// Animated amber pulse + soft halo so the eye catches an active
|
||||
// agent in a long list without the dot screaming for attention.
|
||||
dot: 'bg-amber-500 animate-pulse',
|
||||
ring: 'ring-2 ring-amber-200',
|
||||
label: 'Working on a turn',
|
||||
},
|
||||
idle: {
|
||||
dot: 'bg-emerald-500',
|
||||
ring: 'ring-2 ring-emerald-100',
|
||||
label: 'Idle',
|
||||
},
|
||||
asleep: {
|
||||
dot: 'bg-muted-foreground/40',
|
||||
ring: 'ring-2 ring-muted',
|
||||
label: 'Asleep',
|
||||
},
|
||||
error: {
|
||||
dot: 'bg-destructive',
|
||||
ring: 'ring-2 ring-destructive/30',
|
||||
label: 'Attention',
|
||||
},
|
||||
unknown: {
|
||||
dot: 'bg-muted-foreground/30',
|
||||
ring: 'ring-2 ring-muted',
|
||||
label: 'Status unknown',
|
||||
},
|
||||
}
|
||||
|
||||
/**
 * Small coloured dot that conveys an agent's liveness, with a tooltip.
 *
 * @param status    Liveness state; selects the dot colour, ring and
 *                  default label from `VARIANT`.
 * @param detail    Optional text that replaces the default label in both
 *                  the tooltip and the accessible name.
 * @param className Extra classes merged onto the dot element.
 */
export const LivenessDot: FC<LivenessDotProps> = ({
  status,
  detail,
  className,
}) => {
  // `status` may originate from unvalidated server data. An unrecognised
  // value would index to `undefined` and crash on `variant.dot`, so fall
  // back to the `unknown` look.
  const variant = VARIANT[status] ?? VARIANT.unknown
  const description = detail ?? variant.label

  return (
    <TooltipProvider delayDuration={150}>
      <Tooltip>
        <TooltipTrigger asChild>
          <span
            role="img"
            aria-label={description}
            className={cn(
              'inline-block h-3 w-3 rounded-full',
              variant.dot,
              variant.ring,
              className,
            )}
          />
        </TooltipTrigger>
        <TooltipContent side="right" className="text-xs">
          {description}
        </TooltipContent>
      </Tooltip>
    </TooltipProvider>
  )
}
|
||||
@@ -154,7 +154,6 @@ export const NewAgentDialog: FC<NewAgentDialogProps> = ({
|
||||
<SelectValue />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
<SelectItem value="openclaw">OpenClaw</SelectItem>
|
||||
{adapters.map((adapter) => (
|
||||
<SelectItem key={adapter.id} value={adapter.id}>
|
||||
{adapter.name}
|
||||
|
||||
@@ -0,0 +1,84 @@
|
||||
import type { AgentListItem } from './agents-page-types'
|
||||
|
||||
/**
|
||||
* Display rules for the redesigned agent rows. Pure helpers — no React,
|
||||
* no API calls — so they're trivial to unit-test and the row card stays
|
||||
* focused on layout.
|
||||
*/
|
||||
|
||||
/** Matches a bare canonical UUID (case-insensitive). */
const UUID_PATTERN =
  /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i

/** Matches a canonical UUID carrying an `oc-` prefix (case-insensitive). */
const OC_UUID_PATTERN =
  /^oc-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i

/**
 * Picks the label shown for an agent.
 *
 * A non-empty `name` (after trimming) that differs from the agent id is
 * returned as-is. Otherwise the id is used, shortened when it is a UUID:
 * `oc-` + first 8 hex chars for prefixed ids, the first 8 hex chars for
 * bare UUIDs, and the full id for anything else.
 */
export function displayName(agent: AgentListItem): string {
  const agentId = agent.agentId
  const trimmedName = agent.name?.trim()

  if (trimmedName && trimmedName !== agentId) return trimmedName

  // 11 = "oc-" plus the first 8-character UUID group.
  if (OC_UUID_PATTERN.test(agentId)) return agentId.slice(0, 11)
  return UUID_PATTERN.test(agentId) ? agentId.slice(0, 8) : agentId
}
|
||||
|
||||
/**
 * Whether the delete action should be offered for this agent.
 *
 * The agent with id `main` is never deletable from the UI, which keeps
 * users from hitting an opaque server rejection; every other agent
 * defers to its own `canDelete` flag.
 */
export function canDelete(agent: AgentListItem): boolean {
  return agent.agentId !== 'main' && agent.canDelete
}
|
||||
|
||||
/**
 * Whether the rename action should be offered for this agent.
 *
 * Always `false` for now: an agent's name is set at creation time and
 * there is no edit path yet. Intended to be wired to a future
 * `PATCH /agents/:id` endpoint.
 */
export function canRename(_agent: AgentListItem): boolean {
  const renameSupported = false
  return renameSupported
}
|
||||
|
||||
/**
 * Returns the text for an agent row's detail line, or `null` when there
 * is nothing worth showing.
 *
 * An empty/missing `detail` yields `null`, as does the synthetic
 * `<adapter>:main` marker (`claude:main`, `codex:main`, `openclaw:main`),
 * which carries no useful information. Any other value is returned
 * unchanged.
 */
export function workspaceLabel(agent: AgentListItem): string | null {
  const detail = agent.detail
  const isUninformative =
    !detail || /^(claude|codex|openclaw):main$/.test(detail)
  return isUninformative ? null : detail
}
|
||||
|
||||
// Durations in milliseconds.
const ONE_MINUTE = 60_000
const ONE_HOUR = ONE_MINUTE * 60
const ONE_DAY = ONE_HOUR * 24

/**
 * Formats a past timestamp as a short relative label.
 *
 * @param epochMs Wall-clock milliseconds, or `null`.
 * @returns `'never'` for `null` or non-finite input; otherwise
 *   `'just now'` (under a minute, including future timestamps),
 *   `'N min ago'`, `'N hr ago'`, or `'1 day ago'` / `'N days ago'`,
 *   with N rounded down.
 */
export function formatRelativeTime(epochMs: number | null): string {
  if (epochMs === null) return 'never'
  if (!Number.isFinite(epochMs)) return 'never'

  // Clamp so timestamps slightly in the future read as "just now".
  const elapsed = Math.max(0, Date.now() - epochMs)

  if (elapsed >= ONE_DAY) {
    const days = Math.floor(elapsed / ONE_DAY)
    return days === 1 ? '1 day ago' : `${days} days ago`
  }
  if (elapsed >= ONE_HOUR) {
    return `${Math.floor(elapsed / ONE_HOUR)} hr ago`
  }
  if (elapsed >= ONE_MINUTE) {
    return `${Math.floor(elapsed / ONE_MINUTE)} min ago`
  }
  return 'just now'
}
|
||||
@@ -1,6 +1,6 @@
|
||||
import type { AgentEntry } from './useOpenClaw'
|
||||
|
||||
export type HarnessAgentAdapter = 'claude' | 'codex'
|
||||
export type HarnessAgentAdapter = 'claude' | 'codex' | 'openclaw'
|
||||
|
||||
export type AgentHarnessStreamEvent =
|
||||
| {
|
||||
@@ -33,6 +33,8 @@ export type AgentHarnessStreamEvent =
|
||||
code?: string
|
||||
}
|
||||
|
||||
export type HarnessAgentLiveness = 'working' | 'idle' | 'asleep' | 'error'
|
||||
|
||||
export interface HarnessAgent {
|
||||
id: string
|
||||
name: string
|
||||
@@ -43,6 +45,17 @@ export interface HarnessAgent {
|
||||
sessionKey: string
|
||||
createdAt: number
|
||||
updatedAt: number
|
||||
/**
|
||||
* Server-derived liveness state. When the listing endpoint hasn't
|
||||
* been enriched yet (older deployments) this is undefined and the UI
|
||||
* falls back to `unknown`.
|
||||
*/
|
||||
status?: HarnessAgentLiveness
|
||||
/**
|
||||
* Wall-clock ms of the last persisted turn. `null` for never-used
|
||||
* agents. Drives the recency sort and the "Last used X min ago" copy.
|
||||
*/
|
||||
lastUsedAt?: number | null
|
||||
}
|
||||
|
||||
export interface HarnessAdapterDescriptor {
|
||||
@@ -62,19 +75,36 @@ export interface CreateHarnessAgentInput {
|
||||
reasoningEffort?: string
|
||||
}
|
||||
|
||||
export interface HarnessTranscriptEntry {
|
||||
export interface HarnessHistoryReasoning {
|
||||
text: string
|
||||
durationMs?: number
|
||||
}
|
||||
|
||||
export interface HarnessHistoryToolCall {
|
||||
toolCallId?: string
|
||||
toolName: string
|
||||
status: 'pending' | 'running' | 'completed' | 'failed'
|
||||
input?: unknown
|
||||
output?: unknown
|
||||
error?: string
|
||||
durationMs?: number
|
||||
}
|
||||
|
||||
export interface HarnessHistoryEntry {
|
||||
id: string
|
||||
agentId: string
|
||||
sessionId: 'main'
|
||||
role: 'user' | 'assistant'
|
||||
text: string
|
||||
createdAt: number
|
||||
reasoning?: HarnessHistoryReasoning
|
||||
toolCalls?: HarnessHistoryToolCall[]
|
||||
}
|
||||
|
||||
export interface HarnessAgentHistoryPage {
|
||||
agentId: string
|
||||
sessionId: 'main'
|
||||
items: HarnessTranscriptEntry[]
|
||||
items: HarnessHistoryEntry[]
|
||||
}
|
||||
|
||||
export function mapHarnessAgentToEntry(agent: HarnessAgent): AgentEntry {
|
||||
|
||||
@@ -138,24 +138,20 @@ export function getVisibleOpenClawAgents(
|
||||
}
|
||||
|
||||
export function getAgentsLoading(input: {
|
||||
statusLoading: boolean
|
||||
adaptersLoading: boolean
|
||||
harnessAgentsLoading: boolean
|
||||
openClawAgentsEnabled: boolean
|
||||
openClawAgentsLoading: boolean
|
||||
}): boolean {
|
||||
return (
|
||||
input.statusLoading ||
|
||||
input.adaptersLoading ||
|
||||
input.harnessAgentsLoading ||
|
||||
(input.openClawAgentsEnabled && input.openClawAgentsLoading)
|
||||
input.openClawAgentsLoading
|
||||
)
|
||||
}
|
||||
|
||||
export function getInlineError(input: {
|
||||
lifecyclePending: boolean
|
||||
pageError: string | null
|
||||
statusError: Error | null
|
||||
openClawAgentsError: Error | null
|
||||
adaptersError: Error | null
|
||||
harnessAgentsError: Error | null
|
||||
@@ -163,7 +159,6 @@ export function getInlineError(input: {
|
||||
if (input.lifecyclePending) return null
|
||||
return (
|
||||
input.pageError ??
|
||||
input.statusError?.message ??
|
||||
input.openClawAgentsError?.message ??
|
||||
input.adaptersError?.message ??
|
||||
input.harnessAgentsError?.message ??
|
||||
|
||||
@@ -10,6 +10,17 @@ import {
|
||||
type HarnessAgentHistoryPage,
|
||||
mapHarnessAgentToEntry,
|
||||
} from './agent-harness-types'
|
||||
import type { OpenClawStatus } from './useOpenClaw'
|
||||
|
||||
/**
|
||||
* Combined response shape of `GET /agents`. The page polls this once
|
||||
* and consumes both fields, replacing the dedicated `/claw/status`
|
||||
* poll the previous design carried.
|
||||
*/
|
||||
interface HarnessAgentsResponse {
|
||||
agents: HarnessAgent[]
|
||||
gateway: OpenClawStatus | null
|
||||
}
|
||||
|
||||
export type { AgentHarnessStreamEvent }
|
||||
|
||||
@@ -69,21 +80,31 @@ export function useHarnessAgents(enabled = true) {
|
||||
error: urlError,
|
||||
} = useAgentServerUrl()
|
||||
|
||||
const query = useQuery<HarnessAgent[], Error>({
|
||||
const query = useQuery<HarnessAgentsResponse, Error>({
|
||||
queryKey: [AGENT_QUERY_KEYS.agents, baseUrl],
|
||||
queryFn: async () => {
|
||||
const data = await agentsFetch<{ agents: HarnessAgent[] }>(
|
||||
const data = await agentsFetch<HarnessAgentsResponse>(
|
||||
baseUrl as string,
|
||||
'/',
|
||||
)
|
||||
return data.agents ?? []
|
||||
return {
|
||||
agents: data.agents ?? [],
|
||||
gateway: data.gateway ?? null,
|
||||
}
|
||||
},
|
||||
enabled: Boolean(baseUrl) && !urlLoading && enabled,
|
||||
// Poll every 5s so the per-agent liveness state (working / idle /
|
||||
// asleep / error) and last-used timestamps stay fresh without a
|
||||
// websocket. `refetchIntervalInBackground: false` lets a hidden
|
||||
// tab go quiet — react-query's default, made explicit.
|
||||
refetchInterval: 5_000,
|
||||
refetchIntervalInBackground: false,
|
||||
})
|
||||
|
||||
return {
|
||||
agents: (query.data ?? []).map(mapHarnessAgentToEntry),
|
||||
harnessAgents: query.data ?? [],
|
||||
agents: (query.data?.agents ?? []).map(mapHarnessAgentToEntry),
|
||||
harnessAgents: query.data?.agents ?? [],
|
||||
gateway: query.data?.gateway ?? null,
|
||||
loading: query.isLoading || urlLoading,
|
||||
error: query.error ?? urlError,
|
||||
refetch: query.refetch,
|
||||
@@ -141,16 +162,95 @@ export async function chatWithHarnessAgent(
|
||||
agentId: string,
|
||||
message: string,
|
||||
signal?: AbortSignal,
|
||||
attachments?: ReadonlyArray<unknown>,
|
||||
): Promise<Response> {
|
||||
const baseUrl = await getAgentServerUrl()
|
||||
return fetch(`${baseUrl}/agents/${encodeURIComponent(agentId)}/chat`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ message }),
|
||||
body: JSON.stringify({
|
||||
message,
|
||||
...(attachments && attachments.length > 0 ? { attachments } : {}),
|
||||
}),
|
||||
signal,
|
||||
})
|
||||
}
|
||||
|
||||
/**
 * Opens a stream on an agent's turn via
 * `GET /agents/:id/chat/stream`.
 *
 * @param agentId Agent whose turn to attach to (URL-encoded here).
 * @param options.turnId  When set, sent as the `turnId` query parameter.
 * @param options.lastSeq When a number, sent as the `Last-Event-ID`
 *   header so the server can resume after that sequence number.
 * @param options.signal  Abort signal forwarded to `fetch`.
 * @returns The raw `fetch` response; the caller consumes the stream.
 */
export async function attachToHarnessTurn(
  agentId: string,
  options: { turnId?: string; lastSeq?: number; signal?: AbortSignal } = {},
): Promise<Response> {
  const baseUrl = await getAgentServerUrl()
  const { turnId, lastSeq, signal } = options

  const endpoint = new URL(
    `${baseUrl}/agents/${encodeURIComponent(agentId)}/chat/stream`,
  )
  if (turnId) endpoint.searchParams.set('turnId', turnId)

  // `typeof` check rather than truthiness so a lastSeq of 0 is still sent.
  const headers: Record<string, string> =
    typeof lastSeq === 'number' ? { 'Last-Event-ID': String(lastSeq) } : {}

  return fetch(endpoint.toString(), { signal, headers })
}
|
||||
|
||||
export interface HarnessActiveTurnInfo {
|
||||
turnId: string
|
||||
agentId: string
|
||||
sessionId: 'main'
|
||||
status: 'running' | 'done' | 'error' | 'cancelled'
|
||||
lastSeq: number
|
||||
startedAt: number
|
||||
endedAt?: number
|
||||
}
|
||||
|
||||
/**
 * Looks up an agent's active turn via `GET /agents/:id/chat/active`.
 * Used on chat mount so the UI can reattach to a running turn instead of
 * starting a new one.
 *
 * @param agentId Agent to query (URL-encoded here).
 * @returns The active turn info, or `null` when the request is not OK or
 *   the response carries no `active` entry.
 */
export async function fetchActiveHarnessTurn(
  agentId: string,
): Promise<HarnessActiveTurnInfo | null> {
  const baseUrl = await getAgentServerUrl()
  const response = await fetch(
    `${baseUrl}/agents/${encodeURIComponent(agentId)}/chat/active`,
  )
  if (!response.ok) return null

  // The body is unvalidated JSON: it may be `null` or omit `active`.
  // Normalise both to `null` so the declared return type holds at runtime.
  const body = (await response.json()) as {
    active?: HarnessActiveTurnInfo | null
  } | null
  return body?.active ?? null
}
|
||||
|
||||
/**
 * Asks the server to cancel an agent's turn via
 * `POST /agents/:id/chat/cancel`. Backs the Stop button: aborting the
 * client fetch alone only detaches this subscriber and would leave the
 * turn running.
 *
 * @param agentId Agent whose turn to cancel (URL-encoded here).
 * @param options.turnId Included in the JSON body only when non-empty.
 * @param options.reason Included in the JSON body only when non-empty.
 * @returns The server's JSON reply, or `{ cancelled: false }` when the
 *   response status is not OK.
 */
export async function cancelHarnessTurn(
  agentId: string,
  options: { turnId?: string; reason?: string } = {},
): Promise<{ cancelled: boolean }> {
  const baseUrl = await getAgentServerUrl()

  // Only forward the fields the caller actually supplied.
  const payload: { turnId?: string; reason?: string } = {}
  if (options.turnId) payload.turnId = options.turnId
  if (options.reason) payload.reason = options.reason

  const endpoint = `${baseUrl}/agents/${encodeURIComponent(agentId)}/chat/cancel`
  const response = await fetch(endpoint, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  })

  if (response.ok) {
    return (await response.json()) as { cancelled: boolean }
  }
  return { cancelled: false }
}
|
||||
|
||||
export async function fetchHarnessAgentHistory(
|
||||
agentId: string,
|
||||
): Promise<HarnessAgentHistoryPage> {
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
|
||||
import { getAgentServerUrl } from '@/lib/browseros/helpers'
|
||||
import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
|
||||
|
||||
export interface AgentEntry {
|
||||
@@ -319,25 +318,3 @@ export function buildChatHistoryFromTurns(
|
||||
|
||||
return messages
|
||||
}
|
||||
|
||||
export async function chatWithAgent(
|
||||
agentId: string,
|
||||
message: string,
|
||||
sessionKey?: string,
|
||||
history: OpenClawChatHistoryMessage[] = [],
|
||||
signal?: AbortSignal,
|
||||
attachments?: ReadonlyArray<unknown>,
|
||||
): Promise<Response> {
|
||||
const baseUrl = await getAgentServerUrl()
|
||||
return fetch(`${baseUrl}/claw/agents/${agentId}/chat`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
message,
|
||||
sessionKey,
|
||||
history,
|
||||
...(attachments && attachments.length > 0 ? { attachments } : {}),
|
||||
}),
|
||||
signal,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -164,9 +164,17 @@ export const NewScheduledTaskDialog: FC<NewScheduledTaskDialogProps> = ({
|
||||
const resolvedProvider: Provider | null = (() => {
|
||||
const id = selectedProviderId ?? defaultProviderId
|
||||
const found = providers.find((p) => p.id === id)
|
||||
if (found) return { id: found.id, name: found.name, type: found.type }
|
||||
if (found) {
|
||||
return {
|
||||
kind: 'llm' as const,
|
||||
id: found.id,
|
||||
name: found.name,
|
||||
type: found.type,
|
||||
}
|
||||
}
|
||||
if (providers[0])
|
||||
return {
|
||||
kind: 'llm' as const,
|
||||
id: providers[0].id,
|
||||
name: providers[0].name,
|
||||
type: providers[0].type,
|
||||
@@ -175,6 +183,7 @@ export const NewScheduledTaskDialog: FC<NewScheduledTaskDialogProps> = ({
|
||||
})()
|
||||
|
||||
const providerOptions: Provider[] = providers.map((p) => ({
|
||||
kind: 'llm',
|
||||
id: p.id,
|
||||
name: p.name,
|
||||
type: p.type,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { Github, History, Plus, SettingsIcon } from 'lucide-react'
|
||||
import { Bot, Github, History, Plus, SettingsIcon } from 'lucide-react'
|
||||
import type { FC } from 'react'
|
||||
import { Link, useLocation, useNavigate } from 'react-router'
|
||||
import { ChatProviderSelector } from '@/components/chat/ChatProviderSelector'
|
||||
@@ -64,7 +64,9 @@ export const ChatHeader: FC<ChatHeaderProps> = ({
|
||||
className="group relative inline-flex cursor-pointer items-center gap-2 rounded-lg p-2 text-muted-foreground transition-colors hover:bg-muted/50 hover:text-foreground data-[state=open]:bg-accent"
|
||||
title="Change AI Provider"
|
||||
>
|
||||
{selectedProvider.type === 'browseros' ? (
|
||||
{selectedProvider.kind === 'acp' ? (
|
||||
<Bot className="h-[18px] w-[18px]" />
|
||||
) : selectedProvider.type === 'browseros' ? (
|
||||
<BrowserOSIcon size={18} />
|
||||
) : (
|
||||
<ProviderIcon
|
||||
|
||||
@@ -0,0 +1,258 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import type {
|
||||
HarnessAdapterDescriptor,
|
||||
HarnessAgent,
|
||||
} from '@/entrypoints/app/agents/agent-harness-types'
|
||||
import type { LlmProviderConfig } from '@/lib/llm-providers/types'
|
||||
import {
|
||||
buildSidepanelChatTargets,
|
||||
persistSidepanelChatTargetSelection,
|
||||
resolveSidepanelChatTarget,
|
||||
type SidepanelChatTargetSelection,
|
||||
toLlmProviderConfig,
|
||||
} from './sidepanel-chat-targets'
|
||||
|
||||
const timestamp = 1000
|
||||
|
||||
const providers: LlmProviderConfig[] = [
|
||||
{
|
||||
id: 'browseros',
|
||||
type: 'browseros',
|
||||
name: 'BrowserOS',
|
||||
baseUrl: 'https://api.browseros.com/v1',
|
||||
modelId: 'browseros-auto',
|
||||
supportsImages: true,
|
||||
contextWindow: 200000,
|
||||
temperature: 0.2,
|
||||
createdAt: timestamp,
|
||||
updatedAt: timestamp,
|
||||
},
|
||||
{
|
||||
id: 'anthropic-sonnet',
|
||||
type: 'anthropic',
|
||||
name: 'Anthropic Sonnet',
|
||||
modelId: 'claude-sonnet-4-6',
|
||||
apiKey: 'sk-ant',
|
||||
supportsImages: true,
|
||||
contextWindow: 200000,
|
||||
temperature: 0.2,
|
||||
createdAt: timestamp,
|
||||
updatedAt: timestamp,
|
||||
},
|
||||
]
|
||||
|
||||
const adapters: HarnessAdapterDescriptor[] = [
|
||||
{
|
||||
id: 'claude',
|
||||
name: 'Claude Code',
|
||||
defaultModelId: 'haiku',
|
||||
defaultReasoningEffort: 'medium',
|
||||
modelControl: 'best-effort',
|
||||
models: [
|
||||
{ id: 'sonnet', label: 'Sonnet' },
|
||||
{ id: 'haiku', label: 'Haiku', recommended: true },
|
||||
],
|
||||
reasoningEfforts: [
|
||||
{ id: 'medium', label: 'Medium', recommended: true },
|
||||
{ id: 'high', label: 'High' },
|
||||
],
|
||||
},
|
||||
{
|
||||
id: 'codex',
|
||||
name: 'Codex',
|
||||
defaultModelId: 'gpt-5.5',
|
||||
defaultReasoningEffort: 'medium',
|
||||
modelControl: 'runtime-supported',
|
||||
models: [{ id: 'gpt-5.5', label: 'GPT-5.5', recommended: true }],
|
||||
reasoningEfforts: [{ id: 'medium', label: 'Medium', recommended: true }],
|
||||
},
|
||||
{
|
||||
id: 'openclaw',
|
||||
name: 'OpenClaw',
|
||||
defaultModelId: 'default',
|
||||
defaultReasoningEffort: 'medium',
|
||||
modelControl: 'best-effort',
|
||||
models: [],
|
||||
reasoningEfforts: [
|
||||
{ id: 'medium', label: 'Medium', recommended: true },
|
||||
{ id: 'high', label: 'High' },
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
const agents: HarnessAgent[] = [
|
||||
{
|
||||
id: 'agent-codex',
|
||||
name: 'Review Bot',
|
||||
adapter: 'codex',
|
||||
modelId: 'gpt-5.5',
|
||||
reasoningEffort: 'medium',
|
||||
permissionMode: 'approve-all',
|
||||
sessionKey: 'agent:agent-codex:main',
|
||||
createdAt: timestamp,
|
||||
updatedAt: timestamp,
|
||||
},
|
||||
{
|
||||
id: 'agent-openclaw',
|
||||
name: 'Research Claw',
|
||||
adapter: 'openclaw',
|
||||
modelId: 'default',
|
||||
reasoningEffort: 'high',
|
||||
permissionMode: 'approve-all',
|
||||
sessionKey: 'agent:agent-openclaw:main',
|
||||
createdAt: timestamp,
|
||||
updatedAt: timestamp,
|
||||
},
|
||||
]
|
||||
|
||||
describe('buildSidepanelChatTargets', () => {
|
||||
it('returns LLM targets plus one ACP target per persisted harness agent', () => {
|
||||
const targets = buildSidepanelChatTargets({ providers, adapters, agents })
|
||||
|
||||
expect(targets.map((target) => target.id)).toEqual([
|
||||
'browseros',
|
||||
'anthropic-sonnet',
|
||||
'agent-codex',
|
||||
'agent-openclaw',
|
||||
])
|
||||
})
|
||||
|
||||
it('does not emit catalog-only ACP targets without persisted agents', () => {
|
||||
const targets = buildSidepanelChatTargets({
|
||||
providers,
|
||||
adapters,
|
||||
agents: [],
|
||||
})
|
||||
|
||||
expect(targets.map((target) => target.id)).toEqual([
|
||||
'browseros',
|
||||
'anthropic-sonnet',
|
||||
])
|
||||
})
|
||||
|
||||
it('uses the created OpenClaw agent name instead of a generic adapter target', () => {
|
||||
const targets = buildSidepanelChatTargets({ providers, adapters, agents })
|
||||
const openclaw = targets.find((target) => target.id === 'agent-openclaw')
|
||||
|
||||
expect(openclaw).toMatchObject({
|
||||
kind: 'acp',
|
||||
id: 'agent-openclaw',
|
||||
agentId: 'agent-openclaw',
|
||||
adapter: 'openclaw',
|
||||
adapterName: 'OpenClaw',
|
||||
modelId: 'default',
|
||||
modelLabel: 'default',
|
||||
name: 'Research Claw',
|
||||
modelControl: 'best-effort',
|
||||
reasoningEffort: 'high',
|
||||
})
|
||||
})
|
||||
|
||||
it('preserves adapter metadata for created agent targets', () => {
|
||||
const targets = buildSidepanelChatTargets({ providers, adapters, agents })
|
||||
const codex = targets.find((target) => target.id === 'agent-codex')
|
||||
|
||||
expect(codex).toMatchObject({
|
||||
kind: 'acp',
|
||||
agentId: 'agent-codex',
|
||||
adapter: 'codex',
|
||||
adapterName: 'Codex',
|
||||
modelId: 'gpt-5.5',
|
||||
modelLabel: 'GPT-5.5',
|
||||
modelControl: 'runtime-supported',
|
||||
recommended: true,
|
||||
reasoningEffort: 'medium',
|
||||
reasoningEffortLabel: 'Medium',
|
||||
})
|
||||
})
|
||||
|
||||
it('still returns LLM targets when agents and adapters are unavailable', () => {
|
||||
expect(
|
||||
buildSidepanelChatTargets({ providers, adapters: [], agents: [] }),
|
||||
).toEqual([
|
||||
{
|
||||
kind: 'llm',
|
||||
id: 'browseros',
|
||||
name: 'BrowserOS',
|
||||
type: 'browseros',
|
||||
provider: providers[0],
|
||||
},
|
||||
{
|
||||
kind: 'llm',
|
||||
id: 'anthropic-sonnet',
|
||||
name: 'Anthropic Sonnet',
|
||||
type: 'anthropic',
|
||||
provider: providers[1],
|
||||
},
|
||||
])
|
||||
})
|
||||
})
|
||||
|
||||
describe('resolveSidepanelChatTarget', () => {
|
||||
it('resolves selected LLM targets back to their provider config', () => {
|
||||
const targets = buildSidepanelChatTargets({ providers, adapters, agents })
|
||||
const resolved = resolveSidepanelChatTarget({
|
||||
targets,
|
||||
defaultProviderId: 'browseros',
|
||||
selection: { kind: 'llm', id: 'anthropic-sonnet' },
|
||||
})
|
||||
|
||||
expect(resolved?.kind).toBe('llm')
|
||||
expect(toLlmProviderConfig(resolved)?.modelId).toBe('claude-sonnet-4-6')
|
||||
})
|
||||
|
||||
it('falls back to the current default LLM provider when a persisted ACP target is stale', () => {
|
||||
const targets = buildSidepanelChatTargets({
|
||||
providers,
|
||||
adapters,
|
||||
agents: [],
|
||||
})
|
||||
|
||||
expect(
|
||||
resolveSidepanelChatTarget({
|
||||
targets,
|
||||
defaultProviderId: 'anthropic-sonnet',
|
||||
selection: { kind: 'acp', id: 'agent-codex' },
|
||||
}),
|
||||
).toMatchObject({
|
||||
kind: 'llm',
|
||||
id: 'anthropic-sonnet',
|
||||
})
|
||||
})
|
||||
|
||||
it('falls back when an old catalog-style ACP target id is persisted', () => {
|
||||
const targets = buildSidepanelChatTargets({ providers, adapters, agents })
|
||||
|
||||
expect(
|
||||
resolveSidepanelChatTarget({
|
||||
targets,
|
||||
defaultProviderId: 'anthropic-sonnet',
|
||||
selection: { kind: 'acp', id: 'acp:codex:gpt-5.5:medium' },
|
||||
}),
|
||||
).toMatchObject({
|
||||
kind: 'llm',
|
||||
id: 'anthropic-sonnet',
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
describe('persistSidepanelChatTargetSelection', () => {
|
||||
it('stores only target identity and does not mutate LLM provider arrays', async () => {
|
||||
let savedSelection: SidepanelChatTargetSelection | null = null
|
||||
const originalProviders = providers.map((provider) => ({ ...provider }))
|
||||
const targets = buildSidepanelChatTargets({ providers, adapters, agents })
|
||||
const target = targets.find((candidate) => candidate.id === 'agent-codex')
|
||||
|
||||
await persistSidepanelChatTargetSelection(target, {
|
||||
setValue: async (value) => {
|
||||
savedSelection = value
|
||||
},
|
||||
})
|
||||
|
||||
expect(savedSelection as SidepanelChatTargetSelection | null).toEqual({
|
||||
kind: 'acp',
|
||||
id: 'agent-codex',
|
||||
})
|
||||
expect(providers).toEqual(originalProviders)
|
||||
})
|
||||
})
|
||||
@@ -0,0 +1,178 @@
|
||||
import type {
|
||||
HarnessAdapterDescriptor,
|
||||
HarnessAgent,
|
||||
HarnessAgentAdapter,
|
||||
} from '@/entrypoints/app/agents/agent-harness-types'
|
||||
import type { LlmProviderConfig, ProviderType } from '@/lib/llm-providers/types'
|
||||
|
||||
export type SidepanelTargetKind = 'llm' | 'acp'
|
||||
|
||||
/**
 * Chat target backed by a configured LLM provider. The full provider config
 * travels with the target so it can be recovered without a second lookup.
 */
type SidepanelLlmChatTarget = {
  kind: 'llm'
  id: string
  name: string
  type: ProviderType
  provider: LlmProviderConfig
}

/**
 * Chat target backed by a harness agent, together with the adapter, model and
 * reasoning-effort details describing that agent.
 */
type SidepanelAcpChatTarget = {
  kind: 'acp'
  id: string
  name: string
  type: 'acp'
  agentId: string
  adapter: HarnessAgentAdapter
  adapterName: string
  modelId: string
  modelLabel: string
  modelControl: HarnessAdapterDescriptor['modelControl']
  recommended?: boolean
  reasoningEffort: string
  reasoningEffortLabel?: string
}

/**
 * Anything the sidepanel can chat with, discriminated by `kind`:
 * `'llm'` for provider-backed targets and `'acp'` for agent-backed targets.
 */
export type SidepanelChatTarget =
  | SidepanelLlmChatTarget
  | SidepanelAcpChatTarget
|
||||
|
||||
export type SidepanelChatTargetSelection = Pick<
|
||||
SidepanelChatTarget,
|
||||
'kind' | 'id'
|
||||
>
|
||||
|
||||
interface BuildSidepanelChatTargetsInput {
|
||||
providers: LlmProviderConfig[]
|
||||
adapters: HarnessAdapterDescriptor[]
|
||||
agents?: HarnessAgent[]
|
||||
}
|
||||
|
||||
interface ResolveSidepanelChatTargetInput {
|
||||
targets: SidepanelChatTarget[]
|
||||
defaultProviderId: string
|
||||
selection?: SidepanelChatTargetSelection | null
|
||||
}
|
||||
|
||||
interface SidepanelChatTargetSelectionWriter {
|
||||
setValue(value: SidepanelChatTargetSelection | null): Promise<void>
|
||||
}
|
||||
|
||||
interface SidepanelChatTargetSelectionReader {
|
||||
getValue(): Promise<SidepanelChatTargetSelection | null>
|
||||
}
|
||||
|
||||
type SidepanelChatTargetSelectionStore = SidepanelChatTargetSelectionReader &
|
||||
SidepanelChatTargetSelectionWriter
|
||||
|
||||
let sidepanelChatTargetSelectionStorage:
|
||||
| SidepanelChatTargetSelectionStore
|
||||
| undefined
|
||||
|
||||
/**
 * Builds the ordered list of chat targets for the sidepanel.
 *
 * LLM providers come first, in the order given, followed by one ACP target per
 * harness agent. Adapters are only used to enrich agent targets: an adapter
 * that has no agent contributes nothing to the result.
 *
 * @param input.providers LLM provider configs, each mapped to an `'llm'` target.
 * @param input.adapters Adapter descriptors used to look up agent metadata.
 * @param input.agents Harness agents; defaults to an empty list when omitted.
 * @returns LLM targets followed by ACP targets.
 */
export function buildSidepanelChatTargets({
  providers,
  adapters,
  agents = [],
}: BuildSidepanelChatTargetsInput): SidepanelChatTarget[] {
  const llmTargets = providers.map(toLlmTarget)
  const acpTargets = agents.map((agent) => toAcpTargetForAgent(agent, adapters))
  return llmTargets.concat(acpTargets)
}
|
||||
|
||||
/**
 * Converts a harness agent into an `'acp'` chat target.
 *
 * The matching adapter descriptor (if any) supplies display metadata. Model id
 * and reasoning effort prefer the agent's own value, then the adapter default,
 * then the literals `'default'` and `'medium'`. When no descriptor matches, the
 * adapter name comes from `formatAdapterName` and model control falls back to
 * `'best-effort'`.
 *
 * @param agent The agent to expose as a target; its id is used for both `id` and `agentId`.
 * @param adapters Known adapter descriptors, searched by `agent.adapter`.
 * @returns The ACP target describing the agent.
 */
function toAcpTargetForAgent(
  agent: HarnessAgent,
  adapters: HarnessAdapterDescriptor[],
): SidepanelChatTarget {
  const descriptor = adapters.find(({ id }) => id === agent.adapter)

  const modelId = agent.modelId ?? descriptor?.defaultModelId ?? 'default'
  const reasoningEffort =
    agent.reasoningEffort ?? descriptor?.defaultReasoningEffort ?? 'medium'

  // Catalog entries are optional: an unknown model or effort just loses its label.
  const modelEntry = descriptor?.models.find(({ id }) => id === modelId)
  const effortEntry = descriptor?.reasoningEfforts.find(
    ({ id }) => id === reasoningEffort,
  )

  const adapterName = descriptor?.name ?? formatAdapterName(agent.adapter)
  const modelControl = descriptor?.modelControl ?? 'best-effort'

  return {
    kind: 'acp',
    id: agent.id,
    name: agent.name,
    type: 'acp',
    agentId: agent.id,
    adapter: agent.adapter,
    adapterName,
    modelId,
    modelLabel: modelEntry?.label ?? modelId,
    modelControl,
    recommended: modelEntry?.recommended,
    reasoningEffort,
    reasoningEffortLabel: effortEntry?.label,
  }
}
|
||||
|
||||
/**
 * Returns a display name for an adapter id, for use when no adapter descriptor
 * is available. Ids without a dedicated label are returned unchanged.
 */
function formatAdapterName(adapter: HarnessAgentAdapter): string {
  switch (adapter) {
    case 'claude':
      return 'Claude Code'
    case 'codex':
      return 'Codex'
    case 'openclaw':
      return 'OpenClaw'
    default:
      return adapter
  }
}
|
||||
|
||||
/**
 * Picks the chat target that should be active.
 *
 * Resolution order:
 * 1. the target matching both `kind` and `id` of `selection`, if one exists;
 * 2. the LLM target whose id equals `defaultProviderId`;
 * 3. the first LLM target in `targets`.
 *
 * ACP targets are only ever returned through step 1, so a selection that no
 * longer matches any target resolves to an LLM target.
 *
 * @returns The resolved target, or `undefined` when nothing matches and there is no LLM target.
 */
export function resolveSidepanelChatTarget({
  targets,
  defaultProviderId,
  selection,
}: ResolveSidepanelChatTargetInput): SidepanelChatTarget | undefined {
  const selected = selection
    ? targets.find(
        ({ kind, id }) => kind === selection.kind && id === selection.id,
      )
    : undefined
  if (selected) return selected

  const llmTargets = targets.filter((target) => target.kind === 'llm')
  const defaultTarget = llmTargets.find(
    (target) => target.id === defaultProviderId,
  )
  return defaultTarget ?? llmTargets[0]
}
|
||||
|
||||
/**
 * Extracts the LLM provider config carried by a chat target.
 *
 * @param target A chat target, or `undefined` when nothing is selected.
 * @returns The target's `provider` for `'llm'` targets; `undefined` for ACP targets or no target.
 */
export function toLlmProviderConfig(
  target: SidepanelChatTarget | undefined,
): LlmProviderConfig | undefined {
  if (!target || target.kind !== 'llm') return undefined
  return target.provider
}
|
||||
|
||||
/**
 * Saves which chat target is selected.
 *
 * Only the target's identity (`kind` and `id`) is written; `undefined` is
 * stored as `null`.
 *
 * @param target The target to remember, or `undefined` to clear the selection.
 * @param store Optional writer to use instead of the default storage item.
 */
export async function persistSidepanelChatTargetSelection(
  target: SidepanelChatTarget | undefined,
  store?: SidepanelChatTargetSelectionWriter,
): Promise<void> {
  const writer = store ?? (await getSidepanelChatTargetSelectionStorage())
  const selection: SidepanelChatTargetSelection | null = target
    ? { kind: target.kind, id: target.id }
    : null
  await writer.setValue(selection)
}
|
||||
|
||||
/**
 * Reads the saved chat target selection.
 *
 * @param store Optional reader to use instead of the default storage item.
 * @returns Whatever the store's `getValue()` yields: a selection or `null`.
 */
export async function loadSidepanelChatTargetSelection(
  store?: SidepanelChatTargetSelectionReader,
): Promise<SidepanelChatTargetSelection | null> {
  if (store != null) return store.getValue()

  const defaultStore = await getSidepanelChatTargetSelectionStorage()
  return defaultStore.getValue()
}
|
||||
|
||||
/**
 * Wraps an LLM provider config as an `'llm'` chat target. The provider's id,
 * name and type are copied to the top level and the config object itself is
 * attached by reference as `provider`.
 */
function toLlmTarget(provider: LlmProviderConfig): SidepanelChatTarget {
  const { id, name, type } = provider
  return { kind: 'llm', id, name, type, provider }
}
|
||||
|
||||
/**
 * Returns the storage item that holds the sidepanel chat target selection.
 *
 * On first use it dynamically imports `@wxt-dev/storage`, defines the item
 * under the key `local:sidepanel-chat-target-selection` with a `null`
 * fallback, and caches it in the module-level variable. Later calls return
 * the cached item.
 */
async function getSidepanelChatTargetSelectionStorage(): Promise<SidepanelChatTargetSelectionStore> {
  if (!sidepanelChatTargetSelectionStorage) {
    const wxtStorage = await import('@wxt-dev/storage')
    sidepanelChatTargetSelectionStorage =
      wxtStorage.storage.defineItem<SidepanelChatTargetSelection | null>(
        'local:sidepanel-chat-target-selection',
        { fallback: null },
      )
  }

  return sidepanelChatTargetSelectionStorage
}
|
||||
@@ -1,9 +1,21 @@
|
||||
import { useEffect, useRef } from 'react'
|
||||
import { useCallback, useEffect, useMemo, useRef, useState } from 'react'
|
||||
import useDeepCompareEffect from 'use-deep-compare-effect'
|
||||
import {
|
||||
useAgentAdapters,
|
||||
useHarnessAgents,
|
||||
} from '@/entrypoints/app/agents/useAgents'
|
||||
import type { LlmProviderConfig } from '@/lib/llm-providers/types'
|
||||
import { useLlmProviders } from '@/lib/llm-providers/useLlmProviders'
|
||||
import { type McpServer, useMcpServers } from '@/lib/mcp/mcpServerStorage'
|
||||
import { usePersonalization } from '@/lib/personalization/personalizationStorage'
|
||||
import {
|
||||
buildSidepanelChatTargets,
|
||||
loadSidepanelChatTargetSelection,
|
||||
persistSidepanelChatTargetSelection,
|
||||
resolveSidepanelChatTarget,
|
||||
type SidepanelChatTarget,
|
||||
type SidepanelChatTargetSelection,
|
||||
} from './sidepanel-chat-targets'
|
||||
|
||||
const constructMcpServers = (servers: McpServer[]) => {
|
||||
return servers
|
||||
@@ -23,14 +35,53 @@ const constructCustomServers = (servers: McpServer[]) => {
|
||||
export const useChatRefs = () => {
|
||||
const { servers: mcpServers } = useMcpServers()
|
||||
const {
|
||||
providers: llmProviders,
|
||||
selectedProvider: selectedLlmProvider,
|
||||
setDefaultProvider,
|
||||
isLoading: isLoadingProviders,
|
||||
} = useLlmProviders()
|
||||
const { adapters, loading: isLoadingAdapters } = useAgentAdapters()
|
||||
const { harnessAgents, loading: isLoadingAgents } = useHarnessAgents()
|
||||
const { personalization } = usePersonalization()
|
||||
const [targetSelection, setTargetSelection] =
|
||||
useState<SidepanelChatTargetSelection | null>(null)
|
||||
|
||||
useEffect(() => {
|
||||
let cancelled = false
|
||||
loadSidepanelChatTargetSelection().then((selection) => {
|
||||
if (!cancelled) setTargetSelection(selection)
|
||||
})
|
||||
return () => {
|
||||
cancelled = true
|
||||
}
|
||||
}, [])
|
||||
|
||||
const chatTargets = useMemo(
|
||||
() =>
|
||||
buildSidepanelChatTargets({
|
||||
providers: llmProviders,
|
||||
adapters,
|
||||
agents: harnessAgents,
|
||||
}),
|
||||
[llmProviders, adapters, harnessAgents],
|
||||
)
|
||||
|
||||
const selectedChatTarget = useMemo(
|
||||
() =>
|
||||
resolveSidepanelChatTarget({
|
||||
targets: chatTargets,
|
||||
defaultProviderId: selectedLlmProvider?.id ?? llmProviders[0]?.id ?? '',
|
||||
selection: targetSelection,
|
||||
}),
|
||||
[chatTargets, llmProviders, selectedLlmProvider, targetSelection],
|
||||
)
|
||||
|
||||
const selectedLlmProviderRef = useRef<LlmProviderConfig | null>(
|
||||
selectedLlmProvider,
|
||||
)
|
||||
const selectedChatTargetRef = useRef<SidepanelChatTarget | undefined>(
|
||||
selectedChatTarget,
|
||||
)
|
||||
const enabledMcpServersRef = useRef(constructMcpServers(mcpServers))
|
||||
const enabledCustomServersRef = useRef(constructCustomServers(mcpServers))
|
||||
const personalizationRef = useRef(personalization)
|
||||
@@ -41,16 +92,36 @@ export const useChatRefs = () => {
|
||||
enabledCustomServersRef.current = constructCustomServers(mcpServers)
|
||||
}, [selectedLlmProvider, mcpServers])
|
||||
|
||||
useEffect(() => {
|
||||
selectedChatTargetRef.current = selectedChatTarget
|
||||
}, [selectedChatTarget])
|
||||
|
||||
useEffect(() => {
|
||||
personalizationRef.current = personalization
|
||||
}, [personalization])
|
||||
|
||||
const selectChatTarget = useCallback(
|
||||
async (target: SidepanelChatTarget | undefined) => {
|
||||
selectedChatTargetRef.current = target
|
||||
setTargetSelection(target ? { kind: target.kind, id: target.id } : null)
|
||||
await persistSidepanelChatTargetSelection(target)
|
||||
},
|
||||
[],
|
||||
)
|
||||
|
||||
return {
|
||||
selectedLlmProviderRef,
|
||||
selectedChatTargetRef,
|
||||
enabledMcpServersRef,
|
||||
enabledCustomServersRef,
|
||||
personalizationRef,
|
||||
llmProviders,
|
||||
setDefaultProvider,
|
||||
chatTargets,
|
||||
selectedChatTarget,
|
||||
selectChatTarget,
|
||||
selectedLlmProvider,
|
||||
isLoadingProviders,
|
||||
isLoadingProviders:
|
||||
isLoadingProviders || isLoadingAdapters || isLoadingAgents,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,153 @@
|
||||
import { describe, expect, it } from 'bun:test'
|
||||
import type { LlmProviderConfig } from '@/lib/llm-providers/types'
|
||||
import type { ChatMode } from './chatTypes'
|
||||
import type { SidepanelChatTarget } from './sidepanel-chat-targets'
|
||||
import { buildSidepanelPreparedSendMessagesRequest } from './useChatSessionRequest'
|
||||
|
||||
const conversationId = '00000000-0000-4000-8000-000000000001'
|
||||
|
||||
describe('buildSidepanelPreparedSendMessagesRequest', () => {
|
||||
it('keeps LLM targets on the existing /chat request body', () => {
|
||||
const request = buildSidepanelPreparedSendMessagesRequest({
|
||||
agentServerUrl: 'http://127.0.0.1:5151',
|
||||
target: llmTarget,
|
||||
fallbackProvider,
|
||||
message: 'Summarize this page',
|
||||
...commonRequestInput(),
|
||||
})
|
||||
|
||||
expect(request.api).toBe('http://127.0.0.1:5151/chat')
|
||||
expect(request.body).toMatchObject({
|
||||
message: 'Summarize this page',
|
||||
conversationId,
|
||||
provider: 'browseros',
|
||||
providerType: 'browseros',
|
||||
providerName: 'BrowserOS',
|
||||
model: 'gpt-5',
|
||||
mode: 'agent',
|
||||
browserContext: {
|
||||
activeTab: { id: 10, url: 'https://example.com', title: 'Example' },
|
||||
enabledMcpServers: ['slack'],
|
||||
},
|
||||
userSystemPrompt: 'Be concise',
|
||||
userWorkingDir: '/tmp/work',
|
||||
previousConversation: [{ role: 'assistant', content: 'Prior answer' }],
|
||||
selectedText: 'selected text',
|
||||
selectedTextSource: {
|
||||
url: 'https://example.com',
|
||||
title: 'Example',
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
it('sends created-agent targets to the agent-id sidepanel route', () => {
|
||||
const request = buildSidepanelPreparedSendMessagesRequest({
|
||||
agentServerUrl: 'http://127.0.0.1:5151',
|
||||
target: acpTarget,
|
||||
fallbackProvider,
|
||||
message: 'Inspect the current tab',
|
||||
approvalResponses: [
|
||||
{ approvalId: 'approval-1', approved: true, reason: 'ok' },
|
||||
],
|
||||
...commonRequestInput(),
|
||||
})
|
||||
|
||||
expect(request.api).toBe(
|
||||
'http://127.0.0.1:5151/agents/agent-codex/sidepanel/chat',
|
||||
)
|
||||
expect(request.body).toEqual({
|
||||
conversationId,
|
||||
message: 'Inspect the current tab',
|
||||
browserContext: {
|
||||
activeTab: { id: 10, url: 'https://example.com', title: 'Example' },
|
||||
enabledMcpServers: ['slack'],
|
||||
},
|
||||
userSystemPrompt: 'Be concise',
|
||||
userWorkingDir: '/tmp/work',
|
||||
selectedText: 'selected text',
|
||||
selectedTextSource: {
|
||||
url: 'https://example.com',
|
||||
title: 'Example',
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
it('keeps tool approval retry payloads scoped to LLM chat', () => {
|
||||
const request = buildSidepanelPreparedSendMessagesRequest({
|
||||
agentServerUrl: 'http://127.0.0.1:5151',
|
||||
target: llmTarget,
|
||||
fallbackProvider,
|
||||
approvalResponses: [
|
||||
{ approvalId: 'approval-1', approved: false, reason: 'no' },
|
||||
],
|
||||
...commonRequestInput(),
|
||||
})
|
||||
|
||||
expect(request.api).toBe('http://127.0.0.1:5151/chat')
|
||||
expect(request.body).toMatchObject({
|
||||
message: '',
|
||||
toolApprovalResponses: [
|
||||
{ approvalId: 'approval-1', approved: false, reason: 'no' },
|
||||
],
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
/**
 * Returns a fresh copy of the request fields shared by every test case in
 * this file. Each test spreads it into its own request and adds the target,
 * message and approval fields it needs.
 */
function commonRequestInput() {
  const activeTab = { id: 10, url: 'https://example.com', title: 'Example' }
  const browserContext = { activeTab, enabledMcpServers: ['slack'] }
  const selectedTextSource = { url: 'https://example.com', title: 'Example' }
  const aclRules = [{ id: 'rule-1', sitePattern: '*://*/*', enabled: true }]
  const toolApprovalConfig = { categories: { navigation: true } }

  return {
    conversationId,
    mode: 'agent' as ChatMode,
    browserContext,
    userSystemPrompt: 'Be concise',
    userWorkingDir: '/tmp/work',
    previousConversation: [
      { role: 'assistant' as const, content: 'Prior answer' },
    ],
    declinedApps: ['gmail'],
    aclRules,
    selectedText: 'selected text',
    selectedTextSource,
    toolApprovalConfig,
  }
}
|
||||
|
||||
const fallbackProvider: LlmProviderConfig = {
|
||||
id: 'browseros',
|
||||
type: 'browseros',
|
||||
name: 'BrowserOS',
|
||||
modelId: 'gpt-5',
|
||||
supportsImages: true,
|
||||
contextWindow: 128000,
|
||||
temperature: 0.7,
|
||||
createdAt: 1000,
|
||||
updatedAt: 1000,
|
||||
}
|
||||
|
||||
const llmTarget: SidepanelChatTarget = {
|
||||
kind: 'llm',
|
||||
id: fallbackProvider.id,
|
||||
name: fallbackProvider.name,
|
||||
type: fallbackProvider.type,
|
||||
provider: fallbackProvider,
|
||||
}
|
||||
|
||||
const acpTarget: SidepanelChatTarget = {
|
||||
kind: 'acp',
|
||||
id: 'agent-codex',
|
||||
name: 'Review bot',
|
||||
type: 'acp',
|
||||
agentId: 'agent-codex',
|
||||
adapter: 'codex',
|
||||
adapterName: 'Codex',
|
||||
modelId: 'gpt-5.5',
|
||||
modelLabel: 'GPT-5.5',
|
||||
modelControl: 'best-effort',
|
||||
reasoningEffort: 'medium',
|
||||
reasoningEffortLabel: 'Medium',
|
||||
}
|
||||
@@ -26,15 +26,14 @@ import { useInvalidateCredits } from '@/lib/credits/useCredits'
|
||||
import { declinedAppsStorage } from '@/lib/declined-apps/storage'
|
||||
import { useGraphqlQuery } from '@/lib/graphql/useGraphqlQuery'
|
||||
import { createDefaultBrowserOSProvider } from '@/lib/llm-providers/storage'
|
||||
import { useLlmProviders } from '@/lib/llm-providers/useLlmProviders'
|
||||
import {
|
||||
type ApprovalResponseData,
|
||||
buildChatRequestBody,
|
||||
type ChatRequestBrowserContext,
|
||||
import type {
|
||||
ApprovalResponseData,
|
||||
ChatRequestBrowserContext,
|
||||
} from '@/lib/messaging/server/buildChatRequestBody'
|
||||
import { track } from '@/lib/metrics/track'
|
||||
import { searchActionsStorage } from '@/lib/search-actions/searchActionsStorage'
|
||||
import { selectedTextStorage } from '@/lib/selected-text/selectedTextStorage'
|
||||
import { sentry } from '@/lib/sentry/sentry'
|
||||
import { stopAgentStorage } from '@/lib/stop-agent/stop-agent-storage'
|
||||
import {
|
||||
type ApprovalResponse,
|
||||
@@ -52,7 +51,12 @@ import {
|
||||
import { selectedWorkspaceStorage } from '@/lib/workspace/workspace-storage'
|
||||
import type { ChatMode } from './chatTypes'
|
||||
import { GetConversationWithMessagesDocument } from './graphql/chatSessionDocument'
|
||||
import { toLlmProviderConfig } from './sidepanel-chat-targets'
|
||||
import { useChatRefs } from './useChatRefs'
|
||||
import {
|
||||
buildSidepanelPreparedSendMessagesRequest,
|
||||
toProviderOption,
|
||||
} from './useChatSessionRequest'
|
||||
import { useExecutionHistoryTracker } from './useExecutionHistoryTracker'
|
||||
import { useNotifyActiveTab } from './useNotifyActiveTab'
|
||||
import { useRemoteConversationSave } from './useRemoteConversationSave'
|
||||
@@ -186,16 +190,19 @@ const buildRequestBrowserContext = ({
|
||||
export const useChatSession = (options?: ChatSessionOptions) => {
|
||||
const {
|
||||
selectedLlmProviderRef,
|
||||
selectedChatTargetRef,
|
||||
enabledMcpServersRef,
|
||||
enabledCustomServersRef,
|
||||
personalizationRef,
|
||||
setDefaultProvider,
|
||||
chatTargets,
|
||||
selectedChatTarget,
|
||||
selectChatTarget,
|
||||
selectedLlmProvider,
|
||||
isLoadingProviders,
|
||||
} = useChatRefs()
|
||||
const invalidateCredits = useInvalidateCredits()
|
||||
|
||||
const { providers: llmProviders, setDefaultProvider } = useLlmProviders()
|
||||
|
||||
const {
|
||||
baseUrl: agentServerUrl,
|
||||
isLoading: isLoadingAgentUrl,
|
||||
@@ -218,11 +225,7 @@ export const useChatSession = (options?: ChatSessionOptions) => {
|
||||
agentUrlRef.current = agentServerUrl
|
||||
}, [agentServerUrl])
|
||||
|
||||
const providers: Provider[] = llmProviders.map((p) => ({
|
||||
id: p.id,
|
||||
name: p.name,
|
||||
type: p.type,
|
||||
}))
|
||||
const providers: Provider[] = chatTargets.map(toProviderOption)
|
||||
|
||||
const [mode, setMode] = useState<ChatMode>('agent')
|
||||
const [textToAction, setTextToAction] = useState<Map<string, ChatAction>>(
|
||||
@@ -324,15 +327,8 @@ export const useChatSession = (options?: ChatSessionOptions) => {
|
||||
textToActionRef.current = textToAction
|
||||
}, [mode, textToAction])
|
||||
|
||||
const selectedProvider = selectedLlmProvider
|
||||
? {
|
||||
id: selectedLlmProvider.id,
|
||||
name: selectedLlmProvider.name,
|
||||
type:
|
||||
selectedLlmProvider.id === 'browseros'
|
||||
? ('browseros' as const)
|
||||
: selectedLlmProvider.type,
|
||||
}
|
||||
const selectedProvider = selectedChatTarget
|
||||
? toProviderOption(selectedChatTarget)
|
||||
: providers[0]
|
||||
|
||||
const {
|
||||
@@ -346,7 +342,8 @@ export const useChatSession = (options?: ChatSessionOptions) => {
|
||||
} = useChat({
|
||||
transport: new DefaultChatTransport({
|
||||
prepareSendMessagesRequest: async ({ messages }) => {
|
||||
const provider =
|
||||
const target = selectedChatTargetRef.current
|
||||
const fallbackProvider =
|
||||
selectedLlmProviderRef.current ?? createDefaultBrowserOSProvider()
|
||||
const activeTabsList = await chrome.tabs.query({
|
||||
active: true,
|
||||
@@ -395,51 +392,46 @@ export const useChatSession = (options?: ChatSessionOptions) => {
|
||||
personalizationRef.current,
|
||||
)
|
||||
|
||||
const approvalResponses = extractApprovalResponses(messages)
|
||||
const commonRequest = {
|
||||
conversationId: conversationIdRef.current,
|
||||
mode: currentMode,
|
||||
browserContext: requestBrowserContext,
|
||||
userSystemPrompt,
|
||||
userWorkingDir: workingDirRef.current,
|
||||
previousConversation,
|
||||
declinedApps,
|
||||
aclRules: enabledAclRules,
|
||||
toolApprovalConfig: approvalConfig,
|
||||
}
|
||||
|
||||
const approvalResponses =
|
||||
target?.kind === 'acp' ? null : extractApprovalResponses(messages)
|
||||
if (approvalResponses) {
|
||||
return {
|
||||
api: `${agentUrlRef.current}/chat`,
|
||||
body: buildChatRequestBody({
|
||||
conversationId: conversationIdRef.current,
|
||||
provider,
|
||||
mode: currentMode,
|
||||
browserContext: requestBrowserContext,
|
||||
userSystemPrompt,
|
||||
userWorkingDir: workingDirRef.current,
|
||||
previousConversation,
|
||||
declinedApps,
|
||||
aclRules: enabledAclRules,
|
||||
toolApprovalConfig: approvalConfig,
|
||||
toolApprovalResponses: approvalResponses,
|
||||
}),
|
||||
}
|
||||
return buildSidepanelPreparedSendMessagesRequest({
|
||||
agentServerUrl: agentUrlRef.current ?? undefined,
|
||||
target,
|
||||
fallbackProvider,
|
||||
...commonRequest,
|
||||
approvalResponses,
|
||||
})
|
||||
}
|
||||
|
||||
const message = getLastMessageText(messages)
|
||||
|
||||
const result = {
|
||||
api: `${agentUrlRef.current}/chat`,
|
||||
body: buildChatRequestBody({
|
||||
message,
|
||||
conversationId: conversationIdRef.current,
|
||||
provider,
|
||||
mode: currentMode,
|
||||
browserContext: requestBrowserContext,
|
||||
userSystemPrompt,
|
||||
userWorkingDir: workingDirRef.current,
|
||||
previousConversation,
|
||||
declinedApps,
|
||||
aclRules: enabledAclRules,
|
||||
selectedText: activeTabSelection?.text,
|
||||
selectedTextSource: activeTabSelection
|
||||
? {
|
||||
url: activeTabSelection.url,
|
||||
title: activeTabSelection.title,
|
||||
}
|
||||
: undefined,
|
||||
toolApprovalConfig: approvalConfig,
|
||||
}),
|
||||
}
|
||||
const result = buildSidepanelPreparedSendMessagesRequest({
|
||||
agentServerUrl: agentUrlRef.current ?? undefined,
|
||||
target,
|
||||
fallbackProvider,
|
||||
message,
|
||||
...commonRequest,
|
||||
selectedText: activeTabSelection?.text,
|
||||
selectedTextSource: activeTabSelection
|
||||
? {
|
||||
url: activeTabSelection.url,
|
||||
title: activeTabSelection.title,
|
||||
}
|
||||
: undefined,
|
||||
})
|
||||
|
||||
// Track which tab's selection was sent so we can clear it on success
|
||||
pendingSelectionTabKeyRef.current =
|
||||
@@ -451,7 +443,7 @@ export const useChatSession = (options?: ChatSessionOptions) => {
|
||||
sendAutomaticallyWhen: () => {
|
||||
if (approvalJustRespondedRef.current) {
|
||||
approvalJustRespondedRef.current = false
|
||||
return true
|
||||
return selectedChatTargetRef.current?.kind !== 'acp'
|
||||
}
|
||||
return false
|
||||
},
|
||||
@@ -686,10 +678,22 @@ export const useChatSession = (options?: ChatSessionOptions) => {
|
||||
}, [dispatchMessage, isIntegrationsSynced])
|
||||
|
||||
const sendMessage = (params: { text: string; action?: ChatAction }) => {
|
||||
const target = selectedChatTargetRef.current
|
||||
const llmTargetProvider = toLlmProviderConfig(target)
|
||||
const agentTarget = target?.kind === 'acp' ? target : undefined
|
||||
track(MESSAGE_SENT_EVENT, {
|
||||
mode,
|
||||
provider_type: selectedLlmProvider?.type,
|
||||
model: selectedLlmProvider?.modelId,
|
||||
provider_id:
|
||||
agentTarget?.agentId ??
|
||||
llmTargetProvider?.id ??
|
||||
selectedLlmProvider?.id,
|
||||
provider_type: agentTarget ? 'acp' : llmTargetProvider?.type,
|
||||
agent_id: agentTarget?.agentId,
|
||||
adapter: agentTarget?.adapter,
|
||||
model:
|
||||
agentTarget?.modelId ??
|
||||
llmTargetProvider?.modelId ??
|
||||
selectedLlmProvider?.modelId,
|
||||
})
|
||||
|
||||
if (!isIntegrationsSyncedRef.current) {
|
||||
@@ -741,14 +745,54 @@ export const useChatSession = (options?: ChatSessionOptions) => {
|
||||
addToolApprovalResponse(params)
|
||||
}
|
||||
|
||||
const resetConversationState = () => {
|
||||
stop()
|
||||
void finishExecutionTask({ isAbort: true })
|
||||
setConversationId(crypto.randomUUID())
|
||||
setMessages([])
|
||||
setTextToAction(new Map())
|
||||
setLiked({})
|
||||
setDisliked({})
|
||||
setRestoredConversationId(null)
|
||||
resetRemoteConversation()
|
||||
}
|
||||
|
||||
const handleSelectProvider = (provider: Provider) => {
|
||||
const fullProvider = llmProviders.find((p) => p.id === provider.id)
|
||||
const target = chatTargets.find(
|
||||
(candidate) =>
|
||||
candidate.id === provider.id && candidate.kind === provider.kind,
|
||||
)
|
||||
if (!target) return
|
||||
|
||||
const previousTarget = selectedChatTargetRef.current
|
||||
track(PROVIDER_SELECTED_EVENT, {
|
||||
provider_id: provider.id,
|
||||
provider_type: provider.type,
|
||||
model_id: fullProvider?.modelId,
|
||||
provider_id: target.id,
|
||||
provider_type: target.kind === 'acp' ? 'acp' : target.type,
|
||||
model_id:
|
||||
target.kind === 'acp' ? target.modelId : target.provider.modelId,
|
||||
agent_id: target.kind === 'acp' ? target.agentId : undefined,
|
||||
adapter: target.kind === 'acp' ? target.adapter : undefined,
|
||||
})
|
||||
setDefaultProvider(provider.id)
|
||||
|
||||
void selectChatTarget(target).catch((error) => {
|
||||
sentry.captureException(error, {
|
||||
extra: {
|
||||
message: 'Failed to persist sidepanel chat target selection',
|
||||
targetId: target.id,
|
||||
targetKind: target.kind,
|
||||
},
|
||||
})
|
||||
})
|
||||
if (target.kind === 'llm') setDefaultProvider(target.provider.id)
|
||||
|
||||
if (
|
||||
previousTarget &&
|
||||
(previousTarget.kind !== target.kind ||
|
||||
previousTarget.id !== target.id) &&
|
||||
messagesRef.current.length > 0
|
||||
) {
|
||||
resetConversationState()
|
||||
}
|
||||
}
|
||||
|
||||
const getActionForMessage = (message: UIMessage) => {
|
||||
@@ -762,15 +806,7 @@ export const useChatSession = (options?: ChatSessionOptions) => {
|
||||
|
||||
const resetConversation = () => {
|
||||
track(CONVERSATION_RESET_EVENT, { message_count: messages.length })
|
||||
stop()
|
||||
void finishExecutionTask({ isAbort: true })
|
||||
setConversationId(crypto.randomUUID())
|
||||
setMessages([])
|
||||
setTextToAction(new Map())
|
||||
setLiked({})
|
||||
setDisliked({})
|
||||
setRestoredConversationId(null)
|
||||
resetRemoteConversation()
|
||||
resetConversationState()
|
||||
}
|
||||
|
||||
const isRestoringConversation =
|
||||
|
||||
@@ -0,0 +1,74 @@
|
||||
import type { Provider } from '../../../components/chat/chatComponentTypes'
|
||||
import type { LlmProviderConfig } from '../../../lib/llm-providers/types'
|
||||
import {
|
||||
type ApprovalResponseData,
|
||||
buildChatRequestBody,
|
||||
} from '../../../lib/messaging/server/buildChatRequestBody'
|
||||
import {
|
||||
type SidepanelChatTarget,
|
||||
toLlmProviderConfig,
|
||||
} from './sidepanel-chat-targets'
|
||||
|
||||
/** Argument object accepted by `buildChatRequestBody` (its first parameter). */
type LlmChatRequestBodyInput = Parameters<typeof buildChatRequestBody>[0]
|
||||
|
||||
/**
 * The `buildChatRequestBody` input minus the fields the sidepanel request
 * builder supplies itself (`provider`, `message`, `toolApprovalResponses`)
 * and minus `isScheduledTask`, which sidepanel requests never set.
 */
type CommonSidepanelRequestInput = Omit<
|
||||
LlmChatRequestBodyInput,
|
||||
'provider' | 'message' | 'toolApprovalResponses' | 'isScheduledTask'
|
||||
>
|
||||
|
||||
/**
 * Input for `buildSidepanelPreparedSendMessagesRequest`.
 *
 * - `agentServerUrl`: base URL the request path is appended to.
 * - `target`: selected chat target; an `'acp'` target is routed to the
 *   per-agent sidepanel endpoint, anything else to the LLM chat endpoint.
 * - `fallbackProvider`: provider used when `target` does not convert to an
 *   LLM provider config.
 * - `message`: user text; the ACP request sends `''` when it is absent.
 * - `approvalResponses`: forwarded as `toolApprovalResponses` on the LLM
 *   path only (`null` is sent as `undefined`).
 */
interface BuildSidepanelPreparedSendMessagesRequestInput
|
||||
extends CommonSidepanelRequestInput {
|
||||
agentServerUrl: string | undefined
|
||||
target: SidepanelChatTarget | undefined
|
||||
fallbackProvider: LlmProviderConfig
|
||||
message?: string
|
||||
approvalResponses?: ApprovalResponseData[] | null
|
||||
}
|
||||
|
||||
/**
 * Builds the `{ api, body }` pair for one sidepanel chat request.
 *
 * - ACP targets (`target.kind === 'acp'`) are sent to
 *   `<agentServerUrl>/agents/<agentId>/sidepanel/chat` with a reduced body
 *   (conversation id, message, browser context, user prompt/working dir and
 *   selected text). Tool approval responses are not part of that body.
 * - Every other target is sent to `<agentServerUrl>/chat` with the body
 *   produced by `buildChatRequestBody`, using the target's LLM provider
 *   config or `fallbackProvider` when the target does not yield one.
 *
 * @throws Error when `agentServerUrl` is undefined. Interpolating it would
 *   otherwise produce a bogus URL such as `"undefined/chat"`.
 */
export function buildSidepanelPreparedSendMessagesRequest({
  agentServerUrl,
  target,
  fallbackProvider,
  message,
  approvalResponses,
  ...common
}: BuildSidepanelPreparedSendMessagesRequestInput) {
  // Fail fast with a clear message instead of issuing a request to
  // "undefined/...", which only surfaces later as an opaque network error.
  if (agentServerUrl === undefined) {
    throw new Error(
      'Cannot build sidepanel chat request: agent server URL is not available',
    )
  }

  if (target?.kind === 'acp') {
    return {
      // The agent id is user/config supplied, so it is escaped as a path segment.
      api: `${agentServerUrl}/agents/${encodeURIComponent(target.agentId)}/sidepanel/chat`,
      body: {
        conversationId: common.conversationId,
        // The ACP endpoint always receives a string message.
        message: message ?? '',
        browserContext: common.browserContext,
        userSystemPrompt: common.userSystemPrompt,
        userWorkingDir: common.userWorkingDir,
        selectedText: common.selectedText,
        selectedTextSource: common.selectedTextSource,
      },
    }
  }

  const provider = toLlmProviderConfig(target) ?? fallbackProvider
  return {
    api: `${agentServerUrl}/chat`,
    body: buildChatRequestBody({
      ...common,
      provider,
      message,
      // `null` means "no responses"; the body builder expects `undefined`.
      toolApprovalResponses: approvalResponses ?? undefined,
    }),
  }
}
|
||||
|
||||
/**
 * Converts a sidepanel chat target into the `Provider` option shape used by
 * the chat provider picker.
 *
 * `id`, `name`, `type` and `kind` are copied from every target. The agent
 * fields (`agentId`, `adapterName`, `modelLabel`, `modelControl`) are copied
 * only from `'acp'` targets and are `undefined` for all other kinds.
 */
export function toProviderOption(target: SidepanelChatTarget): Provider {
  // Narrow once; non-ACP targets leave every agent-specific field undefined.
  const acpTarget = target.kind === 'acp' ? target : undefined

  const option: Provider = {
    id: target.id,
    name: target.name,
    type: target.type,
    kind: target.kind,
    agentId: acpTarget?.agentId,
    adapterName: acpTarget?.adapterName,
    modelLabel: acpTarget?.modelLabel,
    modelControl: acpTarget?.modelControl,
  }
  return option
}
|
||||
@@ -2,29 +2,75 @@ function isAbortError(error: unknown): boolean {
|
||||
return error instanceof DOMException && error.name === 'AbortError'
|
||||
}
|
||||
|
||||
/** One decoded SSE event: its JSON payload plus the optional numeric id. */
export interface ParsedSSEEvent<T> {
  data: T
  /** Numeric `id:` line on the same SSE event, if any. */
  seq?: number
}

/**
 * Incrementally parses a Server-Sent Events buffer.
 *
 * Events are separated by blank lines. Only events whose terminating blank
 * line has been fully received are emitted; everything after the last such
 * boundary is returned verbatim as `remainder`, to be prepended to the next
 * chunk. The unterminated tail of the buffer is never treated as a blank
 * line, so a chunk that ends between the `data:` and `id:` lines of one event
 * does not emit the event early or lose its id.
 *
 * Recognised fields are `id:` (numeric, kept as `seq`) and `data:`; the
 * single space after the colon is optional. Several `data:` lines in one
 * event are joined with `\n` before JSON parsing. Both LF and CRLF line
 * endings are accepted. All other lines are ignored, as are `[DONE]`
 * sentinels and payloads that are not valid JSON.
 *
 * @param buffer Raw text accumulated from the stream so far.
 * @param options.flushIncomplete When `true`, the whole buffer is treated as
 *   final: a trailing event without a terminating blank line is still
 *   emitted and `remainder` is `''`. Meant for a last drain after the stream
 *   has closed. Defaults to `false`.
 * @returns The decoded events in order and the unconsumed remainder.
 */
export function parseSSELines<T>(
  buffer: string,
  options: { flushIncomplete?: boolean } = {},
): {
  events: ParsedSSEEvent<T>[]
  remainder: string
} {
  const events: ParsedSSEEvent<T>[] = []

  const stripCR = (line: string): string =>
    line.endsWith('\r') ? line.slice(0, -1) : line

  // Returns the value of `<field>:` lines (one optional leading space
  // removed), or undefined when the line is a different field.
  const fieldValue = (line: string, field: string): string | undefined => {
    if (!line.startsWith(`${field}:`)) return undefined
    const value = line.slice(field.length + 1)
    return value.startsWith(' ') ? value.slice(1) : value
  }

  const rawLines = buffer.split('\n')
  let eventLines: string[]
  let remainder: string

  if (options.flushIncomplete) {
    eventLines = rawLines.map(stripCR)
    remainder = ''
  } else {
    // The last split element is the not-yet-terminated tail (possibly ''),
    // so it must not count as a blank-line boundary.
    const terminated = rawLines.slice(0, -1).map(stripCR)
    const lastBoundary = terminated.lastIndexOf('')
    if (lastBoundary < 0) return { events, remainder: buffer }
    eventLines = terminated.slice(0, lastBoundary)
    // Rebuild from the raw lines so the carried-over text is byte-exact.
    remainder = rawLines.slice(lastBoundary + 1).join('\n')
  }

  let currentSeq: number | undefined
  let dataLines: string[] = []

  const flush = () => {
    if (dataLines.length > 0) {
      const payload = dataLines.join('\n')
      if (payload !== '[DONE]') {
        try {
          events.push({ data: JSON.parse(payload) as T, seq: currentSeq })
        } catch {
          // Best-effort: payloads that are not valid JSON are skipped.
        }
      }
    }
    currentSeq = undefined
    dataLines = []
  }

  for (const line of eventLines) {
    if (line === '') {
      flush()
      continue
    }
    const id = fieldValue(line, 'id')
    if (id !== undefined) {
      const n = Number.parseInt(id.trim(), 10)
      if (Number.isFinite(n)) currentSeq = n
      continue
    }
    const data = fieldValue(line, 'data')
    if (data !== undefined) dataLines.push(data)
  }
  // Emit the event that precedes the final boundary (the boundary's blank
  // line itself is not part of `eventLines`).
  flush()

  return { events, remainder }
}
|
||||
|
||||
export async function consumeSSEStream<T>(
|
||||
response: Response,
|
||||
onEvent: (event: T) => void,
|
||||
onEvent: (event: T, meta: { seq?: number }) => void,
|
||||
signal?: AbortSignal,
|
||||
): Promise<void> {
|
||||
const reader = response.body?.getReader()
|
||||
@@ -49,7 +95,7 @@ export async function consumeSSEStream<T>(
|
||||
buffer = remainder
|
||||
|
||||
for (const event of events) {
|
||||
onEvent(event)
|
||||
onEvent(event.data, { seq: event.seq })
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
@@ -64,7 +110,7 @@ export async function consumeSSEStream<T>(
|
||||
if (buffer) {
|
||||
const { events } = parseSSELines<T>(buffer)
|
||||
for (const event of events) {
|
||||
onEvent(event)
|
||||
onEvent(event.data, { seq: event.seq })
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
51
packages/browseros-agent/apps/eval/.env.example
vendored
Normal file
51
packages/browseros-agent/apps/eval/.env.example
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
# Copy to .env.development for local eval runs.
|
||||
|
||||
# Provider keys used by existing config files.
|
||||
OPENROUTER_API_KEY=
|
||||
FIREWORKS_API_KEY=
|
||||
ANTHROPIC_API_KEY=
|
||||
OPENAI_API_KEY=
|
||||
GOOGLE_GENERATIVE_AI_API_KEY=
|
||||
|
||||
# Claude Agent SDK token used by performance_grader.
|
||||
CLAUDE_CODE_OAUTH_TOKEN=
|
||||
|
||||
# Suite-mode model selection.
|
||||
EVAL_VARIANT=local
|
||||
EVAL_AGENT_PROVIDER=openai-compatible
|
||||
EVAL_AGENT_MODEL=
|
||||
EVAL_AGENT_API_KEY=
|
||||
EVAL_AGENT_BASE_URL=
|
||||
EVAL_AGENT_SUPPORTS_IMAGES=true
|
||||
|
||||
# Optional suite-mode executor override for orchestrator suites.
|
||||
EVAL_EXECUTOR_MODEL=
|
||||
EVAL_EXECUTOR_API_KEY=
|
||||
EVAL_EXECUTOR_BASE_URL=
|
||||
|
||||
# Clado visual action executor.
|
||||
CLADO_ACTION_MODEL=
|
||||
CLADO_ACTION_API_KEY=
|
||||
CLADO_ACTION_BASE_URL=
|
||||
# Backward-compatible alias used by older local scripts.
|
||||
CLADO_ACTION_URL=
|
||||
|
||||
# BrowserOS runner.
|
||||
BROWSEROS_BINARY=/Applications/BrowserOS.app/Contents/MacOS/BrowserOS
|
||||
BROWSEROS_SERVER_URL=http://127.0.0.1:9110
|
||||
BROWSEROS_SERVER_LOG_DIR=/tmp/browseros-server-logs
|
||||
BROWSEROS_CONFIG_URL=
|
||||
|
||||
# Captcha solver extension.
|
||||
NOPECHA_API_KEY=
|
||||
|
||||
# WebArena-Infinity.
|
||||
WEBARENA_INFINITY_DIR=
|
||||
INFINITY_APP_URL=
|
||||
|
||||
# R2 publishing and weekly report.
|
||||
EVAL_R2_ACCOUNT_ID=
|
||||
EVAL_R2_ACCESS_KEY_ID=
|
||||
EVAL_R2_SECRET_ACCESS_KEY=
|
||||
EVAL_R2_BUCKET=browseros-eval
|
||||
EVAL_R2_CDN_BASE_URL=https://eval.browseros.com
|
||||
88
packages/browseros-agent/apps/eval/README.md
vendored
88
packages/browseros-agent/apps/eval/README.md
vendored
@@ -14,6 +14,7 @@ Evaluation framework for BrowserOS browser automation agents. Runs tasks from st
|
||||
|
||||
```bash
|
||||
cd apps/eval
|
||||
cp .env.example .env.development
|
||||
# Edit .env.development with your keys, then:
|
||||
bun run eval
|
||||
```
|
||||
@@ -23,11 +24,55 @@ Opens the eval dashboard at `http://localhost:9900` in config mode. From there:
|
||||
### CLI mode
|
||||
|
||||
```bash
|
||||
bun run eval -c configs/browseros-agent-weekly.json
|
||||
bun run eval -c configs/legacy/browseros-agent-weekly.json
|
||||
bun run eval suite --config configs/legacy/browseros-agent-weekly.json --publish r2
|
||||
```
|
||||
|
||||
Runs immediately. Dashboard still available at `http://localhost:9900` for live progress.
|
||||
|
||||
The `suite` command is the workflow-compatible full loop: execute tasks, run graders, write artifacts, and optionally publish to R2. The old `-c` form remains supported during migration.
|
||||
|
||||
```bash
|
||||
bun run eval run --config configs/legacy/browseros-agent-weekly.json
|
||||
bun run eval suite --suite configs/suites/agisdk-daily-10.json --variant kimi-fireworks --publish r2
|
||||
bun run eval grade --run results/browseros-agent-weekly/2026-04-29-1430
|
||||
bun run eval publish --run results/browseros-agent-weekly/2026-04-29-1430 --target r2
|
||||
```
|
||||
|
||||
Config files live in two groups:
|
||||
|
||||
```txt
|
||||
configs/legacy/ # Complete EvalConfig files used by older workflows and the dashboard
|
||||
configs/suites/ # Suite definitions; model/provider comes from CLI flags or env
|
||||
```
|
||||
|
||||
Suite mode takes model settings from CLI flags first, then env:
|
||||
|
||||
```bash
|
||||
EVAL_VARIANT=kimi-fireworks \
|
||||
EVAL_AGENT_PROVIDER=openai-compatible \
|
||||
EVAL_AGENT_MODEL=accounts/fireworks/models/kimi-k2p5 \
|
||||
EVAL_AGENT_API_KEY=$FIREWORKS_API_KEY \
|
||||
EVAL_AGENT_BASE_URL=https://api.fireworks.ai/inference/v1 \
|
||||
bun run eval suite --suite configs/suites/agisdk-daily-10.json --publish r2
|
||||
```
|
||||
|
||||
### Suites and variants
|
||||
|
||||
A **suite** is what we run: the task dataset, graders, worker count, timeout, and browser settings. For example, `agisdk-daily-10` means "run these 10 AGI SDK tasks and grade them with `agisdk_state_diff`."
|
||||
|
||||
A **variant** is the model setup we are testing on that suite. `EVAL_VARIANT` is just the human-readable name for that setup. The actual model connection still comes from `EVAL_AGENT_PROVIDER`, `EVAL_AGENT_MODEL`, `EVAL_AGENT_API_KEY`, and `EVAL_AGENT_BASE_URL`.
|
||||
|
||||
This lets us run the same suite against multiple model setups without copying the benchmark config:
|
||||
|
||||
```txt
|
||||
agisdk-daily-10 + kimi-fireworks
|
||||
agisdk-daily-10 + claude-sonnet
|
||||
agisdk-daily-10 + clado-action-000159
|
||||
```
|
||||
|
||||
For `orchestrator-executor` suites, there can also be an executor model/backend. The `EVAL_AGENT_*` vars describe the main agent or orchestrator. The optional `EVAL_EXECUTOR_*` or `CLADO_ACTION_*` vars describe the delegated executor.
|
||||
|
||||
## Agent types
|
||||
|
||||
| Type | Description |
|
||||
@@ -66,9 +111,9 @@ The orchestrator works with any LLM provider. The executor can be another LLM, o
|
||||
},
|
||||
"executor": {
|
||||
"provider": "clado-action",
|
||||
"model": "qwen3-vl-30b-a3b-instruct",
|
||||
"model": "Qwen3.5-35B-A3B-action-000159-merged",
|
||||
"apiKey": "",
|
||||
"baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run"
|
||||
"baseUrl": "https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -96,6 +141,20 @@ The `apiKey` field supports two formats:
|
||||
- **Env var name**: `"OPENAI_API_KEY"` — resolved from `.env.development` at runtime
|
||||
- **Direct value**: `"sk-xxxxx"` — used as-is (not recommended)
|
||||
|
||||
### Environment variables
|
||||
|
||||
| Variable | Used for |
|
||||
|----------|----------|
|
||||
| `EVAL_AGENT_PROVIDER`, `EVAL_AGENT_MODEL`, `EVAL_AGENT_API_KEY`, `EVAL_AGENT_BASE_URL`, `EVAL_AGENT_SUPPORTS_IMAGES` | Suite variant model selection |
|
||||
| `FIREWORKS_API_KEY`, `OPENROUTER_API_KEY`, `ANTHROPIC_API_KEY`, provider-specific keys | Config-file or provider-backed model calls |
|
||||
| `EVAL_EXECUTOR_MODEL`, `EVAL_EXECUTOR_API_KEY`, `EVAL_EXECUTOR_BASE_URL` | Suite-mode orchestrator executor override |
|
||||
| `CLADO_ACTION_MODEL`, `CLADO_ACTION_API_KEY`, `CLADO_ACTION_BASE_URL` | Clado executor defaults |
|
||||
| `BROWSEROS_BINARY` | BrowserOS binary path in CI/local smoke runs |
|
||||
| `BROWSEROS_SERVER_URL` | Optional grader MCP URL override |
|
||||
| `WEBARENA_INFINITY_DIR` | Local WebArena-Infinity checkout for Infinity tasks |
|
||||
| `NOPECHA_API_KEY` | CAPTCHA solver extension |
|
||||
| `EVAL_R2_ACCOUNT_ID`, `EVAL_R2_ACCESS_KEY_ID`, `EVAL_R2_SECRET_ACCESS_KEY`, `EVAL_R2_BUCKET`, `EVAL_R2_CDN_BASE_URL` | R2 upload and viewer URL |
|
||||
|
||||
### Supported providers
|
||||
|
||||
| Provider | `provider` value | Requires `baseUrl` |
|
||||
@@ -110,6 +169,20 @@ The `apiKey` field supports two formats:
|
||||
| Ollama | `ollama` | No |
|
||||
| Clado Action (executor only) | `clado-action` | Yes |
|
||||
|
||||
### R2 publishing
|
||||
|
||||
`suite --config ... --publish r2` and `publish --target r2` upload the run artifacts plus `viewer.html` to the viewer-compatible R2 layout:
|
||||
|
||||
```bash
|
||||
export EVAL_R2_ACCOUNT_ID=...
|
||||
export EVAL_R2_ACCESS_KEY_ID=...
|
||||
export EVAL_R2_SECRET_ACCESS_KEY=...
|
||||
export EVAL_R2_BUCKET=browseros-eval
|
||||
export EVAL_R2_CDN_BASE_URL=https://eval.browseros.com
|
||||
```
|
||||
|
||||
Published runs are available at `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
|
||||
|
||||
### BrowserOS infrastructure
|
||||
|
||||
```json
|
||||
@@ -137,10 +210,12 @@ Each worker gets its own Chrome instance. Worker N uses `base_port + N` for CDP
|
||||
|
||||
| File | Tasks | Description |
|
||||
|------|-------|-------------|
|
||||
| `agisdk-daily-10.jsonl` | 10 | Daily AGI SDK / REAL Bench subset |
|
||||
| `webvoyager.jsonl` | 643 | Full WebVoyager benchmark |
|
||||
| `mind2web.jsonl` | 300 | Online-Mind2Web |
|
||||
| `webbench-{0,1,2}of4-50.jsonl` | 50 each | WebBench shards (50-task subsets) |
|
||||
| `agisdk-real.jsonl` | 40 | AGI SDK / REAL Bench (action-only tasks) |
|
||||
| `agisdk-real-smoke.jsonl` | 1 | AGI SDK / REAL Bench smoke task |
|
||||
| `agisdk-real.jsonl` | 36 | AGI SDK / REAL Bench (action-only tasks) |
|
||||
| `webarena-infinity-hard-50.jsonl` | 50 | WebArena-Infinity hard set |
|
||||
| `browsecomp-medium-hard-50.jsonl` | 50 | BrowseComp medium-hard |
|
||||
| `browsecomp-very-hard-50.jsonl` | 50 | BrowseComp very-hard |
|
||||
@@ -167,14 +242,19 @@ results/
|
||||
browseros-agent-weekly/
|
||||
2026-04-29-1430/
|
||||
Amazon--0/
|
||||
attempt.json # Stable attempt summary for viewer/reporting
|
||||
metadata.json # Task result, timing, grader scores
|
||||
grades.json # Compact grader results
|
||||
messages.jsonl # Full message log
|
||||
grader-artifacts/ # Grader-specific inputs/outputs/stderr
|
||||
screenshots/
|
||||
001.png # Step-by-step screenshots
|
||||
002.png
|
||||
summary.json # Aggregate pass rates
|
||||
```
|
||||
|
||||
R2 publishing preserves the same task files under `runs/<run-id>/...`, writes `runs/<run-id>/manifest.json`, and uploads `viewer.html` at the bucket root. The viewer URL is `EVAL_R2_CDN_BASE_URL/viewer.html?run=<run-id>`.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**BrowserOS not found**: Expects `/Applications/BrowserOS.app/Contents/MacOS/BrowserOS`. Set `BROWSEROS_BINARY` to override.
|
||||
|
||||
@@ -7,8 +7,8 @@
|
||||
"baseUrl": "https://openrouter.ai/api/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/agisdk-real.jsonl",
|
||||
"num_workers": 10,
|
||||
"dataset": "../../data/agisdk-real-smoke.jsonl",
|
||||
"num_workers": 1,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
26
packages/browseros-agent/apps/eval/configs/legacy/agisdk-real.json
vendored
Normal file
26
packages/browseros-agent/apps/eval/configs/legacy/agisdk-real.json
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "single",
|
||||
"provider": "openai-compatible",
|
||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||
"apiKey": "FIREWORKS_API_KEY",
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../../data/agisdk-real.jsonl",
|
||||
"num_workers": 4,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["agisdk_state_diff"],
|
||||
"timeout_ms": 1800000
|
||||
}
|
||||
@@ -7,7 +7,7 @@
|
||||
"baseUrl": "https://openrouter.ai/api/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/webbench-2of4-50.jsonl",
|
||||
"dataset": "../../data/webbench-2of4-50.jsonl",
|
||||
"num_workers": 10,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
@@ -14,7 +14,7 @@
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1"
|
||||
}
|
||||
},
|
||||
"dataset": "../data/webbench-2of4-50.jsonl",
|
||||
"dataset": "../../data/webbench-2of4-50.jsonl",
|
||||
"num_workers": 10,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
@@ -9,12 +9,12 @@
|
||||
},
|
||||
"executor": {
|
||||
"provider": "clado-action",
|
||||
"model": "qwen3-vl-30b-a3b-instruct",
|
||||
"model": "Qwen3.5-35B-A3B-action-000159-merged",
|
||||
"apiKey": "",
|
||||
"baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run"
|
||||
"baseUrl": "https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run"
|
||||
}
|
||||
},
|
||||
"dataset": "../data/webbench-2of4-50.jsonl",
|
||||
"dataset": "../../data/agisdk-real.jsonl",
|
||||
"num_workers": 10,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
@@ -23,11 +23,11 @@
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
"headless": true
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["performance_grader"],
|
||||
"graders": ["agisdk_state_diff"],
|
||||
"timeout_ms": 1800000
|
||||
}
|
||||
@@ -7,7 +7,7 @@
|
||||
"baseUrl": "https://openrouter.ai/api/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/webarena-infinity-hard-50.jsonl",
|
||||
"dataset": "../../data/webarena-infinity-hard-50.jsonl",
|
||||
"num_workers": 10,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
@@ -5,7 +5,7 @@
|
||||
"model": "openai/gpt-4.1",
|
||||
"apiKey": "OPENROUTER_API_KEY"
|
||||
},
|
||||
"dataset": "../data/mind2web.jsonl",
|
||||
"dataset": "../../data/mind2web.jsonl",
|
||||
"num_workers": 5,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
@@ -7,7 +7,7 @@
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/webvoyager.jsonl",
|
||||
"dataset": "../../data/webvoyager.jsonl",
|
||||
"num_workers": 3,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
22
packages/browseros-agent/apps/eval/configs/suites/agisdk-daily-10.json
vendored
Normal file
22
packages/browseros-agent/apps/eval/configs/suites/agisdk-daily-10.json
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"id": "agisdk-daily-10",
|
||||
"dataset": "../../data/agisdk-daily-10.jsonl",
|
||||
"agent": {
|
||||
"type": "single"
|
||||
},
|
||||
"graders": ["agisdk_state_diff"],
|
||||
"workers": 1,
|
||||
"restartBrowserPerTask": true,
|
||||
"timeoutMs": 1800000,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": true
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
}
|
||||
}
|
||||
22
packages/browseros-agent/apps/eval/configs/suites/agisdk-real-smoke.json
vendored
Normal file
22
packages/browseros-agent/apps/eval/configs/suites/agisdk-real-smoke.json
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"id": "agisdk-real-smoke",
|
||||
"dataset": "../../data/agisdk-real-smoke.jsonl",
|
||||
"agent": {
|
||||
"type": "single"
|
||||
},
|
||||
"graders": ["agisdk_state_diff"],
|
||||
"workers": 1,
|
||||
"restartBrowserPerTask": true,
|
||||
"timeoutMs": 1800000,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
}
|
||||
}
|
||||
22
packages/browseros-agent/apps/eval/configs/suites/agisdk-real.json
vendored
Normal file
22
packages/browseros-agent/apps/eval/configs/suites/agisdk-real.json
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"id": "agisdk-real",
|
||||
"dataset": "../../data/agisdk-real.jsonl",
|
||||
"agent": {
|
||||
"type": "single"
|
||||
},
|
||||
"graders": ["agisdk_state_diff"],
|
||||
"workers": 1,
|
||||
"restartBrowserPerTask": true,
|
||||
"timeoutMs": 1800000,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
}
|
||||
}
|
||||
10
packages/browseros-agent/apps/eval/data/agisdk-daily-10.jsonl
vendored
Normal file
10
packages/browseros-agent/apps/eval/data/agisdk-daily-10.jsonl
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
{"query_id": "agisdk-dashdish-10", "dataset": "agisdk-real", "query": "Place an order from \"Souvla\" for a \"Medium Classic Cheeseburger\" and a \"Small Bacon Double Cheeseburger\" with \"Standard Delivery\" as the method with the default charged options.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-10", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
|
||||
{"query_id": "agisdk-fly-unified-5", "dataset": "agisdk-real", "query": "Find me the cheapest fare for a flight from Orlando to Milwaukee on December 5th, 2024 and book it.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/30, Security Code: 420 Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-5", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "United Airlines"}}}
|
||||
{"query_id": "agisdk-udriver-10", "dataset": "agisdk-real", "query": "Order me a ride for 4pm, I'll be at the de Young muesum headed to the Waterbar, fanciest option possible please.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-10", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
|
||||
{"query_id": "agisdk-udriver-9", "dataset": "agisdk-real", "query": "Book me a ride from the thai restaurant I last took a ride to for later today at 2pm, I'll be at 333 Apartments on Fremont", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-9", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-9", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Uber"}}}
|
||||
{"query_id": "agisdk-topwork-4", "dataset": "agisdk-real", "query": "Create a job post for a UI/UX Designer with expertise in Figma, Sketch, and Adobe Creative Suite, including project details, timeline, and required skills (Wireframing, Prototyping, Responsive Design).", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-4", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
|
||||
{"query_id": "agisdk-gocalendar-4", "dataset": "agisdk-real", "query": "Change the \"Team Check-In\" event on July 18, 2024, name to \"Project Kickoff\" and update the location to \"Zoom\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-4", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
|
||||
{"query_id": "agisdk-staynb-6", "dataset": "agisdk-real", "query": "Find and book the stay with the best value for money (cheapest stay with the best reviews) for 1 day. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-6", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-6", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
|
||||
{"query_id": "agisdk-udriver-11", "dataset": "agisdk-real", "query": "I need to go from Pacific Catch on Chestnut back home to 333 Fremont now. If the fancy version is within ten dollars of the regular one, book that.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-11", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-11", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
|
||||
{"query_id": "agisdk-networkin-5", "dataset": "agisdk-real", "query": "Send a connection request to John Smith.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-5", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-5", "challenge_type": "action", "difficulty": "easy", "similar_to": "LinkedIn"}}}
|
||||
{"query_id": "agisdk-zilloft-6", "dataset": "agisdk-real", "query": "Select a property listed in San Francisco as \"Condos\" within a price range under $300,000 and request a tour for tomorrow at 4:00 PM. Use these contact details: Name: Sarah Brown, Email: sarahbrown@example.com, Phone: 555-987-6543.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-6", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "Zillow"}}}
|
||||
1
packages/browseros-agent/apps/eval/data/agisdk-real-smoke.jsonl
vendored
Normal file
1
packages/browseros-agent/apps/eval/data/agisdk-real-smoke.jsonl
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"query_id": "agisdk-dashdish-10", "dataset": "agisdk-real", "query": "Place an order from \"Souvla\" for a \"Medium Classic Cheeseburger\" and a \"Small Bacon Double Cheeseburger\" with \"Standard Delivery\" as the method with the default charged options.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-10", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
|
||||
@@ -32,9 +32,5 @@
|
||||
{"query_id": "agisdk-networkin-10", "dataset": "agisdk-real", "query": "Generate a polite follow-up message for a previous unanswered chat, starting with \"Following up on\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-10", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-10", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
|
||||
{"query_id": "agisdk-gomail-3", "dataset": "agisdk-real", "query": "Compose a new email to jonathan.smith@example.com with the subject \"Meeting Notes\" and body \"Please find the meeting notes attached.\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-3", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "Gmail"}}}
|
||||
{"query_id": "agisdk-udriver-6", "dataset": "agisdk-real", "query": "Me and 4 friends need a ride from the Palace Hotel to dinner at Osha Thai leaving now", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-6", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-6", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
|
||||
{"query_id": "agisdk-staynb-9", "dataset": "agisdk-real", "query": "Book a stay with the maximum number of guests supported. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-9", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-9", "challenge_type": "action", "difficulty": "hard", "similar_to": "Airbnb"}}}
|
||||
{"query_id": "agisdk-zilloft-3", "dataset": "agisdk-real", "query": "Find a home in San Diego priced under $150,000 with at least 2 bedrooms and request a tour. Use these details: Contact Name: John Doe, Email: johndoe@example.com, Phone: 555-123-4567, Tour Time: 2:00 PM, Tour Date: First available.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-3", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-3", "challenge_type": "retrieval-action", "difficulty": "easy", "similar_to": "Zillow"}}}
|
||||
{"query_id": "agisdk-fly-unified-6", "dataset": "agisdk-real", "query": "Reserve me a seat for the flight from Austin to Pittsburgh departing on December 11th, 2024 at 8:00 in Basic Economy.\nPassenger: Alice Brown\nDate of Birth: 05/20/1992\nSex: Female\nSeat Selection: Yes (Aisle seat)\nPayment: Credit Card (378342143523967), Exp: 09/27, security code: 332 Address: 789 Pine St, Los Angeles, CA, 90012, USA, Phone: 555-456-7890, Email: alicebrown@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-6", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "United Airlines"}}}
|
||||
{"query_id": "agisdk-opendining-3", "dataset": "agisdk-real", "query": "Book a table at \"The Royal Dine\" for a party of 4 on July 20, 2024, at 7 PM. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-3", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "OpenTable"}}}
|
||||
{"query_id": "agisdk-gocalendar-7", "dataset": "agisdk-real", "query": "Reschedule the \"Morning Coffee with sister\" event from July 18, 2024, at 9 AM to July 19, 2024, at 10AM using drag-and-drop functionality", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-7", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-7", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
|
||||
{"query_id": "agisdk-staynb-5", "dataset": "agisdk-real", "query": "Use the search bar to look for a stay. For the \"Where\" section, use the \"Search by region\" popover and select \"Europe\". Set the check-in date to October 13th and the check-out date to October 23rd. For the \"Who\" section, select 1 infant, 2 children, and 2 adults. Press the search button, select the first stay, and book it.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-5", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-5", "challenge_type": "action", "difficulty": "medium", "similar_to": "Airbnb"}}}
|
||||
|
||||
@@ -64,6 +64,37 @@ EXCLUDED_TASKS = {
|
||||
# the grader's first criterion (search history contains "stanford") was
|
||||
# never triggered server-side. Eval-site bug.
|
||||
"networkin-9",
|
||||
# Goal text instructs "move event to July 19, 10 AM" but the grader expects
|
||||
# `eventsDiff.updated.*.start == "2024-07-18T17:00:00Z"` (= July 18, 10 AM
|
||||
# PDT — same day, 1 hour shift). Goal contradicts grader: following the
|
||||
# goal yields July 19 timestamps; satisfying the grader requires ignoring
|
||||
# the explicit "to July 19" instruction. Confirmed via 8-trial deep-dive:
|
||||
# never passed even after the Phase 2 HTML5 dnd dispatch fix made the drag
|
||||
# actually populate `eventsDiff.updated` (now produces July 19 values, but
|
||||
# grader rejects them).
|
||||
"gocalendar-7",
|
||||
# Grader hardcodes literal year strings `'Oct 13 2025'` / `'Oct 23 2025'`
|
||||
# in checkin/checkout criteria. Today is 2026, and the staynb date picker
|
||||
# interprets bare "Oct 13" as the most recent past instance — currently
|
||||
# 2024, not 2025. Even a perfectly-acting agent cannot produce a booking
|
||||
# whose persisted date contains "2025". Confirmed via 8 trials, 0 passes.
|
||||
"staynb-5",
|
||||
# Goal says "maximum number of guests supported"; grader expects the very
|
||||
# specific string "32 Guests, 16 Infants" — which requires the agent to
|
||||
# know that (a) Adults+Children sum into the displayed "Guests" count,
|
||||
# (b) Infants render separately, (c) Pets are excluded, (d) per-category
|
||||
# cap is 16 despite no UI affordance signalling it. None of this is in
|
||||
# the prompt. 8 trials, 0 passes; even Opus 4.6 stopped at 16 (one
|
||||
# category maxed). Task is under-specified relative to grader expectation.
|
||||
"staynb-9",
|
||||
# Grader requires `contains(booking.date, '2024-07-20')` but the eval-site
|
||||
# date picker is a React-controlled textbox that the agent's `fill` tool
|
||||
# frequently no-ops on. 3 of 8 trials passed (when fill happened to stick),
|
||||
# 5 failed with `actual_value: False` (booking persisted with the eval-site
|
||||
# default search date, not Jul 20). Effectively a coin-flip task that
|
||||
# exercises tool-fidelity flakiness rather than agent capability —
|
||||
# contributes noise, not signal. Excluding for eval reliability.
|
||||
"opendining-3",
|
||||
}
|
||||
|
||||
# Far-future replacement used by `freshen_goal_dates` when a task's hardcoded
|
||||
|
||||
@@ -1,34 +1,73 @@
|
||||
/**
|
||||
* Test script for Clado API endpoints (grounding + action models)
|
||||
* Smoke-test for the Clado BrowserOS Action endpoint.
|
||||
*
|
||||
* Health-checks the model, then runs a generate call and prints every
|
||||
* field the new contract documents (action, coordinates, text, key,
|
||||
* direction, scroll/drag fields, wait, end+final_answer, thinking,
|
||||
* parse_error, raw_response).
|
||||
*
|
||||
* Usage:
|
||||
* bun apps/eval/scripts/test-clado-api.ts [screenshot-path]
|
||||
*
|
||||
* If no screenshot provided, captures one from a running BrowserOS server.
|
||||
* If no screenshot path is given, captures one over MCP from a
|
||||
* running BrowserOS server (default http://127.0.0.1:9110, override
|
||||
* with BROWSEROS_URL).
|
||||
*
|
||||
* Cold start can take ~5 minutes; the script waits up to 6.
|
||||
*/
|
||||
|
||||
import { readFile } from 'node:fs/promises'
|
||||
import { resolve } from 'node:path'
|
||||
|
||||
const ACTION_URL =
|
||||
'https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run'
|
||||
'https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run'
|
||||
const ACTION_HEALTH_URL =
|
||||
'https://clado-ai--clado-browseros-action-actionmodel-health.modal.run'
|
||||
const GROUNDING_URL =
|
||||
'https://clado-ai--clado-browseros-grounding-groundingmodel-generate.modal.run'
|
||||
const GROUNDING_HEALTH_URL =
|
||||
'https://clado-ai--clado-browseros-grounding-groundingmodel-health.modal.run'
|
||||
'https://clado-ai--clado-browseros-action-000159-merged-actionmod-5e5033.modal.run'
|
||||
|
||||
async function checkHealth(name: string, url: string): Promise<boolean> {
|
||||
console.log(`\n--- ${name} health check ---`)
|
||||
console.log(` URL: ${url}`)
|
||||
const COLD_START_BUDGET_MS = 360_000 // 6 min — Clado cold start is ~5 min
|
||||
const COLD_START_WARN_MS = 30_000
|
||||
|
||||
/**
 * Shape of the JSON body returned by the Clado action endpoint, as consumed
 * by this smoke test. Every field is optional because which ones are
 * populated depends on the action the model chose.
 */
interface CladoResponse {
  // --- Envelope -----------------------------------------------------------
  /** Action name (e.g. 'click', 'type', 'scroll', 'drag', 'wait', 'end'). */
  action?: string | null
  /** Model reasoning text, when the endpoint returns it. */
  thinking?: string | null
  /** Unparsed model output. */
  raw_response?: string
  /** Set when the endpoint could not parse the model output. */
  parse_error?: string | null
  /** Inference time reported by the endpoint, in seconds. */
  inference_time_seconds?: number
  /** Final answer text accompanying an 'end' action. */
  final_answer?: string | null

  // --- Pointer actions (click / double_click / right_click / hover) -------
  x?: number
  y?: number

  // --- Keyboard actions ---------------------------------------------------
  /** Text for a 'type' action. */
  text?: string
  /** Key for a 'press_key' action. */
  key?: string

  // --- Scroll -------------------------------------------------------------
  direction?: string
  // NOTE(review): presumably the scroll distance — confirm against the API contract.
  amount?: number

  // --- Drag ---------------------------------------------------------------
  startX?: number
  startY?: number
  endX?: number
  endY?: number

  // --- Wait ---------------------------------------------------------------
  /** Wait duration; printed by this script with an "s" suffix. */
  time?: number
}
|
||||
|
||||
async function checkHealth(): Promise<boolean> {
|
||||
console.log(`\n--- Action model health ---`)
|
||||
console.log(` URL: ${ACTION_HEALTH_URL}`)
|
||||
console.log(
|
||||
` Note: cold start can take ~5 min; waiting up to ${COLD_START_BUDGET_MS / 1000}s.`,
|
||||
)
|
||||
const start = performance.now()
|
||||
const warn = setTimeout(() => {
|
||||
console.log(
|
||||
` ...still waiting (${COLD_START_WARN_MS / 1000}s in) — model is likely cold-starting on Modal.`,
|
||||
)
|
||||
}, COLD_START_WARN_MS)
|
||||
|
||||
try {
|
||||
const resp = await fetch(url, { signal: AbortSignal.timeout(30_000) })
|
||||
const resp = await fetch(ACTION_HEALTH_URL, {
|
||||
signal: AbortSignal.timeout(COLD_START_BUDGET_MS),
|
||||
})
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
const body = await resp.text()
|
||||
console.log(` Status: ${resp.status} (${elapsed}s)`)
|
||||
console.log(` Body: ${body.slice(0, 200)}`)
|
||||
console.log(` Body: ${body.slice(0, 400)}`)
|
||||
return resp.ok
|
||||
} catch (err) {
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
@@ -36,63 +75,34 @@ async function checkHealth(name: string, url: string): Promise<boolean> {
|
||||
` FAILED (${elapsed}s): ${err instanceof Error ? err.message : err}`,
|
||||
)
|
||||
return false
|
||||
} finally {
|
||||
clearTimeout(warn)
|
||||
}
|
||||
}
|
||||
|
||||
async function testGenerate(
|
||||
name: string,
|
||||
url: string,
|
||||
async function generate(
|
||||
label: string,
|
||||
payload: Record<string, unknown>,
|
||||
): Promise<Record<string, unknown> | null> {
|
||||
console.log(`\n--- ${name} generate ---`)
|
||||
console.log(` URL: ${url}`)
|
||||
): Promise<CladoResponse | null> {
|
||||
console.log(`\n--- ${label} ---`)
|
||||
console.log(` URL: ${ACTION_URL}`)
|
||||
console.log(` Instruction: ${payload.instruction}`)
|
||||
console.log(
|
||||
` Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`,
|
||||
` Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`,
|
||||
)
|
||||
if (payload.history) console.log(` History: ${payload.history}`)
|
||||
if (payload.history && payload.history !== 'None') {
|
||||
console.log(` History: ${payload.history}`)
|
||||
}
|
||||
|
||||
const start = performance.now()
|
||||
let resp: Response
|
||||
try {
|
||||
const resp = await fetch(url, {
|
||||
resp = await fetch(ACTION_URL, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify(payload),
|
||||
signal: AbortSignal.timeout(120_000),
|
||||
signal: AbortSignal.timeout(COLD_START_BUDGET_MS),
|
||||
})
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
|
||||
if (!resp.ok) {
|
||||
const body = await resp.text()
|
||||
console.log(` FAILED: HTTP ${resp.status} (${elapsed}s)`)
|
||||
console.log(` Body: ${body.slice(0, 400)}`)
|
||||
return null
|
||||
}
|
||||
|
||||
const result = (await resp.json()) as Record<string, unknown>
|
||||
console.log(` Status: ${resp.status} (${elapsed}s)`)
|
||||
console.log(` Action: ${result.action}`)
|
||||
if (result.x !== null && result.x !== undefined)
|
||||
console.log(` Coordinates: (${result.x}, ${result.y})`)
|
||||
if (result.text)
|
||||
console.log(` Text: ${(result.text as string).slice(0, 100)}`)
|
||||
if (result.key) console.log(` Key: ${result.key}`)
|
||||
if (result.inference_time_seconds)
|
||||
console.log(` Inference: ${result.inference_time_seconds}s`)
|
||||
|
||||
// Show thinking if present
|
||||
const raw = result.raw_response as string | undefined
|
||||
if (raw) {
|
||||
const thinkMatch = raw.match(/<thinking>([\s\S]*?)<\/thinking>/)
|
||||
if (thinkMatch) {
|
||||
const thinking = thinkMatch[1].trim()
|
||||
console.log(
|
||||
` Thinking: ${thinking.slice(0, 200)}${thinking.length > 200 ? '...' : ''}`,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
} catch (err) {
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
console.log(
|
||||
@@ -100,6 +110,50 @@ async function testGenerate(
|
||||
)
|
||||
return null
|
||||
}
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
|
||||
if (!resp.ok) {
|
||||
const body = await resp.text()
|
||||
console.log(` HTTP ${resp.status} ${resp.statusText} (${elapsed}s)`)
|
||||
console.log(` Body: ${body.slice(0, 400)}`)
|
||||
return null
|
||||
}
|
||||
|
||||
const result = (await resp.json()) as CladoResponse
|
||||
console.log(` HTTP ${resp.status} (${elapsed}s)`)
|
||||
console.log(` action: ${result.action ?? 'null'}`)
|
||||
if (result.parse_error) {
|
||||
console.log(` parse_error: ${result.parse_error}`)
|
||||
}
|
||||
if (result.thinking) {
|
||||
const trimmed = result.thinking.replace(/\s+/g, ' ').trim()
|
||||
console.log(
|
||||
` thinking: ${trimmed.slice(0, 240)}${trimmed.length > 240 ? '…' : ''}`,
|
||||
)
|
||||
}
|
||||
if (typeof result.x === 'number' || typeof result.y === 'number') {
|
||||
console.log(` x, y: ${result.x}, ${result.y}`)
|
||||
}
|
||||
if (typeof result.text === 'string')
|
||||
console.log(` text: ${result.text.slice(0, 120)}`)
|
||||
if (typeof result.key === 'string')
|
||||
console.log(` key: ${result.key}`)
|
||||
if (typeof result.direction === 'string')
|
||||
console.log(` direction: ${result.direction}`)
|
||||
if (typeof result.amount === 'number')
|
||||
console.log(` amount: ${result.amount}`)
|
||||
if (typeof result.startX === 'number' || typeof result.endX === 'number') {
|
||||
console.log(
|
||||
` drag: (${result.startX}, ${result.startY}) → (${result.endX}, ${result.endY})`,
|
||||
)
|
||||
}
|
||||
if (typeof result.time === 'number')
|
||||
console.log(` time: ${result.time}s`)
|
||||
if (result.final_answer)
|
||||
console.log(` final_answer: ${result.final_answer.slice(0, 240)}`)
|
||||
if (typeof result.inference_time_seconds === 'number')
|
||||
console.log(` inference_time_seconds: ${result.inference_time_seconds}`)
|
||||
return result
|
||||
}
|
||||
|
||||
async function loadScreenshot(path?: string): Promise<string> {
|
||||
@@ -110,10 +164,9 @@ async function loadScreenshot(path?: string): Promise<string> {
|
||||
return data.toString('base64')
|
||||
}
|
||||
|
||||
// Try to capture from a running BrowserOS server
|
||||
const serverUrl = process.env.BROWSEROS_URL || 'http://127.0.0.1:9110'
|
||||
console.log(
|
||||
`No screenshot path provided. Trying to capture from ${serverUrl}...`,
|
||||
`No screenshot path provided. Capturing from ${serverUrl} via MCP...`,
|
||||
)
|
||||
|
||||
const { Client } = await import('@modelcontextprotocol/sdk/client/index.js')
|
||||
@@ -134,82 +187,101 @@ async function loadScreenshot(path?: string): Promise<string> {
|
||||
arguments: { format: 'png', page: 1 },
|
||||
})) as { content: Array<{ type: string; data?: string }> }
|
||||
|
||||
const imageContent = result.content?.find((c) => c.type === 'image')
|
||||
if (!imageContent?.data)
|
||||
throw new Error('No image data in screenshot response')
|
||||
const image = result.content?.find((c) => c.type === 'image')
|
||||
if (!image?.data)
|
||||
throw new Error('No image data in take_screenshot response')
|
||||
|
||||
console.log(
|
||||
`Captured screenshot (${(imageContent.data.length / 1024).toFixed(0)} KB base64)`,
|
||||
`Captured screenshot (${(image.data.length / 1024).toFixed(0)} KB base64)`,
|
||||
)
|
||||
return imageContent.data
|
||||
return image.data
|
||||
} finally {
|
||||
try {
|
||||
await transport.close()
|
||||
} catch {}
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Renders the accumulated model responses as the compact history string
 * sent back to the endpoint on the next call.
 *
 * @param history Responses collected so far, oldest first.
 * @returns 'None' when there is no history; otherwise one call-like token
 *   per response (e.g. `click(10, 20)`), joined with ' -> '.
 */
function summarize(history: CladoResponse[]): string {
  if (history.length === 0) return 'None'

  // Actions that are described purely by a single (x, y) point.
  const pointActions = ['click', 'double_click', 'right_click', 'hover']

  const render = (step: CladoResponse): string => {
    const name = step.action
    if (typeof name === 'string' && pointActions.includes(name)) {
      return `${name}(${step.x}, ${step.y})`
    }
    if (name === 'type') return `type(${JSON.stringify(step.text ?? '')})`
    if (name === 'press_key') {
      return `press_key(${JSON.stringify(step.key ?? '')})`
    }
    if (name === 'scroll') return `scroll(${step.direction ?? 'down'})`
    if (name === 'drag') {
      return `drag(${step.startX},${step.startY} -> ${step.endX},${step.endY})`
    }
    if (name === 'wait') return `wait(${step.time ?? 1}s)`
    if (name === 'end') return 'end()'
    // Unknown action names pass through verbatim; a missing one is 'invalid'.
    return name ?? 'invalid'
  }

  return history.map((step) => render(step)).join(' -> ')
}
|
||||
|
||||
async function main() {
|
||||
const screenshotPath = process.argv[2]
|
||||
console.log('=== Clado action endpoint smoke test ===')
|
||||
|
||||
console.log('=== Clado API Test ===\n')
|
||||
|
||||
// Health checks (parallel)
|
||||
const [actionHealthy, groundingHealthy] = await Promise.all([
|
||||
checkHealth('Action Model', ACTION_HEALTH_URL),
|
||||
checkHealth('Grounding Model', GROUNDING_HEALTH_URL),
|
||||
])
|
||||
|
||||
if (!actionHealthy && !groundingHealthy) {
|
||||
console.log('\nBoth endpoints are down. Exiting.')
|
||||
const healthy = await checkHealth()
|
||||
if (!healthy) {
|
||||
console.log('\nHealth check failed. Exiting.')
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
// Load screenshot
|
||||
let imageBase64: string
|
||||
try {
|
||||
imageBase64 = await loadScreenshot(screenshotPath)
|
||||
imageBase64 = await loadScreenshot(process.argv[2])
|
||||
} catch (err) {
|
||||
console.log(
|
||||
`\nFailed to load screenshot: ${err instanceof Error ? err.message : err}`,
|
||||
)
|
||||
console.log(
|
||||
'Provide a screenshot path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
|
||||
'Pass a path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
|
||||
)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
const instruction = 'Click on the search button or search bar'
|
||||
const history: CladoResponse[] = []
|
||||
|
||||
// Test grounding model
|
||||
if (groundingHealthy) {
|
||||
await testGenerate('Grounding Model', GROUNDING_URL, {
|
||||
instruction,
|
||||
// Step 1: open task — let the model decide what to do.
|
||||
const step1 = await generate('Step 1: cold task', {
|
||||
instruction: 'Find the search bar and click it',
|
||||
image_base64: imageBase64,
|
||||
history: 'None',
|
||||
})
|
||||
if (step1?.action) history.push(step1)
|
||||
|
||||
// Step 2: continuation with history, asks for typing.
|
||||
if (step1?.action) {
|
||||
const step2 = await generate('Step 2: with history', {
|
||||
instruction: 'Type "hello world" into the search bar',
|
||||
image_base64: imageBase64,
|
||||
history: summarize(history),
|
||||
})
|
||||
} else {
|
||||
console.log('\nSkipping grounding model (unhealthy)')
|
||||
if (step2?.action) history.push(step2)
|
||||
}
|
||||
|
||||
// Test action model (no history)
|
||||
if (actionHealthy) {
|
||||
const result = await testGenerate('Action Model (step 1)', ACTION_URL, {
|
||||
instruction,
|
||||
image_base64: imageBase64,
|
||||
history: 'None',
|
||||
})
|
||||
|
||||
// Test action model with history (simulate multi-turn)
|
||||
if (result && result.action === 'click') {
|
||||
await testGenerate('Action Model (step 2, with history)', ACTION_URL, {
|
||||
instruction: 'Type "hello world" in the search bar',
|
||||
image_base64: imageBase64,
|
||||
history: `click(${result.x}, ${result.y})`,
|
||||
})
|
||||
}
|
||||
} else {
|
||||
console.log('\nSkipping action model (unhealthy)')
|
||||
}
|
||||
// Step 3: ask for end with a final answer to exercise that field.
|
||||
await generate('Step 3: ask for end+final_answer', {
|
||||
instruction:
|
||||
'You have completed the task. Reply with end() and final_answer="done".',
|
||||
image_base64: imageBase64,
|
||||
history: summarize(history),
|
||||
})
|
||||
|
||||
console.log('\n=== Done ===')
|
||||
}
|
||||
|
||||
@@ -1,349 +1,43 @@
|
||||
#!/usr/bin/env bun
|
||||
|
||||
/**
|
||||
* Upload eval runs to R2.
|
||||
*
|
||||
* Two modes:
|
||||
* bun scripts/upload-run.ts results/browseros-agent-weekly/2026-03-21-1730
|
||||
* → uploads that specific run
|
||||
*
|
||||
* bun scripts/upload-run.ts results/browseros-agent-weekly
|
||||
* → finds all timestamped subfolders, uploads any not yet in R2
|
||||
*
|
||||
* Env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY
|
||||
* EVAL_R2_BUCKET (default: browseros-eval)
|
||||
* EVAL_R2_CDN_BASE_URL (default: https://eval.browseros.com)
|
||||
*/
|
||||
|
||||
import { readdir, readFile, stat } from 'node:fs/promises'
|
||||
import { basename, dirname, extname, join } from 'node:path'
|
||||
import {
|
||||
GetObjectCommand,
|
||||
PutObjectCommand,
|
||||
S3Client,
|
||||
} from '@aws-sdk/client-s3'
|
||||
loadR2ConfigFromEnv,
|
||||
R2Publisher,
|
||||
} from '../src/publishing/r2-publisher'
|
||||
|
||||
const CONCURRENCY = 20
|
||||
|
||||
const CONTENT_TYPES: Record<string, string> = {
|
||||
'.json': 'application/json',
|
||||
'.jsonl': 'application/x-ndjson',
|
||||
'.png': 'image/png',
|
||||
}
|
||||
|
||||
/** Connection settings for the R2 bucket that eval runs are uploaded to. */
interface R2Config {
  // Credentials
  /** Account id; forms the host part of the R2 endpoint URL. */
  accountId: string
  accessKeyId: string
  secretAccessKey: string

  // Destination
  /** Target bucket name. */
  bucket: string
  /** Public base URL used to print viewer links. */
  cdnBaseUrl: string
}
|
||||
|
||||
/**
 * Builds the R2 configuration from environment variables.
 *
 * Required: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID,
 * EVAL_R2_SECRET_ACCESS_KEY. If any is missing or empty, an error is printed
 * and the process exits with status 1.
 *
 * Optional: EVAL_R2_BUCKET and EVAL_R2_CDN_BASE_URL, which fall back to the
 * defaults below when unset or empty.
 *
 * @returns The resolved configuration; `cdnBaseUrl` has trailing slashes removed.
 */
function loadConfig(): R2Config {
  const {
    EVAL_R2_ACCOUNT_ID: accountId,
    EVAL_R2_ACCESS_KEY_ID: accessKeyId,
    EVAL_R2_SECRET_ACCESS_KEY: secretAccessKey,
  } = process.env

  if (!(accountId && accessKeyId && secretAccessKey)) {
    console.error(
      'Missing required env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY',
    )
    process.exit(1)
  }

  const bucket = process.env.EVAL_R2_BUCKET || 'browseros-eval'
  const rawCdnBase =
    process.env.EVAL_R2_CDN_BASE_URL || 'https://eval.browseros.com'

  return {
    accountId,
    accessKeyId,
    secretAccessKey,
    bucket,
    // Drop trailing slashes so callers can append "/path" directly.
    cdnBaseUrl: rawCdnBase.replace(/\/+$/, ''),
  }
}
|
||||
|
||||
/**
 * Creates an S3 client pointed at the account's R2 endpoint.
 *
 * @param config Resolved R2 settings (account id and key pair are used here).
 * @returns A client configured with region 'auto' and the given credentials.
 */
function createClient(config: R2Config): S3Client {
  const { accountId, accessKeyId, secretAccessKey } = config
  const endpoint = `https://${accountId}.r2.cloudflarestorage.com`
  const credentials = { accessKeyId, secretAccessKey }
  return new S3Client({ region: 'auto', endpoint, credentials })
}
|
||||
|
||||
/**
 * Writes a single object to the bucket.
 *
 * @param client S3 client to send the request with.
 * @param bucket Destination bucket name.
 * @param key Object key within the bucket.
 * @param body Object contents.
 * @param contentType Value stored as the object's Content-Type.
 */
async function upload(
  client: S3Client,
  bucket: string,
  key: string,
  body: Buffer,
  contentType: string,
) {
  const putRequest = new PutObjectCommand({
    Bucket: bucket,
    Key: key,
    Body: body,
    ContentType: contentType,
  })
  await client.send(putRequest)
}
|
||||
|
||||
/**
 * Recursively lists every non-directory entry beneath `dir`.
 *
 * @param dir Directory to walk.
 * @returns Paths (each prefixed with `dir`) in directory-listing order, with
 *   the contents of a subdirectory appearing at that subdirectory's position.
 */
async function collectFiles(dir: string): Promise<string[]> {
  const collected: string[] = []
  for (const dirent of await readdir(dir, { withFileTypes: true })) {
    const childPath = join(dir, dirent.name)
    if (!dirent.isDirectory()) {
      collected.push(childPath)
      continue
    }
    // Descend one subdirectory at a time so output order stays deterministic.
    const nested = await collectFiles(childPath)
    for (const nestedPath of nested) collected.push(nestedPath)
  }
  return collected
}
|
||||
|
||||
/**
 * Runs `fn` over every item with at most `concurrency` calls in flight.
 *
 * Workers pull the next index from a shared cursor, so items are started in
 * array order but may finish in any order.
 *
 * @param items Work items; an empty array resolves immediately.
 * @param concurrency Desired number of parallel workers. Values below 1 (or
 *   NaN) are treated as 1 so that items are never silently skipped, and the
 *   worker count is capped at `items.length`.
 * @param fn Async handler invoked once per item. The first rejection rejects
 *   the returned promise.
 */
async function runPool<T>(
  items: T[],
  concurrency: number,
  fn: (item: T) => Promise<void>,
) {
  if (items.length === 0) return

  // `Math.floor(NaN) || 1` and `0 || 1` both fall back to a single worker;
  // Math.max handles negatives; Math.min avoids allocating idle workers
  // (and an invalid array length for Infinity).
  const workerCount = Math.min(
    items.length,
    Math.max(1, Math.floor(concurrency) || 1),
  )

  let cursor = 0
  const workers = Array.from({ length: workerCount }, async () => {
    while (cursor < items.length) {
      // Claim the index synchronously before awaiting so no two workers
      // ever process the same item.
      const idx = cursor++
      await fn(items[idx])
    }
  })
  await Promise.all(workers)
}
|
||||
|
||||
/**
 * Checks whether a run has already been uploaded to R2 by requesting its
 * manifest object (`runs/<runId>/manifest.json`).
 *
 * This stays best-effort: any failure yields `false`, so the caller simply
 * attempts the upload. Failures that are not a plain "object not found" are
 * logged, so that bad credentials or network problems are not silently
 * mistaken for "not uploaded yet".
 *
 * @param client S3 client to query with.
 * @param bucket Bucket to look in.
 * @param runId Run identifier used to build the manifest key.
 * @returns `true` when the manifest request succeeds, otherwise `false`.
 */
async function isUploaded(
  client: S3Client,
  bucket: string,
  runId: string,
): Promise<boolean> {
  try {
    await client.send(
      new GetObjectCommand({
        Bucket: bucket,
        Key: `runs/${runId}/manifest.json`,
      }),
    )
    return true
  } catch (err: unknown) {
    const errorName = err instanceof Error ? err.name : undefined
    let httpStatus: unknown
    if (typeof err === 'object' && err !== null && '$metadata' in err) {
      const metadata = (err as { $metadata?: { httpStatusCode?: unknown } })
        .$metadata
      httpStatus = metadata?.httpStatusCode
    }

    const notFound =
      errorName === 'NoSuchKey' || errorName === 'NotFound' || httpStatus === 404
    if (!notFound) {
      const detail = err instanceof Error ? err.message : String(err)
      console.warn(
        `Could not check R2 for ${runId} (${detail}); treating it as not uploaded`,
      )
    }
    return false
  }
}
|
||||
|
||||
/**
 * Tells a run directory apart from a config directory.
 *
 * A run directory has at least one immediate subdirectory containing a
 * regular file named `metadata.json`; a config directory instead holds
 * timestamped run subdirectories such as `2026-03-21-1730/`.
 *
 * @param dir Directory to inspect.
 * @returns `true` as soon as one task-style subdirectory is found.
 */
async function isRunDir(dir: string): Promise<boolean> {
  const children = await readdir(dir, { withFileTypes: true })
  for (const child of children) {
    if (!child.isDirectory()) continue
    try {
      const info = await stat(join(dir, child.name, 'metadata.json'))
      if (info.isFile()) return true
    } catch {
      // stat failed for this subdirectory's metadata.json; try the next one.
    }
  }
  return false
}
|
||||
|
||||
/**
 * Uploads one run directory to R2.
 *
 * Steps, in order:
 *   1. Every immediate subdirectory is treated as a task. Tasks whose
 *      `metadata.json` cannot be read or parsed are skipped.
 *   2. All files under each remaining task are uploaded to
 *      `runs/<runId>/<taskId>/<relative path>` through the worker pool.
 *   3. `runs/<runId>/manifest.json` is written, describing the tasks and,
 *      when `summary.json` is readable, its pass rate and average duration.
 *   4. `viewer.html` is (re)uploaded to the bucket root.
 *
 * @param runDir Local path of the run directory.
 * @param runId Identifier used as the key prefix and in the viewer link.
 * @param r2Config Bucket name and CDN base URL.
 * @param client S3 client used for all uploads.
 */
async function uploadSingleRun(
  runDir: string,
  runId: string,
  r2Config: R2Config,
  client: S3Client,
): Promise<void> {
  const dirents = await readdir(runDir, { withFileTypes: true })
  const taskIds = dirents.filter((d) => d.isDirectory()).map((d) => d.name)

  if (taskIds.length === 0) {
    console.warn(` No task subdirectories in ${runId}, skipping`)
    return
  }

  const manifestTasks: Record<string, unknown>[] = []
  const jobs: { key: string; filePath: string; contentType: string }[] = []

  // Taken from the first task whose metadata carries them.
  let agentConfig: Record<string, unknown> | undefined
  let dataset: string | undefined

  for (const taskId of taskIds) {
    const taskPath = join(runDir, taskId)

    let meta: Record<string, unknown>
    try {
      meta = JSON.parse(
        await readFile(join(taskPath, 'metadata.json'), 'utf-8'),
      )
    } catch {
      // No usable metadata: this task is left out of the upload entirely.
      continue
    }

    if (!agentConfig && meta.agent_config) {
      agentConfig = meta.agent_config as Record<string, unknown>
    }
    if (!dataset && meta.dataset) {
      dataset = meta.dataset as string
    }

    let screenshotCount = 0
    for (const file of await collectFiles(taskPath)) {
      const relative = file.slice(taskPath.length + 1)
      const ext = extname(file)
      if (ext === '.png' && relative.startsWith('screenshots/')) {
        screenshotCount += 1
      }
      jobs.push({
        key: `runs/${runId}/${taskId}/${relative}`,
        filePath: file,
        contentType: CONTENT_TYPES[ext] || 'application/octet-stream',
      })
    }

    manifestTasks.push({
      queryId: meta.query_id || taskId,
      query: meta.query || '',
      startUrl: meta.start_url || '',
      // Any truthy termination reason (including 'completed') is reported as-is.
      status: meta.termination_reason || 'unknown',
      durationMs: meta.total_duration_ms || 0,
      screenshotCount: (meta.screenshot_count as number) || screenshotCount,
      graderResults: meta.grader_results || {},
    })
  }

  if (manifestTasks.length === 0) {
    console.warn(` No completed tasks in ${runId}, skipping`)
    return
  }

  console.log(
    ` Uploading ${jobs.length} files across ${manifestTasks.length} tasks...`,
  )

  let uploaded = 0
  await runPool(jobs, CONCURRENCY, async (job) => {
    const body = await readFile(job.filePath)
    await upload(client, r2Config.bucket, job.key, body, job.contentType)
    uploaded += 1
    const atMilestone = uploaded % 50 === 0
    if (atMilestone || uploaded === jobs.length) {
      console.log(` ${uploaded}/${jobs.length}`)
    }
  })

  // summary.json is optional; the manifest simply omits the summary without it.
  let summaryData: Record<string, unknown> | undefined
  try {
    const summaryText = await readFile(join(runDir, 'summary.json'), 'utf-8')
    summaryData = JSON.parse(summaryText)
  } catch {}

  const summary = summaryData
    ? {
        passRate: summaryData.passRate,
        avgDurationMs: summaryData.avgDurationMs,
      }
    : undefined

  const manifest = {
    runId,
    uploadedAt: new Date().toISOString(),
    agentConfig,
    dataset,
    summary,
    tasks: manifestTasks,
  }
  await upload(
    client,
    r2Config.bucket,
    `runs/${runId}/manifest.json`,
    Buffer.from(JSON.stringify(manifest, null, 2)),
    'application/json',
  )

  // The viewer page lives at the bucket root and is refreshed on every upload.
  const viewerPath = join(
    import.meta.dir,
    '..',
    'src',
    'dashboard',
    'viewer.html',
  )
  const viewerBody = await readFile(viewerPath)
  await upload(client, r2Config.bucket, 'viewer.html', viewerBody, 'text/html')

  // +2 accounts for manifest.json and viewer.html.
  console.log(` Uploaded ${uploaded + 2} files`)
  console.log(` ${r2Config.cdnBaseUrl}/viewer.html?run=${runId}`)
}
|
||||
|
||||
async function main() {
|
||||
async function main(): Promise<void> {
|
||||
const inputDir = process.argv[2]
|
||||
if (!inputDir) {
|
||||
console.error(
|
||||
throw new Error(
|
||||
'Usage:\n' +
|
||||
' bun scripts/upload-run.ts results/config-name/2026-03-21-1730 (specific run)\n' +
|
||||
' bun scripts/upload-run.ts results/config-name (all un-uploaded runs)',
|
||||
)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
const dirStat = await stat(inputDir).catch(() => null)
|
||||
if (!dirStat?.isDirectory()) {
|
||||
console.error(`Not a directory: ${inputDir}`)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
const r2Config = loadConfig()
|
||||
const client = createClient(r2Config)
|
||||
|
||||
if (await isRunDir(inputDir)) {
|
||||
// Single run: results/config-name/2026-03-21-1730
|
||||
const timestamp = basename(inputDir)
|
||||
const configName = basename(dirname(inputDir))
|
||||
const runId = `${configName}-${timestamp}`
|
||||
console.log(`Uploading run: ${runId}`)
|
||||
await uploadSingleRun(inputDir, runId, r2Config, client)
|
||||
} else {
|
||||
// Config dir: results/config-name/ — upload all un-uploaded runs
|
||||
const configName = basename(inputDir)
|
||||
const entries = await readdir(inputDir, { withFileTypes: true })
|
||||
const runDirs = entries
|
||||
.filter((e) => e.isDirectory())
|
||||
.map((e) => e.name)
|
||||
.sort()
|
||||
|
||||
if (runDirs.length === 0) {
|
||||
console.error('No run subdirectories found')
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
console.log(
|
||||
`Found ${runDirs.length} runs for config "${configName}", checking R2...`,
|
||||
)
|
||||
|
||||
let uploadedCount = 0
|
||||
for (const dir of runDirs) {
|
||||
const runId = `${configName}-${dir}`
|
||||
const alreadyUploaded = await isUploaded(client, r2Config.bucket, runId)
|
||||
if (alreadyUploaded) {
|
||||
console.log(` ${runId}: already uploaded, skipping`)
|
||||
continue
|
||||
}
|
||||
|
||||
console.log(` ${runId}: uploading...`)
|
||||
await uploadSingleRun(join(inputDir, dir), runId, r2Config, client)
|
||||
uploadedCount++
|
||||
}
|
||||
|
||||
console.log(
|
||||
`\nDone. Uploaded ${uploadedCount} new run(s), ${runDirs.length - uploadedCount} already in R2.`,
|
||||
' bun scripts/upload-run.ts results/config-name/2026-03-21-1730\n' +
|
||||
' bun scripts/upload-run.ts results/config-name',
|
||||
)
|
||||
}
|
||||
|
||||
const publisher = new R2Publisher({ config: loadR2ConfigFromEnv() })
|
||||
const result = await publisher.publishPath(inputDir)
|
||||
for (const run of result.uploadedRuns) {
|
||||
console.log(`Uploaded ${run.uploadedFiles} files for ${run.runId}`)
|
||||
console.log(run.viewerUrl)
|
||||
}
|
||||
for (const runId of result.skippedRuns) {
|
||||
console.log(`${runId}: already uploaded, skipping`)
|
||||
}
|
||||
console.log(
|
||||
`Done. Uploaded ${result.uploadedRuns.length} run(s), skipped ${result.skippedRuns.length}.`,
|
||||
)
|
||||
}
|
||||
|
||||
main()
|
||||
main().catch((error) => {
|
||||
console.error(error instanceof Error ? error.message : String(error))
|
||||
process.exit(1)
|
||||
})
|
||||
|
||||
191
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/clado-actions.ts
vendored
Normal file
191
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/clado-actions.ts
vendored
Normal file
@@ -0,0 +1,191 @@
|
||||
import type {
|
||||
CladoAction,
|
||||
CladoActionResponse,
|
||||
RawCladoActionPayload,
|
||||
} from './types'
|
||||
|
||||
/**
 * Builds the ordered list of executable actions for one Clado prediction.
 *
 * The first action is the first parsable `<answer>` block overlaid with the
 * top-level prediction fields (top-level fields win). Its `action` comes from
 * the top-level field when that is a string, otherwise from the `<answer>`
 * block. Every further `<answer>` block is appended in order, except when its
 * signature equals the signature of the action appended immediately before it.
 *
 * @param prediction - Response returned by the Clado action endpoint.
 * @returns Normalized actions; empty when nothing yields a usable `action`.
 */
export function parseCladoActions(
  prediction: CladoActionResponse,
): CladoAction[] {
  const answerBlocks = parseCladoActionsFromRawResponse(prediction.raw_response)
  const leadBlock = answerBlocks[0] ?? null
  const topLevelAction =
    typeof prediction.action === 'string' ? prediction.action : null

  const actions: CladoAction[] = []

  const lead = normalizeCladoActionPayload({
    ...leadBlock,
    ...prediction,
    action: topLevelAction ?? leadBlock?.action,
  })
  if (lead) actions.push(lead)

  for (const block of answerBlocks.slice(1)) {
    const next = normalizeCladoActionPayload(block)
    if (!next) continue

    // Only the immediately preceding action is compared, so A, B, A keeps
    // both As while A, A collapses to one.
    const last = actions[actions.length - 1]
    const repeatsLast =
      last !== undefined &&
      getCladoActionSignature(last) === getCladoActionSignature(next)
    if (!repeatsLast) actions.push(next)
  }

  return actions
}
|
||||
|
||||
/**
 * Converts a loosely typed payload into a `CladoAction`, keeping only fields
 * whose runtime type matches the expected one.
 *
 * @param payload - Candidate action fields (from the response or an `<answer>` block).
 * @returns The normalized action, or `null` when `action` is missing, empty,
 *   or not a string. Fields with the wrong runtime type become `undefined`.
 */
export function normalizeCladoActionPayload(
  payload: RawCladoActionPayload,
): CladoAction | null {
  const name = payload.action
  if (typeof name !== 'string' || !name) return null

  const asNumber = (value: unknown): number | undefined =>
    typeof value === 'number' ? value : undefined
  const asString = (value: unknown): string | undefined =>
    typeof value === 'string' ? value : undefined

  return {
    action: name,
    x: asNumber(payload.x),
    y: asNumber(payload.y),
    text: asString(payload.text),
    key: asString(payload.key),
    direction: asString(payload.direction),
    startX: asNumber(payload.startX),
    startY: asNumber(payload.startY),
    endX: asNumber(payload.endX),
    endY: asNumber(payload.endY),
    amount: asNumber(payload.amount),
    time: asNumber(payload.time),
    // A null final_answer is dropped here: only strings survive.
    final_answer: asString(payload.final_answer),
  }
}
|
||||
|
||||
/**
 * Extracts every `<answer>…</answer>` block (case-insensitive) from the raw
 * model output and parses its content as JSON.
 *
 * Blocks are skipped, without affecting the others, when they are not valid
 * JSON or when the parsed value is not a plain object (`null`, arrays,
 * numbers, strings, booleans). Such values cannot carry action fields, and a
 * `null` entry would make downstream property access throw.
 *
 * @param rawResponse - Raw text returned by the model, if any.
 * @returns Parsed object payloads in document order; empty when there is no input.
 */
export function parseCladoActionsFromRawResponse(
  rawResponse: string | undefined,
): RawCladoActionPayload[] {
  if (!rawResponse) return []

  const payloads: RawCladoActionPayload[] = []
  for (const match of rawResponse.matchAll(
    /<answer>\s*([\s\S]*?)\s*<\/answer>/gi,
  )) {
    let candidate: unknown
    try {
      candidate = JSON.parse(match[1] ?? '')
    } catch {
      // Ignore malformed answer blocks so one bad block does not drop the whole prediction.
      continue
    }

    const isPlainObject =
      candidate !== null &&
      typeof candidate === 'object' &&
      !Array.isArray(candidate)
    if (isPlainObject) payloads.push(candidate as RawCladoActionPayload)
  }
  return payloads
}
|
||||
|
||||
/**
 * Collects the text of every `<thinking>…</thinking>` block (case-insensitive)
 * in the raw model output into one line.
 *
 * Whitespace runs inside each block collapse to a single space, empty blocks
 * are dropped, and the remaining blocks are joined with a single space.
 *
 * @param rawResponse - Raw text returned by the model, if any.
 * @returns The merged thinking text, or `undefined` when there is none.
 */
export function extractCladoThinking(
  rawResponse: string | undefined,
): string | undefined {
  if (!rawResponse) return undefined

  const segments: string[] = []
  for (const match of rawResponse.matchAll(
    /<thinking>\s*([\s\S]*?)\s*<\/thinking>/gi,
  )) {
    const collapsed = match[1]?.replace(/\s+/g, ' ').trim() ?? ''
    if (collapsed.length > 0) segments.push(collapsed)
  }

  return segments.length > 0 ? segments.join(' ') : undefined
}
|
||||
|
||||
/**
 * Produces a compact copy of a prediction: the action fields and inference
 * time as-is, plus at most the first 240 characters of `raw_response`.
 *
 * @param prediction - Response returned by the Clado action endpoint.
 * @returns A plain record; `raw_response_preview` is `undefined` when the raw
 *   response is missing, empty, or not a string.
 */
export function summarizeCladoPrediction(
  prediction: CladoActionResponse,
): Record<string, unknown> {
  const {
    action,
    x,
    y,
    text,
    key,
    direction,
    startX,
    startY,
    endX,
    endY,
    amount,
    time,
    inference_time_seconds,
    raw_response: rawResponse,
  } = prediction

  let raw_response_preview: string | undefined
  if (typeof rawResponse === 'string' && rawResponse.length > 0) {
    raw_response_preview = rawResponse.slice(0, 240)
  }

  return {
    action,
    x,
    y,
    text,
    key,
    direction,
    startX,
    startY,
    endX,
    endY,
    amount,
    time,
    inference_time_seconds,
    raw_response_preview,
  }
}
|
||||
|
||||
/**
 * Returns a short string identifying an action together with the fields that
 * matter for its kind.
 *
 * Text-bearing kinds only use a prefix of the text (16 characters for `type`,
 * 32 for `end`, 40 for `invalid`), so long texts sharing that prefix produce
 * the same signature. Unknown kinds use the action name alone.
 *
 * @param action - Normalized action.
 * @returns The signature string.
 */
export function getCladoActionSignature(action: CladoAction): string {
  const kind = action.action

  if (
    kind === 'click' ||
    kind === 'double_click' ||
    kind === 'right_click' ||
    kind === 'hover'
  ) {
    return `${kind}:${action.x ?? 'x'}:${action.y ?? 'y'}`
  }
  if (kind === 'type') {
    const prefix = (action.text ?? '').slice(0, 16)
    return `${kind}:${prefix}`
  }
  if (kind === 'press_key') {
    return `${kind}:${action.key ?? 'key'}`
  }
  if (kind === 'scroll') {
    return `${kind}:${action.direction ?? 'down'}:${action.amount ?? 500}`
  }
  if (kind === 'drag') {
    // Missing coordinates are rendered literally as "undefined".
    return `${kind}:${action.startX}:${action.startY}:${action.endX}:${action.endY}`
  }
  if (kind === 'wait') {
    return `${kind}:${action.time ?? 1}`
  }
  if (kind === 'end') {
    if (!action.final_answer) return 'end()'
    return `end(${action.final_answer.slice(0, 32)})`
  }
  if (kind === 'invalid') {
    return `invalid(${(action.text ?? '').slice(0, 40)})`
  }
  return kind
}
|
||||
|
||||
/**
 * Renders the action history as a single ` -> `-separated string.
 *
 * Missing coordinates render as 500, a missing key as `Enter`, a missing
 * scroll direction as `down`, and a missing wait time as 1. `end` and
 * `invalid` are rendered without their payload, and unknown kinds as the bare
 * action name.
 *
 * @param actions - Actions taken so far, oldest first.
 * @returns `'None'` for an empty history, otherwise the rendered sequence.
 */
export function formatCladoHistory(actions: CladoAction[]): string {
  if (actions.length === 0) return 'None'

  const coord = (value: number | undefined): number => Math.round(value ?? 500)

  const rendered: string[] = []
  for (const entry of actions) {
    const kind = entry.action
    if (
      kind === 'click' ||
      kind === 'double_click' ||
      kind === 'right_click' ||
      kind === 'hover'
    ) {
      rendered.push(`${kind}(${coord(entry.x)}, ${coord(entry.y)})`)
    } else if (kind === 'type') {
      // Single quotes are backslash-escaped so the quoted text stays delimited.
      const escaped = (entry.text ?? '').replace(/'/g, "\\'")
      rendered.push(`type('${escaped}')`)
    } else if (kind === 'press_key') {
      rendered.push(`press_key('${entry.key ?? 'Enter'}')`)
    } else if (kind === 'scroll') {
      rendered.push(`scroll(${entry.direction ?? 'down'})`)
    } else if (kind === 'drag') {
      rendered.push(
        `drag(${coord(entry.startX)},${coord(entry.startY)} -> ${coord(entry.endX)},${coord(entry.endY)})`,
      )
    } else if (kind === 'wait') {
      rendered.push(`wait(${Math.round(entry.time ?? 1)}s)`)
    } else if (kind === 'end') {
      rendered.push('end()')
    } else if (kind === 'invalid') {
      rendered.push('invalid()')
    } else {
      rendered.push(kind)
    }
  }

  return rendered.join(' -> ')
}
|
||||
123
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/clado-browser-driver.ts
vendored
Normal file
123
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/clado-browser-driver.ts
vendored
Normal file
@@ -0,0 +1,123 @@
|
||||
import {
|
||||
CLADO_PAGE_SCOPED_TOOLS,
|
||||
type CladoActionPoint,
|
||||
type CladoViewport,
|
||||
} from './types'
|
||||
|
||||
/**
 * Rounds a coordinate to the nearest integer and clamps it to `[0, 999]`.
 *
 * @param value - Coordinate to clamp.
 * @returns The rounded, clamped coordinate.
 */
export function clampCladoNormalizedCoordinate(value: number): number {
  const rounded = Math.round(value)
  const atLeastZero = Math.max(0, rounded)
  return Math.min(999, atLeastZero)
}
|
||||
|
||||
/**
 * Converts Clado's 0-1000 normalized coordinate space into BrowserOS viewport pixels.
 *
 * Each axis defaults to 500 when missing, is clamped with
 * `clampCladoNormalizedCoordinate`, then scaled by the viewport extent and
 * rounded to an integer.
 *
 * @param viewport - Width and height to scale against.
 * @param normalizedX - Normalized horizontal coordinate, if provided.
 * @param normalizedY - Normalized vertical coordinate, if provided.
 * @returns The point in viewport units.
 */
export function resolveCladoPoint(
  viewport: CladoViewport,
  normalizedX: number | undefined,
  normalizedY: number | undefined,
): CladoActionPoint {
  const scale = (normalized: number | undefined, extent: number): number => {
    const clamped = clampCladoNormalizedCoordinate(normalized ?? 500)
    return Math.round((clamped / 1000) * extent)
  }

  return {
    x: scale(normalizedX, viewport.width),
    y: scale(normalizedY, viewport.height),
  }
}
|
||||
|
||||
/**
 * Adapts Clado action tool arguments to the BrowserOS MCP tool argument contract.
 *
 * Works on a shallow copy of `args` and applies three independent rewrites:
 * - `evaluate_script`: a string `function` becomes `expression` (via
 *   `toCladoEvaluateExpression`) unless `expression` is already set.
 * - `click_at`: a boolean `dblClick` becomes `clickCount` (2 or 1) unless
 *   `clickCount` is already set.
 * - page-scoped tools: `page` is set to `pageId` unless it is already a number.
 *
 * @param toolName - Name of the tool being called.
 * @param args - Arguments as produced for the tool call; not mutated.
 * @param pageId - Page id to inject for page-scoped tools.
 * @returns The adapted argument record.
 */
export function prepareCladoToolArgs(
  toolName: string,
  args: Record<string, unknown>,
  pageId: number,
): Record<string, unknown> {
  const prepared: Record<string, unknown> = { ...args }

  switch (toolName) {
    case 'evaluate_script': {
      const wantsExpression =
        typeof prepared.function === 'string' &&
        prepared.expression === undefined
      if (wantsExpression) {
        prepared.expression = toCladoEvaluateExpression(prepared.function)
        delete prepared.function
      }
      break
    }
    case 'click_at': {
      const wantsClickCount =
        typeof prepared.dblClick === 'boolean' &&
        prepared.clickCount === undefined
      if (wantsClickCount) {
        prepared.clickCount = prepared.dblClick ? 2 : 1
        delete prepared.dblClick
      }
      break
    }
    default:
      break
  }

  const missingPage = typeof prepared.page !== 'number'
  if (CLADO_PAGE_SCOPED_TOOLS.has(toolName) && missingPage) {
    prepared.page = pageId
  }

  return prepared
}
|
||||
|
||||
/**
 * Turns a function-like source string into an expression that invokes it.
 *
 * Zero-argument arrow functions and `function` expressions, optionally
 * `async`, are wrapped as `(<source>)()`. Anything else is returned trimmed
 * and unchanged.
 *
 * Compared with plain prefix checks this also handles `async function …`
 * (which would otherwise evaluate to an uncalled function object) and arrow
 * spacing variants such as `()=>`. It does not wrap identifiers that merely
 * begin with the word "function".
 *
 * @param rawFunction - Source text; coerced with `String()` and trimmed.
 * @returns The expression to evaluate.
 */
export function toCladoEvaluateExpression(rawFunction: unknown): string {
  const source = String(rawFunction).trim()

  // `() =>`, `()=>`, `async () =>`, `async() =>`
  const isZeroArgArrow = /^(?:async\s*)?\(\s*\)\s*=>/.test(source)
  // `function`, `function*`, `async function`; the \b rejects `functionName()`
  const isFunctionExpression = /^(?:async\s+)?function\b/.test(source)

  return isZeroArgArrow || isFunctionExpression ? `(${source})()` : source
}
|
||||
|
||||
/**
 * Shorthand key chords and the key strings they are rewritten to.
 *
 * Kept in a `Map` rather than an object literal so that lookups never fall
 * through to `Object.prototype` (e.g. a key named "constructor").
 */
const CLADO_PRESS_KEY_ALIASES: ReadonlyMap<string, string> = new Map<
  string,
  string
>([
  ['C-a', 'Control+A'],
  ['C-c', 'Control+C'],
  ['C-v', 'Control+V'],
  ['C-x', 'Control+X'],
  ['C-z', 'Control+Z'],
  ['C-y', 'Control+Y'],
  ['C-s', 'Control+S'],
  ['C-t', 'Control+T'],
  ['C-w', 'Control+W'],
  ['C-h', 'Control+H'],
  ['C-f', 'Control+F'],
  ['C-+', 'Control++'],
  ['C--', 'Control+-'],
  ['C-tab', 'Control+Tab'],
  ['C-S-tab', 'Control+Shift+Tab'],
  ['C-S-n', 'Control+Shift+N'],
  ['C-down', 'Control+ArrowDown'],
  ['M-a', 'Meta+A'],
  ['M-c', 'Meta+C'],
  ['M-v', 'Meta+V'],
  ['M-x', 'Meta+X'],
  // NOTE(review): unlike the other M-* entries this maps to Alt, not Meta;
  // preserved as-is — confirm it is intentional.
  ['M-f4', 'Alt+F4'],
])

/**
 * Translates a shorthand key chord into its expanded key string.
 *
 * The key is trimmed and looked up case-sensitively; keys without an alias
 * are returned trimmed and otherwise unchanged.
 *
 * @param key - Key name or shorthand chord from the predicted action.
 * @returns The expanded key string, or the trimmed input when no alias exists.
 * @throws Error when `key` is missing or blank.
 */
export function normalizeCladoPressKey(key: string | undefined): string {
  const raw = (key ?? '').trim()
  if (!raw) throw new Error('press_key action missing key field')

  return CLADO_PRESS_KEY_ALIASES.get(raw) ?? raw
}
|
||||
|
||||
/**
 * Narrows a free-form direction string to one of the four supported values.
 *
 * @param direction - Direction from the predicted action, if any.
 * @returns The direction when it is exactly `up`, `down`, `left` or `right`;
 *   otherwise `down`.
 */
export function normalizeCladoDirection(
  direction: string | undefined,
): 'up' | 'down' | 'left' | 'right' {
  switch (direction) {
    case 'up':
    case 'down':
    case 'left':
    case 'right':
      return direction
    default:
      return 'down'
  }
}
|
||||
|
||||
/**
 * Maps a predicted scroll amount onto the range `[100, 900]`.
 *
 * A missing amount yields 500 and a non-positive amount yields 100. Otherwise
 * the amount is capped at 1000, scaled by 900/1000, rounded, and raised to at
 * least 100.
 *
 * @param amount - Scroll amount from the predicted action, if any.
 * @returns The scroll amount to use.
 */
export function normalizeCladoScrollAmount(amount: number | undefined): number {
  if (typeof amount !== 'number') return 500
  if (amount <= 0) return 100

  const capped = Math.min(amount, 1000)
  const scaled = Math.round((capped / 1000) * 900)
  return Math.max(100, scaled)
}
|
||||
68
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/clado-client.ts
vendored
Normal file
68
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/clado-client.ts
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
import { CLADO_REQUEST_TIMEOUT_MS } from '../../../../constants'
|
||||
import { formatCladoHistory } from './clado-actions'
|
||||
import type { CladoAction, CladoActionResponse } from './types'
|
||||
|
||||
/** Connection settings for `CladoActionClient`. */
export interface CladoActionClientOptions {
  /** URL the prediction request is POSTed to; a request fails when it is unset. */
  baseUrl?: string
  /** When set, sent as an `Authorization: Bearer …` header. */
  apiKey?: string
}
|
||||
|
||||
/** Inputs for a single action-prediction request. */
export interface CladoActionPredictionInput {
  /** Sent as the `instruction` field of the request body. */
  instruction: string
  /** Sent as the `image_base64` field of the request body. */
  imageBase64: string
  /** Prior actions; formatted with `formatCladoHistory` into the `history` field. */
  actionHistory: CladoAction[]
  /** Optional caller signal; aborting it aborts the in-flight request. */
  signal?: AbortSignal
}
|
||||
|
||||
/** Calls the Clado action model without exposing credentials in process arguments or artifacts. */
export class CladoActionClient {
  constructor(private readonly options: CladoActionClientOptions) {}

  /**
   * POSTs one prediction request and returns the decoded JSON response.
   *
   * The request is aborted when the caller's signal aborts (including when it
   * is already aborted on entry) or after `CLADO_REQUEST_TIMEOUT_MS`,
   * whichever comes first. The timer and the abort listener are always
   * released before returning.
   *
   * @param input - Instruction, screenshot, action history and optional signal.
   * @returns The response body, typed as `CladoActionResponse` without validation.
   * @throws Error when `baseUrl` is not configured or the HTTP status is not
   *   OK (the message carries the status and up to 400 characters of body).
   *   Abort and network failures propagate from `fetch`.
   */
  async requestActionPrediction(
    input: CladoActionPredictionInput,
  ): Promise<CladoActionResponse> {
    const { baseUrl, apiKey } = this.options
    if (!baseUrl) {
      throw new Error('executor.baseUrl must be set for clado-action provider')
    }

    const requestController = new AbortController()
    const onAbort = () => requestController.abort()
    if (input.signal?.aborted) {
      // An already-aborted signal never fires 'abort' again, so a listener
      // alone would let the request run until the timeout.
      requestController.abort()
    } else {
      input.signal?.addEventListener('abort', onAbort, { once: true })
    }

    const timeoutHandle = setTimeout(() => {
      requestController.abort()
    }, CLADO_REQUEST_TIMEOUT_MS)

    try {
      const headers: Record<string, string> = {
        'Content-Type': 'application/json',
      }
      if (apiKey) {
        headers.Authorization = `Bearer ${apiKey}`
      }

      const response = await fetch(baseUrl, {
        method: 'POST',
        headers,
        body: JSON.stringify({
          instruction: input.instruction,
          image_base64: input.imageBase64,
          history: formatCladoHistory(input.actionHistory),
        }),
        signal: requestController.signal,
      })

      if (!response.ok) {
        const body = await response.text()
        throw new Error(
          `HTTP ${response.status} ${response.statusText}: ${body.slice(0, 400)}`,
        )
      }

      return (await response.json()) as CladoActionResponse
    } finally {
      clearTimeout(timeoutHandle)
      input.signal?.removeEventListener('abort', onAbort)
    }
  }
}
|
||||
78
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/types.ts
vendored
Normal file
78
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/clado/types.ts
vendored
Normal file
@@ -0,0 +1,78 @@
|
||||
/** Provider identifier that selects the Clado action backend. */
export const CLADO_ACTION_PROVIDER = 'clado-action'

/** Names of the tools that take a `page` argument identifying the target page. */
export const CLADO_PAGE_SCOPED_TOOLS = new Set<string>([
  // inspection
  'take_screenshot',
  'evaluate_script',
  // pointer
  'click',
  'click_at',
  'hover',
  'hover_at',
  // text entry
  'clear',
  'fill',
  'press_key',
  'type_at',
  // gestures
  'drag',
  'drag_at',
  'scroll',
  // page-level
  'handle_dialog',
  'select_option',
  'navigate_page',
  'close_page',
  'wait_for',
])
|
||||
|
||||
/**
 * JSON body returned by the Clado action endpoint.
 *
 * Every field is optional. The action fields mirror `CladoAction`, except
 * that `action` and `final_answer` may also be `null`.
 */
export interface CladoActionResponse {
  /** Predicted action name; may be `null` or absent. */
  action?: string | null
  x?: number
  y?: number
  text?: string
  key?: string
  direction?: string
  startX?: number
  startY?: number
  endX?: number
  endY?: number
  amount?: number
  time?: number
  final_answer?: string | null
  inference_time_seconds?: number
  /** Unprocessed model output; may contain `<answer>` and `<thinking>` blocks. */
  raw_response?: string
  thinking?: string | null
  parse_error?: string | null
}
|
||||
|
||||
/** Viewport dimensions that normalized coordinates are scaled against. */
export interface CladoViewport {
  width: number
  height: number
}
|
||||
|
||||
/**
 * A normalized action: a required `action` name plus whichever optional
 * fields that kind of action uses.
 */
export interface CladoAction {
  /** Action name, e.g. `click`, `type`, `scroll`, `drag`, `wait`, `end`. */
  action: string
  x?: number
  y?: number
  text?: string
  key?: string
  direction?: string
  startX?: number
  startY?: number
  endX?: number
  endY?: number
  amount?: number
  time?: number
  final_answer?: string
}
|
||||
|
||||
/**
 * Unvalidated action fields: every `CladoAction` field is optional, and
 * `final_answer` may additionally be `null`.
 */
export type RawCladoActionPayload = Partial<
  Omit<CladoAction, 'final_answer'>
> & {
  final_answer?: string | null
}
|
||||
|
||||
/** A resolved point with an `x` and a `y` coordinate. */
export interface CladoActionPoint {
  x: number
  y: number
}
|
||||
|
||||
/**
 * Reports whether a provider identifier selects the Clado action backend.
 *
 * @param provider - Provider identifier to check.
 * @returns `true` only for an exact match with `CLADO_ACTION_PROVIDER`.
 */
export function isCladoActionProvider(provider: string): boolean {
  const matches = provider === CLADO_ACTION_PROVIDER
  return matches
}
|
||||
45
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/create-executor-backend.ts
vendored
Normal file
45
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/create-executor-backend.ts
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
|
||||
import type { Browser } from '@browseros/server/browser'
|
||||
import type { ExecutorCallbacks } from '../../orchestrator-executor/executor'
|
||||
import type { ExecutorBackend, ExecutorBackendKind } from '../executor-backend'
|
||||
import { ExecutorAdapterBackend } from './tool-loop-backend'
|
||||
|
||||
/** Options accepted by `createExecutorBackend`. */
export interface CreateExecutorBackendOptions {
  /** Explicit backend kind; when omitted it is derived from the provider. */
  backendKind?: ExecutorBackendKind
  /** Provider name used to derive the kind; falls back to `configTemplate.provider`. */
  provider?: string
  configTemplate?: ResolvedAgentConfig
  browser?: Browser | null
  serverUrl?: string
  windowId?: number
  tabId?: number
  initialPageId?: number
  callbacks?: ExecutorCallbacks
  /** Pre-built executor, passed through to the backend unchanged. */
  executor?: ExecutorBackend
}
|
||||
|
||||
/**
 * Chooses the backend kind for a provider name.
 *
 * @param provider - Provider identifier from the agent configuration.
 * @returns `'clado'` for the `clado-action` provider, `'tool-loop'` for anything else.
 */
export function backendKindForProvider(provider: string): ExecutorBackendKind {
  if (provider === 'clado-action') return 'clado'
  return 'tool-loop'
}
|
||||
|
||||
/**
 * Creates the backend used for one orchestrator delegation.
 *
 * The backend kind is `options.backendKind` when given; otherwise it is
 * derived from `options.provider`, then `options.configTemplate.provider`,
 * then the empty string. All other options are forwarded unchanged.
 *
 * @param options - Backend selection and executor construction options.
 * @returns An `ExecutorAdapterBackend` of the resolved kind.
 */
export function createExecutorBackend(
  options: CreateExecutorBackendOptions,
): ExecutorBackend {
  const {
    backendKind,
    provider,
    configTemplate,
    browser,
    serverUrl,
    windowId,
    tabId,
    initialPageId,
    callbacks,
    executor,
  } = options

  const kind =
    backendKind ??
    backendKindForProvider(provider ?? configTemplate?.provider ?? '')

  return new ExecutorAdapterBackend({
    kind,
    configTemplate,
    browser,
    serverUrl,
    windowId,
    tabId,
    initialPageId,
    callbacks,
    executor,
  })
}
|
||||
72
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/tool-loop-backend.ts
vendored
Normal file
72
packages/browseros-agent/apps/eval/src/agents/orchestrated/backends/tool-loop-backend.ts
vendored
Normal file
@@ -0,0 +1,72 @@
|
||||
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
|
||||
import type { Browser } from '@browseros/server/browser'
|
||||
import {
|
||||
Executor,
|
||||
type ExecutorCallbacks,
|
||||
} from '../../orchestrator-executor/executor'
|
||||
import type {
|
||||
DelegationResult,
|
||||
ExecutorBackend,
|
||||
ExecutorBackendKind,
|
||||
} from '../executor-backend'
|
||||
|
||||
/** Minimal executor surface that `ExecutorAdapterBackend` delegates to. */
interface ExecutorRunner {
  execute(instruction: string, signal?: AbortSignal): Promise<DelegationResult>
  close(): Promise<void>
  getTotalSteps(): number
}
|
||||
|
||||
/** Construction options for `ExecutorAdapterBackend`. */
export interface ExecutorAdapterBackendOptions {
  /** Backend kind reported by the adapter. */
  kind: ExecutorBackendKind
  /** Required when `executor` is not supplied. */
  configTemplate?: ResolvedAgentConfig
  browser?: Browser | null
  /** Required when `executor` is not supplied. */
  serverUrl?: string
  windowId?: number
  tabId?: number
  initialPageId?: number
  callbacks?: ExecutorCallbacks
  /** Pre-built executor; when present no `Executor` is constructed. */
  executor?: ExecutorRunner
}
|
||||
|
||||
/**
 * `ExecutorBackend` that forwards every call to an underlying executor.
 *
 * The executor is either the one supplied in the options or a new `Executor`
 * built from them.
 */
export class ExecutorAdapterBackend implements ExecutorBackend {
  readonly kind: ExecutorBackendKind
  private readonly executor: ExecutorRunner

  /**
   * @param options - Backend kind plus either a ready executor or the pieces
   *   needed to build one.
   * @throws Error when no executor is supplied and `configTemplate` or
   *   `serverUrl` is undefined.
   */
  constructor(options: ExecutorAdapterBackendOptions) {
    this.kind = options.kind
    this.executor =
      options.executor ?? ExecutorAdapterBackend.buildExecutor(options)
  }

  /** Builds the default `Executor`; the Clado flag is set when the kind is `'clado'`. */
  private static buildExecutor(
    options: ExecutorAdapterBackendOptions,
  ): ExecutorRunner {
    const configTemplate = required(options.configTemplate, 'configTemplate')
    const browser = options.browser ?? null
    const serverUrl = required(options.serverUrl, 'serverUrl')

    return new Executor(configTemplate, browser, serverUrl, {
      isCladoAction: options.kind === 'clado',
      windowId: options.windowId,
      tabId: options.tabId,
      initialPageId: options.initialPageId,
      callbacks: options.callbacks,
    })
  }

  /** Forwards the instruction and optional signal to the underlying executor. */
  execute(
    instruction: string,
    signal?: AbortSignal,
  ): Promise<DelegationResult> {
    return this.executor.execute(instruction, signal)
  }

  /** Closes the underlying executor. */
  close(): Promise<void> {
    return this.executor.close()
  }

  /** Returns the step count reported by the underlying executor. */
  getTotalSteps(): number {
    return this.executor.getTotalSteps()
  }
}
|
||||
|
||||
/**
 * Returns `value` unless it is `undefined`.
 *
 * Only `undefined` is rejected; `null` and other falsy values pass through.
 *
 * @param value - Value to check.
 * @param name - Name used in the error message.
 * @throws Error with message `"<name> is required"` when `value` is `undefined`.
 */
function required<T>(value: T | undefined, name: string): T {
  if (value !== undefined) return value
  throw new Error(`${name} is required`)
}
|
||||
11
packages/browseros-agent/apps/eval/src/agents/orchestrated/executor-backend.ts
vendored
Normal file
11
packages/browseros-agent/apps/eval/src/agents/orchestrated/executor-backend.ts
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
import type { ExecutorResult } from '../orchestrator-executor/types'
|
||||
|
||||
/** The executor backend variants. */
export type ExecutorBackendKind = 'tool-loop' | 'clado'

/** Result of one delegated instruction; an alias of `ExecutorResult`. */
export type DelegationResult = ExecutorResult
|
||||
|
||||
/** Contract implemented by every executor backend. */
export interface ExecutorBackend {
  /** Which backend variant this is. */
  readonly kind: ExecutorBackendKind
  /** Runs one instruction, optionally cancellable through `signal`. */
  execute(instruction: string, signal?: AbortSignal): Promise<DelegationResult>
  close(): Promise<void>
  getTotalSteps(): number
}
|
||||
@@ -1,98 +1,47 @@
|
||||
import { randomUUID } from 'node:crypto'
|
||||
import {
|
||||
CLADO_REQUEST_TIMEOUT_MS,
|
||||
MAX_ACTIONS_PER_DELEGATION,
|
||||
} from '../../constants'
|
||||
import { MAX_ACTIONS_PER_DELEGATION } from '../../constants'
|
||||
import { McpClient, type McpToolResult } from '../../utils/mcp-client'
|
||||
import { sleep } from '../../utils/sleep'
|
||||
import {
|
||||
extractCladoThinking,
|
||||
formatCladoHistory,
|
||||
getCladoActionSignature,
|
||||
parseCladoActions,
|
||||
summarizeCladoPrediction,
|
||||
} from '../orchestrated/backends/clado/clado-actions'
|
||||
import {
|
||||
normalizeCladoDirection,
|
||||
normalizeCladoPressKey,
|
||||
normalizeCladoScrollAmount,
|
||||
prepareCladoToolArgs,
|
||||
resolveCladoPoint,
|
||||
} from '../orchestrated/backends/clado/clado-browser-driver'
|
||||
import { CladoActionClient } from '../orchestrated/backends/clado/clado-client'
|
||||
import {
|
||||
CLADO_ACTION_PROVIDER,
|
||||
type CladoAction,
|
||||
type CladoActionPoint,
|
||||
type CladoActionResponse,
|
||||
type CladoViewport,
|
||||
isCladoActionProvider,
|
||||
} from '../orchestrated/backends/clado/types'
|
||||
import type { ExecutorCallbacks } from './executor'
|
||||
import type { ExecutorConfig, ExecutorResult } from './types'
|
||||
|
||||
const CLADO_ACTION_PROVIDER = 'clado-action'
|
||||
const PAGE_SCOPED_TOOLS = new Set<string>([
|
||||
'take_screenshot',
|
||||
'evaluate_script',
|
||||
'click',
|
||||
'click_at',
|
||||
'hover',
|
||||
'hover_at',
|
||||
'clear',
|
||||
'fill',
|
||||
'press_key',
|
||||
'type_at',
|
||||
'drag',
|
||||
'drag_at',
|
||||
'scroll',
|
||||
'handle_dialog',
|
||||
'select_option',
|
||||
'navigate_page',
|
||||
'close_page',
|
||||
'wait_for',
|
||||
])
|
||||
|
||||
interface CladoActionResponse {
|
||||
action?: string
|
||||
x?: number
|
||||
y?: number
|
||||
text?: string
|
||||
key?: string
|
||||
direction?: string
|
||||
startX?: number
|
||||
startY?: number
|
||||
endX?: number
|
||||
endY?: number
|
||||
amount?: number
|
||||
time?: number
|
||||
inference_time_seconds?: number
|
||||
raw_response?: string
|
||||
}
|
||||
|
||||
interface Viewport {
|
||||
width: number
|
||||
height: number
|
||||
}
|
||||
|
||||
interface CladoAction {
|
||||
action: string
|
||||
x?: number
|
||||
y?: number
|
||||
text?: string
|
||||
key?: string
|
||||
direction?: string
|
||||
startX?: number
|
||||
startY?: number
|
||||
endX?: number
|
||||
endY?: number
|
||||
amount?: number
|
||||
time?: number
|
||||
}
|
||||
|
||||
type RawActionPayload = Partial<CladoAction>
|
||||
|
||||
interface ActionPoint {
|
||||
x: number
|
||||
y: number
|
||||
}
|
||||
const MAX_CONSECUTIVE_PARSE_FAILURES = 3
|
||||
|
||||
function asErrorMessage(error: unknown): string {
|
||||
return error instanceof Error ? error.message : String(error)
|
||||
}
|
||||
|
||||
function clampNormalized(value: number): number {
|
||||
return Math.min(999, Math.max(0, Math.round(value)))
|
||||
}
|
||||
|
||||
function isCladoProvider(provider: string): boolean {
|
||||
return provider === CLADO_ACTION_PROVIDER
|
||||
}
|
||||
|
||||
export class CladoActionExecutor {
|
||||
private readonly mcpClient: McpClient
|
||||
private readonly cladoClient: CladoActionClient
|
||||
private readonly pageId: number
|
||||
private callbacks: ExecutorCallbacks = {}
|
||||
private stepsUsed = 0
|
||||
private viewport: Viewport | null = null
|
||||
private lastPoint: ActionPoint | null = null
|
||||
private viewport: CladoViewport | null = null
|
||||
private lastPoint: CladoActionPoint | null = null
|
||||
private currentUrl = ''
|
||||
|
||||
constructor(
|
||||
@@ -102,12 +51,16 @@ export class CladoActionExecutor {
|
||||
readonly _tabId?: number,
|
||||
initialPageId?: number,
|
||||
) {
|
||||
if (!isCladoProvider(config.provider)) {
|
||||
if (!isCladoActionProvider(config.provider)) {
|
||||
throw new Error(
|
||||
`CladoActionExecutor requires provider="${CLADO_ACTION_PROVIDER}"`,
|
||||
)
|
||||
}
|
||||
this.mcpClient = new McpClient(`${serverUrl}/mcp`)
|
||||
this.cladoClient = new CladoActionClient({
|
||||
baseUrl: config.baseUrl,
|
||||
apiKey: config.apiKey,
|
||||
})
|
||||
this.pageId = initialPageId ?? 1
|
||||
}
|
||||
|
||||
@@ -135,6 +88,8 @@ export class CladoActionExecutor {
|
||||
const actionHistory: CladoAction[] = []
|
||||
let predictionCalls = 0
|
||||
const thinkingTrace: string[] = []
|
||||
let consecutiveParseFailures = 0
|
||||
let finalAnswer: string | undefined
|
||||
|
||||
let status: ExecutorResult['status'] = 'done'
|
||||
let reason = 'Goal executed.'
|
||||
@@ -155,7 +110,7 @@ export class CladoActionExecutor {
|
||||
break
|
||||
}
|
||||
|
||||
const historyForPrediction = this.formatHistory(actionHistory)
|
||||
const historyForPrediction = formatCladoHistory(actionHistory)
|
||||
const actionToolCallId = randomUUID()
|
||||
const predictionInput = {
|
||||
instruction,
|
||||
@@ -177,7 +132,7 @@ export class CladoActionExecutor {
|
||||
signal,
|
||||
)
|
||||
predictionCalls++
|
||||
const thinking = this.extractThinking(prediction.raw_response)
|
||||
const thinking = extractCladoThinking(prediction.raw_response)
|
||||
if (thinking) {
|
||||
const previous = thinkingTrace[thinkingTrace.length - 1]
|
||||
if (previous !== thinking) {
|
||||
@@ -207,8 +162,19 @@ export class CladoActionExecutor {
|
||||
break
|
||||
}
|
||||
|
||||
const predictedActions = this.parseActions(prediction)
|
||||
const predictedActions = parseCladoActions(prediction)
|
||||
if (predictedActions.length === 0) {
|
||||
// Per Clado contract: HTTP 200 with action=null on parse failure.
|
||||
// Count as an invalid step so the model can self-correct on the
|
||||
// next call instead of dropping the trajectory.
|
||||
consecutiveParseFailures++
|
||||
const parseError =
|
||||
prediction.parse_error ?? 'no parsable <answer> in raw_response'
|
||||
actionHistory.push({
|
||||
action: 'invalid',
|
||||
text: `parse_error: ${parseError}`,
|
||||
})
|
||||
this.stepsUsed++
|
||||
await this.callbacks.onStepFinish?.({
|
||||
toolCalls: [
|
||||
{
|
||||
@@ -222,16 +188,23 @@ export class CladoActionExecutor {
|
||||
toolCallId: actionToolCallId,
|
||||
toolName: 'clado_action_predict',
|
||||
output: {
|
||||
prediction: this.summarizePrediction(prediction),
|
||||
prediction: summarizeCladoPrediction(prediction),
|
||||
parsedActions: [],
|
||||
parseError,
|
||||
consecutiveParseFailures,
|
||||
},
|
||||
},
|
||||
],
|
||||
})
|
||||
status = 'blocked'
|
||||
reason = 'Clado action response did not contain a valid action.'
|
||||
break
|
||||
|
||||
if (consecutiveParseFailures >= MAX_CONSECUTIVE_PARSE_FAILURES) {
|
||||
status = 'blocked'
|
||||
reason = `Clado returned ${consecutiveParseFailures} consecutive unparseable responses.`
|
||||
break
|
||||
}
|
||||
continue
|
||||
}
|
||||
consecutiveParseFailures = 0
|
||||
|
||||
let requestedStop = false
|
||||
const executionNotes: string[] = []
|
||||
@@ -257,7 +230,7 @@ export class CladoActionExecutor {
|
||||
toolCallId: actionToolCallId,
|
||||
toolName: 'clado_action_predict',
|
||||
output: {
|
||||
prediction: this.summarizePrediction(prediction),
|
||||
prediction: summarizeCladoPrediction(prediction),
|
||||
parsedActions: predictedActions,
|
||||
executed: executionNotes,
|
||||
},
|
||||
@@ -272,7 +245,12 @@ export class CladoActionExecutor {
|
||||
|
||||
actionHistory.push(predictedAction)
|
||||
if (predictedAction.action === 'end') {
|
||||
reason = 'Model requested end() and marked task complete.'
|
||||
if (predictedAction.final_answer) {
|
||||
finalAnswer = predictedAction.final_answer
|
||||
reason = `Model requested end() with final_answer: ${predictedAction.final_answer.slice(0, 240)}`
|
||||
} else {
|
||||
reason = 'Model requested end() and marked task complete.'
|
||||
}
|
||||
requestedStop = true
|
||||
break
|
||||
}
|
||||
@@ -293,7 +271,7 @@ export class CladoActionExecutor {
|
||||
toolCallId: actionToolCallId,
|
||||
toolName: 'clado_action_predict',
|
||||
output: {
|
||||
prediction: this.summarizePrediction(prediction),
|
||||
prediction: summarizeCladoPrediction(prediction),
|
||||
parsedActions: predictedActions,
|
||||
executed: executionNotes,
|
||||
},
|
||||
@@ -327,6 +305,7 @@ export class CladoActionExecutor {
|
||||
actions: actionHistory,
|
||||
url: this.currentUrl,
|
||||
thinkingTrace,
|
||||
finalAnswer,
|
||||
})
|
||||
|
||||
return {
|
||||
@@ -344,121 +323,12 @@ export class CladoActionExecutor {
|
||||
actionHistory: CladoAction[],
|
||||
signal?: AbortSignal,
|
||||
): Promise<CladoActionResponse> {
|
||||
if (!this.config.baseUrl) {
|
||||
throw new Error('executor.baseUrl must be set for clado-action provider')
|
||||
}
|
||||
|
||||
const requestController = new AbortController()
|
||||
const onAbort = () => requestController.abort()
|
||||
signal?.addEventListener('abort', onAbort, { once: true })
|
||||
|
||||
const timeoutHandle = setTimeout(() => {
|
||||
requestController.abort()
|
||||
}, CLADO_REQUEST_TIMEOUT_MS)
|
||||
|
||||
try {
|
||||
const headers: Record<string, string> = {
|
||||
'Content-Type': 'application/json',
|
||||
}
|
||||
if (this.config.apiKey) {
|
||||
headers.Authorization = `Bearer ${this.config.apiKey}`
|
||||
}
|
||||
|
||||
const response = await fetch(this.config.baseUrl, {
|
||||
method: 'POST',
|
||||
headers,
|
||||
body: JSON.stringify({
|
||||
instruction,
|
||||
image_base64: imageBase64,
|
||||
history: this.formatHistory(actionHistory),
|
||||
}),
|
||||
signal: requestController.signal,
|
||||
})
|
||||
|
||||
if (!response.ok) {
|
||||
const body = await response.text()
|
||||
throw new Error(
|
||||
`HTTP ${response.status} ${response.statusText}: ${body.slice(0, 400)}`,
|
||||
)
|
||||
}
|
||||
|
||||
return (await response.json()) as CladoActionResponse
|
||||
} finally {
|
||||
clearTimeout(timeoutHandle)
|
||||
signal?.removeEventListener('abort', onAbort)
|
||||
}
|
||||
}
|
||||
|
||||
private parseActions(prediction: CladoActionResponse): CladoAction[] {
|
||||
const actionFromField =
|
||||
typeof prediction.action === 'string' ? prediction.action : null
|
||||
|
||||
const rawActions = this.parseActionsFromRawResponse(prediction.raw_response)
|
||||
const primaryFromRaw = rawActions[0] ?? null
|
||||
const mergedPrimary = {
|
||||
...primaryFromRaw,
|
||||
...prediction,
|
||||
action: actionFromField ?? primaryFromRaw?.action,
|
||||
}
|
||||
|
||||
const normalized: CladoAction[] = []
|
||||
const primary = this.normalizeActionPayload(mergedPrimary)
|
||||
if (primary) normalized.push(primary)
|
||||
|
||||
for (const candidate of rawActions.slice(1)) {
|
||||
const parsed = this.normalizeActionPayload(candidate)
|
||||
if (!parsed) continue
|
||||
const prev = normalized[normalized.length - 1]
|
||||
if (
|
||||
!prev ||
|
||||
this.getActionSignature(prev) !== this.getActionSignature(parsed)
|
||||
) {
|
||||
normalized.push(parsed)
|
||||
}
|
||||
}
|
||||
|
||||
return normalized
|
||||
}
|
||||
|
||||
private normalizeActionPayload(
|
||||
payload: RawActionPayload,
|
||||
): CladoAction | null {
|
||||
if (!payload.action || typeof payload.action !== 'string') {
|
||||
return null
|
||||
}
|
||||
return {
|
||||
action: payload.action,
|
||||
x: typeof payload.x === 'number' ? payload.x : undefined,
|
||||
y: typeof payload.y === 'number' ? payload.y : undefined,
|
||||
text: typeof payload.text === 'string' ? payload.text : undefined,
|
||||
key: typeof payload.key === 'string' ? payload.key : undefined,
|
||||
direction:
|
||||
typeof payload.direction === 'string' ? payload.direction : undefined,
|
||||
startX: typeof payload.startX === 'number' ? payload.startX : undefined,
|
||||
startY: typeof payload.startY === 'number' ? payload.startY : undefined,
|
||||
endX: typeof payload.endX === 'number' ? payload.endX : undefined,
|
||||
endY: typeof payload.endY === 'number' ? payload.endY : undefined,
|
||||
amount: typeof payload.amount === 'number' ? payload.amount : undefined,
|
||||
time: typeof payload.time === 'number' ? payload.time : undefined,
|
||||
}
|
||||
}
|
||||
|
||||
private parseActionsFromRawResponse(
|
||||
rawResponse: string | undefined,
|
||||
): RawActionPayload[] {
|
||||
if (!rawResponse) return []
|
||||
const matches = [
|
||||
...rawResponse.matchAll(/<answer>\s*([\s\S]*?)\s*<\/answer>/gi),
|
||||
]
|
||||
const parsed: RawActionPayload[] = []
|
||||
for (const match of matches) {
|
||||
try {
|
||||
parsed.push(JSON.parse(match[1]) as RawActionPayload)
|
||||
} catch {
|
||||
// ignore malformed answer blocks
|
||||
}
|
||||
}
|
||||
return parsed
|
||||
return this.cladoClient.requestActionPrediction({
|
||||
instruction,
|
||||
imageBase64,
|
||||
actionHistory,
|
||||
signal,
|
||||
})
|
||||
}
|
||||
|
||||
private async executeAction(
|
||||
@@ -529,14 +399,14 @@ export class CladoActionExecutor {
|
||||
}
|
||||
|
||||
case 'press_key': {
|
||||
const key = this.normalizePressKey(action.key)
|
||||
const key = normalizeCladoPressKey(action.key)
|
||||
await this.runTool('press_key', { key }, signal)
|
||||
return `Pressed key "${key}".`
|
||||
}
|
||||
|
||||
case 'scroll': {
|
||||
const direction = this.normalizeDirection(action.direction)
|
||||
const amountPx = this.normalizeScrollAmount(action.amount)
|
||||
const direction = normalizeCladoDirection(action.direction)
|
||||
const amountPx = normalizeCladoScrollAmount(action.amount)
|
||||
const ticks = Math.max(1, Math.round(amountPx / 120))
|
||||
|
||||
await this.runTool('scroll', { direction, amount: ticks }, signal)
|
||||
@@ -578,7 +448,9 @@ export class CladoActionExecutor {
|
||||
}
|
||||
|
||||
case 'end': {
|
||||
return 'Model requested end().'
|
||||
return action.final_answer
|
||||
? `Model requested end() with final_answer: ${action.final_answer.slice(0, 240)}`
|
||||
: 'Model requested end().'
|
||||
}
|
||||
|
||||
default: {
|
||||
@@ -588,9 +460,10 @@ export class CladoActionExecutor {
|
||||
}
|
||||
|
||||
private async captureScreenshotBase64(signal?: AbortSignal): Promise<string> {
|
||||
// Clado contract is PNG or JPEG; use PNG for lossless input.
|
||||
const result = await this.runTool(
|
||||
'take_screenshot',
|
||||
{ format: 'webp', quality: 80 },
|
||||
{ format: 'png' },
|
||||
signal,
|
||||
)
|
||||
|
||||
@@ -604,7 +477,7 @@ export class CladoActionExecutor {
|
||||
return image.data
|
||||
}
|
||||
|
||||
private async getViewport(signal?: AbortSignal): Promise<Viewport> {
|
||||
private async getViewport(signal?: AbortSignal): Promise<CladoViewport> {
|
||||
if (this.viewport) return this.viewport
|
||||
|
||||
try {
|
||||
@@ -635,15 +508,9 @@ export class CladoActionExecutor {
|
||||
normalizedX: number | undefined,
|
||||
normalizedY: number | undefined,
|
||||
signal?: AbortSignal,
|
||||
): Promise<ActionPoint> {
|
||||
): Promise<CladoActionPoint> {
|
||||
const viewport = await this.getViewport(signal)
|
||||
const nx = clampNormalized(normalizedX ?? 500)
|
||||
const ny = clampNormalized(normalizedY ?? 500)
|
||||
|
||||
return {
|
||||
x: Math.round((nx / 1000) * viewport.width),
|
||||
y: Math.round((ny / 1000) * viewport.height),
|
||||
}
|
||||
return resolveCladoPoint(viewport, normalizedX, normalizedY)
|
||||
}
|
||||
|
||||
private async getCurrentUrl(signal?: AbortSignal): Promise<string> {
|
||||
@@ -670,7 +537,7 @@ export class CladoActionExecutor {
|
||||
throw new Error('aborted')
|
||||
}
|
||||
|
||||
const toolArgs = this.prepareToolArgs(toolName, args)
|
||||
const toolArgs = prepareCladoToolArgs(toolName, args, this.pageId)
|
||||
|
||||
try {
|
||||
const raw = await this.mcpClient.callTool(toolName, toolArgs)
|
||||
@@ -689,211 +556,22 @@ export class CladoActionExecutor {
|
||||
}
|
||||
}
|
||||
|
||||
private prepareToolArgs(
|
||||
toolName: string,
|
||||
args: Record<string, unknown>,
|
||||
): Record<string, unknown> {
|
||||
const prepared: Record<string, unknown> = { ...args }
|
||||
|
||||
if (
|
||||
toolName === 'evaluate_script' &&
|
||||
typeof prepared.function === 'string' &&
|
||||
prepared.expression === undefined
|
||||
) {
|
||||
prepared.expression = this.toEvaluateExpression(prepared.function)
|
||||
delete prepared.function
|
||||
}
|
||||
|
||||
if (
|
||||
toolName === 'click_at' &&
|
||||
typeof prepared.dblClick === 'boolean' &&
|
||||
prepared.clickCount === undefined
|
||||
) {
|
||||
prepared.clickCount = prepared.dblClick ? 2 : 1
|
||||
delete prepared.dblClick
|
||||
}
|
||||
|
||||
// Use fixed page ID for all page-scoped tools (single-page operation)
|
||||
if (PAGE_SCOPED_TOOLS.has(toolName) && typeof prepared.page !== 'number') {
|
||||
prepared.page = this.pageId
|
||||
}
|
||||
|
||||
return prepared
|
||||
}
|
||||
|
||||
private toEvaluateExpression(rawFunction: unknown): string {
|
||||
const source = String(rawFunction).trim()
|
||||
if (source.startsWith('() =>') || source.startsWith('async () =>')) {
|
||||
return `(${source})()`
|
||||
}
|
||||
if (source.startsWith('function')) {
|
||||
return `(${source})()`
|
||||
}
|
||||
return source
|
||||
}
|
||||
|
||||
private normalizePressKey(key: string | undefined): string {
|
||||
const raw = (key ?? '').trim()
|
||||
if (!raw) throw new Error('press_key action missing key field')
|
||||
|
||||
const map: Record<string, string> = {
|
||||
'C-a': 'Control+A',
|
||||
'C-c': 'Control+C',
|
||||
'C-v': 'Control+V',
|
||||
'C-x': 'Control+X',
|
||||
'C-z': 'Control+Z',
|
||||
'C-y': 'Control+Y',
|
||||
'C-s': 'Control+S',
|
||||
'C-t': 'Control+T',
|
||||
'C-w': 'Control+W',
|
||||
'C-h': 'Control+H',
|
||||
'C-f': 'Control+F',
|
||||
'C-+': 'Control++',
|
||||
'C--': 'Control+-',
|
||||
'C-tab': 'Control+Tab',
|
||||
'C-S-tab': 'Control+Shift+Tab',
|
||||
'C-S-n': 'Control+Shift+N',
|
||||
'C-down': 'Control+ArrowDown',
|
||||
'M-f4': 'Alt+F4',
|
||||
}
|
||||
return map[raw] ?? raw
|
||||
}
|
||||
|
||||
private normalizeDirection(
|
||||
direction: string | undefined,
|
||||
): 'up' | 'down' | 'left' | 'right' {
|
||||
if (
|
||||
direction === 'up' ||
|
||||
direction === 'down' ||
|
||||
direction === 'left' ||
|
||||
direction === 'right'
|
||||
) {
|
||||
return direction
|
||||
}
|
||||
return 'down'
|
||||
}
|
||||
|
||||
private normalizeScrollAmount(amount: number | undefined): number {
|
||||
if (typeof amount !== 'number') return 500
|
||||
if (amount <= 0) return 100
|
||||
const clamped = Math.min(amount, 1000)
|
||||
return Math.max(100, Math.round((clamped / 1000) * 900))
|
||||
}
|
||||
|
||||
private summarizePrediction(
|
||||
prediction: CladoActionResponse,
|
||||
): Record<string, unknown> {
|
||||
const preview =
|
||||
typeof prediction.raw_response === 'string' &&
|
||||
prediction.raw_response.length > 0
|
||||
? prediction.raw_response.slice(0, 240)
|
||||
: undefined
|
||||
|
||||
return {
|
||||
action: prediction.action,
|
||||
x: prediction.x,
|
||||
y: prediction.y,
|
||||
text: prediction.text,
|
||||
key: prediction.key,
|
||||
direction: prediction.direction,
|
||||
startX: prediction.startX,
|
||||
startY: prediction.startY,
|
||||
endX: prediction.endX,
|
||||
endY: prediction.endY,
|
||||
amount: prediction.amount,
|
||||
time: prediction.time,
|
||||
inference_time_seconds: prediction.inference_time_seconds,
|
||||
raw_response_preview: preview,
|
||||
}
|
||||
}
|
||||
|
||||
private extractThinking(rawResponse: string | undefined): string | undefined {
|
||||
if (!rawResponse) return undefined
|
||||
const matches = [
|
||||
...rawResponse.matchAll(/<thinking>\s*([\s\S]*?)\s*<\/thinking>/gi),
|
||||
]
|
||||
if (matches.length === 0) return undefined
|
||||
|
||||
const merged = matches
|
||||
.map((match) => match[1]?.replace(/\s+/g, ' ').trim() ?? '')
|
||||
.filter((value) => value.length > 0)
|
||||
.join(' ')
|
||||
|
||||
if (!merged) return undefined
|
||||
return merged
|
||||
}
|
||||
|
||||
private getActionSignature(action: CladoAction): string {
|
||||
switch (action.action) {
|
||||
case 'click':
|
||||
case 'double_click':
|
||||
case 'right_click':
|
||||
case 'hover':
|
||||
return `${action.action}:${action.x ?? 'x'}:${action.y ?? 'y'}`
|
||||
case 'type':
|
||||
return `${action.action}:${(action.text ?? '').slice(0, 16)}`
|
||||
case 'press_key':
|
||||
return `${action.action}:${action.key ?? 'key'}`
|
||||
case 'scroll':
|
||||
return `${action.action}:${action.direction ?? 'down'}:${action.amount ?? 500}`
|
||||
case 'drag':
|
||||
return `${action.action}:${action.startX}:${action.startY}:${action.endX}:${action.endY}`
|
||||
case 'wait':
|
||||
return `${action.action}:${action.time ?? 1}`
|
||||
case 'end':
|
||||
return 'end()'
|
||||
default:
|
||||
return action.action
|
||||
}
|
||||
}
|
||||
|
||||
private formatHistory(actions: CladoAction[]): string {
|
||||
if (actions.length === 0) return 'None'
|
||||
|
||||
const parts = actions.map((action) => {
|
||||
switch (action.action) {
|
||||
case 'click':
|
||||
case 'double_click':
|
||||
case 'right_click':
|
||||
case 'hover':
|
||||
return `${action.action}(${Math.round(action.x ?? 500)}, ${Math.round(action.y ?? 500)})`
|
||||
case 'type': {
|
||||
const text = (action.text ?? '').replace(/'/g, "\\'")
|
||||
return `type('${text}')`
|
||||
}
|
||||
case 'press_key':
|
||||
return `press_key('${action.key ?? 'Enter'}')`
|
||||
case 'scroll':
|
||||
return `scroll(${action.direction ?? 'down'})`
|
||||
case 'drag':
|
||||
return `drag(${Math.round(action.startX ?? 500)},${Math.round(action.startY ?? 500)} -> ${Math.round(action.endX ?? 500)},${Math.round(action.endY ?? 500)})`
|
||||
case 'wait':
|
||||
return `wait(${Math.round(action.time ?? 1)}s)`
|
||||
case 'end':
|
||||
return 'end()'
|
||||
default:
|
||||
return action.action
|
||||
}
|
||||
})
|
||||
|
||||
return parts.join(' -> ')
|
||||
}
|
||||
|
||||
private buildObservation(params: {
|
||||
status: ExecutorResult['status']
|
||||
reason: string
|
||||
actions: CladoAction[]
|
||||
url: string
|
||||
thinkingTrace: string[]
|
||||
finalAnswer?: string
|
||||
}): string {
|
||||
const { status, reason, actions, url, thinkingTrace } = params
|
||||
const { status, reason, actions, url, thinkingTrace, finalAnswer } = params
|
||||
const actionSummary =
|
||||
actions.length === 0
|
||||
? 'No actions were executed.'
|
||||
: actions
|
||||
.slice(-5)
|
||||
.map(
|
||||
(action, idx) => `${idx + 1}. ${this.getActionSignature(action)}`,
|
||||
(action, idx) => `${idx + 1}. ${getCladoActionSignature(action)}`,
|
||||
)
|
||||
.join('\n')
|
||||
const thinkingSummary =
|
||||
@@ -907,6 +585,7 @@ export class CladoActionExecutor {
|
||||
`Status: ${status}`,
|
||||
`Reason: ${reason}`,
|
||||
`URL: ${url || 'unknown'}`,
|
||||
finalAnswer ? `Final answer: ${finalAnswer}` : '',
|
||||
'',
|
||||
'Recent actions:',
|
||||
actionSummary,
|
||||
|
||||
@@ -24,8 +24,9 @@ import {
|
||||
resolveProviderConfig,
|
||||
} from '../../utils/resolve-provider-config'
|
||||
import { withEvalTimeout } from '../../utils/with-eval-timeout'
|
||||
import { createExecutorBackend } from '../orchestrated/backends/create-executor-backend'
|
||||
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
|
||||
import { Executor, type ExecutorCallbacks } from './executor'
|
||||
import type { ExecutorCallbacks } from './executor'
|
||||
import { OrchestratorAgent } from './orchestrator-agent'
|
||||
import type { ExecutorFactory, ExecutorResult } from './types'
|
||||
|
||||
@@ -235,12 +236,13 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
|
||||
await capture.messageLogger.logStreamEvent(delegateInputEvent)
|
||||
capture.emitEvent(task.query_id, delegateInputEvent)
|
||||
|
||||
const executor = new Executor(
|
||||
executorConfig,
|
||||
const executor = createExecutorBackend({
|
||||
backendKind: isCladoAction ? 'clado' : 'tool-loop',
|
||||
configTemplate: executorConfig,
|
||||
browser,
|
||||
config.browseros.server_url,
|
||||
{ isCladoAction, callbacks },
|
||||
)
|
||||
serverUrl: config.browseros.server_url,
|
||||
callbacks,
|
||||
})
|
||||
let result: ExecutorResult
|
||||
try {
|
||||
result = await executor.execute(instruction, signal)
|
||||
|
||||
@@ -57,6 +57,20 @@ export class TrajectorySaver {
|
||||
)
|
||||
}
|
||||
|
||||
async saveAttempt(attempt: Record<string, unknown>): Promise<void> {
|
||||
await writeFile(
|
||||
join(this.outputDir, 'attempt.json'),
|
||||
JSON.stringify(attempt, null, 2),
|
||||
)
|
||||
}
|
||||
|
||||
async saveGrades(graderResults: Record<string, GraderResult>): Promise<void> {
|
||||
await writeFile(
|
||||
join(this.outputDir, 'grades.json'),
|
||||
JSON.stringify(graderResults, null, 2),
|
||||
)
|
||||
}
|
||||
|
||||
async loadMetadata(): Promise<TaskMetadata> {
|
||||
const content = await readFile(
|
||||
join(this.outputDir, 'metadata.json'),
|
||||
@@ -70,6 +84,7 @@ export class TrajectorySaver {
|
||||
): Promise<void> {
|
||||
const metadata = await this.loadMetadata()
|
||||
metadata.grader_results = graderResults
|
||||
await this.saveGrades(graderResults)
|
||||
await this.saveMetadata(metadata)
|
||||
}
|
||||
|
||||
|
||||
170
packages/browseros-agent/apps/eval/src/cli/args.ts
vendored
Normal file
170
packages/browseros-agent/apps/eval/src/cli/args.ts
vendored
Normal file
@@ -0,0 +1,170 @@
|
||||
import { parseArgs } from 'node:util'
|
||||
|
||||
/** Publish destinations the eval CLI accepts. */
export type PublishTarget = 'r2'

/** Result of parsing an invocation that has no recognised leading subcommand. */
export interface LegacyCliArgs {
  command: 'legacy'
  configPath?: string
  help?: boolean
}

/** Result of parsing `suite`: a config or suite input, model overrides, optional publish target. */
export interface SuiteCliArgs {
  command: 'suite'
  configPath?: string
  suitePath?: string
  variantId?: string
  provider?: string
  model?: string
  apiKey?: string
  baseUrl?: string
  publishTarget?: PublishTarget
}

/** Result of parsing `run`: the same fields as `suite` except the publish target. */
export interface RunCliArgs
  extends Omit<SuiteCliArgs, 'command' | 'publishTarget'> {
  command: 'run'
}

/** Result of parsing `grade`: the run directory to grade. */
export interface GradeCliArgs {
  command: 'grade'
  runDir: string
}

/** Result of parsing `publish`: the run directory and where to publish it. */
export interface PublishCliArgs {
  command: 'publish'
  runDir: string
  target: PublishTarget
}

/** Every shape the CLI parser can return, discriminated by `command`. */
export type EvalCliArgs =
  | LegacyCliArgs
  | SuiteCliArgs
  | RunCliArgs
  | GradeCliArgs
  | PublishCliArgs

// Leading argv words treated as subcommands.
const COMMANDS = new Set(['suite', 'run', 'grade', 'publish'])
|
||||
|
||||
/** Returns `value` when it is a non-empty string; booleans, '' and undefined become undefined. */
function stringValue(value: string | boolean | undefined): string | undefined {
  if (typeof value !== 'string') return undefined
  return value === '' ? undefined : value
}
|
||||
|
||||
/**
 * Validates a raw publish-target string.
 *
 * @returns undefined when no value was given, otherwise the recognised target.
 * @throws Error when a value was given that is not a supported target.
 */
function publishTarget(value: string | undefined): PublishTarget | undefined {
  switch (value) {
    case undefined:
      return undefined
    case 'r2':
      return value
    default:
      throw new Error(`Unsupported publish target: ${value}`)
  }
}
|
||||
|
||||
/**
 * Ensures exactly one of `--config` / `--suite` was supplied.
 *
 * @param command Command name used as the prefix of the error message.
 * @throws Error when neither input is set, or when both are.
 */
function requireOne(
  command: string,
  configPath: string | undefined,
  suitePath: string | undefined,
): void {
  const hasConfig = Boolean(configPath)
  const hasSuite = Boolean(suitePath)

  if (hasConfig && hasSuite) {
    throw new Error(`${command} accepts either --config or --suite, not both`)
  }
  if (!hasConfig && !hasSuite) {
    throw new Error(`${command} requires --config or --suite`)
  }
}
|
||||
|
||||
/**
 * Parses the flags shared by the `suite` and `run` commands.
 *
 * Exactly one of `--config` / `--suite` must be present. Optional string
 * flags are copied onto the result only when they carry a non-empty value.
 *
 * @param command Which command is being parsed; selects the result shape.
 * @param argv Arguments that follow the command word.
 * @throws Error when the input-source rule is violated, when `--publish` is
 *   passed to `run`, or when `--publish` names an unsupported target.
 */
function parseSuiteLikeArgs(
  command: 'suite' | 'run',
  argv: string[],
): SuiteCliArgs | RunCliArgs {
  const { values } = parseArgs({
    args: argv,
    options: {
      config: { type: 'string' },
      suite: { type: 'string' },
      variant: { type: 'string' },
      provider: { type: 'string' },
      model: { type: 'string' },
      'api-key': { type: 'string' },
      'base-url': { type: 'string' },
      publish: { type: 'string' },
    },
  })

  const configPath = stringValue(values.config)
  const suitePath = stringValue(values.suite)
  requireOne(command, configPath, suitePath)

  // `--publish` is declared for both commands, but only `suite` acts on it.
  // Reject it on `run` instead of silently dropping the request.
  if (command === 'run' && values.publish !== undefined) {
    throw new Error(
      'run does not support --publish; use the suite command to publish',
    )
  }

  const shared: Omit<SuiteCliArgs, 'command' | 'publishTarget'> = {}
  if (configPath) shared.configPath = configPath
  if (suitePath) shared.suitePath = suitePath
  const variantId = stringValue(values.variant)
  if (variantId) shared.variantId = variantId
  const provider = stringValue(values.provider)
  if (provider) shared.provider = provider
  const model = stringValue(values.model)
  if (model) shared.model = model
  const apiKey = stringValue(values['api-key'])
  if (apiKey) shared.apiKey = apiKey
  const baseUrl = stringValue(values['base-url'])
  if (baseUrl) shared.baseUrl = baseUrl

  if (command === 'run') {
    return { command: 'run', ...shared }
  }

  const suiteArgs: SuiteCliArgs = { command: 'suite', ...shared }
  const target = publishTarget(stringValue(values.publish))
  if (target) suiteArgs.publishTarget = target
  return suiteArgs
}
|
||||
|
||||
/**
 * Parses the invocation form without a subcommand: `-c/--config <path>` and `-h/--help`.
 *
 * `configPath` is set only for a non-empty value and `help` only when the flag is present.
 */
function parseLegacyArgs(argv: string[]): LegacyCliArgs {
  const { values: flags } = parseArgs({
    args: argv,
    options: {
      config: { type: 'string', short: 'c' },
      help: { type: 'boolean', short: 'h' },
    },
  })

  const configPath = stringValue(flags.config)
  return {
    command: 'legacy',
    ...(configPath ? { configPath } : {}),
    ...(flags.help ? { help: true } : {}),
  }
}
|
||||
|
||||
/**
 * Parses the eval CLI command without running browser or publishing side effects.
 *
 * The first argv word selects the command. Anything that is not a known
 * command is handed, together with the full argv, to the legacy flag parser.
 *
 * @throws Error when `grade` lacks `--run`, or when `publish` lacks `--run`
 *   or `--target`, or names an unsupported target.
 */
export function parseEvalCliArgs(argv: string[]): EvalCliArgs {
  const [command, ...rest] = argv
  if (command === undefined || !COMMANDS.has(command)) {
    return parseLegacyArgs(argv)
  }

  if (command === 'suite' || command === 'run') {
    return parseSuiteLikeArgs(command, rest)
  }

  if (command === 'grade') {
    const { values: flags } = parseArgs({
      args: rest,
      options: { run: { type: 'string' } },
    })
    const runDir = stringValue(flags.run)
    if (!runDir) throw new Error('grade requires --run')
    return { command: 'grade', runDir }
  }

  if (command === 'publish') {
    const { values: flags } = parseArgs({
      args: rest,
      options: { run: { type: 'string' }, target: { type: 'string' } },
    })
    const runDir = stringValue(flags.run)
    if (!runDir) throw new Error('publish requires --run')
    const target = publishTarget(stringValue(flags.target))
    if (!target) throw new Error('publish requires --target')
    return { command: 'publish', runDir, target }
  }

  return parseLegacyArgs(argv)
}
|
||||
84
packages/browseros-agent/apps/eval/src/cli/commands/grade.ts
vendored
Normal file
84
packages/browseros-agent/apps/eval/src/cli/commands/grade.ts
vendored
Normal file
@@ -0,0 +1,84 @@
|
||||
import { readdir, readFile, stat } from 'node:fs/promises'
|
||||
import { join } from 'node:path'
|
||||
import { TrajectorySaver } from '../../capture/trajectory-saver'
|
||||
import { runGraders } from '../../grading/grader-runner'
|
||||
import { type Message, MessageSchema, TaskMetadataSchema } from '../../types'
|
||||
import type { GradeCliArgs } from '../args'
|
||||
|
||||
/** True when `error` carries the Node.js "no such file or directory" code. */
function isMissingFileError(error: unknown): boolean {
  return (
    typeof error === 'object' &&
    error !== null &&
    (error as { code?: unknown }).code === 'ENOENT'
  )
}

/**
 * Loads the messages recorded for one task directory.
 *
 * Reads `messages.jsonl`, skips blank lines, and validates every remaining
 * line against `MessageSchema`.
 *
 * @returns The parsed messages, or an empty list when the file does not exist.
 * @throws When the file exists but cannot be read, or when a line is not
 *   valid JSON or fails schema validation.
 */
async function loadMessages(taskDir: string): Promise<Message[]> {
  let content: string
  try {
    content = await readFile(join(taskDir, 'messages.jsonl'), 'utf-8')
  } catch (error) {
    // Only a missing file means "nothing was recorded". Any other read
    // failure would otherwise be graded as an empty conversation.
    if (isMissingFileError(error)) return []
    throw error
  }

  return content
    .split('\n')
    .filter((line) => line.trim().length > 0)
    .map((line) => MessageSchema.parse(JSON.parse(line)))
}
|
||||
|
||||
/**
 * Lists the task directories directly under `runDir`.
 *
 * A task directory is an immediate subdirectory that contains a regular file
 * named `metadata.json`. The probes are independent, so they run concurrently;
 * results keep the order in which `readdir` returned the entries.
 *
 * @returns Paths of the matching subdirectories, each joined onto `runDir`.
 */
async function findTaskDirs(runDir: string): Promise<string[]> {
  const entries = await readdir(runDir, { withFileTypes: true })
  const candidates = entries
    .filter((entry) => entry.isDirectory())
    .map((entry) => join(runDir, entry.name))

  const hasMetadata = await Promise.all(
    candidates.map(async (taskDir) => {
      // A failed stat (for example, no metadata.json) means "not a task directory".
      const metadata = await stat(join(taskDir, 'metadata.json')).catch(
        () => null,
      )
      return metadata?.isFile() ?? false
    }),
  )

  return candidates.filter((_, index) => hasMetadata[index])
}
|
||||
|
||||
/**
 * Re-grades one task directory with the grader names already present in its
 * `metadata.json`, then stores the new results.
 *
 * @returns false (after logging a warning) when the metadata lists no grader
 *   names, true once new results have been saved.
 */
async function regradeTask(runDir: string, taskDir: string): Promise<boolean> {
  const rawMetadata = await readFile(join(taskDir, 'metadata.json'), 'utf-8')
  const metadata = TaskMetadataSchema.parse(JSON.parse(rawMetadata))

  const graderNames = Object.keys(metadata.grader_results ?? {})
  if (graderNames.length === 0) {
    console.warn(`Skipping ${metadata.query_id}: no existing grader names`)
    return false
  }

  const { query_id, query, dataset } = metadata
  const messages = await loadMessages(taskDir)
  const serverUrl = process.env.BROWSEROS_SERVER_URL || 'http://127.0.0.1:9110'
  const graderResults = await runGraders(graderNames, {
    task: { query_id, query, dataset },
    messages,
    screenshotCount: metadata.screenshot_count ?? metadata.total_steps,
    finalAnswer: metadata.final_answer,
    taskArtifactDir: taskDir,
    outputDir: taskDir,
    mcpUrl: `${serverUrl}/mcp`,
  })

  // NOTE(review): the saver is addressed by (runDir, query_id), not by taskDir.
  // Presumably both resolve to the same directory; confirm for runs whose
  // directory names differ from their query ids.
  const saver = new TrajectorySaver(runDir, query_id)
  await saver.updateGraderResults(graderResults)
  return true
}

/**
 * Re-runs graders for task artifacts that already contain metadata and messages.
 *
 * @throws Error when `args.runDir` is not a directory, when it holds no task
 *   metadata, or when no task lists any grader names to re-run.
 */
export async function runGradeCommand(args: GradeCliArgs): Promise<void> {
  const { runDir } = args

  const runStat = await stat(runDir).catch(() => null)
  if (!runStat?.isDirectory()) {
    throw new Error(`Not a run directory: ${runDir}`)
  }

  const taskDirs = await findTaskDirs(runDir)
  if (taskDirs.length === 0) {
    throw new Error(`No task metadata found under ${runDir}`)
  }

  // Tasks are processed one at a time, in the order findTaskDirs returned them.
  let graded = 0
  for (const taskDir of taskDirs) {
    if (await regradeTask(runDir, taskDir)) graded += 1
  }

  if (graded === 0) {
    throw new Error(`No tasks with existing grader names found under ${runDir}`)
  }
  console.log(`Re-graded ${graded} task(s) in ${runDir}`)
}
|
||||
25
packages/browseros-agent/apps/eval/src/cli/commands/publish.ts
vendored
Normal file
25
packages/browseros-agent/apps/eval/src/cli/commands/publish.ts
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
import { publishPathToR2 } from '../../publishing/r2-publisher'
|
||||
import type { PublishCliArgs, PublishTarget } from '../args'
|
||||
|
||||
/** Inputs for publishing one run directory. */
export interface PublishRunOptions {
  runDir: string
  target: PublishTarget
}

/**
 * Publishes run artifacts through the R2 viewer upload path.
 *
 * Prints the viewer URL of every uploaded run, then a notice for every run
 * the publisher reported as skipped.
 *
 * @throws Error when `options.target` is anything other than `'r2'`.
 */
export async function publishRun(options: PublishRunOptions): Promise<void> {
  const { runDir, target } = options
  if (target !== 'r2') {
    throw new Error(`Unsupported publish target: ${target}`)
  }

  const { uploadedRuns, skippedRuns } = await publishPathToR2(runDir)
  for (const uploaded of uploadedRuns) {
    console.log(uploaded.viewerUrl)
  }
  for (const skippedRunId of skippedRuns) {
    console.log(`${skippedRunId}: already uploaded, skipping`)
  }
}
|
||||
|
||||
/** CLI entry point for `publish`: forwards the parsed run directory and target to `publishRun`. */
export async function runPublishCommand(args: PublishCliArgs): Promise<void> {
  const { runDir, target } = args
  await publishRun({ runDir, target })
}
|
||||
21
packages/browseros-agent/apps/eval/src/cli/commands/run.ts
vendored
Normal file
21
packages/browseros-agent/apps/eval/src/cli/commands/run.ts
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
import type { RunCliArgs } from '../args'
|
||||
import { runSuiteCommand, type SuiteCommandDeps } from './suite'
|
||||
|
||||
/**
 * Executes tasks from a config or suite without publishing artifacts.
 *
 * Only the listed fields are forwarded, so no publish target can reach the
 * suite runner even if one is present on `args` at runtime.
 */
export async function runRunCommand(
  args: RunCliArgs,
  deps: SuiteCommandDeps = {},
): Promise<void> {
  const {
    configPath,
    suitePath,
    variantId,
    provider,
    model,
    apiKey,
    baseUrl,
  } = args

  await runSuiteCommand(
    { configPath, suitePath, variantId, provider, model, apiKey, baseUrl },
    deps,
  )
}
|
||||
187
packages/browseros-agent/apps/eval/src/cli/commands/suite.ts
vendored
Normal file
187
packages/browseros-agent/apps/eval/src/cli/commands/suite.ts
vendored
Normal file
@@ -0,0 +1,187 @@
|
||||
import type { RunEvalOptions, RunEvalResult } from '../../runner/types'
|
||||
import { runEval as defaultRunEval } from '../../runs/eval-runner'
|
||||
import {
|
||||
type AdaptedEvalConfig,
|
||||
adaptEvalConfigFile,
|
||||
} from '../../suites/config-adapter'
|
||||
import { loadSuite } from '../../suites/load-suite'
|
||||
import { type EvalVariant, resolveVariant } from '../../suites/resolve-variant'
|
||||
import type { EvalSuite } from '../../suites/schema'
|
||||
import { type EvalConfig, EvalConfigSchema } from '../../types'
|
||||
import type { PublishTarget } from '../args'
|
||||
|
||||
// Environment lookup table; any variable may be absent.
type Env = Record<string, string | undefined>

/** Inputs accepted by the suite command, whether it is driven by a config file or a suite file. */
export interface SuiteCommandOptions {
  configPath?: string
  suitePath?: string
  variantId?: string
  provider?: string
  model?: string
  apiKey?: string
  baseUrl?: string
  publishTarget?: PublishTarget
  /** Environment to read from; `process.env` is used when omitted. */
  env?: Env
}

/** Outcome of resolving CLI input, tagged by whether it came from a config file or a suite file. */
export type ResolvedSuiteCommand =
  | (AdaptedEvalConfig & { kind: 'config'; datasetPath?: undefined })
  | {
      kind: 'suite'
      suitePath: string
      suite: EvalSuite
      variant: EvalVariant
      datasetPath: string
      evalConfig: EvalConfig
    }

/** Injectable collaborators for the suite command. */
export interface SuiteCommandDeps {
  runEval?: (options: RunEvalOptions) => Promise<RunEvalResult | undefined>
  publishRun?: (options: {
    runDir: string
    target: PublishTarget
  }) => Promise<void>
}
|
||||
|
||||
/**
 * Guards suite execution: a suite without a `browseros` section cannot be run.
 *
 * @throws Error when `suite.browseros` is missing.
 */
function ensureRunnableSuite(suite: EvalSuite): void {
  if (suite.browseros) return
  throw new Error('suite browseros config is required to run suite commands')
}
|
||||
|
||||
/** Returns the first candidate that is a non-empty string, or undefined when there is none. */
function firstNonEmpty(
  ...candidates: Array<string | undefined>
): string | undefined {
  return candidates.find(
    (candidate) => candidate !== undefined && candidate.length > 0,
  )
}

/**
 * Converts a suite plus a resolved variant into the config shape the legacy runner consumes.
 *
 * `single` and `tool-loop` suites become a `single` agent driven by the
 * variant's model. Every other agent type becomes an `orchestrator-executor`
 * agent. Its orchestrator always uses the variant's model; its executor uses
 * the variant's model too, unless the suite selects the `clado` backend, in
 * which case the executor settings come from environment variables.
 *
 * @param suite Suite definition; must include a `browseros` section.
 * @param datasetPath Dataset location written to the config's `dataset` field.
 * @param variant Model selection for the agent and orchestrator.
 * @param env Environment consulted for the Clado executor settings.
 * @returns The config after validation by `EvalConfigSchema`.
 * @throws Error when the suite has no `browseros` section; schema validation
 *   errors from `EvalConfigSchema.parse` propagate unchanged.
 */
function suiteToEvalConfig(
  suite: EvalSuite,
  datasetPath: string,
  variant: EvalVariant,
  env: Env,
): EvalConfig {
  ensureRunnableSuite(suite)

  const base = {
    dataset: datasetPath,
    num_workers: suite.workers,
    restart_server_per_task: suite.restartBrowserPerTask,
    browseros: suite.browseros,
    graders: suite.graders,
    timeout_ms: suite.timeoutMs,
    captcha: suite.captcha,
  }

  const variantModel = {
    provider: variant.agent.provider,
    model: variant.agent.model,
    apiKey: variant.agent.apiKey,
    baseUrl: variant.agent.baseUrl,
  }

  if (suite.agent.type === 'single' || suite.agent.type === 'tool-loop') {
    // The legacy runner names the BrowserOS tool-loop agent "single".
    return EvalConfigSchema.parse({
      ...base,
      agent: {
        type: 'single',
        ...variantModel,
        supportsImages: variant.agent.supportsImages,
      },
    })
  }

  const executorBackend = suite.agent.executorBackend ?? 'tool-loop'
  // An env var that is set but empty must not block the next fallback.
  // `??` alone only skips variables that are entirely unset.
  const executor =
    executorBackend === 'clado'
      ? {
          provider: 'clado-action' as const,
          model:
            firstNonEmpty(env.EVAL_EXECUTOR_MODEL, env.CLADO_ACTION_MODEL) ??
            'clado-action',
          apiKey:
            firstNonEmpty(
              env.EVAL_EXECUTOR_API_KEY,
              env.CLADO_ACTION_API_KEY,
            ) ?? '',
          baseUrl: firstNonEmpty(
            env.EVAL_EXECUTOR_BASE_URL,
            env.CLADO_ACTION_BASE_URL,
            env.CLADO_ACTION_URL,
          ),
        }
      : variantModel

  return EvalConfigSchema.parse({
    ...base,
    agent: {
      type: 'orchestrator-executor',
      orchestrator: variantModel,
      executor,
    },
  })
}
|
||||
|
||||
/**
 * Resolves config-backed or suite-backed CLI input into the run shape used by the runner.
 *
 * A config path wins when present and is adapted as-is. Otherwise the suite
 * file is loaded, the variant is resolved from the overrides and environment,
 * and both are combined into an eval config.
 *
 * @throws Error when neither a config path nor a suite path is provided.
 */
export async function resolveSuiteCommand(
  options: SuiteCommandOptions,
): Promise<ResolvedSuiteCommand> {
  const { configPath, suitePath, variantId, provider, model, apiKey, baseUrl } =
    options
  const env = options.env ?? process.env

  if (configPath) {
    const adapted = await adaptEvalConfigFile(configPath, { env })
    return { kind: 'config', ...adapted }
  }
  if (!suitePath) {
    throw new Error('suite requires --config or --suite')
  }

  const {
    suite,
    suitePath: resolvedSuitePath,
    datasetPath,
  } = await loadSuite(suitePath)
  const variant = resolveVariant({
    variantId,
    provider,
    model,
    apiKey,
    baseUrl,
    env,
  })
  const evalConfig = suiteToEvalConfig(suite, datasetPath, variant, env)

  return {
    kind: 'suite',
    suitePath: resolvedSuitePath,
    suite,
    variant,
    datasetPath,
    evalConfig,
  }
}
|
||||
|
||||
/**
 * Runs the full suite loop: resolve input, execute tasks, then optionally publish the run.
 *
 * When a publish target is requested, the publisher dependency is checked
 * before the eval starts, so a missing publisher is reported up front rather
 * than after the whole run has executed.
 *
 * @param options What to run and, optionally, where to publish the result.
 * @param deps Injectable runner and publisher; the runner defaults to the
 *   built-in eval runner.
 * @throws Error when publishing is requested without a configured publisher,
 *   or when the runner returns no `outputDir` to publish.
 */
export async function runSuiteCommand(
  options: SuiteCommandOptions,
  deps: SuiteCommandDeps = {},
): Promise<void> {
  const runEval = deps.runEval ?? defaultRunEval
  const resolved = await resolveSuiteCommand(options)
  const runOptions: RunEvalOptions =
    resolved.kind === 'config'
      ? { configPath: resolved.configPath }
      : {
          configPath: resolved.suitePath,
          dataPath: resolved.datasetPath,
          config: resolved.evalConfig,
        }

  const { publishTarget } = options
  if (!publishTarget) {
    await runEval(runOptions)
    return
  }

  // Validate the publishing prerequisites before starting the run.
  const publishRun = deps.publishRun
  if (!publishRun) {
    throw new Error('publish requested before the publisher is configured')
  }

  const result = await runEval(runOptions)
  const outputDir = result?.outputDir
  if (!outputDir) {
    throw new Error('publish requested but runner did not return an outputDir')
  }
  await publishRun({ runDir: outputDir, target: publishTarget })
}
|
||||
70
packages/browseros-agent/apps/eval/src/cli/index.ts
vendored
Normal file
70
packages/browseros-agent/apps/eval/src/cli/index.ts
vendored
Normal file
@@ -0,0 +1,70 @@
|
||||
import { startDashboard } from '../dashboard/server'
|
||||
import { runEval } from '../runs/eval-runner'
|
||||
import { type EvalCliArgs, parseEvalCliArgs } from './args'
|
||||
import { runGradeCommand } from './commands/grade'
|
||||
import { publishRun, runPublishCommand } from './commands/publish'
|
||||
import { runRunCommand } from './commands/run'
|
||||
import { runSuiteCommand } from './commands/suite'
|
||||
|
||||
/**
 * Returns the CLI help text: one example invocation per supported form
 * (`suite`, `run`, `grade`, `publish`, and the flag-only `-c` form).
 * The text is returned, not printed; callers decide where it goes.
 */
export function usage(): string {
|
||||
return `
|
||||
BrowserOS Eval
|
||||
|
||||
Usage:
|
||||
bun run eval suite --config <config.json> [--publish r2]
|
||||
bun run eval suite --suite <suite.json> --variant <id> [--publish r2]
|
||||
bun run eval run --config <config.json>
|
||||
bun run eval run --suite <suite.json> --variant <id>
|
||||
bun run eval grade --run <results/run-dir>
|
||||
bun run eval publish --run <results/run-dir> --target r2
|
||||
bun run eval -c <config.json>
|
||||
`
|
||||
}
|
||||
|
||||
/**
 * Handles the flag-only entry point (`args.command === 'legacy'`).
 *
 * Precedence: help text, then a config-file run, then the dashboard in config
 * mode. In dashboard mode the returned promise never settles, which keeps the
 * process alive. Any other command is ignored.
 */
async function runLegacyCommand(args: EvalCliArgs): Promise<void> {
  if (args.command !== 'legacy') {
    return
  }

  if (args.help) {
    console.log(usage())
  } else if (args.configPath) {
    await runEval({ configPath: args.configPath })
  } else {
    // Nothing to run yet: the dashboard starts empty and is configured from the UI.
    const emptyDashboard = {
      tasks: [],
      configName: '',
      agentType: '',
      outputDir: '',
      configMode: true,
    }
    startDashboard(emptyDashboard)
    console.log(
      'Dashboard running at http://localhost:9900 — configure and run from the UI',
    )
    // Deliberately never resolves so the process stays up.
    await new Promise(() => {})
  }
}
|
||||
|
||||
/**
 * Dispatches the eval CLI while preserving the old config/dashboard entry points.
 *
 * @param argv Raw CLI arguments; defaults to the arguments after the script path.
 *   Parsed commands other than the five handled below do nothing.
 */
export async function runCli(
  argv: string[] = Bun.argv.slice(2),
): Promise<void> {
  const args = parseEvalCliArgs(argv)

  if (args.command === 'legacy') {
    await runLegacyCommand(args)
  } else if (args.command === 'suite') {
    // Only the suite command can publish, so only it receives the publisher.
    await runSuiteCommand(args, { publishRun })
  } else if (args.command === 'run') {
    await runRunCommand(args)
  } else if (args.command === 'grade') {
    await runGradeCommand(args)
  } else if (args.command === 'publish') {
    await runPublishCommand(args)
  }
}
|
||||
@@ -5,4 +5,5 @@
|
||||
export const DEFAULT_TIMEOUT_MS = 30 * 60 * 1000 // 30 minutes
|
||||
export const SCREENSHOT_TIMEOUT_MS = 65_000 // 65s — ensures we get extension's error (60s)
|
||||
export const MAX_ACTIONS_PER_DELEGATION = 15
|
||||
export const CLADO_REQUEST_TIMEOUT_MS = 120_000
|
||||
// Cold start can take ~5 minutes per Clado; 6 minutes leaves headroom.
|
||||
export const CLADO_REQUEST_TIMEOUT_MS = 360_000
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { mkdir, readdir, readFile, stat } from 'node:fs/promises'
|
||||
import { join, resolve } from 'node:path'
|
||||
import { dirname, join, resolve, sep } from 'node:path'
|
||||
import { Hono } from 'hono'
|
||||
import { streamSSE } from 'hono/streaming'
|
||||
import { ParallelExecutor } from '../runner/parallel-executor'
|
||||
@@ -128,6 +128,35 @@ let dashboardConfigMode = false
|
||||
const configsDir = join(import.meta.dir, '..', '..', 'configs')
|
||||
const projectRoot = resolve(import.meta.dir, '..', '..', '..', '..')
|
||||
|
||||
/**
 * Lists every `.json` file under `dir` (recursively) as a sorted array of
 * paths relative to `dir`, always using `/` as the separator.
 *
 * @param dir Root directory to scan.
 * @param prefix Sub-path of `dir` to start from; it stays part of the returned paths.
 */
async function listConfigFiles(dir: string, prefix = ''): Promise<string[]> {
  const found: string[] = []
  // Work-list of relative directories still to scan. Visit order does not
  // matter because the result is sorted before returning.
  const pending: string[] = [prefix]

  for (;;) {
    const current = pending.pop()
    if (current === undefined) break

    const dirents = await readdir(join(dir, current), { withFileTypes: true })
    for (const dirent of dirents) {
      const relative = current ? join(current, dirent.name) : dirent.name
      if (dirent.isDirectory()) {
        pending.push(relative)
        continue
      }
      if (dirent.isFile() && dirent.name.endsWith('.json')) {
        // Normalise platform separators so callers always see URL-style paths.
        found.push(relative.split(sep).join('/'))
      }
    }
  }

  return found.sort()
}
|
||||
|
||||
/**
 * Maps a client-supplied config name (e.g. `legacy/foo.json`) to an absolute
 * path inside `configsDir`, or returns `null` when the name is not acceptable.
 *
 * A name is rejected when it does not end in `.json`, when any `/`-separated
 * segment is empty, `.` or `..`, or when the resolved path does not sit
 * strictly below the configs directory.
 */
function resolveConfigPath(name: string): string | null {
  if (!name.endsWith('.json')) return null

  const segmentsAreSafe = name
    .split('/')
    .every((segment) => segment !== '' && segment !== '.' && segment !== '..')
  if (!segmentsAreSafe) return null

  // Compare against the root *with* a trailing separator so that a sibling
  // directory sharing the same name prefix does not pass the check.
  const root = resolve(configsDir)
  let rootPrefix = root
  if (!root.endsWith(sep)) {
    rootPrefix = `${root}${sep}`
  }

  const candidate = resolve(configsDir, name)
  return candidate.startsWith(rootPrefix) ? candidate : null
}
|
||||
|
||||
// ============================================================================
|
||||
// Hono App
|
||||
// ============================================================================
|
||||
@@ -339,21 +368,21 @@ app.get('/api/mode', (c) => {
|
||||
// List saved config files
|
||||
app.get('/api/configs', async (c) => {
|
||||
try {
|
||||
const files = await readdir(configsDir)
|
||||
return c.json(files.filter((f) => f.endsWith('.json')))
|
||||
return c.json(await listConfigFiles(configsDir))
|
||||
} catch {
|
||||
return c.json([])
|
||||
}
|
||||
})
|
||||
|
||||
// Read a specific config file
|
||||
app.get('/api/config/:name', async (c) => {
|
||||
const name = c.req.param('name')
|
||||
if (name.includes('/') || name.includes('..')) {
|
||||
app.get('/api/config/*', async (c) => {
|
||||
const name = decodeURIComponent(c.req.path.slice('/api/config/'.length))
|
||||
const configPath = resolveConfigPath(name)
|
||||
if (!configPath) {
|
||||
return c.json({ error: 'Invalid config name' }, 400)
|
||||
}
|
||||
try {
|
||||
const content = await readFile(join(configsDir, name), 'utf-8')
|
||||
const content = await readFile(configPath, 'utf-8')
|
||||
return c.json(JSON.parse(content))
|
||||
} catch {
|
||||
return c.notFound()
|
||||
@@ -382,8 +411,17 @@ app.post('/api/run', async (c) => {
|
||||
|
||||
const config = parseResult.data
|
||||
|
||||
// Resolve relative paths from configs/ dir (dataset dropdown values are relative to it)
|
||||
const baseDir = configsDir
|
||||
let baseDir = configsDir
|
||||
if (body.configName) {
|
||||
const configPath = resolveConfigPath(body.configName)
|
||||
if (!configPath) {
|
||||
return c.json({ error: 'Invalid config name' }, 400)
|
||||
}
|
||||
baseDir = dirname(configPath)
|
||||
}
|
||||
|
||||
// Resolve relative paths from the loaded config location. Unsaved dashboard
|
||||
// configs keep using apps/eval/configs as their base for dropdown values.
|
||||
const datasetPath = resolve(
|
||||
config.dataset.startsWith('/')
|
||||
? config.dataset
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
import { spawn } from 'node:child_process'
|
||||
import { join } from 'node:path'
|
||||
import {
|
||||
writeGraderJsonArtifact,
|
||||
writeGraderTextArtifact,
|
||||
} from '../../grading/artifacts'
|
||||
import {
|
||||
type PythonEvaluatorResult,
|
||||
runPythonJsonEvaluator,
|
||||
} from '../../grading/python-evaluator'
|
||||
import type { GraderResult } from '../../types'
|
||||
import { callMcpTool } from '../../utils/mcp-client'
|
||||
import type { Grader, GraderInput } from '../types'
|
||||
@@ -7,12 +14,23 @@ import type { Grader, GraderInput } from '../types'
|
||||
const EVAL_SCRIPT = join(
|
||||
import.meta.dirname,
|
||||
'..',
|
||||
'..',
|
||||
'..',
|
||||
'scripts',
|
||||
'python',
|
||||
'agisdk-evaluate.py',
|
||||
)
|
||||
|
||||
interface AgisdkEvaluatorInput {
|
||||
task_id: string
|
||||
env_state: Record<string, unknown>
|
||||
model_response: string
|
||||
}
|
||||
|
||||
interface AgisdkEvaluatorOutput {
|
||||
reward: number
|
||||
pass: boolean
|
||||
message: string
|
||||
per_criterion: unknown[]
|
||||
}
|
||||
|
||||
export class AgisdkStateDiffGrader implements Grader {
|
||||
name = 'agisdk_state_diff'
|
||||
|
||||
@@ -36,6 +54,16 @@ export class AgisdkStateDiffGrader implements Grader {
|
||||
let envState: Record<string, unknown>
|
||||
try {
|
||||
envState = await this.fetchFinishState(origin, mcpEndpoint)
|
||||
await writeGraderJsonArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'finish-state.json',
|
||||
envState,
|
||||
)
|
||||
await writeGraderJsonArtifact(input, this.name, 'context.json', {
|
||||
origin,
|
||||
agisdk_task_id: taskId,
|
||||
})
|
||||
} catch (error) {
|
||||
return {
|
||||
score: 0,
|
||||
@@ -46,10 +74,30 @@ export class AgisdkStateDiffGrader implements Grader {
|
||||
}
|
||||
|
||||
try {
|
||||
const result = await this.runPythonEvaluator(
|
||||
taskId,
|
||||
envState,
|
||||
input.finalAnswer || '',
|
||||
const evaluatorInput: AgisdkEvaluatorInput = {
|
||||
task_id: taskId,
|
||||
env_state: envState,
|
||||
model_response: input.finalAnswer || '',
|
||||
}
|
||||
await writeGraderJsonArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'evaluator-input.json',
|
||||
evaluatorInput,
|
||||
)
|
||||
const evaluation = await this.runPythonEvaluator(evaluatorInput)
|
||||
const result = evaluation.output
|
||||
await writeGraderJsonArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'evaluator-output.json',
|
||||
result,
|
||||
)
|
||||
await writeGraderTextArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'stderr.txt',
|
||||
evaluation.stderr,
|
||||
)
|
||||
return {
|
||||
score: result.reward,
|
||||
@@ -144,59 +192,12 @@ export class AgisdkStateDiffGrader implements Grader {
|
||||
}
|
||||
|
||||
private runPythonEvaluator(
|
||||
taskId: string,
|
||||
envState: Record<string, unknown>,
|
||||
modelResponse: string,
|
||||
): Promise<{
|
||||
reward: number
|
||||
pass: boolean
|
||||
message: string
|
||||
per_criterion: unknown[]
|
||||
}> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const proc = spawn('python3', [EVAL_SCRIPT], {
|
||||
stdio: ['pipe', 'pipe', 'pipe'],
|
||||
})
|
||||
|
||||
const inputData = JSON.stringify({
|
||||
task_id: taskId,
|
||||
env_state: envState,
|
||||
model_response: modelResponse,
|
||||
})
|
||||
|
||||
let stdout = ''
|
||||
let stderr = ''
|
||||
|
||||
proc.stdout.on('data', (data: Buffer) => {
|
||||
stdout += data.toString()
|
||||
})
|
||||
|
||||
proc.stderr.on('data', (data: Buffer) => {
|
||||
stderr += data.toString()
|
||||
})
|
||||
|
||||
proc.on('close', (code) => {
|
||||
if (code !== 0) {
|
||||
reject(
|
||||
new Error(`Python evaluator exited with code ${code}: ${stderr}`),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
try {
|
||||
const result = JSON.parse(stdout.trim())
|
||||
resolve(result)
|
||||
} catch {
|
||||
reject(new Error(`Failed to parse evaluator output: ${stdout}`))
|
||||
}
|
||||
})
|
||||
|
||||
proc.on('error', (err) => {
|
||||
reject(new Error(`Failed to spawn Python evaluator: ${err.message}`))
|
||||
})
|
||||
|
||||
proc.stdin.write(inputData)
|
||||
proc.stdin.end()
|
||||
evalInput: AgisdkEvaluatorInput,
|
||||
): Promise<PythonEvaluatorResult<AgisdkEvaluatorOutput>> {
|
||||
return runPythonJsonEvaluator<AgisdkEvaluatorOutput>({
|
||||
scriptPath: EVAL_SCRIPT,
|
||||
input: evalInput,
|
||||
timeoutMs: 300_000,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,12 @@
|
||||
import { join, resolve } from 'node:path'
|
||||
import {
|
||||
writeGraderJsonArtifact,
|
||||
writeGraderTextArtifact,
|
||||
} from '../../grading/artifacts'
|
||||
import {
|
||||
type PythonEvaluatorResult,
|
||||
runPythonJsonEvaluator,
|
||||
} from '../../grading/python-evaluator'
|
||||
import type { GraderResult } from '../../types'
|
||||
import type { Grader, GraderInput } from '../types'
|
||||
|
||||
@@ -14,10 +22,7 @@ interface InfinityEvalOutput {
|
||||
message: string
|
||||
}
|
||||
|
||||
const EVAL_SCRIPT = resolve(
|
||||
import.meta.dir,
|
||||
'../../../scripts/infinity-evaluate.py',
|
||||
)
|
||||
const EVAL_SCRIPT = resolve(import.meta.dir, '../python/infinity-evaluate.py')
|
||||
|
||||
export class InfinityStateGrader implements Grader {
|
||||
name = 'infinity_state'
|
||||
@@ -66,7 +71,32 @@ export class InfinityStateGrader implements Grader {
|
||||
}
|
||||
|
||||
try {
|
||||
const result = await this.runPythonEvaluator(evalInput)
|
||||
await writeGraderJsonArtifact(input, this.name, 'verifier.json', {
|
||||
appName: parsed.appName,
|
||||
taskId: parsed.taskId,
|
||||
verifierPath,
|
||||
appServerUrl,
|
||||
})
|
||||
await writeGraderJsonArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'evaluator-input.json',
|
||||
evalInput,
|
||||
)
|
||||
const evaluation = await this.runPythonEvaluator(evalInput)
|
||||
const result = evaluation.output
|
||||
await writeGraderJsonArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'evaluator-output.json',
|
||||
result,
|
||||
)
|
||||
await writeGraderTextArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'stderr.txt',
|
||||
evaluation.stderr,
|
||||
)
|
||||
return {
|
||||
score: result.pass ? 1 : 0,
|
||||
pass: result.pass,
|
||||
@@ -108,27 +138,11 @@ export class InfinityStateGrader implements Grader {
|
||||
|
||||
private async runPythonEvaluator(
|
||||
evalInput: InfinityEvalInput,
|
||||
): Promise<InfinityEvalOutput> {
|
||||
const proc = Bun.spawn(['python3', EVAL_SCRIPT], {
|
||||
stdin: 'pipe',
|
||||
stdout: 'pipe',
|
||||
stderr: 'pipe',
|
||||
): Promise<PythonEvaluatorResult<InfinityEvalOutput>> {
|
||||
return runPythonJsonEvaluator<InfinityEvalOutput>({
|
||||
scriptPath: EVAL_SCRIPT,
|
||||
input: evalInput,
|
||||
timeoutMs: 300_000,
|
||||
})
|
||||
|
||||
const inputJson = JSON.stringify(evalInput)
|
||||
proc.stdin.write(inputJson)
|
||||
proc.stdin.end()
|
||||
|
||||
const stdout = await new Response(proc.stdout).text()
|
||||
const stderr = await new Response(proc.stderr).text()
|
||||
const exitCode = await proc.exited
|
||||
|
||||
if (exitCode !== 0) {
|
||||
throw new Error(
|
||||
`Python evaluator exited with code ${exitCode}: ${stderr || stdout}`,
|
||||
)
|
||||
}
|
||||
|
||||
return JSON.parse(stdout.trim()) as InfinityEvalOutput
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { readFile } from 'node:fs/promises'
|
||||
import { join } from 'node:path'
|
||||
import { query } from '@anthropic-ai/claude-agent-sdk'
|
||||
import { writeGraderJsonArtifact } from '../../grading/artifacts'
|
||||
import type { GraderResult } from '../../types'
|
||||
import type { Grader, GraderInput } from '../types'
|
||||
import {
|
||||
@@ -63,6 +64,7 @@ export class PerformanceGrader implements Grader {
|
||||
input.screenshotCount,
|
||||
terminationReason,
|
||||
)
|
||||
await writeGraderJsonArtifact(input, this.name, 'metrics.json', metrics)
|
||||
|
||||
const systemPrompt = PERFORMANCE_SYSTEM_PROMPT.replace(
|
||||
/\{screenshot_count\}/g,
|
||||
@@ -82,6 +84,14 @@ export class PerformanceGrader implements Grader {
|
||||
userPrompt,
|
||||
input.outputDir,
|
||||
)
|
||||
if (response) {
|
||||
await writeGraderJsonArtifact(
|
||||
input,
|
||||
this.name,
|
||||
'agent-output.json',
|
||||
response,
|
||||
)
|
||||
}
|
||||
|
||||
if (!response) {
|
||||
return {
|
||||
@@ -140,6 +150,7 @@ export class PerformanceGrader implements Grader {
|
||||
`Perf grader: LLM returned ${returnedAxes.size}/${expectedAxes.size} axes, missing: ${missingAxes.join(', ')}`,
|
||||
)
|
||||
}
|
||||
await writeGraderJsonArtifact(input, this.name, 'axes.json', axisResults)
|
||||
|
||||
return {
|
||||
score: compositeScore / 100,
|
||||
|
||||
@@ -1,51 +1,2 @@
|
||||
import type { GraderResult } from '../types'
|
||||
import { AgisdkStateDiffGrader } from './benchmark/agisdk-state-diff'
|
||||
import { InfinityStateGrader } from './benchmark/infinity-state'
|
||||
import { PerformanceGrader } from './performance/performance-grader'
|
||||
import type { Grader, GraderInput } from './types'
|
||||
|
||||
export const PASS_FAIL_GRADER_ORDER = [
|
||||
'agisdk_state_diff',
|
||||
'infinity_state',
|
||||
'performance_grader',
|
||||
] as const
|
||||
|
||||
export function createGrader(name: string): Grader | null {
|
||||
switch (name) {
|
||||
case 'agisdk_state_diff':
|
||||
return new AgisdkStateDiffGrader()
|
||||
case 'infinity_state':
|
||||
return new InfinityStateGrader()
|
||||
case 'performance_grader':
|
||||
return new PerformanceGrader()
|
||||
default:
|
||||
console.warn(`Unknown grader: ${name}`)
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
export async function runGraders(
|
||||
graderNames: string[],
|
||||
input: GraderInput,
|
||||
): Promise<Record<string, GraderResult>> {
|
||||
const results: Record<string, GraderResult> = {}
|
||||
|
||||
for (const name of graderNames) {
|
||||
const grader = createGrader(name)
|
||||
if (!grader) continue
|
||||
try {
|
||||
console.log(` Running grader: ${name}`)
|
||||
results[name] = await grader.grade(input)
|
||||
} catch (error) {
|
||||
results[name] = {
|
||||
score: 0,
|
||||
pass: false,
|
||||
reasoning: `Error running grader: ${error}`,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
export { AgisdkStateDiffGrader, InfinityStateGrader, PerformanceGrader }
|
||||
export * from '../grading/grader-registry'
|
||||
export { runConfiguredGraders, runGraders } from '../grading/grader-runner'
|
||||
|
||||
@@ -1,21 +1 @@
|
||||
import type { GraderResult, Message } from '../types'
|
||||
|
||||
export interface GraderInput {
|
||||
task: {
|
||||
query_id: string
|
||||
query: string
|
||||
dataset: string
|
||||
}
|
||||
messages: Message[]
|
||||
screenshotCount: number
|
||||
finalAnswer: string | null
|
||||
expectedAnswer?: string | null
|
||||
outputDir: string
|
||||
mcpUrl?: string
|
||||
infinityAppUrl?: string
|
||||
}
|
||||
|
||||
export interface Grader {
|
||||
name: string
|
||||
grade(input: GraderInput): Promise<GraderResult>
|
||||
}
|
||||
export type { Grader, GraderInput } from '../grading/types'
|
||||
|
||||
34
packages/browseros-agent/apps/eval/src/grading/artifacts.ts
vendored
Normal file
34
packages/browseros-agent/apps/eval/src/grading/artifacts.ts
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
import { mkdir, writeFile } from 'node:fs/promises'
|
||||
import { join } from 'node:path'
|
||||
import type { GraderInput } from './types'
|
||||
|
||||
/** Builds `<root>/grader-artifacts/<graderName>` for the task being graded. */
function artifactDir(input: GraderInput, graderName: string): string {
  // `||` rather than `??` so an empty taskArtifactDir also falls back to outputDir.
  const root = input.taskArtifactDir || input.outputDir
  return join(root, 'grader-artifacts', graderName)
}

/** Creates the grader's artifact directory if needed and writes one file into it. */
async function writeArtifactFile(
  input: GraderInput,
  graderName: string,
  filename: string,
  contents: string,
): Promise<void> {
  const dir = artifactDir(input, graderName)
  await mkdir(dir, { recursive: true })
  await writeFile(join(dir, filename), contents)
}

/**
 * Writes a JSON artifact for a grader under the task artifact directory.
 *
 * @param input Grader input; its `taskArtifactDir` (or `outputDir`) is the root.
 * @param graderName Sub-directory name, normally the grader's `name`.
 * @param filename File name inside the grader's artifact directory.
 * @param value Value to serialise with two-space indentation. Values that
 *   `JSON.stringify` cannot represent (it returns `undefined` for `undefined`,
 *   functions and symbols) are written as `null` instead of failing the write.
 */
export async function writeGraderJsonArtifact(
  input: GraderInput,
  graderName: string,
  filename: string,
  value: unknown,
): Promise<void> {
  // JSON.stringify is typed as returning string but yields undefined for
  // unrepresentable top-level values, hence the explicit widening.
  const serialized: string | undefined = JSON.stringify(value, null, 2)
  await writeArtifactFile(input, graderName, filename, serialized ?? 'null')
}

/**
 * Writes a plain-text artifact (for example captured stderr) for a grader
 * under the task artifact directory. The text is written verbatim.
 */
export async function writeGraderTextArtifact(
  input: GraderInput,
  graderName: string,
  filename: string,
  value: string,
): Promise<void> {
  await writeArtifactFile(input, graderName, filename, value)
}
|
||||
26
packages/browseros-agent/apps/eval/src/grading/grader-registry.ts
vendored
Normal file
26
packages/browseros-agent/apps/eval/src/grading/grader-registry.ts
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
import { AgisdkStateDiffGrader } from '../graders/benchmark/agisdk-state-diff'
|
||||
import { InfinityStateGrader } from '../graders/benchmark/infinity-state'
|
||||
import { PerformanceGrader } from '../graders/performance/performance-grader'
|
||||
import type { Grader } from './types'
|
||||
|
||||
export const PASS_FAIL_GRADER_ORDER = [
|
||||
'agisdk_state_diff',
|
||||
'infinity_state',
|
||||
'performance_grader',
|
||||
] as const
|
||||
|
||||
/**
 * Instantiates the grader registered under `name`.
 *
 * @param name Grader identifier as written in eval configs (exact match).
 * @returns A fresh grader instance, or `null` (after logging a warning) when
 *   the name is not registered.
 */
export function createGrader(name: string): Grader | null {
  if (name === 'agisdk_state_diff') return new AgisdkStateDiffGrader()
  if (name === 'infinity_state') return new InfinityStateGrader()
  if (name === 'performance_grader') return new PerformanceGrader()

  console.warn(`Unknown grader: ${name}`)
  return null
}
|
||||
|
||||
export { AgisdkStateDiffGrader, InfinityStateGrader, PerformanceGrader }
|
||||
36
packages/browseros-agent/apps/eval/src/grading/grader-runner.ts
vendored
Normal file
36
packages/browseros-agent/apps/eval/src/grading/grader-runner.ts
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
import type { GraderResult } from '../types'
|
||||
import { createGrader as defaultCreateGrader } from './grader-registry'
|
||||
import type { Grader, GraderInput } from './types'
|
||||
|
||||
export interface GraderRunnerDeps {
|
||||
createGrader?: (name: string) => Grader | null
|
||||
}
|
||||
|
||||
/**
 * Runs configured graders independently so one failure does not hide others.
 *
 * Graders run one at a time, in the order given. Names the factory cannot
 * resolve are skipped and get no entry in the result. A grader whose `grade`
 * call throws or rejects is recorded as a failing result (score 0) that
 * carries the error message.
 *
 * @param graderNames Grader identifiers to run.
 * @param input Input handed unchanged to every grader.
 * @param deps Optional overrides; `createGrader` replaces the default registry.
 * @returns Results keyed by grader name.
 */
export async function runConfiguredGraders(
  graderNames: string[],
  input: GraderInput,
  deps: GraderRunnerDeps = {},
): Promise<Record<string, GraderResult>> {
  const factory = deps.createGrader ?? defaultCreateGrader

  // Converts a thrown or rejected grade() into a failing result.
  const attempt = async (
    grader: Grader,
    graderName: string,
  ): Promise<GraderResult> => {
    try {
      console.log(` Running grader: ${graderName}`)
      return await grader.grade(input)
    } catch (error) {
      return {
        score: 0,
        pass: false,
        reasoning: `Error running grader: ${error instanceof Error ? error.message : String(error)}`,
      }
    }
  }

  const outcome: Record<string, GraderResult> = {}
  for (const graderName of graderNames) {
    const grader = factory(graderName)
    if (grader) {
      outcome[graderName] = await attempt(grader, graderName)
    }
  }
  return outcome
}
|
||||
|
||||
export const runGraders = runConfiguredGraders
|
||||
65
packages/browseros-agent/apps/eval/src/grading/python-evaluator.ts
vendored
Normal file
65
packages/browseros-agent/apps/eval/src/grading/python-evaluator.ts
vendored
Normal file
@@ -0,0 +1,65 @@
|
||||
export interface PythonEvaluatorOptions {
|
||||
scriptPath: string
|
||||
input: unknown
|
||||
timeoutMs: number
|
||||
}
|
||||
|
||||
export interface PythonEvaluatorResult<T> {
|
||||
output: T
|
||||
stdout: string
|
||||
stderr: string
|
||||
exitCode: number
|
||||
}
|
||||
|
||||
/**
 * Runs a Python evaluator that accepts stdin JSON and emits stdout JSON.
 *
 * The script is started with `python3`, receives `options.input` serialised as
 * JSON on stdin, and must print a single JSON document on stdout.
 *
 * @param options Script path, JSON-serialisable input, and a timeout in milliseconds.
 * @returns The parsed stdout as `output`, plus raw stdout, stderr and the exit code.
 * @throws Error when the process exits non-zero (message carries stderr, or
 *   stdout when stderr is empty), when stdout is not valid JSON, or when the
 *   timeout elapses (the process is killed with SIGKILL first).
 */
export async function runPythonJsonEvaluator<T>(
  options: PythonEvaluatorOptions,
): Promise<PythonEvaluatorResult<T>> {
  const proc = Bun.spawn(['python3', options.scriptPath], {
    stdin: 'pipe',
    stdout: 'pipe',
    stderr: 'pipe',
  })

  proc.stdin.write(JSON.stringify(options.input))
  proc.stdin.end()

  let timeoutHandle: ReturnType<typeof setTimeout> | undefined
  const timeout = new Promise<never>((_, reject) => {
    timeoutHandle = setTimeout(() => {
      proc.kill('SIGKILL')
      reject(
        new Error(`Python evaluator timed out after ${options.timeoutMs}ms`),
      )
    }, options.timeoutMs)
  })

  const completed = (async (): Promise<PythonEvaluatorResult<T>> => {
    // Drain both pipes at the same time. Reading stdout to EOF before touching
    // stderr can stall: a child blocked writing to an undrained stderr pipe
    // never closes stdout, and the run would only end via the timeout.
    const [stdout, stderr, exitCode] = await Promise.all([
      new Response(proc.stdout).text(),
      new Response(proc.stderr).text(),
      proc.exited,
    ])

    if (exitCode !== 0) {
      throw new Error(
        `Python evaluator exited with code ${exitCode}: ${stderr || stdout}`,
      )
    }

    try {
      return {
        output: JSON.parse(stdout.trim()) as T,
        stdout,
        stderr,
        exitCode,
      }
    } catch {
      throw new Error(`Failed to parse Python evaluator output: ${stdout}`)
    }
  })()

  try {
    return await Promise.race([completed, timeout])
  } finally {
    // Always clear the timer so a finished run does not keep the process alive.
    clearTimeout(timeoutHandle)
  }
}
|
||||
22
packages/browseros-agent/apps/eval/src/grading/types.ts
vendored
Normal file
22
packages/browseros-agent/apps/eval/src/grading/types.ts
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
import type { GraderResult, Message } from '../types'
|
||||
|
||||
/** Input passed to `Grader.grade` for one task. */
export interface GraderInput {
|
||||
task: {
|
||||
query_id: string
|
||||
query: string
|
||||
dataset: string
|
||||
}
|
||||
messages: Message[]
|
||||
screenshotCount: number
|
||||
finalAnswer: string | null
|
||||
expectedAnswer?: string | null
|
||||
// Preferred root for grader artifacts; the artifact writers in
|
||||
// grading/artifacts.ts fall back to outputDir when this is empty.
|
||||
taskArtifactDir: string
|
||||
outputDir: string
|
||||
mcpUrl?: string
|
||||
infinityAppUrl?: string
|
||||
}
|
||||
|
||||
export interface Grader {
|
||||
name: string
|
||||
grade(input: GraderInput): Promise<GraderResult>
|
||||
}
|
||||
74
packages/browseros-agent/apps/eval/src/index.ts
vendored
74
packages/browseros-agent/apps/eval/src/index.ts
vendored
@@ -1,72 +1,10 @@
|
||||
#!/usr/bin/env bun
|
||||
|
||||
import { parseArgs } from 'node:util'
|
||||
import { runEval } from './runner/eval-runner'
|
||||
import { runCli } from './cli'
|
||||
|
||||
const { values } = parseArgs({
|
||||
args: Bun.argv.slice(2),
|
||||
options: {
|
||||
config: { type: 'string', short: 'c' },
|
||||
help: { type: 'boolean', short: 'h' },
|
||||
},
|
||||
})
|
||||
|
||||
if (values.help) {
|
||||
console.log(`
|
||||
BrowserOS Eval
|
||||
|
||||
Usage:
|
||||
bun run eval # Opens dashboard in config mode
|
||||
bun run eval --config <config.json> # Runs eval with config file
|
||||
|
||||
Available agent types:
|
||||
- single Single LLM agent driven by the BrowserOS tool loop
|
||||
- orchestrator-executor High-level planner + visual/text executor
|
||||
|
||||
Available graders:
|
||||
- performance_grader Multi-axis grader using Claude Agent SDK
|
||||
- agisdk_state_diff AGI SDK / REAL Bench state-diff grader
|
||||
- infinity_state WebArena-Infinity verifier-script grader
|
||||
|
||||
Preset configs in configs/:
|
||||
- browseros-agent-weekly.json Weekly eval (single agent)
|
||||
- browseros-oe-agent-weekly.json Weekly eval (orchestrator + LLM executor)
|
||||
- browseros-oe-clado-weekly.json Weekly eval (orchestrator + Clado executor)
|
||||
- agisdk-real-smoke.json AGI SDK smoke run
|
||||
- infinity-hard-50.json WebArena-Infinity hard-50 set
|
||||
- test-webvoyager.json WebVoyager test
|
||||
- test-mind2web.json Mind2Web test
|
||||
|
||||
Examples:
|
||||
bun run eval # Dashboard config mode
|
||||
bun run eval -c configs/browseros-agent-weekly.json
|
||||
bun run eval -c configs/test-webvoyager.json
|
||||
`)
|
||||
process.exit(0)
|
||||
}
|
||||
|
||||
if (values.config) {
|
||||
try {
|
||||
await runEval({ configPath: values.config })
|
||||
} catch (error) {
|
||||
console.error(error instanceof Error ? error.message : String(error))
|
||||
process.exit(1)
|
||||
}
|
||||
process.exit(0)
|
||||
} else {
|
||||
// No config — start dashboard in config mode, wait for user to configure and run
|
||||
const { startDashboard } = await import('./dashboard/server')
|
||||
startDashboard({
|
||||
tasks: [],
|
||||
configName: '',
|
||||
agentType: '',
|
||||
outputDir: '',
|
||||
configMode: true,
|
||||
})
|
||||
console.log(
|
||||
'Dashboard running at http://localhost:9900 — configure and run from the UI',
|
||||
)
|
||||
|
||||
// Keep process alive until SIGINT
|
||||
await new Promise(() => {})
|
||||
try {
|
||||
await runCli(Bun.argv.slice(2))
|
||||
} catch (error) {
|
||||
console.error(error instanceof Error ? error.message : String(error))
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
41
packages/browseros-agent/apps/eval/src/publishing/r2-manifest.ts
vendored
Normal file
41
packages/browseros-agent/apps/eval/src/publishing/r2-manifest.ts
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
export interface R2UploadConfig {
|
||||
accountId: string
|
||||
accessKeyId: string
|
||||
secretAccessKey: string
|
||||
bucket: string
|
||||
cdnBaseUrl: string
|
||||
}
|
||||
|
||||
export interface R2ManifestTask {
|
||||
queryId: string
|
||||
query: string
|
||||
startUrl: string
|
||||
status: string
|
||||
durationMs: number
|
||||
screenshotCount: number
|
||||
graderResults: Record<string, unknown>
|
||||
}
|
||||
|
||||
export interface R2RunManifest {
|
||||
runId: string
|
||||
uploadedAt: string
|
||||
agentConfig?: Record<string, unknown>
|
||||
dataset?: string
|
||||
summary?: {
|
||||
passRate?: unknown
|
||||
avgDurationMs?: unknown
|
||||
}
|
||||
tasks: R2ManifestTask[]
|
||||
}
|
||||
|
||||
export interface R2PublishRunResult {
|
||||
runId: string
|
||||
uploadedFiles: number
|
||||
viewerUrl: string
|
||||
manifest: R2RunManifest
|
||||
}
|
||||
|
||||
export interface R2PublishPathResult {
|
||||
uploadedRuns: R2PublishRunResult[]
|
||||
skippedRuns: string[]
|
||||
}
|
||||
425
packages/browseros-agent/apps/eval/src/publishing/r2-publisher.ts
vendored
Normal file
425
packages/browseros-agent/apps/eval/src/publishing/r2-publisher.ts
vendored
Normal file
@@ -0,0 +1,425 @@
|
||||
import { readdir, readFile, stat } from 'node:fs/promises'
|
||||
import { basename, dirname, extname, join } from 'node:path'
|
||||
import {
|
||||
GetObjectCommand,
|
||||
PutObjectCommand,
|
||||
S3Client,
|
||||
} from '@aws-sdk/client-s3'
|
||||
import type {
|
||||
R2ManifestTask,
|
||||
R2PublishPathResult,
|
||||
R2PublishRunResult,
|
||||
R2RunManifest,
|
||||
R2UploadConfig,
|
||||
} from './r2-manifest'
|
||||
|
||||
const DEFAULT_CONCURRENCY = 20
|
||||
|
||||
const CONTENT_TYPES: Record<string, string> = {
|
||||
'.json': 'application/json',
|
||||
'.jsonl': 'application/x-ndjson',
|
||||
'.png': 'image/png',
|
||||
'.html': 'text/html',
|
||||
}
|
||||
|
||||
export interface R2Client {
|
||||
send(command: unknown): Promise<unknown>
|
||||
}
|
||||
|
||||
export interface R2PublisherOptions {
|
||||
config: R2UploadConfig
|
||||
client?: R2Client
|
||||
viewerPath?: string
|
||||
concurrency?: number
|
||||
now?: () => Date
|
||||
}
|
||||
|
||||
interface UploadJob {
|
||||
key: string
|
||||
filePath: string
|
||||
contentType: string
|
||||
}
|
||||
|
||||
interface TaskDirEntry {
|
||||
taskId: string
|
||||
taskPath: string
|
||||
canonicalLayout: boolean
|
||||
}
|
||||
|
||||
/**
 * Picks the Content-Type to upload a file with, based on its extension.
 *
 * The extension is matched case-insensitively, so `shot.PNG` is served as
 * `image/png` just like `shot.png`.
 *
 * @param filePath Path or file name; only the final extension is inspected.
 * @returns The mapped MIME type, or `application/octet-stream` when the
 *   extension is missing or not in `CONTENT_TYPES`.
 */
export function contentTypeForPath(filePath: string): string {
  const extension = extname(filePath).toLowerCase()
  return CONTENT_TYPES[extension] || 'application/octet-stream'
}
|
||||
|
||||
/**
 * Builds the R2 upload configuration from environment variables.
 *
 * Required: `EVAL_R2_ACCOUNT_ID`, `EVAL_R2_ACCESS_KEY_ID`, `EVAL_R2_SECRET_ACCESS_KEY`.
 * Optional: `EVAL_R2_BUCKET` (default `browseros-eval`) and
 * `EVAL_R2_CDN_BASE_URL` (default `https://eval.browseros.com`); an empty
 * value counts as unset. Trailing slashes are stripped from the CDN base URL.
 *
 * @param env Variable source; defaults to `process.env`.
 * @throws Error when any required variable is missing or empty.
 */
export function loadR2ConfigFromEnv(
  env: Record<string, string | undefined> = process.env,
): R2UploadConfig {
  const {
    EVAL_R2_ACCOUNT_ID: accountId,
    EVAL_R2_ACCESS_KEY_ID: accessKeyId,
    EVAL_R2_SECRET_ACCESS_KEY: secretAccessKey,
  } = env

  if (!accountId || !accessKeyId || !secretAccessKey) {
    throw new Error(
      'Missing required env vars: EVAL_R2_ACCOUNT_ID, EVAL_R2_ACCESS_KEY_ID, EVAL_R2_SECRET_ACCESS_KEY',
    )
  }

  const bucket = env.EVAL_R2_BUCKET || 'browseros-eval'
  const rawCdnBaseUrl = env.EVAL_R2_CDN_BASE_URL || 'https://eval.browseros.com'
  const cdnBaseUrl = rawCdnBaseUrl.replace(/\/+$/, '')

  return { accountId, accessKeyId, secretAccessKey, bucket, cdnBaseUrl }
}
|
||||
|
||||
/**
 * Creates an S3 client pointed at the account's Cloudflare R2 endpoint
 * (`https://<accountId>.r2.cloudflarestorage.com`, region 'auto').
 *
 * @param config Supplies the account id and the access key pair.
 */
export function createR2Client(config: R2UploadConfig): S3Client {
  const { accountId, accessKeyId, secretAccessKey } = config
  const endpoint = `https://${accountId}.r2.cloudflarestorage.com`
  return new S3Client({
    region: 'auto',
    endpoint,
    credentials: { accessKeyId, secretAccessKey },
  })
}
|
||||
|
||||
/**
 * Recursively lists every non-directory entry beneath `dir`.
 *
 * Entries are visited in the order `readdir` returns them; a directory's
 * contents appear in the result at the position of that directory.
 *
 * @returns Paths built by joining `dir` with each nested entry name.
 */
async function collectFiles(dir: string): Promise<string[]> {
  const collected: string[] = []
  for (const dirent of await readdir(dir, { withFileTypes: true })) {
    const childPath = join(dir, dirent.name)
    if (!dirent.isDirectory()) {
      collected.push(childPath)
      continue
    }
    const nested = await collectFiles(childPath)
    collected.push(...nested)
  }
  return collected
}
|
||||
|
||||
/**
 * Runs `fn` over every item using a bounded number of concurrent workers.
 *
 * Workers pull the next unclaimed index from a shared cursor, so items are
 * started in array order but may finish in any order.
 *
 * @param items Work items; an empty array resolves immediately.
 * @param concurrency Requested number of workers. Values below 1 (or NaN) are
 *   treated as 1 so that work is never silently skipped, and the count is
 *   capped at `items.length` so no idle workers are created.
 * @param fn Async handler invoked once per item.
 * @returns A promise that rejects with the first handler error (via Promise.all).
 */
async function runPool<T>(
  items: T[],
  concurrency: number,
  fn: (item: T) => Promise<void>,
): Promise<void> {
  const requested = Number.isNaN(concurrency) ? 1 : Math.floor(concurrency)
  const workerCount = Math.min(items.length, Math.max(1, requested))

  let cursor = 0
  const worker = async (): Promise<void> => {
    while (cursor < items.length) {
      // Claim the index synchronously before awaiting so no two workers share an item.
      const idx = cursor++
      await fn(items[idx])
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()))
}
|
||||
|
||||
/**
 * Tells whether `dir` directly contains a regular file named `metadata.json`.
 * A failed `stat` (for example, the path does not exist) counts as "no".
 */
async function hasMetadata(dir: string): Promise<boolean> {
  const metadataPath = join(dir, 'metadata.json')
  let info: Awaited<ReturnType<typeof stat>>
  try {
    info = await stat(metadataPath)
  } catch {
    return false
  }
  return info.isFile()
}
|
||||
|
||||
/**
 * Keeps the directory entries under `root` that contain a metadata.json file.
 * For the legacy (non-canonical) scan the `tasks` folder is skipped, because it
 * is the container for the canonical layout rather than a task itself.
 */
async function tasksWithMetadata(
  root: string,
  entries: Iterable<{ name: string; isDirectory(): boolean }>,
  canonicalLayout: boolean,
): Promise<TaskDirEntry[]> {
  const found: TaskDirEntry[] = []
  for (const entry of entries) {
    if (!entry.isDirectory()) continue
    if (!canonicalLayout && entry.name === 'tasks') continue
    const taskPath = join(root, entry.name)
    if (await hasMetadata(taskPath)) {
      found.push({ taskId: entry.name, taskPath, canonicalLayout })
    }
  }
  return found
}

/**
 * Discovers the task directories of a run.
 *
 * Two layouts are recognised:
 * - legacy: `<runDir>/<taskId>/metadata.json`
 * - canonical: `<runDir>/tasks/<taskId>/metadata.json`
 *
 * Legacy tasks win: when at least one is found they are returned and the
 * `tasks/` folder is not read at all. Otherwise the canonical tasks are
 * returned (an unreadable or missing `tasks/` folder yields an empty list).
 *
 * @throws Whatever `readdir(runDir)` throws when the run directory cannot be read.
 */
async function findTaskDirs(runDir: string): Promise<TaskDirEntry[]> {
  const rootEntries = await readdir(runDir, { withFileTypes: true })
  const legacyTasks = await tasksWithMetadata(runDir, rootEntries, false)
  if (legacyTasks.length > 0) return legacyTasks

  const tasksRoot = join(runDir, 'tasks')
  const canonicalEntries = await readdir(tasksRoot, {
    withFileTypes: true,
  }).catch(() => [])
  return tasksWithMetadata(tasksRoot, canonicalEntries, true)
}
|
||||
|
||||
/** A directory is treated as a run directory when findTaskDirs finds at least one task in it. */
async function isRunDir(dir: string): Promise<boolean> {
  const taskDirs = await findTaskDirs(dir)
  return taskDirs.length !== 0
}
|
||||
|
||||
/**
 * Builds upload jobs for the regular files sitting directly in `runDir`
 * (subdirectories are ignored). Each job's key is the bare file name; callers
 * add their own key prefix.
 */
async function collectRunRootFiles(runDir: string): Promise<UploadJob[]> {
  const rootJobs: UploadJob[] = []
  for (const dirent of await readdir(runDir, { withFileTypes: true })) {
    if (!dirent.isFile()) continue
    const filePath = join(runDir, dirent.name)
    rootJobs.push({
      key: dirent.name,
      filePath,
      contentType: contentTypeForPath(filePath),
    })
  }
  return rootJobs
}
|
||||
|
||||
/**
 * Derives a task's manifest status from its metadata.
 *
 * @param meta Parsed metadata.json contents.
 * @returns `meta.termination_reason` when it is a string (e.g. 'completed'),
 *   otherwise 'unknown'. Non-string values are rejected so the declared
 *   `string` return type always holds at runtime.
 */
function statusFromMetadata(meta: Record<string, unknown>): string {
  const reason = meta.termination_reason
  return typeof reason === 'string' ? reason : 'unknown'
}
|
||||
|
||||
/**
 * Derives a run id from a run directory path as `<parent dir name>-<dir name>`,
 * i.e. `<configName>-<timestamp>` for a `results/<configName>/<timestamp>` path.
 */
function runIdForDir(runDir: string): string {
  const parentName = basename(dirname(runDir))
  const ownName = basename(runDir)
  return [parentName, ownName].join('-')
}
|
||||
|
||||
/**
 * Publishes eval artifacts in the viewer-compatible R2 layout.
 *
 * For each run, files are uploaded under `runs/<runId>/…`, followed by a
 * `runs/<runId>/manifest.json` index and a bucket-root `viewer.html`.
 */
export class R2Publisher {
  private readonly client: R2Client
  private readonly config: R2UploadConfig
  private readonly viewerPath: string
  private readonly concurrency: number
  private readonly now: () => Date

  /** Applies defaults for every option except `config` (see R2PublisherOptions). */
  constructor(options: R2PublisherOptions) {
    this.config = options.config
    this.client = options.client ?? createR2Client(options.config)
    this.viewerPath =
      options.viewerPath ??
      join(import.meta.dirname, '..', 'dashboard', 'viewer.html')
    this.concurrency = options.concurrency ?? DEFAULT_CONCURRENCY
    this.now = options.now ?? (() => new Date())
  }

  /**
   * Reports whether `runs/<runId>/manifest.json` can be fetched from the bucket.
   * Any error from the request (missing object or otherwise) is reported as
   * `false`, so callers will attempt the upload.
   */
  async isUploaded(runId: string): Promise<boolean> {
    try {
      await this.client.send(
        new GetObjectCommand({
          Bucket: this.config.bucket,
          Key: `runs/${runId}/manifest.json`,
        }),
      )
      return true
    } catch {
      return false
    }
  }

  /**
   * Publishes a single run directory, or every run subdirectory of a config directory.
   *
   * - When `inputDir` itself contains task directories it is published as one
   *   run (without consulting `isUploaded`).
   * - Otherwise each subdirectory, in sorted name order, is published as run
   *   `<basename(inputDir)>-<subdir>`; runs whose manifest already exists are
   *   listed in `skippedRuns` instead.
   *
   * @throws Error when `inputDir` is not a directory or has no subdirectories;
   *   also propagates errors from `publishRun`.
   */
  async publishPath(inputDir: string): Promise<R2PublishPathResult> {
    const dirStat = await stat(inputDir).catch(() => null)
    if (!dirStat?.isDirectory()) {
      throw new Error(`Not a directory: ${inputDir}`)
    }

    if (await isRunDir(inputDir)) {
      const result = await this.publishRun(inputDir, runIdForDir(inputDir))
      return { uploadedRuns: [result], skippedRuns: [] }
    }

    const configName = basename(inputDir)
    const entries = await readdir(inputDir, { withFileTypes: true })
    const runDirs = entries
      .filter((entry) => entry.isDirectory())
      .map((entry) => entry.name)
      .sort()

    if (runDirs.length === 0) {
      throw new Error('No run subdirectories found')
    }

    const uploadedRuns: R2PublishRunResult[] = []
    const skippedRuns: string[] = []
    for (const dir of runDirs) {
      const runId = `${configName}-${dir}`
      if (await this.isUploaded(runId)) {
        skippedRuns.push(runId)
        continue
      }
      uploadedRuns.push(await this.publishRun(join(inputDir, dir), runId))
    }

    return { uploadedRuns, skippedRuns }
  }

  /**
   * Uploads one run: its root-level files, every file of every task that has
   * readable metadata, then the manifest and the viewer page.
   *
   * Task files are always uploaded to `runs/<runId>/<taskId>/…`; tasks found in
   * the canonical layout are additionally uploaded to `runs/<runId>/tasks/<taskId>/…`.
   *
   * @param runDir Local run directory.
   * @param runId Identifier used in object keys and the viewer link; defaults
   *   to `runIdForDir(runDir)`.
   * @returns The run id, the number of uploaded objects (task/root files plus
   *   manifest and viewer), the viewer URL and the manifest that was written.
   * @throws Error when the run has no task directories or none with parsable metadata.
   */
  async publishRun(
    runDir: string,
    runId: string = runIdForDir(runDir),
  ): Promise<R2PublishRunResult> {
    const taskEntries = await findTaskDirs(runDir)

    if (taskEntries.length === 0) {
      throw new Error(`No task subdirectories in ${runId}`)
    }

    const manifestTasks: R2ManifestTask[] = []
    const jobs: UploadJob[] = (await collectRunRootFiles(runDir)).map(
      (job) => ({
        ...job,
        key: `runs/${runId}/${job.key}`,
      }),
    )
    let agentConfig: Record<string, unknown> | undefined
    let dataset: string | undefined

    for (const taskDirEntry of taskEntries) {
      const { taskId, taskPath } = taskDirEntry
      const meta = await this.readMetadata(taskPath)
      // Tasks whose metadata cannot be read or parsed are left out entirely.
      if (!meta) continue

      // The first task that carries these fields supplies the run-level values.
      if (!agentConfig && meta.agent_config) {
        agentConfig = meta.agent_config as Record<string, unknown>
      }
      if (!dataset && meta.dataset) dataset = meta.dataset as string

      const files = await collectFiles(taskPath)
      let screenshotCount = 0
      for (const file of files) {
        // Path of the file relative to the task directory (drops "<taskPath>/").
        const relative = file.slice(taskPath.length + 1)
        if (relative.startsWith('screenshots/') && extname(file) === '.png') {
          screenshotCount++
        }
        jobs.push({
          key: `runs/${runId}/${taskId}/${relative}`,
          filePath: file,
          contentType: contentTypeForPath(file),
        })
        if (taskDirEntry.canonicalLayout) {
          jobs.push({
            key: `runs/${runId}/tasks/${taskId}/${relative}`,
            filePath: file,
            contentType: contentTypeForPath(file),
          })
        }
      }

      manifestTasks.push({
        queryId: (meta.query_id as string | undefined) || taskId,
        query: (meta.query as string | undefined) || '',
        startUrl: (meta.start_url as string | undefined) || '',
        status: statusFromMetadata(meta),
        durationMs: (meta.total_duration_ms as number | undefined) || 0,
        // Prefer the count recorded in metadata; fall back to the PNGs found on disk.
        screenshotCount:
          (meta.screenshot_count as number | undefined) || screenshotCount,
        graderResults:
          (meta.grader_results as Record<string, unknown> | undefined) || {},
      })
    }

    if (manifestTasks.length === 0) {
      throw new Error(`No completed tasks in ${runId}`)
    }

    let uploaded = 0
    await runPool(jobs, this.concurrency, async (job) => {
      await this.uploadFile(job)
      uploaded++
    })

    // The manifest goes up only after all files, so its presence marks a finished upload.
    const manifest = await this.buildManifest(
      runDir,
      runId,
      agentConfig,
      dataset,
      manifestTasks,
    )
    await this.uploadBuffer(
      `runs/${runId}/manifest.json`,
      Buffer.from(JSON.stringify(manifest, null, 2)),
      'application/json',
    )
    await this.uploadBuffer(
      'viewer.html',
      await readFile(this.viewerPath),
      'text/html',
    )

    return {
      runId,
      uploadedFiles: uploaded + 2,
      // Encode the id so characters such as spaces, '&' or '#' cannot break the query string.
      viewerUrl: `${this.config.cdnBaseUrl}/viewer.html?run=${encodeURIComponent(runId)}`,
      manifest,
    }
  }

  /** Reads and parses `<taskPath>/metadata.json`; returns null when reading or parsing fails. */
  private async readMetadata(
    taskPath: string,
  ): Promise<Record<string, unknown> | null> {
    try {
      return JSON.parse(
        await readFile(join(taskPath, 'metadata.json'), 'utf-8'),
      ) as Record<string, unknown>
    } catch {
      return null
    }
  }

  /**
   * Assembles the run manifest. `summary` is filled from `<runDir>/summary.json`
   * (passRate and avgDurationMs only) and left undefined when that file is
   * missing or unparsable.
   */
  private async buildManifest(
    runDir: string,
    runId: string,
    agentConfig: Record<string, unknown> | undefined,
    dataset: string | undefined,
    tasks: R2ManifestTask[],
  ): Promise<R2RunManifest> {
    let summaryData: Record<string, unknown> | undefined
    try {
      summaryData = JSON.parse(
        await readFile(join(runDir, 'summary.json'), 'utf-8'),
      ) as Record<string, unknown>
    } catch {
      // summary.json is optional; the manifest is written without a summary.
    }

    return {
      runId,
      uploadedAt: this.now().toISOString(),
      agentConfig,
      dataset,
      summary: summaryData
        ? {
            passRate: summaryData.passRate,
            avgDurationMs: summaryData.avgDurationMs,
          }
        : undefined,
      tasks,
    }
  }

  /** Reads the job's local file and uploads it under the job's key. */
  private async uploadFile(job: UploadJob): Promise<void> {
    await this.uploadBuffer(
      job.key,
      await readFile(job.filePath),
      job.contentType,
    )
  }

  /** Sends a single PutObject for `body` to the configured bucket. */
  private async uploadBuffer(
    key: string,
    body: Buffer,
    contentType: string,
  ): Promise<void> {
    await this.client.send(
      new PutObjectCommand({
        Bucket: this.config.bucket,
        Key: key,
        Body: body,
        ContentType: contentType,
      }),
    )
  }
}
|
||||
|
||||
/**
 * Convenience entry point: reads the R2 configuration from the environment
 * and publishes `inputDir` with a default-configured R2Publisher.
 */
export async function publishPathToR2(
  inputDir: string,
): Promise<R2PublishPathResult> {
  const publisher = new R2Publisher({ config: loadR2ConfigFromEnv() })
  return publisher.publishPath(inputDir)
}
|
||||
@@ -14,8 +14,11 @@
|
||||
*/
|
||||
|
||||
import {
|
||||
closeSync,
|
||||
existsSync,
|
||||
mkdirSync,
|
||||
mkdtempSync,
|
||||
openSync,
|
||||
readFileSync,
|
||||
rmSync,
|
||||
writeFileSync,
|
||||
@@ -33,7 +36,17 @@ export interface EvalPorts {
|
||||
|
||||
const MAX_RESTART_ATTEMPTS = 3
|
||||
const CDP_WAIT_TIMEOUT_MS = 30_000
|
||||
const SERVER_HEALTH_TIMEOUT_MS = 30_000
|
||||
// Bumped from 30s → 90s while debugging dev-CI startup. Dev's server module
|
||||
// graph is ~108 files larger than main's; cold-cache module load on a CI
|
||||
// runner can take much longer than the original 30s budget allowed.
|
||||
const SERVER_HEALTH_TIMEOUT_MS = 90_000
|
||||
|
||||
// Where per-worker server stderr is written. Captured (rather than ignored)
|
||||
// so eval-weekly.yml can upload these as workflow artifacts on failure for
|
||||
// post-mortem debugging. Path is also referenced in the workflow's artifact
|
||||
// upload step.
|
||||
const SERVER_LOG_DIR =
|
||||
process.env.BROWSEROS_SERVER_LOG_DIR || '/tmp/browseros-server-logs'
|
||||
|
||||
const MONOREPO_ROOT = join(
|
||||
dirname(fileURLToPath(import.meta.url)),
|
||||
@@ -53,6 +66,7 @@ export class BrowserOSAppManager {
|
||||
private ports: EvalPorts
|
||||
private chromeProc: Subprocess | null = null
|
||||
private serverProc: Subprocess | null = null
|
||||
private serverLogFd: number | null = null
|
||||
private tempDir: string | null = null
|
||||
private readonly workerIndex: number
|
||||
private readonly loadExtensions: boolean
|
||||
@@ -183,15 +197,36 @@ export class BrowserOSAppManager {
|
||||
VITE_BROWSEROS_SERVER_PORT: String(server),
|
||||
}
|
||||
|
||||
// Capture both stdout and stderr to a per-worker file so we can
|
||||
// post-mortem startup hangs. The server uses pino which writes logs to
|
||||
// stdout by default — capturing stderr alone misses everything. The
|
||||
// eval-weekly workflow uploads /tmp/browseros-server-logs/ as a workflow
|
||||
// artifact on failure.
|
||||
// Open the per-worker log file under SERVER_LOG_DIR. If the directory
|
||||
// can't be created or the file can't be opened (e.g. unwritable custom
|
||||
// BROWSEROS_SERVER_LOG_DIR), fall back to /dev/null so spawn still works.
|
||||
const logPath = join(SERVER_LOG_DIR, `server-W${this.workerIndex}.log`)
|
||||
let logFd: number
|
||||
try {
|
||||
mkdirSync(SERVER_LOG_DIR, { recursive: true })
|
||||
logFd = openSync(logPath, 'a')
|
||||
} catch {
|
||||
logFd = openSync('/dev/null', 'w')
|
||||
}
|
||||
this.serverLogFd = logFd
|
||||
|
||||
// `start:ci` skips `--watch` (no file-watcher overhead in CI). Falls back
|
||||
// to the regular `start` script outside CI for the dev-watch experience.
|
||||
const startScript = process.env.CI ? 'start:ci' : 'start'
|
||||
this.serverProc = spawn({
|
||||
cmd: ['bun', 'run', '--filter', '@browseros/server', 'start'],
|
||||
cmd: ['bun', 'run', '--filter', '@browseros/server', startScript],
|
||||
cwd: MONOREPO_ROOT,
|
||||
stdout: 'ignore',
|
||||
stderr: 'ignore',
|
||||
stdout: logFd,
|
||||
stderr: logFd,
|
||||
env: serverEnv,
|
||||
})
|
||||
console.log(
|
||||
` [W${this.workerIndex}] Server started (PID: ${this.serverProc.pid})`,
|
||||
` [W${this.workerIndex}] Server started (PID: ${this.serverProc.pid}, logs → ${logPath})`,
|
||||
)
|
||||
|
||||
// --- Wait for Server Health ---
|
||||
@@ -244,6 +279,18 @@ export class BrowserOSAppManager {
|
||||
await this.killProcess(this.serverProc)
|
||||
this.serverProc = null
|
||||
|
||||
// Close the parent's copy of the server log fd. Child kept its own dup
|
||||
// until it exited above, so closing here doesn't truncate any output.
|
||||
// Without this we'd leak one fd per restart attempt across all workers.
|
||||
if (this.serverLogFd !== null) {
|
||||
try {
|
||||
closeSync(this.serverLogFd)
|
||||
} catch {
|
||||
// already closed or invalid — ignore
|
||||
}
|
||||
this.serverLogFd = null
|
||||
}
|
||||
|
||||
// Kill Chrome (graceful → force)
|
||||
await this.killProcess(this.chromeProc)
|
||||
this.chromeProc = null
|
||||
|
||||
@@ -1,362 +1 @@
|
||||
import { mkdir, writeFile } from 'node:fs/promises'
|
||||
import { basename, dirname, join, resolve } from 'node:path'
|
||||
import {
|
||||
dashboardState,
|
||||
setActiveExecutor,
|
||||
startDashboard,
|
||||
stopDashboard,
|
||||
} from '../dashboard/server'
|
||||
import type { ErrorSource, EvalConfig, Task } from '../types'
|
||||
import {
|
||||
printValidationResult,
|
||||
validateConfig,
|
||||
} from '../utils/config-validator'
|
||||
import { ParallelExecutor } from './parallel-executor'
|
||||
import {
|
||||
getTaskSourceDescription,
|
||||
loadTasks,
|
||||
TaskLoadError,
|
||||
} from './task-loader'
|
||||
import type {
|
||||
BatchSummary,
|
||||
RunEvalOptions,
|
||||
TaskResult,
|
||||
TaskResultSummary,
|
||||
TaskSource,
|
||||
} from './types'
|
||||
import { getPrimaryGraderResult, isSuccessfulResult } from './types'
|
||||
|
||||
// ============================================================================
|
||||
// Main Entry Point
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Runs a full evaluation: validates the config, resolves paths, loads tasks,
 * starts the dashboard, executes the tasks and writes/prints the summary.
 *
 * The dashboard is stopped in a `finally` block, so it is shut down even when
 * task execution or summary writing throws.
 *
 * @throws Error when config validation or task loading fails, or when any
 *   later step rejects.
 */
export async function runEval(options: RunEvalOptions): Promise<void> {
  // Step 1: Validate configuration
  const config = await loadAndValidateConfig(options.configPath)

  // Step 2: Resolve paths relative to config location
  const configDir = dirname(resolve(options.configPath))
  const resolvedPaths = resolvePaths(options, config, configDir)

  // Log configuration
  console.log('Eval Configuration:')
  console.log(` Config: ${options.configPath}`)
  console.log(` Dataset: ${resolvedPaths.dataPath}`)
  console.log(` Output: ${resolvedPaths.outputDir}`)
  console.log(` Workers: ${config.num_workers}`)
  console.log(` Agent: ${config.agent.type}`)
  console.log()

  // Step 3: Load tasks
  const taskSource = resolveTaskSource(options, resolvedPaths.dataPath)
  const { tasks } = await loadTasksWithLogging(taskSource)

  // Step 4: Setup
  await mkdir(resolvedPaths.outputDir, { recursive: true })

  // Step 5: Start dashboard
  startDashboard({
    tasks,
    configName: options.configPath,
    agentType: config.agent.type,
    outputDir: resolvedPaths.outputDir,
  })

  try {
    // Step 6: Execute tasks (parallel or sequential based on num_workers)
    const results = await executeTasks(tasks, config, resolvedPaths.outputDir)

    // Step 7: Summary
    const summary = buildSummary(results)
    await saveSummary(summary, resolvedPaths.outputDir)
    printSummary(summary)
    console.log(`\nResults saved to: ${resolvedPaths.outputDir}`)
  } finally {
    // Always release the dashboard, including on failure.
    stopDashboard()
  }
}
|
||||
|
||||
// ============================================================================
|
||||
// Configuration
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Validates the config file at `configPath`, prints the validation report,
 * and returns the parsed config.
 *
 * @throws Error when validation reports the config as invalid or yields no config.
 */
async function loadAndValidateConfig(configPath: string) {
  console.log('Validating configuration...')
  const outcome = await validateConfig(configPath)
  printValidationResult(outcome)

  if (outcome.valid && outcome.config) {
    return outcome.config
  }

  throw new Error(
    'Configuration validation failed. Fix the above errors and try again.',
  )
}
|
||||
|
||||
/** Paths computed by resolvePaths for one eval run. */
interface ResolvedPaths {
  // Dataset location: the CLI-provided path, or the config's dataset resolved against the config directory.
  dataPath: string
  // Per-run output directory: {results base}/{config-name}/{timestamp}.
  outputDir: string
}
|
||||
|
||||
/**
 * Computes the dataset path and the per-run output directory.
 *
 * - Dataset: `options.dataPath` when provided, otherwise `config.dataset`
 *   resolved against `configDir`.
 * - Output: `{results base}/{config-name}/{timestamp}`, where the results base
 *   is `config.output_dir` resolved against `configDir`, or `../results`
 *   next to the config directory when `output_dir` is not set.
 *
 * `path.resolve` returns an absolute right-most argument as-is (normalized),
 * so absolute paths are honoured on every platform without a manual
 * `startsWith('/')` check.
 */
function resolvePaths(
  options: RunEvalOptions,
  config: EvalConfig,
  configDir: string,
): ResolvedPaths {
  const dataPath = options.dataPath || resolve(configDir, config.dataset)

  // Config name derived from config filename (e.g., "browseros-agent-weekly.json" → "browseros-agent-weekly")
  const configName = options.configPath
    ? basename(resolve(options.configPath), '.json')
    : 'eval'
  const timestamp = formatTimestamp(new Date())
  const resultsBase = config.output_dir
    ? resolve(configDir, config.output_dir)
    : resolve(configDir, '..', 'results')
  const outputDir = join(resultsBase, configName, timestamp)

  return { dataPath, outputDir }
}
|
||||
|
||||
/**
 * Formats a date as `YYYY-MM-DD-HHmm` using the date's local-time fields.
 * Month, day, hour and minute are zero-padded to two digits.
 */
function formatTimestamp(date: Date): string {
  const pad2 = (value: number): string => String(value).padStart(2, '0')
  const datePart = [
    String(date.getFullYear()),
    pad2(date.getMonth() + 1),
    pad2(date.getDate()),
  ].join('-')
  const timePart = pad2(date.getHours()) + pad2(date.getMinutes())
  return `${datePart}-${timePart}`
}
|
||||
|
||||
// ============================================================================
|
||||
// Task Loading
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Chooses where tasks come from: a single ad-hoc task when `options.query`
 * is set (non-empty), otherwise the dataset file at `dataPath`.
 */
function resolveTaskSource(
  options: RunEvalOptions,
  dataPath: string,
): TaskSource {
  const { query, startUrl } = options
  return query
    ? { type: 'single', query, startUrl }
    : { type: 'file', path: dataPath }
}
|
||||
|
||||
/**
 * Loads tasks from `source`, logging the source description beforehand and
 * the task count afterwards.
 *
 * @throws Error prefixed with "Failed to load tasks: ". The detail is the
 *   message of a TaskLoadError, or the stringified value for anything else.
 */
async function loadTasksWithLogging(
  source: TaskSource,
): Promise<{ tasks: Awaited<ReturnType<typeof loadTasks>>['tasks'] }> {
  console.log(`Loading tasks from ${getTaskSourceDescription(source)}...`)

  try {
    const { tasks } = await loadTasks(source)
    console.log(`Loaded ${tasks.length} task(s)`)
    return { tasks }
  } catch (error) {
    const detail = error instanceof TaskLoadError ? error.message : `${error}`
    throw new Error(`Failed to load tasks: ${detail}`)
  }
}
|
||||
|
||||
// ============================================================================
|
||||
// Task Execution
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Runs all tasks through a ParallelExecutor and returns their results.
 *
 * The worker count is `config.num_workers`, falling back to 1. Stream events
 * are forwarded to the dashboard, and the executor is registered as the active
 * executor for the duration of the run (cleared again in `finally`).
 */
async function executeTasks(
  tasks: Task[],
  config: EvalConfig,
  outputDir: string,
): Promise<TaskResult[]> {
  const banner = '='.repeat(60)
  console.log(`\n${banner}`)
  console.log('STARTING EVALUATION')
  console.log(`${banner}\n`)

  const numWorkers = config.num_workers || 1
  const restartServerPerTask = config.restart_server_per_task
  console.log(`Running with ${numWorkers} worker(s)`)
  if (restartServerPerTask) {
    console.log(`Server restart per task: enabled`)
  }
  console.log()

  const executor = new ParallelExecutor({
    numWorkers,
    config,
    outputDir,
    restartServerPerTask,
    onEvent: (taskId, event) =>
      dashboardState.broadcastStreamEvent(taskId, event),
  })

  // Register so dashboard stop button works for CLI runs too
  setActiveExecutor(executor)
  try {
    const results = await executor.execute(
      tasks,
      (completed, total, task, result) => {
        printTaskProgress(completed, total, task, result)
      },
    )
    return results
  } finally {
    setActiveExecutor(null)
  }
}
|
||||
|
||||
/**
 * Prints a one-line progress entry for a finished task, followed by detail lines:
 * the error message for failed tasks, or agent errors and per-grader PASS/FAIL
 * verdicts for successful results.
 */
function printTaskProgress(
  completed: number,
  total: number,
  task: Task,
  result: TaskResult,
): void {
  let status = 'FAILED'
  if (result.status === 'completed') {
    status = 'DONE'
  } else if (result.status === 'timeout') {
    status = 'TIMEOUT'
  }

  // Duration suffix is omitted when no positive duration was recorded.
  let duration = ''
  if (result.durationMs > 0) {
    duration = ` (${(result.durationMs / 1000).toFixed(1)}s)`
  }

  console.log(`[${completed}/${total}] ${task.query_id}: ${status}${duration}`)

  if (result.status === 'failed') {
    console.log(` ERROR: ${result.error.message}`)
    return
  }
  if (!isSuccessfulResult(result)) return

  // Log agent errors (e.g., LLM API failures) even if task "completed"
  const agentErrors = result.agentResult.metadata.errors
  if (agentErrors?.length) {
    for (const agentError of agentErrors) {
      console.log(` ERROR [${agentError.source}]: ${agentError.message}`)
    }
  }
  for (const [graderName, graderResult] of Object.entries(
    result.graderResults,
  )) {
    const verdict = graderResult.pass ? 'PASS' : 'FAIL'
    console.log(` ${graderName}: ${verdict}`)
  }
}
|
||||
|
||||
// ============================================================================
|
||||
// Summary
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Aggregates task results into a batch summary.
 *
 * - Per task: status, duration, grader pass/score pairs (successful results
 *   only), error/warning counts, error sources and the failure reason.
 * - Batch: counts per status, pass rate over tasks that have a primary grader
 *   result, average duration of non-failed tasks, errors grouped by source and
 *   the total number of warnings.
 */
function buildSummary(results: TaskResult[]): BatchSummary {
  // Track errors by source
  const errorsBySource: Partial<Record<ErrorSource, number>> = {}
  let totalWarnings = 0

  const recordError = (source: ErrorSource): void => {
    errorsBySource[source] = (errorsBySource[source] ?? 0) + 1
  }

  const taskSummaries: TaskResultSummary[] = []
  for (const r of results) {
    let errorCount = 0
    let warningCount = 0
    let errorSources: ErrorSource[] | undefined
    let failureReason: string | undefined

    if (isSuccessfulResult(r)) {
      // Count errors and warnings from agent metadata
      const agentErrors = r.agentResult.metadata.errors
      errorCount = agentErrors?.length ?? 0
      warningCount = r.agentResult.metadata.warnings?.length ?? 0
      totalWarnings += warningCount

      if (agentErrors?.length) {
        errorSources = agentErrors.map((e) => e.source)
        for (const agentError of agentErrors) {
          recordError(agentError.source)
        }
      }
    } else {
      // Failed task: exactly one error, attributed to its recorded source.
      errorCount = 1
      errorSources = [r.errorSource]
      failureReason = r.error.message
      recordError(r.errorSource)
    }

    // Property order is kept stable because this object is serialized to summary.json.
    taskSummaries.push({
      queryId: r.task.query_id,
      status: r.status,
      durationMs: r.durationMs,
      graderResults: isSuccessfulResult(r)
        ? Object.fromEntries(
            Object.entries(r.graderResults).map(([name, gr]) => [
              name,
              { pass: gr.pass, score: gr.score },
            ]),
          )
        : undefined,
      errorCount,
      warningCount,
      errorSources: errorSources?.length ? errorSources : undefined,
      failureReason,
    })
  }

  const countWithStatus = (status: TaskResult['status']): number =>
    results.filter((r) => r.status === status).length
  const completed = countWithStatus('completed')
  const timeout = countWithStatus('timeout')
  const failed = countWithStatus('failed')

  // Calculate pass rate using primary grader (fallback order)
  let totalGraded = 0
  let totalPasses = 0
  for (const result of results) {
    if (!isSuccessfulResult(result)) continue
    const primary = getPrimaryGraderResult(result.graderResults)
    if (!primary) continue
    totalGraded++
    if (primary.pass) totalPasses++
  }
  const passRate = totalGraded === 0 ? 0 : totalPasses / totalGraded

  // Calculate average duration for non-failed tasks
  let durationSum = 0
  let timedCount = 0
  for (const r of results) {
    if (r.status === 'failed') continue
    durationSum += r.durationMs
    timedCount++
  }
  const avgDurationMs = timedCount === 0 ? 0 : durationSum / timedCount

  return {
    total: results.length,
    completed,
    failed,
    timeout,
    passRate,
    avgDurationMs,
    errorsBySource,
    totalWarnings,
    results: taskSummaries,
  }
}
|
||||
|
||||
/** Writes the summary as 2-space-indented JSON to `<outputDir>/summary.json`. */
async function saveSummary(
  summary: BatchSummary,
  outputDir: string,
): Promise<void> {
  const summaryPath = join(outputDir, 'summary.json')
  const serialized = JSON.stringify(summary, null, 2)
  await writeFile(summaryPath, serialized)
}
|
||||
|
||||
/**
 * Prints the end-of-run report: a banner, the task totals per status, the
 * pass rate as a percentage and the average duration in seconds (both with
 * one decimal place).
 */
function printSummary(summary: BatchSummary): void {
  const rule = '='.repeat(60)
  const passPercent = (summary.passRate * 100).toFixed(1)
  const avgSeconds = (summary.avgDurationMs / 1000).toFixed(1)

  const reportLines = [
    rule,
    'EVALUATION COMPLETE',
    rule,
    `Total: ${summary.total} tasks`,
    ` Completed: ${summary.completed}`,
    ` Timeout: ${summary.timeout}`,
    ` Failed: ${summary.failed}`,
    ` Pass Rate: ${passPercent}%`,
    ` Avg Duration: ${avgSeconds}s`,
  ]
  for (const line of reportLines) {
    console.log(line)
  }
}
|
||||
export { runEval } from '../runs/eval-runner'
|
||||
|
||||
@@ -1,266 +1,5 @@
|
||||
/**
|
||||
* Parallel Executor
|
||||
*
|
||||
* Each worker gets its own isolated BrowserOS stack:
|
||||
* - BrowserOSAppManager (Chrome + Server on unique ports)
|
||||
* - TaskExecutor (uses that worker's server URL)
|
||||
*
|
||||
* Port allocation: Worker N → CDP=base+N, Server=base+N, Extension=base+N
|
||||
*/
|
||||
|
||||
import type { EvalConfig, Task } from '../types'
|
||||
import { BrowserOSAppManager, type EvalPorts } from './browseros-app-manager'
|
||||
import { createTaskExecutor } from './task-executor'
|
||||
import type { TaskResult } from './types'
|
||||
|
||||
// ============================================================================
|
||||
// Types
|
||||
// ============================================================================
|
||||
|
||||
/** Settings for a ParallelExecutor run. */
export interface ParallelExecutorConfig {
  /** Requested number of workers; the executor raises values below 1 to 1. */
  numWorkers: number
  /** Eval configuration; each worker gets a copy with `browseros.server_url` pointed at its own server. */
  config: EvalConfig
  /** Output directory handed to each worker's task executor. */
  outputDir: string
  /** When true, a worker restarts its BrowserOS stack before every task (not just once at startup). */
  restartServerPerTask?: boolean
  /** Receives stream events keyed by task id, e.g. `{ type: 'task-state', status: 'running' }`. */
  onEvent?: (taskId: string, event: Record<string, unknown>) => void
}
|
||||
|
||||
/**
 * Progress notification passed to `ParallelExecutor.execute`.
 * NOTE(review): presumably invoked once per finished task with the running
 * completed count — the call site is outside this excerpt; confirm.
 */
export type ProgressCallback = (
  completed: number,
  total: number,
  task: Task,
  result: TaskResult,
) => void
|
||||
|
||||
// ============================================================================
|
||||
// Task Queue (thread-safe for single-threaded async — index is atomic)
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Hands out tasks one at a time, in order, from a private copy of the input
 * list. Once `stop()` is called, `next()` returns null even if tasks remain.
 */
class TaskQueue {
  private readonly pending: Task[]
  private cursor = 0
  private halted = false

  constructor(tasks: Task[]) {
    // Copy so later changes to the caller's array do not affect the queue.
    this.pending = tasks.slice()
  }

  /** Returns the next task, or null when the queue is stopped or exhausted. */
  next(): Task | null {
    if (this.halted) return null
    if (this.cursor >= this.pending.length) return null
    const task = this.pending[this.cursor]
    this.cursor += 1
    return task
  }

  /** Prevents any further tasks from being handed out. */
  stop(): void {
    this.halted = true
  }
}
|
||||
|
||||
// ============================================================================
|
||||
// Parallel Executor
|
||||
// ============================================================================
|
||||
|
||||
export class ParallelExecutor {
|
||||
private readonly numWorkers: number
|
||||
private readonly appManagers = new Map<number, BrowserOSAppManager>()
|
||||
private completedCount: number = 0
|
||||
private readonly resultLock = new Map<string, TaskResult>()
|
||||
private queue: TaskQueue | null = null
|
||||
|
||||
constructor(private readonly config: ParallelExecutorConfig) {
|
||||
this.numWorkers = Math.max(1, config.numWorkers)
|
||||
}
|
||||
|
||||
async stop(): Promise<void> {
|
||||
console.log('\nStopping eval run...')
|
||||
this.queue?.stop()
|
||||
const kills = [...this.appManagers.values()].map((m) => m.killApp())
|
||||
await Promise.allSettled(kills)
|
||||
}
|
||||
|
||||
async execute(
|
||||
tasks: Task[],
|
||||
onProgress?: ProgressCallback,
|
||||
): Promise<TaskResult[]> {
|
||||
if (tasks.length === 0) return []
|
||||
|
||||
const cleanup = this.setupSignalHandlers()
|
||||
|
||||
const loadExtensions = this.config.config.browseros.load_extensions ?? false
|
||||
|
||||
// Patch NopeCHA API key before launching any workers
|
||||
const captchaConfig = this.config.config.captcha
|
||||
if (captchaConfig) {
|
||||
const apiKey = process.env[captchaConfig.api_key_env]
|
||||
if (apiKey) {
|
||||
BrowserOSAppManager.patchNopechaApiKey(apiKey)
|
||||
}
|
||||
}
|
||||
|
||||
this.queue = new TaskQueue(tasks)
|
||||
const totalTasks = tasks.length
|
||||
|
||||
try {
|
||||
const queue = this.queue
|
||||
// Launch N workers in parallel — each gets its own Chrome + Server
|
||||
const workers = Array.from({ length: this.numWorkers }, (_, i) =>
|
||||
this.runWorker(i, queue, totalTasks, loadExtensions, onProgress),
|
||||
)
|
||||
await Promise.all(workers)
|
||||
|
||||
// Return results in original task order
|
||||
return tasks.map((task) => {
|
||||
const result = this.resultLock.get(task.query_id)
|
||||
if (!result) {
|
||||
return {
|
||||
status: 'failed' as const,
|
||||
task,
|
||||
error: new Error('Task result not found'),
|
||||
errorSource: 'unknown' as const,
|
||||
durationMs: 0,
|
||||
}
|
||||
}
|
||||
return result
|
||||
})
|
||||
} finally {
|
||||
cleanup()
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Worker loop: creates one `BrowserOSAppManager` and one task executor for
 * this worker, then pulls tasks from `queue` until it yields nothing.
 *
 * For each task the worker emits a `running` task-state event, executes the
 * task, records the result, emits a final task-state event (with grader
 * results when present), and reports progress. The app is always killed when
 * the loop ends, whether normally or by an exception.
 *
 * @param workerIndex Index of this worker; passed to the app manager and the
 *   executor.
 * @param queue Shared task queue.
 * @param totalTasks Total number of tasks in the run, forwarded to `onProgress`.
 * @param loadExtensions Forwarded to the app manager.
 * @param onProgress Optional progress callback invoked after every task.
 */
private async runWorker(
  workerIndex: number,
  queue: TaskQueue,
  totalTasks: number,
  loadExtensions: boolean,
  onProgress?: ProgressCallback,
): Promise<void> {
  const evalConfig = this.config.config
  const browseros = evalConfig.browseros

  // Base ports come from config; the app manager also receives the worker index.
  const basePorts: EvalPorts = {
    cdp: browseros.base_cdp_port,
    server: browseros.base_server_port,
    extension: browseros.base_extension_port,
  }
  const appManager = new BrowserOSAppManager(
    workerIndex,
    basePorts,
    loadExtensions,
    browseros.headless ?? false,
  )
  this.appManagers.set(workerIndex, appManager)

  // The executor gets a config copy whose server_url points at this worker's server.
  const workerConfig: typeof this.config.config = {
    ...evalConfig,
    browseros: { ...browseros, server_url: appManager.getServerUrl() },
  }
  const executor = createTaskExecutor(
    workerConfig,
    workerIndex,
    this.config.outputDir,
    this.config.onEvent,
  )

  try {
    // The stack is always started once up front, regardless of per-task restarts.
    console.log(`\n Worker ${workerIndex}: Starting BrowserOS stack...`)
    await appManager.restart()

    for (let task = queue.next(); task; task = queue.next()) {
      const startedAt = Date.now()
      let result: TaskResult

      try {
        // Optional restart before each task.
        if (this.config.restartServerPerTask) {
          console.log(`\n${'─'.repeat(60)}`)
          console.log(` Worker ${workerIndex}: Task: ${task.query_id}`)
          console.log(`${'─'.repeat(60)}`)
          await appManager.restart()
        }

        this.config.onEvent?.(task.query_id, {
          type: 'task-state',
          taskId: task.query_id,
          status: 'running',
        })
        result = await executor.execute(task)
        console.log(
          ` Worker ${workerIndex}: ${task.query_id}: ${result.status}`,
        )
      } catch (error) {
        // Anything thrown here (restart, event callback, executor) becomes a failed result.
        const failure = error instanceof Error ? error : new Error(String(error))
        console.error(
          ` Worker ${workerIndex}: ${task.query_id}: FAILED - ${failure.message}`,
        )
        result = {
          status: 'failed',
          task,
          error: failure,
          errorSource: 'unknown',
          durationMs: Date.now() - startedAt,
        }
      }

      this.resultLock.set(task.query_id, result)
      this.completedCount += 1

      // Final task-state event for the dashboard.
      const stateEvent: Record<string, unknown> = {
        type: 'task-state',
        taskId: task.query_id,
        status: result.status,
        durationMs: result.durationMs,
      }
      if (result.status !== 'failed' && 'graderResults' in result) {
        // Forward only the four summary fields of each grader result.
        stateEvent.graderResults = Object.fromEntries(
          Object.entries(result.graderResults).map(
            ([name, { pass, score, reasoning, details }]) => [
              name,
              { pass, score, reasoning, details },
            ],
          ),
        )
        stateEvent.screenshotCount =
          result.agentResult?.metadata?.total_steps ?? 0
      }
      this.config.onEvent?.(task.query_id, stateEvent)

      onProgress?.(this.completedCount, totalTasks, task, result)

      // Two-second pause between tasks when restarting per task.
      if (this.config.restartServerPerTask) {
        await new Promise((resolve) => setTimeout(resolve, 2000))
      }
    }
  } finally {
    await appManager.killApp()
  }
}
|
||||
|
||||
/**
 * Installs SIGINT/SIGTERM listeners that stop the queue, kill every
 * registered app manager's app, and then exit the process with code 0.
 *
 * @returns A disposer that removes both listeners; `execute()` calls it when
 *   the run finishes.
 */
private setupSignalHandlers(): () => void {
  const signals = ['SIGINT', 'SIGTERM'] as const

  const shutdown = async () => {
    console.log('\nShutting down all workers...')
    this.queue?.stop()
    // Wait for every kill attempt to settle before exiting.
    await Promise.allSettled(
      Array.from(this.appManagers.values(), (manager) => manager.killApp()),
    )
    process.exit(0)
  }

  for (const signal of signals) {
    process.on(signal, shutdown)
  }

  return () => {
    for (const signal of signals) {
      process.off(signal, shutdown)
    }
  }
}
|
||||
}
|
||||
export {
|
||||
type ProgressCallback,
|
||||
TaskWorkerPool as ParallelExecutor,
|
||||
type TaskWorkerPoolConfig as ParallelExecutorConfig,
|
||||
} from '../runs/task-worker-pool'
|
||||
|
||||
@@ -1,316 +1,6 @@
|
||||
import { join } from 'node:path'
|
||||
import { createAgent } from '../agents'
|
||||
import type { AgentContext, AgentResult } from '../agents/types'
|
||||
import { CaptureContext } from '../capture/context'
|
||||
import {
|
||||
hasExistingGraderResults,
|
||||
TrajectorySaver,
|
||||
} from '../capture/trajectory-saver'
|
||||
import { runGraders } from '../graders/registry'
|
||||
import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
|
||||
import { callMcpTool } from '../utils/mcp-client'
|
||||
import { InfinityAppManager } from './infinity-app-manager'
|
||||
import type { TaskResult } from './types'
|
||||
|
||||
// ============================================================================
|
||||
// Errors
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Error raised when a task fails in a specific pipeline phase.
 *
 * Carries the failing task, the phase in which the failure happened, and the
 * optional underlying error. `errorSource` mirrors `phase`.
 */
export class TaskExecutionError extends Error {
  /** Task that was being executed when the failure occurred. */
  public readonly task: Task
  /** Pipeline phase in which the failure occurred. */
  public readonly phase: 'navigation' | 'agent_execution' | 'grading' | 'cleanup'
  /** Underlying error, when one was available. */
  public readonly cause?: Error
  /** Same value as `phase`, typed as an `ErrorSource`. */
  public readonly errorSource: ErrorSource

  constructor(
    message: string,
    task: Task,
    phase: 'navigation' | 'agent_execution' | 'grading' | 'cleanup',
    cause?: Error,
  ) {
    super(message)
    this.task = task
    this.phase = phase
    this.cause = cause
    this.name = 'TaskExecutionError'
    this.errorSource = phase as ErrorSource
  }
}
|
||||
|
||||
// ============================================================================
|
||||
// Task Executor
|
||||
// ============================================================================
|
||||
|
||||
/** Optional collaborators injected into a `TaskExecutor`. */
export interface TaskExecutorDeps {
  /**
   * Optional event sink. Receives the task id and a free-form event payload;
   * the executor forwards it to the capture context it creates per task.
   */
  onEvent?: (taskId: string, event: Record<string, unknown>) => void
}
|
||||
|
||||
/**
 * Runs a single eval task end to end against one worker's BrowserOS server:
 * resume check, optional Infinity app start-up, viewport + navigation, agent
 * execution, grading, and cleanup.
 */
export class TaskExecutor {
  constructor(
    private readonly config: EvalConfig,
    private readonly workerIndex: number,
    private readonly outputDir: string,
    private readonly deps: TaskExecutorDeps,
  ) {}

  /**
   * Resolves the initial page ID via a `list_pages` MCP call.
   *
   * The first line of the tool's text output that starts with `<digits>.` is
   * parsed as the page id. Any error, or output without such a line, falls
   * back to page `1`.
   *
   * @param mcpUrl MCP endpoint of this worker's server.
   * @returns The parsed page id, or `1` as a fallback.
   */
  private async resolveInitialPageId(mcpUrl: string): Promise<number> {
    try {
      const result = await callMcpTool(mcpUrl, 'list_pages', {})
      if (!result.isError) {
        const textContent = result.content?.find(
          (c: { type: string }) => c.type === 'text',
        )
        const match = textContent?.text?.match(/^\s*(\d+)\./m)
        if (match) return Number.parseInt(match[1], 10)
      }
    } catch {
      // Best effort: fall through to the default below.
    }
    // Fresh browser always has page 1
    return 1
  }

  /**
   * Executes one task and returns its result.
   *
   * If grader results already exist for the task in the output directory, the
   * task is skipped and a result is rebuilt from the stored metadata.
   * Otherwise the task runs through app start-up (Infinity tasks only),
   * navigation, agent execution, and grading. Failures in any of those phases
   * are returned as a `failed` result (with the phase as `errorSource` when
   * known) rather than thrown. Cleanup — navigating to `about:blank` and
   * stopping the Infinity app — always runs once the pipeline has started.
   *
   * @param task Task to execute.
   * @returns A `completed`, `timeout`, or `failed` task result.
   */
  async execute(task: Task): Promise<TaskResult> {
    const startTime = Date.now()
    const mcpUrl = `${this.config.browseros.server_url}/mcp`

    // Check if task already has grader results (resume capability)
    const existing = await hasExistingGraderResults(
      this.outputDir,
      task.query_id,
    )
    if (existing.exists && existing.metadata) {
      console.log(` Skipping: already has grader results`)
      return {
        status:
          existing.metadata.termination_reason === 'timeout'
            ? 'timeout'
            : 'completed',
        task,
        agentResult: {
          metadata: existing.metadata,
          messages: [],
          finalAnswer: existing.metadata.final_answer,
        },
        graderResults: existing.metadata.grader_results,
        durationMs: existing.metadata.total_duration_ms,
      }
    }

    // Resolve page ID once; the same id is reused for navigation and cleanup.
    const pageId = await this.resolveInitialPageId(mcpUrl)

    let infinityManager: InfinityAppManager | null = null
    let actualStartUrl = task.start_url

    try {
      // Phase 0: for Infinity tasks, start a fresh app server per task.
      // This lives inside the try so that a start-up failure is reported as a
      // failed result with source 'navigation', and so the `finally` below
      // still stops a partially started app.
      if (task.dataset === 'webarena-infinity') {
        const additional = task.metadata?.additional as
          | Record<string, unknown>
          | undefined
        const appName = additional?.app_name as string
        const appBasePort = (additional?.app_base_port as number) || 8000

        if (appName && process.env.WEBARENA_INFINITY_DIR) {
          infinityManager = new InfinityAppManager(
            this.workerIndex,
            appBasePort,
          )
          try {
            actualStartUrl = await infinityManager.startApp(appName)
            console.log(
              ` Infinity app "${appName}" started on port ${infinityManager.getPort()}`,
            )
          } catch (error) {
            throw new TaskExecutionError(
              `Failed to start Infinity app: ${error instanceof Error ? error.message : String(error)}`,
              task,
              'navigation',
              error instanceof Error ? error : undefined,
            )
          }
        }
      }

      // Phase 1: Set viewport + navigate to start URL.
      // A failed resize is only logged; it does not fail the task.
      try {
        await callMcpTool(mcpUrl, 'evaluate_script', {
          page: pageId,
          expression: 'window.resizeTo(1440, 900)',
        })
      } catch (vpError) {
        console.warn(
          ` Viewport resize failed: ${vpError instanceof Error ? vpError.message : String(vpError)}`,
        )
      }

      if (actualStartUrl && actualStartUrl !== 'about:blank') {
        try {
          await callMcpTool(mcpUrl, 'navigate_page', {
            url: actualStartUrl,
            page: pageId,
          })
        } catch (error) {
          throw new TaskExecutionError(
            `Failed to navigate to start URL: ${error instanceof Error ? error.message : String(error)}`,
            task,
            'navigation',
            error instanceof Error ? error : undefined,
          )
        }
      }

      // Phase 2: Execute agent
      const agentResult = await this.executeAgent(task, pageId)

      // Phase 3: Run graders
      const graderResults = await this.runGraders(
        task,
        agentResult,
        infinityManager?.getUrl(),
      )

      const status =
        agentResult.metadata.termination_reason === 'timeout'
          ? 'timeout'
          : 'completed'

      return {
        status,
        task,
        agentResult,
        graderResults,
        durationMs: Date.now() - startTime,
      }
    } catch (error) {
      // Phase-tagged errors keep their source; anything else is 'unknown'.
      const errorSource: ErrorSource =
        error instanceof TaskExecutionError ? error.errorSource : 'unknown'

      return {
        status: 'failed',
        task,
        error: error instanceof Error ? error : new Error(String(error)),
        errorSource,
        durationMs: Date.now() - startTime,
      }
    } finally {
      // Navigate to about:blank to clean up
      try {
        await callMcpTool(mcpUrl, 'navigate_page', {
          url: 'about:blank',
          page: pageId,
        })
      } catch {
        // Ignore cleanup errors
      }

      // Stop Infinity app server if one was created (best effort).
      if (infinityManager) {
        await infinityManager.stop().catch(() => {})
      }
    }
  }

  /**
   * Creates the capture context and agent for the task, then runs the agent.
   *
   * @param task Task being executed.
   * @param pageId Initial page id handed to the capture context and agent.
   * @returns The agent's result.
   * @throws TaskExecutionError with phase `agent_execution` for any failure
   *   that is not already a `TaskExecutionError`.
   */
  private async executeAgent(task: Task, pageId: number): Promise<AgentResult> {
    try {
      const { capture, taskOutputDir } = await CaptureContext.create({
        serverUrl: this.config.browseros.server_url,
        outputDir: this.outputDir,
        taskId: task.query_id,
        initialPageId: pageId,
        onEvent: this.deps.onEvent,
      })

      const context: AgentContext = {
        config: this.config,
        task,
        workerIndex: this.workerIndex,
        initialPageId: pageId,
        outputDir: this.outputDir,
        taskOutputDir,
        capture,
      }

      const agent = createAgent(context)
      return await agent.execute()
    } catch (error) {
      if (error instanceof TaskExecutionError) {
        throw error
      }
      throw new TaskExecutionError(
        `Agent execution failed: ${error instanceof Error ? error.message : String(error)}`,
        task,
        'agent_execution',
        error instanceof Error ? error : undefined,
      )
    }
  }

  /**
   * Runs the configured graders for a finished task and persists their results.
   *
   * Graders listed in the eval config take precedence; the task's own grader
   * list is used only when the config lists none. Grading never throws: a
   * grading failure is returned as a single `_error` entry with score 0, and a
   * failure to persist results is only logged.
   *
   * @param task Task that was executed.
   * @param agentResult Result produced by the agent.
   * @param infinityAppUrl URL of the per-task Infinity app, when one was started.
   * @returns Grader results keyed by grader name; `{}` when no graders apply.
   */
  private async runGraders(
    task: Task,
    agentResult: AgentResult,
    infinityAppUrl?: string,
  ): Promise<Record<string, GraderResult>> {
    const configGraders = this.config.graders ?? []
    const taskGraders = task.graders ?? []
    const graderNames = configGraders.length > 0 ? configGraders : taskGraders
    if (graderNames.length === 0) {
      return {}
    }

    try {
      const graderResults = await runGraders(graderNames, {
        task: {
          query_id: task.query_id,
          query: task.query,
          dataset: task.dataset,
        },
        messages: agentResult.messages,
        // Prefer the explicit screenshot count; fall back to the step count.
        screenshotCount:
          agentResult.metadata.screenshot_count ??
          agentResult.metadata.total_steps,
        finalAnswer: agentResult.finalAnswer,
        expectedAnswer: (task.metadata?.additional as Record<string, unknown>)
          ?.answer as string | undefined,
        outputDir: join(this.outputDir, task.query_id),
        mcpUrl: `${this.config.browseros.server_url}/mcp`,
        infinityAppUrl,
      })

      // Persisting is best effort; the in-memory results are still returned.
      try {
        const saver = new TrajectorySaver(this.outputDir, task.query_id)
        await saver.updateGraderResults(graderResults)
      } catch (saveError) {
        console.warn(
          ` Failed to persist grader results: ${saveError instanceof Error ? saveError.message : String(saveError)}`,
        )
      }

      return graderResults
    } catch (error) {
      console.warn(
        ` Grading failed: ${error instanceof Error ? error.message : String(error)}`,
      )
      return {
        _error: {
          score: 0,
          pass: false,
          reasoning: `Grading failed: ${error instanceof Error ? error.message : String(error)}`,
        },
      }
    }
  }
}
|
||||
|
||||
// ============================================================================
|
||||
// Factory
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Factory for `TaskExecutor`.
 *
 * @param config Eval configuration handed to the executor.
 * @param workerIndex Index of the worker that will own the executor.
 * @param outputDir Output directory handed to the executor.
 * @param onEvent Optional event sink, wrapped into the executor's deps object.
 * @returns A new `TaskExecutor`.
 */
export function createTaskExecutor(
  config: EvalConfig,
  workerIndex: number,
  outputDir: string,
  onEvent?: (taskId: string, event: Record<string, unknown>) => void,
): TaskExecutor {
  const deps: TaskExecutorDeps = { onEvent }
  return new TaskExecutor(config, workerIndex, outputDir, deps)
}
|
||||
export {
|
||||
createTaskRunPipeline as createTaskExecutor,
|
||||
TaskExecutionError,
|
||||
TaskRunPipeline as TaskExecutor,
|
||||
type TaskRunPipelineDeps as TaskExecutorDeps,
|
||||
} from '../runs/task-run-pipeline'
|
||||
|
||||
@@ -8,12 +8,18 @@ import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
|
||||
|
||||
/**
 * Options for an eval run. Only `configPath` is required.
 *
 * NOTE(review): the consumer of these options is not visible here; confirm
 * the meaning and precedence of each optional field against it.
 */
export interface RunEvalOptions {
  /** Path of the eval config file. */
  configPath: string
  /** Config object; presumably an already-loaded alternative to `configPath` — confirm. */
  config?: EvalConfig
  dataPath?: string
  query?: string
  startUrl?: string
  outputDir?: string
}
|
||||
|
||||
/** Outcome of an eval run: an output directory path and the batch summary. */
export interface RunEvalResult {
  /** Output directory associated with the run. */
  outputDir: string
  /** Aggregated summary of the run. */
  summary: BatchSummary
}
|
||||
|
||||
// ============================================================================
|
||||
// Task Loading
|
||||
// ============================================================================
|
||||
|
||||
46
packages/browseros-agent/apps/eval/src/runs/artifact-paths.ts
vendored
Normal file
46
packages/browseros-agent/apps/eval/src/runs/artifact-paths.ts
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
import { join } from 'node:path'
|
||||
|
||||
/**
 * Formats a date as `YYYY-MM-DD-HHmm` using its UTC components.
 *
 * Month, day, hour and minute are zero-padded to two digits; the year is
 * emitted as-is. Seconds are not included.
 */
function timestamp(date: Date): string {
  const pad2 = (part: number): string => String(part).padStart(2, '0')

  const day = [
    date.getUTCFullYear(),
    pad2(date.getUTCMonth() + 1), // getUTCMonth() is zero-based
    pad2(date.getUTCDate()),
  ].join('-')
  const clock = pad2(date.getUTCHours()) + pad2(date.getUTCMinutes())

  return `${day}-${clock}`
}
|
||||
|
||||
/**
 * Normalizes a string for use inside a run id: lower-cases it, collapses
 * every run of characters outside `a-z`, `0-9`, `.`, `_`, `-` into a single
 * dash, and strips leading and trailing dashes.
 *
 * The result may be empty when the input contains no allowed characters.
 */
function safeSegment(value: string): string {
  const lowered = value.toLowerCase()
  const dashed = lowered.replace(/[^a-z0-9._-]+/g, '-')
  return dashed.replace(/^-+|-+$/g, '')
}
|
||||
|
||||
/**
 * Creates a path-safe run id of the form `<suite>__<variant>__<timestamp>`.
 *
 * @param suiteId Suite/config identifier; normalized with `safeSegment`.
 * @param variantId Variant identifier; normalized with `safeSegment`.
 * @param date Time to stamp the id with; defaults to the current time.
 * @returns The three parts joined with a double underscore.
 */
export function createRunId(
  suiteId: string,
  variantId: string,
  date = new Date(),
): string {
  const parts = [safeSegment(suiteId), safeSegment(variantId), timestamp(date)]
  return parts.join('__')
}
|
||||
|
||||
/**
 * Computes the artifact paths for a run and, optionally, for one of its tasks.
 *
 * Run-level paths live under `<baseDir>/runs/<runId>`. Task-level paths live
 * under `<runDir>/tasks/<taskId>` and are all `undefined` when `taskId` is
 * omitted or empty.
 *
 * @param baseDir Root directory that contains the `runs` folder.
 * @param runId Run identifier used as the run directory name.
 * @param taskId Optional task identifier used as the task directory name.
 * @returns An object with the run-level and task-level paths.
 */
export function getRunPaths(baseDir: string, runId: string, taskId?: string) {
  const runDir = join(baseDir, 'runs', runId)
  const taskDir = taskId ? join(runDir, 'tasks', taskId) : undefined

  const inRun = (name: string): string => join(runDir, name)
  // Task-level entries only exist when a task directory was resolved.
  const inTask = (name: string): string | undefined =>
    taskDir ? join(taskDir, name) : undefined

  return {
    runDir,
    runManifest: inRun('run.json'),
    summary: inRun('summary.json'),
    viewerManifest: inRun('viewer-manifest.json'),
    uploadManifest: inRun('upload-manifest.json'),
    taskDir,
    attempt: inTask('attempt.json'),
    trace: inTask('trace.jsonl'),
    messages: inTask('messages.jsonl'),
    grades: inTask('grades.json'),
    graderArtifacts: inTask('grader-artifacts'),
    screenshots: inTask('screenshots'),
  }
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user