Compare commits

...

4 Commits

Author SHA1 Message Date
Nikhil Sonti
71cb158739 feat(eval): add parallel workers support to showcase generator
Add --workers/-w flag for parallel task execution. Each worker gets
its own isolated BrowserOS stack on offset ports, matching the eval
ParallelExecutor pattern. Also uses env-based defaults for model/
provider/baseUrl (SHOWCASE_MODEL, SHOWCASE_PROVIDER, SHOWCASE_BASE_URL)
and reads API key from OPENROUTER_API_KEY.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 17:16:04 -07:00
Nikhil Sonti
654a871841 fix(eval): improve crosshair visibility, fix page ID resolution, add prod tasks
- Crosshair: 2.5x larger (20px ring, 40px lines, 3px stroke), glow
  filter, semi-transparent fill circle, bigger label
- Page ID: add resolvePageId() helper that falls back to first
  available page when the agent's page reference is stale
- Add prod-tasks.jsonl with 20 real-world queries covering finance,
  legal, healthcare, tech, real estate, HR, and more

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 17:14:40 -07:00
Nikhil Sonti
ff167fb855 fix(eval): use openrouter defaults, fix page ID tracking, add showcase tasks
- Default provider/model/apiKey read from env vars (SHOWCASE_PROVIDER,
  SHOWCASE_MODEL, OPENROUTER_API_KEY) instead of hardcoded OpenAI
- Fix stale page ID bug: activePageId now updates when tool calls
  reference a different page
- Add showcase-tasks.jsonl with sample Amazon task
- Add showcase-output/ and .env to .gitignore

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 16:12:33 -07:00
Nikhil Sonti
3d4e493512 feat(eval): add showcase dataset generator with before/after screenshots
Standalone script that runs the BrowserOS agent on tasks and captures
rich execution traces for website showcase galleries:

- Before/after screenshots per tool call via AI SDK callbacks
- CSS crosshair overlays injected via browser.evaluate() for
  element-targeting tools (click, fill, hover, etc.)
- Accessibility snapshot capture at each step
- Per-task JSON manifests with execution metadata
- R2 upload with URL rewriting for cloud-hosted galleries

New files:
- apps/eval/scripts/showcase/ (generate, executor, crosshair,
  manifest, uploader, types)
- apps/eval/.env.example (R2 credentials template)

Also adds Browser.getElementCenter() public method for coordinate
resolution without triggering actions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 15:07:14 -07:00
11 changed files with 880 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
# Showcase generator — LLM config
OPENROUTER_API_KEY=
SHOWCASE_MODEL=openai/gpt-4o
SHOWCASE_PROVIDER=openrouter
SHOWCASE_BASE_URL=https://openrouter.ai/api/v1
# R2 upload (for --upload flag)
R2_ACCOUNT_ID=
R2_ACCESS_KEY_ID=
R2_SECRET_ACCESS_KEY=
R2_BUCKET=rl-env

View File

@@ -1,3 +1,5 @@
data/raw/
results/
extensions/
showcase-output/
.env

View File

@@ -0,0 +1,20 @@
{"query_id":"prod-financial-advisor-morningstar","dataset":"prod","query":"Look up the Morningstar rating and expense ratio for Vanguard Total Stock Market Index Fund (VTSAX) on morningstar.com","start_url":"https://www.morningstar.com","metadata":{"original_task_id":"prod-financial-advisor-morningstar","category":"finance"}}
{"query_id":"prod-lawyer-pacer-search","dataset":"prod","query":"Go to courtlistener.com and search for recent federal court opinions mentioning 'non-compete agreement' from 2025","start_url":"https://www.courtlistener.com","metadata":{"original_task_id":"prod-lawyer-pacer-search","category":"legal"}}
{"query_id":"prod-doctor-drug-interaction","dataset":"prod","query":"Check drug interactions between metformin and lisinopril on drugs.com","start_url":"https://www.drugs.com/drug_interactions.html","metadata":{"original_task_id":"prod-doctor-drug-interaction","category":"healthcare"}}
{"query_id":"prod-software-eng-github-trending","dataset":"prod","query":"Find the top trending Python repositories on GitHub this week and open the most starred one","start_url":"https://github.com/trending","metadata":{"original_task_id":"prod-software-eng-github-trending","category":"technology"}}
{"query_id":"prod-cfo-sec-filing","dataset":"prod","query":"Look up Apple's most recent 10-K filing on SEC EDGAR and find their total revenue for fiscal year 2024","start_url":"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&company=apple&CIK=&type=10-K&dateb=&owner=include&count=10&search_text=&action=getcompany","metadata":{"original_task_id":"prod-cfo-sec-filing","category":"finance"}}
{"query_id":"prod-realtor-zillow-comp","dataset":"prod","query":"Search for recently sold homes in Palo Alto, CA on Zillow and filter for 3+ bedrooms sold in the last 30 days","start_url":"https://www.zillow.com","metadata":{"original_task_id":"prod-realtor-zillow-comp","category":"real_estate"}}
{"query_id":"prod-hr-manager-linkedin-job","dataset":"prod","query":"Post a job listing search on LinkedIn for a Senior Product Manager role in San Francisco with salary range $180k-$220k","start_url":"https://www.linkedin.com/jobs/","metadata":{"original_task_id":"prod-hr-manager-linkedin-job","category":"hr"}}
{"query_id":"prod-analyst-fred-data","dataset":"prod","query":"Go to FRED and pull up the US Consumer Price Index chart, change the time range to the last 5 years","start_url":"https://fred.stlouisfed.org","metadata":{"original_task_id":"prod-analyst-fred-data","category":"finance"}}
{"query_id":"prod-accountant-irs-form","dataset":"prod","query":"Find and download the latest IRS Form W-9 from irs.gov","start_url":"https://www.irs.gov","metadata":{"original_task_id":"prod-accountant-irs-form","category":"finance"}}
{"query_id":"prod-sales-manager-crm-research","dataset":"prod","query":"Go to g2.com and compare the top 3 CRM software platforms by user rating and pricing","start_url":"https://www.g2.com/categories/crm","metadata":{"original_task_id":"prod-sales-manager-crm-research","category":"sales"}}
{"query_id":"prod-engineer-stackoverflow","dataset":"prod","query":"Search Stack Overflow for how to implement retry logic with exponential backoff in Python and find the highest voted answer","start_url":"https://stackoverflow.com","metadata":{"original_task_id":"prod-engineer-stackoverflow","category":"technology"}}
{"query_id":"prod-pm-producthunt","dataset":"prod","query":"Browse today's top launches on Product Hunt and upvote the highest ranked AI product","start_url":"https://www.producthunt.com","metadata":{"original_task_id":"prod-pm-producthunt","category":"technology"}}
{"query_id":"prod-pharmacist-fda-recall","dataset":"prod","query":"Check the FDA website for any recent drug recalls in the last month","start_url":"https://www.fda.gov/safety/recalls-market-withdrawals-safety-alerts","metadata":{"original_task_id":"prod-pharmacist-fda-recall","category":"healthcare"}}
{"query_id":"prod-investment-analyst-yahoo-finance","dataset":"prod","query":"Look up NVIDIA stock on Yahoo Finance, check the P/E ratio, and add it to a watchlist","start_url":"https://finance.yahoo.com","metadata":{"original_task_id":"prod-investment-analyst-yahoo-finance","category":"finance"}}
{"query_id":"prod-compliance-officer-regulations","dataset":"prod","query":"Search for the latest GDPR enforcement actions on the European Data Protection Board website","start_url":"https://www.edpb.europa.eu/news/news_en","metadata":{"original_task_id":"prod-compliance-officer-regulations","category":"compliance"}}
{"query_id":"prod-management-consultant-mckinsey","dataset":"prod","query":"Go to McKinsey's insights page and find their latest article about generative AI's impact on productivity","start_url":"https://www.mckinsey.com/featured-insights","metadata":{"original_task_id":"prod-management-consultant-mckinsey","category":"consulting"}}
{"query_id":"prod-operations-manager-shipping","dataset":"prod","query":"Track a FedEx package with tracking number 123456789012 on fedex.com","start_url":"https://www.fedex.com/en-us/tracking.html","metadata":{"original_task_id":"prod-operations-manager-shipping","category":"operations"}}
{"query_id":"prod-market-researcher-statista","dataset":"prod","query":"Search Statista for the global AI market size forecast and find the projected value for 2026","start_url":"https://www.statista.com","metadata":{"original_task_id":"prod-market-researcher-statista","category":"research"}}
{"query_id":"prod-nurse-uptodate","dataset":"prod","query":"Search WebMD for the recommended dosing guidelines for adult acetaminophen and check the maximum daily dose","start_url":"https://www.webmd.com","metadata":{"original_task_id":"prod-nurse-uptodate","category":"healthcare"}}
{"query_id":"prod-executive-flights","dataset":"prod","query":"Search Google Flights for a business class round trip from SFO to JFK departing next Monday returning Friday","start_url":"https://www.google.com/travel/flights","metadata":{"original_task_id":"prod-executive-flights","category":"travel"}}

View File

@@ -0,0 +1 @@
{"query_id":"showcase-amazon-order","dataset":"showcase","query":"Open amazon.com and order Sensodyne toothpaste","start_url":"https://www.amazon.com","metadata":{"original_task_id":"showcase-amazon-order"}}

View File

@@ -0,0 +1,52 @@
import type { Browser } from '@browseros/server/browser'
const CROSSHAIR_ID = '__browseros_showcase_crosshair__'

/** Escapes the characters that are significant inside HTML/SVG markup. */
function escapeMarkup(text: string): string {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}

/**
 * Injects a full-viewport, click-through overlay into the page that draws a
 * red crosshair at `coords` with a label derived from `toolName`.
 *
 * Any overlay left by a previous call is removed first, so at most one
 * crosshair exists per page.
 *
 * @param browser - Browser used to evaluate the injection script.
 * @param pageId - Page to draw on.
 * @param coords - Viewport coordinates of the target; rounded to whole pixels.
 * @param toolName - Tool being visualised; underscores are shown as spaces.
 */
export async function injectCrosshair(
  browser: Browser,
  pageId: number,
  coords: { x: number; y: number },
  toolName: string,
): Promise<void> {
  const x = Math.round(coords.x)
  const y = Math.round(coords.y)
  const label = toolName.replace(/_/g, ' ')
  const labelWidth = Math.round(label.length * 9 + 24)
  const labelX = x + 32
  const labelY = y - 32
  // Namespaced so `url(#...)` cannot resolve to an unrelated page element
  // that happens to use id="glow".
  const glowId = `${CROSSHAIR_ID}glow`
  const glowStroke = `stroke="#FF3B30" stroke-width="3" filter="url(#${glowId})"`
  const svg = [
    '<svg style="position:absolute;top:0;left:0;width:100%;height:100%" xmlns="http://www.w3.org/2000/svg">',
    `<defs><filter id="${glowId}"><feGaussianBlur stdDeviation="3" result="blur"/><feMerge><feMergeNode in="blur"/><feMergeNode in="SourceGraphic"/></feMerge></filter></defs>`,
    `<circle cx="${x}" cy="${y}" r="28" fill="rgba(255,59,48,0.12)" stroke="none"/>`,
    `<line x1="${x - 40}" y1="${y}" x2="${x - 12}" y2="${y}" ${glowStroke}/>`,
    `<line x1="${x + 12}" y1="${y}" x2="${x + 40}" y2="${y}" ${glowStroke}/>`,
    `<line x1="${x}" y1="${y - 40}" x2="${x}" y2="${y - 12}" ${glowStroke}/>`,
    `<line x1="${x}" y1="${y + 12}" x2="${x}" y2="${y + 40}" ${glowStroke}/>`,
    `<circle cx="${x}" cy="${y}" r="20" fill="none" ${glowStroke}/>`,
    `<circle cx="${x}" cy="${y}" r="4" fill="#FF3B30"/>`,
    `<rect x="${labelX}" y="${labelY}" rx="6" ry="6" width="${labelWidth}" height="28" fill="rgba(0,0,0,0.85)"/>`,
    // The label is the only caller-supplied text, so it is markup-escaped.
    `<text x="${labelX + 12}" y="${labelY + 19}" font-family="system-ui,-apple-system,sans-serif" font-size="14" fill="white" font-weight="600">${escapeMarkup(label)}</text>`,
    '</svg>',
  ].join('')
  // JSON.stringify yields valid JS string literals, so neither the id nor the
  // markup can terminate the literal they are embedded in.
  const overlayId = JSON.stringify(CROSSHAIR_ID)
  await browser.evaluate(
    pageId,
    `(() => {
  const existing = document.getElementById(${overlayId});
  if (existing) existing.remove();
  const el = document.createElement('div');
  el.id = ${overlayId};
  el.style.cssText = 'position:fixed;top:0;left:0;width:100vw;height:100vh;pointer-events:none;z-index:2147483647';
  el.innerHTML = ${JSON.stringify(svg)};
  document.body.appendChild(el);
})()`,
  )
}
/**
 * Removes the crosshair overlay from the page, if one is present.
 *
 * @param browser - Browser used to evaluate the removal script.
 * @param pageId - Page to clean up.
 */
export async function removeCrosshair(
  browser: Browser,
  pageId: number,
): Promise<void> {
  // Optional chaining makes this a no-op when no overlay exists.
  const removalScript = `document.getElementById('${CROSSHAIR_ID}')?.remove()`
  await browser.evaluate(pageId, removalScript)
}

View File

@@ -0,0 +1,288 @@
import { randomUUID } from 'node:crypto'
import { mkdir, writeFile } from 'node:fs/promises'
import { join } from 'node:path'
import { AiSdkAgent } from '@browseros/server/agent/tool-loop'
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import { Browser } from '@browseros/server/browser'
import { CdpBackend } from '@browseros/server/browser/backends/cdp'
import { registry } from '@browseros/server/tools/registry'
import type { Task } from '../../src/types'
import { injectCrosshair, removeCrosshair } from './crosshair'
import { buildTaskManifest, saveTaskManifest } from './manifest'
import type { ShowcaseStep, ShowcaseTaskManifest } from './types'
// Tools that address their target through an `element` id in the tool input;
// the executor resolves that id to viewport coordinates for the crosshair.
const ELEMENT_TOOL_NAMES: readonly string[] = [
  'click',
  'fill',
  'hover',
  'clear',
  'select_option',
  'drag',
  'focus',
  'check',
  'uncheck',
]
const ELEMENT_TOOLS = new Set<string>(ELEMENT_TOOL_NAMES)

// Tools that carry explicit `x`/`y` coordinates in their input.
const COORDINATE_TOOL_NAMES: readonly string[] = [
  'click_at',
  'hover_at',
  'type_at',
  'drag_at',
]
const COORDINATE_TOOLS = new Set<string>(COORDINATE_TOOL_NAMES)
// No-op stand-in passed as the second argument to `new Browser(...)`:
// start/stop/send resolve immediately and isConnected always reports false.
// Cast to `any` because the real backend type is not exported (see the
// suppression comment below, which must stay directly above the cast).
const CONTROLLER_STUB = {
start: async () => {},
stop: async () => {},
isConnected: () => false,
send: async () => ({}),
// biome-ignore lint/suspicious/noExplicitAny: ControllerBackend type not exported
} as any
/**
 * Maps a possibly stale page id onto a page that is currently open.
 *
 * @param browser - Browser whose open pages are consulted.
 * @param requestedId - Page id the caller would like to use.
 * @returns `requestedId` when that page is open or when no pages are open at
 *   all; otherwise the id of the first listed page.
 */
async function resolvePageId(
  browser: Browser,
  requestedId: number,
): Promise<number> {
  const openPages = await browser.listPages()
  const isStillOpen =
    openPages.findIndex((page) => page.pageId === requestedId) !== -1
  // Nothing to fall back to when the list is empty, so keep the request as-is.
  if (isStillOpen || openPages.length === 0) return requestedId
  return openPages[0].pageId
}
/** Outcome of a single `executeShowcaseTask` run. */
export interface ExecuteTaskResult {
// Manifest built for the task (the executor also saves it to disk).
manifest: ShowcaseTaskManifest
// 'timeout' when the abort timer fired, 'failed' on any other agent error.
status: 'completed' | 'timeout' | 'failed'
}
/** Captures the current viewport of `pageId` as a PNG and writes it to `filePath`. */
async function saveViewportScreenshot(
  browser: Browser,
  pageId: number,
  filePath: string,
): Promise<void> {
  const shot = await browser.screenshot(pageId, {
    format: 'png',
    fullPage: false,
  })
  await writeFile(filePath, Buffer.from(shot.data, 'base64'))
}

/**
 * Runs the agent on one task against the browser reachable on `cdpPort` and
 * records a step-by-step trace (before/annotated/after screenshots,
 * accessibility snapshot, tool input/output) under `outputDir/<executionId>/`.
 *
 * @param task - Task to run; `start_url` is opened first unless it is unset or `about:blank`.
 * @param cdpPort - CDP port of the browser to drive.
 * @param outputDir - Root directory that receives the per-execution folder.
 * @param agentConfig - Model/provider settings forwarded to the agent.
 * @param timeoutMs - Time budget for the agent loop; exceeding it yields status `'timeout'`.
 * @returns The saved manifest together with the final status.
 * @throws Errors raised while connecting, navigating to the start URL,
 *   creating the agent or saving the manifest. Errors thrown by the agent loop
 *   itself are reported through `status` instead.
 */
export async function executeShowcaseTask(
  task: Task,
  cdpPort: number,
  outputDir: string,
  agentConfig: {
    model: string
    provider: string
    apiKey?: string
    baseUrl?: string
  },
  timeoutMs: number,
): Promise<ExecuteTaskResult> {
  const executionId = randomUUID()
  const taskDir = join(outputDir, executionId)
  const screenshotDir = join(taskDir, 'screenshots')
  await mkdir(screenshotDir, { recursive: true })

  const cdp = new CdpBackend({ port: cdpPort })
  await cdp.connect()

  let agent: AiSdkAgent | null = null
  // Everything after connect() sits inside try/finally so the CDP connection
  // is released even when page listing or the initial navigation throws.
  try {
    const browser = new Browser(cdp, CONTROLLER_STUB)
    const pages = await browser.listPages()
    let activePage = pages[0]
    let activePageId = activePage?.pageId ?? 1

    // Navigate to start URL
    if (task.start_url && task.start_url !== 'about:blank') {
      await browser.goto(activePageId, task.start_url)
      if (activePage) {
        // Re-read the page so the agent's context carries the post-navigation
        // URL/title instead of whatever the tab showed beforehand.
        try {
          const refreshed = (await browser.listPages()).find(
            (page) => page.pageId === activePageId,
          )
          if (refreshed) activePage = refreshed
        } catch {
          // keep the pre-navigation metadata; it is only descriptive
        }
      }
    }

    const conversationId = randomUUID()
    const resolvedConfig: ResolvedAgentConfig = {
      conversationId,
      // biome-ignore lint/suspicious/noExplicitAny: LLMProvider type validated at runtime
      provider: agentConfig.provider as any,
      model: agentConfig.model,
      apiKey: agentConfig.apiKey,
      baseUrl: agentConfig.baseUrl,
      workingDir: `/tmp/browseros-showcase-${conversationId}`,
      evalMode: true,
      supportsImages: true,
    }
    const browserContext = activePage
      ? {
          activeTab: {
            id: activePage.tabId,
            pageId: activePage.pageId,
            url: activePage.url,
            title: activePage.title,
          },
        }
      : undefined

    const steps: ShowcaseStep[] = []
    let stepNum = 0
    let finalText: string | null = null
    let status: 'completed' | 'timeout' | 'failed' = 'completed'
    const startTime = Date.now()

    agent = await AiSdkAgent.create({
      resolvedConfig,
      browser,
      registry,
      browserContext,
    })

    // Step being assembled between a tool call's start and finish callbacks.
    // NOTE(review): a single slot assumes tool calls do not overlap — confirm.
    let pendingStep: Partial<ShowcaseStep> | null = null
    const abortController = new AbortController()
    const timeoutHandle = setTimeout(() => abortController.abort(), timeoutMs)
    try {
      const result = await agent.toolLoopAgent.generate({
        prompt: task.query,
        abortSignal: abortController.signal,
        experimental_onToolCallStart: async ({ toolCall }) => {
          try {
            const input = (toolCall.input ?? {}) as Record<string, unknown>
            // Follow the agent when a tool call targets a different page.
            if (typeof input.page === 'number') {
              activePageId = input.page
            }
            const pageId = await resolvePageId(browser, activePageId)
            activePageId = pageId

            const beforePath = join(screenshotDir, `${stepNum}_before.png`)
            await saveViewportScreenshot(browser, pageId, beforePath)

            let axTree = ''
            try {
              axTree = await browser.snapshot(pageId)
            } catch {
              // snapshot can fail on some pages
            }

            let coords: { x: number; y: number } | undefined
            if (
              typeof input.element === 'number' &&
              ELEMENT_TOOLS.has(toolCall.toolName)
            ) {
              try {
                coords = await browser.getElementCenter(pageId, input.element)
              } catch {
                // element may have been removed
              }
            } else if (
              COORDINATE_TOOLS.has(toolCall.toolName) &&
              typeof input.x === 'number' &&
              typeof input.y === 'number'
            ) {
              coords = { x: input.x, y: input.y }
            }

            const step: Partial<ShowcaseStep> = {
              stepIndex: stepNum,
              toolName: toolCall.toolName,
              toolInput: input,
              beforeScreenshot: beforePath,
              accessibilitySnapshot: axTree,
              elementCoordinates: coords,
              timestamp: new Date().toISOString(),
            }
            pendingStep = step

            if (coords) {
              try {
                await injectCrosshair(
                  browser,
                  pageId,
                  coords,
                  toolCall.toolName,
                )
                const annotatedPath = join(
                  screenshotDir,
                  `${stepNum}_annotated.png`,
                )
                await saveViewportScreenshot(browser, pageId, annotatedPath)
                step.annotatedScreenshot = annotatedPath
              } catch {
                // annotation is best-effort
              } finally {
                // Always take the overlay down, otherwise a failed capture
                // would leave it on the page for the tool call and the
                // "after" screenshot.
                await removeCrosshair(browser, pageId).catch(() => {})
              }
            }
          } catch (err) {
            console.warn(
              ` Step ${stepNum} before-capture failed: ${err instanceof Error ? err.message : String(err)}`,
            )
          }
        },
        experimental_onToolCallFinish: async ({ toolResult }) => {
          try {
            const pageId = await resolvePageId(browser, activePageId)
            activePageId = pageId
            const afterPath = join(screenshotDir, `${stepNum}_after.png`)
            await saveViewportScreenshot(browser, pageId, afterPath)
            if (pendingStep) {
              pendingStep.afterScreenshot = afterPath
              pendingStep.toolOutput = toolResult
              steps.push(pendingStep as ShowcaseStep)
              // Only completed steps advance the index, keeping file names
              // and `stepIndex` contiguous.
              stepNum++
            }
          } catch (err) {
            console.warn(
              ` Step ${stepNum} after-capture failed: ${err instanceof Error ? err.message : String(err)}`,
            )
          }
          pendingStep = null
        },
        onStepFinish: async ({ text }) => {
          // Attach the assistant's text to the most recently recorded step.
          if (text && steps.length > 0) {
            const lastStep = steps[steps.length - 1]
            lastStep.assistantText = text
          }
        },
      })
      finalText = result.text || null
    } catch (err) {
      if (abortController.signal.aborted) {
        status = 'timeout'
        console.log(` ${task.query_id}: timed out after ${timeoutMs / 1000}s`)
      } else {
        status = 'failed'
        console.error(
          ` ${task.query_id}: failed — ${err instanceof Error ? err.message : String(err)}`,
        )
      }
    } finally {
      clearTimeout(timeoutHandle)
    }

    const totalDurationMs = Date.now() - startTime
    const manifest = buildTaskManifest({
      executionId,
      taskId: task.query_id,
      query: task.query,
      startUrl: task.start_url ?? 'about:blank',
      dataset: task.dataset,
      steps,
      finalAnswer: finalText,
      model: agentConfig.model,
      provider: agentConfig.provider,
      totalDurationMs,
    })
    await saveTaskManifest(outputDir, executionId, manifest)
    return { manifest, status }
  } finally {
    if (agent) await agent.dispose().catch(() => {})
    await cdp.disconnect().catch(() => {})
  }
}

View File

@@ -0,0 +1,235 @@
#!/usr/bin/env bun
import { mkdir } from 'node:fs/promises'
import { parseArgs } from 'node:util'
import { BrowserOSAppManager } from '../../src/runner/browseros-app-manager'
import { loadTasks } from '../../src/runner/task-loader'
import type { Task } from '../../src/types'
import { executeShowcaseTask } from './executor'
import { saveRunIndex } from './manifest'
import type { ShowcaseRunIndex } from './types'
import { uploadShowcase } from './uploader'
// Base ports handed to BrowserOSAppManager; the CDP port actually used by a
// worker is `BASE_PORTS.cdp + workerIndex`.
const BASE_PORTS = { cdp: 9010, server: 9110, extension: 9310 }
// All numeric flags are declared as strings here and converted when the
// `config` object is built.
const { values } = parseArgs({
args: Bun.argv.slice(2),
options: {
tasks: { type: 'string', short: 't' },
output: { type: 'string', short: 'o', default: './showcase-output' },
model: { type: 'string', short: 'm' },
provider: { type: 'string', short: 'p' },
'base-url': { type: 'string' },
workers: { type: 'string', short: 'w', default: '1' },
'cdp-port': { type: 'string' },
timeout: { type: 'string', default: '300000' },
upload: { type: 'boolean', default: false },
help: { type: 'boolean', short: 'h' },
},
})
// Print usage and stop: exit code 0 when --help was requested, 1 when the
// required --tasks flag is missing.
if (values.help || !values.tasks) {
console.log(`
Showcase Dataset Generator
Runs the BrowserOS agent on tasks and captures before/after screenshots
with crosshair annotations for element-targeting tool calls.
Usage:
bun scripts/showcase/generate.ts --tasks <path> [options]
Options:
-t, --tasks <path> JSONL task file (required)
-o, --output <dir> Output directory (default: ./showcase-output)
-m, --model <model> LLM model (env: SHOWCASE_MODEL, default: openai/gpt-4o)
-p, --provider <name> LLM provider (env: SHOWCASE_PROVIDER, default: openrouter)
--base-url <url> LLM base URL (env: SHOWCASE_BASE_URL)
-w, --workers <n> Parallel workers (default: 1)
--cdp-port <port> Connect to existing Chrome (single-worker only)
--timeout <ms> Per-task timeout in ms (default: 300000)
--upload Upload results to R2 after generation
-h, --help Show this help
`)
process.exit(values.help ? 0 : 1)
}
/**
 * Returns the first candidate that is defined and not blank, so an empty
 * entry such as `SHOWCASE_MODEL=` in `.env` does not shadow later fallbacks.
 */
function firstNonEmpty(
  ...candidates: Array<string | undefined>
): string | undefined {
  return candidates.find(
    (candidate) => candidate !== undefined && candidate.trim() !== '',
  )
}

/**
 * Converts a numeric CLI value, exiting with a usage error when it is not a
 * finite number or fails `isValid`. Without this, `Number('abc')` (NaN) would
 * silently produce zero workers, an instantly-firing timeout or an ignored port.
 */
function parseNumericFlag(
  flag: string,
  raw: string,
  isValid: (value: number) => boolean,
  expected: string,
): number {
  const parsed = Number(raw)
  if (!Number.isFinite(parsed) || !isValid(parsed)) {
    console.error(`Invalid ${flag} value "${raw}": expected ${expected}`)
    process.exit(1)
  }
  return parsed
}

// Effective run configuration: CLI flags win over environment variables,
// which win over built-in defaults.
const config = {
  tasks: values.tasks as string,
  output: (values.output ?? './showcase-output') as string,
  model:
    firstNonEmpty(values.model, process.env.SHOWCASE_MODEL) ?? 'openai/gpt-4o',
  provider:
    firstNonEmpty(values.provider, process.env.SHOWCASE_PROVIDER) ??
    'openrouter',
  baseUrl: firstNonEmpty(values['base-url'], process.env.SHOWCASE_BASE_URL),
  // Values below 1 are still clamped to a single worker; fractions are floored.
  workers: Math.max(
    1,
    Math.floor(
      parseNumericFlag('--workers', values.workers ?? '1', () => true, 'a number'),
    ),
  ),
  cdpPort: values['cdp-port']
    ? parseNumericFlag(
        '--cdp-port',
        values['cdp-port'],
        (port) => Number.isInteger(port) && port >= 0 && port <= 65535,
        'an integer port between 0 and 65535',
      )
    : undefined,
  timeout: parseNumericFlag(
    '--timeout',
    values.timeout ?? '300000',
    (ms) => ms > 0,
    'a positive number of milliseconds',
  ),
  upload: values.upload ?? false,
}
// An externally managed Chrome exposes a single CDP endpoint, so it cannot be
// shared between parallel workers.
if (config.cdpPort && config.workers > 1) {
  console.error('--cdp-port only works with a single worker (--workers 1)')
  process.exit(1)
}
// `||` rather than `??`: a blank `OPENROUTER_API_KEY=` line (as shipped in
// .env.example) is an empty string, not undefined, and must not block the
// OPENAI_API_KEY fallback.
const apiKey = process.env.OPENROUTER_API_KEY || process.env.OPENAI_API_KEY
if (!apiKey) {
  console.error(
    'Missing API key: set OPENROUTER_API_KEY or OPENAI_API_KEY environment variable',
  )
  process.exit(1)
}
// Load the task list from the JSONL file given via --tasks.
const { tasks } = await loadTasks({ type: 'file', path: config.tasks })
console.log(`Loaded ${tasks.length} task(s), ${config.workers} worker(s)`)
await mkdir(config.output, { recursive: true })
// Run id = ISO date (YYYY-MM-DD) plus the first 8 characters of a random UUID.
const runId = `${new Date().toISOString().slice(0, 10)}-${crypto.randomUUID().slice(0, 8)}`
// Summary of the whole run; workers append one entry per executed task.
const runIndex: ShowcaseRunIndex = {
runId,
createdAt: new Date().toISOString(),
agentConfig: { model: config.model, provider: config.provider },
tasks: [],
}
console.log(`\nRun ID: ${runId}`)
console.log(`Output: ${config.output}\n`)
// --- Task Queue ---
/**
 * Hands out tasks one at a time, in order, to whichever worker asks next.
 * Once stopped, it yields no further tasks.
 */
class TaskQueue {
  private readonly pending: Task[]
  private cursor = 0
  private halted = false

  constructor(tasks: Task[]) {
    this.pending = tasks
  }

  /** Returns the next task, or `null` when the queue is exhausted or stopped. */
  next(): Task | null {
    if (this.halted) return null
    if (this.cursor >= this.pending.length) return null
    const task = this.pending[this.cursor]
    this.cursor += 1
    return task
  }

  /** Prevents any further tasks from being handed out. */
  stop(): void {
    this.halted = true
  }
}
// Shared state for all workers.
const queue = new TaskQueue(tasks)
// Incremented when a task is picked up, so it is the "n" in the "[n/total]"
// progress prefix rather than a count of finished tasks.
let completedCount = 0
// Every BrowserOS instance started by this process, kept for shutdown.
const appManagers: BrowserOSAppManager[] = []
// --- Signal handling ---
// On SIGINT/SIGTERM: stop handing out tasks, kill all managed BrowserOS
// instances, then exit with code 0. The run index is not written on this path.
const onSignal = async () => {
console.log('\nShutting down workers...')
queue.stop()
await Promise.allSettled(appManagers.map((m) => m.killApp()))
process.exit(0)
}
process.on('SIGINT', onSignal)
process.on('SIGTERM', onSignal)
// --- Worker ---
/**
 * Worker loop: pulls tasks from the shared queue until it is empty or
 * stopped, runs each one, and appends the outcome to `runIndex`.
 *
 * Unless `--cdp-port` points at an existing Chrome, the worker owns a
 * BrowserOS instance on `BASE_PORTS.cdp + workerIndex`, which is (re)started
 * before every task for clean state and killed when the worker finishes.
 *
 * @param workerIndex - Zero-based worker number; also the port offset.
 */
async function runWorker(workerIndex: number): Promise<void> {
  // A falsy --cdp-port (unset or 0) means "manage our own BrowserOS".
  const externalCdpPort = config.cdpPort || undefined
  const cdpPort = externalCdpPort ?? BASE_PORTS.cdp + workerIndex
  let appManager: BrowserOSAppManager | null = null
  if (externalCdpPort === undefined) {
    appManager = new BrowserOSAppManager(workerIndex, BASE_PORTS)
    appManagers.push(appManager)
  }
  const agentConfig = {
    model: config.model,
    provider: config.provider,
    apiKey,
    baseUrl: config.baseUrl,
  }
  const tag = config.workers > 1 ? `[W${workerIndex}] ` : ''
  // BrowserOS is launched lazily by the first per-task restart. This avoids
  // starting it and immediately restarting it for the first task, and avoids
  // spawning a browser for a worker that never receives a task.
  let launched = false
  try {
    while (true) {
      const task = queue.next()
      if (!task) break
      completedCount++
      console.log(
        `${tag}[${completedCount}/${tasks.length}] ${task.query_id}: ${task.query}`,
      )
      try {
        // Restart browser between tasks for clean state. Done inside the
        // try so a launch failure fails this task instead of the whole worker.
        if (appManager) {
          if (!launched) {
            console.log(` [W${workerIndex}] Starting BrowserOS...`)
            launched = true
          }
          await appManager.restart()
        }
        const { manifest, status } = await executeShowcaseTask(
          task,
          cdpPort,
          config.output,
          agentConfig,
          config.timeout,
        )
        runIndex.tasks.push({
          executionId: manifest.executionId,
          taskId: task.query_id,
          query: task.query,
          stepCount: manifest.steps.length,
          status,
          manifestPath: `${manifest.executionId}/manifest.json`,
        })
        const duration = (manifest.totalDurationMs / 1000).toFixed(1)
        console.log(
          `${tag} ${status.toUpperCase()} — ${manifest.steps.length} steps, ${duration}s\n`,
        )
      } catch (err) {
        console.error(
          `${tag} FAILED — ${err instanceof Error ? err.message : String(err)}\n`,
        )
        // No manifest exists for this task, hence the placeholder id/path.
        runIndex.tasks.push({
          executionId: 'unknown',
          taskId: task.query_id,
          query: task.query,
          stepCount: 0,
          status: 'failed',
          manifestPath: '',
        })
      }
    }
  } finally {
    if (appManager) await appManager.killApp()
  }
}
// --- Run ---
// Start all workers, wait for every one of them to finish, then persist the
// run index, print a summary and optionally upload.
try {
  // allSettled rather than all: one crashing worker must not abandon the
  // others mid-task or prevent the results gathered so far from being saved.
  const settled = await Promise.allSettled(
    Array.from({ length: config.workers }, (_, i) => runWorker(i)),
  )
  const crashed = settled.filter(
    (outcome): outcome is PromiseRejectedResult => outcome.status === 'rejected',
  )
  for (const { reason } of crashed) {
    console.error(
      `Worker crashed: ${reason instanceof Error ? reason.message : String(reason)}`,
    )
  }

  await saveRunIndex(config.output, runIndex)
  console.log(`\nResults saved to: ${config.output}`)

  const countWithStatus = (
    status: ShowcaseRunIndex['tasks'][number]['status'],
  ): number => runIndex.tasks.filter((t) => t.status === status).length
  console.log(
    `Tasks: ${countWithStatus('completed')} completed, ` +
      `${countWithStatus('failed')} failed, ` +
      `${countWithStatus('timeout')} timed out`,
  )

  if (config.upload) {
    console.log('\nUploading to R2...')
    const baseUrl = await uploadShowcase(config.output, runId)
    console.log(`Uploaded to: ${baseUrl}`)
  }

  // Keep signalling failure to the caller, as the uncaught rejection did.
  if (crashed.length > 0) process.exitCode = 1
} finally {
  process.off('SIGINT', onSignal)
  process.off('SIGTERM', onSignal)
}

View File

@@ -0,0 +1,52 @@
import { writeFile } from 'node:fs/promises'
import { join } from 'node:path'
import type {
ShowcaseRunIndex,
ShowcaseStep,
ShowcaseTaskManifest,
} from './types'
/**
 * Assembles the manifest object for one task execution.
 *
 * `model` and `provider` are grouped under `agentConfig`, and `createdAt` is
 * stamped with the current time in ISO-8601 form. Everything else is copied
 * through unchanged.
 *
 * @param opts - Execution details gathered by the executor.
 * @returns The manifest, ready to be serialised.
 */
export function buildTaskManifest(opts: {
  executionId: string
  taskId: string
  query: string
  startUrl: string
  dataset: string
  steps: ShowcaseStep[]
  finalAnswer: string | null
  model: string
  provider: string
  totalDurationMs: number
}): ShowcaseTaskManifest {
  const {
    executionId,
    taskId,
    query,
    startUrl,
    dataset,
    steps,
    finalAnswer,
    model,
    provider,
    totalDurationMs,
  } = opts
  const agentConfig = { model, provider }
  const createdAt = new Date().toISOString()
  // Property order is kept stable because it determines the JSON key order.
  return {
    executionId,
    taskId,
    query,
    startUrl,
    dataset,
    steps,
    finalAnswer,
    agentConfig,
    totalDurationMs,
    createdAt,
  }
}
/**
 * Writes a task manifest as pretty-printed JSON to
 * `<outputDir>/<executionId>/manifest.json`. The directory is not created here.
 *
 * @returns The path that was written.
 */
export async function saveTaskManifest(
  outputDir: string,
  executionId: string,
  manifest: ShowcaseTaskManifest,
): Promise<string> {
  const target = join(outputDir, executionId, 'manifest.json')
  const serialized = JSON.stringify(manifest, null, 2)
  await writeFile(target, serialized)
  return target
}
/**
 * Writes the run index as pretty-printed JSON to `<outputDir>/index.json`.
 *
 * @returns The path that was written.
 */
export async function saveRunIndex(
  outputDir: string,
  index: ShowcaseRunIndex,
): Promise<string> {
  const target = join(outputDir, 'index.json')
  const serialized = JSON.stringify(index, null, 2)
  await writeFile(target, serialized)
  return target
}

View File

@@ -0,0 +1,53 @@
/**
 * Settings for a showcase generation run.
 * NOTE(review): not referenced by the generator script visible alongside this
 * file, which builds its own config object — confirm whether it is still needed.
 */
export interface ShowcaseConfig {
// Path to the task file.
tasks: string
// Output directory.
output: string
// Whether to upload results after generation.
upload: boolean
model: string
provider: string
// presumably the name of the environment variable holding the API key — TODO confirm
apiKeyEnv: string
// CDP port of an existing browser, when one is supplied.
cdpPort?: number
// presumably the per-task timeout in milliseconds — TODO confirm
timeout: number
}
/** One recorded tool call within a task execution. */
export interface ShowcaseStep {
// Zero-based position of the step; also the prefix of its screenshot files.
stepIndex: number
toolName: string
toolInput: Record<string, unknown>
toolOutput: unknown
// Viewport point the tool targeted, when one could be determined.
elementCoordinates?: { x: number; y: number }
// Screenshot paths; local file paths until the uploader rewrites them to R2 keys.
beforeScreenshot: string
afterScreenshot: string
// "Before" view with the crosshair overlay; present only when coordinates were resolved.
annotatedScreenshot?: string
// Accessibility snapshot taken before the tool ran; empty string if capture failed.
accessibilitySnapshot: string
// Assistant text reported when the surrounding agent step finished.
assistantText?: string
// ISO-8601 time at which the tool call started.
timestamp: string
}
/** Per-task manifest written to `<executionId>/manifest.json`. */
export interface ShowcaseTaskManifest {
// Random UUID identifying this execution; also its output folder name.
executionId: string
// The task's `query_id`.
taskId: string
query: string
// Task start URL, or 'about:blank' when the task has none.
startUrl: string
dataset: string
steps: ShowcaseStep[]
// Final text returned by the agent; null when empty or when the run did not finish.
finalAnswer: string | null
agentConfig: { model: string; provider: string }
totalDurationMs: number
// ISO-8601 time at which the manifest was built.
createdAt: string
// ISO-8601 time stamped by the uploader; absent until an upload has happened.
uploadedAt?: string
}
/** Run-level summary written to `index.json` in the output directory. */
export interface ShowcaseRunIndex {
runId: string
// ISO-8601 time at which the run started.
createdAt: string
// ISO-8601 time stamped by the uploader; absent until an upload has happened.
uploadedAt?: string
agentConfig: { model: string; provider: string }
// One entry per executed task, in completion order.
tasks: Array<{
// 'unknown' when the task failed before a manifest could be produced.
executionId: string
taskId: string
query: string
stepCount: number
status: 'completed' | 'timeout' | 'failed'
// Path relative to the output directory; empty when no manifest exists.
manifestPath: string
}>
}

View File

@@ -0,0 +1,155 @@
import { readdir, readFile, writeFile } from 'node:fs/promises'
import { extname, join, relative } from 'node:path'
import { PutObjectCommand, S3Client } from '@aws-sdk/client-s3'
import type { ShowcaseRunIndex, ShowcaseTaskManifest } from './types'
/** R2 connection settings, read from the R2_* environment variables. */
interface R2Config {
// Account id; forms the host of the S3-compatible endpoint.
accountId: string
accessKeyId: string
secretAccessKey: string
// Target bucket name.
bucket: string
}
/**
 * Picks the Content-Type header for an uploaded file from its extension
 * (case-insensitive); unrecognised extensions get a generic binary type.
 */
function contentTypeFor(filePath: string): string {
  switch (extname(filePath).toLowerCase()) {
    case '.png':
      return 'image/png'
    case '.json':
      return 'application/json'
    case '.jsonl':
      return 'application/jsonl'
    default:
      return 'application/octet-stream'
  }
}
/**
 * Reads the R2 connection settings from the environment.
 *
 * @returns The settings; `bucket` falls back to `'rl-env'` when `R2_BUCKET`
 *   is unset or blank.
 * @throws Error when any of `R2_ACCOUNT_ID`, `R2_ACCESS_KEY_ID` or
 *   `R2_SECRET_ACCESS_KEY` is missing or empty.
 */
function loadR2Config(): R2Config {
  const accountId = process.env.R2_ACCOUNT_ID
  const accessKeyId = process.env.R2_ACCESS_KEY_ID
  const secretAccessKey = process.env.R2_SECRET_ACCESS_KEY
  // `||` rather than `??`: a blank `R2_BUCKET=` line in .env is an empty
  // string, which would otherwise be used as the bucket name.
  const bucket = process.env.R2_BUCKET?.trim() || 'rl-env'
  if (!accountId || !accessKeyId || !secretAccessKey) {
    throw new Error(
      'Missing R2 credentials. Set R2_ACCOUNT_ID, R2_ACCESS_KEY_ID, R2_SECRET_ACCESS_KEY',
    )
  }
  return { accountId, accessKeyId, secretAccessKey, bucket }
}
/**
 * Builds the object key for a local file: `prefix`, then the file's path
 * relative to `outputDir`, with backslashes converted to forward slashes.
 */
function toR2Key(prefix: string, outputDir: string, filePath: string): string {
  const relativePath = relative(outputDir, filePath)
  const posixPath = relativePath.split('\\').join('/')
  return `${prefix}/${posixPath}`
}
/**
 * Recursively lists every non-directory entry below `dir`.
 *
 * @returns Paths prefixed with `dir`, in directory-listing order with each
 *   subdirectory's contents inlined at its position.
 */
async function walkDir(dir: string): Promise<string[]> {
  const collected: string[] = []
  for (const dirent of await readdir(dir, { withFileTypes: true })) {
    const childPath = join(dir, dirent.name)
    if (!dirent.isDirectory()) {
      collected.push(childPath)
      continue
    }
    const nested = await walkDir(childPath)
    collected.push(...nested)
  }
  return collected
}
/**
 * Uploads everything under `outputDir` to R2 beneath `showcase/<runId>/`,
 * stamps `uploadedAt` on the local index and task manifests, and rewrites the
 * manifests' screenshot paths to their R2 keys before the JSON is uploaded.
 *
 * Non-JSON files are uploaded first; JSON files are uploaded once, after the
 * rewrite, so the bucket only ever receives the final manifest contents.
 *
 * NOTE(review): the whole output directory is uploaded, including folders
 * left by earlier runs that used the same directory — confirm this is intended.
 *
 * @param outputDir - Local directory produced by the generator.
 * @param runId - Run identifier used as the key prefix.
 * @returns The base URL under which the run was stored.
 * @throws Error when R2 credentials are missing, or when reading a file or an
 *   upload request fails.
 */
export async function uploadShowcase(
  outputDir: string,
  runId: string,
): Promise<string> {
  const r2 = loadR2Config()
  const client = new S3Client({
    region: 'auto',
    endpoint: `https://${r2.accountId}.r2.cloudflarestorage.com`,
    credentials: {
      accessKeyId: r2.accessKeyId,
      secretAccessKey: r2.secretAccessKey,
    },
  })
  const prefix = `showcase/${runId}`
  const files = await walkDir(outputDir)
  console.log(
    `Uploading ${files.length} files to R2 (${r2.bucket}/${prefix})...`,
  )

  const putFile = async (filePath: string): Promise<void> => {
    await client.send(
      new PutObjectCommand({
        Bucket: r2.bucket,
        Key: toR2Key(prefix, outputDir, filePath),
        Body: await readFile(filePath),
        ContentType: contentTypeFor(filePath),
      }),
    )
  }

  // JSON files may be rewritten below, so their upload is deferred.
  const isDeferredJson = (filePath: string): boolean =>
    filePath.endsWith('.json') && !filePath.includes('node_modules')
  const jsonFiles = files.filter(isDeferredJson)

  for (const filePath of files) {
    if (!isDeferredJson(filePath)) await putFile(filePath)
  }

  // Stamp uploadedAt on index.json
  const indexPath = join(outputDir, 'index.json')
  try {
    const indexData = JSON.parse(
      await readFile(indexPath, 'utf-8'),
    ) as ShowcaseRunIndex
    indexData.uploadedAt = new Date().toISOString()
    await writeFile(indexPath, JSON.stringify(indexData, null, 2))
  } catch {
    // index may not exist if run was partial
  }

  // Stamp uploadedAt on each task manifest and point its screenshots at R2.
  for (const file of jsonFiles) {
    if (!file.endsWith('manifest.json')) continue
    try {
      const manifestData = JSON.parse(
        await readFile(file, 'utf-8'),
      ) as ShowcaseTaskManifest
      // A stamped manifest already holds R2 keys from an earlier upload;
      // rewriting those again would relativise them into invalid keys.
      if (manifestData.uploadedAt) continue
      manifestData.uploadedAt = new Date().toISOString()
      for (const step of manifestData.steps) {
        step.beforeScreenshot = toR2Key(
          prefix,
          outputDir,
          step.beforeScreenshot,
        )
        step.afterScreenshot = toR2Key(prefix, outputDir, step.afterScreenshot)
        if (step.annotatedScreenshot) {
          step.annotatedScreenshot = toR2Key(
            prefix,
            outputDir,
            step.annotatedScreenshot,
          )
        }
      }
      await writeFile(file, JSON.stringify(manifestData, null, 2))
    } catch (err) {
      // Malformed manifests are still uploaded as-is, just not rewritten.
      console.warn(
        `Skipping manifest rewrite for ${file}: ${err instanceof Error ? err.message : String(err)}`,
      )
    }
  }

  for (const filePath of jsonFiles) {
    await putFile(filePath)
  }

  const baseUrl = `https://${r2.bucket}.${r2.accountId}.r2.cloudflarestorage.com/${prefix}`
  console.log(`Upload complete. Base: ${baseUrl}`)
  return baseUrl
}

View File

@@ -701,6 +701,17 @@ export class Browser {
}
}
// --- Element Geometry ---
/**
 * Resolves the centre point of an element on a page, scrolling the element
 * into view first.
 *
 * @param page - Page id used to resolve the session.
 * @param element - Element id within that page.
 * @returns The element's centre as `{ x, y }`.
 */
async getElementCenter(
  page: number,
  element: number,
): Promise<{ x: number; y: number }> {
  const session = await this.resolveSession(page)
  // Scroll before measuring so the centre is taken after the element is in view.
  await elements.scrollIntoView(session, element)
  const center = await elements.getElementCenter(session, element)
  return center
}
// --- Input ---
async click(