Compare commits

..

9 Commits

Author SHA1 Message Date
Nikhil Sonti
71cb158739 feat(eval): add parallel workers support to showcase generator
Add --workers/-w flag for parallel task execution. Each worker gets
its own isolated BrowserOS stack on offset ports, matching the eval
ParallelExecutor pattern. Also uses env-based defaults for model/
provider/baseUrl (SHOWCASE_MODEL, SHOWCASE_PROVIDER, SHOWCASE_BASE_URL)
and reads API key from OPENROUTER_API_KEY.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 17:16:04 -07:00
Nikhil Sonti
654a871841 fix(eval): improve crosshair visibility, fix page ID resolution, add prod tasks
- Crosshair: 2.5x larger (20px ring, 40px lines, 3px stroke), glow
  filter, semi-transparent fill circle, bigger label
- Page ID: add resolvePageId() helper that falls back to first
  available page when the agent's page reference is stale
- Add prod-tasks.jsonl with 20 real-world queries covering finance,
  legal, healthcare, tech, real estate, HR, and more

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 17:14:40 -07:00
Nikhil Sonti
ff167fb855 fix(eval): use openrouter defaults, fix page ID tracking, add showcase tasks
- Default provider/model/apiKey read from env vars (SHOWCASE_PROVIDER,
  SHOWCASE_MODEL, OPENROUTER_API_KEY) instead of hardcoded OpenAI
- Fix stale page ID bug: activePageId now updates when tool calls
  reference a different page
- Add showcase-tasks.jsonl with sample Amazon task
- Add showcase-output/ and .env to .gitignore

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 16:12:33 -07:00
Nikhil Sonti
3d4e493512 feat(eval): add showcase dataset generator with before/after screenshots
Standalone script that runs the BrowserOS agent on tasks and captures
rich execution traces for website showcase galleries:

- Before/after screenshots per tool call via AI SDK callbacks
- CSS crosshair overlays injected via browser.evaluate() for
  element-targeting tools (click, fill, hover, etc.)
- Accessibility snapshot capture at each step
- Per-task JSON manifests with execution metadata
- R2 upload with URL rewriting for cloud-hosted galleries

New files:
- apps/eval/scripts/showcase/ (generate, executor, crosshair,
  manifest, uploader, types)
- apps/eval/.env.example (R2 credentials template)

Also adds Browser.getElementCenter() public method for coordinate
resolution without triggering actions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 15:07:14 -07:00
Nikhil
2b53daf641 fix: prevent deleted scheduled tasks from reappearing after sync (#518)
* fix: prevent deleted scheduled tasks from reappearing after sync

When a scheduled task was deleted, the sync function would see the
remote job missing locally and re-add it, undoing the delete. Fix by
tracking pending deletions in storage so the sync function deletes
them from the backend instead of re-adding them locally.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: use read-modify-write for pending deletions to prevent concurrent clobber

Re-read pendingDeletionStorage before write-back and only remove
resolved IDs, preserving any new entries added by concurrent
removeJob calls during the sync's network I/O.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 11:31:57 -07:00
Nikhil
3cc946ded8 fix(ci): report test pass/fail status on PRs (#520)
The test workflow captured exit codes but never failed the job, so PR
checks always showed green even when tests failed. Exit with the
captured code in the summarize step so each suite properly reports
pass/fail. Not a required check, so failures remain non-blocking.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 11:31:23 -07:00
shivammittal274
70be5c5c21 fix(eval): log agent errors in task progress for CI visibility (#523) 2026-03-21 23:33:19 +05:30
shivammittal274
0f9d93058f chore(eval): remove unused env vars from workflow (OPENROUTER, OPENAI) (#522) 2026-03-21 23:22:03 +05:30
shivammittal274
cafed57832 fix(eval): use CLAUDE_CODE_OAUTH_TOKEN for performance grader auth (#521) 2026-03-21 23:14:23 +05:30
16 changed files with 927 additions and 9 deletions

View File

@@ -49,9 +49,7 @@ jobs:
working-directory: packages/browseros-agent/apps/eval
env:
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
BROWSEROS_BINARY: /usr/bin/browseros
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
run: |

View File

@@ -22,9 +22,7 @@ import {
SCHEDULED_TASK_TOGGLED_EVENT,
SCHEDULED_TASK_VIEW_RESULTS_EVENT,
} from '@/lib/constants/analyticsEvents'
import { useGraphqlMutation } from '@/lib/graphql/useGraphqlMutation'
import { track } from '@/lib/metrics/track'
import { DeleteScheduledJobDocument } from '@/lib/schedules/graphql/syncSchedulesDocument'
import {
scheduledJobRunStorage,
useScheduledJobRuns,
@@ -46,8 +44,6 @@ export const ScheduledTasksPage: FC = () => {
useScheduledJobs()
const { jobRuns, cancelJobRun } = useScheduledJobRuns()
const deleteRemoteJobMutation = useGraphqlMutation(DeleteScheduledJobDocument)
const [activeTab, setActiveTab] = useState<string | null>(null)
const [isDialogOpen, setIsDialogOpen] = useState(false)
const [editingJob, setEditingJob] = useState<ScheduledJob | null>(null)
@@ -102,7 +98,6 @@ export const ScheduledTasksPage: FC = () => {
const confirmDelete = async () => {
if (deleteJobId) {
await removeJob(deleteJobId)
deleteRemoteJobMutation.mutate({ rowId: deleteJobId })
setDeleteJobId(null)
track(SCHEDULED_TASK_DELETED_EVENT)
}

View File

@@ -22,6 +22,13 @@ export const scheduledJobRunStorage = storage.defineItem<ScheduledJobRun[]>(
},
)
export const pendingDeletionStorage = storage.defineItem<string[]>(
'local:scheduledJobsPendingDeletion',
{
fallback: [],
},
)
export function useScheduledJobs() {
const [jobs, setJobs] = useState<ScheduledJob[]>([])
@@ -54,6 +61,11 @@ export function useScheduledJobs() {
const removeJob = async (id: string) => {
await chrome.alarms.clear(getAlarmName(id))
const pending = (await pendingDeletionStorage.getValue()) ?? []
if (!pending.includes(id)) {
await pendingDeletionStorage.setValue([...pending, id])
}
const currentJobs = (await scheduledJobStorage.getValue()) ?? []
await scheduledJobStorage.setValue(currentJobs.filter((j) => j.id !== id))

View File

@@ -5,10 +5,11 @@ import { sentry } from '@/lib/sentry/sentry'
import { createAlarmFromJob } from './createAlarmFromJob'
import {
CreateScheduledJobDocument,
DeleteScheduledJobDocument,
GetScheduledJobsByProfileIdDocument,
UpdateScheduledJobDocument,
} from './graphql/syncSchedulesDocument'
import { scheduledJobStorage } from './scheduleStorage'
import { pendingDeletionStorage, scheduledJobStorage } from './scheduleStorage'
import type { ScheduledJob } from './scheduleTypes'
type RemoteScheduledJob = {
@@ -99,6 +100,32 @@ export async function syncSchedulesToBackend(
}
}
const pendingDeletions = new Set(
(await pendingDeletionStorage.getValue()) ?? [],
)
const resolvedDeletions = new Set<string>()
for (const rowId of pendingDeletions) {
if (remoteJobs.has(rowId)) {
try {
await execute(DeleteScheduledJobDocument, { rowId })
remoteJobs.delete(rowId)
resolvedDeletions.add(rowId)
} catch (error) {
sentry.captureException(error, {
extra: { jobId: rowId, context: 'sync-pending-deletion' },
})
}
} else {
resolvedDeletions.add(rowId)
}
}
const latestPending = (await pendingDeletionStorage.getValue()) ?? []
await pendingDeletionStorage.setValue(
latestPending.filter((id) => !resolvedDeletions.has(id)),
)
const localJobsMap = new Map(localJobs.map((j) => [j.id, j]))
const jobsToAddLocally: ScheduledJob[] = []
const jobsToUpdateLocally: ScheduledJob[] = []

View File

@@ -0,0 +1,11 @@
# Showcase generator — LLM config
OPENROUTER_API_KEY=
SHOWCASE_MODEL=openai/gpt-4o
SHOWCASE_PROVIDER=openrouter
SHOWCASE_BASE_URL=https://openrouter.ai/api/v1
# R2 upload (for --upload flag)
R2_ACCOUNT_ID=
R2_ACCESS_KEY_ID=
R2_SECRET_ACCESS_KEY=
R2_BUCKET=rl-env

View File

@@ -1,3 +1,5 @@
data/raw/
results/
extensions/
showcase-output/
.env

View File

@@ -0,0 +1,20 @@
{"query_id":"prod-financial-advisor-morningstar","dataset":"prod","query":"Look up the Morningstar rating and expense ratio for Vanguard Total Stock Market Index Fund (VTSAX) on morningstar.com","start_url":"https://www.morningstar.com","metadata":{"original_task_id":"prod-financial-advisor-morningstar","category":"finance"}}
{"query_id":"prod-lawyer-pacer-search","dataset":"prod","query":"Go to courtlistener.com and search for recent federal court opinions mentioning 'non-compete agreement' from 2025","start_url":"https://www.courtlistener.com","metadata":{"original_task_id":"prod-lawyer-pacer-search","category":"legal"}}
{"query_id":"prod-doctor-drug-interaction","dataset":"prod","query":"Check drug interactions between metformin and lisinopril on drugs.com","start_url":"https://www.drugs.com/drug_interactions.html","metadata":{"original_task_id":"prod-doctor-drug-interaction","category":"healthcare"}}
{"query_id":"prod-software-eng-github-trending","dataset":"prod","query":"Find the top trending Python repositories on GitHub this week and open the most starred one","start_url":"https://github.com/trending","metadata":{"original_task_id":"prod-software-eng-github-trending","category":"technology"}}
{"query_id":"prod-cfo-sec-filing","dataset":"prod","query":"Look up Apple's most recent 10-K filing on SEC EDGAR and find their total revenue for fiscal year 2024","start_url":"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&company=apple&CIK=&type=10-K&dateb=&owner=include&count=10&search_text=&action=getcompany","metadata":{"original_task_id":"prod-cfo-sec-filing","category":"finance"}}
{"query_id":"prod-realtor-zillow-comp","dataset":"prod","query":"Search for recently sold homes in Palo Alto, CA on Zillow and filter for 3+ bedrooms sold in the last 30 days","start_url":"https://www.zillow.com","metadata":{"original_task_id":"prod-realtor-zillow-comp","category":"real_estate"}}
{"query_id":"prod-hr-manager-linkedin-job","dataset":"prod","query":"Post a job listing search on LinkedIn for a Senior Product Manager role in San Francisco with salary range $180k-$220k","start_url":"https://www.linkedin.com/jobs/","metadata":{"original_task_id":"prod-hr-manager-linkedin-job","category":"hr"}}
{"query_id":"prod-analyst-fred-data","dataset":"prod","query":"Go to FRED and pull up the US Consumer Price Index chart, change the time range to the last 5 years","start_url":"https://fred.stlouisfed.org","metadata":{"original_task_id":"prod-analyst-fred-data","category":"finance"}}
{"query_id":"prod-accountant-irs-form","dataset":"prod","query":"Find and download the latest IRS Form W-9 from irs.gov","start_url":"https://www.irs.gov","metadata":{"original_task_id":"prod-accountant-irs-form","category":"finance"}}
{"query_id":"prod-sales-manager-crm-research","dataset":"prod","query":"Go to g2.com and compare the top 3 CRM software platforms by user rating and pricing","start_url":"https://www.g2.com/categories/crm","metadata":{"original_task_id":"prod-sales-manager-crm-research","category":"sales"}}
{"query_id":"prod-engineer-stackoverflow","dataset":"prod","query":"Search Stack Overflow for how to implement retry logic with exponential backoff in Python and find the highest voted answer","start_url":"https://stackoverflow.com","metadata":{"original_task_id":"prod-engineer-stackoverflow","category":"technology"}}
{"query_id":"prod-pm-producthunt","dataset":"prod","query":"Browse today's top launches on Product Hunt and upvote the highest ranked AI product","start_url":"https://www.producthunt.com","metadata":{"original_task_id":"prod-pm-producthunt","category":"technology"}}
{"query_id":"prod-pharmacist-fda-recall","dataset":"prod","query":"Check the FDA website for any recent drug recalls in the last month","start_url":"https://www.fda.gov/safety/recalls-market-withdrawals-safety-alerts","metadata":{"original_task_id":"prod-pharmacist-fda-recall","category":"healthcare"}}
{"query_id":"prod-investment-analyst-yahoo-finance","dataset":"prod","query":"Look up NVIDIA stock on Yahoo Finance, check the P/E ratio, and add it to a watchlist","start_url":"https://finance.yahoo.com","metadata":{"original_task_id":"prod-investment-analyst-yahoo-finance","category":"finance"}}
{"query_id":"prod-compliance-officer-regulations","dataset":"prod","query":"Search for the latest GDPR enforcement actions on the European Data Protection Board website","start_url":"https://www.edpb.europa.eu/news/news_en","metadata":{"original_task_id":"prod-compliance-officer-regulations","category":"compliance"}}
{"query_id":"prod-management-consultant-mckinsey","dataset":"prod","query":"Go to McKinsey's insights page and find their latest article about generative AI's impact on productivity","start_url":"https://www.mckinsey.com/featured-insights","metadata":{"original_task_id":"prod-management-consultant-mckinsey","category":"consulting"}}
{"query_id":"prod-operations-manager-shipping","dataset":"prod","query":"Track a FedEx package with tracking number 123456789012 on fedex.com","start_url":"https://www.fedex.com/en-us/tracking.html","metadata":{"original_task_id":"prod-operations-manager-shipping","category":"operations"}}
{"query_id":"prod-market-researcher-statista","dataset":"prod","query":"Search Statista for the global AI market size forecast and find the projected value for 2026","start_url":"https://www.statista.com","metadata":{"original_task_id":"prod-market-researcher-statista","category":"research"}}
{"query_id":"prod-nurse-uptodate","dataset":"prod","query":"Search WebMD for the recommended dosing guidelines for adult acetaminophen and check the maximum daily dose","start_url":"https://www.webmd.com","metadata":{"original_task_id":"prod-nurse-uptodate","category":"healthcare"}}
{"query_id":"prod-executive-flights","dataset":"prod","query":"Search Google Flights for a business class round trip from SFO to JFK departing next Monday returning Friday","start_url":"https://www.google.com/travel/flights","metadata":{"original_task_id":"prod-executive-flights","category":"travel"}}

View File

@@ -0,0 +1 @@
{"query_id":"showcase-amazon-order","dataset":"showcase","query":"Open amazon.com and order Sensodyne toothpaste","start_url":"https://www.amazon.com","metadata":{"original_task_id":"showcase-amazon-order"}}

View File

@@ -0,0 +1,52 @@
import type { Browser } from '@browseros/server/browser'
const CROSSHAIR_ID = '__browseros_showcase_crosshair__'

/** Inline style of the full-viewport overlay that hosts the crosshair SVG. */
const CROSSHAIR_OVERLAY_STYLE =
  'position:fixed;top:0;left:0;width:100vw;height:100vh;pointer-events:none;z-index:2147483647'

/** Escapes the characters that carry meaning in HTML/SVG markup so text renders literally. */
function escapeCrosshairLabel(text: string): string {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}

/**
 * Draws a crosshair overlay with a tool-name label on the given page.
 *
 * Any overlay left from a previous call (same element id) is removed first.
 * The overlay is a fixed, full-viewport `div` with `pointer-events:none`, so it
 * does not intercept input on the page.
 *
 * @param browser - Browser used to evaluate the injection script.
 * @param pageId - Page to draw on.
 * @param coords - Crosshair centre; values are rounded to whole pixels.
 * @param toolName - Tool name shown in the label (underscores become spaces).
 */
export async function injectCrosshair(
  browser: Browser,
  pageId: number,
  coords: { x: number; y: number },
  toolName: string,
): Promise<void> {
  const x = Math.round(coords.x)
  const y = Math.round(coords.y)
  const label = toolName.replace(/_/g, ' ')
  const labelWidth = Math.round(label.length * 9 + 24)
  const labelX = x + 32
  const labelY = y - 32

  const glowStroke = 'stroke="#FF3B30" stroke-width="3" filter="url(#glow)"'
  const svg = [
    '<svg style="position:absolute;top:0;left:0;width:100%;height:100%" xmlns="http://www.w3.org/2000/svg">',
    '<defs><filter id="glow"><feGaussianBlur stdDeviation="3" result="blur"/><feMerge><feMergeNode in="blur"/><feMergeNode in="SourceGraphic"/></feMerge></filter></defs>',
    `<circle cx="${x}" cy="${y}" r="28" fill="rgba(255,59,48,0.12)" stroke="none"/>`,
    `<line x1="${x - 40}" y1="${y}" x2="${x - 12}" y2="${y}" ${glowStroke}/>`,
    `<line x1="${x + 12}" y1="${y}" x2="${x + 40}" y2="${y}" ${glowStroke}/>`,
    `<line x1="${x}" y1="${y - 40}" x2="${x}" y2="${y - 12}" ${glowStroke}/>`,
    `<line x1="${x}" y1="${y + 12}" x2="${x}" y2="${y + 40}" ${glowStroke}/>`,
    `<circle cx="${x}" cy="${y}" r="20" fill="none" ${glowStroke}/>`,
    `<circle cx="${x}" cy="${y}" r="4" fill="#FF3B30"/>`,
    `<rect x="${labelX}" y="${labelY}" rx="6" ry="6" width="${labelWidth}" height="28" fill="rgba(0,0,0,0.85)"/>`,
    // The label is the only caller-controlled text, so it is escaped before it reaches innerHTML.
    `<text x="${labelX + 12}" y="${labelY + 19}" font-family="system-ui,-apple-system,sans-serif" font-size="14" fill="white" font-weight="600">${escapeCrosshairLabel(label)}</text>`,
    '</svg>',
  ].join('')

  // JSON.stringify yields valid JS string literals, so quotes or backslashes in
  // the embedded values can never terminate the literal or break the script.
  const script = `(() => {
  const existing = document.getElementById(${JSON.stringify(CROSSHAIR_ID)});
  if (existing) existing.remove();
  const el = document.createElement('div');
  el.id = ${JSON.stringify(CROSSHAIR_ID)};
  el.style.cssText = ${JSON.stringify(CROSSHAIR_OVERLAY_STYLE)};
  el.innerHTML = ${JSON.stringify(svg)};
  document.body.appendChild(el);
})()`

  await browser.evaluate(pageId, script)
}
/**
 * Deletes the crosshair overlay from the page when one is present.
 *
 * The lookup uses optional chaining inside the page, so calling this on a page
 * without an overlay is a no-op.
 *
 * @param browser - Browser used to evaluate the removal script.
 * @param pageId - Page to clean up.
 */
export async function removeCrosshair(
  browser: Browser,
  pageId: number,
): Promise<void> {
  const removalScript = `document.getElementById('${CROSSHAIR_ID}')?.remove()`
  await browser.evaluate(pageId, removalScript)
}

View File

@@ -0,0 +1,288 @@
import { randomUUID } from 'node:crypto'
import { mkdir, writeFile } from 'node:fs/promises'
import { join } from 'node:path'
import { AiSdkAgent } from '@browseros/server/agent/tool-loop'
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import { Browser } from '@browseros/server/browser'
import { CdpBackend } from '@browseros/server/browser/backends/cdp'
import { registry } from '@browseros/server/tools/registry'
import type { Task } from '../../src/types'
import { injectCrosshair, removeCrosshair } from './crosshair'
import { buildTaskManifest, saveTaskManifest } from './manifest'
import type { ShowcaseStep, ShowcaseTaskManifest } from './types'
// Tool names whose input carries an `element` id. For these, the executor
// resolves the element's centre via browser.getElementCenter() so a crosshair
// can be drawn on the annotated screenshot.
const ELEMENT_TOOLS = new Set([
'click',
'fill',
'hover',
'clear',
'select_option',
'drag',
'focus',
'check',
'uncheck',
])
// Tool names whose input already carries numeric `x`/`y`; those values are used
// directly as the crosshair position.
const COORDINATE_TOOLS = new Set(['click_at', 'hover_at', 'type_at', 'drag_at'])
// Inert stand-in passed as the second argument to `new Browser(...)`: every
// method is a no-op and it always reports itself as not connected.
const CONTROLLER_STUB = {
start: async () => {},
stop: async () => {},
isConnected: () => false,
send: async () => ({}),
// biome-ignore lint/suspicious/noExplicitAny: ControllerBackend type not exported
} as any
/**
 * Maps a possibly stale page id onto a page that currently exists.
 *
 * @param browser - Browser whose open pages are listed.
 * @param requestedId - Page id the caller would like to use.
 * @returns `requestedId` when that page is still listed or when no pages are
 *   listed at all; otherwise the id of the first listed page.
 */
async function resolvePageId(
  browser: Browser,
  requestedId: number,
): Promise<number> {
  const openPages = await browser.listPages()
  const stillOpen = openPages.some((page) => page.pageId === requestedId)
  // With nothing to fall back to, hand the requested id back unchanged.
  if (stillOpen || openPages.length === 0) return requestedId
  return openPages[0].pageId
}
/** Outcome of a single `executeShowcaseTask` run. */
export interface ExecuteTaskResult {
/** Trace manifest built for the run (also written to disk by the executor). */
manifest: ShowcaseTaskManifest
/** 'timeout' when the abort signal fired, 'failed' when the agent loop threw for another reason, otherwise 'completed'. */
status: 'completed' | 'timeout' | 'failed'
}
/**
 * Runs one showcase task against a browser reachable over CDP and records a
 * step-by-step trace of the agent's tool calls.
 *
 * For every tool call a "before" screenshot and a best-effort accessibility
 * snapshot are captured. When target coordinates can be resolved, an extra
 * screenshot annotated with a crosshair is taken. An "after" screenshot is
 * captured when the tool call finishes. Screenshots are written to
 * `<outputDir>/<executionId>/screenshots/` and the manifest is persisted via
 * `saveTaskManifest`.
 *
 * @param task - Task to run: `query` is the prompt; `start_url`, when set and
 *   not `about:blank`, is opened before the agent starts.
 * @param cdpPort - Port passed to `CdpBackend`.
 * @param outputDir - Directory under which the per-execution folder is created.
 * @param agentConfig - Model/provider settings forwarded to the agent.
 * @param timeoutMs - Milliseconds after which the agent run is aborted.
 * @returns The manifest plus a status: 'timeout' when the abort signal fired,
 *   'failed' when the agent loop threw for another reason, else 'completed'.
 * @throws Errors raised outside the agent loop (connecting, listing pages,
 *   initial navigation, agent creation, saving the manifest) propagate.
 */
export async function executeShowcaseTask(
  task: Task,
  cdpPort: number,
  outputDir: string,
  agentConfig: {
    model: string
    provider: string
    apiKey?: string
    baseUrl?: string
  },
  timeoutMs: number,
): Promise<ExecuteTaskResult> {
  const executionId = randomUUID()
  const taskDir = join(outputDir, executionId)
  const screenshotDir = join(taskDir, 'screenshots')
  await mkdir(screenshotDir, { recursive: true })

  const cdp = new CdpBackend({ port: cdpPort })
  await cdp.connect()

  let agent: AiSdkAgent | null = null

  // Everything after a successful connect runs inside this try so the CDP
  // connection is released even when page listing or the initial navigation
  // throws (previously those ran before the try and leaked the connection).
  try {
    const browser = new Browser(cdp, CONTROLLER_STUB)
    const pages = await browser.listPages()
    const activePage = pages[0]
    let activePageId = activePage?.pageId ?? 1

    // Navigate to start URL
    if (task.start_url && task.start_url !== 'about:blank') {
      await browser.goto(activePageId, task.start_url)
    }

    const conversationId = randomUUID()
    const resolvedConfig: ResolvedAgentConfig = {
      conversationId,
      // biome-ignore lint/suspicious/noExplicitAny: LLMProvider type validated at runtime
      provider: agentConfig.provider as any,
      model: agentConfig.model,
      apiKey: agentConfig.apiKey,
      baseUrl: agentConfig.baseUrl,
      workingDir: `/tmp/browseros-showcase-${conversationId}`,
      evalMode: true,
      supportsImages: true,
    }
    const browserContext = activePage
      ? {
          activeTab: {
            id: activePage.tabId,
            pageId: activePage.pageId,
            url: activePage.url,
            title: activePage.title,
          },
        }
      : undefined

    const steps: ShowcaseStep[] = []
    let stepNum = 0
    let finalText: string | null = null
    let status: 'completed' | 'timeout' | 'failed' = 'completed'
    // Duration is measured from here, i.e. it excludes the initial navigation.
    const startTime = Date.now()

    agent = await AiSdkAgent.create({
      resolvedConfig,
      browser,
      registry,
      browserContext,
    })

    // Step under construction between a tool call's start and finish callbacks.
    let pendingStep: Partial<ShowcaseStep> | null = null
    const abortController = new AbortController()
    const timeoutHandle = setTimeout(() => abortController.abort(), timeoutMs)

    try {
      const result = await agent.toolLoopAgent.generate({
        prompt: task.query,
        abortSignal: abortController.signal,
        experimental_onToolCallStart: async ({ toolCall }) => {
          try {
            const input = (toolCall.input ?? {}) as Record<string, unknown>
            // Follow the agent when a tool call targets a different page.
            if (typeof input.page === 'number') {
              activePageId = input.page
            }
            const pageId = await resolvePageId(browser, activePageId)
            activePageId = pageId

            const beforeResult = await browser.screenshot(pageId, {
              format: 'png',
              fullPage: false,
            })
            const beforePath = join(screenshotDir, `${stepNum}_before.png`)
            await writeFile(
              beforePath,
              Buffer.from(beforeResult.data, 'base64'),
            )

            let axTree = ''
            try {
              axTree = await browser.snapshot(pageId)
            } catch {
              // snapshot can fail on some pages
            }

            let coords: { x: number; y: number } | undefined
            const elementId = input.element as number | undefined
            if (
              elementId !== undefined &&
              ELEMENT_TOOLS.has(toolCall.toolName)
            ) {
              try {
                coords = await browser.getElementCenter(pageId, elementId)
              } catch {
                // element may have been removed
              }
            } else if (
              COORDINATE_TOOLS.has(toolCall.toolName) &&
              typeof input.x === 'number' &&
              typeof input.y === 'number'
            ) {
              coords = { x: input.x, y: input.y }
            }

            const step: Partial<ShowcaseStep> = {
              stepIndex: stepNum,
              toolName: toolCall.toolName,
              toolInput: input,
              beforeScreenshot: beforePath,
              accessibilitySnapshot: axTree,
              elementCoordinates: coords,
              timestamp: new Date().toISOString(),
            }
            pendingStep = step

            if (coords) {
              try {
                await injectCrosshair(
                  browser,
                  pageId,
                  coords,
                  toolCall.toolName,
                )
                try {
                  const annotatedResult = await browser.screenshot(pageId, {
                    format: 'png',
                    fullPage: false,
                  })
                  const annotatedPath = join(
                    screenshotDir,
                    `${stepNum}_annotated.png`,
                  )
                  await writeFile(
                    annotatedPath,
                    Buffer.from(annotatedResult.data, 'base64'),
                  )
                  step.annotatedScreenshot = annotatedPath
                } finally {
                  // Always take the overlay down again, otherwise a failed
                  // capture would leave it visible in the "after" screenshot.
                  await removeCrosshair(browser, pageId)
                }
              } catch {
                // annotation is best-effort
              }
            }
          } catch (err) {
            console.warn(
              ` Step ${stepNum} before-capture failed: ${err instanceof Error ? err.message : String(err)}`,
            )
          }
        },
        experimental_onToolCallFinish: async ({ toolResult }) => {
          try {
            const pageId = await resolvePageId(browser, activePageId)
            activePageId = pageId
            const afterResult = await browser.screenshot(pageId, {
              format: 'png',
              fullPage: false,
            })
            const afterPath = join(screenshotDir, `${stepNum}_after.png`)
            await writeFile(afterPath, Buffer.from(afterResult.data, 'base64'))
            // A step is only recorded when its before-capture succeeded.
            if (pendingStep) {
              pendingStep.afterScreenshot = afterPath
              pendingStep.toolOutput = toolResult
              steps.push(pendingStep as ShowcaseStep)
              stepNum++
            }
          } catch (err) {
            console.warn(
              ` Step ${stepNum} after-capture failed: ${err instanceof Error ? err.message : String(err)}`,
            )
          }
          pendingStep = null
        },
        onStepFinish: async ({ text }) => {
          // Attach the assistant's text to the most recently recorded step.
          if (text && steps.length > 0) {
            const lastStep = steps[steps.length - 1]
            lastStep.assistantText = text
          }
        },
      })
      finalText = result.text || null
    } catch (err) {
      if (abortController.signal.aborted) {
        status = 'timeout'
        console.log(` ${task.query_id}: timed out after ${timeoutMs / 1000}s`)
      } else {
        status = 'failed'
        console.error(
          ` ${task.query_id}: failed — ${err instanceof Error ? err.message : String(err)}`,
        )
      }
    } finally {
      clearTimeout(timeoutHandle)
    }

    const totalDurationMs = Date.now() - startTime
    const manifest = buildTaskManifest({
      executionId,
      taskId: task.query_id,
      query: task.query,
      startUrl: task.start_url ?? 'about:blank',
      dataset: task.dataset,
      steps,
      finalAnswer: finalText,
      model: agentConfig.model,
      provider: agentConfig.provider,
      totalDurationMs,
    })
    await saveTaskManifest(outputDir, executionId, manifest)
    return { manifest, status }
  } finally {
    if (agent) await agent.dispose().catch(() => {})
    await cdp.disconnect().catch(() => {})
  }
}

View File

@@ -0,0 +1,235 @@
#!/usr/bin/env bun
import { mkdir } from 'node:fs/promises'
import { parseArgs } from 'node:util'
import { BrowserOSAppManager } from '../../src/runner/browseros-app-manager'
import { loadTasks } from '../../src/runner/task-loader'
import type { Task } from '../../src/types'
import { executeShowcaseTask } from './executor'
import { saveRunIndex } from './manifest'
import type { ShowcaseRunIndex } from './types'
import { uploadShowcase } from './uploader'
// Base ports for the BrowserOS stack. Workers derive their own CDP port as
// BASE_PORTS.cdp + workerIndex, and the whole object is handed to
// BrowserOSAppManager together with the worker index.
const BASE_PORTS = { cdp: 9010, server: 9110, extension: 9310 }
// Command-line options. Numeric options (workers, cdp-port, timeout) are
// declared as strings here and converted when `config` is built.
const { values } = parseArgs({
args: Bun.argv.slice(2),
options: {
tasks: { type: 'string', short: 't' },
output: { type: 'string', short: 'o', default: './showcase-output' },
model: { type: 'string', short: 'm' },
provider: { type: 'string', short: 'p' },
'base-url': { type: 'string' },
workers: { type: 'string', short: 'w', default: '1' },
'cdp-port': { type: 'string' },
timeout: { type: 'string', default: '300000' },
upload: { type: 'boolean', default: false },
help: { type: 'boolean', short: 'h' },
},
})
// Print usage when asked for help or when the required --tasks flag is
// missing. Exit code is 0 for an explicit --help and 1 for a missing --tasks.
if (values.help || !values.tasks) {
console.log(`
Showcase Dataset Generator
Runs the BrowserOS agent on tasks and captures before/after screenshots
with crosshair annotations for element-targeting tool calls.
Usage:
bun scripts/showcase/generate.ts --tasks <path> [options]
Options:
-t, --tasks <path> JSONL task file (required)
-o, --output <dir> Output directory (default: ./showcase-output)
-m, --model <model> LLM model (env: SHOWCASE_MODEL, default: openai/gpt-4o)
-p, --provider <name> LLM provider (env: SHOWCASE_PROVIDER, default: openrouter)
--base-url <url> LLM base URL (env: SHOWCASE_BASE_URL)
-w, --workers <n> Parallel workers (default: 1)
--cdp-port <port> Connect to existing Chrome (single-worker only)
--timeout <ms> Per-task timeout in ms (default: 300000)
--upload Upload results to R2 after generation
-h, --help Show this help
`)
process.exit(values.help ? 0 : 1)
}
/**
 * Converts a numeric CLI flag, exiting with an error when the value is not a
 * finite number. Without this check a typo such as `--workers abc` becomes NaN,
 * which silently spawns zero workers, and a NaN timeout aborts every task
 * almost immediately.
 */
function parseNumericFlag(flag: string, raw: string): number {
  const parsed = Number(raw)
  if (!Number.isFinite(parsed)) {
    console.error(`Invalid value for ${flag}: "${raw}" (expected a number)`)
    process.exit(1)
  }
  return parsed
}

// Effective run configuration. Precedence for model/provider/base URL is
// CLI flag, then environment variable, then built-in default.
const config = {
  tasks: values.tasks as string,
  output: (values.output ?? './showcase-output') as string,
  model: (values.model ??
    process.env.SHOWCASE_MODEL ??
    'openai/gpt-4o') as string,
  provider: (values.provider ??
    process.env.SHOWCASE_PROVIDER ??
    'openrouter') as string,
  baseUrl: (values['base-url'] ?? process.env.SHOWCASE_BASE_URL) as
    | string
    | undefined,
  // At least one worker; fractional values are rounded down to a whole count.
  workers: Math.max(
    1,
    Math.floor(parseNumericFlag('--workers', values.workers ?? '1')),
  ),
  cdpPort: values['cdp-port']
    ? parseNumericFlag('--cdp-port', values['cdp-port'])
    : undefined,
  timeout: parseNumericFlag('--timeout', values.timeout ?? '300000'),
  upload: values.upload ?? false,
}
// An externally managed Chrome is a single CDP endpoint, so it cannot be
// combined with more than one worker.
if (config.cdpPort && config.workers > 1) {
console.error('--cdp-port only works with a single worker (--workers 1)')
process.exit(1)
}
// The API key comes from the environment only; OPENROUTER_API_KEY takes
// precedence over OPENAI_API_KEY.
const apiKey = process.env.OPENROUTER_API_KEY ?? process.env.OPENAI_API_KEY
if (!apiKey) {
console.error(
'Missing API key: set OPENROUTER_API_KEY or OPENAI_API_KEY environment variable',
)
process.exit(1)
}
const { tasks } = await loadTasks({ type: 'file', path: config.tasks })
console.log(`Loaded ${tasks.length} task(s), ${config.workers} worker(s)`)
await mkdir(config.output, { recursive: true })
// Run id = first 10 characters of the ISO timestamp (the date) plus the first
// 8 characters of a random UUID.
const runId = `${new Date().toISOString().slice(0, 10)}-${crypto.randomUUID().slice(0, 8)}`
// Shared index that every worker appends one entry to per finished task.
const runIndex: ShowcaseRunIndex = {
runId,
createdAt: new Date().toISOString(),
agentConfig: { model: config.model, provider: config.provider },
tasks: [],
}
console.log(`\nRun ID: ${runId}`)
console.log(`Output: ${config.output}\n`)
// --- Task Queue ---
/**
 * Hands out tasks one at a time, in list order, to whoever asks next.
 * Once stopped, it yields no further tasks.
 */
class TaskQueue {
  private cursor = 0
  private halted = false

  constructor(private tasks: Task[]) {}

  /** Returns the next task, or `null` when the queue is stopped or exhausted. */
  next(): Task | null {
    const exhausted = this.cursor >= this.tasks.length
    if (this.halted || exhausted) return null
    const task = this.tasks[this.cursor]
    this.cursor += 1
    return task
  }

  /** Prevents any further tasks from being handed out. */
  stop(): void {
    this.halted = true
  }
}
// Shared state for all workers: the task queue, a counter used for the
// "[n/total]" progress prefix (incremented when a task is picked up), and every
// app manager created so far so the signal handler can shut them down.
const queue = new TaskQueue(tasks)
let completedCount = 0
const appManagers: BrowserOSAppManager[] = []
// --- Signal handling ---
// On SIGINT/SIGTERM: stop handing out tasks, kill every launched BrowserOS
// instance (failures ignored via allSettled), then exit with code 0.
const onSignal = async () => {
console.log('\nShutting down workers...')
queue.stop()
await Promise.allSettled(appManagers.map((m) => m.killApp()))
process.exit(0)
}
process.on('SIGINT', onSignal)
process.on('SIGTERM', onSignal)
// --- Worker ---
/**
 * Worker loop: pulls tasks from the shared queue until it is empty or stopped,
 * runs each through `executeShowcaseTask`, and appends one entry per task to
 * the shared run index.
 *
 * Unless `--cdp-port` was given, the worker launches its own BrowserOS instance
 * (CDP port `BASE_PORTS.cdp + workerIndex`), restarts it before every task, and
 * kills it when the loop ends, including when startup or a restart throws.
 *
 * @param workerIndex - Zero-based worker number; used as the port offset and in log prefixes.
 */
async function runWorker(workerIndex: number): Promise<void> {
  let appManager: BrowserOSAppManager | null = null
  let cdpPort = config.cdpPort ?? BASE_PORTS.cdp + workerIndex
  const agentConfig = {
    model: config.model,
    provider: config.provider,
    apiKey,
    baseUrl: config.baseUrl,
  }
  const tag = config.workers > 1 ? `[W${workerIndex}] ` : ''

  try {
    // Startup happens inside the try so a browser that fails half-way through
    // launching is still killed by the finally block below.
    if (!config.cdpPort) {
      appManager = new BrowserOSAppManager(workerIndex, BASE_PORTS)
      appManagers.push(appManager)
      console.log(` [W${workerIndex}] Starting BrowserOS...`)
      await appManager.restart()
      cdpPort = BASE_PORTS.cdp + workerIndex
    }

    while (true) {
      const task = queue.next()
      if (!task) break

      // Incremented on pick-up, so this is the ordinal of the task being started.
      completedCount++
      console.log(
        `${tag}[${completedCount}/${tasks.length}] ${task.query_id}: ${task.query}`,
      )

      // Restart browser between tasks for clean state
      if (appManager) {
        await appManager.restart()
      }

      try {
        const { manifest, status } = await executeShowcaseTask(
          task,
          cdpPort,
          config.output,
          agentConfig,
          config.timeout,
        )
        runIndex.tasks.push({
          executionId: manifest.executionId,
          taskId: task.query_id,
          query: task.query,
          stepCount: manifest.steps.length,
          status,
          manifestPath: `${manifest.executionId}/manifest.json`,
        })
        const duration = (manifest.totalDurationMs / 1000).toFixed(1)
        // Separator added: the status used to run straight into the step count.
        console.log(
          `${tag} ${status.toUpperCase()} — ${manifest.steps.length} steps, ${duration}s\n`,
        )
      } catch (err) {
        console.error(
          `${tag} FAILED — ${err instanceof Error ? err.message : String(err)}\n`,
        )
        // No manifest exists when the executor itself threw, hence the placeholders.
        runIndex.tasks.push({
          executionId: 'unknown',
          taskId: task.query_id,
          query: task.query,
          stepCount: 0,
          status: 'failed',
          manifestPath: '',
        })
      }
    }
  } finally {
    if (appManager) await appManager.killApp()
  }
}
// --- Run ---
// Start all workers, wait for every one of them to finish, persist the run
// index, print a summary and optionally upload. allSettled (rather than all)
// means a crashed worker no longer discards the results of the others: the
// index is saved first and the crash is re-thrown afterwards so the process
// still terminates with an error.
try {
  const outcomes = await Promise.allSettled(
    Array.from({ length: config.workers }, (_, i) => runWorker(i)),
  )
  const crashes = outcomes.filter(
    (outcome): outcome is PromiseRejectedResult =>
      outcome.status === 'rejected',
  )
  for (const crash of crashes) {
    console.error(
      `Worker crashed — ${crash.reason instanceof Error ? crash.reason.message : String(crash.reason)}`,
    )
  }

  await saveRunIndex(config.output, runIndex)
  console.log(`\nResults saved to: ${config.output}`)
  console.log(
    `Tasks: ${runIndex.tasks.filter((t) => t.status === 'completed').length} completed, ` +
      `${runIndex.tasks.filter((t) => t.status === 'failed').length} failed, ` +
      `${runIndex.tasks.filter((t) => t.status === 'timeout').length} timed out`,
  )

  if (crashes.length > 0) {
    // Surface the first worker failure, skipping the upload of a partial run.
    throw crashes[0].reason
  }

  if (config.upload) {
    console.log('\nUploading to R2...')
    const baseUrl = await uploadShowcase(config.output, runId)
    console.log(`Uploaded to: ${baseUrl}`)
  }
} finally {
  process.off('SIGINT', onSignal)
  process.off('SIGTERM', onSignal)
}

View File

@@ -0,0 +1,52 @@
import { writeFile } from 'node:fs/promises'
import { join } from 'node:path'
import type {
ShowcaseRunIndex,
ShowcaseStep,
ShowcaseTaskManifest,
} from './types'
/**
 * Assembles the manifest object for one task execution.
 *
 * `model` and `provider` are grouped under `agentConfig`; `createdAt` is
 * stamped with the current time as an ISO string. All other fields are copied
 * through unchanged.
 *
 * @param opts - Execution metadata, recorded steps and agent settings.
 * @returns The manifest, ready to be serialised.
 */
export function buildTaskManifest(opts: {
  executionId: string
  taskId: string
  query: string
  startUrl: string
  dataset: string
  steps: ShowcaseStep[]
  finalAnswer: string | null
  model: string
  provider: string
  totalDurationMs: number
}): ShowcaseTaskManifest {
  const {
    executionId,
    taskId,
    query,
    startUrl,
    dataset,
    steps,
    finalAnswer,
    model,
    provider,
    totalDurationMs,
  } = opts
  const agentConfig = { model, provider }
  const createdAt = new Date().toISOString()
  // Key order is kept stable because it determines the order in the JSON file.
  return {
    executionId,
    taskId,
    query,
    startUrl,
    dataset,
    steps,
    finalAnswer,
    agentConfig,
    totalDurationMs,
    createdAt,
  }
}
/**
 * Writes a task manifest as pretty-printed JSON to
 * `<outputDir>/<executionId>/manifest.json`.
 *
 * The execution directory is not created here; it must already exist.
 *
 * @returns The path the manifest was written to.
 */
export async function saveTaskManifest(
  outputDir: string,
  executionId: string,
  manifest: ShowcaseTaskManifest,
): Promise<string> {
  const target = join(outputDir, executionId, 'manifest.json')
  const serialized = JSON.stringify(manifest, null, 2)
  await writeFile(target, serialized)
  return target
}
/**
 * Writes the run index as pretty-printed JSON to `<outputDir>/index.json`.
 *
 * @returns The path the index was written to.
 */
export async function saveRunIndex(
  outputDir: string,
  index: ShowcaseRunIndex,
): Promise<string> {
  const target = join(outputDir, 'index.json')
  const serialized = JSON.stringify(index, null, 2)
  await writeFile(target, serialized)
  return target
}

View File

@@ -0,0 +1,53 @@
/**
 * Options for a showcase generation run.
 *
 * NOTE(review): none of the code visible alongside this file references this
 * interface (the generator script builds its own config object) — confirm it
 * is still needed.
 */
export interface ShowcaseConfig {
/** Path to the task file. */
tasks: string
/** Output directory. */
output: string
/** Whether results are uploaded after generation. */
upload: boolean
model: string
provider: string
/** Presumably the name of the environment variable holding the API key — TODO confirm. */
apiKeyEnv: string
/** CDP port of an already running browser, when one should be reused. */
cdpPort?: number
/** Per-task timeout; presumably milliseconds, matching the CLI flag — TODO confirm. */
timeout: number
}
/** One recorded step of a task execution, stored in a task manifest's `steps` array. */
export interface ShowcaseStep {
// NOTE(review): presumably the position of the step within the task — confirm whether 0- or 1-based.
stepIndex: number
// NOTE(review): presumably the name, arguments and result of the tool call for this step — confirm.
toolName: string
toolInput: Record<string, unknown>
toolOutput: unknown
// NOTE(review): presumably the targeted element's center point; coordinate space not visible here — confirm.
elementCoordinates?: { x: number; y: number }
/**
 * Screenshot locations. The uploader treats these as file paths and rewrites
 * them to R2 object keys (relative to the output directory) after upload.
 */
beforeScreenshot: string
afterScreenshot: string
/** Optional; rewritten by the uploader the same way as the other screenshots when set. */
annotatedScreenshot?: string
// NOTE(review): format of the snapshot text is not visible here — confirm.
accessibilitySnapshot: string
assistantText?: string
// NOTE(review): presumably an ISO-8601 timestamp like the other date fields — confirm.
timestamp: string
}
/**
 * Per-task manifest, written as `manifest.json` inside the task's
 * `<executionId>` directory.
 */
export interface ShowcaseTaskManifest {
/** Also used as the name of the task's subdirectory under the output directory. */
executionId: string
taskId: string
query: string
startUrl: string
dataset: string
/** Recorded steps; the uploader iterates these to rewrite screenshot paths. */
steps: ShowcaseStep[]
/** `null` when no final answer is available. */
finalAnswer: string | null
/** Model and provider the task was run with. */
agentConfig: { model: string; provider: string }
totalDurationMs: number
/** ISO-8601 time at which the manifest object was built. */
createdAt: string
/** ISO-8601 time stamped by the uploader; absent until the manifest has been uploaded. */
uploadedAt?: string
}
/** Run-level index, written as `index.json` at the root of the output directory. */
export interface ShowcaseRunIndex {
runId: string
createdAt: string
/** ISO-8601 time stamped by the uploader; absent until the index has been uploaded. */
uploadedAt?: string
agentConfig: { model: string; provider: string }
/** One summary entry per task in the run. */
tasks: Array<{
executionId: string
taskId: string
query: string
stepCount: number
/** Outcome of the task; the end-of-run summary counts tasks per status. */
status: 'completed' | 'timeout' | 'failed'
// NOTE(review): whether this path is absolute or relative to the output directory is not visible here — confirm.
manifestPath: string
}>
}

View File

@@ -0,0 +1,155 @@
import { readdir, readFile, writeFile } from 'node:fs/promises'
import { extname, join, relative } from 'node:path'
import { PutObjectCommand, S3Client } from '@aws-sdk/client-s3'
import type { ShowcaseRunIndex, ShowcaseTaskManifest } from './types'
/** R2 connection settings, populated from environment variables by `loadR2Config`. */
interface R2Config {
/** From `R2_ACCOUNT_ID`; used to build the R2 endpoint host name. */
accountId: string
/** From `R2_ACCESS_KEY_ID`. */
accessKeyId: string
/** From `R2_SECRET_ACCESS_KEY`. */
secretAccessKey: string
/** From `R2_BUCKET`, falling back to `'rl-env'` when unset. */
bucket: string
}
/**
 * Maps a file path to the Content-Type used when uploading it.
 *
 * The extension is matched case-insensitively; anything that is not
 * `.png`, `.json` or `.jsonl` falls back to `application/octet-stream`.
 */
function contentTypeFor(filePath: string): string {
  switch (extname(filePath).toLowerCase()) {
    case '.png':
      return 'image/png'
    case '.json':
      return 'application/json'
    case '.jsonl':
      return 'application/jsonl'
    default:
      return 'application/octet-stream'
  }
}
/**
 * Reads the R2 connection settings from the environment.
 *
 * `R2_ACCOUNT_ID`, `R2_ACCESS_KEY_ID` and `R2_SECRET_ACCESS_KEY` are required
 * (empty strings count as missing); `R2_BUCKET` falls back to `'rl-env'`.
 *
 * @throws Error when any of the three required variables is missing.
 */
function loadR2Config(): R2Config {
  const {
    R2_ACCOUNT_ID: accountId,
    R2_ACCESS_KEY_ID: accessKeyId,
    R2_SECRET_ACCESS_KEY: secretAccessKey,
  } = process.env
  const bucket = process.env.R2_BUCKET ?? 'rl-env'

  if (accountId && accessKeyId && secretAccessKey) {
    return { accountId, accessKeyId, secretAccessKey, bucket }
  }

  throw new Error(
    'Missing R2 credentials. Set R2_ACCOUNT_ID, R2_ACCESS_KEY_ID, R2_SECRET_ACCESS_KEY',
  )
}
/**
 * Builds the R2 object key for a local file: `prefix`, a slash, then the
 * file's path relative to `outputDir` with every backslash turned into a
 * forward slash.
 */
function toR2Key(prefix: string, outputDir: string, filePath: string): string {
  const relativePath = relative(outputDir, filePath)
  const slashed = relativePath.split('\\').join('/')
  return [prefix, slashed].join('/')
}
/**
 * Recursively lists every non-directory entry beneath `dir`.
 *
 * Entries are visited in the order `readdir` returns them, and a
 * subdirectory's contents are emitted at the position of that subdirectory.
 * Directories themselves are not included in the result.
 *
 * @returns Paths formed by joining `dir` with each entry's relative location.
 */
async function walkDir(dir: string): Promise<string[]> {
  const collected: string[] = []
  for (const dirent of await readdir(dir, { withFileTypes: true })) {
    const childPath = join(dir, dirent.name)
    if (!dirent.isDirectory()) {
      collected.push(childPath)
      continue
    }
    const nested = await walkDir(childPath)
    collected.push(...nested)
  }
  return collected
}
/** Reads one local file and stores it in the bucket under `key`. */
async function putFile(
  client: S3Client,
  bucket: string,
  key: string,
  filePath: string,
  contentType: string,
): Promise<void> {
  const data = await readFile(filePath)
  await client.send(
    new PutObjectCommand({
      Bucket: bucket,
      Key: key,
      Body: data,
      ContentType: contentType,
    }),
  )
}

/** True when `err` is a filesystem "no such file" error. */
function isMissingFileError(err: unknown): boolean {
  return (
    typeof err === 'object' &&
    err !== null &&
    (err as { code?: unknown }).code === 'ENOENT'
  )
}

/** Sets `uploadedAt` on the run index file, rewriting it in place. */
async function stampRunIndex(indexPath: string): Promise<void> {
  const indexData = JSON.parse(
    await readFile(indexPath, 'utf-8'),
  ) as ShowcaseRunIndex
  indexData.uploadedAt = new Date().toISOString()
  await writeFile(indexPath, JSON.stringify(indexData, null, 2))
}

/**
 * Sets `uploadedAt` on one task manifest and rewrites its screenshot paths to
 * R2 keys, rewriting the file in place.
 */
async function stampTaskManifest(
  file: string,
  prefix: string,
  outputDir: string,
): Promise<void> {
  const manifestData = JSON.parse(
    await readFile(file, 'utf-8'),
  ) as ShowcaseTaskManifest
  // A manifest that already carries `uploadedAt` had its screenshot paths
  // rewritten by a previous upload. Running those R2 keys through `relative()`
  // again would corrupt them, so the rewrite is applied at most once.
  const alreadyRewritten = manifestData.uploadedAt !== undefined
  manifestData.uploadedAt = new Date().toISOString()
  if (!alreadyRewritten) {
    for (const step of manifestData.steps) {
      step.beforeScreenshot = toR2Key(prefix, outputDir, step.beforeScreenshot)
      step.afterScreenshot = toR2Key(prefix, outputDir, step.afterScreenshot)
      if (step.annotatedScreenshot) {
        step.annotatedScreenshot = toR2Key(
          prefix,
          outputDir,
          step.annotatedScreenshot,
        )
      }
    }
  }
  await writeFile(file, JSON.stringify(manifestData, null, 2))
}

/**
 * Uploads every file under `outputDir` to R2 beneath `showcase/<runId>/`.
 *
 * Order of operations:
 *  1. Upload every file that does not need stamping.
 *  2. Stamp `uploadedAt` on `index.json` and on each task manifest (also
 *     rewriting manifest screenshot paths to R2 keys). These local files are
 *     modified in place.
 *  3. Upload the `.json` files, so R2 receives the stamped versions.
 *
 * Each file is uploaded exactly once. Stamping is best-effort: a missing
 * index or an unreadable manifest is reported and skipped, never fatal.
 *
 * @param outputDir - Local directory containing the run's output.
 * @param runId - Run identifier; becomes part of the R2 key prefix.
 * @returns Base URL of the uploaded run.
 * @throws Error when R2 credentials are missing, or when reading a file or
 *   sending it to R2 fails.
 */
export async function uploadShowcase(
  outputDir: string,
  runId: string,
): Promise<string> {
  const r2 = loadR2Config()
  const client = new S3Client({
    region: 'auto',
    endpoint: `https://${r2.accountId}.r2.cloudflarestorage.com`,
    credentials: {
      accessKeyId: r2.accessKeyId,
      secretAccessKey: r2.secretAccessKey,
    },
  })
  const prefix = `showcase/${runId}`
  const files = await walkDir(outputDir)
  console.log(
    `Uploading ${files.length} files to R2 (${r2.bucket}/${prefix})...`,
  )

  // JSON files are held back until they have been stamped, so they are sent
  // once instead of once before and once after stamping.
  const jsonFiles = files.filter(
    (f) => f.endsWith('.json') && !f.includes('node_modules'),
  )
  const deferred = new Set(jsonFiles)
  for (const filePath of files) {
    if (deferred.has(filePath)) continue
    await putFile(
      client,
      r2.bucket,
      toR2Key(prefix, outputDir, filePath),
      filePath,
      contentTypeFor(filePath),
    )
  }

  // Stamp uploadedAt on index.json
  const indexPath = join(outputDir, 'index.json')
  try {
    await stampRunIndex(indexPath)
  } catch (err: unknown) {
    // The index may legitimately not exist if the run was partial; anything
    // else is worth surfacing but must not abort the upload.
    if (!isMissingFileError(err)) {
      const reason = err instanceof Error ? err.message : String(err)
      console.warn(`Could not stamp ${indexPath}: ${reason}`)
    }
  }

  // Stamp uploadedAt on each task manifest
  for (const file of files) {
    if (!file.endsWith('manifest.json') || file === indexPath) continue
    try {
      await stampTaskManifest(file, prefix, outputDir)
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err)
      console.warn(`Skipping malformed manifest ${file}: ${reason}`)
    }
  }

  // Upload the (now stamped) manifests + index
  for (const filePath of jsonFiles) {
    await putFile(
      client,
      r2.bucket,
      toR2Key(prefix, outputDir, filePath),
      filePath,
      'application/json',
    )
  }

  const baseUrl = `https://${r2.bucket}.${r2.accountId}.r2.cloudflarestorage.com/${prefix}`
  console.log(`Upload complete. Base: ${baseUrl}`)
  return baseUrl
}

View File

@@ -242,6 +242,12 @@ function printTaskProgress(
if (result.status === 'failed') {
console.log(` ERROR: ${result.error.message}`)
} else if (isSuccessfulResult(result)) {
// Log agent errors (e.g., LLM API failures) even if task "completed"
if (result.agentResult.metadata.errors?.length) {
for (const err of result.agentResult.metadata.errors) {
console.log(` ERROR [${err.source}]: ${err.message}`)
}
}
for (const [name, gr] of Object.entries(result.graderResults)) {
const icon = gr.pass ? 'PASS' : 'FAIL'
console.log(` ${name}: ${icon}`)

View File

@@ -701,6 +701,17 @@ export class Browser {
}
}
// --- Element Geometry ---
/**
 * Scrolls the given element into view on the given page, then returns the
 * element's center point as reported by the `elements` helper.
 */
async getElementCenter(
  page: number,
  element: number,
): Promise<{ x: number; y: number }> {
  const pageSession = await this.resolveSession(page)
  // Scroll first so the center is measured after the element is in view.
  await elements.scrollIntoView(pageSession, element)
  const center = await elements.getElementCenter(pageSession, element)
  return center
}
// --- Input ---
async click(