mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 23:53:25 +00:00
Compare commits
4 Commits
chore/giti
...
fix/eval-t
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
71cb158739 | ||
|
|
654a871841 | ||
|
|
ff167fb855 | ||
|
|
3d4e493512 |
11
packages/browseros-agent/apps/eval/.env.example
Normal file
11
packages/browseros-agent/apps/eval/.env.example
Normal file
@@ -0,0 +1,11 @@
|
||||
# Showcase generator — LLM config
|
||||
OPENROUTER_API_KEY=
|
||||
SHOWCASE_MODEL=openai/gpt-4o
|
||||
SHOWCASE_PROVIDER=openrouter
|
||||
SHOWCASE_BASE_URL=https://openrouter.ai/api/v1
|
||||
|
||||
# R2 upload (for --upload flag)
|
||||
R2_ACCOUNT_ID=
|
||||
R2_ACCESS_KEY_ID=
|
||||
R2_SECRET_ACCESS_KEY=
|
||||
R2_BUCKET=rl-env
|
||||
@@ -1,3 +1,5 @@
|
||||
data/raw/
|
||||
results/
|
||||
extensions/
|
||||
showcase-output/
|
||||
.env
|
||||
|
||||
20
packages/browseros-agent/apps/eval/data/prod-tasks.jsonl
Normal file
20
packages/browseros-agent/apps/eval/data/prod-tasks.jsonl
Normal file
@@ -0,0 +1,20 @@
|
||||
{"query_id":"prod-financial-advisor-morningstar","dataset":"prod","query":"Look up the Morningstar rating and expense ratio for Vanguard Total Stock Market Index Fund (VTSAX) on morningstar.com","start_url":"https://www.morningstar.com","metadata":{"original_task_id":"prod-financial-advisor-morningstar","category":"finance"}}
|
||||
{"query_id":"prod-lawyer-pacer-search","dataset":"prod","query":"Go to courtlistener.com and search for recent federal court opinions mentioning 'non-compete agreement' from 2025","start_url":"https://www.courtlistener.com","metadata":{"original_task_id":"prod-lawyer-pacer-search","category":"legal"}}
|
||||
{"query_id":"prod-doctor-drug-interaction","dataset":"prod","query":"Check drug interactions between metformin and lisinopril on drugs.com","start_url":"https://www.drugs.com/drug_interactions.html","metadata":{"original_task_id":"prod-doctor-drug-interaction","category":"healthcare"}}
|
||||
{"query_id":"prod-software-eng-github-trending","dataset":"prod","query":"Find the top trending Python repositories on GitHub this week and open the most starred one","start_url":"https://github.com/trending","metadata":{"original_task_id":"prod-software-eng-github-trending","category":"technology"}}
|
||||
{"query_id":"prod-cfo-sec-filing","dataset":"prod","query":"Look up Apple's most recent 10-K filing on SEC EDGAR and find their total revenue for fiscal year 2024","start_url":"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&company=apple&CIK=&type=10-K&dateb=&owner=include&count=10&search_text=&action=getcompany","metadata":{"original_task_id":"prod-cfo-sec-filing","category":"finance"}}
|
||||
{"query_id":"prod-realtor-zillow-comp","dataset":"prod","query":"Search for recently sold homes in Palo Alto, CA on Zillow and filter for 3+ bedrooms sold in the last 30 days","start_url":"https://www.zillow.com","metadata":{"original_task_id":"prod-realtor-zillow-comp","category":"real_estate"}}
|
||||
{"query_id":"prod-hr-manager-linkedin-job","dataset":"prod","query":"Post a job listing search on LinkedIn for a Senior Product Manager role in San Francisco with salary range $180k-$220k","start_url":"https://www.linkedin.com/jobs/","metadata":{"original_task_id":"prod-hr-manager-linkedin-job","category":"hr"}}
|
||||
{"query_id":"prod-analyst-fred-data","dataset":"prod","query":"Go to FRED and pull up the US Consumer Price Index chart, change the time range to the last 5 years","start_url":"https://fred.stlouisfed.org","metadata":{"original_task_id":"prod-analyst-fred-data","category":"finance"}}
|
||||
{"query_id":"prod-accountant-irs-form","dataset":"prod","query":"Find and download the latest IRS Form W-9 from irs.gov","start_url":"https://www.irs.gov","metadata":{"original_task_id":"prod-accountant-irs-form","category":"finance"}}
|
||||
{"query_id":"prod-sales-manager-crm-research","dataset":"prod","query":"Go to g2.com and compare the top 3 CRM software platforms by user rating and pricing","start_url":"https://www.g2.com/categories/crm","metadata":{"original_task_id":"prod-sales-manager-crm-research","category":"sales"}}
|
||||
{"query_id":"prod-engineer-stackoverflow","dataset":"prod","query":"Search Stack Overflow for how to implement retry logic with exponential backoff in Python and find the highest voted answer","start_url":"https://stackoverflow.com","metadata":{"original_task_id":"prod-engineer-stackoverflow","category":"technology"}}
|
||||
{"query_id":"prod-pm-producthunt","dataset":"prod","query":"Browse today's top launches on Product Hunt and upvote the highest ranked AI product","start_url":"https://www.producthunt.com","metadata":{"original_task_id":"prod-pm-producthunt","category":"technology"}}
|
||||
{"query_id":"prod-pharmacist-fda-recall","dataset":"prod","query":"Check the FDA website for any recent drug recalls in the last month","start_url":"https://www.fda.gov/safety/recalls-market-withdrawals-safety-alerts","metadata":{"original_task_id":"prod-pharmacist-fda-recall","category":"healthcare"}}
|
||||
{"query_id":"prod-investment-analyst-yahoo-finance","dataset":"prod","query":"Look up NVIDIA stock on Yahoo Finance, check the P/E ratio, and add it to a watchlist","start_url":"https://finance.yahoo.com","metadata":{"original_task_id":"prod-investment-analyst-yahoo-finance","category":"finance"}}
|
||||
{"query_id":"prod-compliance-officer-regulations","dataset":"prod","query":"Search for the latest GDPR enforcement actions on the European Data Protection Board website","start_url":"https://www.edpb.europa.eu/news/news_en","metadata":{"original_task_id":"prod-compliance-officer-regulations","category":"compliance"}}
|
||||
{"query_id":"prod-management-consultant-mckinsey","dataset":"prod","query":"Go to McKinsey's insights page and find their latest article about generative AI's impact on productivity","start_url":"https://www.mckinsey.com/featured-insights","metadata":{"original_task_id":"prod-management-consultant-mckinsey","category":"consulting"}}
|
||||
{"query_id":"prod-operations-manager-shipping","dataset":"prod","query":"Track a FedEx package with tracking number 123456789012 on fedex.com","start_url":"https://www.fedex.com/en-us/tracking.html","metadata":{"original_task_id":"prod-operations-manager-shipping","category":"operations"}}
|
||||
{"query_id":"prod-market-researcher-statista","dataset":"prod","query":"Search Statista for the global AI market size forecast and find the projected value for 2026","start_url":"https://www.statista.com","metadata":{"original_task_id":"prod-market-researcher-statista","category":"research"}}
|
||||
{"query_id":"prod-nurse-uptodate","dataset":"prod","query":"Search WebMD for the recommended dosing guidelines for adult acetaminophen and check the maximum daily dose","start_url":"https://www.webmd.com","metadata":{"original_task_id":"prod-nurse-uptodate","category":"healthcare"}}
|
||||
{"query_id":"prod-executive-flights","dataset":"prod","query":"Search Google Flights for a business class round trip from SFO to JFK departing next Monday returning Friday","start_url":"https://www.google.com/travel/flights","metadata":{"original_task_id":"prod-executive-flights","category":"travel"}}
|
||||
@@ -0,0 +1 @@
|
||||
{"query_id":"showcase-amazon-order","dataset":"showcase","query":"Open amazon.com and order Sensodyne toothpaste","start_url":"https://www.amazon.com","metadata":{"original_task_id":"showcase-amazon-order"}}
|
||||
@@ -0,0 +1,52 @@
|
||||
import type { Browser } from '@browseros/server/browser'
|
||||
|
||||
const CROSSHAIR_ID = '__browseros_showcase_crosshair__'

/** Escapes the XML-significant characters so arbitrary text can sit inside SVG markup. */
function escapeXml(text: string): string {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}

/**
 * Draws a full-viewport, click-through overlay on the page containing a red
 * crosshair at `coords` and a label with the tool name next to it.
 *
 * Any overlay left over from a previous call is removed first, so at most one
 * crosshair exists per page.
 *
 * @param browser - Browser handle whose `evaluate` runs the script in the page.
 * @param pageId - Page to draw on.
 * @param coords - Viewport coordinates of the target; rounded to whole pixels.
 * @param toolName - Tool being executed; underscores are shown as spaces.
 */
export async function injectCrosshair(
  browser: Browser,
  pageId: number,
  coords: { x: number; y: number },
  toolName: string,
): Promise<void> {
  const x = Math.round(coords.x)
  const y = Math.round(coords.y)
  const label = toolName.replace(/_/g, ' ')
  const labelWidth = Math.round(label.length * 9 + 24)
  const labelX = x + 32
  const labelY = y - 32

  // Namespaced filter id: a bare "glow" would resolve to the page's own element
  // if the page already has one with that id.
  const glowId = `${CROSSHAIR_ID}glow`
  const glowStroke = `stroke="#FF3B30" stroke-width="3" filter="url(#${glowId})"`

  const svg = [
    '<svg style="position:absolute;top:0;left:0;width:100%;height:100%" xmlns="http://www.w3.org/2000/svg">',
    `<defs><filter id="${glowId}"><feGaussianBlur stdDeviation="3" result="blur"/><feMerge><feMergeNode in="blur"/><feMergeNode in="SourceGraphic"/></feMerge></filter></defs>`,
    `<circle cx="${x}" cy="${y}" r="28" fill="rgba(255,59,48,0.12)" stroke="none"/>`,
    `<line x1="${x - 40}" y1="${y}" x2="${x - 12}" y2="${y}" ${glowStroke}/>`,
    `<line x1="${x + 12}" y1="${y}" x2="${x + 40}" y2="${y}" ${glowStroke}/>`,
    `<line x1="${x}" y1="${y - 40}" x2="${x}" y2="${y - 12}" ${glowStroke}/>`,
    `<line x1="${x}" y1="${y + 12}" x2="${x}" y2="${y + 40}" ${glowStroke}/>`,
    `<circle cx="${x}" cy="${y}" r="20" fill="none" ${glowStroke}/>`,
    `<circle cx="${x}" cy="${y}" r="4" fill="#FF3B30"/>`,
    `<rect x="${labelX}" y="${labelY}" rx="6" ry="6" width="${labelWidth}" height="28" fill="rgba(0,0,0,0.85)"/>`,
    // The label is the only free-form text, so it is XML-escaped.
    `<text x="${labelX + 12}" y="${labelY + 19}" font-family="system-ui,-apple-system,sans-serif" font-size="14" fill="white" font-weight="600">${escapeXml(label)}</text>`,
    '</svg>',
  ].join('')

  // JSON.stringify produces valid JS string literals, so neither the id nor the
  // markup can terminate the string early or inject code into the page script.
  await browser.evaluate(
    pageId,
    `(() => {
  const id = ${JSON.stringify(CROSSHAIR_ID)};
  const existing = document.getElementById(id);
  if (existing) existing.remove();

  const el = document.createElement('div');
  el.id = id;
  el.style.cssText = 'position:fixed;top:0;left:0;width:100vw;height:100vh;pointer-events:none;z-index:2147483647';
  el.innerHTML = ${JSON.stringify(svg)};
  (document.body || document.documentElement).appendChild(el);
})()`,
  )
}
|
||||
|
||||
/**
 * Deletes the crosshair overlay element from the page; a no-op when no
 * overlay is present.
 *
 * @param browser - Browser handle whose `evaluate` runs the script in the page.
 * @param pageId - Page to clean up.
 */
export async function removeCrosshair(
  browser: Browser,
  pageId: number,
): Promise<void> {
  const cleanupScript = `document.getElementById('${CROSSHAIR_ID}')?.remove()`
  await browser.evaluate(pageId, cleanupScript)
}
|
||||
288
packages/browseros-agent/apps/eval/scripts/showcase/executor.ts
Normal file
288
packages/browseros-agent/apps/eval/scripts/showcase/executor.ts
Normal file
@@ -0,0 +1,288 @@
|
||||
import { randomUUID } from 'node:crypto'
|
||||
import { mkdir, writeFile } from 'node:fs/promises'
|
||||
import { join } from 'node:path'
|
||||
import { AiSdkAgent } from '@browseros/server/agent/tool-loop'
|
||||
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
|
||||
import { Browser } from '@browseros/server/browser'
|
||||
import { CdpBackend } from '@browseros/server/browser/backends/cdp'
|
||||
import { registry } from '@browseros/server/tools/registry'
|
||||
import type { Task } from '../../src/types'
|
||||
import { injectCrosshair, removeCrosshair } from './crosshair'
|
||||
import { buildTaskManifest, saveTaskManifest } from './manifest'
|
||||
import type { ShowcaseStep, ShowcaseTaskManifest } from './types'
|
||||
|
||||
/** Tools that act on an element id; their target is resolved via `getElementCenter`. */
const ELEMENT_TOOLS = new Set<string>(
  'click fill hover clear select_option drag focus check uncheck'.split(' '),
)

/** Tools that carry explicit `x`/`y` viewport coordinates in their input. */
const COORDINATE_TOOLS = new Set<string>([
  'click_at',
  'hover_at',
  'type_at',
  'drag_at',
])

/** Inert stand-in passed as the second `Browser` constructor argument; never connects. */
const CONTROLLER_STUB = {
  async start() {},
  async stop() {},
  isConnected() {
    return false
  },
  async send() {
    return {}
  },
  // biome-ignore lint/suspicious/noExplicitAny: ControllerBackend type not exported
} as any
|
||||
|
||||
/**
 * Picks a page id that is safe to operate on.
 *
 * Returns `requestedId` when that page is still listed, or when no pages are
 * listed at all; otherwise falls back to the first listed page.
 */
async function resolvePageId(
  browser: Browser,
  requestedId: number,
): Promise<number> {
  const openPages = await browser.listPages()
  const requestedIsOpen = openPages.some(
    ({ pageId }) => pageId === requestedId,
  )
  if (requestedIsOpen || openPages.length === 0) return requestedId
  return openPages[0].pageId
}
|
||||
|
||||
/** Outcome of a single `executeShowcaseTask` run. */
export interface ExecuteTaskResult {
  /** Manifest describing the recorded steps; it is also written to disk. */
  manifest: ShowcaseTaskManifest
  /** `timeout` when the abort timer fired, `failed` on any other agent error. */
  status: 'completed' | 'timeout' | 'failed'
}
|
||||
|
||||
/** Captures the visible viewport of `pageId` as a PNG and writes it to `filePath`. */
async function saveViewportScreenshot(
  browser: Browser,
  pageId: number,
  filePath: string,
): Promise<void> {
  const shot = await browser.screenshot(pageId, {
    format: 'png',
    fullPage: false,
  })
  await writeFile(filePath, Buffer.from(shot.data, 'base64'))
}

/**
 * Runs the agent on one task and records every tool call as a showcase step.
 *
 * For each tool call it saves a "before" screenshot, an accessibility
 * snapshot, an optional crosshair-annotated screenshot (when the target
 * coordinates can be determined) and an "after" screenshot. All artefacts go
 * under `<outputDir>/<executionId>/`, followed by the task manifest.
 *
 * @param task - Task to run; `start_url` is opened first unless it is blank.
 * @param cdpPort - Port of the CDP endpoint of an already running browser.
 * @param outputDir - Root directory for all executions of this run.
 * @param agentConfig - Model/provider settings forwarded to the agent.
 * @param timeoutMs - Time budget for the agent loop; on expiry the loop is
 *   aborted and the result status is `timeout`.
 * @returns The manifest and final status. Agent errors are reported through
 *   `status`, not thrown; setup errors (connect, navigation, agent creation,
 *   manifest write) still reject.
 */
export async function executeShowcaseTask(
  task: Task,
  cdpPort: number,
  outputDir: string,
  agentConfig: {
    model: string
    provider: string
    apiKey?: string
    baseUrl?: string
  },
  timeoutMs: number,
): Promise<ExecuteTaskResult> {
  const executionId = randomUUID()
  const taskDir = join(outputDir, executionId)
  const screenshotDir = join(taskDir, 'screenshots')
  await mkdir(screenshotDir, { recursive: true })

  const cdp = new CdpBackend({ port: cdpPort })
  await cdp.connect()

  let agent: AiSdkAgent | null = null

  // Everything after a successful connect runs inside this try so the CDP
  // connection is released even when page discovery or navigation throws.
  try {
    const browser = new Browser(cdp, CONTROLLER_STUB)

    const pages = await browser.listPages()
    const activePage = pages[0]
    let activePageId = activePage?.pageId ?? 1

    // Navigate to start URL
    if (task.start_url && task.start_url !== 'about:blank') {
      await browser.goto(activePageId, task.start_url)
    }

    const conversationId = randomUUID()
    const resolvedConfig: ResolvedAgentConfig = {
      conversationId,
      // biome-ignore lint/suspicious/noExplicitAny: LLMProvider type validated at runtime
      provider: agentConfig.provider as any,
      model: agentConfig.model,
      apiKey: agentConfig.apiKey,
      baseUrl: agentConfig.baseUrl,
      workingDir: `/tmp/browseros-showcase-${conversationId}`,
      evalMode: true,
      supportsImages: true,
    }

    // NOTE(review): url/title come from the page listing taken before the
    // start-URL navigation above, so they may be stale — confirm this is intended.
    const browserContext = activePage
      ? {
          activeTab: {
            id: activePage.tabId,
            pageId: activePage.pageId,
            url: activePage.url,
            title: activePage.title,
          },
        }
      : undefined

    const steps: ShowcaseStep[] = []
    let stepNum = 0
    let finalText: string | null = null
    let status: 'completed' | 'timeout' | 'failed' = 'completed'
    const startTime = Date.now()

    agent = await AiSdkAgent.create({
      resolvedConfig,
      browser,
      registry,
      browserContext,
    })

    // Step under construction between a tool call's start and finish hooks.
    let pendingStep: Partial<ShowcaseStep> | null = null

    const abortController = new AbortController()
    const timeoutHandle = setTimeout(() => abortController.abort(), timeoutMs)

    try {
      const result = await agent.toolLoopAgent.generate({
        prompt: task.query,
        abortSignal: abortController.signal,

        experimental_onToolCallStart: async ({ toolCall }) => {
          try {
            const input = (toolCall.input ?? {}) as Record<string, unknown>
            // Follow the agent when it explicitly targets another page.
            if (typeof input.page === 'number') {
              activePageId = input.page
            }
            const pageId = await resolvePageId(browser, activePageId)
            activePageId = pageId

            const beforePath = join(screenshotDir, `${stepNum}_before.png`)
            await saveViewportScreenshot(browser, pageId, beforePath)

            let axTree = ''
            try {
              axTree = await browser.snapshot(pageId)
            } catch {
              // snapshot can fail on some pages
            }

            let coords: { x: number; y: number } | undefined
            const elementId = input.element as number | undefined
            if (
              elementId !== undefined &&
              ELEMENT_TOOLS.has(toolCall.toolName)
            ) {
              try {
                coords = await browser.getElementCenter(pageId, elementId)
              } catch {
                // element may have been removed
              }
            } else if (
              COORDINATE_TOOLS.has(toolCall.toolName) &&
              typeof input.x === 'number' &&
              typeof input.y === 'number'
            ) {
              coords = { x: input.x, y: input.y }
            }

            const step: Partial<ShowcaseStep> = {
              stepIndex: stepNum,
              toolName: toolCall.toolName,
              toolInput: input,
              beforeScreenshot: beforePath,
              accessibilitySnapshot: axTree,
              elementCoordinates: coords,
              timestamp: new Date().toISOString(),
            }
            pendingStep = step

            if (coords) {
              try {
                try {
                  await injectCrosshair(
                    browser,
                    pageId,
                    coords,
                    toolCall.toolName,
                  )
                  const annotatedPath = join(
                    screenshotDir,
                    `${stepNum}_annotated.png`,
                  )
                  await saveViewportScreenshot(browser, pageId, annotatedPath)
                  step.annotatedScreenshot = annotatedPath
                } finally {
                  // Always strip the overlay, even when the screenshot failed;
                  // otherwise it would show up in the "after" capture.
                  await removeCrosshair(browser, pageId)
                }
              } catch {
                // annotation is best-effort
              }
            }
          } catch (err) {
            console.warn(
              ` Step ${stepNum} before-capture failed: ${err instanceof Error ? err.message : String(err)}`,
            )
          }
        },

        experimental_onToolCallFinish: async ({ toolResult }) => {
          try {
            const pageId = await resolvePageId(browser, activePageId)
            activePageId = pageId
            const afterPath = join(screenshotDir, `${stepNum}_after.png`)
            await saveViewportScreenshot(browser, pageId, afterPath)

            // A step is only recorded when its before-capture succeeded.
            if (pendingStep) {
              pendingStep.afterScreenshot = afterPath
              pendingStep.toolOutput = toolResult
              steps.push(pendingStep as ShowcaseStep)
              stepNum++
            }
          } catch (err) {
            console.warn(
              ` Step ${stepNum} after-capture failed: ${err instanceof Error ? err.message : String(err)}`,
            )
          }
          pendingStep = null
        },

        onStepFinish: async ({ text }) => {
          // Attach the assistant's text to the most recently recorded tool step.
          if (text && steps.length > 0) {
            const lastStep = steps[steps.length - 1]
            lastStep.assistantText = text
          }
        },
      })

      finalText = result.text || null
    } catch (err) {
      if (abortController.signal.aborted) {
        status = 'timeout'
        console.log(` ${task.query_id}: timed out after ${timeoutMs / 1000}s`)
      } else {
        status = 'failed'
        console.error(
          ` ${task.query_id}: failed — ${err instanceof Error ? err.message : String(err)}`,
        )
      }
    } finally {
      clearTimeout(timeoutHandle)
    }

    const totalDurationMs = Date.now() - startTime

    const manifest = buildTaskManifest({
      executionId,
      taskId: task.query_id,
      query: task.query,
      startUrl: task.start_url ?? 'about:blank',
      dataset: task.dataset,
      steps,
      finalAnswer: finalText,
      model: agentConfig.model,
      provider: agentConfig.provider,
      totalDurationMs,
    })

    await saveTaskManifest(outputDir, executionId, manifest)

    return { manifest, status }
  } finally {
    if (agent) await agent.dispose().catch(() => {})
    await cdp.disconnect().catch(() => {})
  }
}
|
||||
235
packages/browseros-agent/apps/eval/scripts/showcase/generate.ts
Normal file
235
packages/browseros-agent/apps/eval/scripts/showcase/generate.ts
Normal file
@@ -0,0 +1,235 @@
|
||||
#!/usr/bin/env bun
|
||||
|
||||
import { mkdir } from 'node:fs/promises'
|
||||
import { parseArgs } from 'node:util'
|
||||
import { BrowserOSAppManager } from '../../src/runner/browseros-app-manager'
|
||||
import { loadTasks } from '../../src/runner/task-loader'
|
||||
import type { Task } from '../../src/types'
|
||||
import { executeShowcaseTask } from './executor'
|
||||
import { saveRunIndex } from './manifest'
|
||||
import type { ShowcaseRunIndex } from './types'
|
||||
import { uploadShowcase } from './uploader'
|
||||
|
||||
// Base ports handed to each BrowserOSAppManager together with its worker index.
const BASE_PORTS = { cdp: 9010, server: 9110, extension: 9310 }

const { values } = parseArgs({
  args: Bun.argv.slice(2),
  options: {
    tasks: { type: 'string', short: 't' },
    output: { type: 'string', short: 'o', default: './showcase-output' },
    model: { type: 'string', short: 'm' },
    provider: { type: 'string', short: 'p' },
    'base-url': { type: 'string' },
    workers: { type: 'string', short: 'w', default: '1' },
    'cdp-port': { type: 'string' },
    timeout: { type: 'string', default: '300000' },
    upload: { type: 'boolean', default: false },
    help: { type: 'boolean', short: 'h' },
  },
})

if (values.help || !values.tasks) {
  console.log(`
Showcase Dataset Generator

Runs the BrowserOS agent on tasks and captures before/after screenshots
with crosshair annotations for element-targeting tool calls.

Usage:
bun scripts/showcase/generate.ts --tasks <path> [options]

Options:
-t, --tasks <path> JSONL task file (required)
-o, --output <dir> Output directory (default: ./showcase-output)
-m, --model <model> LLM model (env: SHOWCASE_MODEL, default: openai/gpt-4o)
-p, --provider <name> LLM provider (env: SHOWCASE_PROVIDER, default: openrouter)
--base-url <url> LLM base URL (env: SHOWCASE_BASE_URL)
-w, --workers <n> Parallel workers (default: 1)
--cdp-port <port> Connect to existing Chrome (single-worker only)
--timeout <ms> Per-task timeout in ms (default: 300000)
--upload Upload results to R2 after generation
-h, --help Show this help
`)
  process.exit(values.help ? 0 : 1)
}

/**
 * Converts a numeric CLI option, exiting with an error when it is not valid.
 *
 * Without this check a typo such as `--workers two` becomes NaN and the run
 * silently starts zero workers, and `--timeout abc` makes every task time out
 * immediately.
 */
function parseNumericOption(
  flag: string,
  raw: string,
  isValid: (n: number) => boolean,
  expected: string,
): number {
  const parsed = Number(raw)
  if (!Number.isFinite(parsed) || !isValid(parsed)) {
    console.error(`Invalid value for ${flag}: "${raw}" (expected ${expected})`)
    process.exit(1)
  }
  return parsed
}

const config = {
  tasks: values.tasks as string,
  output: (values.output ?? './showcase-output') as string,
  // `||` on the env fallbacks: an empty `KEY=` line in .env means "unset".
  model: (values.model ??
    (process.env.SHOWCASE_MODEL || 'openai/gpt-4o')) as string,
  provider: (values.provider ??
    (process.env.SHOWCASE_PROVIDER || 'openrouter')) as string,
  baseUrl: (values['base-url'] ??
    (process.env.SHOWCASE_BASE_URL || undefined)) as string | undefined,
  // Values below 1 are still clamped to a single worker, as before.
  workers: Math.max(
    1,
    Math.floor(
      parseNumericOption('--workers', values.workers ?? '1', () => true, 'a number'),
    ),
  ),
  cdpPort: values['cdp-port']
    ? parseNumericOption(
        '--cdp-port',
        values['cdp-port'],
        (n) => Number.isInteger(n) && n >= 1 && n <= 65535,
        'a port between 1 and 65535',
      )
    : undefined,
  timeout: parseNumericOption(
    '--timeout',
    values.timeout ?? '300000',
    (n) => n > 0,
    'a positive number of milliseconds',
  ),
  upload: values.upload ?? false,
}

if (config.cdpPort && config.workers > 1) {
  console.error('--cdp-port only works with a single worker (--workers 1)')
  process.exit(1)
}

// `||` instead of `??`: the shipped .env.example contains `OPENROUTER_API_KEY=`,
// and an empty value must not block the OPENAI_API_KEY fallback.
const apiKey = process.env.OPENROUTER_API_KEY || process.env.OPENAI_API_KEY
if (!apiKey) {
  console.error(
    'Missing API key: set OPENROUTER_API_KEY or OPENAI_API_KEY environment variable',
  )
  process.exit(1)
}

const { tasks } = await loadTasks({ type: 'file', path: config.tasks })
console.log(`Loaded ${tasks.length} task(s), ${config.workers} worker(s)`)

await mkdir(config.output, { recursive: true })

// Run id: UTC date plus a short random suffix.
const runId = `${new Date().toISOString().slice(0, 10)}-${crypto.randomUUID().slice(0, 8)}`
const runIndex: ShowcaseRunIndex = {
  runId,
  createdAt: new Date().toISOString(),
  agentConfig: { model: config.model, provider: config.provider },
  tasks: [],
}

console.log(`\nRun ID: ${runId}`)
console.log(`Output: ${config.output}\n`)
|
||||
|
||||
// --- Task Queue ---
|
||||
|
||||
/**
 * Hands out tasks one at a time to the workers, in file order.
 * Once stopped it yields nothing further, even if tasks remain.
 */
class TaskQueue {
  private cursor = 0
  private halted = false

  constructor(private tasks: Task[]) {}

  /** Returns the next unclaimed task, or `null` when exhausted or stopped. */
  next(): Task | null {
    const exhausted = this.cursor >= this.tasks.length
    if (this.halted || exhausted) return null
    const claimed = this.tasks[this.cursor]
    this.cursor += 1
    return claimed
  }

  /** Prevents any further tasks from being handed out. */
  stop(): void {
    this.halted = true
  }
}
|
||||
|
||||
// Shared state for all workers.
const queue = new TaskQueue(tasks)
let completedCount = 0 // incremented when a task is picked up, used for progress output
const appManagers: BrowserOSAppManager[] = []

// --- Signal handling ---

/** On SIGINT/SIGTERM: stop handing out tasks, kill every launched app, then exit. */
async function onSignal(): Promise<void> {
  console.log('\nShutting down workers...')
  queue.stop()
  const kills = appManagers.map((manager) => manager.killApp())
  await Promise.allSettled(kills)
  process.exit(0)
}

for (const signal of ['SIGINT', 'SIGTERM'] as const) {
  process.on(signal, onSignal)
}
|
||||
|
||||
// --- Worker ---
|
||||
|
||||
/**
 * Worker loop: pulls tasks from the shared queue until it is empty or
 * stopped, runs each one and appends the outcome to `runIndex`.
 *
 * Unless `--cdp-port` was given, the worker launches its own BrowserOS
 * instance and restarts it between tasks so every task starts clean.
 *
 * @param workerIndex - Zero-based worker number; also offsets the CDP port.
 */
async function runWorker(workerIndex: number): Promise<void> {
  let appManager: BrowserOSAppManager | null = null
  const cdpPort = config.cdpPort ?? BASE_PORTS.cdp + workerIndex

  const agentConfig = {
    model: config.model,
    provider: config.provider,
    apiKey,
    baseUrl: config.baseUrl,
  }
  const tag = config.workers > 1 ? `[W${workerIndex}] ` : ''

  try {
    // True right after a (re)start, so the first task does not pay for a second,
    // redundant restart. Assumes restart() yields a clean browser — TODO confirm.
    let browserIsFresh = false

    if (!config.cdpPort) {
      appManager = new BrowserOSAppManager(workerIndex, BASE_PORTS)
      appManagers.push(appManager)
      console.log(` [W${workerIndex}] Starting BrowserOS...`)
      // Inside the try so a half-started app is still killed in `finally`.
      await appManager.restart()
      browserIsFresh = true
    }

    while (true) {
      const task = queue.next()
      if (!task) break

      completedCount++
      console.log(
        `${tag}[${completedCount}/${tasks.length}] ${task.query_id}: ${task.query}`,
      )

      try {
        // Restart browser between tasks for clean state. Kept inside the
        // per-task try: a failed restart marks this task as failed instead of
        // crashing the worker and losing the whole run index.
        if (appManager && !browserIsFresh) {
          await appManager.restart()
        }
        browserIsFresh = false

        const { manifest, status } = await executeShowcaseTask(
          task,
          cdpPort,
          config.output,
          agentConfig,
          config.timeout,
        )

        runIndex.tasks.push({
          executionId: manifest.executionId,
          taskId: task.query_id,
          query: task.query,
          stepCount: manifest.steps.length,
          status,
          manifestPath: `${manifest.executionId}/manifest.json`,
        })

        const duration = (manifest.totalDurationMs / 1000).toFixed(1)
        console.log(
          `${tag} ${status.toUpperCase()} — ${manifest.steps.length} steps, ${duration}s\n`,
        )
      } catch (err) {
        console.error(
          `${tag} FAILED — ${err instanceof Error ? err.message : String(err)}\n`,
        )
        // No manifest exists for this task, hence the placeholder id and path.
        runIndex.tasks.push({
          executionId: 'unknown',
          taskId: task.query_id,
          query: task.query,
          stepCount: 0,
          status: 'failed',
          manifestPath: '',
        })
      }
    }
  } finally {
    if (appManager) await appManager.killApp()
  }
}
|
||||
|
||||
// --- Run ---
|
||||
|
||||
try {
  // allSettled instead of all: a crashed worker must not prevent the results
  // recorded by the other workers from being written to the run index.
  const outcomes = await Promise.allSettled(
    Array.from({ length: config.workers }, (_, i) => runWorker(i)),
  )

  await saveRunIndex(config.output, runIndex)
  console.log(`\nResults saved to: ${config.output}`)

  // Tally statuses in a single pass.
  const tally = { completed: 0, failed: 0, timeout: 0 }
  for (const entry of runIndex.tasks) tally[entry.status]++
  console.log(
    `Tasks: ${tally.completed} completed, ` +
      `${tally.failed} failed, ` +
      `${tally.timeout} timed out`,
  )

  // Surface a worker crash only after the index is safely on disk. As before,
  // a crashed run is not uploaded.
  const crashed = outcomes.find(
    (outcome): outcome is PromiseRejectedResult =>
      outcome.status === 'rejected',
  )
  if (crashed) throw crashed.reason

  if (config.upload) {
    console.log('\nUploading to R2...')
    const baseUrl = await uploadShowcase(config.output, runId)
    console.log(`Uploaded to: ${baseUrl}`)
  }
} finally {
  process.off('SIGINT', onSignal)
  process.off('SIGTERM', onSignal)
}
|
||||
@@ -0,0 +1,52 @@
|
||||
import { writeFile } from 'node:fs/promises'
|
||||
import { join } from 'node:path'
|
||||
import type {
|
||||
ShowcaseRunIndex,
|
||||
ShowcaseStep,
|
||||
ShowcaseTaskManifest,
|
||||
} from './types'
|
||||
|
||||
/**
 * Assembles the manifest object for one task execution.
 *
 * `model` and `provider` are grouped under `agentConfig`; `createdAt` is
 * stamped with the current time in ISO-8601 form. All other fields are
 * passed through unchanged.
 */
export function buildTaskManifest(opts: {
  executionId: string
  taskId: string
  query: string
  startUrl: string
  dataset: string
  steps: ShowcaseStep[]
  finalAnswer: string | null
  model: string
  provider: string
  totalDurationMs: number
}): ShowcaseTaskManifest {
  const {
    executionId,
    taskId,
    query,
    startUrl,
    dataset,
    steps,
    finalAnswer,
    model,
    provider,
    totalDurationMs,
  } = opts
  const createdAt = new Date().toISOString()

  // Key order is kept stable because it determines the serialized JSON layout.
  return {
    executionId,
    taskId,
    query,
    startUrl,
    dataset,
    steps,
    finalAnswer,
    agentConfig: { model, provider },
    totalDurationMs,
    createdAt,
  }
}
|
||||
|
||||
/**
 * Writes a task manifest as pretty-printed JSON to
 * `<outputDir>/<executionId>/manifest.json` and returns that path.
 * The execution directory must already exist.
 */
export async function saveTaskManifest(
  outputDir: string,
  executionId: string,
  manifest: ShowcaseTaskManifest,
): Promise<string> {
  const target = join(outputDir, executionId, 'manifest.json')
  const serialized = JSON.stringify(manifest, null, 2)
  await writeFile(target, serialized)
  return target
}
|
||||
|
||||
/**
 * Writes the run index as pretty-printed JSON to `<outputDir>/index.json`
 * and returns that path.
 */
export async function saveRunIndex(
  outputDir: string,
  index: ShowcaseRunIndex,
): Promise<string> {
  const target = join(outputDir, 'index.json')
  const serialized = JSON.stringify(index, null, 2)
  await writeFile(target, serialized)
  return target
}
|
||||
53
packages/browseros-agent/apps/eval/scripts/showcase/types.ts
Normal file
53
packages/browseros-agent/apps/eval/scripts/showcase/types.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
/** Settings for a showcase generation run. */
export interface ShowcaseConfig {
  /** Path to the JSONL task file. */
  tasks: string
  /** Directory that receives screenshots, manifests and the run index. */
  output: string
  /** Whether results are uploaded after generation. */
  upload: boolean
  /** LLM model identifier. */
  model: string
  /** LLM provider name. */
  provider: string
  /** Presumably the name of the env var holding the API key — TODO confirm. */
  apiKeyEnv: string
  /** CDP port of an already running browser, when one is reused. */
  cdpPort?: number
  /** Per-task timeout in milliseconds. */
  timeout: number
}
|
||||
|
||||
/** One recorded tool call, with the artefacts captured around it. */
export interface ShowcaseStep {
  /** Zero-based position of the step within its task. */
  stepIndex: number
  /** Name of the tool the agent invoked. */
  toolName: string
  /** Arguments the agent passed to the tool. */
  toolInput: Record<string, unknown>
  /** Whatever the tool returned. */
  toolOutput: unknown
  /** Viewport point the tool targeted, when it could be determined. */
  elementCoordinates?: {
    x: number
    y: number
  }
  /** Path of the screenshot taken before the tool ran. */
  beforeScreenshot: string
  /** Path of the screenshot taken after the tool ran. */
  afterScreenshot: string
  /** Path of the "before" screenshot with the crosshair overlay, if produced. */
  annotatedScreenshot?: string
  /** Accessibility snapshot taken before the tool ran; empty if it failed. */
  accessibilitySnapshot: string
  /** Assistant text from the agent step that this tool call belongs to. */
  assistantText?: string
  /** ISO-8601 time at which the step was captured. */
  timestamp: string
}
|
||||
|
||||
/** Full record of one task execution, stored as `manifest.json`. */
export interface ShowcaseTaskManifest {
  /** Unique id of this execution; also its directory name. */
  executionId: string
  /** Id of the task that was run. */
  taskId: string
  /** Natural-language instruction given to the agent. */
  query: string
  /** URL the browser was pointed at before the agent started. */
  startUrl: string
  /** Dataset the task belongs to. */
  dataset: string
  /** Recorded tool calls, in execution order. */
  steps: ShowcaseStep[]
  /** Final text produced by the agent, or `null` when there was none. */
  finalAnswer: string | null
  /** Model and provider used for the run. */
  agentConfig: {
    model: string
    provider: string
  }
  /** Wall-clock duration of the agent run in milliseconds. */
  totalDurationMs: number
  /** ISO-8601 time at which the manifest was built. */
  createdAt: string
  /** Presumably set once the manifest has been uploaded — TODO confirm. */
  uploadedAt?: string
}
|
||||
|
||||
/** Summary of a whole generation run, stored as `index.json`. */
export interface ShowcaseRunIndex {
  /** Identifier of the run. */
  runId: string
  /** ISO-8601 time at which the run started. */
  createdAt: string
  /** Presumably set once the run has been uploaded — TODO confirm. */
  uploadedAt?: string
  /** Model and provider used for every task in the run. */
  agentConfig: {
    model: string
    provider: string
  }
  /** One entry per attempted task, in completion order. */
  tasks: {
    /** Execution id, or `'unknown'` when the task failed before producing one. */
    executionId: string
    /** Id of the task that was run. */
    taskId: string
    /** Natural-language instruction given to the agent. */
    query: string
    /** Number of recorded steps. */
    stepCount: number
    /** Final outcome of the task. */
    status: 'completed' | 'timeout' | 'failed'
    /** Manifest path relative to the output directory; empty when none exists. */
    manifestPath: string
  }[]
}
|
||||
155
packages/browseros-agent/apps/eval/scripts/showcase/uploader.ts
Normal file
155
packages/browseros-agent/apps/eval/scripts/showcase/uploader.ts
Normal file
@@ -0,0 +1,155 @@
|
||||
import { readdir, readFile, writeFile } from 'node:fs/promises'
|
||||
import { extname, join, relative } from 'node:path'
|
||||
import { PutObjectCommand, S3Client } from '@aws-sdk/client-s3'
|
||||
import type { ShowcaseRunIndex, ShowcaseTaskManifest } from './types'
|
||||
|
||||
/** Connection settings for the R2 bucket, read from the environment. */
interface R2Config {
  /** Account id; forms the host name of the storage endpoint. */
  accountId: string
  /** Access key id used to sign requests. */
  accessKeyId: string
  /** Secret paired with `accessKeyId`. */
  secretAccessKey: string
  /** Name of the destination bucket. */
  bucket: string
}
|
||||
|
||||
/**
 * Maps a file path to the Content-Type sent with its upload, based on the
 * case-insensitive extension. Unknown extensions fall back to
 * `application/octet-stream`.
 */
function contentTypeFor(filePath: string): string {
  switch (extname(filePath).toLowerCase()) {
    case '.png':
      return 'image/png'
    case '.json':
      return 'application/json'
    case '.jsonl':
      return 'application/jsonl'
    default:
      return 'application/octet-stream'
  }
}
|
||||
|
||||
/**
 * Reads the R2 settings from the environment.
 *
 * `R2_BUCKET` is optional and defaults to `rl-env`. An empty value counts as
 * unset, the same way empty credentials count as missing, so a blank
 * `R2_BUCKET=` line in `.env` does not produce an empty bucket name.
 *
 * @throws Error when any of the three credential variables is missing or empty.
 */
function loadR2Config(): R2Config {
  const accountId = process.env.R2_ACCOUNT_ID
  const accessKeyId = process.env.R2_ACCESS_KEY_ID
  const secretAccessKey = process.env.R2_SECRET_ACCESS_KEY
  const bucket = process.env.R2_BUCKET || 'rl-env'

  if (!accountId || !accessKeyId || !secretAccessKey) {
    throw new Error(
      'Missing R2 credentials. Set R2_ACCOUNT_ID, R2_ACCESS_KEY_ID, R2_SECRET_ACCESS_KEY',
    )
  }

  return { accountId, accessKeyId, secretAccessKey, bucket }
}
|
||||
|
||||
/**
 * Builds the object key for a local file: `prefix`, a slash, then the file's
 * path relative to `outputDir` with every backslash turned into a forward slash.
 *
 * @param prefix - Key prefix placed in front of the relative path.
 * @param outputDir - Directory the relative path is computed from.
 * @param filePath - Local path of the file.
 * @returns The resulting object key.
 */
function toR2Key(prefix: string, outputDir: string, filePath: string): string {
  const relativePath = relative(outputDir, filePath)
  const slashNormalized = relativePath.split('\\').join('/')
  return `${prefix}/${slashNormalized}`
}
|
||||
|
||||
/**
 * Recursively lists every non-directory entry below `dir`.
 *
 * Entries are visited in the order `readdir` returns them; a sub-directory's
 * contents are spliced in at the position of that sub-directory. Directories
 * themselves are not included in the result.
 *
 * @param dir - Directory to walk.
 * @returns Paths formed by joining `dir` with each entry name along the way.
 */
async function walkDir(dir: string): Promise<string[]> {
  const collected: string[] = []
  for (const dirent of await readdir(dir, { withFileTypes: true })) {
    const entryPath = join(dir, dirent.name)
    if (!dirent.isDirectory()) {
      collected.push(entryPath)
      continue
    }
    const nested = await walkDir(entryPath)
    collected.push(...nested)
  }
  return collected
}
|
||||
|
||||
/**
 * Uploads every file under `outputDir` to the configured R2 bucket beneath
 * `showcase/<runId>/`, then stamps `uploadedAt` on the local `index.json` and
 * on each `*manifest.json`, rewrites manifest screenshot paths to R2 keys, and
 * re-uploads the files it modified so the remote copies match.
 *
 * Local JSON files are modified in place. Rewriting is idempotent: a path that
 * already starts with this run's key prefix is left untouched, so uploading
 * the same directory twice does not corrupt the manifests.
 *
 * @param outputDir - Local directory holding the run output.
 * @param runId - Run identifier; becomes part of the key prefix.
 * @returns Base URL string built from bucket, account id and key prefix.
 * @throws If R2 credentials are missing, or walking, reading or uploading a
 *   file fails. Problems while stamping an individual JSON file are logged as
 *   warnings and do not abort the upload.
 */
export async function uploadShowcase(
  outputDir: string,
  runId: string,
): Promise<string> {
  const r2 = loadR2Config()
  const client = new S3Client({
    region: 'auto',
    endpoint: `https://${r2.accountId}.r2.cloudflarestorage.com`,
    credentials: {
      accessKeyId: r2.accessKeyId,
      secretAccessKey: r2.secretAccessKey,
    },
  })

  const prefix = `showcase/${runId}`
  const keyPrefix = `${prefix}/`
  const files = await walkDir(outputDir)
  console.log(
    `Uploading ${files.length} files to R2 (${r2.bucket}/${prefix})...`,
  )

  // Single upload routine shared by the initial pass and the re-upload pass.
  const putFile = async (filePath: string): Promise<void> => {
    await client.send(
      new PutObjectCommand({
        Bucket: r2.bucket,
        Key: toR2Key(prefix, outputDir, filePath),
        Body: await readFile(filePath),
        ContentType: contentTypeFor(filePath),
      }),
    )
  }

  // Leaves values that are already keys for this run untouched (idempotence).
  const toKey = (localPath: string): string =>
    localPath.startsWith(keyPrefix)
      ? localPath
      : toR2Key(prefix, outputDir, localPath)

  const isMissingFile = (err: unknown): boolean =>
    err instanceof Error && 'code' in err && err.code === 'ENOENT'

  const describe = (err: unknown): string =>
    err instanceof Error ? err.message : String(err)

  for (const filePath of files) {
    await putFile(filePath)
  }

  const baseUrl = `https://${r2.bucket}.${r2.accountId}.r2.cloudflarestorage.com/${prefix}`
  console.log(`Upload complete. Base: ${baseUrl}`)

  // One timestamp for the whole upload so index and manifests agree.
  const uploadedAt = new Date().toISOString()
  // Files rewritten locally; only these need to be pushed again.
  const modified: string[] = []

  // Stamp uploadedAt on index.json
  const indexPath = join(outputDir, 'index.json')
  try {
    const indexData = JSON.parse(
      await readFile(indexPath, 'utf-8'),
    ) as ShowcaseRunIndex
    indexData.uploadedAt = uploadedAt
    await writeFile(indexPath, JSON.stringify(indexData, null, 2))
    modified.push(indexPath)
  } catch (err: unknown) {
    // A missing index is expected when the run was partial; anything else
    // (bad JSON, permissions, write failure) is worth surfacing.
    if (!isMissingFile(err)) {
      console.warn(`Could not stamp ${indexPath}: ${describe(err)}`)
    }
  }

  // Stamp uploadedAt on each task manifest
  for (const file of files) {
    if (!file.endsWith('manifest.json') || file === indexPath) continue
    try {
      const manifestData = JSON.parse(
        await readFile(file, 'utf-8'),
      ) as ShowcaseTaskManifest
      manifestData.uploadedAt = uploadedAt
      // Rewrite screenshot paths to R2 keys
      for (const step of manifestData.steps) {
        step.beforeScreenshot = toKey(step.beforeScreenshot)
        step.afterScreenshot = toKey(step.afterScreenshot)
        if (step.annotatedScreenshot) {
          step.annotatedScreenshot = toKey(step.annotatedScreenshot)
        }
      }
      await writeFile(file, JSON.stringify(manifestData, null, 2))
      modified.push(file)
    } catch (err: unknown) {
      // Malformed manifests are skipped, but not silently.
      console.warn(`Skipping manifest ${file}: ${describe(err)}`)
    }
  }

  // Re-upload rewritten manifests + index
  for (const filePath of modified) {
    await putFile(filePath)
  }

  return baseUrl
}
|
||||
@@ -701,6 +701,17 @@ export class Browser {
|
||||
}
|
||||
}
|
||||
|
||||
// --- Element Geometry ---
|
||||
|
||||
/**
 * Returns the center point of an element on a page.
 *
 * Resolves the session for `page`, calls `elements.scrollIntoView` for the
 * element, and then returns the result of `elements.getElementCenter`.
 *
 * @param page - Numeric page reference passed to `resolveSession`.
 * @param element - Numeric element reference passed to the `elements` helpers.
 * @returns The `{ x, y }` point reported by `elements.getElementCenter`
 *   (coordinate space not visible here — confirm against that helper).
 */
async getElementCenter(
  page: number,
  element: number,
): Promise<{ x: number; y: number }> {
  const pageSession = await this.resolveSession(page)
  // Scroll first, then measure.
  await elements.scrollIntoView(pageSession, element)
  return elements.getElementCenter(pageSession, element)
}
|
||||
|
||||
// --- Input ---
|
||||
|
||||
async click(
|
||||
|
||||
Reference in New Issue
Block a user