mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
- Add hover_at, type_at, drag_at coordinate tools to server - Add hoverAt, typeAt, dragAt methods to Browser class - Export server internals (browser, tool-loop, registry) for eval imports - Copy eval app from enterprise repo with agents, graders, runner, dashboard - Nest eval-targets inside apps/eval - Adapt sessionExecutionDir → workingDir for current server API - Add biome ignore for dashboard HTML to prevent lint breaking onclick handlers
241 lines
6.9 KiB
TypeScript
241 lines
6.9 KiB
TypeScript
/**
|
||
* Long-running stress test to simulate eval behavior
|
||
* Run with: bun apps/eval/scripts/debug-long-run.ts
|
||
*/
|
||
|
||
import { Client } from '@modelcontextprotocol/sdk/client/index.js'
|
||
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js'
|
||
|
||
const SERVER_URL = 'http://127.0.0.1:9110'
|
||
const MCP_URL = `${SERVER_URL}/mcp`
|
||
|
||
// Simulate 60 turns like the failing task had
|
||
const NUM_TURNS = 60
|
||
const SCREENSHOT_EVERY_N_TURNS = 1
|
||
|
||
async function checkExtension(): Promise<boolean> {
|
||
try {
|
||
const res = await fetch(`${SERVER_URL}/extension-status`, {
|
||
signal: AbortSignal.timeout(5000),
|
||
})
|
||
if (!res.ok) return false
|
||
const data = (await res.json()) as { extensionConnected?: boolean }
|
||
return data.extensionConnected === true
|
||
} catch {
|
||
return false
|
||
}
|
||
}
|
||
|
||
async function callMcpTool(
|
||
name: string,
|
||
args: Record<string, unknown> = {},
|
||
timeoutMs: number = 65000,
|
||
): Promise<{ success: boolean; error?: string; duration: number }> {
|
||
const start = Date.now()
|
||
const client = new Client({ name: 'long-run-test', version: '1.0.0' })
|
||
const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))
|
||
|
||
try {
|
||
await client.connect(transport)
|
||
|
||
const toolPromise = client.callTool({ name, arguments: args })
|
||
const timeoutPromise = new Promise<never>((_, reject) =>
|
||
setTimeout(
|
||
() => reject(new Error(`Timeout after ${timeoutMs}ms`)),
|
||
timeoutMs,
|
||
),
|
||
)
|
||
|
||
const result = await Promise.race([toolPromise, timeoutPromise])
|
||
const duration = Date.now() - start
|
||
|
||
if ((result as any).isError) {
|
||
const errorText =
|
||
(result as any).content?.find((c: any) => c.type === 'text')?.text ||
|
||
'Unknown error'
|
||
return { success: false, error: errorText, duration }
|
||
}
|
||
|
||
return { success: true, duration }
|
||
} catch (error) {
|
||
return {
|
||
success: false,
|
||
error: error instanceof Error ? error.message : String(error),
|
||
duration: Date.now() - start,
|
||
}
|
||
} finally {
|
||
try {
|
||
await transport.close()
|
||
} catch {}
|
||
}
|
||
}
|
||
|
||
async function main() {
|
||
console.log('='.repeat(60))
|
||
console.log('Long-Running Stress Test (simulating eval)')
|
||
console.log('='.repeat(60))
|
||
console.log(
|
||
`Simulating ${NUM_TURNS} turns with screenshots every ${SCREENSHOT_EVERY_N_TURNS} turn(s)`,
|
||
)
|
||
console.log()
|
||
|
||
// Create window
|
||
console.log('Creating window...')
|
||
|
||
let windowId = 0
|
||
let tabId = 0
|
||
|
||
const client = new Client({ name: 'long-run-test', version: '1.0.0' })
|
||
const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))
|
||
|
||
try {
|
||
await client.connect(transport)
|
||
const result = await client.callTool({
|
||
name: 'browser_create_window',
|
||
arguments: { url: 'https://example.com', focused: false },
|
||
})
|
||
|
||
// Try structured content first
|
||
windowId = (result as any).structuredContent?.windowId
|
||
tabId = (result as any).structuredContent?.tabId
|
||
|
||
// Fall back to parsing text
|
||
if (!windowId || !tabId) {
|
||
const text =
|
||
(result as any).content?.find((c: any) => c.type === 'text')?.text || ''
|
||
const windowMatch = text.match(/window\s+(\d+)/i)
|
||
const tabMatch =
|
||
text.match(/Tab ID:\s*(\d+)/i) || text.match(/tab\s+(\d+)/i)
|
||
if (windowMatch) windowId = parseInt(windowMatch[1], 10)
|
||
if (tabMatch) tabId = parseInt(tabMatch[1], 10)
|
||
}
|
||
} finally {
|
||
try {
|
||
await transport.close()
|
||
} catch {}
|
||
}
|
||
|
||
if (!windowId || !tabId) {
|
||
console.log('❌ Could not determine window/tab IDs')
|
||
console.log('Trying to get from list tabs...')
|
||
|
||
// Try listing tabs
|
||
const client2 = new Client({ name: 'long-run-test', version: '1.0.0' })
|
||
const transport2 = new StreamableHTTPClientTransport(new URL(MCP_URL))
|
||
try {
|
||
await client2.connect(transport2)
|
||
const tabs = await client2.callTool({
|
||
name: 'browser_list_tabs',
|
||
arguments: {},
|
||
})
|
||
console.log('Tabs response:', JSON.stringify(tabs, null, 2))
|
||
} finally {
|
||
try {
|
||
await transport2.close()
|
||
} catch {}
|
||
}
|
||
return
|
||
}
|
||
|
||
console.log(`Window: ${windowId}, Tab: ${tabId}`)
|
||
console.log()
|
||
|
||
await new Promise((r) => setTimeout(r, 2000))
|
||
|
||
// Stats
|
||
let screenshotSuccess = 0
|
||
let screenshotFail = 0
|
||
let toolSuccess = 0
|
||
let toolFail = 0
|
||
let extensionDisconnects = 0
|
||
|
||
const startTime = Date.now()
|
||
|
||
// Simulate turns
|
||
for (let turn = 1; turn <= NUM_TURNS; turn++) {
|
||
const _turnStart = Date.now()
|
||
|
||
// Random tool calls to simulate agent behavior
|
||
const tools = [
|
||
{
|
||
name: 'browser_get_interactive_elements',
|
||
args: { tabId, windowId, simplified: true },
|
||
},
|
||
{ name: 'browser_list_tabs', args: { windowId } },
|
||
{ name: 'browser_get_active_tab', args: { windowId } },
|
||
]
|
||
|
||
// Pick a random tool
|
||
const tool = tools[Math.floor(Math.random() * tools.length)]
|
||
const toolRes = await callMcpTool(tool.name, tool.args, 30000)
|
||
|
||
if (toolRes.success) {
|
||
toolSuccess++
|
||
} else {
|
||
toolFail++
|
||
console.log(` Turn ${turn}: ❌ ${tool.name} failed: ${toolRes.error}`)
|
||
}
|
||
|
||
// Screenshot every N turns
|
||
if (turn % SCREENSHOT_EVERY_N_TURNS === 0) {
|
||
const ssRes = await callMcpTool(
|
||
'browser_get_screenshot',
|
||
{ tabId, windowId, size: 'small' },
|
||
65000,
|
||
)
|
||
|
||
if (ssRes.success) {
|
||
screenshotSuccess++
|
||
} else {
|
||
screenshotFail++
|
||
console.log(` Turn ${turn}: ❌ Screenshot failed: ${ssRes.error}`)
|
||
}
|
||
}
|
||
|
||
// Check extension status
|
||
const extConnected = await checkExtension()
|
||
if (!extConnected) {
|
||
extensionDisconnects++
|
||
console.log(` Turn ${turn}: ⚠️ Extension disconnected!`)
|
||
}
|
||
|
||
// Progress
|
||
if (turn % 10 === 0) {
|
||
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1)
|
||
console.log(
|
||
`Turn ${turn}/${NUM_TURNS} - Screenshots: ${screenshotSuccess}/${turn}, Tools: ${toolSuccess}/${turn}, Disconnects: ${extensionDisconnects}, Elapsed: ${elapsed}s`,
|
||
)
|
||
}
|
||
|
||
// Small delay between turns
|
||
await new Promise((r) => setTimeout(r, 200))
|
||
}
|
||
|
||
// Cleanup
|
||
console.log('\nClosing window...')
|
||
await callMcpTool('browser_close_window', { windowId })
|
||
|
||
// Summary
|
||
const totalTime = ((Date.now() - startTime) / 1000).toFixed(1)
|
||
|
||
console.log(`\n${'='.repeat(60)}`)
|
||
console.log('SUMMARY')
|
||
console.log('='.repeat(60))
|
||
console.log(`Total time: ${totalTime}s`)
|
||
console.log(
|
||
`Screenshots: ${screenshotSuccess}/${NUM_TURNS} (${((screenshotSuccess / NUM_TURNS) * 100).toFixed(1)}%)`,
|
||
)
|
||
console.log(
|
||
`Tool calls: ${toolSuccess}/${NUM_TURNS} (${((toolSuccess / NUM_TURNS) * 100).toFixed(1)}%)`,
|
||
)
|
||
console.log(`Extension disconnects: ${extensionDisconnects}`)
|
||
|
||
if (screenshotFail > 0 || toolFail > 0 || extensionDisconnects > 0) {
|
||
console.log('\n⚠️ Issues detected during long run!')
|
||
} else {
|
||
console.log('\n✅ All operations completed successfully!')
|
||
}
|
||
}
|
||
|
||
main().catch(console.error)
|