mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
- Add hover_at, type_at, drag_at coordinate tools to server - Add hoverAt, typeAt, dragAt methods to Browser class - Export server internals (browser, tool-loop, registry) for eval imports - Copy eval app from enterprise repo with agents, graders, runner, dashboard - Nest eval-targets inside apps/eval - Adapt sessionExecutionDir → workingDir for current server API - Add biome ignore for dashboard HTML to prevent lint breaking onclick handlers
234 lines
6.6 KiB
TypeScript
Vendored
234 lines
6.6 KiB
TypeScript
Vendored
/**
|
|
* Analyze WebBench results across 4 agents to stratify tasks by pass count.
|
|
* Usage: bun apps/eval/scripts/analyze-webbench.ts
|
|
*/
|
|
import { parse } from 'csv-parse/sync'
|
|
|
|
const dataDir = 'apps/eval/data/webbench'
|
|
|
|
interface AgentConfig {
|
|
file: string
|
|
evalCol: string
|
|
name: string
|
|
}
|
|
|
|
const agents: AgentConfig[] = [
|
|
{ file: 'anthropicfinal.csv', evalCol: 'Anthropic_Eval', name: 'Anthropic' },
|
|
{ file: 'skyvern2.0final.csv', evalCol: 'Skyvern2.0Eval', name: 'Skyvern' },
|
|
{ file: 'openaicuafinal.csv', evalCol: 'CUAEval', name: 'OpenAI CUA' },
|
|
{ file: 'browserusefinal.csv', evalCol: 'BUEval', name: 'BrowserUse' },
|
|
]
|
|
|
|
type Row = Record<string, string>
|
|
|
|
// Parse each agent's results
|
|
const agentResults = new Map<
|
|
string,
|
|
Map<
|
|
number,
|
|
{
|
|
eval: string
|
|
difficulty: string
|
|
category: string
|
|
task: string
|
|
url: string
|
|
}
|
|
>
|
|
>()
|
|
|
|
for (const agent of agents) {
|
|
const text = await Bun.file(`${dataDir}/${agent.file}`).text()
|
|
const rows: Row[] = parse(text, {
|
|
columns: true,
|
|
skip_empty_lines: true,
|
|
relax_column_count: true,
|
|
})
|
|
const results = new Map<
|
|
number,
|
|
{
|
|
eval: string
|
|
difficulty: string
|
|
category: string
|
|
task: string
|
|
url: string
|
|
}
|
|
>()
|
|
for (const row of rows) {
|
|
const id = parseInt(row.ID, 10)
|
|
if (Number.isNaN(id)) continue
|
|
results.set(id, {
|
|
eval: row[agent.evalCol] || '',
|
|
difficulty: row.Difficulty || '',
|
|
category: row.Category || '',
|
|
task: row.Task || '',
|
|
url: row['Starting URL'] || '',
|
|
})
|
|
}
|
|
agentResults.set(agent.name, results)
|
|
console.log(`${agent.name}: ${results.size} tasks loaded`)
|
|
}
|
|
|
|
// Find common task IDs (present in all 4 agents)
|
|
const allIds = new Set<number>()
|
|
for (const [, results] of agentResults) {
|
|
for (const id of results.keys()) allIds.add(id)
|
|
}
|
|
|
|
// Build pass count per task
|
|
interface TaskStats {
|
|
id: number
|
|
passCount: number
|
|
difficulty: string
|
|
category: string
|
|
task: string
|
|
url: string
|
|
agents: Record<string, string>
|
|
}
|
|
|
|
const taskStats: TaskStats[] = []
|
|
const _fullAgentNames = agents.map((a) => a.name)
|
|
|
|
for (const id of allIds) {
|
|
let passCount = 0
|
|
let _presentCount = 0
|
|
const agentEvals: Record<string, string> = {}
|
|
let difficulty = ''
|
|
let category = ''
|
|
let task = ''
|
|
let url = ''
|
|
|
|
for (const agent of agents) {
|
|
const result = agentResults.get(agent.name)?.get(id)
|
|
if (result) {
|
|
_presentCount++
|
|
const isSuccess = result.eval?.toLowerCase().includes('success')
|
|
if (isSuccess) passCount++
|
|
agentEvals[agent.name] = isSuccess ? 'PASS' : 'FAIL'
|
|
if (!difficulty) difficulty = result.difficulty
|
|
if (!category) category = result.category
|
|
if (!task) task = result.task
|
|
if (!url) url = result.url
|
|
} else {
|
|
agentEvals[agent.name] = 'N/A'
|
|
}
|
|
}
|
|
|
|
taskStats.push({
|
|
id,
|
|
passCount,
|
|
difficulty,
|
|
category,
|
|
task,
|
|
url,
|
|
agents: agentEvals,
|
|
})
|
|
}
|
|
|
|
// Group by pass count
|
|
const byPassCount: Record<number, TaskStats[]> = {
|
|
0: [],
|
|
1: [],
|
|
2: [],
|
|
3: [],
|
|
4: [],
|
|
}
|
|
for (const t of taskStats) {
|
|
byPassCount[t.passCount].push(t)
|
|
}
|
|
|
|
console.log('\n═══════════════════════════════════════════════════')
|
|
console.log('TASKS BY PASS COUNT (how many agents succeeded)')
|
|
console.log('═══════════════════════════════════════════════════\n')
|
|
|
|
for (let pc = 0; pc <= 4; pc++) {
|
|
const tasks = byPassCount[pc]
|
|
const label =
|
|
pc === 0 ? '0/4 (ALL FAIL)' : pc === 4 ? '4/4 (ALL PASS)' : `${pc}/4`
|
|
console.log(`${label}: ${tasks.length} tasks`)
|
|
|
|
// Breakdown by difficulty
|
|
const easy = tasks.filter((t) => t.difficulty === 'easy').length
|
|
const hard = tasks.filter((t) => t.difficulty === 'hard').length
|
|
console.log(` easy: ${easy}, hard: ${hard}`)
|
|
|
|
// Breakdown by category
|
|
const byCat: Record<string, number> = {}
|
|
for (const t of tasks) {
|
|
byCat[t.category] = (byCat[t.category] || 0) + 1
|
|
}
|
|
console.log(
|
|
` categories: ${Object.entries(byCat)
|
|
.sort((a, b) => b[1] - a[1])
|
|
.map(([c, n]) => `${c}(${n})`)
|
|
.join(', ')}`,
|
|
)
|
|
console.log()
|
|
}
|
|
|
|
// Now handle BrowserUse only having 658 tasks — let's also do a 3-agent view (Anthropic, Skyvern, OpenAI)
|
|
console.log('\n═══════════════════════════════════════════════════')
|
|
console.log('3-AGENT VIEW (Anthropic + Skyvern + OpenAI CUA)')
|
|
console.log('(BrowserUse only has 658 tasks, so this is more complete)')
|
|
console.log('═══════════════════════════════════════════════════\n')
|
|
|
|
const threeAgents = ['Anthropic', 'Skyvern', 'OpenAI CUA']
|
|
const byPassCount3: Record<number, TaskStats[]> = { 0: [], 1: [], 2: [], 3: [] }
|
|
|
|
for (const t of taskStats) {
|
|
let pc3 = 0
|
|
let allPresent = true
|
|
for (const a of threeAgents) {
|
|
if (t.agents[a] === 'N/A') {
|
|
allPresent = false
|
|
break
|
|
}
|
|
if (t.agents[a] === 'PASS') pc3++
|
|
}
|
|
if (!allPresent) continue
|
|
if (!byPassCount3[pc3]) byPassCount3[pc3] = []
|
|
byPassCount3[pc3].push(t)
|
|
}
|
|
|
|
let total3 = 0
|
|
for (let pc = 0; pc <= 3; pc++) {
|
|
const tasks = byPassCount3[pc]
|
|
total3 += tasks.length
|
|
const label =
|
|
pc === 0 ? '0/3 (ALL FAIL)' : pc === 3 ? '3/3 (ALL PASS)' : `${pc}/3`
|
|
console.log(`${label}: ${tasks.length} tasks`)
|
|
|
|
const easy = tasks.filter((t) => t.difficulty === 'easy').length
|
|
const hard = tasks.filter((t) => t.difficulty === 'hard').length
|
|
console.log(` easy: ${easy}, hard: ${hard}`)
|
|
|
|
const byCat: Record<string, number> = {}
|
|
for (const t of tasks) {
|
|
byCat[t.category] = (byCat[t.category] || 0) + 1
|
|
}
|
|
console.log(
|
|
` categories: ${Object.entries(byCat)
|
|
.sort((a, b) => b[1] - a[1])
|
|
.map(([c, n]) => `${c}(${n})`)
|
|
.join(', ')}`,
|
|
)
|
|
|
|
// Show unique websites count
|
|
const uniqueUrls = new Set(tasks.map((t) => t.url))
|
|
console.log(` unique websites: ${uniqueUrls.size}`)
|
|
console.log()
|
|
}
|
|
console.log(`Total tasks in 3-agent intersection: ${total3}`)
|
|
|
|
// Quick sample of 0/3 tasks (hardest)
|
|
console.log('\n── Sample 0/3 (all fail) tasks ──')
|
|
byPassCount3[0].slice(0, 5).forEach((t) => {
|
|
console.log(` [${t.id}] [${t.difficulty}] [${t.category}] ${t.url}`)
|
|
console.log(` ${t.task.slice(0, 150)}`)
|
|
})
|
|
|
|
console.log('\n── Sample 1/3 tasks ──')
|
|
byPassCount3[1].slice(0, 5).forEach((t) => {
|
|
console.log(` [${t.id}] [${t.difficulty}] [${t.category}] ${t.url}`)
|
|
console.log(` ${t.task.slice(0, 150)}`)
|
|
})
|