diff --git a/packages/browseros-agent/apps/eval/scripts/weekly-report.ts b/packages/browseros-agent/apps/eval/scripts/weekly-report.ts index 20a828038..ec67ea9a4 100644 --- a/packages/browseros-agent/apps/eval/scripts/weekly-report.ts +++ b/packages/browseros-agent/apps/eval/scripts/weekly-report.ts @@ -47,7 +47,7 @@ interface RunSummary { runId: string configName: string date: string - passRate: number + avgScore: number total: number completed: number failed: number @@ -135,20 +135,20 @@ const runs: RunSummary[] = manifests const failed = m.tasks.filter((t) => t.status === 'failed').length const timeout = m.tasks.filter((t) => t.status === 'timeout').length - let graded = 0 - let passed = 0 + let scoredCount = 0 + let scoreSum = 0 for (const task of m.tasks) { if (!task.graderResults) continue for (const name of PASS_FAIL_GRADER_ORDER) { if (task.graderResults[name]) { - graded++ - if (task.graderResults[name].pass) passed++ + scoredCount++ + scoreSum += task.graderResults[name].score ?? 0 break } } } - const passRate = graded > 0 ? passed / graded : 0 + const avgScore = scoredCount > 0 ? (scoreSum / scoredCount) * 100 : 0 const durations = m.tasks .filter((t) => t.durationMs > 0) .map((t) => t.durationMs) @@ -170,7 +170,7 @@ const runs: RunSummary[] = manifests runId: m.runId, configName, date, - passRate, + avgScore, total, completed, failed, @@ -242,7 +242,7 @@ const html = ` .stat-value.big { font-size: 2.5rem; font-weight: 700; } .pass { color: #3fb950; } .fail { color: #f85149; } - .neutral { color: #8b949e; } + .neutral { color: #f0883e; } .trend-up { color: #3fb950; } .trend-down { color: #f85149; } .trend-flat { color: #8b949e; } @@ -314,7 +314,7 @@ const html = ` Model Dataset Architecture - Pass Rate + Score Tasks Timeout Avg Duration @@ -327,7 +327,6 @@ const html = ` .reverse() .map((r) => { const viewerUrl = `viewer.html?run=${encodeURIComponent(r.runId)}` - const passed = Math.round(r.passRate * r.total) const archLabel = r.agentType === 'orchestrator-executor' ? 'Orch-Exec' @@ -342,7 +341,7 @@ const html = ` ${escHtml(r.model)} ${escHtml(r.dataset)} ${escHtml(archLabel)} - ${(r.passRate * 100).toFixed(1)}% (${passed}/${r.total}) + ${r.avgScore.toFixed(1)}% ${r.total} ${r.timeout} ${(r.avgDurationMs / 1000).toFixed(0)}s @@ -386,10 +385,12 @@ const html = ` : latest.agentType === 'single' ? 'Single Agent (Tool Loop)' : latest.agentType === 'gemini-computer-use' ? 'Gemini Computer Use' : latest.agentType || 'Unknown'; + var scoreColor = latest.avgScore >= 75 ? '#3fb950' : latest.avgScore >= 40 ? '#f0883e' : '#f85149'; el.innerHTML = '
Architecture' + archLabel + '
' + '
Model' + (latest.model || 'unknown') + '
' + '
Dataset' + (latest.dataset || 'unknown') + '
' + + '
Latest Score' + latest.avgScore.toFixed(1) + '%
' + '
Tasks' + latest.total + '
' + '
Runs' + runs.length + '
'; } @@ -400,15 +401,16 @@ const html = ` if (runs.length === 0) { el.innerHTML = ''; return; } var latest = runs[runs.length - 1]; var prev = runs.length >= 2 ? runs[runs.length - 2] : null; - var best = Math.max.apply(null, runs.map(function(r) { return r.passRate; })); - var delta = prev ? latest.passRate - prev.passRate : 0; + var best = Math.max.apply(null, runs.map(function(r) { return r.avgScore; })); + var delta = prev ? latest.avgScore - prev.avgScore : 0; var sign = delta > 0 ? '+' : ''; var trendCls = delta > 0 ? 'trend-up' : delta < 0 ? 'trend-down' : 'trend-flat'; + var latestColor = latest.avgScore >= 75 ? 'pass' : latest.avgScore >= 40 ? 'neutral' : 'fail'; el.innerHTML = - '
Latest Pass Rate
' + (latest.passRate * 100).toFixed(1) + '%
' + - '
Trend
' + (prev ? sign + (delta * 100).toFixed(1) + ' pp' : 'N/A') + '
' + - '
Best Score
' + (best * 100).toFixed(1) + '%
' + + '
Latest Score
' + latest.avgScore.toFixed(1) + '%
' + + '
Trend
' + (prev ? sign + delta.toFixed(1) + ' pp' : 'N/A') + '
' + + '
Best Score
' + best.toFixed(1) + '%
' + '
Avg Duration
' + (latest.avgDurationMs / 1000).toFixed(0) + 's
' + '
Runs
' + runs.length + '
'; } @@ -436,7 +438,7 @@ const html = ` return; } - var scores = runs.map(function(r) { return r.passRate * 100; }); + var scores = runs.map(function(r) { return r.avgScore; }); var minY = Math.max(0, Math.floor(Math.min.apply(null, scores) / 10) * 10 - 10); var maxY = Math.min(100, Math.ceil(Math.max.apply(null, scores) / 10) * 10 + 10); if (minY === maxY) { minY = Math.max(0, minY - 10); maxY = Math.min(100, maxY + 10); } @@ -463,7 +465,7 @@ const html = ` ctx.strokeStyle = '#58a6ff'; ctx.lineWidth = 2; ctx.beginPath(); runs.forEach(function(r, i) { var px = pad.left + (runs.length === 1 ? plotW / 2 : (i / (runs.length - 1)) * plotW); - var py2 = pad.top + plotH - ((r.passRate * 100 - minY) / (maxY - minY)) * plotH; + var py2 = pad.top + plotH - ((r.avgScore - minY) / (maxY - minY)) * plotH; if (i === 0) ctx.moveTo(px, py2); else ctx.lineTo(px, py2); }); ctx.stroke(); @@ -471,10 +473,10 @@ const html = ` // Dots runs.forEach(function(r, i) { var px = pad.left + (runs.length === 1 ? plotW / 2 : (i / (runs.length - 1)) * plotW); - var py2 = pad.top + plotH - ((r.passRate * 100 - minY) / (maxY - minY)) * plotH; + var py2 = pad.top + plotH - ((r.avgScore - minY) / (maxY - minY)) * plotH; dotPositions.push({ x: px, y: py2, run: r }); ctx.beginPath(); ctx.arc(px, py2, 4, 0, Math.PI * 2); - ctx.fillStyle = r.passRate >= 0.7 ? '#3fb950' : '#f85149'; + ctx.fillStyle = r.avgScore >= 75 ? '#3fb950' : r.avgScore >= 40 ? '#f0883e' : '#f85149'; ctx.fill(); ctx.strokeStyle = '#0d1117'; ctx.lineWidth = 2; ctx.stroke(); }); } @@ -491,11 +493,10 @@ const html = ` if (closest && closestDist < 40) { var r = closest.run; - var passed = Math.round(r.passRate * r.total); document.getElementById('tt-date').textContent = r.date; - document.getElementById('tt-score').textContent = (r.passRate * 100).toFixed(1) + '%'; - document.getElementById('tt-score').style.color = r.passRate >= 0.7 ? '#3fb950' : '#f85149'; - document.getElementById('tt-detail').textContent = passed + '/' + r.total + ' pass \\u00B7 ' + (r.avgDurationMs / 1000).toFixed(0) + 's avg \\u00B7 ' + r.model; + document.getElementById('tt-score').textContent = r.avgScore.toFixed(1) + '%'; + document.getElementById('tt-score').style.color = r.avgScore >= 75 ? '#3fb950' : r.avgScore >= 40 ? '#f0883e' : '#f85149'; + document.getElementById('tt-detail').textContent = 'score ' + r.avgScore.toFixed(1) + '% \\u00B7 ' + r.total + ' tasks \\u00B7 ' + (r.avgDurationMs / 1000).toFixed(0) + 's avg \\u00B7 ' + r.model; tooltip.style.display = 'block'; var tx = closest.x + 12, ty = closest.y - 50; @@ -508,7 +509,7 @@ const html = ` ctx.beginPath(); ctx.arc(closest.x, closest.y, 7, 0, Math.PI * 2); ctx.fillStyle = 'rgba(88, 166, 255, 0.3)'; ctx.fill(); ctx.beginPath(); ctx.arc(closest.x, closest.y, 5, 0, Math.PI * 2); - ctx.fillStyle = r.passRate >= 0.7 ? '#3fb950' : '#f85149'; ctx.fill(); + ctx.fillStyle = r.avgScore >= 75 ? '#3fb950' : r.avgScore >= 40 ? '#f0883e' : '#f85149'; ctx.fill(); ctx.strokeStyle = '#e6edf3'; ctx.lineWidth = 2; ctx.stroke(); canvas.style.cursor = 'pointer'; } else { @@ -584,7 +585,7 @@ console.log(` View at: ${cdnBaseUrl}/report.html`) // Print summary console.log('\nScore trend:') for (const run of runs.slice(-10)) { - const bar = '\u2588'.repeat(Math.round(run.passRate * 20)) - const pct = (run.passRate * 100).toFixed(0).padStart(3) + const bar = '\u2588'.repeat(Math.round(run.avgScore / 5)) + const pct = run.avgScore.toFixed(0).padStart(3) console.log(` ${run.date} ${pct}% ${bar}`) } diff --git a/packages/browseros-agent/apps/eval/src/dashboard/viewer.html b/packages/browseros-agent/apps/eval/src/dashboard/viewer.html index 81f31a6c8..aafa27bef 100644 --- a/packages/browseros-agent/apps/eval/src/dashboard/viewer.html +++ b/packages/browseros-agent/apps/eval/src/dashboard/viewer.html @@ -1221,6 +1221,12 @@ const graders = task.graderResults || {}; const keys = Object.keys(graders); if (keys.length === 0) return { label: '', cls: '' }; + const firstKey = keys[0]; + const score = graders[firstKey].score; + if (typeof score === 'number') { + const pct = Math.round(score * 100); + return { label: pct + '%', cls: pct >= 75 ? 'pass' : 'fail' }; + } const anyPass = keys.some((k) => graders[k].pass); return { label: anyPass ? 'PASS' : 'FAIL', cls: anyPass ? 'pass' : 'fail' }; }