diff --git a/packages/browseros-agent/apps/eval/scripts/weekly-report.ts b/packages/browseros-agent/apps/eval/scripts/weekly-report.ts
index 20a828038..ec67ea9a4 100644
--- a/packages/browseros-agent/apps/eval/scripts/weekly-report.ts
+++ b/packages/browseros-agent/apps/eval/scripts/weekly-report.ts
@@ -47,7 +47,7 @@ interface RunSummary {
runId: string
configName: string
date: string
- passRate: number
+ avgScore: number
total: number
completed: number
failed: number
@@ -135,20 +135,20 @@ const runs: RunSummary[] = manifests
const failed = m.tasks.filter((t) => t.status === 'failed').length
const timeout = m.tasks.filter((t) => t.status === 'timeout').length
- let graded = 0
- let passed = 0
+ let scoredCount = 0
+ let scoreSum = 0
for (const task of m.tasks) {
if (!task.graderResults) continue
for (const name of PASS_FAIL_GRADER_ORDER) {
if (task.graderResults[name]) {
- graded++
- if (task.graderResults[name].pass) passed++
+ scoredCount++
+ scoreSum += task.graderResults[name].score ?? 0
break
}
}
}
- const passRate = graded > 0 ? passed / graded : 0
+ const avgScore = scoredCount > 0 ? (scoreSum / scoredCount) * 100 : 0
const durations = m.tasks
.filter((t) => t.durationMs > 0)
.map((t) => t.durationMs)
@@ -170,7 +170,7 @@ const runs: RunSummary[] = manifests
runId: m.runId,
configName,
date,
- passRate,
+ avgScore,
total,
completed,
failed,
@@ -242,7 +242,7 @@ const html = `
.stat-value.big { font-size: 2.5rem; font-weight: 700; }
.pass { color: #3fb950; }
.fail { color: #f85149; }
- .neutral { color: #8b949e; }
+ .neutral { color: #f0883e; }
.trend-up { color: #3fb950; }
.trend-down { color: #f85149; }
.trend-flat { color: #8b949e; }
@@ -314,7 +314,7 @@ const html = `
Model |
Dataset |
Architecture |
- Pass Rate |
+ Score |
Tasks |
Timeout |
Avg Duration |
@@ -327,7 +327,6 @@ const html = `
.reverse()
.map((r) => {
const viewerUrl = `viewer.html?run=${encodeURIComponent(r.runId)}`
- const passed = Math.round(r.passRate * r.total)
const archLabel =
r.agentType === 'orchestrator-executor'
? 'Orch-Exec'
@@ -342,7 +341,7 @@ const html = `
${escHtml(r.model)} |
${escHtml(r.dataset)} |
${escHtml(archLabel)} |
- ${(r.passRate * 100).toFixed(1)}% (${passed}/${r.total}) |
+ ${r.avgScore.toFixed(1)}% |
${r.total} |
${r.timeout} |
${(r.avgDurationMs / 1000).toFixed(0)}s |
@@ -386,10 +385,12 @@ const html = `
: latest.agentType === 'single' ? 'Single Agent (Tool Loop)'
: latest.agentType === 'gemini-computer-use' ? 'Gemini Computer Use'
: latest.agentType || 'Unknown';
+ var scoreColor = latest.avgScore >= 75 ? '#3fb950' : latest.avgScore >= 40 ? '#f0883e' : '#f85149';
el.innerHTML =
'Architecture' + archLabel + '
' +
'Model' + (latest.model || 'unknown') + '
' +
'Dataset' + (latest.dataset || 'unknown') + '
' +
+ 'Latest Score' + latest.avgScore.toFixed(1) + '%
' +
'Tasks' + latest.total + '
' +
'Runs' + runs.length + '
';
}
@@ -400,15 +401,16 @@ const html = `
if (runs.length === 0) { el.innerHTML = ''; return; }
var latest = runs[runs.length - 1];
var prev = runs.length >= 2 ? runs[runs.length - 2] : null;
- var best = Math.max.apply(null, runs.map(function(r) { return r.passRate; }));
- var delta = prev ? latest.passRate - prev.passRate : 0;
+ var best = Math.max.apply(null, runs.map(function(r) { return r.avgScore; }));
+ var delta = prev ? latest.avgScore - prev.avgScore : 0;
var sign = delta > 0 ? '+' : '';
var trendCls = delta > 0 ? 'trend-up' : delta < 0 ? 'trend-down' : 'trend-flat';
+ var latestColor = latest.avgScore >= 75 ? 'pass' : latest.avgScore >= 40 ? 'neutral' : 'fail';
el.innerHTML =
- 'Latest Pass Rate
' + (latest.passRate * 100).toFixed(1) + '%
' +
- 'Trend
' + (prev ? sign + (delta * 100).toFixed(1) + ' pp' : 'N/A') + '
' +
- 'Best Score
' + (best * 100).toFixed(1) + '%
' +
+ 'Latest Score
' + latest.avgScore.toFixed(1) + '%
' +
+ 'Trend
' + (prev ? sign + delta.toFixed(1) + ' pp' : 'N/A') + '
' +
+ 'Best Score
' + best.toFixed(1) + '%
' +
'Avg Duration
' + (latest.avgDurationMs / 1000).toFixed(0) + 's
' +
'';
}
@@ -436,7 +438,7 @@ const html = `
return;
}
- var scores = runs.map(function(r) { return r.passRate * 100; });
+ var scores = runs.map(function(r) { return r.avgScore; });
var minY = Math.max(0, Math.floor(Math.min.apply(null, scores) / 10) * 10 - 10);
var maxY = Math.min(100, Math.ceil(Math.max.apply(null, scores) / 10) * 10 + 10);
if (minY === maxY) { minY = Math.max(0, minY - 10); maxY = Math.min(100, maxY + 10); }
@@ -463,7 +465,7 @@ const html = `
ctx.strokeStyle = '#58a6ff'; ctx.lineWidth = 2; ctx.beginPath();
runs.forEach(function(r, i) {
var px = pad.left + (runs.length === 1 ? plotW / 2 : (i / (runs.length - 1)) * plotW);
- var py2 = pad.top + plotH - ((r.passRate * 100 - minY) / (maxY - minY)) * plotH;
+ var py2 = pad.top + plotH - ((r.avgScore - minY) / (maxY - minY)) * plotH;
if (i === 0) ctx.moveTo(px, py2); else ctx.lineTo(px, py2);
});
ctx.stroke();
@@ -471,10 +473,10 @@ const html = `
// Dots
runs.forEach(function(r, i) {
var px = pad.left + (runs.length === 1 ? plotW / 2 : (i / (runs.length - 1)) * plotW);
- var py2 = pad.top + plotH - ((r.passRate * 100 - minY) / (maxY - minY)) * plotH;
+ var py2 = pad.top + plotH - ((r.avgScore - minY) / (maxY - minY)) * plotH;
dotPositions.push({ x: px, y: py2, run: r });
ctx.beginPath(); ctx.arc(px, py2, 4, 0, Math.PI * 2);
- ctx.fillStyle = r.passRate >= 0.7 ? '#3fb950' : '#f85149';
+ ctx.fillStyle = r.avgScore >= 75 ? '#3fb950' : r.avgScore >= 40 ? '#f0883e' : '#f85149';
ctx.fill(); ctx.strokeStyle = '#0d1117'; ctx.lineWidth = 2; ctx.stroke();
});
}
@@ -491,11 +493,10 @@ const html = `
if (closest && closestDist < 40) {
var r = closest.run;
- var passed = Math.round(r.passRate * r.total);
document.getElementById('tt-date').textContent = r.date;
- document.getElementById('tt-score').textContent = (r.passRate * 100).toFixed(1) + '%';
- document.getElementById('tt-score').style.color = r.passRate >= 0.7 ? '#3fb950' : '#f85149';
- document.getElementById('tt-detail').textContent = passed + '/' + r.total + ' pass \\u00B7 ' + (r.avgDurationMs / 1000).toFixed(0) + 's avg \\u00B7 ' + r.model;
+ document.getElementById('tt-score').textContent = r.avgScore.toFixed(1) + '%';
+ document.getElementById('tt-score').style.color = r.avgScore >= 75 ? '#3fb950' : r.avgScore >= 40 ? '#f0883e' : '#f85149';
+ document.getElementById('tt-detail').textContent = 'score ' + r.avgScore.toFixed(1) + '% \\u00B7 ' + r.total + ' tasks \\u00B7 ' + (r.avgDurationMs / 1000).toFixed(0) + 's avg \\u00B7 ' + r.model;
tooltip.style.display = 'block';
var tx = closest.x + 12, ty = closest.y - 50;
@@ -508,7 +509,7 @@ const html = `
ctx.beginPath(); ctx.arc(closest.x, closest.y, 7, 0, Math.PI * 2);
ctx.fillStyle = 'rgba(88, 166, 255, 0.3)'; ctx.fill();
ctx.beginPath(); ctx.arc(closest.x, closest.y, 5, 0, Math.PI * 2);
- ctx.fillStyle = r.passRate >= 0.7 ? '#3fb950' : '#f85149'; ctx.fill();
+ ctx.fillStyle = r.avgScore >= 75 ? '#3fb950' : r.avgScore >= 40 ? '#f0883e' : '#f85149'; ctx.fill();
ctx.strokeStyle = '#e6edf3'; ctx.lineWidth = 2; ctx.stroke();
canvas.style.cursor = 'pointer';
} else {
@@ -584,7 +585,7 @@ console.log(` View at: ${cdnBaseUrl}/report.html`)
// Print summary
console.log('\nScore trend:')
for (const run of runs.slice(-10)) {
- const bar = '\u2588'.repeat(Math.round(run.passRate * 20))
- const pct = (run.passRate * 100).toFixed(0).padStart(3)
+ const bar = '\u2588'.repeat(Math.round(run.avgScore / 5))
+ const pct = run.avgScore.toFixed(0).padStart(3)
console.log(` ${run.date} ${pct}% ${bar}`)
}
diff --git a/packages/browseros-agent/apps/eval/src/dashboard/viewer.html b/packages/browseros-agent/apps/eval/src/dashboard/viewer.html
index 81f31a6c8..aafa27bef 100644
--- a/packages/browseros-agent/apps/eval/src/dashboard/viewer.html
+++ b/packages/browseros-agent/apps/eval/src/dashboard/viewer.html
@@ -1221,6 +1221,12 @@
const graders = task.graderResults || {};
const keys = Object.keys(graders);
if (keys.length === 0) return { label: '', cls: '' };
+ const firstKey = keys[0];
+ const score = graders[firstKey].score;
+ if (typeof score === 'number') {
+ const pct = Math.round(score * 100);
+ return { label: pct + '%', cls: pct >= 75 ? 'pass' : 'fail' };
+ }
const anyPass = keys.some((k) => graders[k].pass);
return { label: anyPass ? 'PASS' : 'FAIL', cls: anyPass ? 'pass' : 'fail' };
}