mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
* feat: add deterministic eval graders (AGI SDK + WebArena-Infinity) Two new benchmark integrations with programmatic grading — no LLM judge. AGI SDK / REAL Bench (52 tasks): - 11 React/Next.js clones of consumer apps (DoorDash, Amazon, Gmail, etc.) - Grader navigates browser to /finish, extracts state diff from <pre> tag - Python verifier checks exact values via jmespath queries WebArena-Infinity (50 hard tasks): - 13 LLM-generated SaaS clones (Gmail, GitLab, Linear, Figma, etc.) - InfinityAppManager starts fresh app server per task per worker - Python verifier calls /api/state and asserts on JSON state Infrastructure: - GraderInput extended with mcpUrl + infinityAppUrl for parallel workers - Each worker gets isolated ports (no cross-worker state contamination) - CI workflow: pip install agisdk, clone webarena-infinity repo * chore: switch eval configs back to kimi-k2p5 * fix: register deterministic graders in pass rate calculation Add agisdk_state_diff and infinity_state to PASS_FAIL_GRADER_ORDER in both runner types and weekly report script, so scores show correctly in the dashboard. * chore: temp switch to opus 4.6 for eval run * chore: restore kimi-k2p5 as default eval config * ci: add timeout and continue-on-error for trend report step
84 lines
2.2 KiB
Python
Vendored
84 lines
2.2 KiB
Python
Vendored
#!/usr/bin/env python3
|
|
"""
|
|
Build JSONL dataset for AGI SDK / REAL Bench evaluation.
|
|
|
|
Reads task definitions from the agisdk package, filters to feasible
|
|
action-only tasks (excludes llm_boolean evaluators), and outputs JSONL
|
|
to stdout in the BrowserOS eval framework format.
|
|
|
|
Usage:
|
|
python scripts/build-agisdk-dataset.py > data/agisdk-real.jsonl
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
|
|
|
|
def has_llm_eval(task: dict) -> bool:
    """Return True if *task* contains any ``llm_boolean`` evaluator.

    Tasks with such evaluators need an LLM judge and are excluded from
    the deterministic dataset.
    """
    for evaluator in task.get("evals", []):
        if evaluator.get("type") == "llm_boolean":
            return True
    return False
|
def main():
    """Emit the AGI SDK JSONL dataset on stdout.

    One JSON object per line for every feasible, programmatically
    gradable task. Progress warnings and the final summary go to
    stderr so stdout stays clean JSONL. Exits with status 1 when the
    ``agisdk`` package is not installed.
    """
    try:
        from agisdk.REAL.tasks import all_tasks
    except ImportError:
        print(
            "Error: agisdk package not installed. Run: pip install agisdk",
            file=sys.stderr,
        )
        sys.exit(1)

    emitted = 0
    n_infeasible = 0
    n_llm = 0

    for task in all_tasks:
        # Drop tasks the benchmark itself marks as impossible.
        if not task.get("possible", True):
            n_infeasible += 1
            continue

        # Drop tasks that can only be graded by an LLM judge.
        if has_llm_eval(task):
            n_llm += 1
            continue

        task_id = task["id"]
        site = task.get("website", {})
        goal = task.get("goal", "")
        start_url = site.get("url", "")

        # Both a goal and a start URL are required by the eval framework.
        if not start_url or not goal:
            print(f"Warning: Skipping {task_id} — missing url or goal", file=sys.stderr)
            continue

        extra = {
            "agisdk_task_id": task_id,
            "challenge_type": task.get("challengeType", "action"),
            "difficulty": task.get("difficulty", "unknown"),
            "similar_to": site.get("similarTo", ""),
        }
        meta = {
            "original_task_id": task_id,
            "website": site.get("name", ""),
            "category": "agisdk-real",
            "additional": extra,
        }
        record = {
            "query_id": f"agisdk-{task_id}",
            "dataset": "agisdk-real",
            "query": goal,
            "graders": ["agisdk_state_diff"],
            "start_url": start_url,
            "metadata": meta,
        }

        print(json.dumps(record))
        emitted += 1

    print(
        f"Generated {emitted} tasks (skipped {n_infeasible} infeasible, "
        f"{n_llm} llm_boolean)",
        file=sys.stderr,
    )
|
# Script entry point — run only when executed directly, not on import.
if __name__ == "__main__":
    main()