mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-20 04:21:23 +00:00
* feat: deterministic eval graders (AGI SDK + WebArena-Infinity) (#664) * feat: add deterministic eval graders (AGI SDK + WebArena-Infinity) Two new benchmark integrations with programmatic grading — no LLM judge. AGI SDK / REAL Bench (52 tasks): - 11 React/Next.js clones of consumer apps (DoorDash, Amazon, Gmail, etc.) - Grader navigates browser to /finish, extracts state diff from <pre> tag - Python verifier checks exact values via jmespath queries WebArena-Infinity (50 hard tasks): - 13 LLM-generated SaaS clones (Gmail, GitLab, Linear, Figma, etc.) - InfinityAppManager starts fresh app server per task per worker - Python verifier calls /api/state and asserts on JSON state Infrastructure: - GraderInput extended with mcpUrl + infinityAppUrl for parallel workers - Each worker gets isolated ports (no cross-worker state contamination) - CI workflow: pip install agisdk, clone webarena-infinity repo * chore: switch eval configs back to kimi-k2p5 * fix: register deterministic graders in pass rate calculation Add agisdk_state_diff and infinity_state to PASS_FAIL_GRADER_ORDER in both runner types and weekly report script, so scores show correctly in the dashboard. * chore: temp switch to opus 4.6 for eval run * chore: restore kimi-k2p5 as default eval config * ci: add timeout and continue-on-error for trend report step * fix(eval): drop omnizon from AGISDK dataset (DMCA takedown) evals-omnizon.vercel.app returns HTTP 451 ("This content has been blocked for legal reasons / DMCA_TAKEDOWN"). All 5 omnizon-* tasks fail grading with "Failed to fetch /finish endpoint: JSON Parse error". Adds an EXCLUDED_WEBSITES set to the dataset builder and regenerates agisdk-real.jsonl (52 → 47 tasks). * fix(eval): correct Infinity port-assignment bugs Two related bugs in the Infinity eval runner that cause silent port collisions / fallbacks under parallel execution: 1. build-infinity-dataset.py emitted "app_port" but task-executor and the committed JSONL both read "app_base_port". 
Re-running the build script would silently make every task fall back to the 8000 default, ignoring per-app port assignments. Renamed the key to match. 2. task-executor derived workerIndex as `base_server_port - 9110`, but parallel-executor doesn't override base_server_port per worker — only server_url. Every worker computed workerIndex = 0, causing all parallel workers to spawn Infinity app servers on the same port. Threading workerIndex explicitly through TaskExecutor instead. Also drops an unused app_name parameter from load_tasks().
119 lines
3.3 KiB
Python
Vendored
119 lines
3.3 KiB
Python
Vendored
#!/usr/bin/env python3
|
|
"""
|
|
Dataset generator for WebArena-Infinity benchmark.
|
|
|
|
Reads real-tasks.json from each app directory and outputs JSONL
|
|
in the eval framework's TaskSchema format.
|
|
|
|
Usage:
|
|
python build-infinity-dataset.py --apps-dir /path/to/webarena-infinity/apps
|
|
python build-infinity-dataset.py --apps-dir /path/to/apps --apps gmail linear --difficulty medium
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
|
|
|
|
def load_tasks(app_dir: str) -> list[dict]:
    """Load the task definitions stored in an app directory.

    Args:
        app_dir: Path to a single app directory that is expected to contain
            a ``real-tasks.json`` file.

    Returns:
        The parsed JSON content of ``real-tasks.json``, or an empty list
        (after printing a warning to stderr) when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    tasks_file = os.path.join(app_dir, "real-tasks.json")
    if not os.path.exists(tasks_file):
        print(f"Warning: No real-tasks.json found in {app_dir}", file=sys.stderr)
        return []
    # Decode explicitly as UTF-8 (the JSON interchange encoding) so that task
    # text with non-ASCII characters loads the same way regardless of the
    # platform's locale default.
    with open(tasks_file, encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
|
def build_task_entry(
    app_name: str,
    task: dict,
    base_port: int,
) -> dict:
    """Convert one raw app task into an eval-framework dataset entry.

    Args:
        app_name: Name of the app the task belongs to; used in the ids and
            in the metadata.
        task: Raw task dict. The id is read from ``id`` or ``task_id``, the
            prompt from ``query``, ``instruction`` or ``task``, and the
            verifier location from ``verify`` or ``verifier_path``.
        base_port: Port embedded in the start URL, the reset URL and the
            ``app_base_port`` metadata field.

    Returns:
        A dict ready to be serialized as a single JSONL line.
    """

    def first_present(keys, fallback):
        # Return the value of the first key that exists in the task, even
        # when that value is None/empty; otherwise use the fallback.
        for key in keys:
            if key in task:
                return task[key]
        return fallback

    task_id = first_present(("id", "task_id"), "unknown")
    prompt = first_present(("query", "instruction", "task"), "")
    verifier = first_present(
        ("verify", "verifier_path"),
        f"real-tasks/{task_id}.py",
    )
    level = task.get("difficulty", "unknown")

    origin = f"http://localhost:{base_port}"

    additional = {
        "app_name": app_name,
        "difficulty": level,
        "verifier_path": verifier,
        "app_base_port": base_port,
    }
    metadata = {
        "original_task_id": f"{app_name}-{task_id}",
        "website": app_name,
        "category": "webarena-infinity",
        "additional": additional,
    }

    entry = {
        "query_id": f"infinity-{app_name}-{task_id}",
        "dataset": "webarena-infinity",
        "query": prompt,
        "graders": ["infinity_state"],
        "start_url": origin,
        "setup_script": f"POST {origin}/api/reset",
        "metadata": metadata,
    }
    return entry
|
|
|
|
|
|
def main():
    """Command-line entry point: print one JSONL dataset entry per task.

    Parses the CLI options, scans the apps directory in sorted order, and
    writes each selected task to stdout as a JSON line. Every app directory
    that is visited gets its own port, counted upward from ``--base-port``.
    Exits with status 1 when ``--apps-dir`` is not a directory.
    """
    cli = argparse.ArgumentParser(
        description="Generate JSONL dataset from WebArena-Infinity apps"
    )
    cli.add_argument(
        "--apps-dir",
        required=True,
        help="Path to webarena-infinity/apps/ directory",
    )
    cli.add_argument(
        "--apps",
        nargs="*",
        default=None,
        help="Filter to specific app names (default: all)",
    )
    cli.add_argument(
        "--difficulty",
        choices=["easy", "medium", "hard"],
        default=None,
        help="Filter by difficulty tier",
    )
    cli.add_argument(
        "--base-port",
        type=int,
        default=8000,
        help="Starting port number for apps (default: 8000)",
    )
    args = cli.parse_args()

    apps_root = args.apps_dir
    if not os.path.isdir(apps_root):
        print(f"Error: {apps_root} is not a directory", file=sys.stderr)
        sys.exit(1)

    names = sorted(os.listdir(apps_root))
    if args.apps:
        # An omitted or empty --apps list means "no filter".
        names = [name for name in names if name in args.apps]

    # Only real directories count as apps; plain files never consume a port.
    app_names = [
        name for name in names if os.path.isdir(os.path.join(apps_root, name))
    ]

    wanted = args.difficulty
    # Each app directory takes the next port, whether or not it yields tasks.
    for port, app_name in enumerate(app_names, start=args.base_port):
        for task in load_tasks(os.path.join(apps_root, app_name)):
            level = task.get("difficulty", "unknown")
            if not wanted or level == wanted:
                print(json.dumps(build_task_entry(app_name, task, port)))


if __name__ == "__main__":
    main()
|