mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
* feat: add deterministic eval graders (AGI SDK + WebArena-Infinity) Two new benchmark integrations with programmatic grading — no LLM judge. AGI SDK / REAL Bench (52 tasks): - 11 React/Next.js clones of consumer apps (DoorDash, Amazon, Gmail, etc.) - Grader navigates browser to /finish, extracts state diff from <pre> tag - Python verifier checks exact values via jmespath queries WebArena-Infinity (50 hard tasks): - 13 LLM-generated SaaS clones (Gmail, GitLab, Linear, Figma, etc.) - InfinityAppManager starts fresh app server per task per worker - Python verifier calls /api/state and asserts on JSON state Infrastructure: - GraderInput extended with mcpUrl + infinityAppUrl for parallel workers - Each worker gets isolated ports (no cross-worker state contamination) - CI workflow: pip install agisdk, clone webarena-infinity repo * chore: switch eval configs back to kimi-k2p5 * fix: register deterministic graders in pass rate calculation Add agisdk_state_diff and infinity_state to PASS_FAIL_GRADER_ORDER in both runner types and weekly report script, so scores show correctly in the dashboard. * chore: temp switch to opus 4.6 for eval run * chore: restore kimi-k2p5 as default eval config * ci: add timeout and continue-on-error for trend report step
83 lines
2.4 KiB
Python
Vendored
83 lines
2.4 KiB
Python
Vendored
#!/usr/bin/env python3
|
|
"""
|
|
Evaluation helper for WebArena-Infinity verifier scripts.

Reads a JSON object from stdin with app_server_url, verifier_path, and task_id
(only app_server_url and verifier_path are read by this script).
Runs the verifier against the app server and outputs a JSON result.

Verifiers have the signature: verify(server_url: str) -> tuple[bool, str]
They fetch /api/state internally and return (passed, message).

Usage:
    echo '{"app_server_url": "http://localhost:8000", "verifier_path": "/path/to/verify.py"}' | python infinity-evaluate.py
|
|
"""
|
|
|
|
import importlib.util
|
|
import json
|
|
import sys
|
|
import traceback
|
|
|
|
|
|
def load_verifier(verifier_path: str):
    """Import the verifier script at ``verifier_path`` and return its module.

    The script is loaded under the fixed module name ``"verifier"``. The
    module is registered in ``sys.modules`` *before* its code runs, as in the
    importlib recipe for importing a source file directly, so verifier code
    that looks itself up through ``sys.modules[__name__]`` works.

    Args:
        verifier_path: Filesystem path of the Python verifier script.

    Returns:
        The executed module object.

    Raises:
        ImportError: If importlib cannot build a spec/loader for the path.
        Exception: Whatever the verifier's top-level code raises propagates
            unchanged; the ``sys.modules`` entry is rolled back first.
    """
    module_name = "verifier"
    spec = importlib.util.spec_from_file_location(module_name, verifier_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load verifier from {verifier_path}")
    module = importlib.util.module_from_spec(spec)

    # Remember any earlier entry so a failed load does not leave a
    # half-initialised module behind or clobber a previous one.
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module
|
|
|
|
|
|
def _emit_result(passed: bool, message: str) -> None:
    """Print one result object as a single JSON line on stdout."""
    print(json.dumps({
        "pass": passed,
        "reward": 1.0 if passed else 0.0,
        "message": message,
    }))


def main():
    """Run one verifier against an app server and print a JSON result.

    Reads a JSON object from stdin, takes ``app_server_url`` and
    ``verifier_path`` from it, loads the verifier module, calls its
    ``verify(server_url)`` function and prints
    ``{"pass": bool, "reward": float, "message": str}`` on stdout.

    Failure cases print a result with ``pass`` false and exit with status 1:
    stdin is not valid JSON, the JSON value is not an object, a required key
    is missing or empty, or loading/running the verifier raises. When the
    verifier runs to completion the result is printed and the function
    returns normally, whether the verdict is pass or fail.
    """
    import contextlib  # local import: only needed while the verifier runs

    try:
        data = json.loads(sys.stdin.read())
    except json.JSONDecodeError as e:
        _emit_result(False, f"Invalid JSON input: {e}")
        sys.exit(1)

    # Valid JSON that is not an object (list, string, number, null) has no
    # .get(); report it as bad input instead of crashing with a traceback.
    if not isinstance(data, dict):
        _emit_result(
            False,
            f"Invalid JSON input: expected an object, got {type(data).__name__}",
        )
        sys.exit(1)

    server_url = data.get("app_server_url", "")
    verifier_path = data.get("verifier_path", "")

    if not server_url or not verifier_path:
        _emit_result(False, "Missing app_server_url or verifier_path")
        sys.exit(1)

    try:
        # Anything the verifier prints goes to stderr so that stdout carries
        # only the JSON result line.
        with contextlib.redirect_stdout(sys.stderr):
            verifier = load_verifier(verifier_path)
            fn = getattr(verifier, "verify", None)
            if not callable(fn):
                raise AttributeError(
                    f"Verifier has no verify() function. "
                    f"Available: {[a for a in dir(verifier) if not a.startswith('_')]}"
                )

            # Verifiers take server_url and fetch state internally
            result = fn(server_url)

        # Expected return is tuple[bool, str]; anything else is judged by its
        # truthiness. The verdict is coerced to a real bool in both branches
        # so the emitted "pass" is always a JSON boolean and always
        # serialisable.
        if isinstance(result, tuple) and len(result) >= 2:
            passed, message = bool(result[0]), str(result[1])
        else:
            passed, message = bool(result), str(result)

    except Exception as e:
        _emit_result(False, f"Verifier error: {e}\n{traceback.format_exc()}")
        sys.exit(1)

    _emit_result(passed, message)
|
|
|
|
|
|
# Script entry point: run the evaluation only when executed directly,
# not when this file is imported as a module.
if __name__ == "__main__":
    main()
|