mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-20 04:21:23 +00:00
- Add hover_at, type_at, drag_at coordinate tools to server - Add hoverAt, typeAt, dragAt methods to Browser class - Export server internals (browser, tool-loop, registry) for eval imports - Copy eval app from enterprise repo with agents, graders, runner, dashboard - Nest eval-targets inside apps/eval - Adapt sessionExecutionDir → workingDir for current server API - Add biome ignore for dashboard HTML to prevent lint breaking onclick handlers
157 lines
4.9 KiB
Python
Vendored
157 lines
4.9 KiB
Python
Vendored
"""
|
|
Build BrowseComp eval datasets (decrypted, 2 difficulty tiers).
|
|
Decryption XORs each field with a repeating SHA-256 digest of the row's canary field (scheme from OpenAI's simple-evals).
|
|
|
|
Usage: python3 apps/eval/scripts/build-browsecomp-sets.py
|
|
"""
|
|
import base64
|
|
import csv
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import random
|
|
import urllib.request
|
|
from collections import defaultdict
|
|
|
|
# Directory that receives the generated JSONL sets.
OUT_DIR = "apps/eval/data"

# Remote location of the encrypted BrowseComp CSV, and where it is cached locally.
BC_URL = "https://openaipublic.blob.core.windows.net/simple-evals/browse_comp_test_set.csv"
BC_CACHE = "apps/eval/data/webbench/browsecomp.csv"

# Fixed seed so the shuffles performed during sampling are reproducible.
random.seed(42)
|
|
|
|
|
|
def derive_key(password: str, length: int) -> bytes:
    """Stretch the SHA-256 digest of ``password`` to ``length`` bytes.

    The digest is repeated as many whole times as fit in ``length``; a
    prefix of the digest then fills whatever remainder is left.
    """
    digest = hashlib.sha256(password.encode()).digest()
    whole_copies, leftover = divmod(length, len(digest))
    return digest * whole_copies + digest[:leftover]
|
|
|
|
|
|
def decrypt(ciphertext_b64: str, password: str) -> str:
    """Decode a base64 ciphertext and XOR it against a key derived from ``password``.

    The key comes from ``derive_key`` and is requested at the same length as
    the decoded ciphertext. The XOR result is decoded to text with the
    default codec of ``bytes.decode``.
    """
    raw = base64.b64decode(ciphertext_b64)
    keystream = derive_key(password, len(raw))
    plain = bytearray(len(raw))
    for pos, (cipher_byte, key_byte) in enumerate(zip(raw, keystream)):
        plain[pos] = cipher_byte ^ key_byte
    return plain.decode()
|
|
|
|
|
|
def stratified_sample_by_topic(tasks, n):
    """Pick up to ``n`` tasks, cycling through topics so each is represented.

    Tasks are bucketed by ``task["metadata"]["additional"]["topic"]`` and each
    bucket is shuffled with the module-level ``random`` state. Topics are then
    visited in sorted order, taking one task per topic per round, until ``n``
    tasks are collected or every bucket is used up.
    """
    buckets = defaultdict(list)
    for task in tasks:
        buckets[task["metadata"]["additional"]["topic"]].append(task)

    # Shuffle in bucket-insertion order so the random stream is consumed
    # in a fixed sequence for a given input.
    for bucket in buckets.values():
        random.shuffle(bucket)

    ordered_buckets = [buckets[topic] for topic in sorted(buckets)]

    picked = []
    depth = 0  # index of the element taken from every bucket in this round
    while len(picked) < n:
        progressed = False
        for bucket in ordered_buckets:
            if len(picked) >= n:
                break
            if depth < len(bucket):
                picked.append(bucket[depth])
                progressed = True
        if not progressed:
            # Every bucket is exhausted; fewer than n tasks are available.
            break
        depth += 1

    return picked
|
|
|
|
|
|
# Download if needed
if not os.path.exists(BC_CACHE):
    print("Downloading BrowseComp dataset...")
    # urlretrieve opens the target path directly and does not create missing
    # parent directories, so make sure the cache directory exists first.
    cache_dir = os.path.dirname(BC_CACHE)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    urllib.request.urlretrieve(BC_URL, BC_CACHE)

# Load and decrypt
tasks = []
with open(BC_CACHE, newline="", encoding="utf-8") as f:
    for i, row in enumerate(csv.DictReader(f)):
        # The problem and answer columns are both decrypted with the row's canary.
        canary = row["canary"]
        problem = decrypt(row["problem"], canary)
        answer = decrypt(row["answer"], canary)
        topic = row["problem_topic"]
        query_len = len(problem)

        # Difficulty based on query length (more constraints = harder)
        if query_len < 450:
            difficulty = "medium"
        elif query_len < 700:
            difficulty = "hard"
        else:
            difficulty = "very-hard"

        tasks.append({
            "query_id": f"bc-{i}",
            "dataset": "browsecomp",
            "query": problem,
            "start_url": "https://www.google.com/",
            "metadata": {
                "original_task_id": f"bc-{i}",
                "website": "google.com",
                "category": "information-retrieval",
                "additional": {
                    "topic": topic,
                    "difficulty": difficulty,
                    "answer": answer,
                },
            },
        })

print(f"Loaded {len(tasks)} BrowseComp tasks (decrypted)")

# Difficulty distribution
diffs = defaultdict(int)
for t in tasks:
    diffs[t["metadata"]["additional"]["difficulty"]] += 1
print(f"Difficulty: {dict(sorted(diffs.items()))}")

# Topic distribution
topics = defaultdict(int)
for t in tasks:
    topics[t["metadata"]["additional"]["topic"]] += 1
print(f"Topics: {dict(sorted(topics.items()))}")

# Build medium-hard set: medium + hard tasks
mh_pool = [t for t in tasks if t["metadata"]["additional"]["difficulty"] in ("medium", "hard")]
bc_medium_hard = stratified_sample_by_topic(mh_pool, 50)

# Build very-hard set: very-hard + remaining hard tasks
vh_pool = [t for t in tasks if t["metadata"]["additional"]["difficulty"] == "very-hard"]
# query_id is built from the row index and is therefore unique per task, so
# excluding by id selects the same tasks as comparing whole task dicts against
# the medium-hard list, without the repeated deep comparisons.
mh_ids = {t["query_id"] for t in bc_medium_hard}
hard_remaining = [
    t for t in tasks
    if t["metadata"]["additional"]["difficulty"] == "hard" and t["query_id"] not in mh_ids
]
vh_pool.extend(hard_remaining)
bc_very_hard = stratified_sample_by_topic(vh_pool, 50)
|
|
|
|
# Write files
|
|
def write_jsonl(data, path):
    """Write each item of ``data`` to ``path`` as one JSON document per line.

    Args:
        data: Iterable of JSON-serializable objects.
        path: Destination file; it is created or truncated.

    The file is opened as UTF-8 explicitly. ``ensure_ascii=False`` emits
    non-ASCII characters verbatim, so relying on the platform default
    encoding could raise ``UnicodeEncodeError`` or produce a file that is
    not valid UTF-8.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(t, ensure_ascii=False) + "\n" for t in data)
|
|
|
|
|
|
# Destination paths for the two generated sets.
mh_path = os.path.join(OUT_DIR, "browsecomp-medium-hard-50.jsonl")
vh_path = os.path.join(OUT_DIR, "browsecomp-very-hard-50.jsonl")
for out_path, dataset in ((mh_path, bc_medium_hard), (vh_path, bc_very_hard)):
    write_jsonl(dataset, out_path)

# Print stats
reports = {
    "browsecomp-medium-hard-50": bc_medium_hard,
    "browsecomp-very-hard-50": bc_very_hard,
}
for name, data in reports.items():
    # Tally difficulty labels and topics in a single pass over the set.
    level_counts = defaultdict(int)
    topic_counts = defaultdict(int)
    for task in data:
        extra = task["metadata"]["additional"]
        level_counts[extra["difficulty"]] += 1
        topic_counts[extra["topic"]] += 1
    print(f"\n{name}: {len(data)} tasks")
    print(f" difficulty: {dict(sorted(level_counts.items()))}")
    print(f" topics: {dict(sorted(topic_counts.items()))}")
    # Show first 2 samples
    for sample in data[:2]:
        print(f" [{sample['query_id']}] {sample['metadata']['additional']['topic']}")
        print(f" Q: {sample['query'][:150]}")
        print(f" A: {sample['metadata']['additional']['answer']}")
|