"""
|
|
Build WebBench eval datasets (READ-only, no-auth) + BrowseComp sets.
|
|
|
|
WebBench: 6 files — full + 50-sample for 0/4, 1/4, 2/4 buckets (READ-only).
|
|
BrowseComp: 2 files — medium-hard (~50) and very-hard (~50).
|
|
|
|
Usage: python3 apps/eval/scripts/build-webbench-sets.py
|
|
"""

import csv
import json
import os
import random
import re
from collections import defaultdict
from urllib.parse import urlparse
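
# Fixed RNG seed so repeated runs sample the same tasks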
random.seed(42)

DATA_DIR = "apps/eval/data/webbench"
OUT_DIR = "apps/eval/data"
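
# One entry per reference agent's WebBench run; "eval_col" names the CSV
# column holding that agent's per-task verdict (parsed for "success" below).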
AGENTS = [
    {"file": "anthropicfinal.csv", "eval_col": "Anthropic_Eval", "key": "anthropic_cua"},
    {"file": "skyvern2.0final.csv", "eval_col": "Skyvern2.0Eval", "key": "skyvern_2"},
    {"file": "skyvern2.0browserbasefinal.csv", "eval_col": "Browserbase_SkyvernEval", "key": "skyvern_bb"},
    {"file": "openaicuafinal.csv", "eval_col": "CUAEval", "key": "openai_cua"},
]
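
# Strips the boilerplate constraint WebBench appends to every task, e.g.
# (illustrative): "Find the return policy. Only use https://example.com to
# achieve the task." -> "Find the return policy."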
CONSTRAINT_SUFFIX = re.compile(
    r"\s*Only use https?://\S+ to achieve the task\..*$", re.DOTALL
)

# Only these keywords indicate actual auth requirements in READ tasks
# (exclude "publish" — it's almost always about finding published articles)
AUTH_KEYWORDS_STRICT = [
    "log in", "login", "sign in", "signin", "sign up", "signup",
    "your account", "your profile", "your wishlist", "your order",
    "your cart", "your dashboard", "your settings", "your subscription",
    "your inbox", "your message", "your review", "your playlist",
    "your favorites", "your saved", "your address", "your payment",
    "my account", "my profile", "my wishlist", "my order", "my cart",
    "my dashboard", "my settings", "my subscription", "my inbox",
]


def load_agent(agent):
    """Load one agent's WebBench results CSV, keyed by integer task ID."""
    path = os.path.join(DATA_DIR, agent["file"])
    results = {}
    with open(path, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            try:
                task_id = int(row["ID"])
            except (ValueError, KeyError):
                continue
            eval_val = row.get(agent["eval_col"], "")
            results[task_id] = {
                "eval": eval_val,
                "difficulty": row.get("Difficulty", ""),
                "category": row.get("Category", ""),
                "task": row.get("Task", ""),
                "url": row.get("Starting URL", ""),
            }
    return results


def extract_domain(url):
    parsed = urlparse(url)
    host = parsed.hostname or ""
    return re.sub(r"^www\.", "", host)
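
# e.g. extract_domain("https://www.example.com/shop") -> "example.com"  (illustrative URL)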


def clean_query(task_text):
    return CONSTRAINT_SUFFIX.sub("", task_text).strip()


def needs_auth(task_text):
    task_lower = task_text.lower()
    for kw in AUTH_KEYWORDS_STRICT:
        if kw in task_lower:
            return True
    return False
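
# Case-insensitive substring check, e.g. needs_auth("Sign in to see your order status") -> True  (illustrative)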


def build_task_entry(tid, info, pass_count, agent_evals):
    domain = extract_domain(info["url"])
    return {
        "query_id": f"wb-{tid}",
        "dataset": "webbench",
        "query": clean_query(info["task"]),
        "start_url": info["url"],
        "metadata": {
            "original_task_id": f"wb-{tid}",
            "website": domain,
            "category": info["category"],
            "additional": {
                "webbench_id": tid,
                "difficulty": info["difficulty"],
                "pass_count_4": pass_count,
                "agent_results": agent_evals,
            },
        },
    }
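
# A row in the output JSONL (illustrative shape):
#   {"query_id": "wb-123", "dataset": "webbench", "query": "...", "start_url": "https://...",
#    "metadata": {..., "additional": {"pass_count_4": 1, "agent_results": {...}, ...}}}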


def stratified_sample(tasks, n):
    """Sample n tasks with diversity across difficulty, category, and website (max 2 per domain)."""
    if len(tasks) <= n:
        return tasks

    # Group by difficulty so each level is represented
    groups = defaultdict(list)
    for t in tasks:
        diff = t["metadata"]["additional"]["difficulty"]
        groups[diff].append(t)

    selected = []
    domain_counts = defaultdict(int)

    group_keys = sorted(groups.keys())
    for key in group_keys:
        random.shuffle(groups[key])

    # Shared iterators let each round-robin pass resume where the last left off
    group_iters = {key: iter(groups[key]) for key in group_keys}

    while len(selected) < n:
        added_this_round = False
        for key in group_keys:
            if len(selected) >= n:
                break
            it = group_iters[key]
            # Take the next task from this group that fits the 2-per-domain cap
            for t in it:
                domain = t["metadata"]["website"]
                if domain_counts[domain] < 2:
                    selected.append(t)
                    domain_counts[domain] += 1
                    added_this_round = True
                    break
        if not added_this_round:
            # All groups exhausted under the domain cap: fill the remainder
            # from whatever is left, ignoring the cap
            remaining = [t for t in tasks if t not in selected]
            random.shuffle(remaining)
            for t in remaining:
                if len(selected) >= n:
                    break
                if t not in selected:
                    selected.append(t)

    return selected[:n]


def write_jsonl(tasks, path):
    # ensure_ascii=False emits raw Unicode, so write explicitly as UTF-8
    with open(path, "w", encoding="utf-8") as f:
        for t in tasks:
            f.write(json.dumps(t, ensure_ascii=False) + "\n")


def print_stats(name, tasks):
    cats = defaultdict(int)
    diffs = defaultdict(int)
    domains = set()
    for t in tasks:
        cats[t["metadata"].get("category", t["metadata"].get("additional", {}).get("topic", "?"))] += 1
        diff = t["metadata"].get("additional", {}).get("difficulty", "?")
        diffs[diff] += 1
        domains.add(t["metadata"].get("website", "?"))
    cat_str = ", ".join(f"{c}({n})" for c, n in sorted(cats.items(), key=lambda x: -x[1]))
    diff_str = ", ".join(f"{d}({n})" for d, n in sorted(diffs.items(), key=lambda x: -x[1]))
    print(f" {name}: {len(tasks)} tasks | {len(domains)} websites")
    print(f"   difficulty: {diff_str}")
    if cat_str:
        print(f"   categories: {cat_str}")


# ══════════════════════════════════════════════════════════════════════
# PART 1: WebBench READ-only datasets
# ══════════════════════════════════════════════════════════════════════
print("=" * 60)
print("PART 1: WebBench READ-only datasets")
print("=" * 60)

print("\nLoading agents...")
agent_results = {}
for agent in AGENTS:
    agent_results[agent["key"]] = load_agent(agent)
    print(f" {agent['key']}: {len(agent_results[agent['key']])} tasks")

# Keep only task IDs present in every agent's results
all_ids = set(agent_results[AGENTS[0]["key"]].keys())
for agent in AGENTS[1:]:
    all_ids &= set(agent_results[agent["key"]].keys())

buckets = defaultdict(list)
skipped_non_read = 0
skipped_auth = 0
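
# Bucket each READ-only, no-auth task by how many of the four reference agents
# passed it: 0/4 (no agent solved it, hardest) through 4/4 (all solved it).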

for tid in sorted(all_ids):
    info = agent_results[AGENTS[0]["key"]][tid]

    # READ-only filter
    if info["category"] != "READ":
        skipped_non_read += 1
        continue

    # Auth filter
    if needs_auth(info["task"]):
        skipped_auth += 1
        continue

    pass_count = 0
    agent_evals = {}
    for agent in AGENTS:
        r = agent_results[agent["key"]][tid]
        is_success = "success" in r["eval"].lower() if r["eval"] else False
        if is_success:
            pass_count += 1
        agent_evals[agent["key"]] = "PASS" if is_success else "FAIL"

    entry = build_task_entry(tid, info, pass_count, agent_evals)
    buckets[pass_count].append(entry)

print(f"\nFiltered: {skipped_non_read} non-READ, {skipped_auth} auth-required")
print("READ-only buckets:")
for pc in range(5):
    print(f" {pc}/4: {len(buckets[pc])} tasks")

# Build 6 WebBench datasets
for pc in [0, 1, 2]:
    full = buckets[pc]
    sampled = stratified_sample(full, 50)

    full_path = os.path.join(OUT_DIR, f"webbench-{pc}of4.jsonl")
    sample_path = os.path.join(OUT_DIR, f"webbench-{pc}of4-50.jsonl")

    write_jsonl(full, full_path)
    write_jsonl(sampled, sample_path)

    print(f"\n{'─' * 40}")
    print_stats(f"webbench-{pc}of4 (full)", full)
    print_stats(f"webbench-{pc}of4-50 (sampled)", sampled)


# ══════════════════════════════════════════════════════════════════════
# PART 2: BrowseComp datasets
# ══════════════════════════════════════════════════════════════════════
print(f"\n{'=' * 60}")
print("PART 2: BrowseComp datasets")
print("=" * 60)

browsecomp_path = os.path.join(DATA_DIR, "browsecomp.csv")
if not os.path.exists(browsecomp_path):
    print("\n Downloading BrowseComp dataset...")
    import urllib.request

    url = "https://openaipublic.blob.core.windows.net/simple-evals/browse_comp_test_set.csv"
    urllib.request.urlretrieve(url, browsecomp_path)
    print(f" Saved to {browsecomp_path}")

# Load BrowseComp
bc_tasks = []
with open(browsecomp_path, newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for i, row in enumerate(reader):
        topic = row.get("problem_topic", "Other")
        bc_tasks.append({
            "query_id": f"bc-{i}",
            "dataset": "browsecomp",
            "query": row.get("problem", ""),
            "start_url": "https://www.google.com/",
            "metadata": {
                "original_task_id": f"bc-{i}",
                "website": "google.com",
                "category": "information-retrieval",
                "additional": {
                    "topic": topic,
                    "answer_length": len(row.get("answer", "")),
                },
            },
        })

print(f"\nLoaded {len(bc_tasks)} BrowseComp tasks")

# Categorize difficulty by answer_length and query complexity:
# shorter answers + shorter queries = relatively easier;
# longer answers + longer queries = harder.
for t in bc_tasks:
    query_len = len(t["query"])
    ans_len = t["metadata"]["additional"]["answer_length"]
    # Simple heuristic: longer query = more constraints = harder
    if query_len < 600 and ans_len < 50:
        t["metadata"]["additional"]["difficulty"] = "medium"
    elif query_len < 1000:
        t["metadata"]["additional"]["difficulty"] = "hard"
    else:
        t["metadata"]["additional"]["difficulty"] = "very-hard"

diffs = defaultdict(int)
for t in bc_tasks:
    diffs[t["metadata"]["additional"]["difficulty"]] += 1
print(f"Difficulty distribution: {dict(diffs)}")

# Topics
topics = defaultdict(int)
for t in bc_tasks:
    topics[t["metadata"]["additional"]["topic"]] += 1
print(f"Topics: {dict(topics)}")

# Build medium-hard set: sample from medium + hard
medium_hard_pool = [t for t in bc_tasks if t["metadata"]["additional"]["difficulty"] in ("medium", "hard")]
random.shuffle(medium_hard_pool)

# Stratify by topic
topic_groups = defaultdict(list)
for t in medium_hard_pool:
    topic_groups[t["metadata"]["additional"]["topic"]].append(t)

bc_medium_hard = []
topic_keys = sorted(topic_groups.keys())
for key in topic_keys:
    random.shuffle(topic_groups[key])

# Round-robin one task per topic until 50 are selected; exhausted topics stop
# contributing (StopIteration), and the loop ends early if every topic runs dry.
topic_iters = {key: iter(topic_groups[key]) for key in topic_keys}
while len(bc_medium_hard) < 50:
    added = False
    for key in topic_keys:
        if len(bc_medium_hard) >= 50:
            break
        try:
            bc_medium_hard.append(next(topic_iters[key]))
            added = True
        except StopIteration:
            continue
    if not added:
        break

# Build very-hard set: sample from very-hard + remaining hard
very_hard_pool = [t for t in bc_tasks if t["metadata"]["additional"]["difficulty"] == "very-hard"]
# Add hard tasks not already selected
hard_remaining = [t for t in bc_tasks if t["metadata"]["additional"]["difficulty"] == "hard" and t not in bc_medium_hard]
very_hard_pool.extend(hard_remaining)
random.shuffle(very_hard_pool)

topic_groups2 = defaultdict(list)
for t in very_hard_pool:
    topic_groups2[t["metadata"]["additional"]["topic"]].append(t)

bc_very_hard = []
topic_keys2 = sorted(topic_groups2.keys())
for key in topic_keys2:
    random.shuffle(topic_groups2[key])

topic_iters2 = {key: iter(topic_groups2[key]) for key in topic_keys2}
while len(bc_very_hard) < 50:
    added = False
    for key in topic_keys2:
        if len(bc_very_hard) >= 50:
            break
        try:
            bc_very_hard.append(next(topic_iters2[key]))
            added = True
        except StopIteration:
            continue
    if not added:
        break

# Write BrowseComp files
bc_mh_path = os.path.join(OUT_DIR, "browsecomp-medium-hard-50.jsonl")
bc_vh_path = os.path.join(OUT_DIR, "browsecomp-very-hard-50.jsonl")
write_jsonl(bc_medium_hard, bc_mh_path)
write_jsonl(bc_very_hard, bc_vh_path)

print(f"\n{'─' * 40}")
print_stats("browsecomp-medium-hard-50", bc_medium_hard)
print_stats("browsecomp-very-hard-50", bc_very_hard)


# ══════════════════════════════════════════════════════════════════════
# SUMMARY
# ══════════════════════════════════════════════════════════════════════
print(f"\n{'=' * 60}")
print("ALL FILES WRITTEN")
print("=" * 60)
files = [
    "webbench-0of4.jsonl", "webbench-0of4-50.jsonl",
    "webbench-1of4.jsonl", "webbench-1of4-50.jsonl",
    "webbench-2of4.jsonl", "webbench-2of4-50.jsonl",
    "browsecomp-medium-hard-50.jsonl", "browsecomp-very-hard-50.jsonl",
]
for f in files:
    path = os.path.join(OUT_DIR, f)
    with open(path) as fh:
        count = sum(1 for _ in fh)
    print(f" {f}: {count} tasks")