BrowserOS/packages/browseros-agent/apps/eval/scripts/build-webbench-sets.py
shivammittal274 29056226bb feat: add eval framework and coordinate-based input tools (#453)
- Add hover_at, type_at, drag_at coordinate tools to server
- Add hoverAt, typeAt, dragAt methods to Browser class
- Export server internals (browser, tool-loop, registry) for eval imports
- Copy eval app from enterprise repo with agents, graders, runner, dashboard
- Nest eval-targets inside apps/eval
- Adapt sessionExecutionDir → workingDir for current server API
- Add biome ignore for dashboard HTML to prevent lint breaking onclick handlers
2026-03-16 23:12:23 +05:30


"""
Build WebBench eval datasets (READ-only, no-auth) + BrowseComp sets.
WebBench: 6 files — full + 50-sample for 0/4, 1/4, 2/4 buckets (READ-only).
BrowseComp: 2 files — medium-hard (~50) and very-hard (~50).
Usage: python3 apps/eval/scripts/build-webbench-sets.py
"""
import csv
import json
import os
import random
import re
from collections import defaultdict
from urllib.parse import urlparse
random.seed(42)
DATA_DIR = "apps/eval/data/webbench"
OUT_DIR = "apps/eval/data"
AGENTS = [
    {"file": "anthropicfinal.csv", "eval_col": "Anthropic_Eval", "key": "anthropic_cua"},
    {"file": "skyvern2.0final.csv", "eval_col": "Skyvern2.0Eval", "key": "skyvern_2"},
    {"file": "skyvern2.0browserbasefinal.csv", "eval_col": "Browserbase_SkyvernEval", "key": "skyvern_bb"},
    {"file": "openaicuafinal.csv", "eval_col": "CUAEval", "key": "openai_cua"},
]
CONSTRAINT_SUFFIX = re.compile(
    r"\s*Only use https?://\S+ to achieve the task\..*$", re.DOTALL
)
# Only these keywords indicate actual auth requirements in READ tasks
# (exclude "publish" — it's almost always about finding published articles)
AUTH_KEYWORDS_STRICT = [
    "log in", "login", "sign in", "signin", "sign up", "signup",
    "your account", "your profile", "your wishlist", "your order",
    "your cart", "your dashboard", "your settings", "your subscription",
    "your inbox", "your message", "your review", "your playlist",
    "your favorites", "your saved", "your address", "your payment",
    "my account", "my profile", "my wishlist", "my order", "my cart",
    "my dashboard", "my settings", "my subscription", "my inbox",
]


def load_agent(agent):
    path = os.path.join(DATA_DIR, agent["file"])
    results = {}
    with open(path, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            try:
                task_id = int(row["ID"])
            except (ValueError, KeyError):
                continue
            eval_val = row.get(agent["eval_col"], "")
            results[task_id] = {
                "eval": eval_val,
                "difficulty": row.get("Difficulty", ""),
                "category": row.get("Category", ""),
                "task": row.get("Task", ""),
                "url": row.get("Starting URL", ""),
            }
    return results
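# Illustrative shape of the mapping load_agent returns (values are made up):
#   {42: {"eval": "Success", "difficulty": "easy", "category": "READ",
#         "task": "Find ...", "url": "https://example.com/"}, ...}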


def extract_domain(url):
    parsed = urlparse(url)
    host = parsed.hostname or ""
    return re.sub(r"^www\.", "", host)


def clean_query(task_text):
    return CONSTRAINT_SUFFIX.sub("", task_text).strip()
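# Example of what CONSTRAINT_SUFFIX strips (hypothetical task text):
#   clean_query("Find the cheapest flight. Only use https://example.com to achieve the task.")
#   -> "Find the cheapest flight."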


def needs_auth(task_text):
    task_lower = task_text.lower()
    return any(kw in task_lower for kw in AUTH_KEYWORDS_STRICT)
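# Illustrative behavior (hypothetical task strings):
#   needs_auth("Add this book to your cart") -> True  ("your cart" matches)
#   needs_auth("Find when the article was published") -> False
#   ("publish" is deliberately absent from AUTH_KEYWORDS_STRICT above)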


def build_task_entry(tid, info, pass_count, agent_evals):
    domain = extract_domain(info["url"])
    return {
        "query_id": f"wb-{tid}",
        "dataset": "webbench",
        "query": clean_query(info["task"]),
        "start_url": info["url"],
        "metadata": {
            "original_task_id": f"wb-{tid}",
            "website": domain,
            "category": info["category"],
            "additional": {
                "webbench_id": tid,
                "difficulty": info["difficulty"],
                "pass_count_4": pass_count,
                "agent_results": agent_evals,
            },
        },
    }
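# Illustrative output record (field values are made up):
#   {"query_id": "wb-42", "dataset": "webbench", "query": "Find ...",
#    "start_url": "https://example.com/page",
#    "metadata": {"original_task_id": "wb-42", "website": "example.com",
#                 "category": "READ",
#                 "additional": {"webbench_id": 42, "difficulty": "easy",
#                                "pass_count_4": 1,
#                                "agent_results": {"anthropic_cua": "PASS", ...}}}}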


def stratified_sample(tasks, n):
    """Sample n tasks with diversity across difficulty and website (max 2 per domain)."""
    if len(tasks) <= n:
        return tasks
    groups = defaultdict(list)
    for t in tasks:
        diff = t["metadata"]["additional"]["difficulty"]
        groups[diff].append(t)
    selected = []
    domain_counts = defaultdict(int)
    group_keys = sorted(groups.keys())
    for key in group_keys:
        random.shuffle(groups[key])
    group_iters = {key: iter(groups[key]) for key in group_keys}
    while len(selected) < n:
        added_this_round = False
        # Round-robin: take at most one task per difficulty group per round,
        # skipping tasks whose domain has already been picked twice.
        for key in group_keys:
            if len(selected) >= n:
                break
            it = group_iters[key]
            for t in it:
                domain = t["metadata"]["website"]
                if domain_counts[domain] < 2:
                    selected.append(t)
                    domain_counts[domain] += 1
                    added_this_round = True
                    break
        if not added_this_round:
            # All groups exhausted (or blocked by the domain cap):
            # top up from whatever is left, ignoring the cap.
            remaining = [t for t in tasks if t not in selected]
            random.shuffle(remaining)
            for t in remaining:
                if len(selected) >= n:
                    break
                if t not in selected:
                    selected.append(t)
    return selected[:n]
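# Usage sketch (hypothetical caller): given more than 50 tasks,
#   picks = stratified_sample(read_tasks, 50)
# returns exactly 50, and the result is deterministic across runs
# because of random.seed(42) at the top of the script.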


def write_jsonl(tasks, path):
    # ensure_ascii=False can emit non-ASCII, so pin the file encoding
    with open(path, "w", encoding="utf-8") as f:
        for t in tasks:
            f.write(json.dumps(t, ensure_ascii=False) + "\n")


def print_stats(name, tasks):
    cats = defaultdict(int)
    diffs = defaultdict(int)
    domains = set()
    for t in tasks:
        meta = t["metadata"]
        # BrowseComp entries carry their topic under "additional"; WebBench
        # entries only have "category". (The original category-first lookup
        # meant the topic fallback could never fire, since "category" is
        # always set.)
        cat = meta.get("additional", {}).get("topic") or meta.get("category", "?")
        cats[cat] += 1
        diff = meta.get("additional", {}).get("difficulty", "?")
        diffs[diff] += 1
        domains.add(meta.get("website", "?"))
    cat_str = ", ".join(f"{c}({n})" for c, n in sorted(cats.items(), key=lambda x: -x[1]))
    diff_str = ", ".join(f"{d}({n})" for d, n in sorted(diffs.items(), key=lambda x: -x[1]))
    print(f" {name}: {len(tasks)} tasks | {len(domains)} websites")
    print(f" difficulty: {diff_str}")
    if cat_str:
        print(f" categories: {cat_str}")


# ══════════════════════════════════════════════════════════════════════
# PART 1: WebBench READ-only datasets
# ══════════════════════════════════════════════════════════════════════
print("=" * 60)
print("PART 1: WebBench READ-only datasets")
print("=" * 60)
print("\nLoading agents...")
agent_results = {}
for agent in AGENTS:
    agent_results[agent["key"]] = load_agent(agent)
    print(f" {agent['key']}: {len(agent_results[agent['key']])} tasks")
all_ids = set(agent_results[AGENTS[0]["key"]].keys())
for agent in AGENTS[1:]:
    all_ids &= set(agent_results[agent["key"]].keys())
buckets = defaultdict(list)
skipped_non_read = 0
skipped_auth = 0
for tid in sorted(all_ids):
    info = agent_results[AGENTS[0]["key"]][tid]
    # READ-only filter
    if info["category"] != "READ":
        skipped_non_read += 1
        continue
    # Auth filter
    if needs_auth(info["task"]):
        skipped_auth += 1
        continue
    pass_count = 0
    agent_evals = {}
    for agent in AGENTS:
        r = agent_results[agent["key"]][tid]
        is_success = "success" in r["eval"].lower() if r["eval"] else False
        if is_success:
            pass_count += 1
        agent_evals[agent["key"]] = "PASS" if is_success else "FAIL"
    entry = build_task_entry(tid, info, pass_count, agent_evals)
    buckets[pass_count].append(entry)
print(f"\nFiltered: {skipped_non_read} non-READ, {skipped_auth} auth-required")
print("READ-only buckets:")
for pc in range(5):
    print(f" {pc}/4: {len(buckets[pc])} tasks")
# Build 6 WebBench datasets
for pc in [0, 1, 2]:
    full = buckets[pc]
    sampled = stratified_sample(full, 50)
    full_path = os.path.join(OUT_DIR, f"webbench-{pc}of4.jsonl")
    sample_path = os.path.join(OUT_DIR, f"webbench-{pc}of4-50.jsonl")
    write_jsonl(full, full_path)
    write_jsonl(sampled, sample_path)
    print(f"\n{'─' * 40}")
    print_stats(f"webbench-{pc}of4 (full)", full)
    print_stats(f"webbench-{pc}of4-50 (sampled)", sampled)


# ══════════════════════════════════════════════════════════════════════
# PART 2: BrowseComp datasets
# ══════════════════════════════════════════════════════════════════════
print(f"\n{'=' * 60}")
print("PART 2: BrowseComp datasets")
print("=" * 60)
browsecomp_path = os.path.join(DATA_DIR, "browsecomp.csv")
if not os.path.exists(browsecomp_path):
    print("\n Downloading BrowseComp dataset...")
    import urllib.request

    url = "https://openaipublic.blob.core.windows.net/simple-evals/browse_comp_test_set.csv"
    urllib.request.urlretrieve(url, browsecomp_path)
    print(f" Saved to {browsecomp_path}")
# Load BrowseComp
bc_tasks = []
with open(browsecomp_path, newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for i, row in enumerate(reader):
        topic = row.get("problem_topic", "Other")
        bc_tasks.append({
            "query_id": f"bc-{i}",
            "dataset": "browsecomp",
            "query": row.get("problem", ""),
            "start_url": "https://www.google.com/",
            "metadata": {
                "original_task_id": f"bc-{i}",
                "website": "google.com",
                "category": "information-retrieval",
                "additional": {
                    "topic": topic,
                    "answer_length": len(row.get("answer", "")),
                },
            },
        })
print(f"\nLoaded {len(bc_tasks)} BrowseComp tasks")
# Categorize difficulty by answer_length and query complexity
# Shorter answers + shorter queries = relatively easier
# Longer answers + longer queries = harder
for t in bc_tasks:
    query_len = len(t["query"])
    ans_len = t["metadata"]["additional"]["answer_length"]
    # Simple heuristic: longer query = more constraints = harder
    if query_len < 600 and ans_len < 50:
        t["metadata"]["additional"]["difficulty"] = "medium"
    elif query_len < 1000:
        t["metadata"]["additional"]["difficulty"] = "hard"
    else:
        t["metadata"]["additional"]["difficulty"] = "very-hard"
diffs = defaultdict(int)
for t in bc_tasks:
    diffs[t["metadata"]["additional"]["difficulty"]] += 1
print(f"Difficulty distribution: {dict(diffs)}")
# Topics
topics = defaultdict(int)
for t in bc_tasks:
    topics[t["metadata"]["additional"]["topic"]] += 1
print(f"Topics: {dict(topics)}")
# Build medium-hard set: sample from medium + hard
medium_hard_pool = [t for t in bc_tasks if t["metadata"]["additional"]["difficulty"] in ("medium", "hard")]
random.shuffle(medium_hard_pool)
# Stratify by topic
topic_groups = defaultdict(list)
for t in medium_hard_pool:
    topic_groups[t["metadata"]["additional"]["topic"]].append(t)
bc_medium_hard = []
topic_keys = sorted(topic_groups.keys())
for key in topic_keys:
    random.shuffle(topic_groups[key])
topic_iters = {key: iter(topic_groups[key]) for key in topic_keys}
# Round-robin: one task per topic per pass until 50 tasks or all pools empty
while len(bc_medium_hard) < 50:
    added = False
    for key in topic_keys:
        if len(bc_medium_hard) >= 50:
            break
        try:
            bc_medium_hard.append(next(topic_iters[key]))
            added = True
        except StopIteration:
            continue
    if not added:
        break
# Build very-hard set: sample from very-hard + remaining hard
very_hard_pool = [t for t in bc_tasks if t["metadata"]["additional"]["difficulty"] == "very-hard"]
# Add hard tasks not already selected
hard_remaining = [t for t in bc_tasks if t["metadata"]["additional"]["difficulty"] == "hard" and t not in bc_medium_hard]
very_hard_pool.extend(hard_remaining)
random.shuffle(very_hard_pool)
topic_groups2 = defaultdict(list)
for t in very_hard_pool:
    topic_groups2[t["metadata"]["additional"]["topic"]].append(t)
bc_very_hard = []
topic_keys2 = sorted(topic_groups2.keys())
for key in topic_keys2:
    random.shuffle(topic_groups2[key])
topic_iters2 = {key: iter(topic_groups2[key]) for key in topic_keys2}
# Same topic round-robin as above, over the very-hard pool
while len(bc_very_hard) < 50:
    added = False
    for key in topic_keys2:
        if len(bc_very_hard) >= 50:
            break
        try:
            bc_very_hard.append(next(topic_iters2[key]))
            added = True
        except StopIteration:
            continue
    if not added:
        break
# Write BrowseComp files
bc_mh_path = os.path.join(OUT_DIR, "browsecomp-medium-hard-50.jsonl")
bc_vh_path = os.path.join(OUT_DIR, "browsecomp-very-hard-50.jsonl")
write_jsonl(bc_medium_hard, bc_mh_path)
write_jsonl(bc_very_hard, bc_vh_path)
print(f"\n{'─' * 40}")
print_stats("browsecomp-medium-hard-50", bc_medium_hard)
print_stats("browsecomp-very-hard-50", bc_very_hard)


# ══════════════════════════════════════════════════════════════════════
# SUMMARY
# ══════════════════════════════════════════════════════════════════════
print(f"\n{'=' * 60}")
print("ALL FILES WRITTEN")
print("=" * 60)
files = [
    "webbench-0of4.jsonl", "webbench-0of4-50.jsonl",
    "webbench-1of4.jsonl", "webbench-1of4-50.jsonl",
    "webbench-2of4.jsonl", "webbench-2of4-50.jsonl",
    "browsecomp-medium-hard-50.jsonl", "browsecomp-very-hard-50.jsonl",
]
for f in files:
    path = os.path.join(OUT_DIR, f)
    with open(path, encoding="utf-8") as fh:
        count = sum(1 for _ in fh)
    print(f" {f}: {count} tasks")