mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
feat: deterministic eval graders (AGI SDK + WebArena-Infinity) (#664)
* feat: add deterministic eval graders (AGI SDK + WebArena-Infinity)

  Two new benchmark integrations with programmatic grading — no LLM judge.

  AGI SDK / REAL Bench (52 tasks):
  - 11 React/Next.js clones of consumer apps (DoorDash, Amazon, Gmail, etc.)
  - Grader navigates browser to /finish, extracts state diff from <pre> tag
  - Python verifier checks exact values via jmespath queries

  WebArena-Infinity (50 hard tasks):
  - 13 LLM-generated SaaS clones (Gmail, GitLab, Linear, Figma, etc.)
  - InfinityAppManager starts fresh app server per task per worker
  - Python verifier calls /api/state and asserts on JSON state

  Infrastructure:
  - GraderInput extended with mcpUrl + infinityAppUrl for parallel workers
  - Each worker gets isolated ports (no cross-worker state contamination)
  - CI workflow: pip install agisdk, clone webarena-infinity repo

* chore: switch eval configs back to kimi-k2p5

* fix: register deterministic graders in pass rate calculation

  Add agisdk_state_diff and infinity_state to PASS_FAIL_GRADER_ORDER in both runner types and weekly report script, so scores show correctly in the dashboard.

* chore: temp switch to opus 4.6 for eval run

* chore: restore kimi-k2p5 as default eval config

* ci: add timeout and continue-on-error for trend report step
This commit is contained in:
10
.github/workflows/eval-weekly.yml
vendored
10
.github/workflows/eval-weekly.yml
vendored
@@ -43,6 +43,12 @@ jobs:
|
||||
working-directory: packages/browseros-agent
|
||||
run: bun install --ignore-scripts && bun run build:agent-sdk
|
||||
|
||||
- name: Install Python eval dependencies
|
||||
run: pip install agisdk requests
|
||||
|
||||
- name: Clone WebArena-Infinity
|
||||
run: git clone --depth 1 https://github.com/web-arena-x/webarena-infinity.git /tmp/webarena-infinity
|
||||
|
||||
- name: Install xvfb
|
||||
run: sudo apt-get update && sudo apt-get install -y xvfb
|
||||
|
||||
@@ -57,9 +63,11 @@ jobs:
|
||||
working-directory: packages/browseros-agent/apps/eval
|
||||
env:
|
||||
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
|
||||
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
|
||||
CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
|
||||
NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
|
||||
BROWSEROS_BINARY: /usr/bin/browseros
|
||||
WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
|
||||
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
|
||||
run: |
|
||||
echo "Running eval with config: $EVAL_CONFIG"
|
||||
@@ -81,6 +89,8 @@ jobs:
|
||||
|
||||
- name: Generate trend report
|
||||
if: success()
|
||||
timeout-minutes: 5
|
||||
continue-on-error: true
|
||||
working-directory: packages/browseros-agent
|
||||
env:
|
||||
EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
|
||||
|
||||
26
packages/browseros-agent/apps/eval/configs/agisdk-real-smoke.json
vendored
Normal file
26
packages/browseros-agent/apps/eval/configs/agisdk-real-smoke.json
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "single",
|
||||
"provider": "openai-compatible",
|
||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||
"apiKey": "FIREWORKS_API_KEY",
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/agisdk-real.jsonl",
|
||||
"num_workers": 10,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["agisdk_state_diff"],
|
||||
"timeout_ms": 1800000
|
||||
}
|
||||
26
packages/browseros-agent/apps/eval/configs/infinity-hard-50.json
vendored
Normal file
26
packages/browseros-agent/apps/eval/configs/infinity-hard-50.json
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "single",
|
||||
"provider": "openai-compatible",
|
||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||
"apiKey": "FIREWORKS_API_KEY",
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/webarena-infinity-hard-50.jsonl",
|
||||
"num_workers": 10,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["infinity_state"],
|
||||
"timeout_ms": 1800000
|
||||
}
|
||||
52
packages/browseros-agent/apps/eval/data/agisdk-real.jsonl
vendored
Normal file
52
packages/browseros-agent/apps/eval/data/agisdk-real.jsonl
vendored
Normal file
@@ -0,0 +1,52 @@
|
||||
{"query_id": "agisdk-dashdish-10", "dataset": "agisdk-real", "query": "Place an order from \"Souvla\" for a \"Medium Classic Cheeseburger\" and a \"Small Bacon Double Cheeseburger\" with \"Standard Delivery\" as the method with the default charged options.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-10", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
|
||||
{"query_id": "agisdk-fly-unified-5", "dataset": "agisdk-real", "query": "Find me the cheapest fare for a flight from Orlando to Milwaukee on December 5th, 2024 and book it.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/25, Security Code: 420 Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-5", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "United Airlines"}}}
|
||||
{"query_id": "agisdk-udriver-10", "dataset": "agisdk-real", "query": "Order me a ride for 4pm, I'll be at the de Young muesum headed to the Waterbar, fanciest option possible please.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-10", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
|
||||
{"query_id": "agisdk-udriver-9", "dataset": "agisdk-real", "query": "Book me a ride from the thai restaurant I last took a ride to for later today at 2pm, I'll be at 333 Apartments on Fremont", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-9", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-9", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Uber"}}}
|
||||
{"query_id": "agisdk-topwork-4", "dataset": "agisdk-real", "query": "Create a job post for a UI/UX Designer with expertise in Figma, Sketch, and Adobe Creative Suite, including project details, timeline, and required skills (Wireframing, Prototyping, Responsive Design).", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-4", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
|
||||
{"query_id": "agisdk-gocalendar-4", "dataset": "agisdk-real", "query": "Change the \"Team Check-In\" event on July 18, 2024, name to \"Project Kickoff\" and update the location to \"Zoom\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-4", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
|
||||
{"query_id": "agisdk-staynb-6", "dataset": "agisdk-real", "query": "Find and book the stay with the best value for money (cheapest stay with the best reviews) for 1 day. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-6", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-6", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
|
||||
{"query_id": "agisdk-omnizon-10", "dataset": "agisdk-real", "query": "Click on \"buy now\" on any product, increase its quantity to the maximum allowed, update the delivery date to the last available, and place the order.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-10", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Amazon"}}}
|
||||
{"query_id": "agisdk-fly-unified-9", "dataset": "agisdk-real", "query": "Book me a flight from San Francisco to Chicago in Basic Economy on December 18th at 10:00. Ensure no seat selection is made.\nPassenger: David Lee\nDate of Birth: 07/22/1985\nSex: Male\nSeat Selection: No\nPayment: Credit Card (9999 8888 7777), Exp: 03/30, Address: 987 Cedar St, Chicago, IL, 60601, USA, Phone: 555-987-1234, Email: davidlee@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-9", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-9", "challenge_type": "action", "difficulty": "hard", "similar_to": "United Airlines"}}}
|
||||
{"query_id": "agisdk-networkin-9", "dataset": "agisdk-real", "query": "Find a professional who attended Stanford and send them a connection request and a message.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-9", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-9", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
|
||||
{"query_id": "agisdk-udriver-11", "dataset": "agisdk-real", "query": "I need to go from Pacific Catch on Chestnut back home to 333 Fremont now. If the fancy version is within ten dollars of the regular one, book that.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-11", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-11", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
|
||||
{"query_id": "agisdk-fly-unified-4", "dataset": "agisdk-real", "query": "Book me a round-trip flight from Providence (Rhode Island) to Indianapolis, departing on December 5th, 2024 at 08:00 and returning on December 9th at 14:00.\nPassenger: Jane Smith\nDate of Birth: 02/14/1995\nSex: Female\nSeat Selection: Yes (Window seat)\nPayment: Credit Card (378342143523967), Exp: 06/26, security code: 345 Address: 456 Elm St, Miami, FL, 33101, USA, Phone: 555-987-6543, Email: janesmith@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-4", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "United Airlines"}}}
|
||||
{"query_id": "agisdk-networkin-5", "dataset": "agisdk-real", "query": "Send a connection request to John Smith.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-5", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-5", "challenge_type": "action", "difficulty": "easy", "similar_to": "LinkedIn"}}}
|
||||
{"query_id": "agisdk-zilloft-6", "dataset": "agisdk-real", "query": "Select a property listed in San Francisco as \"Condos\" within a price range under $300,000 and request a tour for tomorrow at 4:00 PM. Use these contact details: Name: Sarah Brown, Email: sarahbrown@example.com, Phone: 555-987-6543.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-6", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "Zillow"}}}
|
||||
{"query_id": "agisdk-topwork-2", "dataset": "agisdk-real", "query": "Create a job posting for a Backend Developer specializing in Python, Django, and Flask to develop a high-performance web application. Include project details such as required skills (PostgreSQL, Docker, AWS, CI/CD), estimated project timeline, and budget.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-2", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-2", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
|
||||
{"query_id": "agisdk-gocalendar-3", "dataset": "agisdk-real", "query": "Delete the event titled \"Breakfast Meeting with Client\" scheduled for July 19, 2024", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-3", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "Google Calendar"}}}
|
||||
{"query_id": "agisdk-topwork-3", "dataset": "agisdk-real", "query": "Create a job listing for a Full-Stack Developer with expertise in Java, Spring Boot, and Angular, outlining the project scope, estimated duration, and required skills (MySQL, Docker, Kubernetes, and Jenkins). The ideal candidate should have experience in enterprise-level applications and building scalable microservices. After creating the job post, please describe what you included in the job listing.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-3", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-3", "challenge_type": "retrieval", "difficulty": "medium", "similar_to": "Upwork"}}}
|
||||
{"query_id": "agisdk-fly-unified-2", "dataset": "agisdk-real", "query": "Book me a one-way flight from Indiana to New York on December 2nd 2024 at 12:00.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/25, Security Code: 245, Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-2", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "United Airlines"}}}
|
||||
{"query_id": "agisdk-dashdish-7", "dataset": "agisdk-real", "query": "Select \"Express Delivery\" for an order from \"DragonEats\" of \"Mushroom Swiss Burger\" and complete the checkout with the pre-loaded Visa card.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-7", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-7", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
|
||||
{"query_id": "agisdk-networkin-3", "dataset": "agisdk-real", "query": "Write a post inviting users to a networking event, including details about the event's purpose, date, and target audience.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-3", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-3", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
|
||||
{"query_id": "agisdk-gomail-7", "dataset": "agisdk-real", "query": "Delete the email with the subject \"New Leadership Articles You Can't Miss\" from the Inbox.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-7", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-7", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Gmail"}}}
|
||||
{"query_id": "agisdk-opendining-8", "dataset": "agisdk-real", "query": "Identify and book the restaurant with the lowest rating. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-8", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-8", "challenge_type": "retrieval-action", "difficulty": "easy", "similar_to": "OpenTable"}}}
|
||||
{"query_id": "agisdk-omnizon-2", "dataset": "agisdk-real", "query": "Search for \"smartphones\" using the search bar, add the first two to your cart, view the details of the third product, click on \"Buy Now,\" and proceed through the checkout process.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-2", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-2", "challenge_type": "action", "difficulty": "medium", "similar_to": "Amazon"}}}
|
||||
{"query_id": "agisdk-udriver-1", "dataset": "agisdk-real", "query": "Book a ride from Fitness Urbano to Pacific Cafe", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-1", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-1", "challenge_type": "action", "difficulty": "easy", "similar_to": "Uber"}}}
|
||||
{"query_id": "agisdk-staynb-2", "dataset": "agisdk-real", "query": "Click on one of the stays displayed on the homepage and book it for a family of 4 (2 adults and 2 children). For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-2", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Airbnb"}}}
|
||||
{"query_id": "agisdk-opendining-10", "dataset": "agisdk-real", "query": "Check the menus of all restaurants for vegetarian options and make a reservation at the one with the most vegetarian choices. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-10", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-10", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "OpenTable"}}}
|
||||
{"query_id": "agisdk-opendining-4", "dataset": "agisdk-real", "query": "Use the search bar to search for a restaurant on September 2nd at 4:30 PM for 7 people, using \"Japanese\" as the search term, and book the first result. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-4", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-4", "challenge_type": "action", "difficulty": "hard", "similar_to": "OpenTable"}}}
|
||||
{"query_id": "agisdk-gomail-8", "dataset": "agisdk-real", "query": "Clear all emails from \"GitHub\" in the inbox to trash.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-8", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-8", "challenge_type": "action", "difficulty": "medium", "similar_to": "Gmail"}}}
|
||||
{"query_id": "agisdk-dashdish-4", "dataset": "agisdk-real", "query": "Schedule a delivery order from \"Taco Bell\" adding a \"Classic Cheeseburger\" large size for later and add the note \"Leave at the front door\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-4", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Doordash"}}}
|
||||
{"query_id": "agisdk-networkin-1", "dataset": "agisdk-real", "query": "Create a new text post for the feed with a professional update about AI trends in 2025, mentioning three key advancements and their impact on the job market.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-1", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
|
||||
{"query_id": "agisdk-dashdish-5", "dataset": "agisdk-real", "query": "Add three \"Loaded Bacon Cheese Fries\" to the shopping cart from \"Man vs. Fries\". Proceed to checkout and select \"Pickup\" as the delivery method.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-5", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Doordash"}}}
|
||||
{"query_id": "agisdk-opendining-5", "dataset": "agisdk-real", "query": "Scroll through the homepage carousel until \"Ocean Breeze\" is visible, select the second available time slot, and complete the reservation. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-5", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-5", "challenge_type": "action", "difficulty": "medium", "similar_to": "OpenTable"}}}
|
||||
{"query_id": "agisdk-topwork-1", "dataset": "agisdk-real", "query": "Create a new job post for a Frontend Developer with expertise in React and TypeScript, specifying project details such as estimated duration, required skills, and budget.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-1", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
|
||||
{"query_id": "agisdk-gocalendar-1", "dataset": "agisdk-real", "query": "Create a new event titled \"Team Meeting\" on July 19, 2024, from 2 PM to 2:30 PM, and include \"Conference Room A\" as the location", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-1", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
|
||||
{"query_id": "agisdk-gomail-5", "dataset": "agisdk-real", "query": "Schedule an email to jane.doe@example.com with the subject \"Weekly Update\" to be sent next Monday at 9:00 AM.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-5", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Gmail"}}}
|
||||
{"query_id": "agisdk-staynb-4", "dataset": "agisdk-real", "query": "Book a stay for 2 children with 1 adult. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-4", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Airbnb"}}}
|
||||
{"query_id": "agisdk-omnizon-8", "dataset": "agisdk-real", "query": "Search for \"Automatic Espresso Machine,\" click on the cheapest one, change the quantity to 5, use \"buy now\" to purchase them and complete the checkout.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-8", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-8", "challenge_type": "retrieval-action", "difficulty": "easy", "similar_to": "Amazon"}}}
|
||||
{"query_id": "agisdk-networkin-6", "dataset": "agisdk-real", "query": "Choose a random person who you haven't connected with, connect with them, and send them a message saying, 'howdy, partner'.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-6", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
|
||||
{"query_id": "agisdk-dashdish-2", "dataset": "agisdk-real", "query": "Add a \"Medium Pepperoni Pizza\" from the restaurant \"Papa Johns Pizza\" to the shopping cart and purchase it.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-2", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Doordash"}}}
|
||||
{"query_id": "agisdk-staynb-8", "dataset": "agisdk-real", "query": "Scroll through the homepage and book the last stay located in Paris.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-8", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-8", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
|
||||
{"query_id": "agisdk-omnizon-4", "dataset": "agisdk-real", "query": "Search for a \"Marshall Emberton II Portable Bluetooth Speaker\" and add it to your cart, then search for the \"Michael Kors Oversized Slim Runway Men's Watch,\" add it to the cart, and complete the checkout with both items.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-4", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-4", "challenge_type": "action", "difficulty": "hard", "similar_to": "Amazon"}}}
|
||||
{"query_id": "agisdk-gomail-2", "dataset": "agisdk-real", "query": "Mark the first email in the Inbox as \"read\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-2", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Gmail"}}}
|
||||
{"query_id": "agisdk-networkin-10", "dataset": "agisdk-real", "query": "Generate a polite follow-up message for a previous unanswered chat, starting with \"Following up on\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-10", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-10", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
|
||||
{"query_id": "agisdk-gomail-3", "dataset": "agisdk-real", "query": "Compose a new email to jonathan.smith@example.com with the subject \"Meeting Notes\" and body \"Please find the meeting notes attached.\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-3", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "Gmail"}}}
|
||||
{"query_id": "agisdk-udriver-6", "dataset": "agisdk-real", "query": "Me and 4 friends need a ride from the Palace Hotel to dinner at Osha Thai leaving now", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-6", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-6", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
|
||||
{"query_id": "agisdk-staynb-9", "dataset": "agisdk-real", "query": "Book a stay with the maximum number of guests supported. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-9", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-9", "challenge_type": "action", "difficulty": "hard", "similar_to": "Airbnb"}}}
|
||||
{"query_id": "agisdk-zilloft-3", "dataset": "agisdk-real", "query": "Find a home in San Diego priced under $150,000 with at least 2 bedrooms and request a tour. Use these details: Contact Name: John Doe, Email: johndoe@example.com, Phone: 555-123-4567, Tour Time: 2:00 PM, Tour Date: First available.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-3", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-3", "challenge_type": "retrieval-action", "difficulty": "easy", "similar_to": "Zillow"}}}
|
||||
{"query_id": "agisdk-fly-unified-6", "dataset": "agisdk-real", "query": "Reserve me a seat for the flight from Austin to Pittsburgh departing on December 11th, 2024 at 8:00 in Basic Economy.\nPassenger: Alice Brown\nDate of Birth: 05/20/1992\nSex: Female\nSeat Selection: Yes (Aisle seat)\nPayment: Credit Card (378342143523967), Exp: 09/27, security code: 332 Address: 789 Pine St, Los Angeles, CA, 90012, USA, Phone: 555-456-7890, Email: alicebrown@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-6", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "United Airlines"}}}
|
||||
{"query_id": "agisdk-opendining-3", "dataset": "agisdk-real", "query": "Book a table at \"The Royal Dine\" for a party of 4 on July 20, 2024, at 7 PM. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-3", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "OpenTable"}}}
|
||||
{"query_id": "agisdk-omnizon-9", "dataset": "agisdk-real", "query": "Search for \"PlayStation DualSense\", purchase it using the \"buy now\" button after opening the first result and change the default payment method to:\nname: Jack Fulton\ncard number: 9231 3432 8927 7764\nexp date: 1/2029\nsecurity code: 128\n before placing your order. ", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-9", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-9", "challenge_type": "action", "difficulty": "hard", "similar_to": "Amazon"}}}
|
||||
{"query_id": "agisdk-gocalendar-7", "dataset": "agisdk-real", "query": "Reschedule the \"Morning Coffee with sister\" event from July 18, 2024, at 9 AM to July 19, 2024, at 10AM using drag-and-drop functionality", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-7", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-7", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
|
||||
{"query_id": "agisdk-staynb-5", "dataset": "agisdk-real", "query": "Use the search bar to look for a stay. For the \"Where\" section, use the \"Search by region\" popover and select \"Europe\". Set the check-in date to October 13th and the check-out date to October 23rd. For the \"Who\" section, select 1 infant, 2 children, and 2 adults. Press the search button, select the first stay, and book it.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-5", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-5", "challenge_type": "action", "difficulty": "medium", "similar_to": "Airbnb"}}}
|
||||
50
packages/browseros-agent/apps/eval/data/webarena-infinity-hard-50.jsonl
vendored
Normal file
50
packages/browseros-agent/apps/eval/data/webarena-infinity-hard-50.jsonl
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
{"query_id": "infinity-elation-prescriptions-task_h69", "dataset": "webarena-infinity", "query": "Approve all pending refill requests except for any medication that is involved in a major drug-drug interaction with another of the patient's active medications. Deny those with the reason 'Drug interaction \u2014 needs provider review before renewal'.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h69", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h69.py", "app_base_port": 8020}}}
|
||||
{"query_id": "infinity-elation-clinical-records-task_h52", "dataset": "webarena-infinity", "query": "Add the document tag 'Provider-Reviewed' to every visit note template that was created by the current logged-in provider. Do not modify templates created by other providers.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h52", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h52.py", "app_base_port": 8000}}}
|
||||
{"query_id": "infinity-gmail-accounts-and-contacts-task_h44", "dataset": "webarena-infinity", "query": "Your sister's husband is one of your contacts. Find him, star his entry, and add the Friends label.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h44", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h44.py", "app_base_port": 8070}}}
|
||||
{"query_id": "infinity-gmail-task_h2", "dataset": "webarena-infinity", "query": "Update the Datadog alerts filter to also archive matching emails and forward them to priya.sharma@cloudnine.dev instead of nate.patel@devops.tools.", "graders": ["infinity_state"], "start_url": "http://localhost:8060", "metadata": {"original_task_id": "gmail-task_h2", "website": "gmail", "category": "webarena-infinity", "additional": {"app_name": "gmail", "difficulty": "hard", "verifier_path": "real-tasks/task_h2.py", "app_base_port": 8060}}}
|
||||
{"query_id": "infinity-gitlab-plan-and-track-task_h58", "dataset": "webarena-infinity", "query": "The Performance Initiative epic has two child epics. For the child epic with more open issues, set the weight of every issue in it to 13. For the other child epic, close all its open issues.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h58", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h58.py", "app_base_port": 8050}}}
|
||||
{"query_id": "infinity-figma-slides-task_h46", "dataset": "webarena-infinity", "query": "There are two slides with tables in the deck. Lock the table that compares competitors, and change the font size to 16 on the table that tracks quarterly feature adoption.", "graders": ["infinity_state"], "start_url": "http://localhost:8030", "metadata": {"original_task_id": "figma-slides-task_h46", "website": "figma-slides", "category": "webarena-infinity", "additional": {"app_name": "figma-slides", "difficulty": "hard", "verifier_path": "real-tasks/task_h46.py", "app_base_port": 8030}}}
|
||||
{"query_id": "infinity-elation-prescriptions-task_h50", "dataset": "webarena-infinity", "query": "Deny the pending refill for the patient's cholesterol medication because his lipid panel is overdue. Then deny the Lisinopril refill as well \u2014 he needs a follow-up blood pressure check first.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h50", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h50.py", "app_base_port": 8020}}}
|
||||
{"query_id": "infinity-elation-prescriptions-task_h19", "dataset": "webarena-infinity", "query": "Discontinue the Omeprazole and prescribe Famotidine 20mg tablet twice daily as a replacement for GERD \u2014 qty 60, 3 refills, send to CVS #4521.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h19", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h19.py", "app_base_port": 8020}}}
|
||||
{"query_id": "infinity-paypal-my-wallet-task_h25", "dataset": "webarena-infinity", "query": "Convert all of my Australian dollars to euros.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h25", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h25.py", "app_base_port": 8100}}}
|
||||
{"query_id": "infinity-elation-clinical-records-task_h66", "dataset": "webarena-infinity", "query": "Create a new template called 'Anxiety Management' with HPI and Assessment sections, and billing code 99213 with description 'Office visit, established, low complexity'. Then create a visit note for Emily Nakamura using that new template and the Telehealth category, add a Psychological Status block to the note, and sign it.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h66", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h66.py", "app_base_port": 8000}}}
|
||||
{"query_id": "infinity-elation-clinical-records-task_h62", "dataset": "webarena-infinity", "query": "Look up which template is assigned to the COVID Vaccine appointment type. Remove all its existing document tags and replace them with the single tag 'COVID-Protocol'. Then also assign that same template to the Urgent Same-Day appointment type.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h62", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h62.py", "app_base_port": 8000}}}
|
||||
{"query_id": "infinity-elation-prescriptions-task_h32", "dataset": "webarena-infinity", "query": "The patient has a medication that's being dispensed as written (brand name only). Discontinue that prescription and replace it with a new one \u2014 same medication, same sig, same pharmacy \u2014 but allow generic substitution this time. Qty 30, 3 refills, 30 days supply.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h32", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h32.py", "app_base_port": 8020}}}
|
||||
{"query_id": "infinity-gitlab-plan-and-track-task_h48", "dataset": "webarena-infinity", "query": "Add the 'breaking-change' label to every open issue in the API v3 Migration epic and remove any existing workflow-scoped labels from those issues.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h48", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h48.py", "app_base_port": 8050}}}
|
||||
{"query_id": "infinity-gitlab-plan-and-track-task_h77", "dataset": "webarena-infinity", "query": "Rename the 'UX' label to 'user-experience', change its type to 'group', and then add it to every open issue in the Frontend Modernization epic that doesn't already have it.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h77", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h77.py", "app_base_port": 8050}}}
|
||||
{"query_id": "infinity-xero-invoicing-task_h15", "dataset": "webarena-infinity", "query": "Create a new invoice for Summit Health Group for an annual software license and 12 months of support with a 10% discount on support.", "graders": ["infinity_state"], "start_url": "http://localhost:8120", "metadata": {"original_task_id": "xero-invoicing-task_h15", "website": "xero-invoicing", "category": "webarena-infinity", "additional": {"app_name": "xero-invoicing", "difficulty": "hard", "verifier_path": "real-tasks/task_h15.py", "app_base_port": 8120}}}
|
||||
{"query_id": "infinity-elation-clinical-records-task_h55", "dataset": "webarena-infinity", "query": "Resolve every problem across all patients in the system that currently has a status of Controlled.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h55", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h55.py", "app_base_port": 8000}}}
|
||||
{"query_id": "infinity-gitlab-plan-and-track-task_h8", "dataset": "webarena-infinity", "query": "Create a confidential issue titled 'Emergency security patch' with priority::critical and the 'security' label, assigned to James O'Brien and Oliver Schmidt, with weight 2 in the Security Hardening milestone.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h8", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h8.py", "app_base_port": 8050}}}
|
||||
{"query_id": "infinity-paypal-my-wallet-task_h20", "dataset": "webarena-infinity", "query": "Make a $200 payment on PayPal Credit and change autopay to pay the full balance.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h20", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h20.py", "app_base_port": 8100}}}
|
||||
{"query_id": "infinity-gitlab-plan-and-track-task_h52", "dataset": "webarena-infinity", "query": "Create a new board called 'Performance Tracker' with lists for the priority::critical, priority::high, and priority::medium labels. Then add the 'priority::high' label to every open issue in the v4.1 milestone that has the 'performance' label.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h52", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h52.py", "app_base_port": 8050}}}
|
||||
{"query_id": "infinity-paypal-my-wallet-task_h80", "dataset": "webarena-infinity", "query": "Save all available Food & Drink offers, buy a $25 DoorDash gift card for yourself, and switch currency conversion to use my card issuer.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h80", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h80.py", "app_base_port": 8100}}}
|
||||
{"query_id": "infinity-gmail-accounts-and-contacts-task_h50", "dataset": "webarena-infinity", "query": "Add the Emergency label to every contact who is currently listed as a delegate (active, pending, or expired). Then remove all delegates whose status is not 'active'.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h50", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h50.py", "app_base_port": 8070}}}
|
||||
{"query_id": "infinity-elation-clinical-records-task_h14", "dataset": "webarena-infinity", "query": "Add the tag 'Flu-Season' to every patient whose primary provider is Dr. Sarah Chen.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h14", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h14.py", "app_base_port": 8000}}}
|
||||
{"query_id": "infinity-figma-text-and-typography-task_h7", "dataset": "webarena-infinity", "query": "Remove all list formatting from every layer.", "graders": ["infinity_state"], "start_url": "http://localhost:8040", "metadata": {"original_task_id": "figma-text-and-typography-task_h7", "website": "figma-text-and-typography", "category": "webarena-infinity", "additional": {"app_name": "figma-text-and-typography", "difficulty": "hard", "verifier_path": "real-tasks/task_h7.py", "app_base_port": 8040}}}
|
||||
{"query_id": "infinity-paypal-my-wallet-task_h26", "dataset": "webarena-infinity", "query": "Send a $50 Amazon gift card to sarah.chen@email.com with 'Thank you!' as the message, and save the Amazon cashback offer.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h26", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h26.py", "app_base_port": 8100}}}
|
||||
{"query_id": "infinity-handshake-career-exploration-task_h97", "dataset": "webarena-infinity", "query": "Find the single most helpful answer across all Q&A questions and mark it helpful. Then find the most-viewed question and submit your own answer to it.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h97", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h97.py", "app_base_port": 8080}}}
|
||||
{"query_id": "infinity-figma-slides-task_h79", "dataset": "webarena-infinity", "query": "In the adoption table, find the feature with the highest Target Q4 percentage. In the competitive table, change DesignCraft's entry for that same feature to 'Market Leader'. Then update that feature's Target Q4 to '95%'.", "graders": ["infinity_state"], "start_url": "http://localhost:8030", "metadata": {"original_task_id": "figma-slides-task_h79", "website": "figma-slides", "category": "webarena-infinity", "additional": {"app_name": "figma-slides", "difficulty": "hard", "verifier_path": "real-tasks/task_h79.py", "app_base_port": 8030}}}
|
||||
{"query_id": "infinity-gitlab-plan-and-track-task_h41", "dataset": "webarena-infinity", "query": "For every open issue in the v4.2 - Security Hardening milestone: if it is already confidential, set its health status to 'at risk'. If it is not confidential, make it confidential and set its health status to 'needs attention'.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h41", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h41.py", "app_base_port": 8050}}}
|
||||
{"query_id": "infinity-handshake-career-exploration-task_h90", "dataset": "webarena-infinity", "query": "A student in the feed mentioned attending the NSBE conference. That student also answered a Q&A question about diversity programs in tech. Submit your own answer to that same question sharing your experience, then bookmark that student's feed post.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h90", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h90.py", "app_base_port": 8080}}}
|
||||
{"query_id": "infinity-elation-prescriptions-task_h30", "dataset": "webarena-infinity", "query": "The patient has three temporary medications. Discontinue the corticosteroid taper and the penicillin antibiotic \u2014 the patient completed both courses. Move the remaining temporary medication to permanent Rx.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h30", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h30.py", "app_base_port": 8020}}}
|
||||
{"query_id": "infinity-linear-account-settings-task_h19", "dataset": "webarena-infinity", "query": "Turn off all desktop application settings: open in desktop app, notification badge, and spell check.", "graders": ["infinity_state"], "start_url": "http://localhost:8090", "metadata": {"original_task_id": "linear-account-settings-task_h19", "website": "linear-account-settings", "category": "webarena-infinity", "additional": {"app_name": "linear-account-settings", "difficulty": "hard", "verifier_path": "real-tasks/task_h19.py", "app_base_port": 8090}}}
|
||||
{"query_id": "infinity-elation-prescriptions-task_h39", "dataset": "webarena-infinity", "query": "Change the default pharmacy to Express Scripts Mail Pharmacy for mail-order prescriptions. Then document that the patient takes Magnesium Citrate 400mg tablet as an OTC supplement \u2014 once daily at bedtime, 30-day supply.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h39", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h39.py", "app_base_port": 8020}}}
|
||||
{"query_id": "infinity-handshake-career-exploration-task_h136", "dataset": "webarena-infinity", "query": "Your earliest completed appointment was a specific type. Schedule a follow-up appointment of the same category and type with the same staff member, for March 28, 2026 at 9:00 AM, in person.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h136", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h136.py", "app_base_port": 8080}}}
|
||||
{"query_id": "infinity-handshake-career-exploration-task_h105", "dataset": "webarena-infinity", "query": "Find the second-most-viewed question in Q&A. It has two answers \u2014 mark the one with fewer helpful votes as helpful.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h105", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h105.py", "app_base_port": 8080}}}
|
||||
{"query_id": "infinity-gmail-accounts-and-contacts-task_h22", "dataset": "webarena-infinity", "query": "The Engineering Manager at TechCorp is listed as one of your delegates. Remove her delegation and unstar her contact.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h22", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h22.py", "app_base_port": 8070}}}
|
||||
{"query_id": "infinity-elation-patient-communication-task_h9", "dataset": "webarena-infinity", "query": "Acknowledge all unacknowledged reminders in the system.", "graders": ["infinity_state"], "start_url": "http://localhost:8010", "metadata": {"original_task_id": "elation-patient-communication-task_h9", "website": "elation-patient-communication", "category": "webarena-infinity", "additional": {"app_name": "elation-patient-communication", "difficulty": "hard", "verifier_path": "real-tasks/task_h9.py", "app_base_port": 8010}}}
|
||||
{"query_id": "infinity-superhuman-general-task_h1", "dataset": "webarena-infinity", "query": "Label the FinancePlus partnership email and the QuantumLab prototype email as 'Clients'.", "graders": ["infinity_state"], "start_url": "http://localhost:8110", "metadata": {"original_task_id": "superhuman-general-task_h1", "website": "superhuman-general", "category": "webarena-infinity", "additional": {"app_name": "superhuman-general", "difficulty": "hard", "verifier_path": "real-tasks/task_h1.py", "app_base_port": 8110}}}
|
||||
{"query_id": "infinity-xero-invoicing-task_h79", "dataset": "webarena-infinity", "query": "Change the invoice prefix to 'AUS-' and the next number to 100, then create a new invoice for CloudNine Analytics for 8 hours of UI/UX design work.", "graders": ["infinity_state"], "start_url": "http://localhost:8120", "metadata": {"original_task_id": "xero-invoicing-task_h79", "website": "xero-invoicing", "category": "webarena-infinity", "additional": {"app_name": "xero-invoicing", "difficulty": "hard", "verifier_path": "real-tasks/task_h79.py", "app_base_port": 8120}}}
|
||||
{"query_id": "infinity-figma-slides-task_h16", "dataset": "webarena-infinity", "query": "Enable slide numbers on every slide using the 'with total' format and change the aspect ratio to 4:3.", "graders": ["infinity_state"], "start_url": "http://localhost:8030", "metadata": {"original_task_id": "figma-slides-task_h16", "website": "figma-slides", "category": "webarena-infinity", "additional": {"app_name": "figma-slides", "difficulty": "hard", "verifier_path": "real-tasks/task_h16.py", "app_base_port": 8030}}}
|
||||
{"query_id": "infinity-linear-account-settings-task_h16", "dataset": "webarena-infinity", "query": "Revoke all API keys that have an expiration date.", "graders": ["infinity_state"], "start_url": "http://localhost:8090", "metadata": {"original_task_id": "linear-account-settings-task_h16", "website": "linear-account-settings", "category": "webarena-infinity", "additional": {"app_name": "linear-account-settings", "difficulty": "hard", "verifier_path": "real-tasks/task_h16.py", "app_base_port": 8090}}}
|
||||
{"query_id": "infinity-elation-prescriptions-task_h2", "dataset": "webarena-infinity", "query": "Prescribe Buspirone 10mg for the patient's anxiety \u2014 once daily in the morning, qty 30, 5 refills. Send it to the same pharmacy that fills his Sertraline.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h2", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h2.py", "app_base_port": 8020}}}
|
||||
{"query_id": "infinity-handshake-career-exploration-task_h1", "dataset": "webarena-infinity", "query": "Follow all consulting firms on Handshake.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h1", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h1.py", "app_base_port": 8080}}}
|
||||
{"query_id": "infinity-handshake-career-exploration-task_h141", "dataset": "webarena-infinity", "query": "Some of your saved jobs are from employers you haven't followed yet. Find and follow each of those employers.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h141", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h141.py", "app_base_port": 8080}}}
|
||||
{"query_id": "infinity-figma-text-and-typography-task_h74", "dataset": "webarena-infinity", "query": "Set the spelling language to Japanese, the big nudge amount to 50, and the default horizontal alignment to right.", "graders": ["infinity_state"], "start_url": "http://localhost:8040", "metadata": {"original_task_id": "figma-text-and-typography-task_h74", "website": "figma-text-and-typography", "category": "webarena-infinity", "additional": {"app_name": "figma-text-and-typography", "difficulty": "hard", "verifier_path": "real-tasks/task_h74.py", "app_base_port": 8040}}}
|
||||
{"query_id": "infinity-elation-patient-communication-task_h63", "dataset": "webarena-infinity", "query": "Check the visit summaries to find the patient whose BNP level improved. Reply to their most recent message confirming they can resume light activity, then update their emergency contact's phone number to (650) 555-0001.", "graders": ["infinity_state"], "start_url": "http://localhost:8010", "metadata": {"original_task_id": "elation-patient-communication-task_h63", "website": "elation-patient-communication", "category": "webarena-infinity", "additional": {"app_name": "elation-patient-communication", "difficulty": "hard", "verifier_path": "real-tasks/task_h63.py", "app_base_port": 8010}}}
|
||||
{"query_id": "infinity-elation-patient-communication-task_h14", "dataset": "webarena-infinity", "query": "Change Dr. Torres's notification timeframe to 'Do not notify me' and remove Dr. Torres from Dr. Chen's General Question routing.", "graders": ["infinity_state"], "start_url": "http://localhost:8010", "metadata": {"original_task_id": "elation-patient-communication-task_h14", "website": "elation-patient-communication", "category": "webarena-infinity", "additional": {"app_name": "elation-patient-communication", "difficulty": "hard", "verifier_path": "real-tasks/task_h14.py", "app_base_port": 8010}}}
|
||||
{"query_id": "infinity-gitlab-plan-and-track-task_h67", "dataset": "webarena-infinity", "query": "Delete all time entries from the GraphQL gateway issue, add a single new entry of 16 hours with summary 'Complete rewrite estimate', and set its time estimate to 40 hours.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h67", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h67.py", "app_base_port": 8050}}}
|
||||
{"query_id": "infinity-gmail-accounts-and-contacts-task_h73", "dataset": "webarena-infinity", "query": "Among the individual people in your other contacts (those with a first and last name), find the one who was saved most recently. Move them to your main contacts, set their company to 'Salesforce', job title to 'Account Executive', and add the Work label.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h73", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h73.py", "app_base_port": 8070}}}
|
||||
{"query_id": "infinity-elation-prescriptions-task_h4", "dataset": "webarena-infinity", "query": "Run a medication reconciliation and mark the Calcium+D3 supplement for discontinuation during the review.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h4", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h4.py", "app_base_port": 8020}}}
|
||||
{"query_id": "infinity-elation-prescriptions-task_h47", "dataset": "webarena-infinity", "query": "The patient's SSRI is currently dispensed at a different pharmacy than most of his other medications. Prescribe a refill of the same SSRI at the same dose and sig, but send it to CVS #4521 instead \u2014 qty 30, 5 refills, 30 days supply.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h47", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h47.py", "app_base_port": 8020}}}
|
||||
{"query_id": "infinity-paypal-my-wallet-task_h89", "dataset": "webarena-infinity", "query": "If your USD PayPal balance is above $2,500, convert $500 to Japanese Yen. If it is $2,500 or below, first add $500 from your Chase bank account, then convert $500 to JPY. Either way, set the debit card cash back category to Fuel.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h89", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h89.py", "app_base_port": 8100}}}
|
||||
88
packages/browseros-agent/apps/eval/scripts/agisdk-evaluate.py
vendored
Normal file
88
packages/browseros-agent/apps/eval/scripts/agisdk-evaluate.py
vendored
Normal file
@@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AGI SDK evaluation helper for BrowserOS eval framework.
|
||||
|
||||
Reads JSON from stdin with task_id and env_state, runs the agisdk
|
||||
evaluator, and outputs the result as JSON to stdout.
|
||||
|
||||
Input format:
|
||||
{"task_id": "dashdish-1", "env_state": {...}, "model_response": ""}
|
||||
|
||||
Output format:
|
||||
{"reward": 0.0, "pass": false, "message": "...", "per_criterion": [...]}
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
|
||||
|
||||
def _emit_result(reward, passed, message, per_criterion, stream=None):
    """Write one grader result as a single JSON line.

    Args:
        reward: Numeric reward to report.
        passed: Whether the task is considered passed.
        message: Human-readable explanation of the outcome.
        per_criterion: List of per-criterion result dicts (may be empty).
        stream: File-like object to write to; defaults to the current
            ``sys.stdout`` at call time.
    """
    print(
        json.dumps(
            {
                "reward": reward,
                "pass": passed,
                "message": message,
                "per_criterion": per_criterion,
            }
        ),
        file=stream if stream is not None else sys.stdout,
    )


def main():
    """Read a task from stdin, run the agisdk evaluator, print the result as JSON.

    Expects a JSON object on stdin with ``task_id`` and ``env_state`` keys and
    an optional ``model_response`` key (defaults to ``""``). Exactly one JSON
    object is written to stdout in every handled case, including failures:

    * malformed or incomplete input -> failing result, normal return;
    * ``agisdk`` not importable     -> failing result, then ``sys.exit(0)``;
    * evaluator raises              -> failing result with the error message.
    """
    # Remember the stream that must receive the JSON result before anything
    # has a chance to swap sys.stdout.
    real_stdout = sys.stdout

    # Parse and validate the request inside a guard so that bad input yields a
    # JSON failure result instead of an uncaught traceback with empty stdout.
    try:
        data = json.loads(sys.stdin.read())
        task_id = data["task_id"]
        env_state = data["env_state"]
        model_response = data.get("model_response", "")
    except (ValueError, KeyError, TypeError, AttributeError) as e:
        _emit_result(
            0, False, f"Evaluation error: invalid input ({e!r})", [], real_stdout
        )
        return

    try:
        from agisdk.REAL.browsergym.webclones.evaluate import WebCloneEvaluator
        from agisdk.REAL.browsergym.webclones.task_config import TaskConfig
    except ImportError:
        _emit_result(
            0,
            False,
            "agisdk package not installed. Run: pip install agisdk",
            [],
            real_stdout,
        )
        sys.exit(0)

    try:
        # Redirect stdout to stderr during evaluation — agisdk's rich logger
        # prints directly to stdout, which would corrupt our JSON output.
        sys.stdout = sys.stderr
        try:
            tc = TaskConfig(task_id)
            evaluator = WebCloneEvaluator(tc)
            reward_val, _done, message, info = evaluator.evaluate(
                env_state=env_state, model_response=model_response
            )
        finally:
            # Always restore stdout, whether evaluation succeeded or raised.
            sys.stdout = real_stdout

        reward_val = float(reward_val) if reward_val is not None else 0.0
        # Tolerate a missing info payload rather than failing a finished run.
        results = (info or {}).get("results", [])
        per_criterion = [
            {"passed": r[0], "detail": str(r[1]) if len(r) > 1 else ""}
            for r in results
        ]

        _emit_result(reward_val, reward_val == 1.0, str(message), per_criterion)

    except Exception as e:
        _emit_result(0, False, f"Evaluation error: {str(e)}", [])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
83
packages/browseros-agent/apps/eval/scripts/build-agisdk-dataset.py
vendored
Normal file
83
packages/browseros-agent/apps/eval/scripts/build-agisdk-dataset.py
vendored
Normal file
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Build JSONL dataset for AGI SDK / REAL Bench evaluation.
|
||||
|
||||
Reads task definitions from the agisdk package, filters to feasible
|
||||
action-only tasks (excludes llm_boolean evaluators), and outputs JSONL
|
||||
to stdout in the BrowserOS eval framework format.
|
||||
|
||||
Usage:
|
||||
python scripts/build-agisdk-dataset.py > data/agisdk-real.jsonl
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
|
||||
|
||||
def has_llm_eval(task: dict) -> bool:
    """Return True when at least one entry of ``task["evals"]`` has type ``llm_boolean``.

    A task without an ``evals`` key counts as having no LLM-judged evaluator.
    """
    return any(e.get("type") == "llm_boolean" for e in task.get("evals", []))
|
||||
|
||||
|
||||
def main():
    """Write one JSONL dataset entry per usable agisdk task to stdout.

    A task is skipped when it is marked as not ``possible``, when any of its
    evals is ``llm_boolean`` (see ``has_llm_eval``), or when it lacks a goal
    or a website URL. Warnings and the closing summary go to stderr so stdout
    can be redirected straight into the dataset file. Exits with status 1
    when the agisdk package cannot be imported.
    """
    try:
        from agisdk.REAL.tasks import all_tasks
    except ImportError:
        print(
            "Error: agisdk package not installed. Run: pip install agisdk",
            file=sys.stderr,
        )
        sys.exit(1)

    count = 0
    skipped_infeasible = 0
    skipped_llm = 0

    for task in all_tasks:
        if not task.get("possible", True):
            skipped_infeasible += 1
            continue

        if has_llm_eval(task):
            skipped_llm += 1
            continue

        task_id = task["id"]
        website = task.get("website", {})
        goal = task.get("goal", "")
        start_url = website.get("url", "")

        if not start_url or not goal:
            # Not counted in the summary below; only reported as a warning.
            print(f"Warning: Skipping {task_id} — missing url or goal", file=sys.stderr)
            continue

        entry = {
            "query_id": f"agisdk-{task_id}",
            "dataset": "agisdk-real",
            "query": goal,
            "graders": ["agisdk_state_diff"],
            "start_url": start_url,
            "metadata": {
                "original_task_id": task_id,
                "website": website.get("name", ""),
                "category": "agisdk-real",
                "additional": {
                    "agisdk_task_id": task_id,
                    "challenge_type": task.get("challengeType", "action"),
                    "difficulty": task.get("difficulty", "unknown"),
                    "similar_to": website.get("similarTo", ""),
                },
            },
        }

        print(json.dumps(entry))
        count += 1

    print(
        f"Generated {count} tasks (skipped {skipped_infeasible} infeasible, "
        f"{skipped_llm} llm_boolean)",
        file=sys.stderr,
    )


if __name__ == "__main__":
    main()
|
||||
118
packages/browseros-agent/apps/eval/scripts/build-infinity-dataset.py
vendored
Normal file
118
packages/browseros-agent/apps/eval/scripts/build-infinity-dataset.py
vendored
Normal file
@@ -0,0 +1,118 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dataset generator for WebArena-Infinity benchmark.
|
||||
|
||||
Reads real-tasks.json from each app directory and outputs JSONL
|
||||
in the eval framework's TaskSchema format.
|
||||
|
||||
Usage:
|
||||
python build-infinity-dataset.py --apps-dir /path/to/webarena-infinity/apps
|
||||
python build-infinity-dataset.py --apps-dir /path/to/apps --apps gmail linear --difficulty medium
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def load_tasks(app_dir: str, app_name: str) -> list[dict]:
    """Load the task list from ``<app_dir>/real-tasks.json``.

    Returns an empty list (after a warning on stderr) when the file does not
    exist. ``app_name`` is accepted for call-site symmetry but is not used.
    """
    tasks_file = os.path.join(app_dir, "real-tasks.json")
    if not os.path.exists(tasks_file):
        print(f"Warning: No real-tasks.json found in {app_dir}", file=sys.stderr)
        return []
    # Task text contains non-ASCII characters (e.g. em dashes); read as UTF-8
    # explicitly instead of relying on the platform's default encoding.
    with open(tasks_file, encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
|
||||
def build_task_entry(
    app_name: str,
    task: dict,
    base_port: int,
) -> dict:
    """Convert one WebArena-Infinity task definition into a dataset entry.

    Args:
        app_name: Directory name of the app the task belongs to.
        task: Raw task dict. The id is read from ``id`` or ``task_id``, the
            prompt from ``query``, ``instruction`` or ``task``, and the
            verifier location from ``verify`` or ``verifier_path``.
        base_port: Port assigned to the app; used for the start URL, the
            reset call and the port metadata.

    Returns:
        A dict in the eval framework's task format, graded by
        ``infinity_state``.
    """
    task_id = task.get("id", task.get("task_id", "unknown"))
    difficulty = task.get("difficulty", "unknown")
    query = task.get("query", task.get("instruction", task.get("task", "")))
    verifier_path = task.get(
        "verify",
        task.get("verifier_path", f"real-tasks/{task_id}.py"),
    )

    return {
        "query_id": f"infinity-{app_name}-{task_id}",
        "dataset": "webarena-infinity",
        "query": query,
        "graders": ["infinity_state"],
        "start_url": f"http://localhost:{base_port}",
        "setup_script": f"POST http://localhost:{base_port}/api/reset",
        "metadata": {
            "original_task_id": f"{app_name}-{task_id}",
            "website": app_name,
            "category": "webarena-infinity",
            "additional": {
                "app_name": app_name,
                "difficulty": difficulty,
                "verifier_path": verifier_path,
                # The task executor reads ``app_base_port`` and the committed
                # dataset uses that key; ``app_port`` is kept so existing
                # consumers of this script's output keep working.
                "app_base_port": base_port,
                "app_port": base_port,
            },
        },
    }
|
||||
|
||||
|
||||
def main():
    """CLI entry point: print one JSONL entry per WebArena-Infinity task.

    Walks the sub-directories of ``--apps-dir`` in sorted order, assigns each
    app directory a port starting at ``--base-port`` and advancing by
    ``--port-stride`` per app, and prints every task that passes the optional
    ``--apps`` / ``--difficulty`` filters. Exits with status 1 when
    ``--apps-dir`` is not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Generate JSONL dataset from WebArena-Infinity apps"
    )
    parser.add_argument(
        "--apps-dir",
        required=True,
        help="Path to webarena-infinity/apps/ directory",
    )
    parser.add_argument(
        "--apps",
        nargs="*",
        default=None,
        help="Filter to specific app names (default: all)",
    )
    parser.add_argument(
        "--difficulty",
        choices=["easy", "medium", "hard"],
        default=None,
        help="Filter by difficulty tier",
    )
    parser.add_argument(
        "--base-port",
        type=int,
        default=8000,
        help="Starting port number for apps (default: 8000)",
    )
    parser.add_argument(
        "--port-stride",
        type=int,
        default=1,
        help=(
            "Port gap between consecutive apps (default: 1). The eval runner "
            "adds the worker index to an app's base port, so use a stride "
            "larger than the number of parallel workers to keep apps apart."
        ),
    )
    args = parser.parse_args()

    if not os.path.isdir(args.apps_dir):
        print(f"Error: {args.apps_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    app_dirs = sorted(os.listdir(args.apps_dir))
    if args.apps:
        app_dirs = [d for d in app_dirs if d in args.apps]

    port = args.base_port
    for app_name in app_dirs:
        app_path = os.path.join(args.apps_dir, app_name)
        if not os.path.isdir(app_path):
            # Stray files do not consume a port.
            continue

        tasks = load_tasks(app_path, app_name)
        for task in tasks:
            difficulty = task.get("difficulty", "unknown")
            if args.difficulty and difficulty != args.difficulty:
                continue

            entry = build_task_entry(app_name, task, port)
            print(json.dumps(entry))

        # One port slot per app directory, whether or not it produced tasks.
        port += args.port_stride


if __name__ == "__main__":
    main()
|
||||
82
packages/browseros-agent/apps/eval/scripts/infinity-evaluate.py
vendored
Normal file
82
packages/browseros-agent/apps/eval/scripts/infinity-evaluate.py
vendored
Normal file
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Evaluation helper for WebArena-Infinity verifier scripts.
|
||||
|
||||
Reads JSON from stdin with app_server_url, verifier_path, and task_id.
|
||||
Runs the verifier against the app server and outputs a JSON result.
|
||||
|
||||
Verifiers have the signature: verify(server_url: str) -> tuple[bool, str]
|
||||
They fetch /api/state internally and return (passed, message).
|
||||
|
||||
Usage:
|
||||
echo '{"app_server_url": "http://localhost:8000", "verifier_path": "/path/to/verify.py"}' | python infinity-evaluate.py
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
import json
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
|
||||
def load_verifier(verifier_path: str):
    """Import the Python file at ``verifier_path`` and return it as a module.

    The file is executed as a module named ``verifier``; it is not added to
    ``sys.modules``.

    Raises:
        ImportError: if no import spec or loader can be created for the path.
        Any exception raised while executing the verifier file propagates.
    """
    spec = importlib.util.spec_from_file_location("verifier", verifier_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load verifier from {verifier_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
|
||||
|
||||
|
||||
def main():
    """Run one WebArena-Infinity verifier and print a JSON verdict on stdout.

    Reads a JSON object from stdin with ``app_server_url`` and
    ``verifier_path``, loads the verifier module, calls its
    ``verify(server_url)`` function and prints
    ``{"pass", "reward", "message"}``.

    Exits with status 1 (after printing a failing verdict) when the input is
    invalid or the verifier cannot be loaded or raises.
    """
    import contextlib

    def emit(passed, message):
        # Single place that defines the output shape.
        print(
            json.dumps(
                {
                    "pass": passed,
                    "reward": 1.0 if passed else 0.0,
                    "message": message,
                }
            )
        )

    try:
        data = json.loads(sys.stdin.read())
    except json.JSONDecodeError as e:
        emit(False, f"Invalid JSON input: {e}")
        sys.exit(1)

    if not isinstance(data, dict):
        emit(False, "Invalid JSON input: expected a JSON object")
        sys.exit(1)

    server_url = data.get("app_server_url", "")
    verifier_path = data.get("verifier_path", "")

    if not server_url or not verifier_path:
        emit(False, "Missing app_server_url or verifier_path")
        sys.exit(1)

    try:
        # Anything the verifier prints must not end up on stdout, where it
        # would be mixed into the JSON verdict; send it to stderr instead.
        with contextlib.redirect_stdout(sys.stderr):
            verifier = load_verifier(verifier_path)
            fn = getattr(verifier, "verify", None)
            if not callable(fn):
                raise AttributeError(
                    f"Verifier has no verify() function. "
                    f"Available: {[a for a in dir(verifier) if not a.startswith('_')]}"
                )

            # Verifiers take server_url and fetch state internally
            result = fn(server_url)

        # Expected return is tuple[bool, str]; anything else is judged by its
        # truthiness. bool() keeps the verdict JSON-serializable.
        if isinstance(result, tuple) and len(result) >= 2:
            passed, message = bool(result[0]), str(result[1])
        else:
            passed, message = bool(result), str(result)
    except Exception as e:
        emit(False, f"Verifier error: {e}\n{traceback.format_exc()}")
        sys.exit(1)

    emit(passed, message)


if __name__ == "__main__":
    main()
|
||||
@@ -59,6 +59,8 @@ interface RunSummary {
|
||||
}
|
||||
|
||||
const PASS_FAIL_GRADER_ORDER = [
|
||||
'agisdk_state_diff',
|
||||
'infinity_state',
|
||||
'performance_grader',
|
||||
'webvoyager_grader',
|
||||
'fara_combined',
|
||||
|
||||
202
packages/browseros-agent/apps/eval/src/graders/benchmark/agisdk-state-diff.ts
vendored
Normal file
202
packages/browseros-agent/apps/eval/src/graders/benchmark/agisdk-state-diff.ts
vendored
Normal file
@@ -0,0 +1,202 @@
|
||||
import { spawn } from 'node:child_process'
|
||||
import { join } from 'node:path'
|
||||
import type { GraderResult } from '../../types'
|
||||
import { callMcpTool } from '../../utils/mcp-client'
|
||||
import type { Grader, GraderInput } from '../types'
|
||||
|
||||
/** Absolute path of the Python helper that wraps the agisdk evaluator. */
const EVAL_SCRIPT = join(
  import.meta.dirname,
  '..',
  '..',
  '..',
  'scripts',
  'agisdk-evaluate.py',
)

/** JSON object printed on stdout by `agisdk-evaluate.py`. */
interface AgisdkEvalOutput {
  reward: number
  pass: boolean
  message: string
  per_criterion: unknown[]
}

/** Runtime check that parsed evaluator output has the expected shape. */
function isAgisdkEvalOutput(value: unknown): value is AgisdkEvalOutput {
  if (typeof value !== 'object' || value === null) return false
  const rec = value as Record<string, unknown>
  return (
    typeof rec.reward === 'number' &&
    typeof rec.pass === 'boolean' &&
    typeof rec.message === 'string' &&
    Array.isArray(rec.per_criterion)
  )
}

/**
 * Deterministic grader for AGI SDK / REAL Bench tasks.
 *
 * Navigates the browser to the clone site's `/finish` page, extracts the
 * state JSON rendered there, and hands it to the Python agisdk evaluator.
 * Every failure is reported as a failing `GraderResult`; `grade` does not
 * throw.
 */
export class AgisdkStateDiffGrader implements Grader {
  name = 'agisdk_state_diff'

  async grade(input: GraderInput): Promise<GraderResult> {
    const taskId = this.extractTaskId(input.task.query_id)
    const startUrl = this.extractStartUrl(input)
    const mcpEndpoint =
      input.mcpUrl ||
      `${process.env.BROWSEROS_SERVER_URL || 'http://127.0.0.1:9110'}/mcp`

    if (!startUrl) {
      return {
        score: 0,
        pass: false,
        reasoning: 'Could not determine clone site URL from task',
      }
    }

    // The URL is assembled from the task id, so guard against ids that do
    // not produce a parseable URL instead of letting `new URL` throw.
    let origin: string
    try {
      origin = new URL(startUrl).origin
    } catch {
      return {
        score: 0,
        pass: false,
        reasoning: `Invalid clone site URL derived from task: ${startUrl}`,
        details: { error: true },
      }
    }

    let envState: Record<string, unknown>
    try {
      envState = await this.fetchFinishState(origin, mcpEndpoint)
    } catch (error) {
      return {
        score: 0,
        pass: false,
        reasoning: `Failed to fetch /finish endpoint: ${error instanceof Error ? error.message : String(error)}`,
        details: { origin, error: true },
      }
    }

    try {
      const result = await this.runPythonEvaluator(
        taskId,
        envState,
        input.finalAnswer || '',
      )
      return {
        score: result.reward,
        pass: result.pass,
        reasoning:
          result.message ||
          (result.pass ? 'All criteria passed' : 'Some criteria failed'),
        details: {
          reward: result.reward,
          per_criterion: result.per_criterion,
          origin,
          agisdk_task_id: taskId,
        },
      }
    } catch (error) {
      return {
        score: 0,
        pass: false,
        reasoning: `Python evaluator error: ${error instanceof Error ? error.message : String(error)}`,
        details: { error: true },
      }
    }
  }

  /** Strips the dataset prefix: "agisdk-dashdish-10" → "dashdish-10". */
  private extractTaskId(queryId: string): string {
    return queryId.replace(/^agisdk-/, '')
  }

  /**
   * Determines the clone site URL for the task, or null when none is found.
   */
  private extractStartUrl(input: GraderInput): string | null {
    // Derive from task_id: "dashdish-10" → "https://evals-dashdish.vercel.app"
    // Task IDs are "{site}-{number}" where site may contain hyphens (e.g. "fly-unified-5")
    const taskId = this.extractTaskId(input.task.query_id)
    const siteId = taskId.replace(/-\d+$/, '')
    if (siteId) return `https://evals-${siteId}.vercel.app`

    // Fallback (only reached when the id yields an empty site name):
    // search messages for vercel.app URLs
    for (const msg of input.messages) {
      const text =
        msg.type === 'user'
          ? msg.content
          : msg.type === 'tool-input-available'
            ? JSON.stringify(msg.input)
            : ''
      const urlMatch = text.match(/https?:\/\/[^\s"']+\.vercel\.app/)
      if (urlMatch) return urlMatch[0]
    }

    return null
  }

  /**
   * Opens `${origin}/finish` in the browser and returns the JSON object
   * rendered in the page's first `<pre>` element.
   *
   * @throws when navigation fails, the JSON does not appear within the
   *   polling window, or the extracted text is not valid JSON.
   */
  private async fetchFinishState(
    origin: string,
    mcpEndpoint: string,
  ): Promise<Record<string, unknown>> {
    const finishUrl = `${origin}/finish`

    // Navigate browser to /finish page (state diff is rendered client-side)
    await callMcpTool(mcpEndpoint, 'navigate_page', {
      url: finishUrl,
      page: 1,
    })

    // Poll in-page (every 500 ms, giving up after 20 retries) until the
    // <pre> element holds something that looks like a JSON object.
    const result = await callMcpTool(mcpEndpoint, 'evaluate_script', {
      page: 1,
      expression: `
        new Promise((resolve, reject) => {
          let attempts = 0;
          const check = () => {
            const pre = document.querySelector('pre');
            if (pre && pre.textContent.trim().startsWith('{')) {
              resolve(pre.textContent);
            } else if (++attempts > 20) {
              reject(new Error('Timed out waiting for <pre> JSON on /finish'));
            } else {
              setTimeout(check, 500);
            }
          };
          check();
        })
      `,
    })

    const textContent = result.content?.find(
      (c: { type: string }) => c.type === 'text',
    )
    if (!textContent?.text) {
      throw new Error('No text content returned from /finish page')
    }

    return JSON.parse(textContent.text) as Record<string, unknown>
  }

  /**
   * Runs `agisdk-evaluate.py`, feeding the task on stdin and parsing the
   * single JSON document it prints on stdout.
   *
   * Rejects when the process cannot be spawned, exits non-zero, or prints
   * output that is not a valid evaluator result.
   */
  private runPythonEvaluator(
    taskId: string,
    envState: Record<string, unknown>,
    modelResponse: string,
  ): Promise<AgisdkEvalOutput> {
    return new Promise((resolve, reject) => {
      const proc = spawn('python3', [EVAL_SCRIPT], {
        stdio: ['pipe', 'pipe', 'pipe'],
      })

      const inputData = JSON.stringify({
        task_id: taskId,
        env_state: envState,
        model_response: modelResponse,
      })

      let stdout = ''
      let stderr = ''

      proc.stdout.on('data', (data: Buffer) => {
        stdout += data.toString()
      })

      proc.stderr.on('data', (data: Buffer) => {
        stderr += data.toString()
      })

      proc.on('close', (code) => {
        if (code !== 0) {
          reject(
            new Error(`Python evaluator exited with code ${code}: ${stderr}`),
          )
          return
        }

        let parsed: unknown
        try {
          parsed = JSON.parse(stdout.trim())
        } catch {
          reject(new Error(`Failed to parse evaluator output: ${stdout}`))
          return
        }

        if (!isAgisdkEvalOutput(parsed)) {
          reject(new Error(`Unexpected evaluator output shape: ${stdout}`))
          return
        }
        resolve(parsed)
      })

      proc.on('error', (err) => {
        reject(new Error(`Failed to spawn Python evaluator: ${err.message}`))
      })

      // If the child exits before reading its input, the write fails with
      // EPIPE on the stdin stream. Without a listener that would surface as
      // an unhandled 'error' event; the 'close'/'error' handlers above
      // already report the failure, so nothing more is needed here.
      proc.stdin.on('error', () => {})

      proc.stdin.write(inputData)
      proc.stdin.end()
    })
  }
}
|
||||
134
packages/browseros-agent/apps/eval/src/graders/benchmark/infinity-state.ts
vendored
Normal file
134
packages/browseros-agent/apps/eval/src/graders/benchmark/infinity-state.ts
vendored
Normal file
@@ -0,0 +1,134 @@
|
||||
import { join, resolve } from 'node:path'
|
||||
import type { GraderResult } from '../../types'
|
||||
import type { Grader, GraderInput } from '../types'
|
||||
|
||||
/** JSON payload written to the stdin of `infinity-evaluate.py`. */
interface InfinityEvalInput {
  app_server_url: string
  verifier_path: string
  task_id: string
}

/** JSON verdict printed on stdout by `infinity-evaluate.py`. */
interface InfinityEvalOutput {
  pass: boolean
  reward: number
  message: string
}

/** Absolute path of the Python helper that runs a verifier script. */
const EVAL_SCRIPT = resolve(
  import.meta.dir,
  '../../../scripts/infinity-evaluate.py',
)

/** Runtime check that parsed helper output has the expected shape. */
function isInfinityEvalOutput(value: unknown): value is InfinityEvalOutput {
  if (typeof value !== 'object' || value === null) return false
  const rec = value as Record<string, unknown>
  return (
    typeof rec.pass === 'boolean' &&
    typeof rec.reward === 'number' &&
    typeof rec.message === 'string'
  )
}

/**
 * Deterministic grader for WebArena-Infinity tasks.
 *
 * Locates the task's verifier script under `WEBARENA_INFINITY_DIR` and runs
 * it, via the Python helper, against the app server used for the task.
 * Every failure is reported as a failing `GraderResult`; `grade` does not
 * throw.
 */
export class InfinityStateGrader implements Grader {
  name = 'infinity_state'

  async grade(input: GraderInput): Promise<GraderResult> {
    const parsed = this.parseQueryId(input.task.query_id)
    if (!parsed) {
      return {
        score: 0,
        pass: false,
        reasoning: `Cannot parse query_id "${input.task.query_id}" — expected format: infinity-{app}-{task_id}`,
      }
    }

    const appServerUrl = this.resolveAppServerUrl(input)
    if (!appServerUrl) {
      return {
        score: 0,
        pass: false,
        reasoning: 'Cannot determine app server URL',
      }
    }

    const infinityDir = process.env.WEBARENA_INFINITY_DIR
    if (!infinityDir) {
      return {
        score: 0,
        pass: false,
        reasoning:
          'WEBARENA_INFINITY_DIR env var not set. Point it to the webarena-infinity repo root.',
      }
    }

    const verifierPath = join(
      infinityDir,
      'apps',
      parsed.appName,
      'real-tasks',
      `${parsed.taskId}.py`,
    )

    const evalInput: InfinityEvalInput = {
      app_server_url: appServerUrl,
      verifier_path: verifierPath,
      task_id: input.task.query_id,
    }

    try {
      const result = await this.runPythonEvaluator(evalInput)
      return {
        score: result.pass ? 1 : 0,
        pass: result.pass,
        reasoning: result.message,
        details: {
          reward: result.reward,
          app_name: parsed.appName,
          app_server_url: appServerUrl,
        },
      }
    } catch (error) {
      return {
        score: 0,
        pass: false,
        reasoning: `Evaluator process error: ${error instanceof Error ? error.message : String(error)}`,
      }
    }
  }

  /** Splits "infinity-{app}-{task_id}" into its app name and task id. */
  private parseQueryId(
    queryId: string,
  ): { appName: string; taskId: string } | null {
    // Task IDs start with "task_", app names may contain hyphens
    // e.g. "infinity-elation-prescriptions-task_h69"
    const match = queryId.match(/^infinity-(.+)-(task_.+)$/)
    if (!match) return null
    return { appName: match[1], taskId: match[2] }
  }

  /** Returns the URL of the app server to verify against, if known. */
  private resolveAppServerUrl(input: GraderInput): string | null {
    // Passed directly from task executor (started by InfinityAppManager)
    if (input.infinityAppUrl) return input.infinityAppUrl

    // Fallback: env var for manual testing
    if (process.env.INFINITY_APP_URL) return process.env.INFINITY_APP_URL

    return null
  }

  /**
   * Runs `infinity-evaluate.py` with `evalInput` on stdin and returns its
   * parsed verdict.
   *
   * @throws when the helper exits non-zero or prints something that is not
   *   a valid verdict.
   */
  private async runPythonEvaluator(
    evalInput: InfinityEvalInput,
  ): Promise<InfinityEvalOutput> {
    const proc = Bun.spawn(['python3', EVAL_SCRIPT], {
      stdin: 'pipe',
      stdout: 'pipe',
      stderr: 'pipe',
    })

    const inputJson = JSON.stringify(evalInput)
    proc.stdin.write(inputJson)
    proc.stdin.end()

    // Drain both pipes concurrently. Reading stdout to completion before
    // touching stderr can deadlock once the child fills the stderr pipe.
    const [stdout, stderr, exitCode] = await Promise.all([
      new Response(proc.stdout).text(),
      new Response(proc.stderr).text(),
      proc.exited,
    ])

    if (exitCode !== 0) {
      throw new Error(
        `Python evaluator exited with code ${exitCode}: ${this.describeFailure(stdout, stderr)}`,
      )
    }

    let parsed: unknown
    try {
      parsed = JSON.parse(stdout.trim())
    } catch {
      throw new Error(`Failed to parse evaluator output: ${stdout}`)
    }
    if (!isInfinityEvalOutput(parsed)) {
      throw new Error(`Unexpected evaluator output shape: ${stdout}`)
    }
    return parsed
  }

  /**
   * Picks the most useful explanation for a failed helper run: the helper
   * prints a JSON verdict with a `message` even when it exits non-zero, so
   * prefer that over raw stderr/stdout text.
   */
  private describeFailure(stdout: string, stderr: string): string {
    try {
      const parsed: unknown = JSON.parse(stdout.trim())
      if (
        typeof parsed === 'object' &&
        parsed !== null &&
        'message' in parsed &&
        typeof parsed.message === 'string'
      ) {
        return parsed.message
      }
    } catch {
      // stdout is not JSON; fall back to the raw streams below.
    }
    return stderr || stdout
  }
}
|
||||
@@ -1,4 +1,6 @@
|
||||
import type { GraderResult } from '../types'
|
||||
import { AgisdkStateDiffGrader } from './benchmark/agisdk-state-diff'
|
||||
import { InfinityStateGrader } from './benchmark/infinity-state'
|
||||
import { Mind2WebJudgeGrader } from './benchmark/mind2web'
|
||||
import { WebVoyagerGrader } from './benchmark/webvoyager'
|
||||
import { FaraAlignmentGrader } from './fara/alignment'
|
||||
@@ -19,7 +21,13 @@ export function createGrader(
|
||||
options: GraderOptions | null,
|
||||
): Grader | null {
|
||||
switch (name) {
|
||||
// Benchmark graders
|
||||
// Deterministic benchmark graders (no LLM judge)
|
||||
case 'agisdk_state_diff':
|
||||
return new AgisdkStateDiffGrader()
|
||||
case 'infinity_state':
|
||||
return new InfinityStateGrader()
|
||||
|
||||
// LLM-based benchmark graders
|
||||
case 'webvoyager_grader':
|
||||
if (!options?.apiKey) return null
|
||||
return new WebVoyagerGrader(
|
||||
@@ -107,10 +115,12 @@ export async function runGraders(
|
||||
|
||||
// Export grader classes for direct use
|
||||
export {
|
||||
AgisdkStateDiffGrader,
|
||||
FaraAlignmentGrader,
|
||||
FaraCombinedGrader,
|
||||
FaraMultimodalGrader,
|
||||
FaraRubricGrader,
|
||||
InfinityStateGrader,
|
||||
Mind2WebJudgeGrader,
|
||||
PerformanceGrader,
|
||||
WebVoyagerGrader,
|
||||
|
||||
@@ -11,6 +11,8 @@ export interface GraderInput {
|
||||
finalAnswer: string | null
|
||||
expectedAnswer?: string | null
|
||||
outputDir: string
|
||||
mcpUrl?: string
|
||||
infinityAppUrl?: string
|
||||
}
|
||||
|
||||
export interface Grader {
|
||||
|
||||
89
packages/browseros-agent/apps/eval/src/runner/infinity-app-manager.ts
vendored
Normal file
89
packages/browseros-agent/apps/eval/src/runner/infinity-app-manager.ts
vendored
Normal file
@@ -0,0 +1,89 @@
|
||||
/**
|
||||
* Manages WebArena-Infinity app server lifecycle per task.
|
||||
*
|
||||
* Each worker gets a unique port: base_port + worker_index.
|
||||
* Server is started fresh before each task and killed after,
|
||||
* guaranteeing clean state.
|
||||
*/
|
||||
|
||||
import { type ChildProcess, spawn } from 'node:child_process'
|
||||
import { join } from 'node:path'
|
||||
|
||||
export class InfinityAppManager {
  private proc: ChildProcess | null = null
  private port: number
  private infinityDir: string
  /** Spawn-level failure reported by the child's 'error' event, if any. */
  private spawnError: Error | null = null
  /** Last part of the server's stderr, kept for start-up error messages. */
  private stderrTail = ''

  /**
   * @param workerIndex Index of the worker; added to `basePort` to get this
   *   manager's port.
   * @param basePort Base port of the app (default 8000).
   */
  constructor(
    private workerIndex: number,
    private basePort: number = 8000,
  ) {
    this.port = basePort + workerIndex
    this.infinityDir = process.env.WEBARENA_INFINITY_DIR || ''
  }

  /**
   * Stops any server this manager started earlier, launches
   * `apps/<appName>/server.py` on this manager's port and resolves with the
   * server URL once it answers.
   *
   * @throws when `WEBARENA_INFINITY_DIR` was not set, the process cannot be
   *   spawned, it exits during start-up, or it never becomes ready. The
   *   server process is stopped before the error is rethrown.
   */
  async startApp(appName: string): Promise<string> {
    await this.stop()

    if (!this.infinityDir) {
      throw new Error('WEBARENA_INFINITY_DIR env var not set')
    }

    const appDir = join(this.infinityDir, 'apps', appName)
    const serverScript = join(appDir, 'server.py')
    this.spawnError = null
    this.stderrTail = ''

    const proc = spawn('python3', [serverScript, '--port', String(this.port)], {
      stdio: ['ignore', 'pipe', 'pipe'],
      cwd: appDir,
    })
    this.proc = proc

    // Without a listener, a spawn failure (e.g. python3 not installed) is an
    // unhandled 'error' event. Record it so waitForReady can report it.
    proc.once('error', (err) => {
      this.spawnError = err
    })

    // The pipes must be drained, otherwise a server that logs a lot blocks
    // once the pipe buffer is full. stdout is discarded; a short tail of
    // stderr is kept for diagnostics.
    proc.stdout?.resume()
    proc.stderr?.on('data', (chunk: Buffer) => {
      this.stderrTail = (this.stderrTail + chunk.toString()).slice(-2000)
    })

    // Wait for server to be ready
    const url = this.getUrl()
    try {
      await this.waitForReady(url)
    } catch (error) {
      // Do not leave a half-started server behind on failure.
      await this.stop()
      throw error
    }
    return url
  }

  /**
   * Terminates the server started by this manager, if any. Sends SIGTERM
   * and falls back to SIGKILL after 3 seconds.
   */
  async stop(): Promise<void> {
    const proc = this.proc
    if (!proc) return
    this.proc = null

    // Nothing to wait for if the process never started or already exited;
    // its 'exit' event would not fire again.
    const alreadyGone =
      proc.pid === undefined ||
      proc.exitCode !== null ||
      proc.signalCode !== null
    if (alreadyGone) return

    await new Promise<void>((resolve) => {
      const timeout = setTimeout(() => {
        proc.kill('SIGKILL')
        resolve()
      }, 3000)
      proc.once('exit', () => {
        clearTimeout(timeout)
        resolve()
      })
      proc.kill('SIGTERM')
    })
  }

  getPort(): number {
    return this.port
  }

  getUrl(): string {
    return `http://localhost:${this.port}`
  }

  /**
   * Polls `url` until it returns a successful response.
   *
   * @throws when the server process failed to spawn or exited, or when no
   *   successful response arrives within `maxAttempts` polls.
   */
  private async waitForReady(
    url: string,
    maxAttempts = 30,
    intervalMs = 500,
  ): Promise<void> {
    for (let i = 0; i < maxAttempts; i++) {
      // Fail fast instead of polling a server that is already dead.
      this.assertServerAlive()
      try {
        const resp = await fetch(url, {
          signal: AbortSignal.timeout(2000),
        })
        if (resp.ok) return
      } catch {
        // Server not ready yet
      }
      await new Promise((r) => setTimeout(r, intervalMs))
    }
    throw new Error(
      `Infinity app server not ready after ${maxAttempts * intervalMs}ms on port ${this.port}`,
    )
  }

  /** Throws when the server process could not be spawned or has exited. */
  private assertServerAlive(): void {
    if (this.spawnError) {
      throw new Error(
        `Failed to spawn Infinity app server: ${this.spawnError.message}`,
      )
    }
    const proc = this.proc
    if (proc && (proc.exitCode !== null || proc.signalCode !== null)) {
      const detail = this.stderrTail.trim()
      throw new Error(
        `Infinity app server exited during start-up (code ${proc.exitCode}, signal ${proc.signalCode}) on port ${this.port}${detail ? `: ${detail}` : ''}`,
      )
    }
  }
}
|
||||
@@ -9,6 +9,7 @@ import {
|
||||
import { runGraders } from '../graders/registry'
|
||||
import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
|
||||
import { callMcpTool } from '../utils/mcp-client'
|
||||
import { InfinityAppManager } from './infinity-app-manager'
|
||||
import type { GraderOptions, TaskResult } from './types'
|
||||
|
||||
// ============================================================================
|
||||
@@ -101,6 +102,36 @@ export class TaskExecutor {
|
||||
// Resolve page ID once — fresh browser has exactly one page
|
||||
const pageId = await this.resolveInitialPageId(mcpUrl)
|
||||
|
||||
// For Infinity tasks, start a fresh app server per task
|
||||
let infinityManager: InfinityAppManager | null = null
|
||||
let actualStartUrl = task.start_url
|
||||
|
||||
if (task.dataset === 'webarena-infinity') {
|
||||
const appName = (task.metadata?.additional as Record<string, unknown>)
|
||||
?.app_name as string
|
||||
const appBasePort =
|
||||
((task.metadata?.additional as Record<string, unknown>)
|
||||
?.app_base_port as number) || 8000
|
||||
const workerIndex = this.config.browseros.base_server_port - 9110 // derive from port offset
|
||||
|
||||
if (appName && process.env.WEBARENA_INFINITY_DIR) {
|
||||
infinityManager = new InfinityAppManager(workerIndex, appBasePort)
|
||||
try {
|
||||
actualStartUrl = await infinityManager.startApp(appName)
|
||||
console.log(
|
||||
` Infinity app "${appName}" started on port ${infinityManager.getPort()}`,
|
||||
)
|
||||
} catch (error) {
|
||||
throw new TaskExecutionError(
|
||||
`Failed to start Infinity app: ${error instanceof Error ? error.message : String(error)}`,
|
||||
task,
|
||||
'navigation',
|
||||
error instanceof Error ? error : undefined,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
// Phase 1: Set viewport + navigate to start URL
|
||||
try {
|
||||
@@ -114,10 +145,10 @@ export class TaskExecutor {
|
||||
)
|
||||
}
|
||||
|
||||
if (task.start_url && task.start_url !== 'about:blank') {
|
||||
if (actualStartUrl && actualStartUrl !== 'about:blank') {
|
||||
try {
|
||||
await callMcpTool(mcpUrl, 'navigate_page', {
|
||||
url: task.start_url,
|
||||
url: actualStartUrl,
|
||||
page: pageId,
|
||||
})
|
||||
} catch (error) {
|
||||
@@ -134,7 +165,11 @@ export class TaskExecutor {
|
||||
const agentResult = await this.executeAgent(task, pageId)
|
||||
|
||||
// Phase 3: Run graders
|
||||
const graderResults = await this.runGraders(task, agentResult)
|
||||
const graderResults = await this.runGraders(
|
||||
task,
|
||||
agentResult,
|
||||
infinityManager?.getUrl(),
|
||||
)
|
||||
|
||||
const status =
|
||||
agentResult.metadata.termination_reason === 'timeout'
|
||||
@@ -169,6 +204,11 @@ export class TaskExecutor {
|
||||
} catch {
|
||||
// Ignore cleanup errors
|
||||
}
|
||||
|
||||
// Stop Infinity app server if running
|
||||
if (infinityManager) {
|
||||
await infinityManager.stop().catch(() => {})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,6 +249,7 @@ export class TaskExecutor {
|
||||
private async runGraders(
|
||||
task: Task,
|
||||
agentResult: AgentResult,
|
||||
infinityAppUrl?: string,
|
||||
): Promise<Record<string, GraderResult>> {
|
||||
const configGraders = this.config.graders ?? []
|
||||
const taskGraders = task.graders ?? []
|
||||
@@ -234,6 +275,8 @@ export class TaskExecutor {
|
||||
expectedAnswer: (task.metadata?.additional as Record<string, unknown>)
|
||||
?.answer as string | undefined,
|
||||
outputDir: join(this.outputDir, task.query_id),
|
||||
mcpUrl: `${this.config.browseros.server_url}/mcp`,
|
||||
infinityAppUrl,
|
||||
},
|
||||
this.deps.graderOptions,
|
||||
)
|
||||
|
||||
@@ -100,6 +100,8 @@ export interface TaskResultSummary {
|
||||
// ============================================================================
|
||||
|
||||
export const PASS_FAIL_GRADER_ORDER = [
|
||||
'agisdk_state_diff',
|
||||
'infinity_state',
|
||||
'performance_grader',
|
||||
'webvoyager_grader',
|
||||
'fara_combined',
|
||||
|
||||
Reference in New Issue
Block a user