Files
BrowserOS/packages/browseros-agent/apps/eval/data/mind2web_e2e_test.jsonl
shivammittal274 29056226bb feat: add eval framework and coordinate-based input tools (#453)
- Add hover_at, type_at, drag_at coordinate tools to server
- Add hoverAt, typeAt, dragAt methods to Browser class
- Export server internals (browser, tool-loop, registry) for eval imports
- Copy eval app from enterprise repo with agents, graders, runner, dashboard
- Nest eval-targets inside apps/eval
- Adapt sessionExecutionDir → workingDir for current server API
- Add biome ignore for dashboard HTML to prevent lint breaking onclick handlers
2026-03-16 23:12:23 +05:30

11 lines
4.4 KiB
JSON

{"query_id": "87f4c5128e36cdb9366a138a7b61bb00", "dataset": "online-mind2web", "query": "View the speakers that are bluetooth and wireless and filter the results to only show models that are on sale and cost less than $50.", "graders": ["mind2web_judge", "fara_combined"], "start_url": "https://www.bestbuy.com/", "metadata": {"original_task_id": "87f4c5128e36cdb9366a138a7b61bb00", "website": "https://www.bestbuy.com/", "category": "medium", "additional": {"level": "medium", "reference_length": 6}}}
{"query_id": "cfafe3771369d1d261e9f7ecd44c296d", "dataset": "online-mind2web", "query": "Find the highest-rated dealer for Cadillac with a rating above 4 stars within 20 miles of zip 60606.", "graders": ["mind2web_judge", "fara_combined"], "start_url": "https://www.cars.com/", "metadata": {"original_task_id": "cfafe3771369d1d261e9f7ecd44c296d", "website": "https://www.cars.com/", "category": "medium", "additional": {"level": "medium", "reference_length": 6}}}
{"query_id": "816851ff92ff0219acf4364dcc2c4692", "dataset": "online-mind2web", "query": "Search for boys' infant pajamas below $40.", "graders": ["mind2web_judge", "fara_combined"], "start_url": "https://www.macys.com/", "metadata": {"original_task_id": "816851ff92ff0219acf4364dcc2c4692", "website": "https://www.macys.com/", "category": "medium", "additional": {"level": "medium", "reference_length": 10}}}
{"query_id": "905cb53061c33aa2d77e485fe1fca516", "dataset": "online-mind2web", "query": "Browse dermatologists within 10 miles of zip code 10019 and filter by only those who accept Blue Medicare Advantage.", "graders": ["mind2web_judge", "fara_combined"], "start_url": "https://www.healthgrades.com/", "metadata": {"original_task_id": "905cb53061c33aa2d77e485fe1fca516", "website": "https://www.healthgrades.com/", "category": "hard", "additional": {"level": "hard", "reference_length": 11}}}
{"query_id": "bbbc243b4f18a7a897f0bc84e11d293f", "dataset": "online-mind2web", "query": "Find out how many assists Chris Paul has been averaging in the current season.", "graders": ["mind2web_judge", "fara_combined"], "start_url": "https://www.nba.com/", "metadata": {"original_task_id": "bbbc243b4f18a7a897f0bc84e11d293f", "website": "https://www.nba.com/", "category": "easy", "additional": {"level": "easy", "reference_length": 4}}}
{"query_id": "d71be72aa25c3eab8eea47a0e60382e2", "dataset": "online-mind2web", "query": "Find technical specs for the latest Macbook Air on Apple.", "graders": ["mind2web_judge", "fara_combined"], "start_url": "https://www.apple.com/", "metadata": {"original_task_id": "d71be72aa25c3eab8eea47a0e60382e2", "website": "https://www.apple.com/", "category": "easy", "additional": {"level": "easy", "reference_length": 4}}}
{"query_id": "3c1ffc3f494e423b3c434c79e35da8f3", "dataset": "online-mind2web", "query": "Find 12 Monkeys community and view the latest posts mentioning James Cole.", "graders": ["mind2web_judge", "fara_combined"], "start_url": "https://www.reddit.com/", "metadata": {"original_task_id": "3c1ffc3f494e423b3c434c79e35da8f3", "website": "https://www.reddit.com/", "category": "medium", "additional": {"level": "medium", "reference_length": 6}}}
{"query_id": "608c595eec271fa5dc03506923519994", "dataset": "online-mind2web", "query": "Calculate a FedEx Ground shipping rate for a 3-pound package from zip code 10019 to zip code 90028.", "graders": ["mind2web_judge", "fara_combined"], "start_url": "https://www.fedex.com/en-us/home.html", "metadata": {"original_task_id": "608c595eec271fa5dc03506923519994", "website": "https://www.fedex.com/en-us/home.html", "category": "medium", "additional": {"level": "medium", "reference_length": 9}}}
{"query_id": "a7a73c8fa75441fc76df9746c327bdd6", "dataset": "online-mind2web", "query": "Estimate the cost of a photographer in 07055 for a 4-hour project.", "graders": ["mind2web_judge", "fara_combined"], "start_url": "https://www.thumbtack.com/", "metadata": {"original_task_id": "a7a73c8fa75441fc76df9746c327bdd6", "website": "https://www.thumbtack.com/", "category": "medium", "additional": {"level": "medium", "reference_length": 8}}}
{"query_id": "56f8890a837c49f7df766b9c981646f3", "dataset": "online-mind2web", "query": "Show crazy credits for the movie \" Prometheus\" on IMDb.", "graders": ["mind2web_judge", "fara_combined"], "start_url": "https://www.imdb.com/", "metadata": {"original_task_id": "56f8890a837c49f7df766b9c981646f3", "website": "https://www.imdb.com/", "category": "medium", "additional": {"level": "medium", "reference_length": 6}}}