mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
- Add hover_at, type_at, drag_at coordinate tools to server - Add hoverAt, typeAt, dragAt methods to Browser class - Export server internals (browser, tool-loop, registry) for eval imports - Copy eval app from enterprise repo with agents, graders, runner, dashboard - Nest eval-targets inside apps/eval - Adapt sessionExecutionDir → workingDir for current server API - Add biome ignore for dashboard HTML to prevent lint breaking onclick handlers
69 lines
34 KiB
JSON
Vendored
69 lines
34 KiB
JSON
Vendored
{"query_id": "wb-1", "dataset": "webbench", "query": "Visit the store locator page to find out which Ace Hardware stores in California offer in-store pickup for online orders, then list the store names and addresses.", "start_url": "https://www.acehardware.com", "metadata": {"original_task_id": "wb-1", "website": "acehardware.com", "category": "READ", "additional": {"webbench_id": 1, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-74", "dataset": "webbench", "query": "Navigate to the \"National Park Guides\" section and identify three safety tips provided for hiking in Yosemite National Park.", "start_url": "https://www.alltrails.com", "metadata": {"original_task_id": "wb-74", "website": "alltrails.com", "category": "READ", "additional": {"webbench_id": 74, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-75", "dataset": "webbench", "query": "Look up the \"Appalachian Trail\" on AllTrails and extract key details such as length, rating, and difficulty from one of its segments.", "start_url": "https://www.alltrails.com", "metadata": {"original_task_id": "wb-75", "website": "alltrails.com", "category": "READ", "additional": {"webbench_id": 75, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-76", "dataset": "webbench", "query": "Use the search functionality to find trails in California with lengths between 5 and 10 miles, then list the top five trails sorted by user ratings.", "start_url": "https://www.alltrails.com", "metadata": {"original_task_id": "wb-76", "website": "alltrails.com", "category": "READ", "additional": {"webbench_id": 76, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-77", "dataset": "webbench", "query": "Search for the \"Bear Peak\" trail near Boulder, CO, then list the first three user reviews displayed on its page.", "start_url": "https://www.alltrails.com", "metadata": {"original_task_id": "wb-77", "website": "alltrails.com", "category": "READ", "additional": {"webbench_id": 77, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-78", "dataset": "webbench", "query": "Browse the \"National Park Guides\" section, select the Yosemite National Park guide, and extract three key highlights mentioned within it.", "start_url": "https://www.alltrails.com", "metadata": {"original_task_id": "wb-78", "website": "alltrails.com", "category": "READ", "additional": {"webbench_id": 78, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-210", "dataset": "webbench", "query": "Navigate to the video reviews section and record the titles of two video reviews found on the page.", "start_url": "https://www.caranddriver.com", "metadata": {"original_task_id": "wb-210", "website": "caranddriver.com", "category": "READ", "additional": {"webbench_id": 210, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-213", "dataset": "webbench", "query": "In the \"New Cars\" section, check the pricing for 2024 BMW 3 Series 330e and list at least two pricing components (e.g., MSRP, incentives) that are displayed.", "start_url": "https://www.cars.com", "metadata": {"original_task_id": "wb-213", "website": "cars.com", "category": "READ", "additional": {"webbench_id": 213, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-226", "dataset": "webbench", "query": "Use the search function to find archived CBS News articles about \"Climate Change\" from 2023 or earlier and summarize the key findings of one article.", "start_url": "https://www.cbsnews.com", "metadata": {"original_task_id": "wb-226", "website": "cbsnews.com", "category": "READ", "additional": {"webbench_id": 226, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-227", "dataset": "webbench", "query": "Identify an article that includes a real-time Twitter feed from a CBS correspondent and transcribe two recent tweets displayed.", "start_url": "https://www.cbsnews.com", "metadata": {"original_task_id": "wb-227", "website": "cbsnews.com", "category": "READ", "additional": {"webbench_id": 227, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-314", "dataset": "webbench", "query": "Search for companies categorized under \"Artificial Intelligence\" located in San Francisco and list the top 5 by funding amount.", "start_url": "https://www.crunchbase.com", "metadata": {"original_task_id": "wb-314", "website": "crunchbase.com", "category": "READ", "additional": {"webbench_id": 314, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-329", "dataset": "webbench", "query": "Browse the Crunchyroll homepage and list the featured anime series highlighted for the current season.", "start_url": "https://www.crunchyroll.com", "metadata": {"original_task_id": "wb-329", "website": "crunchyroll.com", "category": "READ", "additional": {"webbench_id": 329, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-330", "dataset": "webbench", "query": "Use the search function to look for \"Naruto\" and list the top 5 results along with their release years.", "start_url": "https://www.crunchyroll.com", "metadata": {"original_task_id": "wb-330", "website": "crunchyroll.com", "category": "READ", "additional": {"webbench_id": 330, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-331", "dataset": "webbench", "query": "Visit the news or blog section and summarize the details of an upcoming Crunchyroll event.", "start_url": "https://www.crunchyroll.com", "metadata": {"original_task_id": "wb-331", "website": "crunchyroll.com", "category": "READ", "additional": {"webbench_id": 331, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-332", "dataset": "webbench", "query": "Navigate to the details page of the anime \"One Piece\" and extract the available language and subtitle options.", "start_url": "https://www.crunchyroll.com", "metadata": {"original_task_id": "wb-332", "website": "crunchyroll.com", "category": "READ", "additional": {"webbench_id": 332, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-333", "dataset": "webbench", "query": "Check the \"Trending\" section on Crunchyroll and list the titles along with a brief description for each trending anime.", "start_url": "https://www.crunchyroll.com", "metadata": {"original_task_id": "wb-333", "website": "crunchyroll.com", "category": "READ", "additional": {"webbench_id": 333, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-371", "dataset": "webbench", "query": "Sort the products in the Women's Bags category by \"Price Low to High\" and list the top five items along with their prices.", "start_url": "https://www.dickssportinggoods.com", "metadata": {"original_task_id": "wb-371", "website": "dickssportinggoods.com", "category": "READ", "additional": {"webbench_id": 371, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-383", "dataset": "webbench", "query": "Browse the Dreamstime Public Domain collection and extract the names of all available image categories listed on the page.", "start_url": "https://www.dreamstime.com/", "metadata": {"original_task_id": "wb-383", "website": "dreamstime.com", "category": "READ", "additional": {"webbench_id": 383, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-384", "dataset": "webbench", "query": "Filter search results for \"business conference illustrations\" and list the preview watermarked thumbnail URLs for the first 3 assets.", "start_url": "https://www.dreamstime.com/", "metadata": {"original_task_id": "wb-384", "website": "dreamstime.com", "category": "READ", "additional": {"webbench_id": 384, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-385", "dataset": "webbench", "query": "Search for \"sunset beach\" images on Dreamstime, filter the results to show only photos, and list the top 5 image titles along with their licensing types.", "start_url": "https://www.dreamstime.com/", "metadata": {"original_task_id": "wb-385", "website": "dreamstime.com", "category": "READ", "additional": {"webbench_id": 385, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-454", "dataset": "webbench", "query": "Search for handmade wooden spoons and list the top 5 results.", "start_url": "https://www.etsy.com", "metadata": {"original_task_id": "wb-454", "website": "etsy.com", "category": "READ", "additional": {"webbench_id": 454, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-455", "dataset": "webbench", "query": "Add a personalized Mug to my Gifts collection. Make sure the Mug has over 2000 reviews and personalize it with the name 'Jerry'", "start_url": "https://www.etsy.com", "metadata": {"original_task_id": "wb-455", "website": "etsy.com", "category": "READ", "additional": {"webbench_id": 455, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-576", "dataset": "webbench", "query": "Browse the archive for articles published in the 1980s and list the publication years of the first 5 articles displayed.", "start_url": "https://fortune.com", "metadata": {"original_task_id": "wb-576", "website": "fortune.com", "category": "READ", "additional": {"webbench_id": 576, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-590", "dataset": "webbench", "query": "Navigate to the curated collection “Modern UI” and record the collection’s description and the featured design styles.", "start_url": "https://www.freepik.com", "metadata": {"original_task_id": "wb-590", "website": "freepik.com", "category": "READ", "additional": {"webbench_id": 590, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-689", "dataset": "webbench", "query": "Navigate to the \"Interviews\" section, search for interview experiences at \"Tesla\", and report the most frequently mentioned interview question.", "start_url": "https://www.glassdoor.com/index.htm", "metadata": {"original_task_id": "wb-689", "website": "glassdoor.com", "category": "READ", "additional": {"webbench_id": 689, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-850", "dataset": "webbench", "query": "Check if a \"Delivery Driver\" job in Orlando provides 401k and paid time off. If it does, add it to my save list.", "start_url": "http://indeed.com", "metadata": {"original_task_id": "wb-850", "website": "indeed.com", "category": "READ", "additional": {"webbench_id": 850, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-974", "dataset": "webbench", "query": "Use the in-store navigation tool by selecting a nearby Kroger store, then note the aisle locations for the dairy and bakery sections.", "start_url": "https://www.kroger.com", "metadata": {"original_task_id": "wb-974", "website": "kroger.com", "category": "READ", "additional": {"webbench_id": 974, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1030", "dataset": "webbench", "query": "Return the names of 4 people who work as analysts or associates in consulting roles in San Francisco, CA.", "start_url": "https://www.linkedin.com", "metadata": {"original_task_id": "wb-1030", "website": "linkedin.com", "category": "READ", "additional": {"webbench_id": 1030, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1100", "dataset": "webbench", "query": "Look up the promotional offers page and report the description and validity dates of the current weekend discount.", "start_url": "https://www.marriott.com/default.mi", "metadata": {"original_task_id": "wb-1100", "website": "marriott.com", "category": "READ", "additional": {"webbench_id": 1100, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1169", "dataset": "webbench", "query": "Search for an in‐depth review of the film \"Avatar: The Way of Water\" on Movieweb and provide a summary of the critic’s main points.", "start_url": "https://movieweb.com", "metadata": {"original_task_id": "wb-1169", "website": "movieweb.com", "category": "READ", "additional": {"webbench_id": 1169, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1268", "dataset": "webbench", "query": "Identify the five most recent articles in the Business section and determine which article has the most comments", "start_url": "https://www.nytimes.com", "metadata": {"original_task_id": "wb-1268", "website": "nytimes.com", "category": "READ", "additional": {"webbench_id": 1268, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1305", "dataset": "webbench", "query": "Search for updates on upcoming trainings for apprenticeship programs on Oregon.gov and list the training topics along with their registration deadlines.", "start_url": "https://www.oregon.gov/Pages/index.aspx", "metadata": {"original_task_id": "wb-1305", "website": "oregon.gov", "category": "READ", "additional": {"webbench_id": 1305, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1349", "dataset": "webbench", "query": "Search for the Pinterest help article on linking Instagram to your Pinterest profile and list the first three instructions provided.", "start_url": "https://www.pinterest.com", "metadata": {"original_task_id": "wb-1349", "website": "pinterest.com", "category": "READ", "additional": {"webbench_id": 1349, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1376", "dataset": "webbench", "query": "Navigate to the background page and download two images for free.", "start_url": "https://pngtree.com", "metadata": {"original_task_id": "wb-1376", "website": "pngtree.com", "category": "READ", "additional": {"webbench_id": 1376, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1384", "dataset": "webbench", "query": "Access the analytics dashboard for press release distribution and extract the media impressions for the press release titled \"New Product Launch.\"", "start_url": "https://www.prnewswire.com", "metadata": {"original_task_id": "wb-1384", "website": "prnewswire.com", "category": "READ", "additional": {"webbench_id": 1384, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1396", "dataset": "webbench", "query": "Ask a new question \"How will the rise of browser use agents impact the internet?", "start_url": "https://www.quora.com", "metadata": {"original_task_id": "wb-1396", "website": "quora.com", "category": "READ", "additional": {"webbench_id": 1396, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1397", "dataset": "webbench", "query": "Find how many upvotes the top answer for 'What is the best way to learn Python?' has.", "start_url": "https://www.quora.com", "metadata": {"original_task_id": "wb-1397", "website": "quora.com", "category": "READ", "additional": {"webbench_id": 1397, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1430", "dataset": "webbench", "query": "Go into settings, click on preferences, and turn on filters so the account cannot see mature content", "start_url": "https://www.reddit.com/?rdt=61899", "metadata": {"original_task_id": "wb-1430", "website": "reddit.com", "category": "READ", "additional": {"webbench_id": 1430, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1439", "dataset": "webbench", "query": "Navigate to the \"Utility Estimates\" section on the property detail page for \"Modern 2BR Apartment on Market St\" and report the listed utility cost estimates.", "start_url": "https://www.rent.com", "metadata": {"original_task_id": "wb-1439", "website": "rent.com", "category": "READ", "additional": {"webbench_id": 1439, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1537", "dataset": "webbench", "query": "Explore the \"Healthy Recipes\" section and provide a short summary of the \"Quinoa Salad with Citrus Dressing\" recipe, including its health benefits.", "start_url": "https://www.simplyrecipes.com", "metadata": {"original_task_id": "wb-1537", "website": "simplyrecipes.com", "category": "READ", "additional": {"webbench_id": 1537, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1542", "dataset": "webbench", "query": "Browse the Sports section and extract the schedule for upcoming Formula 1 and cricket events with their start times.", "start_url": "https://www.sky.com", "metadata": {"original_task_id": "wb-1542", "website": "sky.com", "category": "READ", "additional": {"webbench_id": 1542, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1562", "dataset": "webbench", "query": "Use the comparison tool to select two top basketball players, such as LeBron James and Kevin Durant, and list three performance metrics that are compared.", "start_url": "https://www.sofascore.com", "metadata": {"original_task_id": "wb-1562", "website": "sofascore.com", "category": "READ", "additional": {"webbench_id": 1562, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1726", "dataset": "webbench", "query": "Visit the Community Discussions page and identify the top three trending threads in the \"PC Gaming\" category.", "start_url": "https://store.steampowered.com", "metadata": {"original_task_id": "wb-1726", "website": "store.steampowered.com", "category": "READ", "additional": {"webbench_id": 1726, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1749", "dataset": "webbench", "query": "Open the “Classic Meatloaf” recipe page and extract its nutritional analysis information, including the calorie count per serving.", "start_url": "https://www.tasteofhome.com", "metadata": {"original_task_id": "wb-1749", "website": "tasteofhome.com", "category": "READ", "additional": {"webbench_id": 1749, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1906", "dataset": "webbench", "query": "Go to a well-known streamer’s channel (e.g., Shroud) and extract the current follower count along with the number of live viewers.", "start_url": "https://www.twitch.tv", "metadata": {"original_task_id": "wb-1906", "website": "twitch.tv", "category": "READ", "additional": {"webbench_id": 1906, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1907", "dataset": "webbench", "query": "Search for live streams featuring “Apex Legends” and list the usernames of the top three channels by viewer count.", "start_url": "https://www.twitch.tv", "metadata": {"original_task_id": "wb-1907", "website": "twitch.tv", "category": "READ", "additional": {"webbench_id": 1907, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1908", "dataset": "webbench", "query": "Go to the community events page, filter for upcoming charity streams, and list the names of at least three scheduled charity events.", "start_url": "https://www.twitch.tv", "metadata": {"original_task_id": "wb-1908", "website": "twitch.tv", "category": "READ", "additional": {"webbench_id": 1908, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1909", "dataset": "webbench", "query": "Visit the homepage and extract the titles of the top five live streams currently featured under the “Just Chatting” category.", "start_url": "https://www.twitch.tv", "metadata": {"original_task_id": "wb-1909", "website": "twitch.tv", "category": "READ", "additional": {"webbench_id": 1909, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1910", "dataset": "webbench", "query": "Search for the copyright guidelines section in the Twitch help center and list the main rules outlined for streamers.", "start_url": "https://www.twitch.tv", "metadata": {"original_task_id": "wb-1910", "website": "twitch.tv", "category": "READ", "additional": {"webbench_id": 1910, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2040", "dataset": "webbench", "query": "Search for family-friendly experiences in Orlando, FL, and list the top three tours along with their prices and customer ratings.", "start_url": "https://www.viator.com", "metadata": {"original_task_id": "wb-2040", "website": "viator.com", "category": "READ", "additional": {"webbench_id": 2040, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2041", "dataset": "webbench", "query": "Filter Viator’s search results for \"food tours\" in New York City and list the first five experiences that are under $100.", "start_url": "https://www.viator.com", "metadata": {"original_task_id": "wb-2041", "website": "viator.com", "category": "READ", "additional": {"webbench_id": 2041, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2053", "dataset": "webbench", "query": "Search for “driver’s license renewal” information and list the required documents mentioned on the process page.", "start_url": "https://wa.gov", "metadata": {"original_task_id": "wb-2053", "website": "wa.gov", "category": "READ", "additional": {"webbench_id": 2053, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2198", "dataset": "webbench", "query": "Archive the latest email from halluminate@yahoo.com", "start_url": "https://www.yahoo.com", "metadata": {"original_task_id": "wb-2198", "website": "yahoo.com", "category": "READ", "additional": {"webbench_id": 2198, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2200", "dataset": "webbench", "query": "Change the Yahoo homepage background color to green", "start_url": "https://www.yahoo.com", "metadata": {"original_task_id": "wb-2200", "website": "yahoo.com", "category": "READ", "additional": {"webbench_id": 2200, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2216", "dataset": "webbench", "query": "Check if Din Tai Fung NYC has a table for two this Friday at 7pm ET. If not, please find the next available Friday with seats close to 7pm ET.", "start_url": "https://www.yelp.com", "metadata": {"original_task_id": "wb-2216", "website": "yelp.com", "category": "READ", "additional": {"webbench_id": 2216, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2229", "dataset": "webbench", "query": "Retrieve the Zestimate® and full listing details (photos, description, square footage) for the property located at \"12345 Main St, San Francisco, CA.\"", "start_url": "https://www.zillow.com", "metadata": {"original_task_id": "wb-2229", "website": "zillow.com", "category": "READ", "additional": {"webbench_id": 2229, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2230", "dataset": "webbench", "query": "Filter search results for luxury properties in \"Beverly Hills, CA\" and list the key amenities (e.g., pool, number of bedrooms, outdoor space) of the top listing.", "start_url": "https://www.zillow.com", "metadata": {"original_task_id": "wb-2230", "website": "zillow.com", "category": "READ", "additional": {"webbench_id": 2230, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2268", "dataset": "webbench", "query": "Locate a question about French idioms, then compare and summarize the differences between AI-generated and human answers.", "start_url": "https://hinative.com", "metadata": {"original_task_id": "wb-2268", "website": "hinative.com", "category": "READ", "additional": {"webbench_id": 2268, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2292", "dataset": "webbench", "query": "Access your sales dashboard and read the latest monthly sales summary, noting the number of items sold, average price, and total earnings.", "start_url": "https://poshmark.com", "metadata": {"original_task_id": "wb-2292", "website": "poshmark.com", "category": "READ", "additional": {"webbench_id": 2292, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2347", "dataset": "webbench", "query": "Use the advanced search filters to find academic papers in \"Artificial Intelligence\" with over 100 citations and list the top 3 titles along with their citation counts.", "start_url": "https://www.academia.edu", "metadata": {"original_task_id": "wb-2347", "website": "academia.edu", "category": "READ", "additional": {"webbench_id": 2347, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2475", "dataset": "webbench", "query": "Play a track that is currently charting on Deezer, then view the lyrics display to note if the real-time translation option is available.", "start_url": "https://www.deezer.com/us", "metadata": {"original_task_id": "wb-2475", "website": "deezer.com", "category": "READ", "additional": {"webbench_id": 2475, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2560", "dataset": "webbench", "query": "Search for flights from San Francisco (SFO) to Tokyo (NRT) for travel dates December 1–10 and list the top 5 most affordable options, including both direct and one-stop itineraries.", "start_url": "https://www.kayak.com", "metadata": {"original_task_id": "wb-2560", "website": "kayak.com", "category": "READ", "additional": {"webbench_id": 2560, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2561", "dataset": "webbench", "query": "Look up the current hotel price trends in Rome, Italy, for a stay during the first week of October and provide the lowest forecasted rate.", "start_url": "https://www.kayak.com", "metadata": {"original_task_id": "wb-2561", "website": "kayak.com", "category": "READ", "additional": {"webbench_id": 2561, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2572", "dataset": "webbench", "query": "Search for hotel deals in Goa for a 3‑night stay starting on 25th May 2024 and note the names of hotels with 4‑star ratings.", "start_url": "https://www.makemytrip.global/?cc=am&redirectedBy=gl", "metadata": {"original_task_id": "wb-2572", "website": "makemytrip.global", "category": "READ", "additional": {"webbench_id": 2572, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2593", "dataset": "webbench", "query": "Use the search function to find articles about \"local festivals\" and extract the publication dates of the first 5 results.", "start_url": "https://www.newsbreak.com", "metadata": {"original_task_id": "wb-2593", "website": "newsbreak.com", "category": "READ", "additional": {"webbench_id": 2593, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2648", "dataset": "webbench", "query": "Search for photos of “modern office interiors” and filter the results by vertical orientation; then list the titles of the top 5 images.", "start_url": "https://www.shutterstock.com", "metadata": {"original_task_id": "wb-2648", "website": "shutterstock.com", "category": "READ", "additional": {"webbench_id": 2648, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2657", "dataset": "webbench", "query": "Using Temu’s search bar, search for “wireless earbuds”, sort the results by lowest price, and list the names and prices of the first five products.", "start_url": "https://www.temu.com", "metadata": {"original_task_id": "wb-2657", "website": "temu.com", "category": "READ", "additional": {"webbench_id": 2657, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2659", "dataset": "webbench", "query": "Locate the section under ”Customer Service\" called \"Return and refund policy\" and extract the main steps needed to initiate a product return—provide a summary of at least three key steps.", "start_url": "https://www.temu.com", "metadata": {"original_task_id": "wb-2659", "website": "temu.com", "category": "READ", "additional": {"webbench_id": 2659, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|