mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
- Add hover_at, type_at, drag_at coordinate tools to server - Add hoverAt, typeAt, dragAt methods to Browser class - Export server internals (browser, tool-loop, registry) for eval imports - Copy eval app from enterprise repo with agents, graders, runner, dashboard - Nest eval-targets inside apps/eval - Adapt sessionExecutionDir → workingDir for current server API - Add biome ignore for dashboard HTML to prevent lint breaking onclick handlers
51 lines
25 KiB
JSON
51 lines
25 KiB
JSON
{"query_id": "wb-850", "dataset": "webbench", "query": "Check if a \"Delivery Driver\" job in Orlando provides 401k and paid time off. If it does, add it to my save list.", "start_url": "http://indeed.com", "metadata": {"original_task_id": "wb-850", "website": "indeed.com", "category": "READ", "additional": {"webbench_id": 850, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-689", "dataset": "webbench", "query": "Navigate to the \"Interviews\" section, search for interview experiences at \"Tesla\", and report the most frequently mentioned interview question.", "start_url": "https://www.glassdoor.com/index.htm", "metadata": {"original_task_id": "wb-689", "website": "glassdoor.com", "category": "READ", "additional": {"webbench_id": 689, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1169", "dataset": "webbench", "query": "Search for an in‐depth review of the film \"Avatar: The Way of Water\" on Movieweb and provide a summary of the critic’s main points.", "start_url": "https://movieweb.com", "metadata": {"original_task_id": "wb-1169", "website": "movieweb.com", "category": "READ", "additional": {"webbench_id": 1169, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-314", "dataset": "webbench", "query": "Search for companies categorized under \"Artificial Intelligence\" located in San Francisco and list the top 5 by funding amount.", "start_url": "https://www.crunchbase.com", "metadata": {"original_task_id": "wb-314", "website": "crunchbase.com", "category": "READ", "additional": {"webbench_id": 314, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1537", "dataset": "webbench", "query": "Explore the \"Healthy Recipes\" section and provide a short summary of the \"Quinoa Salad with Citrus Dressing\" recipe, including its health benefits.", "start_url": "https://www.simplyrecipes.com", "metadata": {"original_task_id": "wb-1537", "website": "simplyrecipes.com", "category": "READ", "additional": {"webbench_id": 1537, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2572", "dataset": "webbench", "query": "Search for hotel deals in Goa for a 3‑night stay starting on 25th May 2024 and note the names of hotels with 4‑star ratings.", "start_url": "https://www.makemytrip.global/?cc=am&redirectedBy=gl", "metadata": {"original_task_id": "wb-2572", "website": "makemytrip.global", "category": "READ", "additional": {"webbench_id": 2572, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-333", "dataset": "webbench", "query": "Check the \"Trending\" section on Crunchyroll and list the titles along with a brief description for each trending anime.", "start_url": "https://www.crunchyroll.com", "metadata": {"original_task_id": "wb-333", "website": "crunchyroll.com", "category": "READ", "additional": {"webbench_id": 333, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1030", "dataset": "webbench", "query": "Return the names of 4 people who work as analysts or associates in consulting roles in San Francisco, CA.", "start_url": "https://www.linkedin.com", "metadata": {"original_task_id": "wb-1030", "website": "linkedin.com", "category": "READ", "additional": {"webbench_id": 1030, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1376", "dataset": "webbench", "query": "Navigate to the background page and download two images for free.", "start_url": "https://pngtree.com", "metadata": {"original_task_id": "wb-1376", "website": "pngtree.com", "category": "READ", "additional": {"webbench_id": 1376, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2268", "dataset": "webbench", "query": "Locate a question about French idioms, then compare and summarize the differences between AI-generated and human answers.", "start_url": "https://hinative.com", "metadata": {"original_task_id": "wb-2268", "website": "hinative.com", "category": "READ", "additional": {"webbench_id": 2268, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-332", "dataset": "webbench", "query": "Navigate to the details page of the anime \"One Piece\" and extract the available language and subtitle options.", "start_url": "https://www.crunchyroll.com", "metadata": {"original_task_id": "wb-332", "website": "crunchyroll.com", "category": "READ", "additional": {"webbench_id": 332, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1268", "dataset": "webbench", "query": "Identify the five most recent articles in the Business section and determine which article has the most comments", "start_url": "https://www.nytimes.com", "metadata": {"original_task_id": "wb-1268", "website": "nytimes.com", "category": "READ", "additional": {"webbench_id": 1268, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1542", "dataset": "webbench", "query": "Browse the Sports section and extract the schedule for upcoming Formula 1 and cricket events with their start times.", "start_url": "https://www.sky.com", "metadata": {"original_task_id": "wb-1542", "website": "sky.com", "category": "READ", "additional": {"webbench_id": 1542, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1907", "dataset": "webbench", "query": "Search for live streams featuring “Apex Legends” and list the usernames of the top three channels by viewer count.", "start_url": "https://www.twitch.tv", "metadata": {"original_task_id": "wb-1907", "website": "twitch.tv", "category": "READ", "additional": {"webbench_id": 1907, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1906", "dataset": "webbench", "query": "Go to a well-known streamer’s channel (e.g., Shroud) and extract the current follower count along with the number of live viewers.", "start_url": "https://www.twitch.tv", "metadata": {"original_task_id": "wb-1906", "website": "twitch.tv", "category": "READ", "additional": {"webbench_id": 1906, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1", "dataset": "webbench", "query": "Visit the store locator page to find out which Ace Hardware stores in California offer in-store pickup for online orders, then list the store names and addresses.", "start_url": "https://www.acehardware.com", "metadata": {"original_task_id": "wb-1", "website": "acehardware.com", "category": "READ", "additional": {"webbench_id": 1, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-383", "dataset": "webbench", "query": "Browse the Dreamstime Public Domain collection and extract the names of all available image categories listed on the page.", "start_url": "https://www.dreamstime.com/", "metadata": {"original_task_id": "wb-383", "website": "dreamstime.com", "category": "READ", "additional": {"webbench_id": 383, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2657", "dataset": "webbench", "query": "Using Temu’s search bar, search for “wireless earbuds”, sort the results by lowest price, and list the names and prices of the first five products.", "start_url": "https://www.temu.com", "metadata": {"original_task_id": "wb-2657", "website": "temu.com", "category": "READ", "additional": {"webbench_id": 2657, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-78", "dataset": "webbench", "query": "Browse the \"National Park Guides\" section, select the Yosemite National Park guide, and extract three key highlights mentioned within it.", "start_url": "https://www.alltrails.com", "metadata": {"original_task_id": "wb-78", "website": "alltrails.com", "category": "READ", "additional": {"webbench_id": 78, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2560", "dataset": "webbench", "query": "Search for flights from San Francisco (SFO) to Tokyo (NRT) for travel dates December 1–10 and list the top 5 most affordable options, including both direct and one-stop itineraries.", "start_url": "https://www.kayak.com", "metadata": {"original_task_id": "wb-2560", "website": "kayak.com", "category": "READ", "additional": {"webbench_id": 2560, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2053", "dataset": "webbench", "query": "Search for “driver’s license renewal” information and list the required documents mentioned on the process page.", "start_url": "https://wa.gov", "metadata": {"original_task_id": "wb-2053", "website": "wa.gov", "category": "READ", "additional": {"webbench_id": 2053, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2561", "dataset": "webbench", "query": "Look up the current hotel price trends in Rome, Italy, for a stay during the first week of October and provide the lowest forecasted rate.", "start_url": "https://www.kayak.com", "metadata": {"original_task_id": "wb-2561", "website": "kayak.com", "category": "READ", "additional": {"webbench_id": 2561, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-210", "dataset": "webbench", "query": "Navigate to the video reviews section and record the titles of two video reviews found on the page.", "start_url": "https://www.caranddriver.com", "metadata": {"original_task_id": "wb-210", "website": "caranddriver.com", "category": "READ", "additional": {"webbench_id": 210, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-455", "dataset": "webbench", "query": "Add a personalized Mug to my Gifts collection. Make sure the Mug has over 2000 reviews and personalize it with the name 'Jerry'", "start_url": "https://www.etsy.com", "metadata": {"original_task_id": "wb-455", "website": "etsy.com", "category": "READ", "additional": {"webbench_id": 455, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2292", "dataset": "webbench", "query": "Access your sales dashboard and read the latest monthly sales summary, noting the number of items sold, average price, and total earnings.", "start_url": "https://poshmark.com", "metadata": {"original_task_id": "wb-2292", "website": "poshmark.com", "category": "READ", "additional": {"webbench_id": 2292, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2216", "dataset": "webbench", "query": "Check if Din Tai Fung NYC has a table for two this Friday at 7pm ET. If not, please find the next available Friday with seats close to 7pm ET.", "start_url": "https://www.yelp.com", "metadata": {"original_task_id": "wb-2216", "website": "yelp.com", "category": "READ", "additional": {"webbench_id": 2216, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1397", "dataset": "webbench", "query": "Find how many upvotes the top answer for 'What is the best way to learn Python?' has.", "start_url": "https://www.quora.com", "metadata": {"original_task_id": "wb-1397", "website": "quora.com", "category": "READ", "additional": {"webbench_id": 1397, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1384", "dataset": "webbench", "query": "Access the analytics dashboard for press release distribution and extract the media impressions for the press release titled \"New Product Launch.\"", "start_url": "https://www.prnewswire.com", "metadata": {"original_task_id": "wb-1384", "website": "prnewswire.com", "category": "READ", "additional": {"webbench_id": 1384, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-371", "dataset": "webbench", "query": "Sort the products in the Women's Bags category by \"Price Low to High\" and list the top five items along with their prices.", "start_url": "https://www.dickssportinggoods.com", "metadata": {"original_task_id": "wb-371", "website": "dickssportinggoods.com", "category": "READ", "additional": {"webbench_id": 371, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2041", "dataset": "webbench", "query": "Filter Viator’s search results for \"food tours\" in New York City and list the first five experiences that are under $100.", "start_url": "https://www.viator.com", "metadata": {"original_task_id": "wb-2041", "website": "viator.com", "category": "READ", "additional": {"webbench_id": 2041, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1749", "dataset": "webbench", "query": "Open the “Classic Meatloaf” recipe page and extract its nutritional analysis information, including the calorie count per serving.", "start_url": "https://www.tasteofhome.com", "metadata": {"original_task_id": "wb-1749", "website": "tasteofhome.com", "category": "READ", "additional": {"webbench_id": 1749, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1305", "dataset": "webbench", "query": "Search for updates on upcoming trainings for apprenticeship programs on Oregon.gov and list the training topics along with their registration deadlines.", "start_url": "https://www.oregon.gov/Pages/index.aspx", "metadata": {"original_task_id": "wb-1305", "website": "oregon.gov", "category": "READ", "additional": {"webbench_id": 1305, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2229", "dataset": "webbench", "query": "Retrieve the Zestimate® and full listing details (photos, description, square footage) for the property located at \"12345 Main St, San Francisco, CA.\"", "start_url": "https://www.zillow.com", "metadata": {"original_task_id": "wb-2229", "website": "zillow.com", "category": "READ", "additional": {"webbench_id": 2229, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-226", "dataset": "webbench", "query": "Use the search function to find archived CBS News articles about \"Climate Change\" from 2023 or earlier and summarize the key findings of one article.", "start_url": "https://www.cbsnews.com", "metadata": {"original_task_id": "wb-226", "website": "cbsnews.com", "category": "READ", "additional": {"webbench_id": 226, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2659", "dataset": "webbench", "query": "Locate the section under \"Customer Service\" called \"Return and refund policy\" and extract the main steps needed to initiate a product return—provide a summary of at least three key steps.", "start_url": "https://www.temu.com", "metadata": {"original_task_id": "wb-2659", "website": "temu.com", "category": "READ", "additional": {"webbench_id": 2659, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-227", "dataset": "webbench", "query": "Identify an article that includes a real-time Twitter feed from a CBS correspondent and transcribe two recent tweets displayed.", "start_url": "https://www.cbsnews.com", "metadata": {"original_task_id": "wb-227", "website": "cbsnews.com", "category": "READ", "additional": {"webbench_id": 227, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1562", "dataset": "webbench", "query": "Use the comparison tool to select two top basketball players, such as LeBron James and Kevin Durant, and list three performance metrics that are compared.", "start_url": "https://www.sofascore.com", "metadata": {"original_task_id": "wb-1562", "website": "sofascore.com", "category": "READ", "additional": {"webbench_id": 1562, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2347", "dataset": "webbench", "query": "Use the advanced search filters to find academic papers in \"Artificial Intelligence\" with over 100 citations and list the top 3 titles along with their citation counts.", "start_url": "https://www.academia.edu", "metadata": {"original_task_id": "wb-2347", "website": "academia.edu", "category": "READ", "additional": {"webbench_id": 2347, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1726", "dataset": "webbench", "query": "Visit the Community Discussions page and identify the top three trending threads in the \"PC Gaming\" category.", "start_url": "https://store.steampowered.com", "metadata": {"original_task_id": "wb-1726", "website": "store.steampowered.com", "category": "READ", "additional": {"webbench_id": 1726, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2200", "dataset": "webbench", "query": "Change the Yahoo homepage background color to green", "start_url": "https://www.yahoo.com", "metadata": {"original_task_id": "wb-2200", "website": "yahoo.com", "category": "READ", "additional": {"webbench_id": 2200, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1430", "dataset": "webbench", "query": "Go into settings, click on preferences, and turn on filters so the account cannot see mature content", "start_url": "https://www.reddit.com/?rdt=61899", "metadata": {"original_task_id": "wb-1430", "website": "reddit.com", "category": "READ", "additional": {"webbench_id": 1430, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-76", "dataset": "webbench", "query": "Use the search functionality to find trails in California with lengths between 5 and 10 miles, then list the top five trails sorted by user ratings.", "start_url": "https://www.alltrails.com", "metadata": {"original_task_id": "wb-76", "website": "alltrails.com", "category": "READ", "additional": {"webbench_id": 76, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1396", "dataset": "webbench", "query": "Ask a new question \"How will the rise of browser use agents impact the internet?\"", "start_url": "https://www.quora.com", "metadata": {"original_task_id": "wb-1396", "website": "quora.com", "category": "READ", "additional": {"webbench_id": 1396, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1100", "dataset": "webbench", "query": "Look up the promotional offers page and report the description and validity dates of the current weekend discount.", "start_url": "https://www.marriott.com/default.mi", "metadata": {"original_task_id": "wb-1100", "website": "marriott.com", "category": "READ", "additional": {"webbench_id": 1100, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1349", "dataset": "webbench", "query": "Search for the Pinterest help article on linking Instagram to your Pinterest profile and list the first three instructions provided.", "start_url": "https://www.pinterest.com", "metadata": {"original_task_id": "wb-1349", "website": "pinterest.com", "category": "READ", "additional": {"webbench_id": 1349, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2198", "dataset": "webbench", "query": "Archive the latest email from halluminate@yahoo.com", "start_url": "https://www.yahoo.com", "metadata": {"original_task_id": "wb-2198", "website": "yahoo.com", "category": "READ", "additional": {"webbench_id": 2198, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2475", "dataset": "webbench", "query": "Play a track that is currently charting on Deezer, then view the lyrics display to note if the real-time translation option is available.", "start_url": "https://www.deezer.com/us", "metadata": {"original_task_id": "wb-2475", "website": "deezer.com", "category": "READ", "additional": {"webbench_id": 2475, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-974", "dataset": "webbench", "query": "Use the in-store navigation tool by selecting a nearby Kroger store, then note the aisle locations for the dairy and bakery sections.", "start_url": "https://www.kroger.com", "metadata": {"original_task_id": "wb-974", "website": "kroger.com", "category": "READ", "additional": {"webbench_id": 974, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-576", "dataset": "webbench", "query": "Browse the archive for articles published in the 1980s and list the publication years of the first 5 articles displayed.", "start_url": "https://fortune.com", "metadata": {"original_task_id": "wb-576", "website": "fortune.com", "category": "READ", "additional": {"webbench_id": 576, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2648", "dataset": "webbench", "query": "Search for photos of “modern office interiors” and filter the results by vertical orientation; then list the titles of the top 5 images.", "start_url": "https://www.shutterstock.com", "metadata": {"original_task_id": "wb-2648", "website": "shutterstock.com", "category": "READ", "additional": {"webbench_id": 2648, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|