Files
BrowserOS/packages/browseros-agent/apps/eval/data/webbench-0of4-50.jsonl
shivammittal274 29056226bb feat: add eval framework and coordinate-based input tools (#453)
- Add hover_at, type_at, drag_at coordinate tools to server
- Add hoverAt, typeAt, dragAt methods to Browser class
- Export server internals (browser, tool-loop, registry) for eval imports
- Copy eval app from enterprise repo with agents, graders, runner, dashboard
- Nest eval-targets inside apps/eval
- Adapt sessionExecutionDir → workingDir for current server API
- Add biome ignore for dashboard HTML to prevent lint breaking onclick handlers
2026-03-16 23:12:23 +05:30

51 lines
25 KiB
JSON
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{"query_id": "wb-850", "dataset": "webbench", "query": "Check if a \"Delivery Driver\" job in Orlando provides 401k and paid time off. If it does, add it to my save list.", "start_url": "http://indeed.com", "metadata": {"original_task_id": "wb-850", "website": "indeed.com", "category": "READ", "additional": {"webbench_id": 850, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-689", "dataset": "webbench", "query": "Navigate to the \"Interviews\" section, search for interview experiences at \"Tesla\", and report the most frequently mentioned interview question.", "start_url": "https://www.glassdoor.com/index.htm", "metadata": {"original_task_id": "wb-689", "website": "glassdoor.com", "category": "READ", "additional": {"webbench_id": 689, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1169", "dataset": "webbench", "query": "Search for an in-depth review of the film \"Avatar: The Way of Water\" on Movieweb and provide a summary of the critic's main points.", "start_url": "https://movieweb.com", "metadata": {"original_task_id": "wb-1169", "website": "movieweb.com", "category": "READ", "additional": {"webbench_id": 1169, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-314", "dataset": "webbench", "query": "Search for companies categorized under \"Artificial Intelligence\" located in San Francisco and list the top 5 by funding amount.", "start_url": "https://www.crunchbase.com", "metadata": {"original_task_id": "wb-314", "website": "crunchbase.com", "category": "READ", "additional": {"webbench_id": 314, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1537", "dataset": "webbench", "query": "Explore the \"Healthy Recipes\" section and provide a short summary of the \"Quinoa Salad with Citrus Dressing\" recipe, including its health benefits.", "start_url": "https://www.simplyrecipes.com", "metadata": {"original_task_id": "wb-1537", "website": "simplyrecipes.com", "category": "READ", "additional": {"webbench_id": 1537, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2572", "dataset": "webbench", "query": "Search for hotel deals in Goa for a 3-night stay starting on 25th May 2024 and note the names of hotels with 4-star ratings.", "start_url": "https://www.makemytrip.global/?cc=am&redirectedBy=gl", "metadata": {"original_task_id": "wb-2572", "website": "makemytrip.global", "category": "READ", "additional": {"webbench_id": 2572, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-333", "dataset": "webbench", "query": "Check the \"Trending\" section on Crunchyroll and list the titles along with a brief description for each trending anime.", "start_url": "https://www.crunchyroll.com", "metadata": {"original_task_id": "wb-333", "website": "crunchyroll.com", "category": "READ", "additional": {"webbench_id": 333, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1030", "dataset": "webbench", "query": "Return the names of 4 people who work as analysts or associates in consulting roles in San Francisco, CA.", "start_url": "https://www.linkedin.com", "metadata": {"original_task_id": "wb-1030", "website": "linkedin.com", "category": "READ", "additional": {"webbench_id": 1030, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1376", "dataset": "webbench", "query": "Navigate to the background page and download two images for free.", "start_url": "https://pngtree.com", "metadata": {"original_task_id": "wb-1376", "website": "pngtree.com", "category": "READ", "additional": {"webbench_id": 1376, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2268", "dataset": "webbench", "query": "Locate a question about French idioms, then compare and summarize the differences between AI-generated and human answers.", "start_url": "https://hinative.com", "metadata": {"original_task_id": "wb-2268", "website": "hinative.com", "category": "READ", "additional": {"webbench_id": 2268, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-332", "dataset": "webbench", "query": "Navigate to the details page of the anime \"One Piece\" and extract the available language and subtitle options.", "start_url": "https://www.crunchyroll.com", "metadata": {"original_task_id": "wb-332", "website": "crunchyroll.com", "category": "READ", "additional": {"webbench_id": 332, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1268", "dataset": "webbench", "query": "Identify the five most recent articles in the Business section and determine which article has the most comments", "start_url": "https://www.nytimes.com", "metadata": {"original_task_id": "wb-1268", "website": "nytimes.com", "category": "READ", "additional": {"webbench_id": 1268, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1542", "dataset": "webbench", "query": "Browse the Sports section and extract the schedule for upcoming Formula 1 and cricket events with their start times.", "start_url": "https://www.sky.com", "metadata": {"original_task_id": "wb-1542", "website": "sky.com", "category": "READ", "additional": {"webbench_id": 1542, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1907", "dataset": "webbench", "query": "Search for live streams featuring “Apex Legends” and list the usernames of the top three channels by viewer count.", "start_url": "https://www.twitch.tv", "metadata": {"original_task_id": "wb-1907", "website": "twitch.tv", "category": "READ", "additional": {"webbench_id": 1907, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1906", "dataset": "webbench", "query": "Go to a well-known streamer's channel (e.g., Shroud) and extract the current follower count along with the number of live viewers.", "start_url": "https://www.twitch.tv", "metadata": {"original_task_id": "wb-1906", "website": "twitch.tv", "category": "READ", "additional": {"webbench_id": 1906, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1", "dataset": "webbench", "query": "Visit the store locator page to find out which Ace Hardware stores in California offer in-store pickup for online orders, then list the store names and addresses.", "start_url": "https://www.acehardware.com", "metadata": {"original_task_id": "wb-1", "website": "acehardware.com", "category": "READ", "additional": {"webbench_id": 1, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-383", "dataset": "webbench", "query": "Browse the Dreamstime Public Domain collection and extract the names of all available image categories listed on the page.", "start_url": "https://www.dreamstime.com/", "metadata": {"original_task_id": "wb-383", "website": "dreamstime.com", "category": "READ", "additional": {"webbench_id": 383, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2657", "dataset": "webbench", "query": "Using Temu's search bar, search for “wireless earbuds”, sort the results by lowest price, and list the names and prices of the first five products.", "start_url": "https://www.temu.com", "metadata": {"original_task_id": "wb-2657", "website": "temu.com", "category": "READ", "additional": {"webbench_id": 2657, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-78", "dataset": "webbench", "query": "Browse the \"National Park Guides\" section, select the Yosemite National Park guide, and extract three key highlights mentioned within it.", "start_url": "https://www.alltrails.com", "metadata": {"original_task_id": "wb-78", "website": "alltrails.com", "category": "READ", "additional": {"webbench_id": 78, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2560", "dataset": "webbench", "query": "Search for flights from San Francisco (SFO) to Tokyo (NRT) for travel dates December 1-10 and list the top 5 most affordable options, including both direct and one-stop itineraries.", "start_url": "https://www.kayak.com", "metadata": {"original_task_id": "wb-2560", "website": "kayak.com", "category": "READ", "additional": {"webbench_id": 2560, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2053", "dataset": "webbench", "query": "Search for “driver's license renewal” information and list the required documents mentioned on the process page.", "start_url": "https://wa.gov", "metadata": {"original_task_id": "wb-2053", "website": "wa.gov", "category": "READ", "additional": {"webbench_id": 2053, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2561", "dataset": "webbench", "query": "Look up the current hotel price trends in Rome, Italy, for a stay during the first week of October and provide the lowest forecasted rate.", "start_url": "https://www.kayak.com", "metadata": {"original_task_id": "wb-2561", "website": "kayak.com", "category": "READ", "additional": {"webbench_id": 2561, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-210", "dataset": "webbench", "query": "Navigate to the video reviews section and record the titles of two video reviews found on the page.", "start_url": "https://www.caranddriver.com", "metadata": {"original_task_id": "wb-210", "website": "caranddriver.com", "category": "READ", "additional": {"webbench_id": 210, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-455", "dataset": "webbench", "query": "Add a personalized Mug to my Gifts collection. Make sure the Mug has over 2000 reviews and personalize it with the name 'Jerry'", "start_url": "https://www.etsy.com", "metadata": {"original_task_id": "wb-455", "website": "etsy.com", "category": "READ", "additional": {"webbench_id": 455, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2292", "dataset": "webbench", "query": "Access your sales dashboard and read the latest monthly sales summary, noting the number of items sold, average price, and total earnings.", "start_url": "https://poshmark.com", "metadata": {"original_task_id": "wb-2292", "website": "poshmark.com", "category": "READ", "additional": {"webbench_id": 2292, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2216", "dataset": "webbench", "query": "Check if Din Tai Fung NYC has a table for two this Friday at 7pm ET. If not, please find the next available Friday with seats close to 7pm ET.", "start_url": "https://www.yelp.com", "metadata": {"original_task_id": "wb-2216", "website": "yelp.com", "category": "READ", "additional": {"webbench_id": 2216, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1397", "dataset": "webbench", "query": "Find how many upvotes the top answer for 'What is the best way to learn Python?' has.", "start_url": "https://www.quora.com", "metadata": {"original_task_id": "wb-1397", "website": "quora.com", "category": "READ", "additional": {"webbench_id": 1397, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1384", "dataset": "webbench", "query": "Access the analytics dashboard for press release distribution and extract the media impressions for the press release titled \"New Product Launch.\"", "start_url": "https://www.prnewswire.com", "metadata": {"original_task_id": "wb-1384", "website": "prnewswire.com", "category": "READ", "additional": {"webbench_id": 1384, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-371", "dataset": "webbench", "query": "Sort the products in the Women's Bags category by \"Price Low to High\" and list the top five items along with their prices.", "start_url": "https://www.dickssportinggoods.com", "metadata": {"original_task_id": "wb-371", "website": "dickssportinggoods.com", "category": "READ", "additional": {"webbench_id": 371, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2041", "dataset": "webbench", "query": "Filter Viator's search results for \"food tours\" in New York City and list the first five experiences that are under $100.", "start_url": "https://www.viator.com", "metadata": {"original_task_id": "wb-2041", "website": "viator.com", "category": "READ", "additional": {"webbench_id": 2041, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1749", "dataset": "webbench", "query": "Open the “Classic Meatloaf” recipe page and extract its nutritional analysis information, including the calorie count per serving.", "start_url": "https://www.tasteofhome.com", "metadata": {"original_task_id": "wb-1749", "website": "tasteofhome.com", "category": "READ", "additional": {"webbench_id": 1749, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1305", "dataset": "webbench", "query": "Search for updates on upcoming trainings for apprenticeship programs on Oregon.gov and list the training topics along with their registration deadlines.", "start_url": "https://www.oregon.gov/Pages/index.aspx", "metadata": {"original_task_id": "wb-1305", "website": "oregon.gov", "category": "READ", "additional": {"webbench_id": 1305, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2229", "dataset": "webbench", "query": "Retrieve the Zestimate® and full listing details (photos, description, square footage) for the property located at \"12345 Main St, San Francisco, CA.\"", "start_url": "https://www.zillow.com", "metadata": {"original_task_id": "wb-2229", "website": "zillow.com", "category": "READ", "additional": {"webbench_id": 2229, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-226", "dataset": "webbench", "query": "Use the search function to find archived CBS News articles about \"Climate Change\" from 2023 or earlier and summarize the key findings of one article.", "start_url": "https://www.cbsnews.com", "metadata": {"original_task_id": "wb-226", "website": "cbsnews.com", "category": "READ", "additional": {"webbench_id": 226, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2659", "dataset": "webbench", "query": "Locate the section under \"Customer Service\" called \"Return and refund policy\" and extract the main steps needed to initiate a product return—provide a summary of at least three key steps.", "start_url": "https://www.temu.com", "metadata": {"original_task_id": "wb-2659", "website": "temu.com", "category": "READ", "additional": {"webbench_id": 2659, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-227", "dataset": "webbench", "query": "Identify an article that includes a real-time Twitter feed from a CBS correspondent and transcribe two recent tweets displayed.", "start_url": "https://www.cbsnews.com", "metadata": {"original_task_id": "wb-227", "website": "cbsnews.com", "category": "READ", "additional": {"webbench_id": 227, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1562", "dataset": "webbench", "query": "Use the comparison tool to select two top basketball players, such as LeBron James and Kevin Durant, and list three performance metrics that are compared.", "start_url": "https://www.sofascore.com", "metadata": {"original_task_id": "wb-1562", "website": "sofascore.com", "category": "READ", "additional": {"webbench_id": 1562, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2347", "dataset": "webbench", "query": "Use the advanced search filters to find academic papers in \"Artificial Intelligence\" with over 100 citations and list the top 3 titles along with their citation counts.", "start_url": "https://www.academia.edu", "metadata": {"original_task_id": "wb-2347", "website": "academia.edu", "category": "READ", "additional": {"webbench_id": 2347, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1726", "dataset": "webbench", "query": "Visit the Community Discussions page and identify the top three trending threads in the \"PC Gaming\" category.", "start_url": "https://store.steampowered.com", "metadata": {"original_task_id": "wb-1726", "website": "store.steampowered.com", "category": "READ", "additional": {"webbench_id": 1726, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2200", "dataset": "webbench", "query": "Change the Yahoo homepage background color to green", "start_url": "https://www.yahoo.com", "metadata": {"original_task_id": "wb-2200", "website": "yahoo.com", "category": "READ", "additional": {"webbench_id": 2200, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1430", "dataset": "webbench", "query": "Go into settings, click on preferences, and turn on filters so the account cannot see mature content", "start_url": "https://www.reddit.com/?rdt=61899", "metadata": {"original_task_id": "wb-1430", "website": "reddit.com", "category": "READ", "additional": {"webbench_id": 1430, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-76", "dataset": "webbench", "query": "Use the search functionality to find trails in California with lengths between 5 and 10 miles, then list the top five trails sorted by user ratings.", "start_url": "https://www.alltrails.com", "metadata": {"original_task_id": "wb-76", "website": "alltrails.com", "category": "READ", "additional": {"webbench_id": 76, "difficulty": "hard", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1396", "dataset": "webbench", "query": "Ask a new question \"How will the rise of browser use agents impact the internet?\"", "start_url": "https://www.quora.com", "metadata": {"original_task_id": "wb-1396", "website": "quora.com", "category": "READ", "additional": {"webbench_id": 1396, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1100", "dataset": "webbench", "query": "Look up the promotional offers page and report the description and validity dates of the current weekend discount.", "start_url": "https://www.marriott.com/default.mi", "metadata": {"original_task_id": "wb-1100", "website": "marriott.com", "category": "READ", "additional": {"webbench_id": 1100, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-1349", "dataset": "webbench", "query": "Search for the Pinterest help article on linking Instagram to your Pinterest profile and list the first three instructions provided.", "start_url": "https://www.pinterest.com", "metadata": {"original_task_id": "wb-1349", "website": "pinterest.com", "category": "READ", "additional": {"webbench_id": 1349, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2198", "dataset": "webbench", "query": "Archive the latest email from halluminate@yahoo.com", "start_url": "https://www.yahoo.com", "metadata": {"original_task_id": "wb-2198", "website": "yahoo.com", "category": "READ", "additional": {"webbench_id": 2198, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2475", "dataset": "webbench", "query": "Play a track that is currently charting on Deezer, then view the lyrics display to note if the real-time translation option is available.", "start_url": "https://www.deezer.com/us", "metadata": {"original_task_id": "wb-2475", "website": "deezer.com", "category": "READ", "additional": {"webbench_id": 2475, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-974", "dataset": "webbench", "query": "Use the in-store navigation tool by selecting a nearby Kroger store, then note the aisle locations for the dairy and bakery sections.", "start_url": "https://www.kroger.com", "metadata": {"original_task_id": "wb-974", "website": "kroger.com", "category": "READ", "additional": {"webbench_id": 974, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-576", "dataset": "webbench", "query": "Browse the archive for articles published in the 1980s and list the publication years of the first 5 articles displayed.", "start_url": "https://fortune.com", "metadata": {"original_task_id": "wb-576", "website": "fortune.com", "category": "READ", "additional": {"webbench_id": 576, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
{"query_id": "wb-2648", "dataset": "webbench", "query": "Search for photos of “modern office interiors” and filter the results by vertical orientation; then list the titles of the top 5 images.", "start_url": "https://www.shutterstock.com", "metadata": {"original_task_id": "wb-2648", "website": "shutterstock.com", "category": "READ", "additional": {"webbench_id": 2648, "difficulty": "easy", "pass_count_4": 0, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}