mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
- Add hover_at, type_at, drag_at coordinate tools to server - Add hoverAt, typeAt, dragAt methods to Browser class - Export server internals (browser, tool-loop, registry) for eval imports - Copy eval app from enterprise repo with agents, graders, runner, dashboard - Nest eval-targets inside apps/eval - Adapt sessionExecutionDir → workingDir for current server API - Add biome ignore for dashboard HTML to prevent lint breaking onclick handlers
121 lines
60 KiB
JSON
Vendored
121 lines
60 KiB
JSON
Vendored
{"query_id": "wb-0", "dataset": "webbench", "query": "On the product details page for the \"Black & Decker Power Tool Combo Kit,\" list its specifications including dimensions, voltage, and warranty information.", "start_url": "https://www.acehardware.com", "metadata": {"original_task_id": "wb-0", "website": "acehardware.com", "category": "READ", "additional": {"webbench_id": 0, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-10", "dataset": "webbench", "query": "Use the flight search tool to find options from Sydney to Auckland on a specific travel date, then summarize in text the pricing differences between economy and business classes.", "start_url": "https://www.agoda.com", "metadata": {"original_task_id": "wb-10", "website": "agoda.com", "category": "READ", "additional": {"webbench_id": 10, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-22", "dataset": "webbench", "query": "Browse the “Play” section in Paris and extract the titles and brief descriptions of the top five properties.", "start_url": "https://www.airbnb.com", "metadata": {"original_task_id": "wb-22", "website": "airbnb.com", "category": "READ", "additional": {"webbench_id": 22, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-35", "dataset": "webbench", "query": "Search for \"RFID tags\", filter by a minimum order quantity of less than 1000, and list the first 3 supplier names.", "start_url": "https://www.alibaba.com", "metadata": {"original_task_id": "wb-35", "website": "alibaba.com", "category": "READ", "additional": {"webbench_id": 35, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-43", "dataset": "webbench", "query": "How do I cancel an order that has already shipped?", "start_url": "https://aliexpress.com", "metadata": {"original_task_id": "wb-43", "website": "aliexpress.com", "category": "READ", "additional": {"webbench_id": 43, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-159", "dataset": "webbench", "query": "Locate the guide titled \"How to Choose the Best 4K TV: The Essential Guide\" and summarize the top 3 tips provided.", "start_url": "https://www.bestbuy.com", "metadata": {"original_task_id": "wb-159", "website": "bestbuy.com", "category": "READ", "additional": {"webbench_id": 159, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-168", "dataset": "webbench", "query": "Review the \"Design Guidelines\" page and list the recommended image resolution (aspect ratio) for header backgrounds.", "start_url": "https://www.billboard.com", "metadata": {"original_task_id": "wb-168", "website": "billboard.com", "category": "READ", "additional": {"webbench_id": 168, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-172", "dataset": "webbench", "query": "Search for 8 review score or above hotels in Manhattan, New York for next weekend with free cancellation, free Wi-Fi, and breakfast included. Compare at least four options and pick the one closest to the Empire State Building.", "start_url": "https://www.booking.com", "metadata": {"original_task_id": "wb-172", "website": "booking.com", "category": "READ", "additional": {"webbench_id": 172, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-207", "dataset": "webbench", "query": "Search the site for a feature article on traffic safety trends in 2021 and summarize the three main trends outlined.", "start_url": "https://www.caranddriver.com", "metadata": {"original_task_id": "wb-207", "website": "caranddriver.com", "category": "READ", "additional": {"webbench_id": 207, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-208", "dataset": "webbench", "query": "Visit the newsletter subscription page and list the different newsletter options offered by Car and Driver.", "start_url": "https://www.caranddriver.com", "metadata": {"original_task_id": "wb-208", "website": "caranddriver.com", "category": "READ", "additional": {"webbench_id": 208, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-214", "dataset": "webbench", "query": "Use the \"Side-by-side comparisons\" tool to compare a 2018 Ford F-150 and a 2018 Chevrolet Silverado 1500, and list three key differences in their features.", "start_url": "https://www.cars.com", "metadata": {"original_task_id": "wb-214", "website": "cars.com", "category": "READ", "additional": {"webbench_id": 214, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-298", "dataset": "webbench", "query": "Search the \"community\" section for events related to yoga in Atlanta and display the event details along with the organizer's contact.", "start_url": "https://newyork.craigslist.org", "metadata": {"original_task_id": "wb-298", "website": "newyork.craigslist.org", "category": "READ", "additional": {"webbench_id": 298, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-316", "dataset": "webbench", "query": "Navigate to the \"Insights\" section on the homepage and list 2 insights provided, as well as the type of insight.(e.g. product launch, leadership hire, etc.)", "start_url": "https://www.crunchbase.com", "metadata": {"original_task_id": "wb-316", "website": "crunchbase.com", "category": "READ", "additional": {"webbench_id": 316, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-370", "dataset": "webbench", "query": "Navigate to the camping equipment section and extract the details (brand, price, and average user rating) of the top five tents displayed.", "start_url": "https://www.dickssportinggoods.com", "metadata": {"original_task_id": "wb-370", "website": "dickssportinggoods.com", "category": "READ", "additional": {"webbench_id": 370, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-372", "dataset": "webbench", "query": "Use the search bar to look for \"baseball gloves\" and list the first three product results, including their prices and availability.", "start_url": "https://www.dickssportinggoods.com", "metadata": {"original_task_id": "wb-372", "website": "dickssportinggoods.com", "category": "READ", "additional": {"webbench_id": 372, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-373", "dataset": "webbench", "query": "Access the return policy page and summarize the key points, including the return timeframe and conditions required.", "start_url": "https://www.dickssportinggoods.com", "metadata": {"original_task_id": "wb-373", "website": "dickssportinggoods.com", "category": "READ", "additional": {"webbench_id": 373, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-374", "dataset": "webbench", "query": "Open the store locator and search for Dick’s Sporting Goods stores near the zipcode 33101 then note the store hours and any special in-store services offered.", "start_url": "https://www.dickssportinggoods.com", "metadata": {"original_task_id": "wb-374", "website": "dickssportinggoods.com", "category": "READ", "additional": {"webbench_id": 374, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-393", "dataset": "webbench", "query": "Navigate to the EA Play section and extract the details of the free trial offer—including its duration and key benefits—then output this information in text.", "start_url": "https://www.ea.com", "metadata": {"original_task_id": "wb-393", "website": "ea.com", "category": "READ", "additional": {"webbench_id": 393, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-402", "dataset": "webbench", "query": "Search for \"used laptops\" within the price range of $300-$500. Filter by Buy now options and find an option with 8GB Ram and 500GB memory. Add it to cart.", "start_url": "https://www.ebay.com", "metadata": {"original_task_id": "wb-402", "website": "ebay.com", "category": "READ", "additional": {"webbench_id": 402, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-403", "dataset": "webbench", "query": "Find three different sellers offering the same \"Blue Tooth Speaker\" item. Compare the total cost (including shipping and taxes), seller ratings, return policies, and estimated delivery times. Create a table to compare the various speakers and recommend the best purchase option if I'm purchasing from New York City, NY.", "start_url": "https://www.ebay.com", "metadata": {"original_task_id": "wb-403", "website": "ebay.com", "category": "READ", "additional": {"webbench_id": 403, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-420", "dataset": "webbench", "query": "What does the 'Climate Change' article say is driving the current rise in global temperatures?", "start_url": "https://en.wikipedia.org/wiki/Main_Page", "metadata": {"original_task_id": "wb-420", "website": "en.wikipedia.org", "category": "READ", "additional": {"webbench_id": 420, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-446", "dataset": "webbench", "query": "Navigate through the archive by decade and note the publication year of one highlighted article from each decade (e.g., 1980s, 1990s, 2000s, etc.).", "start_url": "https://www.esquire.com", "metadata": {"original_task_id": "wb-446", "website": "esquire.com", "category": "READ", "additional": {"webbench_id": 446, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-465", "dataset": "webbench", "query": "Create a travel plan on Expedia that combines a flight, hotel, and rental car reservation for a business trip from Los Angeles to San Francisco.", "start_url": "https://www.expedia.com", "metadata": {"original_task_id": "wb-465", "website": "expedia.com", "category": "READ", "additional": {"webbench_id": 465, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-474", "dataset": "webbench", "query": "Create a multi-stop flight itinerary on Expedia with stops in Atlanta, Nashville, and New Orleans.", "start_url": "https://www.expedia.com", "metadata": {"original_task_id": "wb-474", "website": "expedia.com", "category": "READ", "additional": {"webbench_id": 474, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-479", "dataset": "webbench", "query": "Search for nonstop flights from Chicago to London on a chosen date and list the flight numbers together with their departure times.", "start_url": "https://www.expedia.com/", "metadata": {"original_task_id": "wb-479", "website": "expedia.com", "category": "READ", "additional": {"webbench_id": 479, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-480", "dataset": "webbench", "query": "Look up bundled vacation packages from New York to London that include flight, hotel, and car rental, then provide details of the top package deal.", "start_url": "https://www.expedia.com/", "metadata": {"original_task_id": "wb-480", "website": "expedia.com", "category": "READ", "additional": {"webbench_id": 480, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-591", "dataset": "webbench", "query": "Navigate to the Reimagine tool page and note the supported input file formats along with any usage requirements mentioned.", "start_url": "https://www.freepik.com", "metadata": {"original_task_id": "wb-591", "website": "freepik.com", "category": "READ", "additional": {"webbench_id": 591, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-608", "dataset": "webbench", "query": "Search for latest PS5 review on Game Rant and extract the publication date along with a brief summary of the main criticisms.", "start_url": "https://gamerant.com", "metadata": {"original_task_id": "wb-608", "website": "gamerant.com", "category": "READ", "additional": {"webbench_id": 608, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-669", "dataset": "webbench", "query": "Inspect the commit history for the repository \"nodejs/node\" and determine which commit shows the largest number of file changes.", "start_url": "https://github.com", "metadata": {"original_task_id": "wb-669", "website": "github.com", "category": "READ", "additional": {"webbench_id": 669, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-690", "dataset": "webbench", "query": "Search for \"Amazon\" on Glassdoor and extract the top three posts this month on the Amazon Employees Bowl.", "start_url": "https://www.glassdoor.com/index.htm", "metadata": {"original_task_id": "wb-690", "website": "glassdoor.com", "category": "READ", "additional": {"webbench_id": 690, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-691", "dataset": "webbench", "query": "Find the company ratings for \"Netflix\" and list the overall rating as well as the ratings for culture and work-life balance.", "start_url": "https://www.glassdoor.com/index.htm", "metadata": {"original_task_id": "wb-691", "website": "glassdoor.com", "category": "READ", "additional": {"webbench_id": 691, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-763", "dataset": "webbench", "query": "Use the website’s search function to find a video on \"Ancient Civilizations\" and record the title along with a brief overview.", "start_url": "https://www.history.com", "metadata": {"original_task_id": "wb-763", "website": "history.com", "category": "READ", "additional": {"webbench_id": 763, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-805", "dataset": "webbench", "query": "Find the article titled \"Innovative Home Office Designs\" and list the design tips mentioned in it.", "start_url": "https://www.housebeautiful.com", "metadata": {"original_task_id": "wb-805", "website": "housebeautiful.com", "category": "READ", "additional": {"webbench_id": 805, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-821", "dataset": "webbench", "query": "Using the website's live radio feature, search for radio stations in Los Angeles and list the names of the top 5 featured stations.", "start_url": "https://www.iheart.com", "metadata": {"original_task_id": "wb-821", "website": "iheart.com", "category": "READ", "additional": {"webbench_id": 821, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-822", "dataset": "webbench", "query": "Browse the iHeart website and list specific details (station names, genre) for the top stations Tyler, TX.", "start_url": "https://www.iheart.com", "metadata": {"original_task_id": "wb-822", "website": "iheart.com", "category": "READ", "additional": {"webbench_id": 822, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-914", "dataset": "webbench", "query": "Read the article on effective revision techniques provided on jagranjosh.com and summarize the top five methods mentioned.", "start_url": "https://www.jagranjosh.com", "metadata": {"original_task_id": "wb-914", "website": "jagranjosh.com", "category": "READ", "additional": {"webbench_id": 914, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-919", "dataset": "webbench", "query": "Search for an article on \"Innovation in Urban Design\" and list the keywords associated with it from the metadata.", "start_url": "https://www.jstor.org", "metadata": {"original_task_id": "wb-919", "website": "jstor.org", "category": "READ", "additional": {"webbench_id": 919, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-920", "dataset": "webbench", "query": "Using advance filter, Navigate to the Books section on JSTOR and list the first 3 book titles along with their primary authors.", "start_url": "https://www.jstor.org", "metadata": {"original_task_id": "wb-920", "website": "jstor.org", "category": "READ", "additional": {"webbench_id": 920, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-933", "dataset": "webbench", "query": "Find the page for the Executive Office for Immigration Review and list two recent news updates", "start_url": "https://www.justice.gov", "metadata": {"original_task_id": "wb-933", "website": "justice.gov", "category": "READ", "additional": {"webbench_id": 933, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-935", "dataset": "webbench", "query": "Access the Justice Department’s strategic plan document and highlight its major priorities.", "start_url": "https://www.justice.gov", "metadata": {"original_task_id": "wb-935", "website": "justice.gov", "category": "READ", "additional": {"webbench_id": 935, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-947", "dataset": "webbench", "query": "Navigate to the \"About\" page and extract three key milestones from Khan Academy’s history timeline.", "start_url": "https://www.khanacademy.org", "metadata": {"original_task_id": "wb-947", "website": "khanacademy.org", "category": "READ", "additional": {"webbench_id": 947, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-953", "dataset": "webbench", "query": "Navigate to the \"Parents\" section and locate an article about managing screen time for children; then, provide a brief summary of the article title and its main tips.", "start_url": "https://kidshealth.org", "metadata": {"original_task_id": "wb-953", "website": "kidshealth.org", "category": "READ", "additional": {"webbench_id": 953, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-970", "dataset": "webbench", "query": "Search for \"men's sneakers\" and on the first product’s details page, list the available size options along with any pricing tiers.", "start_url": "https://www.kohls.com", "metadata": {"original_task_id": "wb-970", "website": "kohls.com", "category": "READ", "additional": {"webbench_id": 970, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1081", "dataset": "webbench", "query": "Search for \"Nike Air Max shoes\" and check the availability in your local store with store location set to New York City, New York. If unavailable, check in a store within a 20-mile radius and note the estimated delivery time if ordered online.", "start_url": "https://www.macys.com", "metadata": {"original_task_id": "wb-1081", "website": "macys.com", "category": "READ", "additional": {"webbench_id": 1081, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1089", "dataset": "webbench", "query": "Use the search bar to locate content on \"cloud computing,\" then extract the conclusions section and list its key points.", "start_url": "https://www.makeuseof.com", "metadata": {"original_task_id": "wb-1089", "website": "makeuseof.com", "category": "READ", "additional": {"webbench_id": 1089, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1101", "dataset": "webbench", "query": "Search for Marriott hotels in New York, NY, and list the names, addresses, and one key amenity for each of the first five properties.", "start_url": "https://www.marriott.com/default.mi", "metadata": {"original_task_id": "wb-1101", "website": "marriott.com", "category": "READ", "additional": {"webbench_id": 1101, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1119", "dataset": "webbench", "query": "Find an article authored by a physician on “diet and nutrition” and list two recommended dietary practices mentioned.", "start_url": "https://www.medicinenet.com", "metadata": {"original_task_id": "wb-1119", "website": "medicinenet.com", "category": "READ", "additional": {"webbench_id": 1119, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1132", "dataset": "webbench", "query": "Search for articles about HIIT workouts published within the last month and list the first five titles along with their publication dates.", "start_url": "https://www.menshealth.com", "metadata": {"original_task_id": "wb-1132", "website": "menshealth.com", "category": "READ", "additional": {"webbench_id": 1132, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1133", "dataset": "webbench", "query": "Search Mercari for \"vintage denim jacket\" and list the first 5 results, including each item’s price and condition.", "start_url": "https://www.mercari.com", "metadata": {"original_task_id": "wb-1133", "website": "mercari.com", "category": "READ", "additional": {"webbench_id": 1133, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1212", "dataset": "webbench", "query": "Visit the Newegg Premier membership page, read through the listed benefits, and summarize at least five key membership perks.", "start_url": "https://www.newegg.com", "metadata": {"original_task_id": "wb-1212", "website": "newegg.com", "category": "READ", "additional": {"webbench_id": 1212, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1224", "dataset": "webbench", "query": "Locate the \"Food & Dining\" guide and extract the names of three recommended local restaurants highlighted in the guide.", "start_url": "https://www.nj.com", "metadata": {"original_task_id": "wb-1224", "website": "nj.com", "category": "READ", "additional": {"webbench_id": 1224, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1233", "dataset": "webbench", "query": "Use the search bar to locate BOSS Menswear Suit and record the product name, price, and available sizes displayed.", "start_url": "https://www.nordstrom.com", "metadata": {"original_task_id": "wb-1233", "website": "nordstrom.com", "category": "READ", "additional": {"webbench_id": 1233, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1234", "dataset": "webbench", "query": "Go to the Men's Shoes section, filter the results with size 10, and list the product names along with any discount percentages for the top five items.", "start_url": "https://www.nordstrom.com", "metadata": {"original_task_id": "wb-1234", "website": "nordstrom.com", "category": "READ", "additional": {"webbench_id": 1234, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1322", "dataset": "webbench", "query": "Navigate to the homepage video content section, select a recent gameplay review video, and record its title, duration, and publication date.", "start_url": "https://www.pcgamer.com", "metadata": {"original_task_id": "wb-1322", "website": "pcgamer.com", "category": "READ", "additional": {"webbench_id": 1322, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1346", "dataset": "webbench", "query": "Use the search filters to find videos about indoor plant care that are between 1-3 minutes long, and list return the video with more likes", "start_url": "https://www.pinterest.com", "metadata": {"original_task_id": "wb-1346", "website": "pinterest.com", "category": "READ", "additional": {"webbench_id": 1346, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1356", "dataset": "webbench", "query": "Go to the \"Videos\" section and output the file sizes from the download section and resolutions for the top 3 trending videos.", "start_url": "https://pixabay.com", "metadata": {"original_task_id": "wb-1356", "website": "pixabay.com", "category": "READ", "additional": {"webbench_id": 1356, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1367", "dataset": "webbench", "query": "Navigate to the PlayStation Store catalog and list the names and genres of the top five free games available for PS Plus subscribers.", "start_url": "https://www.playstation.com/en-us", "metadata": {"original_task_id": "wb-1367", "website": "playstation.com", "category": "READ", "additional": {"webbench_id": 1367, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1368", "dataset": "webbench", "query": "Visit the PlayStation Plus subscription banner and list three exclusive benefits mentioned for subscribers.", "start_url": "https://www.playstation.com/en-us", "metadata": {"original_task_id": "wb-1368", "website": "playstation.com", "category": "READ", "additional": {"webbench_id": 1368, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1370", "dataset": "webbench", "query": "Go to the digital wallet information area and list the payment options available for topping up.", "start_url": "https://www.playstation.com/en-us", "metadata": {"original_task_id": "wb-1370", "website": "playstation.com", "category": "READ", "additional": {"webbench_id": 1370, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1371", "dataset": "webbench", "query": "Locate the product details for \"Horizon Forbidden West\" and extract the release date, developer, and price information.", "start_url": "https://www.playstation.com/en-us", "metadata": {"original_task_id": "wb-1371", "website": "playstation.com", "category": "READ", "additional": {"webbench_id": 1371, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1389", "dataset": "webbench", "query": "Locate a section providing COVID-19 guidelines specifically for students and list two safety practices mentioned.", "start_url": "https://www.purdue.edu", "metadata": {"original_task_id": "wb-1389", "website": "purdue.edu", "category": "READ", "additional": {"webbench_id": 1389, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1398", "dataset": "webbench", "query": "Go to spaces and navigate to one of the recommended spaces to view. Check the profile of one of the top contributers in this space and return how many followers they have.", "start_url": "https://www.quora.com", "metadata": {"original_task_id": "wb-1398", "website": "quora.com", "category": "READ", "additional": {"webbench_id": 1398, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1432", "dataset": "webbench", "query": "Report the first post on the videogames subreddit for inappropriate content", "start_url": "https://www.reddit.com/?rdt=40707", "metadata": {"original_task_id": "wb-1432", "website": "reddit.com", "category": "READ", "additional": {"webbench_id": 1432, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1470", "dataset": "webbench", "query": "Find Arden, in New York. List the available dining times for tonight.", "start_url": "https://resy.com", "metadata": {"original_task_id": "wb-1470", "website": "resy.com", "category": "READ", "additional": {"webbench_id": 1470, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1496", "dataset": "webbench", "query": "Find how many articles about \"renewable energy\" were published by Harvard from the last 5 years, filter by open access.", "start_url": "https://www.sciencedirect.com", "metadata": {"original_task_id": "wb-1496", "website": "sciencedirect.com", "category": "READ", "additional": {"webbench_id": 1496, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1497", "dataset": "webbench", "query": "Locate a paper about \"deep learning\" and use the \"References\" section to find the three most-cited papers it references. Record their titles and years.", "start_url": "https://www.sciencedirect.com", "metadata": {"original_task_id": "wb-1497", "website": "sciencedirect.com", "category": "READ", "additional": {"webbench_id": 1497, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1504", "dataset": "webbench", "query": "Find the article '10 TV Shows That Went Downhill After A Major Cliffhanger' and summarize why 'The Flash' is included.", "start_url": "https://screenrant.com", "metadata": {"original_task_id": "wb-1504", "website": "screenrant.com", "category": "READ", "additional": {"webbench_id": 1504, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1509", "dataset": "webbench", "query": "Navigate to the \"Lifestyle\" section and extract the publication date and author name from the featured article \"The Ultimate Self-Care Routine.\"", "start_url": "https://www.self.com", "metadata": {"original_task_id": "wb-1509", "website": "self.com", "category": "READ", "additional": {"webbench_id": 1509, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1585", "dataset": "webbench", "query": "Access the live scores page, click on a current NBA match, and note down the current score and quarter information.", "start_url": "https://www.sportskeeda.com", "metadata": {"original_task_id": "wb-1585", "website": "sportskeeda.com", "category": "READ", "additional": {"webbench_id": 1585, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1609", "dataset": "webbench", "query": "Browse the Spotify home page and list the names of the top 5 trending playlists currently featured.", "start_url": "https://open.spotify.com", "metadata": {"original_task_id": "wb-1609", "website": "open.spotify.com", "category": "READ", "additional": {"webbench_id": 1609, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1610", "dataset": "webbench", "query": "Go to Spotify’s \"Browse\" section and list the names of the top 5 mood-based playlists featured there.", "start_url": "https://open.spotify.com", "metadata": {"original_task_id": "wb-1610", "website": "open.spotify.com", "category": "READ", "additional": {"webbench_id": 1610, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1626", "dataset": "webbench", "query": "Locate the article \"Deep Learning in Medical Imaging\" and list all types of supplementary materials (e.g., datasets, videos) available for it.", "start_url": "https://www.springer.com/us", "metadata": {"original_task_id": "wb-1626", "website": "springer.com", "category": "READ", "additional": {"webbench_id": 1626, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1724", "dataset": "webbench", "query": "Search for \"Doom Eternal\" on the Steam store and list its current price, user rating percentage, and release date.", "start_url": "https://store.steampowered.com", "metadata": {"original_task_id": "wb-1724", "website": "store.steampowered.com", "category": "READ", "additional": {"webbench_id": 1724, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1737", "dataset": "webbench", "query": "Visit the StudyList section available StudyList created by a user with the username \"AlexS\" that focuses on math resources, then report its title and content count.", "start_url": "https://www.studocu.com", "metadata": {"original_task_id": "wb-1737", "website": "studocu.com", "category": "READ", "additional": {"webbench_id": 1737, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1739", "dataset": "webbench", "query": "Navigate to the Premium subscription page and summarize the details of the quarterly premium pricing, including any additional benefits.", "start_url": "https://www.studocu.com", "metadata": {"original_task_id": "wb-1739", "website": "studocu.com", "category": "READ", "additional": {"webbench_id": 1739, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1744", "dataset": "webbench", "query": "Find in-store pickup instructions and see if you are required to bring an ID to get the pickup item.", "start_url": "https://www.target.com", "metadata": {"original_task_id": "wb-1744", "website": "target.com", "category": "READ", "additional": {"webbench_id": 1744, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1776", "dataset": "webbench", "query": "Use the website’s search function to look for articles on “climate change” published in the last month, then list the titles of the top five results.", "start_url": "https://www.the-independent.com", "metadata": {"original_task_id": "wb-1776", "website": "the-independent.com", "category": "READ", "additional": {"webbench_id": 1776, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1820", "dataset": "webbench", "query": "Find an opinion column discussing housing policy and list its main arguments as presented in the article.", "start_url": "https://www.thesun.co.uk", "metadata": {"original_task_id": "wb-1820", "website": "thesun.co.uk", "category": "READ", "additional": {"webbench_id": 1820, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1832", "dataset": "webbench", "query": "Visit the New York Giants' page, and find an available game to find tickets for.", "start_url": "https://www.ticketmaster.com", "metadata": {"original_task_id": "wb-1832", "website": "ticketmaster.com", "category": "READ", "additional": {"webbench_id": 1832, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1837", "dataset": "webbench", "query": "Visit Mr. Beast’s profile and record the engagement metrics (likes and comments) on their three most-viewed videos.", "start_url": "https://www.tiktok.com/explore", "metadata": {"original_task_id": "wb-1837", "website": "tiktok.com", "category": "READ", "additional": {"webbench_id": 1837, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1878", "dataset": "webbench", "query": "Search the website for the latest travel trends for 2023 and extract four trend highlights mentioned in a recent article.", "start_url": "https://www.travelandleisure.com", "metadata": {"original_task_id": "wb-1878", "website": "travelandleisure.com", "category": "READ", "additional": {"webbench_id": 1878, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1889", "dataset": "webbench", "query": "Search for flights from New York (JFK) to London (LHR) departing on December 10 and returning on December 17, then list the flight times and prices for the top three options.", "start_url": "https://us.trip.com/?locale=en-us", "metadata": {"original_task_id": "wb-1889", "website": "us.trip.com", "category": "READ", "additional": {"webbench_id": 1889, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1891", "dataset": "webbench", "query": "Write a new review for the Knickerbocker hotel in New York, give it 5 stars and a raving review about the quality of the room. Fill in all other information in review based on your best assumptions.", "start_url": "https://www.tripadvisor.com", "metadata": {"original_task_id": "wb-1891", "website": "tripadvisor.com", "category": "READ", "additional": {"webbench_id": 1891, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1892", "dataset": "webbench", "query": "Identify the top-rated hotel in Paris, verify if it offers free cancellation, and analyze at least three recent guest reviews to see if they mention staff helpfulness.", "start_url": "https://www.tripadvisor.com", "metadata": {"original_task_id": "wb-1892", "website": "tripadvisor.com", "category": "READ", "additional": {"webbench_id": 1892, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1936", "dataset": "webbench", "query": "Find ukulele chords for \"Riptide\" by Vance Joy and add the chords to a new playlist called \"Easy Ukulele Songs\".", "start_url": "https://www.ultimate-guitar.com", "metadata": {"original_task_id": "wb-1936", "website": "ultimate-guitar.com", "category": "READ", "additional": {"webbench_id": 1936, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2005", "dataset": "webbench", "query": "Search and locate a “Lady Gaga” article, then note three keypoints.", "start_url": "https://www.usmagazine.com", "metadata": {"original_task_id": "wb-2005", "website": "usmagazine.com", "category": "READ", "additional": {"webbench_id": 2005, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2039", "dataset": "webbench", "query": "Find detailed information on the cancellation policy for a group safari tour in Kenya and summarize the key points.", "start_url": "https://www.viator.com", "metadata": {"original_task_id": "wb-2039", "website": "viator.com", "category": "READ", "additional": {"webbench_id": 2039, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2049", "dataset": "webbench", "query": "Apply filters for vacation rentals in Orlando, FL with a private pool available from April 10 to April 15, then extract and display the star ratings of the top three listings.", "start_url": "https://www.vrbo.com", "metadata": {"original_task_id": "wb-2049", "website": "vrbo.com", "category": "READ", "additional": {"webbench_id": 2049, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2062", "dataset": "webbench", "query": "Find the store hours for the Walmart Supercenter near Dallas, TX (zip code 75201) and also check if the pharmacy has different hours.", "start_url": "https://www.walmart.com", "metadata": {"original_task_id": "wb-2062", "website": "walmart.com", "category": "READ", "additional": {"webbench_id": 2062, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2066", "dataset": "webbench", "query": "Check if Walmart has any same-day grocery delivery available for 92 2nd Ave, New York City, 10003", "start_url": "https://www.walmart.com", "metadata": {"original_task_id": "wb-2066", "website": "walmart.com", "category": "READ", "additional": {"webbench_id": 2066, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2179", "dataset": "webbench", "query": "Locate the official guidance on COVID-19 travel regulations on Gov.uk and list the recommended preventative measures.", "start_url": "https://www.gov.uk", "metadata": {"original_task_id": "wb-2179", "website": "gov.uk", "category": "READ", "additional": {"webbench_id": 2179, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2217", "dataset": "webbench", "query": "Rate \"Cafe Gratitude\" in LA with 5 stars and leave a detailed review describing one of the menu items.", "start_url": "https://www.yelp.com", "metadata": {"original_task_id": "wb-2217", "website": "yelp.com", "category": "READ", "additional": {"webbench_id": 2217, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2218", "dataset": "webbench", "query": "Return the phone number of \"Joe's Pizza\" in Manhattan", "start_url": "https://www.yelp.com", "metadata": {"original_task_id": "wb-2218", "website": "yelp.com", "category": "READ", "additional": {"webbench_id": 2218, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2220", "dataset": "webbench", "query": "Search for the word \"quintessential\" and extract its etymology along with one illustrative example sentence.", "start_url": "https://www.yourdictionary.com", "metadata": {"original_task_id": "wb-2220", "website": "yourdictionary.com", "category": "READ", "additional": {"webbench_id": 2220, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2228", "dataset": "webbench", "query": "Identify listings in \"Las Vegas, NV\" that feature a 3D home tour and provide the property address, price, and tour availability details of the top result.", "start_url": "https://www.zillow.com", "metadata": {"original_task_id": "wb-2228", "website": "zillow.com", "category": "READ", "additional": {"webbench_id": 2228, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2231", "dataset": "webbench", "query": "Search job listings for “Software Engineer” in bangalore, WA and list the top 5 job titles along with their corresponding company names.", "start_url": "https://www.ziprecruiter.com", "metadata": {"original_task_id": "wb-2231", "website": "ziprecruiter.com", "category": "READ", "additional": {"webbench_id": 2231, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2266", "dataset": "webbench", "query": "Use the site’s search function to find questions related to “English idioms,” filter by beginner level, and provide the titles of the first 5 results.", "start_url": "https://hinative.com", "metadata": {"original_task_id": "wb-2266", "website": "hinative.com", "category": "READ", "additional": {"webbench_id": 2266, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2267", "dataset": "webbench", "query": "Look up a question on how to pronounce “Rendezvous” and extract the top 3 highest-voted answers.", "start_url": "https://hinative.com", "metadata": {"original_task_id": "wb-2267", "website": "hinative.com", "category": "READ", "additional": {"webbench_id": 2267, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2269", "dataset": "webbench", "query": "Search for a question tagged “Russian beginner” and record the response count along with a brief summary of the first answer.", "start_url": "https://hinative.com", "metadata": {"original_task_id": "wb-2269", "website": "hinative.com", "category": "READ", "additional": {"webbench_id": 2269, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2290", "dataset": "webbench", "query": "Locate the seasonal guide for summer activities on Parade.com and list the top three recommended activities.", "start_url": "https://parade.com", "metadata": {"original_task_id": "wb-2290", "website": "parade.com", "category": "READ", "additional": {"webbench_id": 2290, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2299", "dataset": "webbench", "query": "Navigate to the SoundCloud homepage, search for tracks tagged “lo-fi,” and list the titles and associated artists of the first 5 results.", "start_url": "https://soundcloud.com", "metadata": {"original_task_id": "wb-2299", "website": "soundcloud.com", "category": "READ", "additional": {"webbench_id": 2299, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2301", "dataset": "webbench", "query": "Search for “UK electronic music” and list the titles of the first five tracks that appear.", "start_url": "https://soundcloud.com", "metadata": {"original_task_id": "wb-2301", "website": "soundcloud.com", "category": "READ", "additional": {"webbench_id": 2301, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2303", "dataset": "webbench", "query": "Find the Copyright in the top right corner under the three dots and read the SoundCloud Copyright, and summarize the key points in bullet form.", "start_url": "https://soundcloud.com", "metadata": {"original_task_id": "wb-2303", "website": "soundcloud.com", "category": "READ", "additional": {"webbench_id": 2303, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2327", "dataset": "webbench", "query": "Search for 2-bedroom apartments available for rent in Manhattan under $3,000 per month and list the names, addresses, and prices of 3", "start_url": "https://streeteasy.com", "metadata": {"original_task_id": "wb-2327", "website": "streeteasy.com", "category": "READ", "additional": {"webbench_id": 2327, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2371", "dataset": "webbench", "query": "Use the search bar to find \"chocolate cake\" recipes and display the first 5 recipe titles along with their publication dates.", "start_url": "https://www.bbcgoodfood.com", "metadata": {"original_task_id": "wb-2371", "website": "bbcgoodfood.com", "category": "READ", "additional": {"webbench_id": 2371, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2400", "dataset": "webbench", "query": "Identify a market trends article on Bloomberg and extract the conclusion paragraph that summarizes the market outlook for the next quarter.", "start_url": "https://www.bloomberg.com", "metadata": {"original_task_id": "wb-2400", "website": "bloomberg.com", "category": "READ", "additional": {"webbench_id": 2400, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2437", "dataset": "webbench", "query": "Use the warehouse locator by entering the zip code 90210 and list the address, operating hours, and available ancillary services (e.g., pharmacy or optical) for the nearest Costco store.", "start_url": "https://www.costco.com", "metadata": {"original_task_id": "wb-2437", "website": "costco.com", "category": "READ", "additional": {"webbench_id": 2437, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2492", "dataset": "webbench", "query": "Search for \"sushi\" on DoorDash, filter for restaurants with delivery feed under $3, and list the first five restaurants by customer rating.", "start_url": "https://www.doordash.com", "metadata": {"original_task_id": "wb-2492", "website": "doordash.com", "category": "READ", "additional": {"webbench_id": 2492, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2494", "dataset": "webbench", "query": "Locate a restaurant that offers gluten-free options in zip code 10013 and summarize the customer reviews for its best-selling dish.", "start_url": "https://www.doordash.com", "metadata": {"original_task_id": "wb-2494", "website": "doordash.com", "category": "READ", "additional": {"webbench_id": 2494, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2566", "dataset": "webbench", "query": "Look for articles mentioning “Inflation trends” published in 2023, and list the headlines of the top five results.", "start_url": "https://www.livemint.com", "metadata": {"original_task_id": "wb-2566", "website": "livemint.com", "category": "READ", "additional": {"webbench_id": 2566, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2573", "dataset": "webbench", "query": "Search for a holiday package that bundles flights and hotel stays for a Dubai trip and view the detailed itinerary.", "start_url": "https://www.makemytrip.global/?cc=am&redirectedBy=gl", "metadata": {"original_task_id": "wb-2573", "website": "makemytrip.global", "category": "READ", "additional": {"webbench_id": 2573, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2574", "dataset": "webbench", "query": "Look up travel packages for a family trip to Singapore and list the key highlights mentioned in the package details.", "start_url": "https://www.makemytrip.global/?cc=am&redirectedBy=gl", "metadata": {"original_task_id": "wb-2574", "website": "makemytrip.global", "category": "READ", "additional": {"webbench_id": 2574, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2596", "dataset": "webbench", "query": "Navigate to the Newsweek homepage and locate a breaking news article about U.S. politics; then extract the article’s headline, publication date, and author name (if available).", "start_url": "https://www.newsweek.com", "metadata": {"original_task_id": "wb-2596", "website": "newsweek.com", "category": "READ", "additional": {"webbench_id": 2596, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2597", "dataset": "webbench", "query": "Use the search bar to find articles related to \"space exploration\" and list the titles and summaries of the top five results.", "start_url": "https://www.newsweek.com", "metadata": {"original_task_id": "wb-2597", "website": "newsweek.com", "category": "READ", "additional": {"webbench_id": 2597, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2598", "dataset": "webbench", "query": "Search for an opinion piece by Jim Banks, and list the headline and publication date of his most recent post", "start_url": "https://www.newsweek.com", "metadata": {"original_task_id": "wb-2598", "website": "newsweek.com", "category": "READ", "additional": {"webbench_id": 2598, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2632", "dataset": "webbench", "query": "Navigate to the Qatar Airways homepage and search for flights from Doha to Paris departing in the upcoming week; then list the available fare classes and prices.", "start_url": "https://www.qatarairways.com/en-us/homepage.html", "metadata": {"original_task_id": "wb-2632", "website": "qatarairways.com", "category": "READ", "additional": {"webbench_id": 2632, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2650", "dataset": "webbench", "query": "Search the Shutterstock blog for articles on “picture composition techniques” and list the headlines of the top three posts.", "start_url": "https://www.shutterstock.com", "metadata": {"original_task_id": "wb-2650", "website": "shutterstock.com", "category": "READ", "additional": {"webbench_id": 2650, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2671", "dataset": "webbench", "query": "Find the flight schedule for a trip from Ankara to Dubai, and provide the departure time, arrival time, and estimated flight duration as listed.", "start_url": "https://www.turkishairlines.com", "metadata": {"original_task_id": "wb-2671", "website": "turkishairlines.com", "category": "READ", "additional": {"webbench_id": 2671, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2674", "dataset": "webbench", "query": "Visit the Restaurant Info page for “Chipotle” and list several customer reviews along with their corresponding star ratings.", "start_url": "https://www.ubereats.com", "metadata": {"original_task_id": "wb-2674", "website": "ubereats.com", "category": "READ", "additional": {"webbench_id": 2674, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2717", "dataset": "webbench", "query": "Determine which movie release this weekend had the highest box office budget, then compare it with the movie with the lowest box office budget and return the difference.\nOnly use imdb.com to achieve the task. Don't go to any other site. The task is achievable with just navigation from this site.", "start_url": "https://www.imdb.com", "metadata": {"original_task_id": "wb-2717", "website": "imdb.com", "category": "READ", "additional": {"webbench_id": 2717, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|