mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
- Add hover_at, type_at, drag_at coordinate tools to server - Add hoverAt, typeAt, dragAt methods to Browser class - Export server internals (browser, tool-loop, registry) for eval imports - Copy eval app from enterprise repo with agents, graders, runner, dashboard - Nest eval-targets inside apps/eval - Adapt sessionExecutionDir → workingDir for current server API - Add biome ignore for dashboard HTML to prevent lint breaking onclick handlers
200 lines
98 KiB
JSON
200 lines
98 KiB
JSON
{"query_id": "wb-21", "dataset": "webbench", "query": "Filter properties in Chicago by those offering free cancellation for a one-week stay in August, then list the first five options with their prices.", "start_url": "https://www.airbnb.com", "metadata": {"original_task_id": "wb-21", "website": "airbnb.com", "category": "READ", "additional": {"webbench_id": 21, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-24", "dataset": "webbench", "query": "Search for beachfront properties in Miami with nightly rates under $300, and list the top three property names along with their prices.", "start_url": "https://www.airbnb.com", "metadata": {"original_task_id": "wb-24", "website": "airbnb.com", "category": "READ", "additional": {"webbench_id": 24, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-33", "dataset": "webbench", "query": "Search for \"smartphones\" on Alibaba and note how many listings are available.", "start_url": "https://www.alibaba.com", "metadata": {"original_task_id": "wb-33", "website": "alibaba.com", "category": "READ", "additional": {"webbench_id": 33, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-45", "dataset": "webbench", "query": "Find a smart watch with more than 20 reviews. Filter the reviews to the most recent and summarize the most common positive and negative points mentioned by customers.", "start_url": "https://aliexpress.com", "metadata": {"original_task_id": "wb-45", "website": "aliexpress.com", "category": "READ", "additional": {"webbench_id": 45, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-66", "dataset": "webbench", "query": "Search Comfort Food recipes and list the names of the first 3 recipes with a 30 minutes cook time.", "start_url": "https://www.allrecipes.com", "metadata": {"original_task_id": "wb-66", "website": "allrecipes.com", "category": "READ", "additional": {"webbench_id": 66, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-98", "dataset": "webbench", "query": "Search for apartments in \"New York\" and list three four star rated apartments", "start_url": "https://www.apartments.com", "metadata": {"original_task_id": "wb-98", "website": "apartments.com", "category": "READ", "additional": {"webbench_id": 98, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-99", "dataset": "webbench", "query": "Check the rent estimate calculator results for a property located at 123 Main St, Los Angeles, CA.", "start_url": "https://www.apartments.com", "metadata": {"original_task_id": "wb-99", "website": "apartments.com", "category": "READ", "additional": {"webbench_id": 99, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-102", "dataset": "webbench", "query": "Search for the app \"Spotify\" on APKPure and list the latest version number along with its release date as shown on the version history section.", "start_url": "https://apkpure.com", "metadata": {"original_task_id": "wb-102", "website": "apkpure.com", "category": "READ", "additional": {"webbench_id": 102, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-113", "dataset": "webbench", "query": "Use advanced search for ”Space images\" on archive.org and output the capture dates and titles of the first 10 images listed.", "start_url": "https://archive.org", "metadata": {"original_task_id": "wb-113", "website": "archive.org", "category": "READ", "additional": {"webbench_id": 113, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-121", "dataset": "webbench", "query": "Navigate to the trending releases section and list the first 5 track titles along with their corresponding artist names.", "start_url": "https://bandcamp.com", "metadata": {"original_task_id": "wb-121", "website": "bandcamp.com", "category": "READ", "additional": {"webbench_id": 121, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-122", "dataset": "webbench", "query": "Locate an artist’s page (e.g., search for \"Tame Impala\") and list the available album formats (MP3, FLAC, etc.) offered on that page.", "start_url": "https://bandcamp.com", "metadata": {"original_task_id": "wb-122", "website": "bandcamp.com", "category": "READ", "additional": {"webbench_id": 122, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-129", "dataset": "webbench", "query": "Use the store locator by entering ZIP code 90210, then display the address, store hours, and contact details of the nearest location.", "start_url": "http://www.barnesandnoble.com", "metadata": {"original_task_id": "wb-129", "website": "barnesandnoble.com", "category": "READ", "additional": {"webbench_id": 129, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-130", "dataset": "webbench", "query": "Browse the \"Best Sellers\" section for physical books and list the top 5 titles featured on the homepage.", "start_url": "http://www.barnesandnoble.com", "metadata": {"original_task_id": "wb-130", "website": "barnesandnoble.com", "category": "READ", "additional": {"webbench_id": 130, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-135", "dataset": "webbench", "query": "Search the Barron's archive for articles containing \"value investing\" posted in the last 30 days, and list each title along with its publication date.", "start_url": "https://www.barrons.com", "metadata": {"original_task_id": "wb-135", "website": "barrons.com", "category": "READ", "additional": {"webbench_id": 135, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-136", "dataset": "webbench", "query": "Open the latest edition of Barron's digital magazine and identify the featured article’s headline along with the author's name.", "start_url": "https://www.barrons.com", "metadata": {"original_task_id": "wb-136", "website": "barrons.com", "category": "READ", "additional": {"webbench_id": 136, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-175", "dataset": "webbench", "query": "Investigate Mount Everest’s elevation in different sources on the site, and check compare it with the height of K2 and see which is higher.", "start_url": "https://www.britannica.com", "metadata": {"original_task_id": "wb-175", "website": "britannica.com", "category": "READ", "additional": {"webbench_id": 175, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-212", "dataset": "webbench", "query": "Find and display the detailed specifications—including fuel type and VIN—for the 2020 Toyota Camry offered by a local dealer in Dallas, TX.", "start_url": "https://www.cars.com", "metadata": {"original_task_id": "wb-212", "website": "cars.com", "category": "READ", "additional": {"webbench_id": 212, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-219", "dataset": "webbench", "query": "Locate a section or article on \"upcoming releases\" and list the titles and release dates of the movies or comics mentioned.", "start_url": "https://www.cbr.com", "metadata": {"original_task_id": "wb-219", "website": "cbr.com", "category": "READ", "additional": {"webbench_id": 219, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-225", "dataset": "webbench", "query": "Locate the featured investigative report on the homepage and write a brief summary outlining its main argument.", "start_url": "https://www.cbsnews.com", "metadata": {"original_task_id": "wb-225", "website": "cbsnews.com", "category": "READ", "additional": {"webbench_id": 225, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-229", "dataset": "webbench", "query": "Visit the CBS Sports HQ streaming page, identify the current featured sports segment, and provide a brief summary of its content.", "start_url": "https://www.cbssports.com", "metadata": {"original_task_id": "wb-229", "website": "cbssports.com", "category": "READ", "additional": {"webbench_id": 229, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-259", "dataset": "webbench", "query": "Examine the \"TV\" category and extract the title and summary of the first TV review article displayed.", "start_url": "https://collider.com", "metadata": {"original_task_id": "wb-259", "website": "collider.com", "category": "READ", "additional": {"webbench_id": 259, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-294", "dataset": "webbench", "query": "Search for \"free sofa\" listings in the \"for sale\" section in Boston and list the first 5 ad titles.", "start_url": "https://newyork.craigslist.org", "metadata": {"original_task_id": "wb-294", "website": "newyork.craigslist.org", "category": "READ", "additional": {"webbench_id": 294, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-295", "dataset": "webbench", "query": "Browse the \"jobs\" category in Chicago for \"restaurant server\" positions and extract the employment type from the top result.", "start_url": "https://newyork.craigslist.org", "metadata": {"original_task_id": "wb-295", "website": "newyork.craigslist.org", "category": "READ", "additional": {"webbench_id": 295, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-296", "dataset": "webbench", "query": "Browse the \"services\" section in Dallas, TX for listings related to \"computer repair\" and note down the business names from the top five ads.", "start_url": "https://newyork.craigslist.org", "metadata": {"original_task_id": "wb-296", "website": "newyork.craigslist.org", "category": "READ", "additional": {"webbench_id": 296, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-315", "dataset": "webbench", "query": "Retrieve a list of potential membership plans, and list the best features of each.", "start_url": "https://www.crunchbase.com", "metadata": {"original_task_id": "wb-315", "website": "crunchbase.com", "category": "READ", "additional": {"webbench_id": 315, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-317", "dataset": "webbench", "query": "Access the Crunchbase profile for Uber and extract the details of its most recent funding round.", "start_url": "https://www.crunchbase.com", "metadata": {"original_task_id": "wb-317", "website": "crunchbase.com", "category": "READ", "additional": {"webbench_id": 317, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-318", "dataset": "webbench", "query": "Search Crunchbase for companies with a valuation above $1 billion and output the names of the first 5 companies.", "start_url": "https://www.crunchbase.com", "metadata": {"original_task_id": "wb-318", "website": "crunchbase.com", "category": "READ", "additional": {"webbench_id": 318, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-345", "dataset": "webbench", "query": "Use the Delish search bar to look up \"Marry Me Chicken\" and list the ingredient quantities mentioned in the recipe.", "start_url": "https://www.delish.com", "metadata": {"original_task_id": "wb-345", "website": "delish.com", "category": "READ", "additional": {"webbench_id": 345, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-347", "dataset": "webbench", "query": "Locate the step-by-step guide for making \"No-Bake Cheesecake\" on Delish and list all the preparation steps provided.", "start_url": "https://www.delish.com", "metadata": {"original_task_id": "wb-347", "website": "delish.com", "category": "READ", "additional": {"webbench_id": 347, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-360", "dataset": "webbench", "query": "Explore the Groups section to identify 3 active art collaboration groups and list their group names.", "start_url": "https://www.deviantart.com", "metadata": {"original_task_id": "wb-360", "website": "deviantart.com", "category": "READ", "additional": {"webbench_id": 360, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-362", "dataset": "webbench", "query": "Visit the profile of the artist “BisBiswas” and return the number of pageviews and deviations, as well as the artist's birthday", "start_url": "https://www.deviantart.com", "metadata": {"original_task_id": "wb-362", "website": "deviantart.com", "category": "READ", "additional": {"webbench_id": 362, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-390", "dataset": "webbench", "query": "Visit the Podcast section, select the latest episode, and provide its title along with a brief description.", "start_url": "https://www.dw.com", "metadata": {"original_task_id": "wb-390", "website": "dw.com", "category": "READ", "additional": {"webbench_id": 390, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-410", "dataset": "webbench", "query": "Use the site’s search function to look for “cryptocurrency regulations” and list the first 5 article titles with their publication dates.", "start_url": "https://economictimes.indiatimes.com", "metadata": {"original_task_id": "wb-410", "website": "economictimes.indiatimes.com", "category": "READ", "additional": {"webbench_id": 410, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-412", "dataset": "webbench", "query": "Navigate to the Markets section and record the current BSE and NSE index values as displayed on the dashboard.", "start_url": "https://economictimes.indiatimes.com", "metadata": {"original_task_id": "wb-412", "website": "economictimes.indiatimes.com", "category": "READ", "additional": {"webbench_id": 412, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-421", "dataset": "webbench", "query": "Select the 'Page information' option under the 'Tools' dropdown to see how many page views the 'COVID-19' article has in the past 30 days.", "start_url": "https://en.wikipedia.org/wiki/Main_Page", "metadata": {"original_task_id": "wb-421", "website": "en.wikipedia.org", "category": "READ", "additional": {"webbench_id": 421, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-432", "dataset": "webbench", "query": "Find the latest update on air quality data for Los Angeles on the EPA’s Air Quality System page and record the reported index value.", "start_url": "https://www.epa.gov", "metadata": {"original_task_id": "wb-432", "website": "epa.gov", "category": "READ", "additional": {"webbench_id": 432, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-483", "dataset": "webbench", "query": "Filter search results for “pet-friendly hotels” in San Diego and list three hotels with their names and starting prices.", "start_url": "https://www.expedia.com/", "metadata": {"original_task_id": "wb-483", "website": "expedia.com", "category": "READ", "additional": {"webbench_id": 483, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-505", "dataset": "webbench", "query": "Find an image posted to the Stranger Things Wiki forum and identify the dimensions of the largest downloadable option", "start_url": "https://www.fandom.com", "metadata": {"original_task_id": "wb-505", "website": "fandom.com", "category": "READ", "additional": {"webbench_id": 505, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-509", "dataset": "webbench", "query": "Look up the latest FDA guidance on AI/ML in medical device software and summarize the key points mentioned in the introduction.", "start_url": "https://www.fda.gov", "metadata": {"original_task_id": "wb-509", "website": "fda.gov", "category": "READ", "additional": {"webbench_id": 509, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-518", "dataset": "webbench", "query": "Browse the parenting blog on FirstCry and extract the title and publication date of the latest article about baby nutrition.", "start_url": "https://www.firstcry.com", "metadata": {"original_task_id": "wb-518", "website": "firstcry.com", "category": "READ", "additional": {"webbench_id": 518, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-520", "dataset": "webbench", "query": "Locate the product page for the \"Chicco KeyFit 30 Infant Car Seat\" and summarize its detailed specifications, available colors, and safety ratings.", "start_url": "https://www.firstcry.com", "metadata": {"original_task_id": "wb-520", "website": "firstcry.com", "category": "READ", "additional": {"webbench_id": 520, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-535", "dataset": "webbench", "query": "Browse the Explore page and extract the titles of the top 5 most favorited photos currently trending.", "start_url": "https://flickr.com", "metadata": {"original_task_id": "wb-535", "website": "flickr.com", "category": "READ", "additional": {"webbench_id": 535, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-543", "dataset": "webbench", "query": "In the \"Laptops\" section, apply the filter for \"Dell\" and extract the average discount percentage on the first 3 Dell laptops displayed.", "start_url": "https://www.flipkart.com", "metadata": {"original_task_id": "wb-543", "website": "flipkart.com", "category": "READ", "additional": {"webbench_id": 543, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-560", "dataset": "webbench", "query": "Search Forbes for the latest startup that raised over $500 million", "start_url": "https://www.forbes.com", "metadata": {"original_task_id": "wb-560", "website": "forbes.com", "category": "READ", "additional": {"webbench_id": 560, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-570", "dataset": "webbench", "query": "Navigate to the Ford homepage and locate detailed specifications for the latest Ford Mustang, then list the engine type, horsepower, and fuel efficiency details.", "start_url": "https://www.ford.com", "metadata": {"original_task_id": "wb-570", "website": "ford.com", "category": "READ", "additional": {"webbench_id": 570, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-604", "dataset": "webbench", "query": "Use the search function to find articles about \"Cyberpunk 2077\" and provide the headlines of the first five results.", "start_url": "https://gamerant.com", "metadata": {"original_task_id": "wb-604", "website": "gamerant.com", "category": "READ", "additional": {"webbench_id": 604, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-605", "dataset": "webbench", "query": "Use the website’s filtering tools to display \"Retro Gaming\" articles and extract the titles of the top three most recent posts.", "start_url": "https://gamerant.com", "metadata": {"original_task_id": "wb-605", "website": "gamerant.com", "category": "READ", "additional": {"webbench_id": 605, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-613", "dataset": "webbench", "query": "Search for \"E3 2023\" coverage articles on Gamespot and list the titles of the first five relevant articles you find.", "start_url": "https://www.gamespot.com", "metadata": {"original_task_id": "wb-613", "website": "gamespot.com", "category": "READ", "additional": {"webbench_id": 613, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-635", "dataset": "webbench", "query": "What email should I contact if I'm interesting in working for Genius as a journalist?", "start_url": "https://genius.com", "metadata": {"original_task_id": "wb-635", "website": "genius.com", "category": "READ", "additional": {"webbench_id": 635, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-639", "dataset": "webbench", "query": "Filter search results for \"vintage journalism\" images by the \"Editorial\" category and output the license types for the first 5 assets.", "start_url": "https://www.gettyimages.com", "metadata": {"original_task_id": "wb-639", "website": "gettyimages.com", "category": "READ", "additional": {"webbench_id": 639, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-668", "dataset": "webbench", "query": "Search for repositories containing the phrase \"machine learning\" in their description and extract the names of the first 10 results.", "start_url": "https://github.com", "metadata": {"original_task_id": "wb-668", "website": "github.com", "category": "READ", "additional": {"webbench_id": 668, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-688", "dataset": "webbench", "query": "Navigate to the New York City Bowl and display the number of members as well as the bio of the bowl.", "start_url": "https://www.glassdoor.com/index.htm", "metadata": {"original_task_id": "wb-688", "website": "glassdoor.com", "category": "READ", "additional": {"webbench_id": 688, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-695", "dataset": "webbench", "query": "Find the match analysis of the last Champions League game and write down the main tactical observation discussed in the article.", "start_url": "https://www.goal.com/en-us", "metadata": {"original_task_id": "wb-695", "website": "goal.com", "category": "READ", "additional": {"webbench_id": 695, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-696", "dataset": "webbench", "query": "Search for the latest match report on Manchester United and list the headline of the report.", "start_url": "https://www.goal.com/en-us", "metadata": {"original_task_id": "wb-696", "website": "goal.com", "category": "READ", "additional": {"webbench_id": 696, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-744", "dataset": "webbench", "query": "Search for Italian restaurants in zip code 60611 (Chicago, IL), sort by \"4 Stars and Up,\" and list 5 restaurants rated 4.7 and above.", "start_url": "https://www.grubhub.com", "metadata": {"original_task_id": "wb-744", "website": "grubhub.com", "category": "READ", "additional": {"webbench_id": 744, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-790", "dataset": "webbench", "query": "Browse the \"Research\" section to identify a recently published news article on cardiovascular research, then note its title and publication date.", "start_url": "https://www.hopkinsmedicine.org", "metadata": {"original_task_id": "wb-790", "website": "hopkinsmedicine.org", "category": "READ", "additional": {"webbench_id": 790, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-812", "dataset": "webbench", "query": "Browse the \"Technology\" hub and extract the titles of the three most recent articles.", "start_url": "https://discover.hubpages.com/", "metadata": {"original_task_id": "wb-812", "website": "discover.hubpages.com", "category": "READ", "additional": {"webbench_id": 812, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-814", "dataset": "webbench", "query": "Use the HubPages search function to find articles on travel and summarize the main idea of the first result.", "start_url": "https://discover.hubpages.com/", "metadata": {"original_task_id": "wb-814", "website": "discover.hubpages.com", "category": "READ", "additional": {"webbench_id": 814, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-834", "dataset": "webbench", "query": "On the product page for the \"MALM bed frame,\" scroll to the product details section to find the assembly instructions and extract the first three steps described.", "start_url": "https://www.ikea.com", "metadata": {"original_task_id": "wb-834", "website": "ikea.com", "category": "READ", "additional": {"webbench_id": 834, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-841", "dataset": "webbench", "query": "Find any upcoming public meetings or events related to state government present on the site and list the dates and topics for at least two events.", "start_url": "https://www.illinois.gov", "metadata": {"original_task_id": "wb-841", "website": "illinois.gov", "category": "READ", "additional": {"webbench_id": 841, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-849", "dataset": "webbench", "query": "Check language requirements for a \"Bank Teller\" jobs in Miami, then see if Spanish is a required", "start_url": "http://indeed.com", "metadata": {"original_task_id": "wb-849", "website": "indeed.com", "category": "READ", "additional": {"webbench_id": 849, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-851", "dataset": "webbench", "query": "Search for \"Software Engineer\" jobs in New York and list the first five job titles", "start_url": "https://www.indeed.com/", "metadata": {"original_task_id": "wb-851", "website": "indeed.com", "category": "READ", "additional": {"webbench_id": 851, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-860", "dataset": "webbench", "query": "Look up \"LED lighting systems\" and note any indicators of supplier verification and lead response times that appear on the product pages.", "start_url": "https://www.indiamart.com", "metadata": {"original_task_id": "wb-860", "website": "indiamart.com", "category": "READ", "additional": {"webbench_id": 860, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-883", "dataset": "webbench", "query": "Search for organic bananas on Instacart and list the top 3 prices along with their retailer names.", "start_url": "https://www.instacart.com", "metadata": {"original_task_id": "wb-883", "website": "instacart.com", "category": "READ", "additional": {"webbench_id": 883, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-902", "dataset": "webbench", "query": "Use the iStock search bar to find stock photos of \"autumn forest\" and list the titles of the first 5 images that appear.", "start_url": "https://www.istockphoto.com", "metadata": {"original_task_id": "wb-902", "website": "istockphoto.com", "category": "READ", "additional": {"webbench_id": 902, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-917", "dataset": "webbench", "query": "Locate the latest solved question paper for SSC exams and enumerate the subjects included in the paper.", "start_url": "https://www.jagranjosh.com", "metadata": {"original_task_id": "wb-917", "website": "jagranjosh.com", "category": "READ", "additional": {"webbench_id": 917, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-930", "dataset": "webbench", "query": "Extract the contact email and phone number from the business profile of “City Hospital” in Chennai.", "start_url": "https://www.justdial.com", "metadata": {"original_task_id": "wb-930", "website": "justdial.com", "category": "READ", "additional": {"webbench_id": 930, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-931", "dataset": "webbench", "query": "Retrieve the latest press release from the Justice Department and summarize its key points.", "start_url": "https://www.justice.gov", "metadata": {"original_task_id": "wb-931", "website": "justice.gov", "category": "READ", "additional": {"webbench_id": 931, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-932", "dataset": "webbench", "query": "Browse for any content on collaborative law enforcement initiatives and provide a short summary of its main points.", "start_url": "https://www.justice.gov", "metadata": {"original_task_id": "wb-932", "website": "justice.gov", "category": "READ", "additional": {"webbench_id": 932, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-934", "dataset": "webbench", "query": "Browse the DOJ homepage and list the titles of the main sections displayed.", "start_url": "https://www.justice.gov", "metadata": {"original_task_id": "wb-934", "website": "justice.gov", "category": "READ", "additional": {"webbench_id": 934, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-952", "dataset": "webbench", "query": "Find the \"Ask an Expert\" information page (if available) and summarize the guidelines on how parents can contact pediatric specialists.", "start_url": "https://kidshealth.org", "metadata": {"original_task_id": "wb-952", "website": "kidshealth.org", "category": "READ", "additional": {"webbench_id": 952, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-971", "dataset": "webbench", "query": "Visit the \"Customer Service\" page and extract the live chat support hours, outputting them as a short text summary.", "start_url": "https://www.kohls.com", "metadata": {"original_task_id": "wb-971", "website": "kohls.com", "category": "READ", "additional": {"webbench_id": 971, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-973", "dataset": "webbench", "query": "Open the digital circulars section and extract the details (product names, prices, and discounts) for all items in the “Cereal” category.", "start_url": "https://www.kroger.com", "metadata": {"original_task_id": "wb-973", "website": "kroger.com", "category": "READ", "additional": {"webbench_id": 973, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-984", "dataset": "webbench", "query": "Use the historical trends tool to list the top three most amended clauses over the past year by frequency.", "start_url": "https://www.lawinsider.com", "metadata": {"original_task_id": "wb-984", "website": "lawinsider.com", "category": "READ", "additional": {"webbench_id": 984, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-990", "dataset": "webbench", "query": "Search the support section for the latest article on Lenovo AI Core integration and note the main benefits described.", "start_url": "https://www.lenovo.com/us/en", "metadata": {"original_task_id": "wb-990", "website": "lenovo.com", "category": "READ", "additional": {"webbench_id": 990, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1001", "dataset": "webbench", "query": "Use the advanced search to filter films in the \"Documentary\" genre released after 2018 and provide the titles of the first 10 films.", "start_url": "https://letterboxd.com", "metadata": {"original_task_id": "wb-1001", "website": "letterboxd.com", "category": "READ", "additional": {"webbench_id": 1001, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1028", "dataset": "webbench", "query": "Navigate to the “Jobs” section, apply the filter for “Remote” positions, and list the titles of the top 3 job listings.", "start_url": "https://www.linkedin.com", "metadata": {"original_task_id": "wb-1028", "website": "linkedin.com", "category": "READ", "additional": {"webbench_id": 1028, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1029", "dataset": "webbench", "query": "Search for and list the titles of the first 5 LinkedIn articles in the “Technology” category.", "start_url": "https://www.linkedin.com", "metadata": {"original_task_id": "wb-1029", "website": "linkedin.com", "category": "READ", "additional": {"webbench_id": 1029, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1044", "dataset": "webbench", "query": "Search for blog posts tagged \"travel\" on LiveJournal and list the titles of the three most recent entries.", "start_url": "https://www.livejournal.com", "metadata": {"original_task_id": "wb-1044", "website": "livejournal.com", "category": "READ", "additional": {"webbench_id": 1044, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1058", "dataset": "webbench", "query": "In the legislation section, locate a document related to voting rights and list its title and publication date.", "start_url": "https://www.loc.gov", "metadata": {"original_task_id": "wb-1058", "website": "loc.gov", "category": "READ", "additional": {"webbench_id": 1058, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1083", "dataset": "webbench", "query": "Search for \"Levi's 501 jeans\" and filter the results by male only, and size Large. Return the number of items available and the price and number of ratings for the first item.", "start_url": "https://www.macys.com", "metadata": {"original_task_id": "wb-1083", "website": "macys.com", "category": "READ", "additional": {"webbench_id": 1083, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1094", "dataset": "webbench", "query": "Navigate to the \"Food\" section and list the names and prices of the first five ready-meal products.", "start_url": "https://www.marksandspencer.com", "metadata": {"original_task_id": "wb-1094", "website": "marksandspencer.com", "category": "READ", "additional": {"webbench_id": 1094, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1099", "dataset": "webbench", "query": "Navigate to the Marriott Bonvoy page and extract the details of current Gold tier benefits, including one example of a member-exclusive offer.", "start_url": "https://www.marriott.com/default.mi", "metadata": {"original_task_id": "wb-1099", "website": "marriott.com", "category": "READ", "additional": {"webbench_id": 1099, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1108", "dataset": "webbench", "query": "Review an article related to digital marketing trends and extract any statistics or data figures included in the text.", "start_url": "https://mashable.com", "metadata": {"original_task_id": "wb-1108", "website": "mashable.com", "category": "READ", "additional": {"webbench_id": 1108, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1110", "dataset": "webbench", "query": "Browse the Mashable homepage and list the top three trending headlines along with their publication dates.", "start_url": "https://mashable.com", "metadata": {"original_task_id": "wb-1110", "website": "mashable.com", "category": "READ", "additional": {"webbench_id": 1110, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1145", "dataset": "webbench", "query": "Browse the TV shows category and list the titles, metascores, and number of critic reviews for shows scoring below 60 with at least 10 critic reviews.", "start_url": "https://www.metacritic.com", "metadata": {"original_task_id": "wb-1145", "website": "metacritic.com", "category": "READ", "additional": {"webbench_id": 1145, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1152", "dataset": "webbench", "query": "Search for articles mentioning \"Brexit\" and list the titles of the first five results.", "start_url": "https://metro.co.uk", "metadata": {"original_task_id": "wb-1152", "website": "metro.co.uk", "category": "READ", "additional": {"webbench_id": 1152, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1155", "dataset": "webbench", "query": "Search for the Michigan CARS e-Services page and list the steps provided for accessing vehicle records.", "start_url": "https://www.michigan.gov/som", "metadata": {"original_task_id": "wb-1155", "website": "michigan.gov", "category": "READ", "additional": {"webbench_id": 1155, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1158", "dataset": "webbench", "query": "Access the Michigan Department of Health page via Michigan.gov and list the steps provided for scheduling a vaccination appointment.", "start_url": "https://www.michigan.gov/som", "metadata": {"original_task_id": "wb-1158", "website": "michigan.gov", "category": "READ", "additional": {"webbench_id": 1158, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1161", "dataset": "webbench", "query": "Access the MLB.TV subscription page and extract the available pricing options and plan durations offered.", "start_url": "https://www.mlb.com", "metadata": {"original_task_id": "wb-1161", "website": "mlb.com", "category": "READ", "additional": {"webbench_id": 1161, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1163", "dataset": "webbench", "query": "Go to the “Businesses” section and extract details about any state-sponsored business incentives or programs available for Missouri businesses.", "start_url": "https://www.mo.gov", "metadata": {"original_task_id": "wb-1163", "website": "mo.gov", "category": "READ", "additional": {"webbench_id": 1163, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1168", "dataset": "webbench", "query": "Find the five most recent movie news articles on Movieweb and list their headlines with publication dates.", "start_url": "https://movieweb.com", "metadata": {"original_task_id": "wb-1168", "website": "movieweb.com", "category": "READ", "additional": {"webbench_id": 1168, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1183", "dataset": "webbench", "query": "Check the trading summary for the NYSE and record its current value.", "start_url": "https://www.nasdaq.com", "metadata": {"original_task_id": "wb-1183", "website": "nasdaq.com", "category": "READ", "additional": {"webbench_id": 1183, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1194", "dataset": "webbench", "query": "Go to the live schedule section and list the start times and matchups for all NFL games scheduled for today.", "start_url": "https://www.nbcsports.com", "metadata": {"original_task_id": "wb-1194", "website": "nbcsports.com", "category": "READ", "additional": {"webbench_id": 1194, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1202", "dataset": "webbench", "query": "Visit the NerdWallet personal loans comparison page and extract the interest rate ranges for the top five loan options listed.", "start_url": "https://www.nerdwallet.com", "metadata": {"original_task_id": "wb-1202", "website": "nerdwallet.com", "category": "READ", "additional": {"webbench_id": 1202, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1232", "dataset": "webbench", "query": "Navigate to Nordstrom’s \"Designer\" section and list five names of featured designer brands.", "start_url": "https://www.nordstrom.com", "metadata": {"original_task_id": "wb-1232", "website": "nordstrom.com", "category": "READ", "additional": {"webbench_id": 1232, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1302", "dataset": "webbench", "query": "Search for hotels in New York City for December 10–15 and list the three properties with the highest guest ratings and best amenities.", "start_url": "https://www.orbitz.com/", "metadata": {"original_task_id": "wb-1302", "website": "orbitz.com", "category": "READ", "additional": {"webbench_id": 1302, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1303", "dataset": "webbench", "query": "Review the detailed amenities and guest feedback for the \"Marriott Marquis\" in Chicago by navigating to its hotel description page.", "start_url": "https://www.orbitz.com/", "metadata": {"original_task_id": "wb-1303", "website": "orbitz.com", "category": "READ", "additional": {"webbench_id": 1303, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1309", "dataset": "webbench", "query": "Find a list of student organizations and clubs and filter the list to display organizations related to computer science.", "start_url": "https://www.osu.edu", "metadata": {"original_task_id": "wb-1309", "website": "osu.edu", "category": "READ", "additional": {"webbench_id": 1309, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1313", "dataset": "webbench", "query": "Search for a PBS news article on civic engagement and output the publication date along with the author’s name.", "start_url": "https://www.pbs.org", "metadata": {"original_task_id": "wb-1313", "website": "pbs.org", "category": "READ", "additional": {"webbench_id": 1313, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1325", "dataset": "webbench", "query": "Access the digital magazine archive and identify the issue that covers hardware benchmarks; provide its publication month.", "start_url": "https://www.pcgamer.com", "metadata": {"original_task_id": "wb-1325", "website": "pcgamer.com", "category": "READ", "additional": {"webbench_id": 1325, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1347", "dataset": "webbench", "query": "Search for boards about sustainable living, identify the one with the most pins, and list both the board name and follower count of the creator", "start_url": "https://www.pinterest.com", "metadata": {"original_task_id": "wb-1347", "website": "pinterest.com", "category": "READ", "additional": {"webbench_id": 1347, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1357", "dataset": "webbench", "query": "Search for \"sunset\" on Pixabay and list the titles and resolution details (e.g., pixel dimensions) of the first 5 images displayed.", "start_url": "https://pixabay.com", "metadata": {"original_task_id": "wb-1357", "website": "pixabay.com", "category": "READ", "additional": {"webbench_id": 1357, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1359", "dataset": "webbench", "query": "Search for \"vintage\" imagery and list the publication dates (if provided) and contributor ids (if provided) for the top 5 results.", "start_url": "https://pixabay.com", "metadata": {"original_task_id": "wb-1359", "website": "pixabay.com", "category": "READ", "additional": {"webbench_id": 1359, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1369", "dataset": "webbench", "query": "Browse the homepage and identify the main featured release; provide its headline and a brief summary.", "start_url": "https://www.playstation.com/en-us", "metadata": {"original_task_id": "wb-1369", "website": "playstation.com", "category": "READ", "additional": {"webbench_id": 1369, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1412", "dataset": "webbench", "query": "Scroll to the bottom of Real Simple’s homepage and list any featured tags or categories shown in the footer section.", "start_url": "https://www.realsimple.com", "metadata": {"original_task_id": "wb-1412", "website": "realsimple.com", "category": "READ", "additional": {"webbench_id": 1412, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1434", "dataset": "webbench", "query": "Access the property valuation tool and check the estimated home value for a \"3560 Nashville Hwy\" property; record the valuation along with the estimate's date.", "start_url": "https://www.redfin.com", "metadata": {"original_task_id": "wb-1434", "website": "redfin.com", "category": "READ", "additional": {"webbench_id": 1434, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1438", "dataset": "webbench", "query": "Search for \"hiking boots\" on REI.com and list the names, prices, and ratings of the top three results.", "start_url": "https://www.rei.com", "metadata": {"original_task_id": "wb-1438", "website": "rei.com", "category": "READ", "additional": {"webbench_id": 1438, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1440", "dataset": "webbench", "query": "Filter search results for \"pet-friendly rentals\" in Chicago, IL and report the total number of listings available.", "start_url": "https://www.rent.com", "metadata": {"original_task_id": "wb-1440", "website": "rent.com", "category": "READ", "additional": {"webbench_id": 1440, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1485", "dataset": "webbench", "query": "Compare the ratings of \"Blade Runner 2049\" and \"Mad Max: Fury Road\" by noting their Tomatometer and Audience scores, describing any noticeable differences.", "start_url": "https://www.rottentomatoes.com", "metadata": {"original_task_id": "wb-1485", "website": "rottentomatoes.com", "category": "READ", "additional": {"webbench_id": 1485, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1486", "dataset": "webbench", "query": "Look up the FAQ section and record any information regarding membership renewal policies.", "start_url": "https://www.samsclub.com", "metadata": {"original_task_id": "wb-1486", "website": "samsclub.com", "category": "READ", "additional": {"webbench_id": 1486, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1503", "dataset": "webbench", "query": "What is the description of the game 'Marvel Rivals'?", "start_url": "https://screenrant.com", "metadata": {"original_task_id": "wb-1503", "website": "screenrant.com", "category": "READ", "additional": {"webbench_id": 1503, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1520", "dataset": "webbench", "query": "Browse the \"New In\" section and list the product names, prices, and available colors of the top 5 most popular items.", "start_url": "https://us.shein.com/?ref=www&rep=dir&ret=us", "metadata": {"original_task_id": "wb-1520", "website": "us.shein.com", "category": "READ", "additional": {"webbench_id": 1520, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1544", "dataset": "webbench", "query": "Search for \"Sky Original Films\" in the Sky Cinema section and provide a list of the latest 5 original series titles featured on the page.", "start_url": "https://www.sky.com", "metadata": {"original_task_id": "wb-1544", "website": "sky.com", "category": "READ", "additional": {"webbench_id": 1544, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1566", "dataset": "webbench", "query": "Explore the heatmap visualization on a live football game page and specify the zone where the highest concentration of shots occurred.", "start_url": "https://www.sofascore.com", "metadata": {"original_task_id": "wb-1566", "website": "sofascore.com", "category": "READ", "additional": {"webbench_id": 1566, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1581", "dataset": "webbench", "query": "Browse the \"Tennis\" section on Sportskeeda and list the headlines of the latest 5 articles related to the Australian Open.", "start_url": "https://www.sportskeeda.com", "metadata": {"original_task_id": "wb-1581", "website": "sportskeeda.com", "category": "READ", "additional": {"webbench_id": 1581, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1607", "dataset": "webbench", "query": "Look up the \"Spotify AI DJ\" feature, read about its functionality, and provide a brief summary of the main steps described.", "start_url": "https://open.spotify.com", "metadata": {"original_task_id": "wb-1607", "website": "open.spotify.com", "category": "READ", "additional": {"webbench_id": 1607, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1662", "dataset": "webbench", "query": "Use the main search bar to find questions tagged with \"python\" in the Stack Overflow community and output the titles of the first 5 results.", "start_url": "https://stackexchange.com", "metadata": {"original_task_id": "wb-1662", "website": "stackexchange.com", "category": "READ", "additional": {"webbench_id": 1662, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1663", "dataset": "webbench", "query": "Search for posts containing the keyword \"server error 500\" on the Software Engineering community and output the titles of the top 3 matching questions.", "start_url": "https://stackexchange.com", "metadata": {"original_task_id": "wb-1663", "website": "stackexchange.com", "category": "READ", "additional": {"webbench_id": 1663, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1691", "dataset": "webbench", "query": "Navigate to a popular question on debugging techniques in Java and list all the tags associated with it.", "start_url": "https://stackoverflow.com/questions", "metadata": {"original_task_id": "wb-1691", "website": "stackoverflow.com", "category": "READ", "additional": {"webbench_id": 1691, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1700", "dataset": "webbench", "query": "Use the website’s search function to look up \"London transport updates\" and provide the titles and publication dates of the first three articles that appear.", "start_url": "https://www.standard.co.uk", "metadata": {"original_task_id": "wb-1700", "website": "standard.co.uk", "category": "READ", "additional": {"webbench_id": 1700, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1701", "dataset": "webbench", "query": "Open the \"Technology & Media\" section and locate an article about London-based tech startups; then copy the first paragraph of that article.", "start_url": "https://www.standard.co.uk", "metadata": {"original_task_id": "wb-1701", "website": "standard.co.uk", "category": "READ", "additional": {"webbench_id": 1701, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1735", "dataset": "webbench", "query": "Search for study materials available for \"Computer Science 101\" and list the titles of the first 5 documents.", "start_url": "https://www.studocu.com", "metadata": {"original_task_id": "wb-1735", "website": "studocu.com", "category": "READ", "additional": {"webbench_id": 1735, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1738", "dataset": "webbench", "query": "Filter documents by course \"Calculus I\" for the Harvard University repository and record the names of the top 3 most rated documents.", "start_url": "https://www.studocu.com", "metadata": {"original_task_id": "wb-1738", "website": "studocu.com", "category": "READ", "additional": {"webbench_id": 1738, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1741", "dataset": "webbench", "query": "Use the search function to find newsletters about \"art\" and extract the first 5 newsletter names along with their subscription models (free or paid).", "start_url": "https://substack.com/home", "metadata": {"original_task_id": "wb-1741", "website": "substack.com", "category": "READ", "additional": {"webbench_id": 1741, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1743", "dataset": "webbench", "query": "Visit a popular publication’s archive and list the publication dates for its 5 most recent posts.", "start_url": "https://substack.com/home", "metadata": {"original_task_id": "wb-1743", "website": "substack.com", "category": "READ", "additional": {"webbench_id": 1743, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1786", "dataset": "webbench", "query": "Navigate to the homepage and use the search bar to find the latest news article about \"Elden Ring\"; then copy the first three paragraphs of the article's body.", "start_url": "https://www.thegamer.com", "metadata": {"original_task_id": "wb-1786", "website": "thegamer.com", "category": "READ", "additional": {"webbench_id": 1786, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1800", "dataset": "webbench", "query": "Use the advanced search to filter movies released in 2022 and output the first 5 results with their average ratings.", "start_url": "https://www.themoviedb.org", "metadata": {"original_task_id": "wb-1800", "website": "themoviedb.org", "category": "READ", "additional": {"webbench_id": 1800, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1809", "dataset": "webbench", "query": "Browse the \"Food & Recipes\" category on The Pioneer Woman website and list the titles and publication dates of the latest 5 recipe posts.", "start_url": "https://www.thepioneerwoman.com", "metadata": {"original_task_id": "wb-1809", "website": "thepioneerwoman.com", "category": "READ", "additional": {"webbench_id": 1809, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1831", "dataset": "webbench", "query": "Visit the profile of a specific user (e.g., @username) and list the titles of their three most recent posts.", "start_url": "https://www.threads.net", "metadata": {"original_task_id": "wb-1831", "website": "threads.net", "category": "READ", "additional": {"webbench_id": 1831, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1839", "dataset": "webbench", "query": "Navigate to the Discover page and extract the top 3 trending hashtags currently shown.", "start_url": "https://www.tiktok.com/explore", "metadata": {"original_task_id": "wb-1839", "website": "tiktok.com", "category": "READ", "additional": {"webbench_id": 1839, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1860", "dataset": "webbench", "query": "Search for articles on “healthy breakfast recipes” and display the title and summary of the top result.", "start_url": "https://www.today.com", "metadata": {"original_task_id": "wb-1860", "website": "today.com", "category": "READ", "additional": {"webbench_id": 1860, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1862", "dataset": "webbench", "query": "Access the Opinion section, search for commentary on “climate change,” and list the titles of the three most recent pieces.", "start_url": "https://www.today.com", "metadata": {"original_task_id": "wb-1862", "website": "today.com", "category": "READ", "additional": {"webbench_id": 1862, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1877", "dataset": "webbench", "query": "Search for destination guides for Italy and extract the titles and publication dates of the first three results.", "start_url": "https://www.travelandleisure.com", "metadata": {"original_task_id": "wb-1877", "website": "travelandleisure.com", "category": "READ", "additional": {"webbench_id": 1877, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1880", "dataset": "webbench", "query": "Look up hotels in Paris, France; filter for 4‑star properties priced under $200 per night and list 5 hotel names with their addresses.", "start_url": "https://www.travelocity.com", "metadata": {"original_task_id": "wb-1880", "website": "travelocity.com", "category": "READ", "additional": {"webbench_id": 1880, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1882", "dataset": "webbench", "query": "Retrieve the detailed cancellation policy for a hotel booking in Orlando, FL by selecting a specific property and reserving a room.", "start_url": "https://www.travelocity.com", "metadata": {"original_task_id": "wb-1882", "website": "travelocity.com", "category": "READ", "additional": {"webbench_id": 1882, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1886", "dataset": "webbench", "query": "Identify an analytical report on “tourism recovery” and extract the entire executive summary in text format.", "start_url": "https://www.travelweekly.com", "metadata": {"original_task_id": "wb-1886", "website": "travelweekly.com", "category": "READ", "additional": {"webbench_id": 1886, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1895", "dataset": "webbench", "query": "Filter hotel search results in Paris by selecting properties that offer free breakfast; then extract and list the names and average review scores of the first 5 hotels.", "start_url": "https://www.trivago.com", "metadata": {"original_task_id": "wb-1895", "website": "trivago.com", "category": "READ", "additional": {"webbench_id": 1895, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1940", "dataset": "webbench", "query": "Search for “undergraduate admissions requirements” on the website and summarize three key criteria mentioned.", "start_url": "https://umich.edu", "metadata": {"original_task_id": "wb-1940", "website": "umich.edu", "category": "READ", "additional": {"webbench_id": 1940, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1967", "dataset": "webbench", "query": "Visit the \"Contact Us\" page and record the main administrative office’s phone number and email address.", "start_url": "https://www.upenn.edu", "metadata": {"original_task_id": "wb-1967", "website": "upenn.edu", "category": "READ", "additional": {"webbench_id": 1967, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2018", "dataset": "webbench", "query": "Use the search function to find articles on \"climate change policy\" and extract the publication dates of the first five results.", "start_url": "https://www.usnews.com", "metadata": {"original_task_id": "wb-2018", "website": "usnews.com", "category": "READ", "additional": {"webbench_id": 2018, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2021", "dataset": "webbench", "query": "Search the site for \"economic trends 2023\" and list the titles and publication dates of the first five matching articles.", "start_url": "https://www.usnews.com", "metadata": {"original_task_id": "wb-2021", "website": "usnews.com", "category": "READ", "additional": {"webbench_id": 2021, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2023", "dataset": "webbench", "query": "Locate the University’s dedicated AI portal (e.g., ai.utah.edu) and list one AI research tool or resource available for users.", "start_url": "https://www.utah.edu", "metadata": {"original_task_id": "wb-2023", "website": "utah.edu", "category": "READ", "additional": {"webbench_id": 2023, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2024", "dataset": "webbench", "query": "Look up \"CIS\" (Campus Information Systems) on the site and describe its primary function or purpose.", "start_url": "https://www.utah.edu", "metadata": {"original_task_id": "wb-2024", "website": "utah.edu", "category": "READ", "additional": {"webbench_id": 2024, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2027", "dataset": "webbench", "query": "Use the facility locator tool to list the names and addresses of the first three VA facilities near Arlington, VA.", "start_url": "https://www.va.gov", "metadata": {"original_task_id": "wb-2027", "website": "va.gov", "category": "READ", "additional": {"webbench_id": 2027, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2050", "dataset": "webbench", "query": "Browse the VRBO blog section and retrieve the headline of the most recent travel tips article.", "start_url": "https://www.vrbo.com", "metadata": {"original_task_id": "wb-2050", "website": "vrbo.com", "category": "READ", "additional": {"webbench_id": 2050, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2054", "dataset": "webbench", "query": "Search for COVID-19 vaccinations on Walgreens and list the available appointment options, eligibility criteria, and any cost details provided.", "start_url": "https://www.walgreens.com", "metadata": {"original_task_id": "wb-2054", "website": "walgreens.com", "category": "READ", "additional": {"webbench_id": 2054, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2071", "dataset": "webbench", "query": "Filter the furniture category by \"sectional sofas\" and extract the dimensions, available colors, and material details of the first product listed.", "start_url": "https://www.wayfair.com", "metadata": {"original_task_id": "wb-2071", "website": "wayfair.com", "category": "READ", "additional": {"webbench_id": 2071, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2072", "dataset": "webbench", "query": "Use the search function to locate \"recliner chairs\" and record the average customer rating along with one highlighted customer comment from the top result.", "start_url": "https://www.wayfair.com", "metadata": {"original_task_id": "wb-2072", "website": "wayfair.com", "category": "READ", "additional": {"webbench_id": 2072, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2097", "dataset": "webbench", "query": "Run a SPARQL query that retrieves the population of all countries in Europe.", "start_url": "https://www.wikidata.org/wiki/Wikidata:Main_Page", "metadata": {"original_task_id": "wb-2097", "website": "wikidata.org", "category": "READ", "additional": {"webbench_id": 2097, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2105", "dataset": "webbench", "query": "Navigate to the Relationships category on wikiHow and extract both the title and a short summary of the guide “How to Apologize Effectively.”", "start_url": "https://www.wikihow.com/Main-Page", "metadata": {"original_task_id": "wb-2105", "website": "wikihow.com", "category": "READ", "additional": {"webbench_id": 2105, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2124", "dataset": "webbench", "query": "Go to the events or webinars section and list the upcoming academic events, including dates and topics.", "start_url": "https://www.wiley.com/en-us", "metadata": {"original_task_id": "wb-2124", "website": "wiley.com", "category": "READ", "additional": {"webbench_id": 2124, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2149", "dataset": "webbench", "query": "Search for translations of the word \"friend\" into French, Spanish, and German and provide the corresponding translations.", "start_url": "https://www.wordhippo.com", "metadata": {"original_task_id": "wb-2149", "website": "wordhippo.com", "category": "READ", "additional": {"webbench_id": 2149, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2159", "dataset": "webbench", "query": "In a forum discussion on regional variations in Spanish, summarize the key differences mentioned by the community members.", "start_url": "https://www.wordreference.com", "metadata": {"original_task_id": "wb-2159", "website": "wordreference.com", "category": "READ", "additional": {"webbench_id": 2159, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2162", "dataset": "webbench", "query": "In the Italian–English dictionary, find the translation of \"friendship\" and report one or two example sentences demonstrating its usage.", "start_url": "https://www.wordreference.com", "metadata": {"original_task_id": "wb-2162", "website": "wordreference.com", "category": "READ", "additional": {"webbench_id": 2162, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2212", "dataset": "webbench", "query": "Locate a local gym in Oakland, CA with the highest rating on YellowPages and output its contact information along with membership hours.", "start_url": "https://www.yellowpages.com", "metadata": {"original_task_id": "wb-2212", "website": "yellowpages.com", "category": "READ", "additional": {"webbench_id": 2212, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2232", "dataset": "webbench", "query": "Browse the AI-generated job recommendations for a mid-level marketing role and extract the job titles, company names, and locations of the first 5 postings.", "start_url": "https://www.ziprecruiter.com", "metadata": {"original_task_id": "wb-2232", "website": "ziprecruiter.com", "category": "READ", "additional": {"webbench_id": 2232, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2287", "dataset": "webbench", "query": "Locate a Medium article from the publication \"The Startup\" and list its URL, title, and a brief summary of its content.", "start_url": "https://medium.com/explore-topics", "metadata": {"original_task_id": "wb-2287", "website": "medium.com", "category": "READ", "additional": {"webbench_id": 2287, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2289", "dataset": "webbench", "query": "Search for the latest article about holiday recipes on Parade.com and summarize the key steps or ingredients mentioned in the recipe.", "start_url": "https://parade.com", "metadata": {"original_task_id": "wb-2289", "website": "parade.com", "category": "READ", "additional": {"webbench_id": 2289, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2300", "dataset": "webbench", "query": "Open the “Trending Playlists” section, filter by the “Hip-Hop” genre, and output the titles of the top 3 trending tracks.", "start_url": "https://soundcloud.com", "metadata": {"original_task_id": "wb-2300", "website": "soundcloud.com", "category": "READ", "additional": {"webbench_id": 2300, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2302", "dataset": "webbench", "query": "Use the search function to find tracks containing the keyword “ambient” and output a list of the top 10 track titles along with their durations.", "start_url": "https://soundcloud.com", "metadata": {"original_task_id": "wb-2302", "website": "soundcloud.com", "category": "READ", "additional": {"webbench_id": 2302, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2328", "dataset": "webbench", "query": "Filter listings for sale in Brooklyn that were added in the last 7 days and list the first 5 with price details.", "start_url": "https://streeteasy.com", "metadata": {"original_task_id": "wb-2328", "website": "streeteasy.com", "category": "READ", "additional": {"webbench_id": 2328, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2329", "dataset": "webbench", "query": "Use the advanced search filters to find listings near Times Square with a doorman and list the first three property names along with their prices. Verify they have a doorman or concierge.", "start_url": "https://streeteasy.com", "metadata": {"original_task_id": "wb-2329", "website": "streeteasy.com", "category": "READ", "additional": {"webbench_id": 2329, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2359", "dataset": "webbench", "query": "Search for the question \"What is quantum mechanics?\" on Answers.com and list the first three answers provided by users.", "start_url": "https://www.answers.com", "metadata": {"original_task_id": "wb-2359", "website": "answers.com", "category": "READ", "additional": {"webbench_id": 2359, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2360", "dataset": "webbench", "query": "Use Answers.com's search bar to find the definition for \"photosynthesis\" and copy the definition into a plain text document.", "start_url": "https://www.answers.com", "metadata": {"original_task_id": "wb-2360", "website": "answers.com", "category": "READ", "additional": {"webbench_id": 2360, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2391", "dataset": "webbench", "query": "Use Bing Maps to find walking directions to Central Park in New York and copy the step-by-step route details provided.", "start_url": "https://www.bing.com", "metadata": {"original_task_id": "wb-2391", "website": "bing.com", "category": "READ", "additional": {"webbench_id": 2391, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2399", "dataset": "webbench", "query": "In the Technology section, select an article on semiconductor supply chains and list the key data points provided about current market trends.", "start_url": "https://www.bloomberg.com", "metadata": {"original_task_id": "wb-2399", "website": "bloomberg.com", "category": "READ", "additional": {"webbench_id": 2399, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2404", "dataset": "webbench", "query": "Browse the \"U.S. Economy at a Glance\" dashboard and list the key labor market indicators mentioned on the page.", "start_url": "https://www.bls.gov", "metadata": {"original_task_id": "wb-2404", "website": "bls.gov", "category": "READ", "additional": {"webbench_id": 2404, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2405", "dataset": "webbench", "query": "Navigate to the BLS news releases page and list the titles of the two most recent press releases.", "start_url": "https://www.bls.gov", "metadata": {"original_task_id": "wb-2405", "website": "bls.gov", "category": "READ", "additional": {"webbench_id": 2405, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2407", "dataset": "webbench", "query": "Browse the \"Publications\" section to identify the two most recent copies of the Monthly Labor Review; note down their titles and issue dates.", "start_url": "https://www.bls.gov", "metadata": {"original_task_id": "wb-2407", "website": "bls.gov", "category": "READ", "additional": {"webbench_id": 2407, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2436", "dataset": "webbench", "query": "Search for \"wireless earbuds\" on Costco.com and list the first three product names, prices, and available pack sizes.", "start_url": "https://www.costco.com", "metadata": {"original_task_id": "wb-2436", "website": "costco.com", "category": "READ", "additional": {"webbench_id": 2436, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2438", "dataset": "webbench", "query": "Look up the \"Business Membership\" page and list three benefits offered to business customers.", "start_url": "https://www.costco.com", "metadata": {"original_task_id": "wb-2438", "website": "costco.com", "category": "READ", "additional": {"webbench_id": 2438, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2439", "dataset": "webbench", "query": "Search for \"Kirkland Signature Organic Extra Virgin Olive Oil\" and extract the different pack sizes along with their prices.", "start_url": "https://www.costco.com", "metadata": {"original_task_id": "wb-2439", "website": "costco.com", "category": "READ", "additional": {"webbench_id": 2439, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2440", "dataset": "webbench", "query": "Explore the \"Travel\" page and list two current travel package deals including the destination and starting price.", "start_url": "https://www.costco.com", "metadata": {"original_task_id": "wb-2440", "website": "costco.com", "category": "READ", "additional": {"webbench_id": 2440, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2457", "dataset": "webbench", "query": "Navigate to the \"Coronavirus\" section (if available) and list the top three headlines along with their brief summaries.", "start_url": "https://www.dailymail.co.uk/ushome/index.html", "metadata": {"original_task_id": "wb-2457", "website": "dailymail.co.uk", "category": "READ", "additional": {"webbench_id": 2457, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2473", "dataset": "webbench", "query": "Browse the Deezer homepage and list the names of the three featured playlists currently highlighted.", "start_url": "https://www.deezer.com/us", "metadata": {"original_task_id": "wb-2473", "website": "deezer.com", "category": "READ", "additional": {"webbench_id": 2473, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2474", "dataset": "webbench", "query": "Use the search bar to look for the artist \"Taylor Swift\" and record the titles of the top 5 albums displayed.", "start_url": "https://www.deezer.com/us", "metadata": {"original_task_id": "wb-2474", "website": "deezer.com", "category": "READ", "additional": {"webbench_id": 2474, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2476", "dataset": "webbench", "query": "Search for \"lo-fi beats\" playlists and record the titles of the first 10 results shown.", "start_url": "https://www.deezer.com/us", "metadata": {"original_task_id": "wb-2476", "website": "deezer.com", "category": "READ", "additional": {"webbench_id": 2476, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2487", "dataset": "webbench", "query": "Access the \"TV Shows\" area, identify the currently highlighted Disney Channel episodes, and provide their titles along with the premiere dates.", "start_url": "https://www.disney.com", "metadata": {"original_task_id": "wb-2487", "website": "disney.com", "category": "READ", "additional": {"webbench_id": 2487, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2495", "dataset": "webbench", "query": "Navigate to the \"Pick Up\" section and provide the names of restaurants available for pickup within a 5-mile radius of zip code 10013.", "start_url": "https://www.doordash.com", "metadata": {"original_task_id": "wb-2495", "website": "doordash.com", "category": "READ", "additional": {"webbench_id": 2495, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2507", "dataset": "webbench", "query": "Access the FAQ section related to account management and summarize in three steps how to reset your password.", "start_url": "https://www.foodnetwork.com", "metadata": {"original_task_id": "wb-2507", "website": "foodnetwork.com", "category": "READ", "additional": {"webbench_id": 2507, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2508", "dataset": "webbench", "query": "Watch a how-to video on knife sharpening and note the key techniques demonstrated in the description.", "start_url": "https://www.foodnetwork.com", "metadata": {"original_task_id": "wb-2508", "website": "foodnetwork.com", "category": "READ", "additional": {"webbench_id": 2508, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2529", "dataset": "webbench", "query": "Search the site for emergency alert subscription options and list the types of alerts available for residents.", "start_url": "https://www.in.gov/core/index.html", "metadata": {"original_task_id": "wb-2529", "website": "in.gov", "category": "READ", "additional": {"webbench_id": 2529, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2533", "dataset": "webbench", "query": "Search for content on \"India Olympics 2024\" and list the headlines of the top 3 matching articles.", "start_url": "https://www.indiatoday.in", "metadata": {"original_task_id": "wb-2533", "website": "indiatoday.in", "category": "READ", "additional": {"webbench_id": 2533, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2562", "dataset": "webbench", "query": "Filter car rental search results in Los Angeles by the \"economy\" category and list the available providers along with their estimated daily rates.", "start_url": "https://www.kayak.com", "metadata": {"original_task_id": "wb-2562", "website": "kayak.com", "category": "READ", "additional": {"webbench_id": 2562, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2570", "dataset": "webbench", "query": "Search for “porch swing” in the outdoor furniture section and list the details (price, dimensions, and material) of the first three products displayed.", "start_url": "https://www.lowes.com", "metadata": {"original_task_id": "wb-2570", "website": "lowes.com", "category": "READ", "additional": {"webbench_id": 2570, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2576", "dataset": "webbench", "query": "Use MDPI’s journal search filters to identify journals that offer an ultra-rapid publication process, then list the names and scopes of the first five journals displayed.", "start_url": "https://www.mdpi.com", "metadata": {"original_task_id": "wb-2576", "website": "mdpi.com", "category": "READ", "additional": {"webbench_id": 2576, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2584", "dataset": "webbench", "query": "Search for expert articles on toddler nutrition and provide the titles and one-sentence summaries of the first three results.", "start_url": "https://www.mumsnet.com", "metadata": {"original_task_id": "wb-2584", "website": "mumsnet.com", "category": "READ", "additional": {"webbench_id": 2584, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2624", "dataset": "webbench", "query": "Search for \"quintessential\" and record its frequency indicator details from the Oxford 3000/5000 list.", "start_url": "https://www.oxfordlearnersdictionaries.com/us", "metadata": {"original_task_id": "wb-2624", "website": "oxfordlearnersdictionaries.com", "category": "READ", "additional": {"webbench_id": 2624, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2634", "dataset": "webbench", "query": "Use the search bar to find an article about the \"500 Greatest Albums\" list; then list the title and URL of the article.", "start_url": "https://www.rollingstone.com", "metadata": {"original_task_id": "wb-2634", "website": "rollingstone.com", "category": "READ", "additional": {"webbench_id": 2634, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2658", "dataset": "webbench", "query": "Browse the Electronics category, apply a filter for products priced under $10, and extract details (name and rating) for one product that has at least 50 reviews.", "start_url": "https://www.temu.com", "metadata": {"original_task_id": "wb-2658", "website": "temu.com", "category": "READ", "additional": {"webbench_id": 2658, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2661", "dataset": "webbench", "query": "Find a recipe that offers a printable version and note down its name along with the author’s name.", "start_url": "https://www.thekitchn.com", "metadata": {"original_task_id": "wb-2661", "website": "thekitchn.com", "category": "READ", "additional": {"webbench_id": 2661, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2667", "dataset": "webbench", "query": "Search for posts using the keyword \"DIY crafts\" and determine whether the majority of the results are photo, video, or text posts.", "start_url": "https://www.tumblr.com", "metadata": {"original_task_id": "wb-2667", "website": "tumblr.com", "category": "READ", "additional": {"webbench_id": 2667, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2672", "dataset": "webbench", "query": "Use the search functionality to look up “Sushi” restaurants, then output the estimated delivery time and fee for the first restaurant in the search results.", "start_url": "https://www.ubereats.com", "metadata": {"original_task_id": "wb-2672", "website": "ubereats.com", "category": "READ", "additional": {"webbench_id": 2672, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2673", "dataset": "webbench", "query": "Use the search feature to find nearby restaurants offering vegan options and output the names and ratings of the top 5 results.", "start_url": "https://www.ubereats.com", "metadata": {"original_task_id": "wb-2673", "website": "ubereats.com", "category": "READ", "additional": {"webbench_id": 2673, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2675", "dataset": "webbench", "query": "Navigate to the “Events & News” section and extract the headline and summary of the latest press release regarding UN peacekeeping operations.", "start_url": "https://www.un.org/en", "metadata": {"original_task_id": "wb-2675", "website": "un.org", "category": "READ", "additional": {"webbench_id": 2675, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2692", "dataset": "webbench", "query": "Browse the “Free Vectors” category and identify the vector asset with the highest download count; then output its title.", "start_url": "https://www.vecteezy.com", "metadata": {"original_task_id": "wb-2692", "website": "vecteezy.com", "category": "READ", "additional": {"webbench_id": 2692, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2712", "dataset": "webbench", "query": "Locate the Xbox Support contact page and find \"request a call\" and fill the form to describe the connection issue with my xbox", "start_url": "https://www.xbox.com/en-US", "metadata": {"original_task_id": "wb-2712", "website": "xbox.com", "category": "READ", "additional": {"webbench_id": 2712, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2719", "dataset": "webbench", "query": "Navigate to Mayo Clinic's COVID-19 long-term effects page and identify three potential symptoms, then search Mayo clinic for recommended treatment strategies for each.\nOnly use mayoclinic.org to achieve the task. Don't go to any other site. The task is achievable with just navigation from this site.", "start_url": "https://www.mayoclinic.org", "metadata": {"original_task_id": "wb-2719", "website": "mayoclinic.org", "category": "READ", "additional": {"webbench_id": 2719, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|