mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
- Add hover_at, type_at, drag_at coordinate tools to server - Add hoverAt, typeAt, dragAt methods to Browser class - Export server internals (browser, tool-loop, registry) for eval imports - Copy eval app from enterprise repo with agents, graders, runner, dashboard - Nest eval-targets inside apps/eval - Adapt sessionExecutionDir → workingDir for current server API - Add biome ignore for dashboard HTML to prevent lint breaking onclick handlers
51 lines
25 KiB
JSON
51 lines
25 KiB
JSON
{"query_id": "wb-2218", "dataset": "webbench", "query": "Return the phone number of \"Joe's Pizza\" in Manhattan", "start_url": "https://www.yelp.com", "metadata": {"original_task_id": "wb-2218", "website": "yelp.com", "category": "READ", "additional": {"webbench_id": 2218, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-446", "dataset": "webbench", "query": "Navigate through the archive by decade and note the publication year of one highlighted article from each decade (e.g., 1980s, 1990s, 2000s, etc.).", "start_url": "https://www.esquire.com", "metadata": {"original_task_id": "wb-446", "website": "esquire.com", "category": "READ", "additional": {"webbench_id": 446, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-298", "dataset": "webbench", "query": "Search the \"community\" section for events related to yoga in Atlanta and display the event details along with the organizer's contact.", "start_url": "https://newyork.craigslist.org", "metadata": {"original_task_id": "wb-298", "website": "newyork.craigslist.org", "category": "READ", "additional": {"webbench_id": 298, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1776", "dataset": "webbench", "query": "Use the website’s search function to look for articles on “climate change” published in the last month, then list the titles of the top five results.", "start_url": "https://www.the-independent.com", "metadata": {"original_task_id": "wb-1776", "website": "the-independent.com", "category": "READ", "additional": {"webbench_id": 1776, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1585", "dataset": "webbench", "query": "Access the live scores page, click on a current NBA match, and note down the current score and quarter information.", "start_url": "https://www.sportskeeda.com", "metadata": {"original_task_id": "wb-1585", "website": "sportskeeda.com", "category": "READ", "additional": {"webbench_id": 1585, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-43", "dataset": "webbench", "query": "How do I cancel an order that has already shipped?", "start_url": "https://aliexpress.com", "metadata": {"original_task_id": "wb-43", "website": "aliexpress.com", "category": "READ", "additional": {"webbench_id": 43, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-374", "dataset": "webbench", "query": "Open the store locator and search for Dick’s Sporting Goods stores near the zipcode 33101 then note the store hours and any special in-store services offered.", "start_url": "https://www.dickssportinggoods.com", "metadata": {"original_task_id": "wb-374", "website": "dickssportinggoods.com", "category": "READ", "additional": {"webbench_id": 374, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2327", "dataset": "webbench", "query": "Search for 2-bedroom apartments available for rent in Manhattan under $3,000 per month and list the names, addresses, and prices of 3", "start_url": "https://streeteasy.com", "metadata": {"original_task_id": "wb-2327", "website": "streeteasy.com", "category": "READ", "additional": {"webbench_id": 2327, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2566", "dataset": "webbench", "query": "Look for articles mentioning “Inflation trends” published in 2023, and list the headlines of the top five results.", "start_url": "https://www.livemint.com", "metadata": {"original_task_id": "wb-2566", "website": "livemint.com", "category": "READ", "additional": {"webbench_id": 2566, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-465", "dataset": "webbench", "query": "Create a travel plan on Expedia that combines a flight, hotel, and rental car reservation for a business trip from Los Angeles to San Francisco.", "start_url": "https://www.expedia.com", "metadata": {"original_task_id": "wb-465", "website": "expedia.com", "category": "READ", "additional": {"webbench_id": 465, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2179", "dataset": "webbench", "query": "Locate the official guidance on COVID-19 travel regulations on Gov.uk and list the recommended preventative measures.", "start_url": "https://www.gov.uk", "metadata": {"original_task_id": "wb-2179", "website": "gov.uk", "category": "READ", "additional": {"webbench_id": 2179, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-920", "dataset": "webbench", "query": "Using advance filter, Navigate to the Books section on JSTOR and list the first 3 book titles along with their primary authors.", "start_url": "https://www.jstor.org", "metadata": {"original_task_id": "wb-920", "website": "jstor.org", "category": "READ", "additional": {"webbench_id": 920, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2005", "dataset": "webbench", "query": "Search and locate a “Lady Gaga” article, then note three keypoints.", "start_url": "https://www.usmagazine.com", "metadata": {"original_task_id": "wb-2005", "website": "usmagazine.com", "category": "READ", "additional": {"webbench_id": 2005, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-474", "dataset": "webbench", "query": "Create a multi-stop flight itinerary on Expedia with stops in Atlanta, Nashville, and New Orleans.", "start_url": "https://www.expedia.com", "metadata": {"original_task_id": "wb-474", "website": "expedia.com", "category": "READ", "additional": {"webbench_id": 474, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1371", "dataset": "webbench", "query": "Locate the product details for \"Horizon Forbidden West\" and extract the release date, developer, and price information.", "start_url": "https://www.playstation.com/en-us", "metadata": {"original_task_id": "wb-1371", "website": "playstation.com", "category": "READ", "additional": {"webbench_id": 1371, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-35", "dataset": "webbench", "query": "Search for \"RFID tags\", filter by a minimum order quantity of less than 1000, and list the first 3 supplier names.", "start_url": "https://www.alibaba.com", "metadata": {"original_task_id": "wb-35", "website": "alibaba.com", "category": "READ", "additional": {"webbench_id": 35, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1432", "dataset": "webbench", "query": "Report the first post on the videogames subreddit for inappropriate content", "start_url": "https://www.reddit.com/?rdt=40707", "metadata": {"original_task_id": "wb-1432", "website": "reddit.com", "category": "READ", "additional": {"webbench_id": 1432, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1356", "dataset": "webbench", "query": "Go to the \"Videos\" section and output the file sizes from the download section and resolutions for the top 3 trending videos.", "start_url": "https://pixabay.com", "metadata": {"original_task_id": "wb-1356", "website": "pixabay.com", "category": "READ", "additional": {"webbench_id": 1356, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1370", "dataset": "webbench", "query": "Go to the digital wallet information area and list the payment options available for topping up.", "start_url": "https://www.playstation.com/en-us", "metadata": {"original_task_id": "wb-1370", "website": "playstation.com", "category": "READ", "additional": {"webbench_id": 1370, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1398", "dataset": "webbench", "query": "Go to spaces and navigate to one of the recommended spaces to view. Check the profile of one of the top contributers in this space and return how many followers they have.", "start_url": "https://www.quora.com", "metadata": {"original_task_id": "wb-1398", "website": "quora.com", "category": "READ", "additional": {"webbench_id": 1398, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2231", "dataset": "webbench", "query": "Search job listings for “Software Engineer” in bangalore, WA and list the top 5 job titles along with their corresponding company names.", "start_url": "https://www.ziprecruiter.com", "metadata": {"original_task_id": "wb-2231", "website": "ziprecruiter.com", "category": "READ", "additional": {"webbench_id": 2231, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1496", "dataset": "webbench", "query": "Find how many articles about \"renewable energy\" were published by Harvard from the last 5 years, filter by open access.", "start_url": "https://www.sciencedirect.com", "metadata": {"original_task_id": "wb-1496", "website": "sciencedirect.com", "category": "READ", "additional": {"webbench_id": 1496, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-373", "dataset": "webbench", "query": "Access the return policy page and summarize the key points, including the return timeframe and conditions required.", "start_url": "https://www.dickssportinggoods.com", "metadata": {"original_task_id": "wb-373", "website": "dickssportinggoods.com", "category": "READ", "additional": {"webbench_id": 373, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1820", "dataset": "webbench", "query": "Find an opinion column discussing housing policy and list its main arguments as presented in the article.", "start_url": "https://www.thesun.co.uk", "metadata": {"original_task_id": "wb-1820", "website": "thesun.co.uk", "category": "READ", "additional": {"webbench_id": 1820, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-0", "dataset": "webbench", "query": "On the product details page for the \"Black & Decker Power Tool Combo Kit,\" list its specifications including dimensions, voltage, and warranty information.", "start_url": "https://www.acehardware.com", "metadata": {"original_task_id": "wb-0", "website": "acehardware.com", "category": "READ", "additional": {"webbench_id": 0, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-690", "dataset": "webbench", "query": "Search for \"Amazon\" on Glassdoor and extract the top three posts this month on the Amazon Employees Bowl.", "start_url": "https://www.glassdoor.com/index.htm", "metadata": {"original_task_id": "wb-690", "website": "glassdoor.com", "category": "READ", "additional": {"webbench_id": 690, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-914", "dataset": "webbench", "query": "Read the article on effective revision techniques provided on jagranjosh.com and summarize the top five methods mentioned.", "start_url": "https://www.jagranjosh.com", "metadata": {"original_task_id": "wb-914", "website": "jagranjosh.com", "category": "READ", "additional": {"webbench_id": 914, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1497", "dataset": "webbench", "query": "Locate a paper about \"deep learning\" and use the \"References\" section to find the three most-cited papers it references. Record their titles and years.", "start_url": "https://www.sciencedirect.com", "metadata": {"original_task_id": "wb-1497", "website": "sciencedirect.com", "category": "READ", "additional": {"webbench_id": 1497, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2598", "dataset": "webbench", "query": "Search for an opinion piece by Jim Banks, and list the headline and publication date of his most recent post", "start_url": "https://www.newsweek.com", "metadata": {"original_task_id": "wb-2598", "website": "newsweek.com", "category": "READ", "additional": {"webbench_id": 2598, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1081", "dataset": "webbench", "query": "Search for \"Nike Air Max shoes\" and check the availability in your local store with store location set to New York City, New York. If unavailable, check in a store within a 20-mile radius and note the estimated delivery time if ordered online.", "start_url": "https://www.macys.com", "metadata": {"original_task_id": "wb-1081", "website": "macys.com", "category": "READ", "additional": {"webbench_id": 1081, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1133", "dataset": "webbench", "query": "Search Mercari for \"vintage denim jacket\" and list the first 5 results, including each item’s price and condition.", "start_url": "https://www.mercari.com", "metadata": {"original_task_id": "wb-1133", "website": "mercari.com", "category": "READ", "additional": {"webbench_id": 1133, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-172", "dataset": "webbench", "query": "Search for 8 review score or above hotels in Manhattan, New York for next weekend with free cancellation, free Wi-Fi, and breakfast included. Compare at least four options and pick the one closest to the Empire State Building.", "start_url": "https://www.booking.com", "metadata": {"original_task_id": "wb-172", "website": "booking.com", "category": "READ", "additional": {"webbench_id": 172, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1212", "dataset": "webbench", "query": "Visit the Newegg Premier membership page, read through the listed benefits, and summarize at least five key membership perks.", "start_url": "https://www.newegg.com", "metadata": {"original_task_id": "wb-1212", "website": "newegg.com", "category": "READ", "additional": {"webbench_id": 1212, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2228", "dataset": "webbench", "query": "Identify listings in \"Las Vegas, NV\" that feature a 3D home tour and provide the property address, price, and tour availability details of the top result.", "start_url": "https://www.zillow.com", "metadata": {"original_task_id": "wb-2228", "website": "zillow.com", "category": "READ", "additional": {"webbench_id": 2228, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-763", "dataset": "webbench", "query": "Use the website’s search function to find a video on \"Ancient Civilizations\" and record the title along with a brief overview.", "start_url": "https://www.history.com", "metadata": {"original_task_id": "wb-763", "website": "history.com", "category": "READ", "additional": {"webbench_id": 763, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1132", "dataset": "webbench", "query": "Search for articles about HIIT workouts published within the last month and list the first five titles along with their publication dates.", "start_url": "https://www.menshealth.com", "metadata": {"original_task_id": "wb-1132", "website": "menshealth.com", "category": "READ", "additional": {"webbench_id": 1132, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-393", "dataset": "webbench", "query": "Navigate to the EA Play section and extract the details of the free trial offer—including its duration and key benefits—then output this information in text.", "start_url": "https://www.ea.com", "metadata": {"original_task_id": "wb-393", "website": "ea.com", "category": "READ", "additional": {"webbench_id": 393, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1837", "dataset": "webbench", "query": "Visit Mr. Beast’s profile and record the engagement metrics (likes and comments) on their three most-viewed videos.", "start_url": "https://www.tiktok.com/explore", "metadata": {"original_task_id": "wb-1837", "website": "tiktok.com", "category": "READ", "additional": {"webbench_id": 1837, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1101", "dataset": "webbench", "query": "Search for Marriott hotels in New York, NY, and list the names, addresses, and one key amenity for each of the first five properties.", "start_url": "https://www.marriott.com/default.mi", "metadata": {"original_task_id": "wb-1101", "website": "marriott.com", "category": "READ", "additional": {"webbench_id": 1101, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-10", "dataset": "webbench", "query": "Use the flight search tool to find options from Sydney to Auckland on a specific travel date, then summarize in text the pricing differences between economy and business classes.", "start_url": "https://www.agoda.com", "metadata": {"original_task_id": "wb-10", "website": "agoda.com", "category": "READ", "additional": {"webbench_id": 10, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-805", "dataset": "webbench", "query": "Find the article titled \"Innovative Home Office Designs\" and list the design tips mentioned in it.", "start_url": "https://www.housebeautiful.com", "metadata": {"original_task_id": "wb-805", "website": "housebeautiful.com", "category": "READ", "additional": {"webbench_id": 805, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1892", "dataset": "webbench", "query": "Identify the top-rated hotel in Paris, verify if it offers free cancellation, and analyze at least three recent guest reviews to see if they mention staff helpfulness.", "start_url": "https://www.tripadvisor.com", "metadata": {"original_task_id": "wb-1892", "website": "tripadvisor.com", "category": "READ", "additional": {"webbench_id": 1892, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1739", "dataset": "webbench", "query": "Navigate to the Premium subscription page and summarize the details of the quarterly premium pricing, including any additional benefits.", "start_url": "https://www.studocu.com", "metadata": {"original_task_id": "wb-1739", "website": "studocu.com", "category": "READ", "additional": {"webbench_id": 1739, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1346", "dataset": "webbench", "query": "Use the search filters to find videos about indoor plant care that are between 1-3 minutes long, and list return the video with more likes", "start_url": "https://www.pinterest.com", "metadata": {"original_task_id": "wb-1346", "website": "pinterest.com", "category": "READ", "additional": {"webbench_id": 1346, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2303", "dataset": "webbench", "query": "Find the Copyright in the top right corner under the three dots and read the SoundCloud Copyright, and summarize the key points in bullet form.", "start_url": "https://soundcloud.com", "metadata": {"original_task_id": "wb-2303", "website": "soundcloud.com", "category": "READ", "additional": {"webbench_id": 2303, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-402", "dataset": "webbench", "query": "Search for \"used laptops\" within the price range of $300-$500. Filter by Buy now options and find an option with 8GB Ram and 500GB memory. Add it to cart.", "start_url": "https://www.ebay.com", "metadata": {"original_task_id": "wb-402", "website": "ebay.com", "category": "READ", "additional": {"webbench_id": 402, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-214", "dataset": "webbench", "query": "Use the \"Side-by-side comparisons\" tool to compare a 2018 Ford F-150 and a 2018 Chevrolet Silverado 1500, and list three key differences in their features.", "start_url": "https://www.cars.com", "metadata": {"original_task_id": "wb-214", "website": "cars.com", "category": "READ", "additional": {"webbench_id": 214, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2573", "dataset": "webbench", "query": "Search for a holiday package that bundles flights and hotel stays for a Dubai trip and view the detailed itinerary.", "start_url": "https://www.makemytrip.global/?cc=am&redirectedBy=gl", "metadata": {"original_task_id": "wb-2573", "website": "makemytrip.global", "category": "READ", "additional": {"webbench_id": 2573, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2301", "dataset": "webbench", "query": "Search for “UK electronic music” and list the titles of the first five tracks that appear.", "start_url": "https://soundcloud.com", "metadata": {"original_task_id": "wb-2301", "website": "soundcloud.com", "category": "READ", "additional": {"webbench_id": 2301, "difficulty": "easy", "pass_count_4": 1, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-207", "dataset": "webbench", "query": "Search the site for a feature article on traffic safety trends in 2021 and summarize the three main trends outlined.", "start_url": "https://www.caranddriver.com", "metadata": {"original_task_id": "wb-207", "website": "caranddriver.com", "category": "READ", "additional": {"webbench_id": 207, "difficulty": "hard", "pass_count_4": 1, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|