mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
- Add hover_at, type_at, drag_at coordinate tools to server - Add hoverAt, typeAt, dragAt methods to Browser class - Export server internals (browser, tool-loop, registry) for eval imports - Copy eval app from enterprise repo with agents, graders, runner, dashboard - Nest eval-targets inside apps/eval - Adapt sessionExecutionDir → workingDir for current server API - Add biome ignore for dashboard HTML to prevent lint breaking onclick handlers
51 lines
25 KiB
JSON
51 lines
25 KiB
JSON
{"query_id": "wb-2634", "dataset": "webbench", "query": "Use the search bar to find an article about the \"500 Greatest Albums\" list; then list the title and URL of the article.", "start_url": "https://www.rollingstone.com", "metadata": {"original_task_id": "wb-2634", "website": "rollingstone.com", "category": "READ", "additional": {"webbench_id": 2634, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2097", "dataset": "webbench", "query": "Run a SPARQL query that retrieves the population of all countries in Europe.", "start_url": "https://www.wikidata.org/wiki/Wikidata:Main_Page", "metadata": {"original_task_id": "wb-2097", "website": "wikidata.org", "category": "READ", "additional": {"webbench_id": 2097, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2675", "dataset": "webbench", "query": "Navigate to the “Events & News” section and extract the headline and summary of the latest press release regarding UN peacekeeping operations.", "start_url": "https://www.un.org/en", "metadata": {"original_task_id": "wb-2675", "website": "un.org", "category": "READ", "additional": {"webbench_id": 2675, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-24", "dataset": "webbench", "query": "Search for beachfront properties in Miami with nightly rates under $300, and list the top three property names along with their prices.", "start_url": "https://www.airbnb.com", "metadata": {"original_task_id": "wb-24", "website": "airbnb.com", "category": "READ", "additional": {"webbench_id": 24, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2124", "dataset": "webbench", "query": "Go to the events or webinars section and list the upcoming academic events, including dates and topics.", "start_url": "https://www.wiley.com/en-us", "metadata": {"original_task_id": "wb-2124", "website": "wiley.com", "category": "READ", "additional": {"webbench_id": 2124, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2287", "dataset": "webbench", "query": "Locate a Medium article from the publication \"The Startup\" and list its URL, title, and a brief summary of its content.", "start_url": "https://medium.com/explore-topics", "metadata": {"original_task_id": "wb-2287", "website": "medium.com", "category": "READ", "additional": {"webbench_id": 2287, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1183", "dataset": "webbench", "query": "Check the trading summary for the NYSE and record its current value.", "start_url": "https://www.nasdaq.com", "metadata": {"original_task_id": "wb-1183", "website": "nasdaq.com", "category": "READ", "additional": {"webbench_id": 1183, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2329", "dataset": "webbench", "query": "Use the advanced search filters to find listings near Times Square with a doorman and list the first three property names along with their prices. Verify they have a doorman or concierge.", "start_url": "https://streeteasy.com", "metadata": {"original_task_id": "wb-2329", "website": "streeteasy.com", "category": "READ", "additional": {"webbench_id": 2329, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2391", "dataset": "webbench", "query": "Use Bing Maps to find walking directions to Central Park in New York and copy the step-by-step route details provided.", "start_url": "https://www.bing.com", "metadata": {"original_task_id": "wb-2391", "website": "bing.com", "category": "READ", "additional": {"webbench_id": 2391, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2673", "dataset": "webbench", "query": "Use the search feature to find nearby restaurants offering vegan options and output the names and ratings of the top 5 results.", "start_url": "https://www.ubereats.com", "metadata": {"original_task_id": "wb-2673", "website": "ubereats.com", "category": "READ", "additional": {"webbench_id": 2673, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-219", "dataset": "webbench", "query": "Locate a section or article on \"upcoming releases\" and list the titles and release dates of the movies or comics mentioned.", "start_url": "https://www.cbr.com", "metadata": {"original_task_id": "wb-219", "website": "cbr.com", "category": "READ", "additional": {"webbench_id": 219, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-814", "dataset": "webbench", "query": "Use the HubPages search function to find articles on travel and summarize the main idea of the first result.", "start_url": "https://discover.hubpages.com/", "metadata": {"original_task_id": "wb-814", "website": "discover.hubpages.com", "category": "READ", "additional": {"webbench_id": 814, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1152", "dataset": "webbench", "query": "Search for articles mentioning \"Brexit\" and list the titles of the first five results.", "start_url": "https://metro.co.uk", "metadata": {"original_task_id": "wb-1152", "website": "metro.co.uk", "category": "READ", "additional": {"webbench_id": 1152, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1325", "dataset": "webbench", "query": "Access the digital magazine archive and identify the issue that covers hardware benchmarks; provide its publication month.", "start_url": "https://www.pcgamer.com", "metadata": {"original_task_id": "wb-1325", "website": "pcgamer.com", "category": "READ", "additional": {"webbench_id": 1325, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1862", "dataset": "webbench", "query": "Access the Opinion section, search for commentary on “climate change,” and list the titles of the three most recent pieces.", "start_url": "https://www.today.com", "metadata": {"original_task_id": "wb-1862", "website": "today.com", "category": "READ", "additional": {"webbench_id": 1862, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2576", "dataset": "webbench", "query": "Use MDPI’s journal search filters to identify journals that offer an ultra-rapid publication process, then list the names and scopes of the first five journals displayed.", "start_url": "https://www.mdpi.com", "metadata": {"original_task_id": "wb-2576", "website": "mdpi.com", "category": "READ", "additional": {"webbench_id": 2576, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-102", "dataset": "webbench", "query": "Search for the app \"Spotify\" on APKPure and list the latest version number along with its release date as shown on the version history section.", "start_url": "https://apkpure.com", "metadata": {"original_task_id": "wb-102", "website": "apkpure.com", "category": "READ", "additional": {"webbench_id": 102, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-984", "dataset": "webbench", "query": "Use the historical trends tool to list the top three most amended clauses over the past year by frequency.", "start_url": "https://www.lawinsider.com", "metadata": {"original_task_id": "wb-984", "website": "lawinsider.com", "category": "READ", "additional": {"webbench_id": 984, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-390", "dataset": "webbench", "query": "Visit the Podcast section, select the latest episode, and provide its title along with a brief description.", "start_url": "https://www.dw.com", "metadata": {"original_task_id": "wb-390", "website": "dw.com", "category": "READ", "additional": {"webbench_id": 390, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2071", "dataset": "webbench", "query": "Filter the furniture category by \"sectional sofas\" and extract the dimensions, available colors, and material details of the first product listed.", "start_url": "https://www.wayfair.com", "metadata": {"original_task_id": "wb-2071", "website": "wayfair.com", "category": "READ", "additional": {"webbench_id": 2071, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2302", "dataset": "webbench", "query": "Use the search function to find tracks containing the keyword “ambient” and output a list of the top 10 track titles along with their durations.", "start_url": "https://soundcloud.com", "metadata": {"original_task_id": "wb-2302", "website": "soundcloud.com", "category": "READ", "additional": {"webbench_id": 2302, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1895", "dataset": "webbench", "query": "Filter hotel search results in Paris by selecting properties that offer free breakfast; then extract and list the names and average review scores of the first 5 hotels.", "start_url": "https://www.trivago.com", "metadata": {"original_task_id": "wb-1895", "website": "trivago.com", "category": "READ", "additional": {"webbench_id": 1895, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-605", "dataset": "webbench", "query": "Use the website’s filtering tools to display \"Retro Gaming\" articles and extract the titles of the top three most recent posts.", "start_url": "https://gamerant.com", "metadata": {"original_task_id": "wb-605", "website": "gamerant.com", "category": "READ", "additional": {"webbench_id": 605, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2159", "dataset": "webbench", "query": "In a forum discussion on regional variations in Spanish, summarize the key differences mentioned by the community members.", "start_url": "https://www.wordreference.com", "metadata": {"original_task_id": "wb-2159", "website": "wordreference.com", "category": "READ", "additional": {"webbench_id": 2159, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1158", "dataset": "webbench", "query": "Access the Michigan Department of Health page via Michigan.gov and list the steps provided for scheduling a vaccination appointment.", "start_url": "https://www.michigan.gov/som", "metadata": {"original_task_id": "wb-1158", "website": "michigan.gov", "category": "READ", "additional": {"webbench_id": 1158, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2054", "dataset": "webbench", "query": "Search for COVID-19 vaccinations on Walgreens and list the available appointment options, eligibility criteria, and any cost details provided.", "start_url": "https://www.walgreens.com", "metadata": {"original_task_id": "wb-2054", "website": "walgreens.com", "category": "READ", "additional": {"webbench_id": 2054, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-635", "dataset": "webbench", "query": "What email should I contact if I'm interesting in working for Genius as a journalist?", "start_url": "https://genius.com", "metadata": {"original_task_id": "wb-635", "website": "genius.com", "category": "READ", "additional": {"webbench_id": 635, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1145", "dataset": "webbench", "query": "Browse the TV shows category and list the titles, metascores, and number of critic reviews for shows scoring below 60 with at least 10 critic reviews.", "start_url": "https://www.metacritic.com", "metadata": {"original_task_id": "wb-1145", "website": "metacritic.com", "category": "READ", "additional": {"webbench_id": 1145, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-1161", "dataset": "webbench", "query": "Access the MLB.TV subscription page and extract the available pricing options and plan durations offered.", "start_url": "https://www.mlb.com", "metadata": {"original_task_id": "wb-1161", "website": "mlb.com", "category": "READ", "additional": {"webbench_id": 1161, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2289", "dataset": "webbench", "query": "Search for the latest article about holiday recipes on Parade.com and summarize the key steps or ingredients mentioned in the recipe.", "start_url": "https://parade.com", "metadata": {"original_task_id": "wb-2289", "website": "parade.com", "category": "READ", "additional": {"webbench_id": 2289, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1662", "dataset": "webbench", "query": "Use the main search bar to find questions tagged with \"python\" in the Stack Overflow community and output the titles of the first 5 results.", "start_url": "https://stackexchange.com", "metadata": {"original_task_id": "wb-1662", "website": "stackexchange.com", "category": "READ", "additional": {"webbench_id": 1662, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-505", "dataset": "webbench", "query": "Find an image posted to the Stranger Things Wiki forum and identify the dimensions of the largest downloadable option", "start_url": "https://www.fandom.com", "metadata": {"original_task_id": "wb-505", "website": "fandom.com", "category": "READ", "additional": {"webbench_id": 505, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1967", "dataset": "webbench", "query": "Visit the \"Contact Us\" page and record the main administrative office’s phone number and email address.", "start_url": "https://www.upenn.edu", "metadata": {"original_task_id": "wb-1967", "website": "upenn.edu", "category": "READ", "additional": {"webbench_id": 1967, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-744", "dataset": "webbench", "query": "Search for Italian restaurants in zip code 60611 (Chicago, IL), sort by \"4 Stars and Up,\" and list 5 restaurants rated 4.7 and above.", "start_url": "https://www.grubhub.com", "metadata": {"original_task_id": "wb-744", "website": "grubhub.com", "category": "READ", "additional": {"webbench_id": 744, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-345", "dataset": "webbench", "query": "Use the Delish search bar to look up \"Marry Me Chicken\" and list the ingredient quantities mentioned in the recipe.", "start_url": "https://www.delish.com", "metadata": {"original_task_id": "wb-345", "website": "delish.com", "category": "READ", "additional": {"webbench_id": 345, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2667", "dataset": "webbench", "query": "Search for posts using the keyword \"DIY crafts\" and determine whether the majority of the results are photo, video, or text posts.", "start_url": "https://www.tumblr.com", "metadata": {"original_task_id": "wb-2667", "website": "tumblr.com", "category": "READ", "additional": {"webbench_id": 2667, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-917", "dataset": "webbench", "query": "Locate the latest solved question paper for SSC exams and enumerate the subjects included in the paper.", "start_url": "https://www.jagranjosh.com", "metadata": {"original_task_id": "wb-917", "website": "jagranjosh.com", "category": "READ", "additional": {"webbench_id": 917, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-543", "dataset": "webbench", "query": "In the \"Laptops\" section, apply the filter for \"Dell\" and extract the average discount percentage on the first 3 Dell laptops displayed.", "start_url": "https://www.flipkart.com", "metadata": {"original_task_id": "wb-543", "website": "flipkart.com", "category": "READ", "additional": {"webbench_id": 543, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-834", "dataset": "webbench", "query": "On the product page for the \"MALM bed frame,\" scroll to the product details section to find the assembly instructions and extract the first three steps described.", "start_url": "https://www.ikea.com", "metadata": {"original_task_id": "wb-834", "website": "ikea.com", "category": "READ", "additional": {"webbench_id": 834, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2212", "dataset": "webbench", "query": "Locate a local gym in Oakland, CA with the highest rating on YellowPages and output its contact information along with membership hours.", "start_url": "https://www.yellowpages.com", "metadata": {"original_task_id": "wb-2212", "website": "yellowpages.com", "category": "READ", "additional": {"webbench_id": 2212, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-841", "dataset": "webbench", "query": "Find any upcoming public meetings or events related to state government present on the site and list the dates and topics for at least two events.", "start_url": "https://www.illinois.gov", "metadata": {"original_task_id": "wb-841", "website": "illinois.gov", "category": "READ", "additional": {"webbench_id": 841, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-883", "dataset": "webbench", "query": "Search for organic bananas on Instacart and list the top 3 prices along with their retailer names.", "start_url": "https://www.instacart.com", "metadata": {"original_task_id": "wb-883", "website": "instacart.com", "category": "READ", "additional": {"webbench_id": 883, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1029", "dataset": "webbench", "query": "Search for and list the titles of the first 5 LinkedIn articles in the “Technology” category.", "start_url": "https://www.linkedin.com", "metadata": {"original_task_id": "wb-1029", "website": "linkedin.com", "category": "READ", "additional": {"webbench_id": 1029, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2658", "dataset": "webbench", "query": "Browse the Electronics category, apply a filter for products priced under $10, and extract details (name and rating) for one product that has at least 50 reviews.", "start_url": "https://www.temu.com", "metadata": {"original_task_id": "wb-2658", "website": "temu.com", "category": "READ", "additional": {"webbench_id": 2658, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "FAIL", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-2457", "dataset": "webbench", "query": "Navigate to the \"Coronavirus\" section (if available) and list the top three headlines along with their brief summaries.", "start_url": "https://www.dailymail.co.uk/ushome/index.html", "metadata": {"original_task_id": "wb-2457", "website": "dailymail.co.uk", "category": "READ", "additional": {"webbench_id": 2457, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-2719", "dataset": "webbench", "query": "Navigate to Mayo Clinic's COVID-19 long-term effects page and identify three potential symptoms, then search Mayo clinic for recommended treatment strategies for each.\nOnly use mayoclinic.org to achieve the task. Don't go to any other site. The task is achievable with just navigation from this site.", "start_url": "https://www.mayoclinic.org", "metadata": {"original_task_id": "wb-2719", "website": "mayoclinic.org", "category": "READ", "additional": {"webbench_id": 2719, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-362", "dataset": "webbench", "query": "Visit the profile of the artist “BisBiswas” and return the number of pageviews and deviations, as well as the artist's birthday", "start_url": "https://www.deviantart.com", "metadata": {"original_task_id": "wb-362", "website": "deviantart.com", "category": "READ", "additional": {"webbench_id": 362, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-1347", "dataset": "webbench", "query": "Search for boards about sustainable living, identify the one with the most pins, and list both the board name and follower count of the creator", "start_url": "https://www.pinterest.com", "metadata": {"original_task_id": "wb-1347", "website": "pinterest.com", "category": "READ", "additional": {"webbench_id": 1347, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "PASS", "openai_cua": "FAIL"}}}}
|
||
{"query_id": "wb-296", "dataset": "webbench", "query": "Browse the \"services\" section in Dallas, TX for listings related to \"computer repair\" and note down the business names from the top five ads.", "start_url": "https://newyork.craigslist.org", "metadata": {"original_task_id": "wb-296", "website": "newyork.craigslist.org", "category": "READ", "additional": {"webbench_id": 296, "difficulty": "easy", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "FAIL", "skyvern_bb": "FAIL", "openai_cua": "PASS"}}}}
|
||
{"query_id": "wb-509", "dataset": "webbench", "query": "Look up the latest FDA guidance on AI/ML in medical device software and summarize the key points mentioned in the introduction.", "start_url": "https://www.fda.gov", "metadata": {"original_task_id": "wb-509", "website": "fda.gov", "category": "READ", "additional": {"webbench_id": 509, "difficulty": "hard", "pass_count_4": 2, "agent_results": {"anthropic_cua": "PASS", "skyvern_2": "PASS", "skyvern_bb": "FAIL", "openai_cua": "FAIL"}}}}
|