mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
- Add hover_at, type_at, drag_at coordinate tools to server - Add hoverAt, typeAt, dragAt methods to Browser class - Export server internals (browser, tool-loop, registry) for eval imports - Copy eval app from enterprise repo with agents, graders, runner, dashboard - Nest eval-targets inside apps/eval - Adapt sessionExecutionDir → workingDir for current server API - Add biome ignore for dashboard HTML to prevent lint breaking onclick handlers
51 lines
21 KiB
JSON
51 lines
21 KiB
JSON
{"query_id":"amazon-multi-filter-1","dataset":"browseros-eval","query":"Find a noise-cancelling over-ear Bluetooth headphone on Amazon with at least 4.5 stars and over 1000 reviews, priced between $50 and $100, and add the cheapest option to my cart.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.amazon.com/","metadata":{"original_task_id":"amazon-multi-filter-1","website":"Amazon","category":"shopping","additional":{}}}
|
|
{"query_id":"bestbuy-trade-in-1","dataset":"browseros-eval","query":"Check the trade-in value of a 7th generation Intel Core i5 HP laptop with 8 GB RAM running Windows 10 in fair condition on Best Buy.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.bestbuy.com/","metadata":{"original_task_id":"bestbuy-trade-in-1","website":"Best Buy","category":"shopping","additional":{}}}
|
|
{"query_id":"target-grocery-1","dataset":"browseros-eval","query":"Find a frozen vegan cheese pizza on Target priced between $5 and $10 that is available for same-day delivery to zip code 90210.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.target.com/","metadata":{"original_task_id":"target-grocery-1","website":"Target","category":"shopping","additional":{}}}
|
|
{"query_id":"walmart-compare-1","dataset":"browseros-eval","query":"Compare the top two best-selling 65-inch 4K smart TVs on Walmart by price, rating, and number of reviews, and tell me which one offers better value.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.walmart.com/","metadata":{"original_task_id":"walmart-compare-1","website":"Walmart","category":"shopping","additional":{}}}
|
|
{"query_id":"nike-shoe-1","dataset":"browseros-eval","query":"Find a men's running shoe on Nike in size 10, color black, with a price under $130 and at least 4 stars. Add it to the cart.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.nike.com/","metadata":{"original_task_id":"nike-shoe-1","website":"Nike","category":"shopping","additional":{}}}
|
|
{"query_id":"costco-membership-1","dataset":"browseros-eval","query":"Find the price difference between Gold Star and Executive membership on Costco and list the extra benefits the Executive membership provides.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.costco.com/","metadata":{"original_task_id":"costco-membership-1","website":"Costco","category":"shopping","additional":{}}}
|
|
{"query_id":"ikea-furniture-1","dataset":"browseros-eval","query":"Find the cheapest black leather sofa on IKEA with at least 3 seats and a customer rating of 4 stars or higher. Show me the price and dimensions.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.ikea.com/","metadata":{"original_task_id":"ikea-furniture-1","website":"IKEA","category":"shopping","additional":{}}}
|
|
{"query_id":"apple-config-1","dataset":"browseros-eval","query":"Configure a 16-inch MacBook Pro with M4 Max chip, 48 GB RAM, and 1 TB SSD on the Apple Store. What is the total price?","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.apple.com/","metadata":{"original_task_id":"apple-config-1","website":"Apple","category":"shopping","additional":{}}}
|
|
{"query_id":"homedepot-tool-1","dataset":"browseros-eval","query":"Find a cordless drill kit on Home Depot with at least 2 batteries included, 20V or higher, rated 4.5 stars or above, and priced under $150.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.homedepot.com/","metadata":{"original_task_id":"homedepot-tool-1","website":"Home Depot","category":"shopping","additional":{}}}
|
|
{"query_id":"booking-hotel-1","dataset":"browseros-eval","query":"Find the highest-rated hotel in downtown Chicago for 2 adults checking in next Friday and checking out Sunday, with free cancellation and breakfast included. Show me the price breakdown.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.booking.com/","metadata":{"original_task_id":"booking-hotel-1","website":"Booking.com","category":"travel","additional":{}}}
|
|
{"query_id":"airbnb-stay-1","dataset":"browseros-eval","query":"Find an entire home in Austin, TX for 4 guests with a pool and free parking, checking in two weeks from today for 3 nights. Sort by lowest price and show me the top result.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.airbnb.com/","metadata":{"original_task_id":"airbnb-stay-1","website":"Airbnb","category":"travel","additional":{}}}
|
|
{"query_id":"google-maps-transit-1","dataset":"browseros-eval","query":"Find the fastest public transit route from Times Square, New York to JFK Airport departing at 8 AM tomorrow. How long does the trip take and what transfers are needed?","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.google.com/maps/","metadata":{"original_task_id":"google-maps-transit-1","website":"Google Maps","category":"travel","additional":{}}}
|
|
{"query_id":"expedia-package-1","dataset":"browseros-eval","query":"Search for a round-trip flight plus hotel package from San Francisco to Miami for 2 travelers, departing next month on the 15th and returning on the 20th. Show me the cheapest bundle.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.expedia.com/","metadata":{"original_task_id":"expedia-package-1","website":"Expedia","category":"travel","additional":{}}}
|
|
{"query_id":"spothero-parking-1","dataset":"browseros-eval","query":"Find covered parking near the Museum of Modern Art in San Francisco from this Saturday 10 AM to 4 PM for a full-size SUV. Show me the cheapest option with the walk time.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://spothero.com/","metadata":{"original_task_id":"spothero-parking-1","website":"SpotHero","category":"travel","additional":{}}}
|
|
{"query_id":"allrecipes-diet-1","dataset":"browseros-eval","query":"Find a gluten-free chicken dinner recipe on Allrecipes with at least 4.5 stars, over 50 reviews, and a total cook time under 45 minutes. List the ingredients.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.allrecipes.com/","metadata":{"original_task_id":"allrecipes-diet-1","website":"Allrecipes","category":"food","additional":{}}}
|
|
{"query_id":"yelp-restaurant-1","dataset":"browseros-eval","query":"Find the highest-rated Mexican restaurant in downtown Los Angeles on Yelp that is open now, accepts reservations, and has a price range of $$ or less. Show me the top 3 most recent reviews.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.yelp.com/","metadata":{"original_task_id":"yelp-restaurant-1","website":"Yelp","category":"food","additional":{}}}
|
|
{"query_id":"zillow-search-1","dataset":"browseros-eval","query":"Search for 2-bedroom apartments for rent in Seattle, WA under $2500/month with in-unit laundry and parking included. Sort by newest and show me the first three results with their prices.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.zillow.com/","metadata":{"original_task_id":"zillow-search-1","website":"Zillow","category":"real-estate","additional":{}}}
|
|
{"query_id":"redfin-listing-1","dataset":"browseros-eval","query":"Find the most recently listed 3-bedroom house for sale in Austin, TX between $400,000 and $600,000 with at least 2 bathrooms and a garage. Show the listing details.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.redfin.com/","metadata":{"original_task_id":"redfin-listing-1","website":"Redfin","category":"real-estate","additional":{}}}
|
|
{"query_id":"linkedin-jobs-1","dataset":"browseros-eval","query":"Search for remote Senior Software Engineer positions on LinkedIn posted in the last week that offer a salary of $150,000 or more. Show me the first 3 results.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.linkedin.com/jobs/","metadata":{"original_task_id":"linkedin-jobs-1","website":"LinkedIn","category":"jobs","additional":{}}}
|
|
{"query_id":"glassdoor-salary-1","dataset":"browseros-eval","query":"Look up the average base salary for a Product Manager in San Francisco on Glassdoor and show me the salary range and how it compares to the national average.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.glassdoor.com/","metadata":{"original_task_id":"glassdoor-salary-1","website":"Glassdoor","category":"jobs","additional":{}}}
|
|
{"query_id":"indeed-jobs-1","dataset":"browseros-eval","query":"Find entry-level Data Analyst jobs in New York City on Indeed posted within the last 3 days with a salary estimate of at least $60,000/year. List the top 3 results with company names.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.indeed.com/","metadata":{"original_task_id":"indeed-jobs-1","website":"Indeed","category":"jobs","additional":{}}}
|
|
{"query_id":"wikipedia-compare-1","dataset":"browseros-eval","query":"Compare the population, area, and GDP of Germany and France using their Wikipedia pages and summarize which country is larger by each metric.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.wikipedia.org/","metadata":{"original_task_id":"wikipedia-compare-1","website":"Wikipedia","category":"research","additional":{}}}
|
|
{"query_id":"arxiv-search-1","dataset":"browseros-eval","query":"Search for the most recent papers on \"large language model alignment\" on ArXiv under the cs.CL category, submitted in the last month. Show me the titles and authors of the top 3 results.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://arxiv.org/","metadata":{"original_task_id":"arxiv-search-1","website":"ArXiv","category":"research","additional":{}}}
|
|
{"query_id":"stackoverflow-debug-1","dataset":"browseros-eval","query":"Find the highest-voted answer on Stack Overflow for the error \"CORS policy: No Access-Control-Allow-Origin header\" in a React app making fetch requests. Summarize the solution.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://stackoverflow.com/","metadata":{"original_task_id":"stackoverflow-debug-1","website":"Stack Overflow","category":"research","additional":{}}}
|
|
{"query_id":"ted-talk-1","dataset":"browseros-eval","query":"Find the most viewed TED talk about artificial intelligence that is between 10 and 20 minutes long. What is the speaker's name and the number of views?","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.ted.com/","metadata":{"original_task_id":"ted-talk-1","website":"TED","category":"research","additional":{}}}
|
|
{"query_id":"chase-calculator-1","dataset":"browseros-eval","query":"Use the Chase 401(k) calculator to estimate my retirement savings if I start at age 25, retire at 65, contribute $500/month, with a 7% annual return and a current balance of $10,000.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.chase.com/","metadata":{"original_task_id":"chase-calculator-1","website":"Chase","category":"finance","additional":{}}}
|
|
{"query_id":"sec-filing-1","dataset":"browseros-eval","query":"Find Apple Inc.'s most recent 10-K annual filing on SEC EDGAR and tell me the total revenue reported for the most recent fiscal year.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.sec.gov/cgi-bin/browse-edgar","metadata":{"original_task_id":"sec-filing-1","website":"SEC EDGAR","category":"finance","additional":{}}}
|
|
{"query_id":"healthline-diet-1","dataset":"browseros-eval","query":"Find and compare the Mediterranean diet and the DASH diet on Healthline. List the key differences in allowed foods and which one is better for lowering blood pressure.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.healthline.com/","metadata":{"original_task_id":"healthline-diet-1","website":"Healthline","category":"health","additional":{}}}
|
|
{"query_id":"webmd-symptom-1","dataset":"browseros-eval","query":"Use the WebMD symptom checker for an adult male experiencing persistent headache, fatigue, and blurred vision. What possible conditions does it suggest?","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.webmd.com/","metadata":{"original_task_id":"webmd-symptom-1","website":"WebMD","category":"health","additional":{}}}
|
|
{"query_id":"babycenter-growth-1","dataset":"browseros-eval","query":"Use the child height predictor on BabyCenter for a 5-year-old girl who is currently 3 feet 6 inches tall and weighs 40 pounds. What is the predicted adult height?","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.babycenter.com/","metadata":{"original_task_id":"babycenter-growth-1","website":"BabyCenter","category":"health","additional":{}}}
|
|
{"query_id":"youtube-playlist-1","dataset":"browseros-eval","query":"Search for \"beginner piano tutorial\" on YouTube, filter by videos over 20 minutes long and uploaded this year. Find the one with the most views and tell me the channel name and view count.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.youtube.com/","metadata":{"original_task_id":"youtube-playlist-1","website":"YouTube","category":"entertainment","additional":{}}}
|
|
{"query_id":"reddit-thread-1","dataset":"browseros-eval","query":"Find the top post of all time on the r/personalfinance subreddit on Reddit. Summarize the main advice given in the post and the top comment.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.reddit.com/","metadata":{"original_task_id":"reddit-thread-1","website":"Reddit","category":"entertainment","additional":{}}}
|
|
{"query_id":"imdb-movie-1","dataset":"browseros-eval","query":"Look at the IMDb Top 250 movies list and find the highest-rated movie from the 2020s. Show me its title, rating, director, and a brief plot summary.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.imdb.com/","metadata":{"original_task_id":"imdb-movie-1","website":"IMDb","category":"entertainment","additional":{}}}
|
|
{"query_id":"spotify-playlist-1","dataset":"browseros-eval","query":"Find the \"Today's Top Hits\" playlist on Spotify and tell me the first 5 songs listed, including the artist names and the total number of likes the playlist has.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://open.spotify.com/","metadata":{"original_task_id":"spotify-playlist-1","website":"Spotify","category":"entertainment","additional":{}}}
|
|
{"query_id":"espn-stats-1","dataset":"browseros-eval","query":"Find the current NBA season's leading scorer on ESPN. Show me their points per game, total points, and their team's current win-loss record.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.espn.com/","metadata":{"original_task_id":"espn-stats-1","website":"ESPN","category":"entertainment","additional":{}}}
|
|
{"query_id":"steam-review-1","dataset":"browseros-eval","query":"Find the game that won Steam's Game of the Year 2024 award. Show me its current price, overall review rating, and read the most helpful recent negative review.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://store.steampowered.com/","metadata":{"original_task_id":"steam-review-1","website":"Steam","category":"entertainment","additional":{}}}
|
|
{"query_id":"govuk-visa-1","dataset":"browseros-eval","query":"Check on GOV.UK whether a US citizen needs a visa to work in the UK for 12 months in the technology sector. What type of visa is required and what are the main requirements?","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.gov.uk/","metadata":{"original_task_id":"govuk-visa-1","website":"GOV.UK","category":"government","additional":{}}}
|
|
{"query_id":"irs-refund-1","dataset":"browseros-eval","query":"Find the current standard deduction amount for a single filer under 65 on the IRS website for the 2025 tax year. Also find the income tax brackets for single filers.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.irs.gov/","metadata":{"original_task_id":"irs-refund-1","website":"IRS","category":"government","additional":{}}}
|
|
{"query_id":"cargurus-search-1","dataset":"browseros-eval","query":"Find a used 2020-2023 Toyota RAV4 Hybrid on CarGurus near zip code 94102 with under 40,000 miles, priced under $30,000. Sort by lowest price and show me the top result with its deal rating.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.cargurus.com/","metadata":{"original_task_id":"cargurus-search-1","website":"CarGurus","category":"automotive","additional":{}}}
|
|
{"query_id":"kbb-value-1","dataset":"browseros-eval","query":"Look up the trade-in value of a 2019 Honda Civic EX sedan with 45,000 miles in good condition on Kelley Blue Book. What is the fair market range?","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.kbb.com/","metadata":{"original_task_id":"kbb-value-1","website":"Kelley Blue Book","category":"automotive","additional":{}}}
|
|
{"query_id":"kaggle-competition-1","dataset":"browseros-eval","query":"Find the currently active Kaggle competition with the highest prize money. Show me the competition name, prize amount, deadline, and the number of teams participating.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.kaggle.com/","metadata":{"original_task_id":"kaggle-competition-1","website":"Kaggle","category":"education","additional":{}}}
|
|
{"query_id":"pypi-package-1","dataset":"browseros-eval","query":"Search for Python packages on PyPI related to \"data validation\" that support Python 3.11, have a stable release, and are MIT licensed. Show me the top 3 results by relevance.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://pypi.org/","metadata":{"original_task_id":"pypi-package-1","website":"PyPI","category":"education","additional":{}}}
|
|
{"query_id":"coursera-course-1","dataset":"browseros-eval","query":"Find a beginner-level machine learning course on Coursera that is free to audit, has a rating of 4.7 or higher, and takes less than 3 months to complete. Show the course name and instructor.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.coursera.org/","metadata":{"original_task_id":"coursera-course-1","website":"Coursera","category":"education","additional":{}}}
|
|
{"query_id":"huggingface-model-1","dataset":"browseros-eval","query":"Find the most downloaded text-generation model on Hugging Face that was updated in the last month. Show me the model name, download count, and its license.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://huggingface.co/","metadata":{"original_task_id":"huggingface-model-1","website":"Hugging Face","category":"technology","additional":{}}}
|
|
{"query_id":"github-repo-1","dataset":"browseros-eval","query":"Find the most starred open-source repository on GitHub that was created in 2025. Show me the repo name, star count, primary language, and a brief description.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://github.com/","metadata":{"original_task_id":"github-repo-1","website":"GitHub","category":"technology","additional":{}}}
|
|
{"query_id":"nvidia-driver-1","dataset":"browseros-eval","query":"Find the latest NVIDIA driver for an RTX 4090 GPU running on Ubuntu 22.04 with an x86_64 architecture. Show me the driver version number and download size.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.nvidia.com/","metadata":{"original_task_id":"nvidia-driver-1","website":"NVIDIA","category":"technology","additional":{}}}
|
|
{"query_id":"azure-pricing-1","dataset":"browseros-eval","query":"Use the Azure pricing calculator to estimate the monthly cost of running a Standard_D4s_v3 virtual machine in East US region with Linux, 24/7 uptime, and 128 GB premium SSD storage.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://azure.microsoft.com/","metadata":{"original_task_id":"azure-pricing-1","website":"Azure","category":"technology","additional":{}}}
|
|
{"query_id":"petfinder-adopt-1","dataset":"browseros-eval","query":"Find young female cats available for adoption within 25 miles of zip code 10001 on Petfinder that are good with other cats and are spayed. Show me the first 3 results.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.petfinder.com/","metadata":{"original_task_id":"petfinder-adopt-1","website":"Petfinder","category":"pets","additional":{}}}
|
|
{"query_id":"vivino-wine-1","dataset":"browseros-eval","query":"Find the highest-rated red wine from Napa Valley on Vivino priced under $50 that pairs well with steak. Show me the wine name, rating, and price.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.vivino.com/","metadata":{"original_task_id":"vivino-wine-1","website":"Vivino","category":"food","additional":{}}}
|
|
{"query_id":"multi-hop-weather-flight-1","dataset":"browseros-eval","query":"Search Google for the current weather in Tokyo, Japan, then search for the cheapest round-trip flight from Los Angeles to Tokyo next month on Google Flights. Show me the weather forecast and the flight price.","graders":["webvoyager_grader","fara_combined"],"start_url":"https://www.google.com/","metadata":{"original_task_id":"multi-hop-weather-flight-1","website":"Google","category":"multi-hop","additional":{}}}
|