|
|
|
|
@@ -1,20 +1,16 @@
|
|
|
|
|
{"query_id": "agisdk-dashdish-10", "dataset": "agisdk-real", "query": "Place an order from \"Souvla\" for a \"Medium Classic Cheeseburger\" and a \"Small Bacon Double Cheeseburger\" with \"Standard Delivery\" as the method with the default charged options.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-10", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
|
|
|
|
|
{"query_id": "agisdk-fly-unified-5", "dataset": "agisdk-real", "query": "Find me the cheapest fare for a flight from Orlando to Milwaukee on December 5th, 2024 and book it.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/25, Security Code: 420 Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-5", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "United Airlines"}}}
|
|
|
|
|
{"query_id": "agisdk-fly-unified-5", "dataset": "agisdk-real", "query": "Find me the cheapest fare for a flight from Orlando to Milwaukee on December 5th, 2024 and book it.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/30, Security Code: 420 Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-5", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "United Airlines"}}}
|
|
|
|
|
{"query_id": "agisdk-udriver-10", "dataset": "agisdk-real", "query": "Order me a ride for 4pm, I'll be at the de Young muesum headed to the Waterbar, fanciest option possible please.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-10", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
|
|
|
|
|
{"query_id": "agisdk-udriver-9", "dataset": "agisdk-real", "query": "Book me a ride from the thai restaurant I last took a ride to for later today at 2pm, I'll be at 333 Apartments on Fremont", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-9", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-9", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Uber"}}}
|
|
|
|
|
{"query_id": "agisdk-topwork-4", "dataset": "agisdk-real", "query": "Create a job post for a UI/UX Designer with expertise in Figma, Sketch, and Adobe Creative Suite, including project details, timeline, and required skills (Wireframing, Prototyping, Responsive Design).", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-4", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
|
|
|
|
|
{"query_id": "agisdk-gocalendar-4", "dataset": "agisdk-real", "query": "Change the \"Team Check-In\" event on July 18, 2024, name to \"Project Kickoff\" and update the location to \"Zoom\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-4", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
|
|
|
|
|
{"query_id": "agisdk-staynb-6", "dataset": "agisdk-real", "query": "Find and book the stay with the best value for money (cheapest stay with the best reviews) for 1 day. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-6", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-6", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
|
|
|
|
|
{"query_id": "agisdk-fly-unified-9", "dataset": "agisdk-real", "query": "Book me a flight from San Francisco to Chicago in Basic Economy on December 18th at 10:00. Ensure no seat selection is made.\nPassenger: David Lee\nDate of Birth: 07/22/1985\nSex: Male\nSeat Selection: No\nPayment: Credit Card (9999 8888 7777), Exp: 03/30, Address: 987 Cedar St, Chicago, IL, 60601, USA, Phone: 555-987-1234, Email: davidlee@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-9", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-9", "challenge_type": "action", "difficulty": "hard", "similar_to": "United Airlines"}}}
|
|
|
|
|
{"query_id": "agisdk-networkin-9", "dataset": "agisdk-real", "query": "Find a professional who attended Stanford and send them a connection request and a message.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-9", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-9", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
|
|
|
|
|
{"query_id": "agisdk-udriver-11", "dataset": "agisdk-real", "query": "I need to go from Pacific Catch on Chestnut back home to 333 Fremont now. If the fancy version is within ten dollars of the regular one, book that.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-11", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-11", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
|
|
|
|
|
{"query_id": "agisdk-fly-unified-4", "dataset": "agisdk-real", "query": "Book me a round-trip flight from Providence (Rhode Island) to Indianapolis, departing on December 5th, 2024 at 08:00 and returning on December 9th at 14:00.\nPassenger: Jane Smith\nDate of Birth: 02/14/1995\nSex: Female\nSeat Selection: Yes (Window seat)\nPayment: Credit Card (378342143523967), Exp: 06/26, security code: 345 Address: 456 Elm St, Miami, FL, 33101, USA, Phone: 555-987-6543, Email: janesmith@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-4", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "United Airlines"}}}
|
|
|
|
|
{"query_id": "agisdk-networkin-5", "dataset": "agisdk-real", "query": "Send a connection request to John Smith.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-5", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-5", "challenge_type": "action", "difficulty": "easy", "similar_to": "LinkedIn"}}}
|
|
|
|
|
{"query_id": "agisdk-zilloft-6", "dataset": "agisdk-real", "query": "Select a property listed in San Francisco as \"Condos\" within a price range under $300,000 and request a tour for tomorrow at 4:00 PM. Use these contact details: Name: Sarah Brown, Email: sarahbrown@example.com, Phone: 555-987-6543.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-6", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "Zillow"}}}
|
|
|
|
|
{"query_id": "agisdk-topwork-2", "dataset": "agisdk-real", "query": "Create a job posting for a Backend Developer specializing in Python, Django, and Flask to develop a high-performance web application. Include project details such as required skills (PostgreSQL, Docker, AWS, CI/CD), estimated project timeline, and budget.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-2", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-2", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
|
|
|
|
|
{"query_id": "agisdk-gocalendar-3", "dataset": "agisdk-real", "query": "Delete the event titled \"Breakfast Meeting with Client\" scheduled for July 19, 2024", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-3", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "Google Calendar"}}}
|
|
|
|
|
{"query_id": "agisdk-topwork-3", "dataset": "agisdk-real", "query": "Create a job listing for a Full-Stack Developer with expertise in Java, Spring Boot, and Angular, outlining the project scope, estimated duration, and required skills (MySQL, Docker, Kubernetes, and Jenkins). The ideal candidate should have experience in enterprise-level applications and building scalable microservices. After creating the job post, please describe what you included in the job listing.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-3", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-3", "challenge_type": "retrieval", "difficulty": "medium", "similar_to": "Upwork"}}}
|
|
|
|
|
{"query_id": "agisdk-fly-unified-2", "dataset": "agisdk-real", "query": "Book me a one-way flight from Indiana to New York on December 2nd 2024 at 12:00.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/25, Security Code: 245, Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-2", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "United Airlines"}}}
|
|
|
|
|
{"query_id": "agisdk-dashdish-7", "dataset": "agisdk-real", "query": "Select \"Express Delivery\" for an order from \"DragonEats\" of \"Mushroom Swiss Burger\" and complete the checkout with the pre-loaded Visa card.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-7", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-7", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
|
|
|
|
|
{"query_id": "agisdk-networkin-3", "dataset": "agisdk-real", "query": "Write a post inviting users to a networking event, including details about the event's purpose, date, and target audience.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-3", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-3", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
|
|
|
|
|
{"query_id": "agisdk-gomail-7", "dataset": "agisdk-real", "query": "Delete the email with the subject \"New Leadership Articles You Can't Miss\" from the Inbox.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-7", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-7", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Gmail"}}}
|
|
|
|
|
@@ -23,16 +19,13 @@
|
|
|
|
|
{"query_id": "agisdk-staynb-2", "dataset": "agisdk-real", "query": "Click on one of the stays displayed on the homepage and book it for a family of 4 (2 adults and 2 children). For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-2", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Airbnb"}}}
|
|
|
|
|
{"query_id": "agisdk-opendining-10", "dataset": "agisdk-real", "query": "Check the menus of all restaurants for vegetarian options and make a reservation at the one with the most vegetarian choices. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-10", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-10", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "OpenTable"}}}
|
|
|
|
|
{"query_id": "agisdk-opendining-4", "dataset": "agisdk-real", "query": "Use the search bar to search for a restaurant on September 2nd at 4:30 PM for 7 people, using \"Japanese\" as the search term, and book the first result. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-4", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-4", "challenge_type": "action", "difficulty": "hard", "similar_to": "OpenTable"}}}
|
|
|
|
|
{"query_id": "agisdk-gomail-8", "dataset": "agisdk-real", "query": "Clear all emails from \"GitHub\" in the inbox to trash.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-8", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-8", "challenge_type": "action", "difficulty": "medium", "similar_to": "Gmail"}}}
|
|
|
|
|
{"query_id": "agisdk-dashdish-4", "dataset": "agisdk-real", "query": "Schedule a delivery order from \"Taco Bell\" adding a \"Classic Cheeseburger\" large size for later and add the note \"Leave at the front door\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-4", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Doordash"}}}
|
|
|
|
|
{"query_id": "agisdk-networkin-1", "dataset": "agisdk-real", "query": "Create a new text post for the feed with a professional update about AI trends in 2025, mentioning three key advancements and their impact on the job market.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-1", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
|
|
|
|
|
{"query_id": "agisdk-dashdish-5", "dataset": "agisdk-real", "query": "Add three \"Loaded Bacon Cheese Fries\" to the shopping cart from \"Man vs. Fries\". Proceed to checkout and select \"Pickup\" as the delivery method.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-5", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Doordash"}}}
|
|
|
|
|
{"query_id": "agisdk-opendining-5", "dataset": "agisdk-real", "query": "Scroll through the homepage carousel until \"Ocean Breeze\" is visible, select the second available time slot, and complete the reservation. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-5", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-5", "challenge_type": "action", "difficulty": "medium", "similar_to": "OpenTable"}}}
|
|
|
|
|
{"query_id": "agisdk-topwork-1", "dataset": "agisdk-real", "query": "Create a new job post for a Frontend Developer with expertise in React and TypeScript, specifying project details such as estimated duration, required skills, and budget.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-1", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
|
|
|
|
|
{"query_id": "agisdk-gocalendar-1", "dataset": "agisdk-real", "query": "Create a new event titled \"Team Meeting\" on July 19, 2024, from 2 PM to 2:30 PM, and include \"Conference Room A\" as the location", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-1", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
|
|
|
|
|
{"query_id": "agisdk-gomail-5", "dataset": "agisdk-real", "query": "Schedule an email to jane.doe@example.com with the subject \"Weekly Update\" to be sent next Monday at 9:00 AM.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-5", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Gmail"}}}
|
|
|
|
|
{"query_id": "agisdk-staynb-4", "dataset": "agisdk-real", "query": "Book a stay for 2 children with 1 adult. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-4", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Airbnb"}}}
|
|
|
|
|
{"query_id": "agisdk-networkin-6", "dataset": "agisdk-real", "query": "Choose a random person who you haven't connected with, connect with them, and send them a message saying, 'howdy, partner'.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-6", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
|
|
|
|
|
{"query_id": "agisdk-dashdish-2", "dataset": "agisdk-real", "query": "Add a \"Medium Pepperoni Pizza\" from the restaurant \"Papa Johns Pizza\" to the shopping cart and purchase it.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-2", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Doordash"}}}
|
|
|
|
|
{"query_id": "agisdk-staynb-8", "dataset": "agisdk-real", "query": "Scroll through the homepage and book the last stay located in Paris.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-8", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-8", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
|
|
|
|
|
{"query_id": "agisdk-gomail-2", "dataset": "agisdk-real", "query": "Mark the first email in the Inbox as \"read\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-2", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Gmail"}}}
|
|
|
|
|
|