import { z } from 'zod';
import { NxtscapeTool } from '../base/NxtscapeTool';
import { ToolConfig } from '../base/ToolConfig';
import { ExecutionContext } from '@/lib/runtime/ExecutionContext';
import { HumanMessage, SystemMessage } from '@langchain/core/messages';
import { withFlexibleStructuredOutput } from '@/lib/llm/utils/structuredOutput';
import { BrowserState } from '@/lib/browser/BrowserContext';
import { Logging } from '@/lib/utils/Logging';
import { profileAsync } from '@/lib/utils/Profiler';

/**
 * Schema for find element tool input
 */
export const FindElementInputSchema = z.object({
  elementDescription: z.string(),  // Natural language description of the element to find
  intent: z.string().optional()  // Optional context about why finding this element
});

export type FindElementInput = z.infer<typeof FindElementInputSchema>;

/**
 * Schema for find element tool output
 */
export const FindElementOutputSchema = z.object({
  success: z.boolean(),  // Whether an element was found
  index: z.number().optional(),  // Index of the found element
  confidence: z.enum(['high', 'medium', 'low']).optional(),  // Confidence in the match
  elementInfo: z.object({  // Information about the found element
    tagName: z.string(),  // HTML tag name
    text: z.string(),  // Visible text content
    attributes: z.record(z.string()).optional()  // Key attributes
  }).optional(),
  message: z.string()  // Human-readable result message
});

export type FindElementOutput = z.infer<typeof FindElementOutputSchema>;

/**
 * Tool for finding elements on a page using natural language descriptions
 * Uses Chrome BrowserOS V2 API for element identification
 *
 * NOTE(review): `NxtscapeTool` and `ToolConfig` are used without type
 * arguments here; if they are declared generic in ../base, the arguments
 * (presumably FindElementInput / FindElementOutput) need to be restored -
 * confirm against the base class declarations.
 */
export class FindElementTool extends NxtscapeTool {
  /**
   * Builds the static tool configuration (name, schemas, examples, streaming
   * display settings) and hands it to the base class.
   *
   * @param executionContext - Runtime context forwarded to the base class
   */
  constructor(executionContext: ExecutionContext) {
    const config: ToolConfig = {
      name: 'find_element',
      description: 'Find an element on the page using a natural language description. Returns the element index to use with the interact tool. Uses AI to match your description to the best element.',
      category: 'navigation',
      version: '1.0.0',
      inputSchema: FindElementInputSchema,
      outputSchema: FindElementOutputSchema,
      examples: [
        {
          description: 'Find a submit button',
          input: {
            elementDescription: 'submit button',
            intent: 'Looking for the form submission button'
          },
          output: {
            success: true,
            index: 23,
            confidence: 'high',
            elementInfo: {
              tagName: 'button',
              text: 'Submit',
              attributes: { type: 'submit' }
            },
            message: 'Found submit button at index 23 with high confidence'
          }
        },
        {
          description: 'Find an email input field',
          input: {
            elementDescription: 'email address input field'
          },
          output: {
            success: true,
            index: 10,
            confidence: 'high',
            elementInfo: {
              tagName: 'input',
              text: '',
              attributes: { type: 'email', placeholder: 'Enter your email' }
            },
            message: 'Found email input field at index 10 with high confidence'
          }
        },
        {
          description: 'Element not found',
          input: {
            elementDescription: 'login button'
          },
          output: {
            success: false,
            message: 'No element found matching "login button"'
          }
        }
      ],
      streamingConfig: {
        displayName: 'Find Element',
        icon: '🔍',
        progressMessage: 'Searching for element...'
      }
    };

    super(config, executionContext);
  }

  /**
   * Override: Generate contextual display message
   *
   * Prefers the caller-supplied intent, then falls back to
   * "Finding: <description>", then to a generic message.
   *
   * @param args - Tool input; accessed with optional chaining, so a missing
   *   value yields the generic message
   * @returns Progress text to display while the tool runs
   */
  getProgressMessage(args: FindElementInput): string {
    try {
      // Note: args should already be parsed by StreamEventProcessor
      const description = args?.elementDescription;
      const intent = args?.intent;

      if (intent) {
        return intent;
      } else if (description) {
        return `Finding: ${description}`;
      }

      return 'Searching for element...';
    } catch {
      // Any failure while reading args falls back to the generic message.
      return 'Searching for element...';
    }
  }

  /**
   * Override: Format result for display
   *
   * @param output - Result produced by this tool
   * @returns The failure message prefixed with a cross mark; otherwise the
   *   index/confidence summary when both are present; otherwise the success
   *   message prefixed with a check mark
   */
  FormatResultForUI(output: FindElementOutput): string {
    if (!output.success) {
      return `❌ ${output.message}`;
    }

    // Compare against undefined explicitly: index 0 is a valid element index.
    if (output.index !== undefined && output.confidence) {
      return `🔍 Found element at index ${output.index} (${output.confidence} confidence)`;
    }

    return `✅ ${output.message}`;
  }

  /**
   * Looks up the element described by `input.elementDescription` on the
   * current page.
   *
   * Returns a failure result (rather than throwing) when the page exposes no
   * clickable elements or when any step raises. Only the description and the
   * browser state are forwarded to the LLM lookup; `input.intent` is not.
   *
   * @param input - Validated tool input
   * @returns The lookup result, or `{ success: false, message }` on failure
   */
  protected async execute(input: FindElementInput): Promise<FindElementOutput> {
    return profileAsync(`FindElementTool.execute[${input.elementDescription}]`, async () => {
      try {
        // Get browser state (V2 doesn't support vision/screenshots)
        const browserState = await this.executionContext.browserContext.getBrowserState();

        if (browserState.clickableElements.length === 0) {
          return {
            success: false,
            message: 'No clickable elements found on the current page'
          };
        }

        // Use LLM to find the element
        const result = await this.findElementWithLLM(
          input.elementDescription,
          browserState,
        );

        return result;
      } catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        return {
          success: false,
          message: `Failed to find element: ${errorMessage}`
        };
      }
    });
  }

  /**
   * Find element using LLM with structured output
   * Uses V2 browser state with unified element indexing
   */
  private async findElementWithLLM(
    description: string,
    browserState: BrowserState,
  ): Promise<FindElementOutput> {
    return profileAsync(`FindElementTool.findElementWithLLM[${description}]`, async () => {
      // Get LLM with low temperature for consistency
      const llm = await this.getLLM({ temperature: 0.1 });

      // Define output schema for LLM
      const findElementSchema = z.object({
        found: z.boolean().describe('Whether a matching element was found'),
        index: z.number().optional().describe('The index number of the best matching element'),
        confidence: z.enum(['high', 'medium', 'low']).optional().describe('Confidence level in the match'),
        reasoning: z.string().describe('Brief explanation of the decision')
      });

      // Create LLM with structured output using flexible schema handling
      const structuredLLM = await withFlexibleStructuredOutput(llm, findElementSchema);

      // Build system prompt
      // NOTE(review): the format legend in this prompt reads as if angle-bracket
      // placeholders were stripped ("- is the type: for clickable/selectable,
      // for typeable", "- is the HTML tag"); the text is left untouched here
      // because it is sent to the model verbatim - confirm against the
      // intended prompt.
      const systemPrompt = `You are an expert at finding elements on web pages using Chrome BrowserOS V2 API. Your task is to find the element that best matches the user's description. **IMPORTANT INSTRUCTIONS:** 1. Elements are shown with nodeId in square brackets like [0], [1], [23], etc. 2. The nodeId is a sequential index assigned by Chrome BrowserOS to interactive elements 3. Return the NUMBER inside the brackets as the index (e.g., for [23] return 23) 4. Elements use a compact format with indentation showing DOM hierarchy: [nodeId] "name" ctx:"context" path:"...>..." attr:"key=value ..." Where: - Indentation (spaces) indicates depth in the DOM tree - is the type: for clickable/selectable, for typeable - is the HTML tag (button, input, a, div, etc.) - "name" is the visible text (truncated to 40 chars) - ctx:"context" shows surrounding text (truncated to 60 chars) - path:"...>..." 
shows last 3 ancestors in DOM (e.g., "nav>ul>a") - attr:"..." shows key attributes like type, placeholder, value, aria-label Examples: [1]