mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-19 11:31:03 +00:00
* clean-up bunch of files for re-write * more clean-up and adding basic agent * Minor fix moved types into respective files. * Deleted bunch of old files backup Update gitignore Deleted a bunch of files Remove message manager Deleted old docs Update rules rename Profiler to profiler * Temporarily adding old code * Adding two small things back * backup * Implemented LangChainProvider and updated cursor rules backup LangChainProvider curosr rules * Implement tests for LangChainProvider -- unit test and integration test integration test passes integration test backup * Tool Design Tools Desing tools design * NavigationTool ready NavigationTool ready NavigationTool ready NaivgationTool ready backup * MessageManager MessageManager backup * Fixed integration test * Agent design new Updated agent design and added bunch of /NTN commands agent new design * Delete old agent design * MessageManagerReadOnly class * PlannerTool ready PlannerTool almost ready * ToolManager and DoneTool * Integration of BrowserAgent * BrowserAgent implementation v0.1 * BrowserAgent small fix v0.2 * Tool calling design too call design tool design claude * Update agent tool design with // NTN * add zod-to-json npm install * BrowserAGent v0.3 * BrowserAgent v0.4 * BrowserAgent v0.5 * fixes * Build error fixes in my NEWLY added code build errors fix * Build error fixes in old code (integration work) backup * Comment StreamEventProcessor for now, it is not used * Small build error fix * Small rename * Added integration test to check structuredLLM and changed to 4o-mini change default to nxtscape integration test * Small docstring * Simplified BrowserAgent code and added integration test Simplified BrowserAgent code BrowserAGent integrationt est * Update CLAUDE.md with project memory and instructions on how to write code Update CLAUDE.md with project memory and instructions on how to write code Project Memory * Just a mova.. Moved ToolManager outside. Build works. 
* TabOperations tool TabOperations Tool and fixing some test tab operations * Update CLAUDE.md * Added ClassificationTool classifiction tool classification prommpt * Refactored and simplified PlannerTool unit test and integration test * Updated Plnnaer tool * Update CLAUDE.md * BrowserAgent modified to do classification BrowserAgent with classification * minor fix to ToolManager * Instead of ToolCall and ToolResult -- just updating message manager once * minor fix to BrowserAgent integration test * Changed done to "done_tool" * Updated CLAUDE.md to reflect understanding of claude * Uncommented stream event processor * Renamed EventBus to StreamEventBus * Commented StreamEventProcessor * Event Processor * Integrated EventProcessor with BrowserAgent Added EventProcessor to BrowserAgetn * Renamed StreamEventBus to EventBus * Made EventBus required parameter in ExecutionContext * PlanGenerator rewrite PlanGenerator rewrite backup * For simple task, explicitly tell it to call done tool * Max attempts for simple task * backup * Revert "backup" This reverts commit 7d79a3d4d5774bfef79ec9827878b74edad3593f. * Consolidating where EventBus and EventProcessor are created and initialized backup * Update CLAUDE.md Update CLAUDE.md * Improving agent loop code Cleaned up processTooCall classification task * Create test-writer subAgent test-agent-prompt test agent prompt test-agent-prompt Update test-writer.md * BrowserAgent test Browseragent test BrowserAgent test * BrowserAgent refactor backup backup * Minor fixes * Minor fix * minor change -- NEW AGENT LOOP IS WORKING WELL * Update cursor rules * Small change * Improved BrowserAgent integration test Improved BrowserAgent integration test * Small change * Update CLAUDE.md * Different tools * FindElementTool is ready Find element update backup find element backup * Updated to test strings to say "tests..." 
* ScrollTool is ready * RefreshStateTool is updated as well * MessageManager updated * SearchTool is ready backup * Interaction Element is also ready * Add debugMessage emitter * ValidatorTool ready and tests are passing Validation Tool validator tool backup backup * GroupTabs tool ready * Registered all the tools * Planning changed to 5 steps * BrowserAgent integration test fix * Minor string changes * backup * Removed too many confusing events in EventProcessor -- there is only event.info right now * Abort control implemented backup Abort * Formatter for toolResult Formatter for toolResult backup * Always render using Markdown * Minor fix --------- Co-authored-by: Nikhil Sonti <nikhilsv92@gmail.com>
341 lines
12 KiB
TypeScript
341 lines
12 KiB
TypeScript
import { z } from 'zod';
|
|
import { NxtscapeTool } from '../base/NxtscapeTool';
|
|
import { ToolConfig } from '../base/ToolConfig';
|
|
import { ExecutionContext } from '@/lib/runtime/ExecutionContext';
|
|
import { HumanMessage, SystemMessage } from '@langchain/core/messages';
|
|
import { withFlexibleStructuredOutput } from '@/lib/llm/utils/structuredOutput';
|
|
import { BrowserState } from '@/lib/browser/BrowserContext';
|
|
import { Logging } from '@/lib/utils/Logging';
|
|
import { profileAsync } from '@/lib/utils/Profiler';
|
|
|
|
/**
 * Zod schema describing the input accepted by the find element tool.
 */
export const FindElementInputSchema = z.object({
  // Natural language description of the element to find
  elementDescription: z.string(),

  // Optional context about why finding this element
  intent: z.string().optional()
});

/** Static input type inferred from FindElementInputSchema. */
export type FindElementInput = z.infer<typeof FindElementInputSchema>;
|
|
|
|
/**
 * Shape of the descriptive information reported for a found element.
 * Kept private to this module; it is only used as a field of the output schema.
 */
const FoundElementInfoSchema = z.object({
  // HTML tag name
  tagName: z.string(),

  // Visible text content
  text: z.string(),

  // Key attributes
  attributes: z.record(z.string()).optional()
});

/**
 * Zod schema describing the result produced by the find element tool.
 */
export const FindElementOutputSchema = z.object({
  // Whether an element was found
  success: z.boolean(),

  // Index of the found element
  index: z.number().optional(),

  // Confidence in the match
  confidence: z.enum(['high', 'medium', 'low']).optional(),

  // Information about the found element
  elementInfo: FoundElementInfoSchema.optional(),

  // Human-readable result message
  message: z.string()
});

/** Static output type inferred from FindElementOutputSchema. */
export type FindElementOutput = z.infer<typeof FindElementOutputSchema>;
|
|
|
|
/**
 * Structured answer requested from the LLM when matching a description
 * against the page's interactive elements.
 */
const FindElementLLMResultSchema = z.object({
  found: z.boolean().describe('Whether a matching element was found'),
  index: z.number().optional().describe('The index number of the best matching element'),
  confidence: z.enum(['high', 'medium', 'low']).optional().describe('Confidence level in the match'),
  reasoning: z.string().describe('Brief explanation of the decision')
});

/**
 * System prompt explaining the compact element listing format to the LLM
 * and the fields it must return.
 */
const FIND_ELEMENT_SYSTEM_PROMPT = `You are an expert at finding elements on web pages using Chrome BrowserOS V2 API.

Your task is to find the element that best matches the user's description.

**IMPORTANT INSTRUCTIONS:**
1. Elements are shown with nodeId in square brackets like [0], [1], [23], etc.
2. The nodeId is a sequential index assigned by Chrome BrowserOS to interactive elements
3. Return the NUMBER inside the brackets as the index (e.g., for [23] return 23)
4. Elements use a compact format with indentation showing DOM hierarchy:
[nodeId] <T> <tag> "name" ctx:"context" path:"...>..." attr:"key=value ..."

Where:
- Indentation (spaces) indicates depth in the DOM tree
- <T> is the type: <C> for clickable/selectable, <T> for typeable
- <tag> is the HTML tag (button, input, a, div, etc.)
- "name" is the visible text (truncated to 40 chars)
- ctx:"context" shows surrounding text (truncated to 60 chars)
- path:"...>..." shows last 3 ancestors in DOM (e.g., "nav>ul>a")
- attr:"..." shows key attributes like type, placeholder, value, aria-label

Examples:
[1] <C> <button> "Submit" ctx:"Contact form - Send us a message" path:"main>form>button"
[2] <C> <a> "Products" ctx:"Main navigation menu" path:"header>nav>a"
[3] <C> <a> "Electronics" ctx:"Shop by category" path:"nav>ul>a"
[10] <T> <input> "Email" ctx:"Sign up for newsletter" path:"footer>form>input" attr:"type=email placeholder=Enter your email"

5. The context field helps identify the element's purpose within the page
6. The path field shows the element's location in a concise format
7. Indentation visually shows parent-child relationships
8. Consider all available information when matching:
- Type indicator (<C> or <T>)
- HTML tag
- Visible name/text
- Context from surrounding elements
- Path showing location
- Attributes for inputs (type, placeholder, etc.)
9. Choose the SINGLE BEST match if multiple candidates exist

**SCREENSHOT GUIDANCE:**
If a screenshot is provided, use it for spatial awareness of the page layout:
- Visual positioning helps disambiguate elements with similar text
- Layout context shows which elements are grouped together
- Visual prominence (size, color, position) indicates importance
- Use the screenshot to understand the overall page structure and make better decisions

**Return format:**
- found: true if a matching element exists, false otherwise
- index: the nodeId of the element (the number inside the brackets)
- confidence: your confidence level (high/medium/low)
- reasoning: brief explanation of why you chose this element`;

/**
 * Tool for finding elements on a page using natural language descriptions.
 * Uses Chrome BrowserOS V2 API for element identification.
 *
 * The tool reads the current browser state, asks an LLM (with structured
 * output) to pick the nodeId that best matches the description, verifies that
 * the nodeId exists among the clickable or typeable elements, and returns it
 * as `index`.
 */
export class FindElementTool extends NxtscapeTool<FindElementInput, FindElementOutput> {
  /**
   * @param executionContext - Runtime context; this class reads
   *   `browserContext.getBrowserState()` from it.
   */
  constructor(executionContext: ExecutionContext) {
    const config: ToolConfig<FindElementInput, FindElementOutput> = {
      name: 'find_element',
      description: 'Find an element on the page using a natural language description. Returns the element index to use with the interact tool. Uses AI to match your description to the best element.',
      category: 'navigation',
      version: '1.0.0',
      inputSchema: FindElementInputSchema,
      outputSchema: FindElementOutputSchema,
      examples: [
        {
          description: 'Find a submit button',
          input: {
            elementDescription: 'submit button',
            intent: 'Looking for the form submission button'
          },
          output: {
            success: true,
            index: 23,
            confidence: 'high',
            elementInfo: {
              tagName: 'button',
              text: 'Submit',
              attributes: { type: 'submit' }
            },
            message: 'Found submit button at index 23 with high confidence'
          }
        },
        {
          description: 'Find an email input field',
          input: {
            elementDescription: 'email address input field'
          },
          output: {
            success: true,
            index: 10,
            confidence: 'high',
            elementInfo: {
              tagName: 'input',
              text: '',
              attributes: { type: 'email', placeholder: 'Enter your email' }
            },
            message: 'Found email input field at index 10 with high confidence'
          }
        },
        {
          description: 'Element not found',
          input: {
            elementDescription: 'login button'
          },
          output: {
            success: false,
            message: 'No element found matching "login button"'
          }
        }
      ],
      streamingConfig: {
        displayName: 'Find Element',
        icon: '🔍',
        progressMessage: 'Searching for element...'
      }
    };

    super(config, executionContext);
  }

  /**
   * Override: Generate contextual display message.
   *
   * @param args - Tool input; may be missing fields, so access is defensive.
   * @returns The intent when given, otherwise `Finding: <description>`,
   *   otherwise a generic progress message.
   */
  getProgressMessage(args: FindElementInput): string {
    const fallback = 'Searching for element...';

    try {
      // Note: args should already be parsed by StreamEventProcessor
      const description = args?.elementDescription;
      const intent = args?.intent;

      if (intent) {
        return intent;
      }
      if (description) {
        return `Finding: ${description}`;
      }
      return fallback;
    } catch {
      // Best effort only: a progress label must never break the tool run.
      return fallback;
    }
  }

  /**
   * Override: Format result for display.
   *
   * @param output - Result produced by this tool.
   * @returns A one-line, emoji-prefixed summary of the result.
   */
  FormatResultForUI(output: FindElementOutput): string {
    if (!output.success) {
      return `❌ ${output.message}`;
    }

    if (output.index !== undefined && output.confidence) {
      return `🔍 Found element at index ${output.index} (${output.confidence} confidence)`;
    }

    return `✅ ${output.message}`;
  }

  /**
   * Locate the element described by `input` on the current page.
   *
   * Never throws: any error raised while reading the browser state or
   * querying the LLM is converted into a `success: false` result.
   *
   * @param input - Element description plus optional intent.
   * @returns The match result, including the element index on success.
   */
  protected async execute(input: FindElementInput): Promise<FindElementOutput> {
    return profileAsync(`FindElementTool.execute[${input.elementDescription}]`, async () => {
      try {
        // Get browser state (V2 doesn't support vision/screenshots)
        const browserState = await this.executionContext.browserContext.getBrowserState();

        // The LLM is shown both clickable and typeable elements, so only give
        // up early when BOTH lists are empty (a page may contain only inputs).
        const clickableCount = browserState.clickableElements?.length ?? 0;
        const typeableCount = browserState.typeableElements?.length ?? 0;
        if (clickableCount === 0 && typeableCount === 0) {
          return {
            success: false,
            // Message text kept unchanged for callers that match on it.
            message: 'No clickable elements found on the current page'
          };
        }

        // Use LLM to find the element
        return await this.findElementWithLLM(
          input.elementDescription,
          browserState,
          input.intent,
        );
      } catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);

        return {
          success: false,
          message: `Failed to find element: ${errorMessage}`
        };
      }
    });
  }

  /**
   * Find element using LLM with structured output.
   * Uses V2 browser state with unified element indexing.
   *
   * @param description - Natural language description of the element.
   * @param browserState - Snapshot providing the element listings, the
   *   element arrays used for validation, and an optional screenshot.
   * @param intent - Optional reason for the search; when provided it is
   *   appended to the user prompt to help disambiguate similar elements.
   * @returns A successful result with the validated index, or a
   *   `success: false` result when nothing matched or the LLM returned an
   *   index that is not present in the browser state.
   * @throws Error prefixed with "LLM element finding failed:" when the LLM
   *   call or result handling raises.
   */
  private async findElementWithLLM(
    description: string,
    browserState: BrowserState,
    intent?: string,
  ): Promise<FindElementOutput> {
    return profileAsync(`FindElementTool.findElementWithLLM[${description}]`, async () => {
      // Get LLM with low temperature for consistency
      const llm = await this.getLLM({ temperature: 0.1 });

      // Create LLM with structured output using flexible schema handling
      const structuredLLM = await withFlexibleStructuredOutput(llm, FindElementLLMResultSchema);

      // Get DOM content as text
      const domContent = browserState.clickableElementsString + '\n' + browserState.typeableElementsString;

      // Build the prompt text once; it is shared by both message variants.
      // Without an intent the text is identical to the previous prompt.
      const intentLine = intent ? `\nReason for finding this element: "${intent}"` : '';
      const userText = `Find the element matching this description: "${description}"${intentLine}

Interactive elements on the page:
${domContent}`;

      // Attach the screenshot as a second content part when one is available.
      const userMessage = browserState.screenshot
        ? new HumanMessage({
            content: [
              {
                type: 'text',
                text: userText
              },
              {
                type: 'image_url',
                image_url: { url: `data:image/jpeg;base64,${browserState.screenshot}` }
              }
            ]
          })
        : new HumanMessage(userText);

      try {
        // Get LLM response
        const llmResult = await structuredLLM.invoke([
          new SystemMessage(FIND_ELEMENT_SYSTEM_PROMPT),
          userMessage
        ]);

        // Handle not found case
        if (!llmResult.found || llmResult.index === undefined) {
          return {
            success: false,
            message: llmResult.reasoning || `No element found matching "${description}"`
          };
        }

        // The index returned by LLM is actually the nodeId (they're the same in V2)
        const foundInClickable = browserState.clickableElements.find(element => element.nodeId === llmResult.index);
        const foundInTypeable = browserState.typeableElements.find(element => element.nodeId === llmResult.index);
        const foundElement = foundInClickable ?? foundInTypeable;

        // Guard against the LLM inventing an index that is not on the page.
        if (!foundElement) {
          return {
            success: false,
            message: `Invalid index ${llmResult.index} returned - element not found in browser state`
          };
        }

        // Log the found element details
        Logging.log('FindElementTool', `Found element at index ${llmResult.index}:`, 'info');
        Logging.log('FindElementTool', ` - Text: "${foundElement.text || '(no text)'}"`, 'info');
        Logging.log('FindElementTool', ` - Tag: <${foundElement.tag || 'unknown'}>`, 'info');
        Logging.log('FindElementTool', ` - Type: ${foundInClickable ? 'clickable' : 'typeable'}`, 'info');
        Logging.log('FindElementTool', ` - Confidence: ${llmResult.confidence}`, 'info');
        Logging.log('FindElementTool', ` - LLM Reasoning: ${llmResult.reasoning}`, 'info');

        // `confidence` is optional in the LLM schema; omit the clause rather
        // than rendering "with undefined confidence".
        const confidenceClause = llmResult.confidence
          ? ` with ${llmResult.confidence} confidence`
          : '';

        // Return successful result with limited info since we don't have direct element access
        return {
          success: true,
          index: llmResult.index,
          confidence: llmResult.confidence,
          elementInfo: {
            tagName: foundElement.tag || 'element',  // Use actual tag
            text: foundElement.text || '',  // Use actual text
            attributes: {}
          },
          message: `Found ${description} at index ${llmResult.index}${confidenceClause}`
        };
      } catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        throw new Error(`LLM element finding failed: ${errorMessage}`);
      }
    });
  }
}
|