Files
BrowserOS/reference-code/old-lib/tools/browser-navigation/InteractionTool.ts
Felarof 8245dfe0ff Rewrite Agent Loop (#7)
* clean-up bunch of files for re-write

* more clean-up and adding basic agent

* Minor fix moved types into respective files.

* Deleted bunch of old files

backup

Update gitignore

Deleted a bunch of files

Remove message manager

Deleted old docs

Update rules

rename Profiler to profiler

* Temporarily adding old code

* Adding two small things back

* backup

* Implemented LangChainProvider and updated cursor rules

backup

LangChainProvider

curosr rules

* Implement tests for LangChainProvider -- unit test and integration test

integration test passes

integration test backup

* Tool Design

Tools Desing

tools design

* NavigationTool ready

NavigationTool ready

NavigationTool ready

NaivgationTool ready

backup

* MessageManager

MessageManager

backup

* Fixed integration test

* Agent design new

Updated agent design and added bunch of /NTN commands

agent new design

* Delete old agent design

* MessageManagerReadOnly class

* PlannerTool ready

PlannerTool almost ready

* ToolManager and DoneTool

* Integration of BrowserAgent

* BrowserAgent implementation v0.1

* BrowserAgent small fix v0.2

* Tool calling design

too call design

tool design claude

* Update agent tool design with // NTN

* add zod-to-json npm install

* BrowserAGent v0.3

* BrowserAgent v0.4

* BrowserAgent v0.5

* fixes

* Build error fixes in my NEWLY added code

build errors fix

* Build error fixes in old code (integration work)

backup

* Comment StreamEventProcessor for now, it is not used

* Small build error fix

* Small rename

* Added integration test to check structuredLLM and changed to 4o-mini

change default to nxtscape

integration test

* Small docstring

* Simplified BrowserAgent code and added integration test

Simplified BrowserAgent code

BrowserAGent integrationt est

* Update CLAUDE.md with project memory and instructions on how to write code

Update CLAUDE.md with project memory and instructions on how to write code

Project Memory

* Just a mova.. Moved ToolManager outside. Build works.

* TabOperations tool

TabOperations Tool and fixing some test

tab operations

* Update CLAUDE.md

* Added ClassificationTool

classifiction tool

classification prommpt

* Refactored and simplified PlannerTool unit test and integration test

* Updated Plnnaer tool

* Update CLAUDE.md

* BrowserAgent modified to do classification

BrowserAgent with classification

* minor fix to ToolManager

* Instead of ToolCall and ToolResult -- just updating message manager once

* minor fix to BrowserAgent integration test

* Changed done to "done_tool"

* Updated CLAUDE.md to reflect understanding of claude

* Uncommented stream event processor

* Renamed EventBus to StreamEventBus

* Commented StreamEventProcessor

* Event Processor

* Integrated EventProcessor with BrowserAgent

Added EventProcessor to BrowserAgetn

* Renamed StreamEventBus to EventBus

* Made EventBus required parameter in ExecutionContext

* PlanGenerator rewrite

PlanGenerator rewrite

backup

* For simple task, explicitly tell it to call done tool

* Max attempts for simple task

* backup

* Revert "backup"

This reverts commit 7d79a3d4d5774bfef79ec9827878b74edad3593f.

* Consolidating where EventBus and EventProcessor are created and initialized

backup

* Update CLAUDE.md

Update CLAUDE.md

* Improving agent loop code

Cleaned up processTooCall

classification task

* Create test-writer subAgent

test-agent-prompt

test agent prompt

test-agent-prompt

Update test-writer.md

* BrowserAgent test

Browseragent test

BrowserAgent test

* BrowserAgent refactor

backup

backup

* Minor fixes

* Minor fix

* minor change -- NEW AGENT LOOP IS WORKING WELL

* Update cursor rules

* Small change

* Improved BrowserAgent integration test

Improved BrowserAgent integration test

* Small change

* Update CLAUDE.md

* Different tools

* FindElementTool is ready

Find element update

backup

find element backup

* Updated to test strings to say "tests..."

* ScrollTool is ready

* RefreshStateTool is updated as well

* MessageManager updated

* SearchTool is ready

backup

* Interaction Element is also ready

* Add debugMessage emitter

* ValidatorTool ready and tests are passing

Validation Tool

validator tool

backup

backup

* GroupTabs tool ready

* Registered all the tools

* Planning changed to 5 steps

* BrowserAgent integration test fix

* Minor string changes

* backup

* Removed too many confusing events in EventProcessor -- there is only event.info right now

* Abort control implemented

backup

Abort

* Formatter for toolResult

Formatter for toolResult

backup

* Always render using Markdown

* Minor fix

---------

Co-authored-by: Nikhil Sonti <nikhilsv92@gmail.com>
2025-07-29 08:14:45 -07:00

523 lines
17 KiB
TypeScript

import { z } from 'zod';
import { NxtscapeTool } from '../base/NxtscapeTool';
import { ToolConfig } from '../base/ToolConfig';
import { ExecutionContext } from '@/lib/runtime/ExecutionContext';
import { BrowserPage } from '@/lib/browser/BrowserPage';
import { Logging } from '@/lib/utils/Logging';
import { profileAsync } from '@/lib/utils/Profiler';
/**
* Enum for interaction operations
*/
export const InteractionOperationTypeEnum = z.enum([
'click', // Click an element
'input_text', // Input text into an element
'clear', // Clear text from an input element
'send_keys' // Send keyboard keys/shortcuts
]);
export type InteractionOperationType = z.infer<typeof InteractionOperationTypeEnum>;
/**
* Schema for interaction tool input
*/
export const InteractionInputSchema = z.object({
operationType: InteractionOperationTypeEnum, // The operation to perform
index: z.number().optional(), // Element index from the selector map (optional for send_keys)
text: z.string().optional(), // Text for input_text
keys: z.string().optional(), // Keys for send_keys operation
intent: z.string().optional() // Optional description of why this interaction is being performed
});
export type InteractionInput = z.infer<typeof InteractionInputSchema>;
/**
* Schema for interaction tool output
*/
export const InteractionOutputSchema = z.object({
success: z.boolean(), // Whether the operation succeeded
operationType: InteractionOperationTypeEnum, // Operation that was performed
message: z.string(), // Human-readable result message
elementInfo: z.object({
tagName: z.string(), // Element tag name
text: z.string().optional(), // Element text content
type: z.string().optional(), // Input type if applicable
value: z.string().optional() // Current value if applicable
}).optional(),
newTabOpened: z.boolean().optional() // Whether a new tab was opened (for clicks)
});
export type InteractionOutput = z.infer<typeof InteractionOutputSchema>;
/**
* Unified tool for element interactions
*/
export class InteractionTool extends NxtscapeTool<InteractionInput, InteractionOutput> {
constructor(executionContext: ExecutionContext) {
const config: ToolConfig<InteractionInput, InteractionOutput> = {
name: 'interact',
description: 'Perform element interactions. Operations: "click" (click element), "input_text" (type text into element), "clear" (clear input field), "send_keys" (send keyboard keys). Always pass operationType. Pass index for element operations. Pass text for input_text, keys for send_keys. Note: For dropdowns, click to open them and then click the desired option.',
category: 'interaction',
version: '1.0.0',
inputSchema: InteractionInputSchema,
outputSchema: InteractionOutputSchema,
examples: [
{
description: 'Click a button',
input: {
operationType: 'click',
index: 15,
intent: 'Clicking the submit button'
},
output: {
success: true,
operationType: 'click',
message: 'Clicked element with index 15',
elementInfo: {
tagName: 'button',
text: 'Submit'
}
}
},
{
description: 'Input text into a field',
input: {
operationType: 'input_text',
index: 8,
text: 'john.doe@example.com',
intent: 'Entering email address'
},
output: {
success: true,
operationType: 'input_text',
message: 'Input text into element with index 8',
elementInfo: {
tagName: 'input',
type: 'email',
value: 'john.doe@example.com'
}
}
},
{
description: 'Clear an input field',
input: {
operationType: 'clear',
index: 12,
intent: 'Clearing the search box'
},
output: {
success: true,
operationType: 'clear',
message: 'Cleared element with index 12',
elementInfo: {
tagName: 'input',
type: 'text',
value: ''
}
}
},
{
description: 'Send keyboard keys',
input: {
operationType: 'send_keys',
keys: 'Enter',
intent: 'Pressing Enter to submit form'
},
output: {
success: true,
operationType: 'send_keys',
message: 'Sent keys: Enter'
}
}
],
streamingConfig: {
displayName: 'Interact',
icon: '🖱️',
progressMessage: 'Interacting with element...'
}
};
super(config, executionContext);
}
/**
* Override: Generate contextual display message based on operation
*/
getProgressMessage(args: InteractionInput): string {
try {
// Note: args should already be parsed by StreamEventProcessor
const operationType = args?.operationType;
const index = args?.index;
const intent = args?.intent;
// Use intent if provided, otherwise generate based on operation
if (intent) {
return intent;
}
switch (operationType) {
case 'click':
return `Clicking element ${index}`;
case 'input_text':
return `Typing into element ${index}`;
case 'clear':
return `Clearing element ${index}`;
case 'send_keys':
return `Sending keys: ${args?.keys}`;
default:
return 'Interacting with element...';
}
} catch {
return 'Interacting with element...';
}
}
/**
* Override: Format result based on operation type
*/
FormatResultForUI(output: InteractionOutput): string {
if (!output.success) {
return `${output.message}`;
}
switch (output.operationType) {
case 'click':
if (output.newTabOpened) {
return `🖱️ Clicked element (new tab opened)`;
}
return `🖱️ Clicked element`;
case 'input_text':
return `⌨️ Entered text`;
case 'clear':
return `🧹 Cleared field`;
case 'send_keys':
// Extract keys from message
const keysMatch = output.message.match(/Sent keys: (.+)/);
if (keysMatch && keysMatch[1]) {
return `⌨️ Pressed ${keysMatch[1]}`;
}
return `⌨️ Sent keys`;
default:
return `${output.message}`;
}
}
protected async execute(input: InteractionInput): Promise<InteractionOutput> {
return profileAsync(`InteractionTool.execute[${input.operationType}]`, async () => {
// Validate inputs for operations that need them
const requiresIndex = ['click', 'input_text', 'clear'];
if (requiresIndex.includes(input.operationType) && input.index === undefined) {
return {
success: false,
operationType: input.operationType,
message: `${input.operationType} operation requires index parameter`
};
}
switch (input.operationType) {
case 'input_text':
if (!input.text) {
return {
success: false,
operationType: input.operationType,
message: 'input_text operation requires text parameter'
};
}
break;
case 'send_keys':
if (!input.keys) {
return {
success: false,
operationType: input.operationType,
message: 'send_keys operation requires keys parameter'
};
}
break;
}
try {
// Get the current page
const page = await this.executionContext.browserContext.getCurrentPage();
// Execute the operation
switch (input.operationType) {
case 'click':
return await this.clickElement(page, input.index!);
case 'input_text':
return await this.inputText(page, input.index!, input.text!);
case 'clear':
return await this.clearElement(page, input.index!);
case 'send_keys':
return await this.sendKeys(page, input.keys!);
default:
return {
success: false,
operationType: 'click',
message: 'Invalid operation type specified'
};
}
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
return {
success: false,
operationType: input.operationType,
message: `Interaction failed: ${errorMessage}`
};
}
});
}
/**
* Click an element
*/
private async clickElement(page: BrowserPage, index: number): Promise<InteractionOutput> {
return profileAsync(`InteractionTool.clickElement[${index}]`, async () => {
try {
// Get the element from the selector map
const element = await page.getElementByIndex(index);
if (!element) {
return {
success: false,
operationType: 'click',
message: `Element with index ${index} not found`
};
}
// Log element details before clicking
Logging.log('InteractionTool', `Clicking element at index ${index}:`, 'info');
Logging.log('InteractionTool', ` - NodeId: ${element.nodeId}`, 'info');
Logging.log('InteractionTool', ` - Text: "${element.name || '(no text)'}"`, 'info');
Logging.log('InteractionTool', ` - Tag: <${element.attributes?.['html-tag'] || 'unknown'}>`, 'info');
Logging.log('InteractionTool', ` - Type: ${element.type}`, 'info');
// Check if element is a file uploader
if (page.isFileUploader(element)) {
return {
success: false,
operationType: 'click',
message: `Element ${index} opens a file upload dialog. File uploads are not supported in automated mode.`,
elementInfo: {
tagName: element.attributes?.['html-tag'] || 'unknown',
type: 'file'
}
};
}
// Get initial state before click
const initialUrl = page.url();
const initialTabIds = await this.executionContext.browserContext.getAllTabIds();
// Click the element (V2 doesn't support vision)
await page.clickElement(element.nodeId);
// Simple wait after interaction
await new Promise(resolve => setTimeout(resolve, 1000));
let newTabOpened = false;
// Check for new tabs
const currentTabIds = await this.executionContext.browserContext.getAllTabIds();
newTabOpened = currentTabIds.size > initialTabIds.size;
if (newTabOpened) {
const newTabId = Array.from(currentTabIds).find(id => !initialTabIds.has(id));
if (newTabId) {
Logging.log('InteractionTool', `New tab opened with ID: ${newTabId}`, 'info');
await this.executionContext.browserContext.switchTab(newTabId);
}
}
// Get element info for response
const elementInfo = {
tagName: element.attributes?.['html-tag'] || 'unknown',
text: element.name || ''
};
Logging.log('InteractionTool', `Successfully clicked element at index ${index}`, 'info');
return {
success: true,
operationType: 'click',
message: `Clicked element with index ${index}: ${elementInfo.text}`,
elementInfo,
newTabOpened
};
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
// Check for specific error types
if (errorMessage.includes('no longer available')) {
return {
success: false,
operationType: 'click',
message: `Element ${index} is no longer available - the page may have changed`
};
}
return {
success: false,
operationType: 'click',
message: `Failed to click element ${index}: ${errorMessage}`
};
}
});
}
/**
* Input text into an element
*/
private async inputText(page: BrowserPage, index: number, text: string): Promise<InteractionOutput> {
return profileAsync(`InteractionTool.inputText[${index}]`, async () => {
try {
// Get the element from the selector map
const element = await page.getElementByIndex(index);
if (!element) {
return {
success: false,
operationType: 'input_text',
message: `Element with index ${index} not found`
};
}
// Log element details before inputting text
Logging.log('InteractionTool', `Inputting text into element at index ${index}:`, 'info');
Logging.log('InteractionTool', ` - NodeId: ${element.nodeId}`, 'info');
Logging.log('InteractionTool', ` - Text: "${element.name || '(no text)'}"`, 'info');
Logging.log('InteractionTool', ` - Tag: <${element.attributes?.['html-tag'] || 'unknown'}>`, 'info');
Logging.log('InteractionTool', ` - Type: ${element.type}`, 'info');
Logging.log('InteractionTool', ` - Input text: "${text}"`, 'info');
// Input the text (V2 doesn't support vision)
await page.inputText(element.nodeId, text);
// Get element info for response
const elementInfo = {
tagName: element.attributes?.['html-tag'] || 'unknown',
type: 'text',
value: text
};
Logging.log('InteractionTool', `Successfully input text into element at index ${index}`, 'info');
return {
success: true,
operationType: 'input_text',
message: `Input text into element with index ${index}`,
elementInfo
};
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
// Check for specific error types
if (errorMessage.includes('not an input')) {
return {
success: false,
operationType: 'input_text',
message: `Element ${index} is not an input field`
};
}
return {
success: false,
operationType: 'input_text',
message: `Failed to input text into element ${index}: ${errorMessage}`
};
}
});
}
/**
* Clear text from an element
*/
private async clearElement(page: BrowserPage, index: number): Promise<InteractionOutput> {
return profileAsync(`InteractionTool.clearElement[${index}]`, async () => {
try {
// Get the element from the selector map
const element = await page.getElementByIndex(index);
if (!element) {
return {
success: false,
operationType: 'clear',
message: `Element with index ${index} not found`
};
}
// Log element details before clearing
Logging.log('InteractionTool', `Clearing element at index ${index}:`, 'info');
Logging.log('InteractionTool', ` - NodeId: ${element.nodeId}`, 'info');
Logging.log('InteractionTool', ` - Text: "${element.name || '(no text)'}"`, 'info');
Logging.log('InteractionTool', ` - Tag: <${element.attributes?.['html-tag'] || 'unknown'}>`, 'info');
Logging.log('InteractionTool', ` - Type: ${element.type}`, 'info');
// Clear the element using the new V2 API
await page.clearElement(element.nodeId);
// Get element info for response
const elementInfo = {
tagName: element.attributes?.['html-tag'] || 'unknown',
type: 'text',
value: ''
};
Logging.log('InteractionTool', `Successfully cleared element at index ${index}`, 'info');
return {
success: true,
operationType: 'clear',
message: `Cleared element with index ${index}`,
elementInfo
};
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
return {
success: false,
operationType: 'clear',
message: `Failed to clear element ${index}: ${errorMessage}`
};
}
});
}
/**
* Send keyboard keys
*/
private async sendKeys(page: BrowserPage, keys: string): Promise<InteractionOutput> {
return profileAsync(`InteractionTool.sendKeys[${keys}]`, async () => {
try {
// Log the keys being sent
Logging.log('InteractionTool', `Sending keys: "${keys}"`, 'info');
await page.sendKeys(keys);
return {
success: true,
operationType: 'send_keys',
message: `Sent keys: ${keys}`
};
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
return {
success: false,
operationType: 'send_keys',
message: `Failed to send keys: ${errorMessage}`
};
}
});
}
}