mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-21 12:55:09 +00:00
* clean-up bunch of files for re-write * more clean-up and adding basic agent * Minor fix moved types into respective files. * Deleted bunch of old files backup Update gitignore Deleted a bunch of files Remove message manager Deleted old docs Update rules rename Profiler to profiler * Temporarily adding old code * Adding two small things back * backup * Implemented LangChainProvider and updated cursor rules backup LangChainProvider curosr rules * Implement tests for LangChainProvider -- unit test and integration test integration test passes integration test backup * Tool Design Tools Desing tools design * NavigationTool ready NavigationTool ready NavigationTool ready NaivgationTool ready backup * MessageManager MessageManager backup * Fixed integration test * Agent design new Updated agent design and added bunch of /NTN commands agent new design * Delete old agent design * MessageManagerReadOnly class * PlannerTool ready PlannerTool almost ready * ToolManager and DoneTool * Integration of BrowserAgent * BrowserAgent implementation v0.1 * BrowserAgent small fix v0.2 * Tool calling design too call design tool design claude * Update agent tool design with // NTN * add zod-to-json npm install * BrowserAGent v0.3 * BrowserAgent v0.4 * BrowserAgent v0.5 * fixes * Build error fixes in my NEWLY added code build errors fix * Build error fixes in old code (integration work) backup * Comment StreamEventProcessor for now, it is not used * Small build error fix * Small rename * Added integration test to check structuredLLM and changed to 4o-mini change default to nxtscape integration test * Small docstring * Simplified BrowserAgent code and added integration test Simplified BrowserAgent code BrowserAGent integrationt est * Update CLAUDE.md with project memory and instructions on how to write code Update CLAUDE.md with project memory and instructions on how to write code Project Memory * Just a mova.. Moved ToolManager outside. Build works. 
* TabOperations tool TabOperations Tool and fixing some test tab operations * Update CLAUDE.md * Added ClassificationTool classifiction tool classification prommpt * Refactored and simplified PlannerTool unit test and integration test * Updated Plnnaer tool * Update CLAUDE.md * BrowserAgent modified to do classification BrowserAgent with classification * minor fix to ToolManager * Instead of ToolCall and ToolResult -- just updating message manager once * minor fix to BrowserAgent integration test * Changed done to "done_tool" * Updated CLAUDE.md to reflect understanding of claude * Uncommented stream event processor * Renamed EventBus to StreamEventBus * Commented StreamEventProcessor * Event Processor * Integrated EventProcessor with BrowserAgent Added EventProcessor to BrowserAgetn * Renamed StreamEventBus to EventBus * Made EventBus required parameter in ExecutionContext * PlanGenerator rewrite PlanGenerator rewrite backup * For simple task, explicitly tell it to call done tool * Max attempts for simple task * backup * Revert "backup" This reverts commit 7d79a3d4d5774bfef79ec9827878b74edad3593f. * Consolidating where EventBus and EventProcessor are created and initialized backup * Update CLAUDE.md Update CLAUDE.md * Improving agent loop code Cleaned up processTooCall classification task * Create test-writer subAgent test-agent-prompt test agent prompt test-agent-prompt Update test-writer.md * BrowserAgent test Browseragent test BrowserAgent test * BrowserAgent refactor backup backup * Minor fixes * Minor fix * minor change -- NEW AGENT LOOP IS WORKING WELL * Update cursor rules * Small change * Improved BrowserAgent integration test Improved BrowserAgent integration test * Small change * Update CLAUDE.md * Different tools * FindElementTool is ready Find element update backup find element backup * Updated to test strings to say "tests..." 
* ScrollTool is ready * RefreshStateTool is updated as well * MessageManager updated * SearchTool is ready backup * Interaction Element is also ready * Add debugMessage emitter * ValidatorTool ready and tests are passing Validation Tool validator tool backup backup * GroupTabs tool ready * Registered all the tools * Planning changed to 5 steps * BrowserAgent integration test fix * Minor string changes * backup * Removed too many confusing events in EventProcessor -- there is only event.info right now * Abort control implemented backup Abort * Formatter for toolResult Formatter for toolResult backup * Always render using Markdown * Minor fix --------- Co-authored-by: Nikhil Sonti <nikhilsv92@gmail.com>
258 lines
8.9 KiB
TypeScript
258 lines
8.9 KiB
TypeScript
import { z } from 'zod';
|
|
import { createReactAgent } from '@langchain/langgraph/prebuilt';
|
|
import { Logging } from '@/lib/utils/Logging';
|
|
import { RunnableConfig } from '@langchain/core/runnables';
|
|
|
|
// Import base agent
|
|
import { BaseAgent, AgentOptions, AgentInput } from './BaseAgent';
|
|
|
|
// Import new tool system
|
|
import { ToolRegistry } from '@/lib/tools/base';
|
|
import { ExtractTool } from '@/lib/tools/answer/ExtractTool';
|
|
import { NavigationTool } from '@/lib/tools/browser-navigation/NavigationTool';
|
|
import { FindElementTool } from '@/lib/tools/browser-navigation/FindElementTool';
|
|
import { InteractionTool } from '@/lib/tools/browser-navigation/InteractionTool';
|
|
import { ScrollTool } from '@/lib/tools/browser-navigation/ScrollTool';
|
|
import { SearchTool } from '@/lib/tools/browser-navigation/SearchTool';
|
|
import { TabOperationsTool } from '@/lib/tools/tab/TabOperationsTool';
|
|
import { DoneTool } from '@/lib/tools/utility/DoneTool';
|
|
import { WaitTool } from '@/lib/tools/utility/WaitTool';
|
|
import { TodoListManagerTool } from '@/lib/tools/utility/TodoListManagerTool';
|
|
|
|
// Import prompt
|
|
import { BrowseAgentPrompt } from '@/lib/prompts/BrowseAgentPrompt';
|
|
import { RefreshStateTool } from '../tools';
|
|
|
|
/**
 * Zod schema for the structured result produced by a browse agent run.
 */
export const BrowseOutputSchema = z.object({
  /** Whether the browsing task was completed. */
  completed: z.boolean(),

  /** List of actions performed. */
  actions_taken: z.array(z.string()),

  /** Description of the final page state. */
  final_state: z.string(),

  /** Any data extracted during browsing (optional). */
  extracted_data: z.record(z.unknown()).optional(),
});
|
|
|
|
/**
 * Static TypeScript type inferred from {@link BrowseOutputSchema}.
 */
export type BrowseOutput = z.infer<typeof BrowseOutputSchema>;
|
|
|
|
/**
 * Agent specialized for web browsing automation using the ReAct pattern.
 * Uses tools to complete complex web tasks through multi-step reasoning.
 */
export class BrowseAgent extends BaseAgent {
  /**
   * Creates a new instance of BrowseAgent.
   * @param options - Configuration options for the browse agent
   */
  constructor(options: AgentOptions) {
    super(options);
  }

  /**
   * Override: Create the tool registry for the agent.
   * @returns ToolRegistry populated with navigation, tab, utility and extraction tools
   */
  protected createToolRegistry(): ToolRegistry {
    const registry = new ToolRegistry();

    // Register only browser navigation, extraction, and utility tools
    registry.registerAll([
      // Browser navigation tools
      new NavigationTool(this.executionContext),
      new FindElementTool(this.executionContext),
      new SearchTool(this.executionContext),
      new InteractionTool(this.executionContext),
      new ScrollTool(this.executionContext),
      // Tab management tools
      new TabOperationsTool(this.executionContext),
      // Utility tools
      new DoneTool(this.executionContext),
      // new WaitTool(this.executionContext),
      new TodoListManagerTool(this.executionContext),
      // Extraction tools
      new ExtractTool(this.executionContext),
      new RefreshStateTool(this.executionContext),
    ]);

    return registry;
  }

  /**
   * Override: Generate the system prompt for the browse agent.
   * Tool documentation comes from the tool registry; an empty string is used
   * when the registry is not available.
   * @returns System prompt string
   */
  protected generateSystemPrompt(): string {
    // Use the tool registry to generate documentation
    const toolDocs = this.toolRegistry?.generateSystemPrompt() || '';

    // Create and use the browse agent prompt with tool documentation
    const promptGenerator = new BrowseAgentPrompt(toolDocs);
    return promptGenerator.generate();
  }

  /**
   * Override: Get the agent name for logging.
   * @returns Agent name
   */
  protected getAgentName(): string {
    return 'BrowseAgent';
  }

  /**
   * Removes the browser-state and system messages this agent added to the
   * message manager and resets the corresponding tracking flags.
   * Does nothing for a message kind whose flag is not set.
   */
  private cleanupInjectedMessages(): void {
    if (this.stateMessageAdded) {
      this.executionContext.messageManager.removeBrowserStateMessages();
      this.stateMessageAdded = false;
    }

    if (this.systemPromptAdded) {
      this.executionContext.messageManager.removeSystemMessage();
      this.systemPromptAdded = false;
    }
  }

  /**
   * Execute the browsing agent: prepares the message history (system prompt,
   * browser state, selected-tabs context), runs a ReAct agent with streaming,
   * and summarizes the tool calls that were made.
   *
   * Errors are not rethrown; they are logged and reported through a result
   * with `completed: false`.
   *
   * @param input - Agent input containing instruction and context
   * @param config - Optional configuration for LangGraph web compatibility
   * @returns Parsed browse output
   */
  protected async executeAgent(
    input: AgentInput,
    config?: RunnableConfig
  ): Promise<BrowseOutput> {
    try {
      await this.ensureInitialized();

      // 1. Add system prompt (agent-specific)
      // TODO: do we need to add system prompt here? we add as messageModifier below
      this.executionContext.messageManager.addSystemMessage(this.systemPrompt, 0);
      this.systemPromptAdded = true;

      // 2. Add browser state before execution
      if (!this.stateMessageAdded) {
        const browserState = await this.executionContext.browserContext.getBrowserStateString();
        this.executionContext.messageManager.addBrowserStateMessage(browserState);
        this.stateMessageAdded = true;

        // Debug: Log browser state details
        const currentPage = await this.browserContext.getCurrentPage();
        this.log('🌐 Browser state captured', 'info', {
          url: currentPage.url(),
          title: await currentPage.title(),
          useVision: this.options.useVision,
          hasScreenshot: browserState.includes('Screenshot:')
        });
      }

      // Add selected tabs instruction if any
      const selectedTabsInstruction = await this.getSelectedTabsInstruction();
      if (selectedTabsInstruction) {
        this.executionContext.messageManager.addHumanMessage(`[Context: ${selectedTabsInstruction}]`);
      }

      // Get LLM and tools
      const llm = await this.getLLM();
      const tools = this.createTools();

      // A missing LLM type must not be treated as Gemini, so default to ''
      // before searching for the provider marker.
      const llmType = llm._llmType();
      const isGemini = (llmType ?? '').includes('google');

      const messages = this.executionContext.messageManager.getMessages(isGemini);

      // Debug: Log agent configuration
      this.log('🤖 Creating browse agent', 'info', {
        instruction: input.instruction,
        toolCount: tools.length,
        tools: tools.map(t => t.name),
        llmType,
        messageCount: messages.length,
        hasSelectedTabs: !!selectedTabsInstruction
      });

      // Create ReAct agent
      const agent = createReactAgent({
        llm,
        tools,
      });

      // Use centralized streaming execution
      const { result, allMessages } = await this.executeReactAgentWithStreaming(
        agent,
        input.instruction,
        config,
        messages
      );

      // 3. Remove browser state and system prompt after execution
      this.cleanupInjectedMessages();

      // Extract the final message content; fall back to a generic string when
      // the last message is missing or its content is not a plain string.
      const lastMessage = allMessages[allMessages.length - 1];
      const finalContent = typeof lastMessage?.content === 'string'
        ? lastMessage.content
        : 'Task completed';

      // Check for done tool usage to determine completion
      // NOTE(review): assumes DoneTool is exposed under the name 'done' — confirm.
      const actionsTaken: string[] = [];
      let completed = false;

      for (const message of allMessages) {
        // Check if message has tool_calls property (type guard)
        if ('tool_calls' in message && message.tool_calls && Array.isArray(message.tool_calls)) {
          for (const toolCall of message.tool_calls) {
            actionsTaken.push(`${toolCall.name}: ${JSON.stringify(toolCall.args)}`);
            if (toolCall.name === 'done') {
              completed = true;
            }
          }
        }
      }

      // Debug: Log execution results
      this.log('🏁 Browse execution complete', 'info', {
        completed,
        actionCount: actionsTaken.length,
        toolCalls: actionsTaken.map(action => action.split(':')[0]),
        finalStateLength: finalContent.length
      });

      return {
        completed,
        actions_taken: actionsTaken,
        final_state: finalContent,
        extracted_data: input.context || {} // Include any context data passed in
      };
    } catch (error) {
      // Ensure state and system prompt are cleaned up on error
      this.cleanupInjectedMessages();

      const errorMessage = error instanceof Error ? error.message : String(error);

      // Debug: Log error details
      this.log('❌ Browse task failed', 'error', {
        error: errorMessage,
        stack: error instanceof Error ? error.stack : undefined
      });

      return {
        completed: false,
        actions_taken: [],
        final_state: `Task failed: ${errorMessage}`,
        extracted_data: {}
      };
    }
  }
}
|