mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-20 04:21:23 +00:00
* clean-up bunch of files for re-write * more clean-up and adding basic agent * Minor fix moved types into respective files. * Deleted bunch of old files backup Update gitignore Deleted a bunch of files Remove message manager Deleted old docs Update rules rename Profiler to profiler * Temporarily adding old code * Adding two small things back * backup * Implemented LangChainProvider and updated cursor rules backup LangChainProvider curosr rules * Implement tests for LangChainProvider -- unit test and integration test integration test passes integration test backup * Tool Design Tools Desing tools design * NavigationTool ready NavigationTool ready NavigationTool ready NaivgationTool ready backup * MessageManager MessageManager backup * Fixed integration test * Agent design new Updated agent design and added bunch of /NTN commands agent new design * Delete old agent design * MessageManagerReadOnly class * PlannerTool ready PlannerTool almost ready * ToolManager and DoneTool * Integration of BrowserAgent * BrowserAgent implementation v0.1 * BrowserAgent small fix v0.2 * Tool calling design too call design tool design claude * Update agent tool design with // NTN * add zod-to-json npm install * BrowserAGent v0.3 * BrowserAgent v0.4 * BrowserAgent v0.5 * fixes * Build error fixes in my NEWLY added code build errors fix * Build error fixes in old code (integration work) backup * Comment StreamEventProcessor for now, it is not used * Small build error fix * Small rename * Added integration test to check structuredLLM and changed to 4o-mini change default to nxtscape integration test * Small docstring * Simplified BrowserAgent code and added integration test Simplified BrowserAgent code BrowserAGent integrationt est * Update CLAUDE.md with project memory and instructions on how to write code Update CLAUDE.md with project memory and instructions on how to write code Project Memory * Just a mova.. Moved ToolManager outside. Build works. * TabOperations tool TabOperations Tool and fixing some test tab operations * Update CLAUDE.md * Added ClassificationTool classifiction tool classification prommpt * Refactored and simplified PlannerTool unit test and integration test * Updated Plnnaer tool * Update CLAUDE.md * BrowserAgent modified to do classification BrowserAgent with classification * minor fix to ToolManager * Instead of ToolCall and ToolResult -- just updating message manager once * minor fix to BrowserAgent integration test * Changed done to "done_tool" * Updated CLAUDE.md to reflect understanding of claude * Uncommented stream event processor * Renamed EventBus to StreamEventBus * Commented StreamEventProcessor * Event Processor * Integrated EventProcessor with BrowserAgent Added EventProcessor to BrowserAgetn * Renamed StreamEventBus to EventBus * Made EventBus required parameter in ExecutionContext * PlanGenerator rewrite PlanGenerator rewrite backup * For simple task, explicitly tell it to call done tool * Max attempts for simple task * backup * Revert "backup" This reverts commit 7d79a3d4d5774bfef79ec9827878b74edad3593f. * Consolidating where EventBus and EventProcessor are created and initialized backup * Update CLAUDE.md Update CLAUDE.md * Improving agent loop code Cleaned up processTooCall classification task * Create test-writer subAgent test-agent-prompt test agent prompt test-agent-prompt Update test-writer.md * BrowserAgent test Browseragent test BrowserAgent test * BrowserAgent refactor backup backup * Minor fixes * Minor fix * minor change -- NEW AGENT LOOP IS WORKING WELL * Update cursor rules * Small change * Improved BrowserAgent integration test Improved BrowserAgent integration test * Small change * Update CLAUDE.md * Different tools * FindElementTool is ready Find element update backup find element backup * Updated to test strings to say "tests..." * ScrollTool is ready * RefreshStateTool is updated as well * MessageManager updated * SearchTool is ready backup * Interaction Element is also ready * Add debugMessage emitter * ValidatorTool ready and tests are passing Validation Tool validator tool backup backup * GroupTabs tool ready * Registered all the tools * Planning changed to 5 steps * BrowserAgent integration test fix * Minor string changes * backup * Removed too many confusing events in EventProcessor -- there is only event.info right now * Abort control implemented backup Abort * Formatter for toolResult Formatter for toolResult backup * Always render using Markdown * Minor fix --------- Co-authored-by: Nikhil Sonti <nikhilsv92@gmail.com>
537 lines
16 KiB
TypeScript
537 lines
16 KiB
TypeScript
import { z } from 'zod';
|
|
import BrowserPage from './BrowserPage';
|
|
import { Logging } from '../utils/Logging';
|
|
import { profileAsync } from '../utils/Profiler';
|
|
|
|
// ============= Browser Context Configuration =============
|
|
|
|
// Browser context window size schema
|
|
export const BrowserContextWindowSizeSchema = z.object({
|
|
width: z.number().int().positive(), // Window width in pixels
|
|
height: z.number().int().positive() // Window height in pixels
|
|
})
|
|
|
|
export type BrowserContextWindowSize = z.infer<typeof BrowserContextWindowSizeSchema>
|
|
|
|
// Browser context configuration schema
|
|
export const BrowserContextConfigSchema = z.object({
|
|
maximumWaitPageLoadTime: z.number().default(5.0), // Maximum time to wait for page load
|
|
waitBetweenActions: z.number().default(0.1), // Time to wait between multiple actions
|
|
homePageUrl: z.string().default('https://www.google.com'), // Home page url
|
|
useVision: z.boolean().default(true) // Use vision mode
|
|
})
|
|
|
|
export type BrowserContextConfig = z.infer<typeof BrowserContextConfigSchema>
|
|
|
|
// Default configuration
|
|
export const DEFAULT_BROWSER_CONTEXT_CONFIG: BrowserContextConfig = BrowserContextConfigSchema.parse({})
|
|
|
|
// Tab info schema
|
|
export const TabInfoSchema = z.object({
|
|
id: z.number().int().positive(), // Tab ID
|
|
url: z.string(), // Tab URL
|
|
title: z.string() // Tab title
|
|
})
|
|
|
|
export type TabInfo = z.infer<typeof TabInfoSchema>
|
|
|
|
// Browser state schema for V2
|
|
export const BrowserStateSchema = z.object({
|
|
// Current tab info
|
|
tabId: z.number(), // Current tab ID
|
|
url: z.string(), // Current page URL
|
|
title: z.string(), // Current page title
|
|
|
|
// All tabs info
|
|
tabs: z.array(TabInfoSchema), // All open tabs
|
|
|
|
// Interactive elements as structured data
|
|
clickableElements: z.array(z.object({
|
|
nodeId: z.number(), // Chrome BrowserOS node ID
|
|
text: z.string(), // Element text (axName or tag)
|
|
tag: z.string() // HTML tag name
|
|
})), // Clickable elements with nodeId, text, and tag
|
|
|
|
typeableElements: z.array(z.object({
|
|
nodeId: z.number(), // Chrome BrowserOS node ID
|
|
text: z.string(), // Element text (axName or tag)
|
|
tag: z.string() // HTML tag name
|
|
})), // Typeable elements with nodeId, text, and tag
|
|
|
|
// Pre-formatted strings for display
|
|
clickableElementsString: z.string(), // Formatted string of clickable elements
|
|
typeableElementsString: z.string(), // Formatted string of typeable elements
|
|
|
|
// Hierarchical structure from BrowserOS API
|
|
hierarchicalStructure: z.string().nullable().optional(), // Hierarchical text representation with context
|
|
|
|
// Screenshot support (base64 encoded)
|
|
screenshot: z.string().nullable().optional() // Base64 encoded screenshot data
|
|
})
|
|
|
|
export type BrowserState = z.infer<typeof BrowserStateSchema>
|
|
|
|
// Error classes
|
|
export class BrowserError extends Error {
|
|
constructor(message?: string) {
|
|
super(message)
|
|
this.name = 'BrowserError'
|
|
}
|
|
}
|
|
|
|
export class URLNotAllowedError extends BrowserError {
|
|
constructor(message?: string) {
|
|
super(message)
|
|
this.name = 'URLNotAllowedError'
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Simplified BrowserContext that uses BrowserPageV2
|
|
*
|
|
* Key differences from V1:
|
|
* - No Puppeteer dependencies
|
|
* - No tab attachment/detachment logic (pages are always "attached")
|
|
* - Simplified state management
|
|
* - Direct Chrome API usage
|
|
*/
|
|
export class BrowserContext {
|
|
private _config: BrowserContextConfig;
|
|
private _userSelectedTabIds: number[] | null = null;
|
|
private _executionLockedTabId: number | null = null;
|
|
|
|
// Simple page cache - no attachment state needed
|
|
private _pageCache: Map<number, BrowserPage> = new Map();
|
|
|
|
constructor(config: Partial<BrowserContextConfig> = {}) {
|
|
this._config = { ...DEFAULT_BROWSER_CONTEXT_CONFIG, ...config };
|
|
}
|
|
|
|
public getConfig(): BrowserContextConfig {
|
|
return this._config;
|
|
}
|
|
|
|
public updateConfig(config: Partial<BrowserContextConfig>): void {
|
|
this._config = { ...this._config, ...config };
|
|
}
|
|
|
|
// ============= Core Page Operations =============
|
|
|
|
/**
|
|
* Get or create a Page instance for a tab
|
|
*/
|
|
private async _getOrCreatePage(tab: chrome.tabs.Tab): Promise<BrowserPage> {
|
|
if (!tab.id) {
|
|
throw new Error('Tab ID is not available');
|
|
}
|
|
|
|
// Check cache
|
|
const existingPage = this._pageCache.get(tab.id);
|
|
if (existingPage) {
|
|
return existingPage;
|
|
}
|
|
|
|
// Create new page
|
|
const page = new BrowserPage(tab.id, tab.url || 'Unknown URL', tab.title || 'Unknown Title');
|
|
this._pageCache.set(tab.id, page);
|
|
|
|
Logging.log('BrowserContextV2', `Created page for tab ${tab.id}`);
|
|
return page;
|
|
}
|
|
|
|
/**
|
|
* Get the current page
|
|
*/
|
|
public async getCurrentPage(): Promise<BrowserPage> {
|
|
return profileAsync('BrowserContext.getCurrentPage', async () => {
|
|
const targetTab = await this.getTargetTab();
|
|
|
|
if (!targetTab.id) {
|
|
throw new Error('Target tab has no ID');
|
|
}
|
|
|
|
const page = await this._getOrCreatePage(targetTab);
|
|
|
|
// Set execution lock for single-tab operations
|
|
if (!this._executionLockedTabId) {
|
|
this.lockExecutionToTab(targetTab.id);
|
|
}
|
|
|
|
return page;
|
|
});
|
|
}
|
|
|
|
// ============= Tab Management =============
|
|
|
|
/**
|
|
* Switch to a different tab
|
|
*/
|
|
public async switchTab(tabId: number): Promise<BrowserPage> {
|
|
return profileAsync(`BrowserContext.switchTab[${tabId}]`, async () => {
|
|
Logging.log('BrowserContextV2', `Switching to tab ${tabId}`);
|
|
|
|
await chrome.tabs.update(tabId, { active: true });
|
|
const tab = await chrome.tabs.get(tabId);
|
|
|
|
const page = await this._getOrCreatePage(tab);
|
|
this._executionLockedTabId = tabId;
|
|
|
|
return page;
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Get tab information
|
|
*/
|
|
public async getTabs(): Promise<TabInfo[]> {
|
|
const tabs = await chrome.tabs.query({});
|
|
const tabInfos: TabInfo[] = [];
|
|
|
|
for (const tab of tabs) {
|
|
if (tab.id && tab.url && tab.title) {
|
|
tabInfos.push({
|
|
id: tab.id,
|
|
url: tab.url,
|
|
title: tab.title,
|
|
});
|
|
}
|
|
}
|
|
return tabInfos;
|
|
}
|
|
|
|
// ============= Navigation Operations =============
|
|
|
|
/**
|
|
* Navigate to a URL
|
|
*/
|
|
public async navigateTo(url: string): Promise<void> {
|
|
const page = await this.getCurrentPage();
|
|
await page.navigateTo(url);
|
|
}
|
|
|
|
/**
|
|
* Open a new tab with URL
|
|
*/
|
|
public async openTab(url: string): Promise<BrowserPage> {
|
|
return profileAsync('BrowserContext.openTab', async () => {
|
|
// Create the new tab
|
|
const tab = await chrome.tabs.create({ url, active: true });
|
|
if (!tab.id) {
|
|
throw new Error('No tab ID available');
|
|
}
|
|
|
|
// Wait a bit for tab to initialize
|
|
await new Promise(resolve => setTimeout(resolve, 100));
|
|
|
|
// Get updated tab information
|
|
const updatedTab = await chrome.tabs.get(tab.id);
|
|
const page = await this._getOrCreatePage(updatedTab);
|
|
this._executionLockedTabId = tab.id;
|
|
|
|
return page;
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Close a tab
|
|
*/
|
|
public async closeTab(tabId: number): Promise<void> {
|
|
// Remove from cache
|
|
this._pageCache.delete(tabId);
|
|
|
|
// Close the tab
|
|
await chrome.tabs.remove(tabId);
|
|
|
|
// Update execution locked tab id if needed
|
|
if (this._executionLockedTabId === tabId) {
|
|
this._executionLockedTabId = null;
|
|
}
|
|
|
|
// Remove from user selected tabs if present
|
|
if (this._userSelectedTabIds && this._userSelectedTabIds.includes(tabId)) {
|
|
this._userSelectedTabIds = this._userSelectedTabIds.filter(id => id !== tabId);
|
|
}
|
|
}
|
|
|
|
// ============= State Operations =============
|
|
|
|
/**
|
|
* Get detailed browser state description for agents
|
|
*/
|
|
public async getBrowserStateString(): Promise<string> {
|
|
return profileAsync('BrowserContext.getBrowserStateString', async () => {
|
|
try {
|
|
// Use the structured getBrowserState API
|
|
const browserState = await this.getBrowserState();
|
|
|
|
// Format current tab
|
|
const currentTab = `{id: ${browserState.tabId}, url: ${browserState.url}, title: ${browserState.title}}`;
|
|
|
|
// Format other tabs
|
|
const otherTabs = browserState.tabs
|
|
.filter(tab => tab.id !== browserState.tabId)
|
|
.map(tab => `- {id: ${tab.id}, url: ${tab.url}, title: ${tab.title}}`);
|
|
|
|
// Get current date/time
|
|
const timeStr = new Date().toISOString().slice(0, 16).replace('T', ' ');
|
|
|
|
// Combine clickable and typeable elements
|
|
|
|
let elementsText = '';
|
|
const parts: string[] = [];
|
|
if (browserState.clickableElementsString) {
|
|
parts.push('Clickable elements:\n' + browserState.clickableElementsString);
|
|
}
|
|
if (browserState.typeableElementsString) {
|
|
parts.push('Input fields:\n' + browserState.typeableElementsString);
|
|
}
|
|
elementsText = parts.join('\n\n') || 'No interactive elements found';
|
|
|
|
|
|
// Include hierarchical structure if available
|
|
// elementsText = browserState.hierarchicalStructure? `\n\nHierarchical structure:\n${browserState.hierarchicalStructure}` : '';
|
|
|
|
// Build state description
|
|
const stateDescription = `
|
|
BROWSER STATE:
|
|
Current tab: ${currentTab}
|
|
Other available tabs:
|
|
${otherTabs.join('\n ')}
|
|
Current date and time: ${timeStr}
|
|
|
|
Interactive elements from the current page (numbers in [brackets] are nodeIds):
|
|
${elementsText}
|
|
`;
|
|
|
|
return stateDescription;
|
|
} catch (error) {
|
|
Logging.log('BrowserContextV2', `Failed to get detailed browser state: ${error}`, 'warning');
|
|
const currentPage = await this.getCurrentPage();
|
|
const url = await currentPage.url();
|
|
const title = await currentPage.title();
|
|
return `BROWSER STATE:\nCurrent page: ${url} - ${title}`;
|
|
}
|
|
});
|
|
}
|
|
|
|
// ============= Multi-Tab Operations =============
|
|
|
|
/**
|
|
* Get pages for specific tab IDs
|
|
*/
|
|
public async getPages(tabIds?: number[]): Promise<BrowserPage[]> {
|
|
try {
|
|
// If no tab IDs provided, return current page
|
|
if (!tabIds || tabIds.length === 0) {
|
|
const currentPage = await this.getCurrentPage();
|
|
return [currentPage];
|
|
}
|
|
|
|
// Get pages for specified tabs
|
|
const pages: BrowserPage[] = [];
|
|
|
|
for (const tabId of tabIds) {
|
|
try {
|
|
const tab = await chrome.tabs.get(tabId);
|
|
const page = await this._getOrCreatePage(tab);
|
|
pages.push(page);
|
|
} catch (error) {
|
|
Logging.log('BrowserContextV2', `Failed to get page for tab ${tabId}: ${error}`, 'warning');
|
|
}
|
|
}
|
|
|
|
if (pages.length === 0) {
|
|
throw new Error(`Failed to get any of the selected tabs (${tabIds.join(', ')})`);
|
|
}
|
|
|
|
return pages;
|
|
} catch (error) {
|
|
Logging.log('BrowserContextV2', `Error getting pages: ${error}`, 'error');
|
|
return [];
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Get all tab IDs from the current window
|
|
*/
|
|
public async getAllTabIds(): Promise<Set<number>> {
|
|
try {
|
|
const tabs = await chrome.tabs.query({ currentWindow: true });
|
|
return new Set(tabs.map(tab => tab.id).filter((id): id is number => id !== undefined));
|
|
} catch (error) {
|
|
Logging.log('BrowserContextV2', `Failed to get tab IDs: ${error}`, 'warning');
|
|
return new Set();
|
|
}
|
|
}
|
|
|
|
// ============= Execution Lock Management =============
|
|
|
|
/**
|
|
* Get the target tab for operations
|
|
*/
|
|
private async getTargetTab(): Promise<chrome.tabs.Tab> {
|
|
// Check if we're in a locked execution context
|
|
if (this._executionLockedTabId) {
|
|
try {
|
|
const tab = await chrome.tabs.get(this._executionLockedTabId);
|
|
if (tab) {
|
|
return tab;
|
|
}
|
|
} catch (error) {
|
|
Logging.log('BrowserContextV2', `Execution-locked tab ${this._executionLockedTabId} no longer exists`, 'warning');
|
|
this._executionLockedTabId = null;
|
|
}
|
|
}
|
|
|
|
// No locked tab - use the active tab
|
|
const [activeTab] = await chrome.tabs.query({ active: true, currentWindow: true });
|
|
if (!activeTab?.id) {
|
|
throw new Error('No active tab available');
|
|
}
|
|
|
|
return activeTab;
|
|
}
|
|
|
|
/**
|
|
* Lock execution to a specific tab
|
|
*/
|
|
public lockExecutionToTab(tabId: number): void {
|
|
this._executionLockedTabId = tabId;
|
|
Logging.log('BrowserContextV2', `Execution locked to tab ${tabId}`);
|
|
}
|
|
|
|
/**
|
|
* Unlock execution
|
|
*/
|
|
public async unlockExecution(): Promise<void> {
|
|
const previousLockedTab = this._executionLockedTabId;
|
|
this._executionLockedTabId = null;
|
|
Logging.log('BrowserContextV2', `Execution unlocked${previousLockedTab ? ` (was locked to tab ${previousLockedTab})` : ''}`);
|
|
}
|
|
|
|
// ============= Window Management =============
|
|
|
|
public async getCurrentWindow(): Promise<chrome.windows.Window> {
|
|
try {
|
|
const tab = await this.getTargetTab();
|
|
if (tab && tab.windowId) {
|
|
const window = await chrome.windows.get(tab.windowId);
|
|
return window;
|
|
}
|
|
} catch (error) {
|
|
Logging.log('BrowserContextV2', `Failed to get window from target tab: ${error}`, 'warning');
|
|
}
|
|
|
|
// Fall back to current window
|
|
const window = await chrome.windows.getCurrent();
|
|
if (window) {
|
|
return window;
|
|
}
|
|
|
|
throw new Error('No window found');
|
|
}
|
|
|
|
/**
|
|
* Get structured browser state (V2 clean API)
|
|
* @returns BrowserState object with current page info and interactive elements
|
|
*/
|
|
public async getBrowserState(): Promise<BrowserState> {
|
|
return profileAsync('BrowserContext.getBrowserState', async () => {
|
|
try {
|
|
const currentPage = await this.getCurrentPage();
|
|
const tabs = await this.getTabs();
|
|
|
|
// Get current page info
|
|
const url = await currentPage.url();
|
|
const title = await currentPage.title();
|
|
const tabId = currentPage.tabId;
|
|
|
|
// Get formatted strings from the page
|
|
const clickableElementsString = await currentPage.getClickableElementsString();
|
|
const typeableElementsString = await currentPage.getTypeableElementsString();
|
|
|
|
// Get structured elements from the page
|
|
const clickableElements = await currentPage.getClickableElements();
|
|
const typeableElements = await currentPage.getTypeableElements();
|
|
|
|
// Get hierarchical structure
|
|
const hierarchicalStructure = await currentPage.getHierarchicalStructure();
|
|
|
|
// Get screenshot if vision is enabled
|
|
let screenshot: string | null = null;
|
|
if (this._config.useVision) {
|
|
screenshot = await currentPage.takeScreenshot();
|
|
}
|
|
|
|
// Build structured state
|
|
const state: BrowserState = {
|
|
// Current tab info
|
|
tabId,
|
|
url,
|
|
title,
|
|
|
|
// All tabs
|
|
tabs,
|
|
|
|
// Interactive elements
|
|
clickableElements,
|
|
typeableElements,
|
|
|
|
// Pre-formatted strings
|
|
clickableElementsString,
|
|
typeableElementsString,
|
|
|
|
// Hierarchical structure
|
|
hierarchicalStructure,
|
|
|
|
// Screenshot (if vision enabled)
|
|
screenshot
|
|
};
|
|
|
|
return state;
|
|
} catch (error) {
|
|
Logging.log('BrowserContextV2', `Failed to get state: ${error}`, 'warning');
|
|
|
|
// Return minimal state on error
|
|
const minimalState: BrowserState = {
|
|
tabId: 0,
|
|
url: 'about:blank',
|
|
title: 'New Tab',
|
|
tabs: [],
|
|
clickableElements: [],
|
|
typeableElements: [],
|
|
clickableElementsString: '',
|
|
typeableElementsString: '',
|
|
hierarchicalStructure: null,
|
|
screenshot: null
|
|
};
|
|
|
|
return minimalState;
|
|
}
|
|
});
|
|
}
|
|
|
|
|
|
// ============= Cleanup Operations =============
|
|
|
|
/**
|
|
* Clean up all resources
|
|
*/
|
|
public async cleanup(): Promise<void> {
|
|
try {
|
|
Logging.log('BrowserContextV2', 'Cleaning up browser context');
|
|
|
|
// Clear all state
|
|
this._pageCache.clear();
|
|
this._executionLockedTabId = null;
|
|
this._userSelectedTabIds = null;
|
|
|
|
Logging.log('BrowserContextV2', 'Browser context cleaned up successfully');
|
|
} catch (error) {
|
|
Logging.log('BrowserContextV2', `Error during cleanup: ${error}`, 'error');
|
|
}
|
|
}
|
|
}
|
|
|
|
export default BrowserContext;
|