From fd3cdace91ded604b8e2637e9756a44145848867 Mon Sep 17 00:00:00 2001 From: shivammittal274 <56757235+shivammittal274@users.noreply.github.com> Date: Wed, 21 Jan 2026 02:36:58 +0530 Subject: [PATCH] feat: new APIs for eval mode support (#250) * feat: eval mode * feat: eval mode --- .../src/actions/browser/CloseWindowAction.ts | 33 +++++++++ .../src/actions/browser/CreateWindowAction.ts | 68 ++++++++++++++++++ .../src/background/BrowserOSController.ts | 5 ++ apps/server/package.json | 12 ++++ apps/server/src/agent/gemini-agent.ts | 39 ++++++++++- apps/server/src/agent/types.ts | 2 + .../src/tools/controller-based/registry.ts | 7 +- .../tools/window-management.ts | 70 +++++++++++++++++++ bun.lock | 4 +- 9 files changed, 236 insertions(+), 4 deletions(-) create mode 100644 apps/controller-ext/src/actions/browser/CloseWindowAction.ts create mode 100644 apps/controller-ext/src/actions/browser/CreateWindowAction.ts create mode 100644 apps/server/src/tools/controller-based/tools/window-management.ts diff --git a/apps/controller-ext/src/actions/browser/CloseWindowAction.ts b/apps/controller-ext/src/actions/browser/CloseWindowAction.ts new file mode 100644 index 00000000..d5e46458 --- /dev/null +++ b/apps/controller-ext/src/actions/browser/CloseWindowAction.ts @@ -0,0 +1,33 @@ +/** + * @license + * Copyright 2025 BrowserOS + * SPDX-License-Identifier: AGPL-3.0-or-later + */ +import { z } from 'zod' +import { ActionHandler } from '../ActionHandler' + +const CloseWindowInputSchema = z.object({ + windowId: z.number().int().positive().describe('ID of the window to close'), +}) + +const CloseWindowOutputSchema = z.object({ + success: z.boolean().describe('Whether the window was successfully closed'), +}) + +type CloseWindowInput = z.infer +type CloseWindowOutput = z.infer + +export class CloseWindowAction extends ActionHandler< + CloseWindowInput, + CloseWindowOutput +> { + readonly inputSchema = CloseWindowInputSchema + + async execute(input: CloseWindowInput): Promise { + await chrome.windows.remove(input.windowId) + + return { + success: true, + } + } +} diff --git a/apps/controller-ext/src/actions/browser/CreateWindowAction.ts b/apps/controller-ext/src/actions/browser/CreateWindowAction.ts new file mode 100644 index 00000000..3312fd24 --- /dev/null +++ b/apps/controller-ext/src/actions/browser/CreateWindowAction.ts @@ -0,0 +1,68 @@ +/** + * @license + * Copyright 2025 BrowserOS + * SPDX-License-Identifier: AGPL-3.0-or-later + */ +import { z } from 'zod' +import { ActionHandler } from '../ActionHandler' + +const CreateWindowInputSchema = z.object({ + url: z + .string() + .optional() + .default('about:blank') + .describe('URL to open in the new window'), + incognito: z + .boolean() + .optional() + .default(false) + .describe('Create an incognito window'), + focused: z + .boolean() + .optional() + .default(true) + .describe('Whether to focus the new window'), +}) + +const CreateWindowOutputSchema = z.object({ + windowId: z.number().describe('ID of the newly created window'), + tabId: z.number().describe('ID of the first tab in the new window'), +}) + +type CreateWindowInput = z.infer +type CreateWindowOutput = z.infer + +export class CreateWindowAction extends ActionHandler< + CreateWindowInput, + CreateWindowOutput +> { + readonly inputSchema = CreateWindowInputSchema + + async execute(input: CreateWindowInput): Promise { + const createData: chrome.windows.CreateData = { + url: input.url, + focused: input.focused, + incognito: input.incognito, + } + + const createdWindow = await chrome.windows.create(createData) + + if (!createdWindow) { + throw new Error('Failed to create window') + } + + if (createdWindow.id === undefined) { + throw new Error('Created window has no ID') + } + + const tabId = createdWindow.tabs?.[0]?.id + if (tabId === undefined) { + throw new Error('Created window has no tab') + } + + return { + windowId: createdWindow.id, + tabId, + } + } +} diff --git a/apps/controller-ext/src/background/BrowserOSController.ts b/apps/controller-ext/src/background/BrowserOSController.ts index 3836e87a..41c3cdca 100644 --- a/apps/controller-ext/src/background/BrowserOSController.ts +++ b/apps/controller-ext/src/background/BrowserOSController.ts @@ -12,6 +12,8 @@ import { CaptureScreenshotPointerAction } from '@/actions/browser/CaptureScreens import { ClearAction } from '@/actions/browser/ClearAction' import { ClickAction } from '@/actions/browser/ClickAction' import { ClickCoordinatesAction } from '@/actions/browser/ClickCoordinatesAction' +import { CloseWindowAction } from '@/actions/browser/CloseWindowAction' +import { CreateWindowAction } from '@/actions/browser/CreateWindowAction' import { ExecuteJavaScriptAction } from '@/actions/browser/ExecuteJavaScriptAction' import { GetAccessibilityTreeAction } from '@/actions/browser/GetAccessibilityTreeAction' import { GetInteractiveSnapshotAction } from '@/actions/browser/GetInteractiveSnapshotAction' @@ -192,6 +194,9 @@ export class BrowserOSController { this.actionRegistry.register('updateTabGroup', new UpdateTabGroupAction()) this.actionRegistry.register('ungroupTabs', new UngroupTabsAction()) + this.actionRegistry.register('createWindow', new CreateWindowAction()) + this.actionRegistry.register('closeWindow', new CloseWindowAction()) + this.actionRegistry.register('getBookmarks', new GetBookmarksAction()) this.actionRegistry.register('createBookmark', new CreateBookmarkAction()) this.actionRegistry.register('removeBookmark', new RemoveBookmarkAction()) diff --git a/apps/server/package.json b/apps/server/package.json index 10ce544c..f9acc440 100644 --- a/apps/server/package.json +++ b/apps/server/package.json @@ -23,6 +23,18 @@ ".": { "types": "./src/rpc.ts", "import": "./src/rpc.ts" + }, + "./agent": { + "types": "./src/agent/gemini-agent.ts", + "default": "./src/agent/gemini-agent.ts" + }, + "./agent/types": { + "types": "./src/agent/types.ts", + "default": "./src/agent/types.ts" + }, + "./lib/clients/gateway": { + "types": "./src/lib/clients/gateway.ts", + "default": "./src/lib/clients/gateway.ts" } }, "dependencies": { diff --git a/apps/server/src/agent/gemini-agent.ts b/apps/server/src/agent/gemini-agent.ts index 5b1d6ce8..e95be9bf 100644 --- a/apps/server/src/agent/gemini-agent.ts +++ b/apps/server/src/agent/gemini-agent.ts @@ -26,13 +26,29 @@ import type { HonoSSEStream } from './provider-adapter/types' import { UIMessageStreamWriter } from './provider-adapter/ui-message-stream' import type { ResolvedAgentConfig } from './types' -interface ToolExecutionResult { +export interface ToolExecutionResult { parts: Part[] isError: boolean errorMessage?: string } +export interface ToolExecutionHooks { + onBeforeToolCall?: ( + toolName: string, + args: unknown, + browserContext?: BrowserContext, + ) => Promise + + onAfterToolCall?: ( + toolName: string, + result: ToolExecutionResult, + browserContext?: BrowserContext, + ) => Promise +} + export class GeminiAgent { + private toolHooks?: ToolExecutionHooks + private constructor( private client: GeminiClient, private geminiConfig: GeminiConfig, @@ -40,6 +56,10 @@ export class GeminiAgent { private conversationId: string, ) {} + setToolHooks(hooks: ToolExecutionHooks): void { + this.toolHooks = hooks + } + /** * Creates a GeminiAgent with pre-resolved config and MCP servers. * Config resolution and MCP building happens in ChatService (visible there). @@ -81,6 +101,7 @@ export class GeminiAgent { // Build excluded tools list - always exclude save_memory and google_web_search // Conditionally exclude screenshot tools if model doesn't support images + // Exclude window management tools unless in eval mode const excludedTools = ['save_memory', 'google_web_search'] if (config.supportsImages === false) { excludedTools.push( @@ -89,6 +110,9 @@ export class GeminiAgent { ) logger.info('Model does not support images, excluding screenshot tools') } + if (config.evalMode !== true) { + excludedTools.push('browser_create_window', 'browser_close_window') + } const geminiConfig = new GeminiConfig({ sessionId: config.conversationId, @@ -303,11 +327,24 @@ export class GeminiAgent { for (const requestInfo of toolCallRequests) { if (abortSignal.aborted) break + await this.toolHooks?.onBeforeToolCall?.( + requestInfo.name, + requestInfo.args, + browserContext, + ) + const result = await this.handleToolExecution( requestInfo, abortSignal, browserContext, ) + + await this.toolHooks?.onAfterToolCall?.( + requestInfo.name, + result, + browserContext, + ) + toolResponseParts.push(...result.parts) if (uiStream) { diff --git a/apps/server/src/agent/types.ts b/apps/server/src/agent/types.ts index 857dae6a..72ec7cc9 100644 --- a/apps/server/src/agent/types.ts +++ b/apps/server/src/agent/types.ts @@ -35,4 +35,6 @@ export interface ResolvedAgentConfig { sessionExecutionDir: string /** Whether the model supports image inputs (vision). Defaults to true. */ supportsImages?: boolean + /** Eval mode - enables window management tools. Defaults to false. */ + evalMode?: boolean } diff --git a/apps/server/src/tools/controller-based/registry.ts b/apps/server/src/tools/controller-based/registry.ts index b5c686af..da31f4a7 100644 --- a/apps/server/src/tools/controller-based/registry.ts +++ b/apps/server/src/tools/controller-based/registry.ts @@ -47,6 +47,8 @@ export { ungroupTabs, updateTabGroup, } from './tools/tab-management' +// Window Management +export { closeWindow, createWindow } from './tools/window-management' // Types export type { Context } from './types/context' export type { ImageContentData, Response } from './types/response' @@ -86,8 +88,9 @@ import { ungroupTabs, updateTabGroup, } from './tools/tab-management' +import { closeWindow, createWindow } from './tools/window-management' -// Array export for convenience (32 tools) +// Array export for convenience (33 tools total) export const allControllerTools = [ getActiveTab, listTabs, @@ -121,4 +124,6 @@ export const allControllerTools = [ removeBookmark, searchHistory, getRecentHistory, + createWindow, + closeWindow, ] diff --git a/apps/server/src/tools/controller-based/tools/window-management.ts b/apps/server/src/tools/controller-based/tools/window-management.ts new file mode 100644 index 00000000..cfa9d65f --- /dev/null +++ b/apps/server/src/tools/controller-based/tools/window-management.ts @@ -0,0 +1,70 @@ +/** + * @license + * Copyright 2025 BrowserOS + */ +import { z } from 'zod' + +import { ToolCategories } from '../../types/tool-categories' +import { defineTool } from '../../types/tool-definition' +import type { Context } from '../types/context' +import type { Response } from '../types/response' + +export const createWindow = defineTool({ + name: 'browser_create_window', + description: + 'Create a new browser window. Returns the windowId and tabId of the created window.', + annotations: { + category: ToolCategories.TAB_MANAGEMENT, + readOnlyHint: false, + }, + schema: { + url: z + .string() + .optional() + .describe('URL to open in the new window. Defaults to about:blank'), + incognito: z.boolean().optional().describe('Create an incognito window'), + focused: z + .boolean() + .optional() + .describe('Whether to focus the new window. Defaults to true'), + }, + handler: async (request, response, context) => { + const params = request.params as { + url?: string + incognito?: boolean + focused?: boolean + } + + const result = await context.executeAction('createWindow', { + url: params.url || 'about:blank', + incognito: params.incognito || false, + focused: params.focused ?? true, + }) + const data = result as { windowId: number; tabId: number } + + response.appendResponseLine(`Created window ${data.windowId}`) + response.appendResponseLine(`Tab ID: ${data.tabId}`) + + response.addStructuredContent('windowId', data.windowId) + response.addStructuredContent('tabId', data.tabId) + }, +}) + +export const closeWindow = defineTool({ + name: 'browser_close_window', + description: 'Close a browser window by its windowId.', + annotations: { + category: ToolCategories.TAB_MANAGEMENT, + readOnlyHint: false, + }, + schema: { + windowId: z.coerce.number().describe('The ID of the window to close'), + }, + handler: async (request, response, context) => { + const { windowId } = request.params as { windowId: number } + + await context.executeAction('closeWindow', { windowId }) + + response.appendResponseLine(`Closed window ${windowId}`) + }, +}) diff --git a/bun.lock b/bun.lock index b74d2d88..0fa0a318 100644 --- a/bun.lock +++ b/bun.lock @@ -124,7 +124,7 @@ }, "apps/server": { "name": "@browseros/server", - "version": "0.0.42", + "version": "0.0.44", "bin": { "browseros-server": "./src/index.ts", }, @@ -175,7 +175,7 @@ }, "packages/agent-sdk": { "name": "@browseros-ai/agent-sdk", - "version": "0.0.4", + "version": "0.0.5", "dependencies": { "eventsource-parser": "^3.0.6", "zod-to-json-schema": "^3.24.1",