diff --git a/apps/server/src/agent/gemini-agent.ts b/apps/server/src/agent/gemini-agent.ts index 2aff7524..5ca1e3d5 100644 --- a/apps/server/src/agent/gemini-agent.ts +++ b/apps/server/src/agent/gemini-agent.ts @@ -228,22 +228,6 @@ export class GeminiAgent { return `${contextLines.join('\n')}\n\n---\n\n` } - private injectWindowIdIntoToolArgs( - requestInfo: ToolCallRequestInfo, - browserContext?: BrowserContext, - ): void { - if (browserContext?.windowId && requestInfo.name.startsWith('browser_')) { - logger.debug('Injecting windowId into tool args', { - tool: requestInfo.name, - windowId: browserContext.windowId, - }) - requestInfo.args = { - ...requestInfo.args, - windowId: browserContext.windowId, - } - } - } - private async executeToolWithTimeout( requestInfo: ToolCallRequestInfo, abortSignal: AbortSignal, @@ -271,10 +255,7 @@ export class GeminiAgent { private async handleToolExecution( requestInfo: ToolCallRequestInfo, abortSignal: AbortSignal, - browserContext?: BrowserContext, ): Promise { - this.injectWindowIdIntoToolArgs(requestInfo, browserContext) - try { const completedToolCall = await this.executeToolWithTimeout( requestInfo, @@ -368,11 +349,7 @@ export class GeminiAgent { browserContext, ) - const result = await this.handleToolExecution( - requestInfo, - abortSignal, - browserContext, - ) + const result = await this.handleToolExecution(requestInfo, abortSignal) await this.toolHooks?.onAfterToolCall?.( requestInfo.name, diff --git a/apps/server/src/api/routes/mcp.ts b/apps/server/src/api/routes/mcp.ts index 94445ced..43ad6980 100644 --- a/apps/server/src/api/routes/mcp.ts +++ b/apps/server/src/api/routes/mcp.ts @@ -4,18 +4,27 @@ * SPDX-License-Identifier: AGPL-3.0-or-later */ +import { AsyncLocalStorage } from 'node:async_hooks' import { StreamableHTTPTransport } from '@hono/mcp' import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js' -import type { CallToolResult } from '@modelcontextprotocol/sdk/types.js' +import type { + CallToolResult, + ImageContent, + TextContent, +} from '@modelcontextprotocol/sdk/types.js' import { SetLevelRequestSchema } from '@modelcontextprotocol/sdk/types.js' import { Hono } from 'hono' import type { z } from 'zod' import type { McpContext } from '../../browser/cdp/context' -import type { ControllerContext } from '../../browser/extension/context' +import { + type ControllerContext, + ScopedControllerContext, +} from '../../browser/extension/context' import { logger } from '../../lib/logger' import { metrics } from '../../lib/metrics' import type { MutexPool } from '../../lib/mutex' import { Sentry } from '../../lib/sentry' +import { ControllerResponse } from '../../tools/controller-based/response/controller-response' import { McpResponse } from '../../tools/response/mcp-response' import type { ToolDefinition } from '../../tools/types/tool-definition' import type { Env } from '../types' @@ -31,6 +40,9 @@ interface McpRouteDeps { } const MCP_SOURCE_HEADER = 'X-BrowserOS-Source' +const MCP_WINDOW_ID_HEADER = 'X-BrowserOS-Window-Id' + +const windowIdStore = new AsyncLocalStorage() type McpRequestSource = 'gemini-agent' | 'sdk-internal' | 'third-party' @@ -77,31 +89,44 @@ function createMcpServerWithTools(deps: McpRouteDeps): McpServer { (async (params: Record): Promise => { const startTime = performance.now() - // Serialize tool execution per-window (allows parallel execution across windows) - const windowId = params.windowId as number | undefined + // Resolve windowId: explicit param takes priority over request header + const windowId = + (params.windowId as number | undefined) ?? windowIdStore.getStore() const guard = await mutexPool.getMutex(windowId).acquire() try { + const isControllerTool = tool.name.startsWith('browser_') + logger.info( `${tool.name} request: ${JSON.stringify(params, null, ' ')}`, ) - // Detect if this is a controller tool (browser_* tools) - const isControllerTool = tool.name.startsWith('browser_') - const contextForResponse = - isControllerTool && controllerContext - ? controllerContext - : cdpContext - - // Create response handler and execute tool - const response = new McpResponse() - await tool.handler({ params }, response, cdpContext) - - // Process and return response try { - const content = await response.handle( - tool.name, - contextForResponse as McpContext, - ) + let content: Array + let structuredContent: Record | undefined + + if (isControllerTool) { + const { windowId: _, ...cleanParams } = params + const scopedContext = new ScopedControllerContext( + controllerContext.bridge, + windowId, + ) + const response = new ControllerResponse() + await tool.handler( + { params: cleanParams }, + response, + scopedContext, + ) + content = await response.handle(scopedContext) + structuredContent = response.structuredContent + } else { + const response = new McpResponse() + await tool.handler({ params }, response, cdpContext) + content = await response.handle( + tool.name, + cdpContext as McpContext, + ) + structuredContent = response.structuredContent + } // Log successful tool execution (non-blocking) metrics.log('tool_executed', { @@ -110,7 +135,6 @@ function createMcpServerWithTools(deps: McpRouteDeps): McpServer { success: true, }) - const structuredContent = response.structuredContent return { content, ...(structuredContent && { structuredContent }), @@ -158,36 +182,41 @@ export function createMcpRoutes(deps: McpRouteDeps) { } const source = getMcpRequestSource(c.req.header(MCP_SOURCE_HEADER)) + const headerWindowId = c.req.header(MCP_WINDOW_ID_HEADER) + const requestWindowId = headerWindowId ? Number(headerWindowId) : undefined + metrics.log('mcp.request', { source }) - try { - // Create a new transport for EACH request to prevent request ID collisions. - // Different clients may use the same JSON-RPC request IDs, which would cause - // responses to be routed to the wrong HTTP connections if transport state is shared. - const transport = new StreamableHTTPTransport({ - sessionIdGenerator: undefined, // Stateless mode - no session management - enableJsonResponse: true, // Return JSON responses (not SSE streams) - }) + return windowIdStore.run(requestWindowId, async () => { + try { + // Create a new transport for EACH request to prevent request ID collisions. + // Different clients may use the same JSON-RPC request IDs, which would cause + // responses to be routed to the wrong HTTP connections if transport state is shared. + const transport = new StreamableHTTPTransport({ + sessionIdGenerator: undefined, // Stateless mode - no session management + enableJsonResponse: true, // Return JSON responses (not SSE streams) + }) - // Connect the server to this transport - await mcpServer.connect(transport) + // Connect the server to this transport + await mcpServer.connect(transport) - // Handle the request and return response - return transport.handleRequest(c) - } catch (error) { - Sentry.captureException(error) - logger.error('Error handling MCP request', { - error: error instanceof Error ? error.message : String(error), - }) + // Handle the request and return response + return transport.handleRequest(c) + } catch (error) { + Sentry.captureException(error) + logger.error('Error handling MCP request', { + error: error instanceof Error ? error.message : String(error), + }) - return c.json( - { - jsonrpc: '2.0', - error: { code: -32603, message: 'Internal server error' }, - id: null, - }, - 500, - ) - } + return c.json( + { + jsonrpc: '2.0', + error: { code: -32603, message: 'Internal server error' }, + id: null, + }, + 500, + ) + } + }) }) } diff --git a/apps/server/src/api/services/chat-service.ts b/apps/server/src/api/services/chat-service.ts index ed5de1e1..aefe92d7 100644 --- a/apps/server/src/api/services/chat-service.ts +++ b/apps/server/src/api/services/chat-service.ts @@ -168,6 +168,9 @@ export class ChatService { headers: { Accept: 'application/json, text/event-stream', 'X-BrowserOS-Source': 'gemini-agent', + ...(browserContext?.windowId != null && { + 'X-BrowserOS-Window-Id': String(browserContext.windowId), + }), }, trust: true, }) diff --git a/apps/server/src/browser/extension/context.ts b/apps/server/src/browser/extension/context.ts index b05a8de1..fbf21a43 100644 --- a/apps/server/src/browser/extension/context.ts +++ b/apps/server/src/browser/extension/context.ts @@ -22,4 +22,31 @@ export class ControllerContext implements Context { isConnected(): boolean { return this.controllerBridge.isConnected() } + + get bridge(): ControllerBridge { + return this.controllerBridge + } +} + +export class ScopedControllerContext implements Context { + constructor( + private controllerBridge: ControllerBridge, + private windowId?: number, + ) {} + + async executeAction(action: string, payload: unknown): Promise { + const enriched = + this.windowId != null + ? { ...(payload as Record), windowId: this.windowId } + : payload + return this.controllerBridge.sendRequest( + action, + enriched, + TIMEOUTS.CONTROLLER_DEFAULT, + ) + } + + isConnected(): boolean { + return this.controllerBridge.isConnected() + } } diff --git a/apps/server/src/main.ts b/apps/server/src/main.ts index 469ba2ed..0bb265d7 100644 --- a/apps/server/src/main.ts +++ b/apps/server/src/main.ts @@ -68,7 +68,7 @@ export class Application { logger.info( `Loaded ${(await import('./tools/controller-based/registry')).allControllerTools.length} controller (extension) tools`, ) - const tools = createToolRegistry(cdpContext, controllerContext) + const tools = createToolRegistry(cdpContext) const mutexPool = new MutexPool() try { diff --git a/apps/server/src/tools/controller-based/response/controller-response.ts b/apps/server/src/tools/controller-based/response/controller-response.ts index 911c4b3f..b107b710 100644 --- a/apps/server/src/tools/controller-based/response/controller-response.ts +++ b/apps/server/src/tools/controller-based/response/controller-response.ts @@ -7,6 +7,7 @@ import type { TextContent, } from '@modelcontextprotocol/sdk/types.js' +import type { Context } from '../types/context' import type { ImageContentData, Response } from '../types/response' /** @@ -50,6 +51,46 @@ export class ControllerResponse implements Response { : undefined } + #includeSnapshot = false + #includeScreenshot = false + + setIncludeSnapshot(value: boolean): void { + this.#includeSnapshot = value + } + + setIncludeScreenshot(value: boolean): void { + this.#includeScreenshot = value + } + + async handle(context: Context): Promise> { + const content = this.toContent() + + if (this.#includeSnapshot) { + const result = await context.executeAction('getPageContent', {}) + const text = (result as { content?: string })?.content + if (text) { + content.push({ + type: 'text', + text: `\n## Page Content After Action\n${text}`, + }) + } + } + + if (this.#includeScreenshot) { + const result = await context.executeAction('captureScreenshot', {}) + const data = result as { data?: string; mimeType?: string } + if (data?.data) { + content.push({ + type: 'image', + data: data.data, + mimeType: data.mimeType ?? 'image/png', + }) + } + } + + return content + } + /** * Convert collected data to MCP content format */ diff --git a/apps/server/src/tools/controller-based/tools/advanced.ts b/apps/server/src/tools/controller-based/tools/advanced.ts index 525ffa38..d3f9c8b9 100644 --- a/apps/server/src/tools/controller-based/tools/advanced.ts +++ b/apps/server/src/tools/controller-based/tools/advanced.ts @@ -20,19 +20,16 @@ export const executeJavaScript = defineTool({ schema: { tabId: z.coerce.number().describe('Tab ID to execute code in'), code: z.string().describe('JavaScript code to execute'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabId, code, windowId } = request.params as { + const { tabId, code } = request.params as { tabId: number code: string - windowId?: number } const result = await context.executeAction('executeJavaScript', { tabId, code, - windowId, }) // biome-ignore lint/suspicious/noExplicitAny: JS execution returns arbitrary values const data = result as { result: any } @@ -70,23 +67,21 @@ export const sendKeys = defineTool({ 'PageDown', ]) .describe('Keyboard key to send'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabId, key, windowId } = request.params as { + const { tabId, key } = request.params as { tabId: number key: string - windowId?: number } const result = await context.executeAction('sendKeys', { tabId, key, - windowId, }) const data = result as { success: boolean; message: string } response.appendResponseLine(data.message) + response.setIncludeSnapshot?.(true) }, }) @@ -97,12 +92,9 @@ export const checkAvailability = defineTool({ category: ToolCategories.ADVANCED, readOnlyHint: true, }, - schema: { - windowId: z.number().optional().describe('Window ID for routing'), - }, - handler: async (request, response, context) => { - const { windowId } = request.params as { windowId?: number } - const result = await context.executeAction('checkBrowserOS', { windowId }) + schema: {}, + handler: async (_request, response, context) => { + const result = await context.executeAction('checkBrowserOS', {}) const data = result as { available: boolean apis?: string[] diff --git a/apps/server/src/tools/controller-based/tools/bookmarks.ts b/apps/server/src/tools/controller-based/tools/bookmarks.ts index fda5bd46..d1527943 100644 --- a/apps/server/src/tools/controller-based/tools/bookmarks.ts +++ b/apps/server/src/tools/controller-based/tools/bookmarks.ts @@ -21,17 +21,12 @@ export const getBookmarks = defineTool({ .string() .optional() .describe('Optional folder ID to get bookmarks from (omit for all)'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { folderId, windowId } = request.params as { - folderId?: string - windowId?: number - } + const { folderId } = request.params as { folderId?: string } const result = await context.executeAction('getBookmarks', { folderId, - windowId, }) const data = result as { bookmarks: Array<{ @@ -75,21 +70,18 @@ export const createBookmark = defineTool({ .describe( 'Folder ID to create bookmark in (from browser_get_bookmarks or browser_create_bookmark_folder)', ), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { title, url, parentId, windowId } = request.params as { + const { title, url, parentId } = request.params as { title: string url: string parentId?: string - windowId?: number } const result = await context.executeAction('createBookmark', { title, url, parentId, - windowId, }) const data = result as { id: string; title: string; url: string } @@ -108,15 +100,11 @@ export const removeBookmark = defineTool({ }, schema: { bookmarkId: z.string().describe('Bookmark ID to remove'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { bookmarkId, windowId } = request.params as { - bookmarkId: string - windowId?: number - } + const { bookmarkId } = request.params as { bookmarkId: string } - await context.executeAction('removeBookmark', { id: bookmarkId, windowId }) + await context.executeAction('removeBookmark', { id: bookmarkId }) response.appendResponseLine(`Removed bookmark ${bookmarkId}`) }, @@ -133,21 +121,18 @@ export const updateBookmark = defineTool({ bookmarkId: z.string().describe('Bookmark ID to update'), title: z.string().optional().describe('New title for the bookmark'), url: z.string().url().optional().describe('New URL for the bookmark'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { bookmarkId, title, url, windowId } = request.params as { + const { bookmarkId, title, url } = request.params as { bookmarkId: string title?: string url?: string - windowId?: number } const result = await context.executeAction('updateBookmark', { id: bookmarkId, title, url, - windowId, }) const data = result as { id: string; title: string; url?: string } @@ -177,19 +162,16 @@ export const createBookmarkFolder = defineTool< .string() .optional() .describe('Parent folder ID (defaults to Bookmarks Bar)'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { title, parentId, windowId } = request.params as { + const { title, parentId } = request.params as { title: string parentId?: string - windowId?: number } const result = await context.executeAction('createBookmarkFolder', { title, parentId, - windowId, }) const data = result as { id: string @@ -215,17 +197,12 @@ export const getBookmarkChildren = defineTool( }, schema: { folderId: z.string().describe('Folder ID to get children from'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { folderId, windowId } = request.params as { - folderId: string - windowId?: number - } + const { folderId } = request.params as { folderId: string } const result = await context.executeAction('getBookmarkChildren', { folderId, - windowId, }) const data = result as { children: Array<{ @@ -278,21 +255,18 @@ export const moveBookmark = defineTool({ .min(0) .optional() .describe('Position within parent (0-based)'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { bookmarkId, parentId, index, windowId } = request.params as { + const { bookmarkId, parentId, index } = request.params as { bookmarkId: string parentId?: string index?: number - windowId?: number } const result = await context.executeAction('moveBookmark', { id: bookmarkId, parentId, index, - windowId, }) const data = result as { id: string @@ -322,19 +296,16 @@ export const removeBookmarkTree = defineTool({ schema: { folderId: z.string().describe('Folder ID to remove'), confirm: z.boolean().describe('Must be true to confirm recursive deletion'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { folderId, confirm, windowId } = request.params as { + const { folderId, confirm } = request.params as { folderId: string confirm: boolean - windowId?: number } const result = await context.executeAction('removeBookmarkTree', { id: folderId, confirm, - windowId, }) const data = result as { success: boolean diff --git a/apps/server/src/tools/controller-based/tools/content.ts b/apps/server/src/tools/controller-based/tools/content.ts index 629c83a0..d7cfd274 100644 --- a/apps/server/src/tools/controller-based/tools/content.ts +++ b/apps/server/src/tools/controller-based/tools/content.ts @@ -29,7 +29,6 @@ export const getPageContent = defineTool({ }, schema: { tabId: z.coerce.number().describe('Tab ID to extract content from'), - windowId: z.number().optional().describe('Window ID for routing'), type: z .enum(['text', 'text-with-links']) .describe('Type of content to extract: text or text-with-links'), @@ -81,7 +80,6 @@ export const getPageContent = defineTool({ page?: string contextWindow?: string options?: { context?: 'visible' | 'full'; includeSections?: string[] } - windowId?: number } try { @@ -101,7 +99,6 @@ export const getPageContent = defineTool({ const snapshotResult = await context.executeAction('getSnapshot', { tabId: params.tabId, type: includeLinks ? 'links' : 'text', - windowId: params.windowId, }) const snapshot = snapshotResult as Snapshot diff --git a/apps/server/src/tools/controller-based/tools/coordinates.ts b/apps/server/src/tools/controller-based/tools/coordinates.ts index fe368e8e..c16d58a2 100644 --- a/apps/server/src/tools/controller-based/tools/coordinates.ts +++ b/apps/server/src/tools/controller-based/tools/coordinates.ts @@ -20,17 +20,15 @@ export const clickCoordinates = defineTool({ tabId: z.coerce.number().describe('Tab ID to click in'), x: z.coerce.number().describe('X coordinate'), y: z.coerce.number().describe('Y coordinate'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabId, x, y, windowId } = request.params as { + const { tabId, x, y } = request.params as { tabId: number x: number y: number - windowId?: number } - await context.executeAction('clickCoordinates', { tabId, x, y, windowId }) + await context.executeAction('clickCoordinates', { tabId, x, y }) response.appendResponseLine( `Clicked at coordinates (${x}, ${y}) in tab ${tabId}`, @@ -50,15 +48,13 @@ export const typeAtCoordinates = defineTool({ x: z.coerce.number().describe('X coordinate'), y: z.coerce.number().describe('Y coordinate'), text: z.string().describe('Text to type'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabId, x, y, text, windowId } = request.params as { + const { tabId, x, y, text } = request.params as { tabId: number x: number y: number text: string - windowId?: number } await context.executeAction('typeAtCoordinates', { @@ -66,7 +62,6 @@ export const typeAtCoordinates = defineTool({ x, y, text, - windowId, }) response.appendResponseLine( diff --git a/apps/server/src/tools/controller-based/tools/history.ts b/apps/server/src/tools/controller-based/tools/history.ts index 2113fa47..7059f578 100644 --- a/apps/server/src/tools/controller-based/tools/history.ts +++ b/apps/server/src/tools/controller-based/tools/history.ts @@ -22,19 +22,16 @@ export const searchHistory = defineTool({ .number() .optional() .describe('Maximum number of results to return (default: 100)'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { query, maxResults, windowId } = request.params as { + const { query, maxResults } = request.params as { query: string maxResults?: number - windowId?: number } const result = await context.executeAction('searchHistory', { query, maxResults, - windowId, }) const data = result as { items: Array<{ @@ -80,17 +77,12 @@ export const getRecentHistory = defineTool({ .number() .optional() .describe('Number of recent items to retrieve (default: 20)'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { count, windowId } = request.params as { - count?: number - windowId?: number - } + const { count } = request.params as { count?: number } const result = await context.executeAction('getRecentHistory', { count, - windowId, }) const data = result as { items: Array<{ diff --git a/apps/server/src/tools/controller-based/tools/interaction.ts b/apps/server/src/tools/controller-based/tools/interaction.ts index 2926fe68..1a7d6b6b 100644 --- a/apps/server/src/tools/controller-based/tools/interaction.ts +++ b/apps/server/src/tools/controller-based/tools/interaction.ts @@ -35,22 +35,15 @@ export const getInteractiveElements = defineTool< .boolean() .optional() .describe('Use simplified format (default: false)'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { - tabId, - simplified = false, - windowId, - } = request.params as { + const { tabId, simplified = false } = request.params as { tabId: number simplified?: boolean - windowId?: number } const result = await context.executeAction('getInteractiveSnapshot', { tabId, - windowId, }) const snapshot = result as { snapshotId: number @@ -148,24 +141,20 @@ export const grepInteractiveElements = defineTool< .describe( 'Number of elements to show before and after each match (default: 2). Set to 0 to show only matches.', ), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, ctx) => { const { tabId, pattern, context: contextLines = 2, - windowId, } = request.params as { tabId: number pattern: string context?: number - windowId?: number } const result = await ctx.executeAction('getInteractiveSnapshot', { tabId, - windowId, }) const snapshot = result as { snapshotId: number @@ -262,18 +251,17 @@ export const clickElement = defineTool({ nodeId: z.coerce .number() .describe('Node ID from browser_get_interactive_elements'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabId, nodeId, windowId } = request.params as { + const { tabId, nodeId } = request.params as { tabId: number nodeId: number - windowId?: number } - await context.executeAction('click', { tabId, nodeId, windowId }) + await context.executeAction('click', { tabId, nodeId }) response.appendResponseLine(`Clicked element ${nodeId} in tab ${tabId}`) + response.setIncludeSnapshot?.(true) }, }) @@ -288,22 +276,21 @@ export const typeText = defineTool({ tabId: z.coerce.number().describe('Tab ID containing the element'), nodeId: z.coerce.number().describe('Node ID of the input element'), text: z.string().describe('Text to type into the element'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabId, nodeId, text, windowId } = request.params as { + const { tabId, nodeId, text } = request.params as { tabId: number nodeId: number text: string - windowId?: number } - await context.executeAction('click', { tabId, nodeId, windowId }) - await context.executeAction('inputText', { tabId, nodeId, text, windowId }) + await context.executeAction('click', { tabId, nodeId }) + await context.executeAction('inputText', { tabId, nodeId, text }) response.appendResponseLine( `Typed text into element ${nodeId} in tab ${tabId}`, ) + response.setIncludeSnapshot?.(true) }, }) @@ -317,17 +304,15 @@ export const clearInput = defineTool({ schema: { tabId: z.coerce.number().describe('Tab ID containing the element'), nodeId: z.coerce.number().describe('Node ID of the input element'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabId, nodeId, windowId } = request.params as { + const { tabId, nodeId } = request.params as { tabId: number nodeId: number - windowId?: number } - await context.executeAction('click', { tabId, nodeId, windowId }) - await context.executeAction('clear', { tabId, nodeId, windowId }) + await context.executeAction('click', { tabId, nodeId }) + await context.executeAction('clear', { tabId, nodeId }) response.appendResponseLine(`Cleared element ${nodeId} in tab ${tabId}`) }, @@ -343,16 +328,14 @@ export const scrollToElement = defineTool({ schema: { tabId: z.coerce.number().describe('Tab ID containing the element'), nodeId: z.coerce.number().describe('Node ID of the element to scroll to'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabId, nodeId, windowId } = request.params as { + const { tabId, nodeId } = request.params as { tabId: number nodeId: number - windowId?: number } - await context.executeAction('scrollToNode', { tabId, nodeId, windowId }) + await context.executeAction('scrollToNode', { tabId, nodeId }) response.appendResponseLine(`Scrolled to element ${nodeId} in tab ${tabId}`) }, diff --git a/apps/server/src/tools/controller-based/tools/navigation.ts b/apps/server/src/tools/controller-based/tools/navigation.ts index 92ecc220..be4b14ec 100644 --- a/apps/server/src/tools/controller-based/tools/navigation.ts +++ b/apps/server/src/tools/controller-based/tools/navigation.ts @@ -22,16 +22,11 @@ export const navigate = defineTool({ .number() .optional() .describe('Tab ID to navigate (optional, defaults to active tab)'), - windowId: z - .number() - .optional() - .describe('Window ID (used when tabId not provided)'), }, handler: async (request, response, context) => { const params = request.params as { url: string tabId?: number - windowId?: number } const result = await context.executeAction('navigate', params) @@ -49,5 +44,6 @@ export const navigate = defineTool({ response.addStructuredContent('tabId', data.tabId) response.addStructuredContent('windowId', data.windowId) response.addStructuredContent('url', data.url) + response.setIncludeSnapshot?.(true) }, }) diff --git a/apps/server/src/tools/controller-based/tools/screenshot.ts b/apps/server/src/tools/controller-based/tools/screenshot.ts index 645c7379..1fc6b633 100644 --- a/apps/server/src/tools/controller-based/tools/screenshot.ts +++ b/apps/server/src/tools/controller-based/tools/screenshot.ts @@ -39,7 +39,6 @@ export const getScreenshotPointer = defineTool< .string() .optional() .describe('Optional label to show with pointer (e.g., "Click", "Type")'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { const params = request.params as { @@ -47,7 +46,6 @@ export const getScreenshotPointer = defineTool< nodeId: number size?: string pointerLabel?: string - windowId?: number } const result = await context.executeAction( @@ -104,7 +102,6 @@ export const getScreenshot = defineTool({ .number() .optional() .describe('Exact height in pixels (overrides size)'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { const params = request.params as { @@ -113,7 +110,6 @@ export const getScreenshot = defineTool({ showHighlights?: boolean width?: number height?: number - windowId?: number } const result = await context.executeAction('captureScreenshot', params) diff --git a/apps/server/src/tools/controller-based/tools/scrolling.ts b/apps/server/src/tools/controller-based/tools/scrolling.ts index de8c2661..6e04c01c 100644 --- a/apps/server/src/tools/controller-based/tools/scrolling.ts +++ b/apps/server/src/tools/controller-based/tools/scrolling.ts @@ -18,15 +18,11 @@ export const scrollDown = defineTool({ }, schema: { tabId: z.coerce.number().describe('Tab ID to scroll'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabId, windowId } = request.params as { - tabId: number - windowId?: number - } + const { tabId } = request.params as { tabId: number } - await context.executeAction('scrollDown', { tabId, windowId }) + await context.executeAction('scrollDown', { tabId }) response.appendResponseLine(`Scrolled down in tab ${tabId}`) }, @@ -41,15 +37,11 @@ export const scrollUp = defineTool({ }, schema: { tabId: z.coerce.number().describe('Tab ID to scroll'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabId, windowId } = request.params as { - tabId: number - windowId?: number - } + const { tabId } = request.params as { tabId: number } - await context.executeAction('scrollUp', { tabId, windowId }) + await context.executeAction('scrollUp', { tabId }) response.appendResponseLine(`Scrolled up in tab ${tabId}`) }, diff --git a/apps/server/src/tools/controller-based/tools/tab-management.ts b/apps/server/src/tools/controller-based/tools/tab-management.ts index 06d9cc60..4c0301a9 100644 --- a/apps/server/src/tools/controller-based/tools/tab-management.ts +++ b/apps/server/src/tools/controller-based/tools/tab-management.ts @@ -16,12 +16,9 @@ export const getActiveTab = defineTool({ category: ToolCategories.TAB_MANAGEMENT, readOnlyHint: true, }, - schema: { - windowId: z.number().optional().describe('Window ID (injected by agent)'), - }, - handler: async (request, response, context) => { - const params = request.params as { windowId?: number } - const result = await context.executeAction('getActiveTab', params) + schema: {}, + handler: async (_request, response, context) => { + const result = await context.executeAction('getActiveTab', {}) const data = result as { tabId: number url: string @@ -48,12 +45,9 @@ export const listTabs = defineTool({ category: ToolCategories.TAB_MANAGEMENT, readOnlyHint: true, }, - schema: { - windowId: z.number().optional().describe('Window ID (injected by agent)'), - }, - handler: async (request, response, context) => { - const params = request.params as { windowId?: number } - const result = await context.executeAction('getTabs', params) + schema: {}, + handler: async (_request, response, context) => { + const result = await context.executeAction('getTabs', {}) const data = result as { tabs: Array<{ id: number @@ -99,13 +93,11 @@ export const openTab = defineTool({ .boolean() .optional() .describe('Whether to make the new tab active (default: true)'), - windowId: z.number().optional().describe('Window ID (injected by agent)'), }, handler: async (request, response, context) => { const params = request.params as { url?: string active?: boolean - windowId?: number } const result = await context.executeAction('openTab', params) @@ -126,15 +118,11 @@ export const closeTab = defineTool({ }, schema: { tabId: z.coerce.number().describe('ID of the tab to close'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabId, windowId } = request.params as { - tabId: number - windowId?: number - } + const { tabId } = request.params as { tabId: number } - await context.executeAction('closeTab', { tabId, windowId }) + await context.executeAction('closeTab', { tabId }) response.appendResponseLine(`Closed tab ${tabId}`) }, @@ -149,15 +137,11 @@ export const switchTab = defineTool({ }, schema: { tabId: z.coerce.number().describe('ID of the tab to switch to'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabId, windowId } = request.params as { - tabId: number - windowId?: number - } + const { tabId } = request.params as { tabId: number } - const result = await context.executeAction('switchTab', { tabId, windowId }) + const result = await context.executeAction('switchTab', { tabId }) const data = result as { tabId: number; url: string; title: string } response.appendResponseLine(`Switched to tab: ${data.title}`) @@ -174,17 +158,12 @@ export const getLoadStatus = defineTool({ }, schema: { tabId: z.coerce.number().describe('Tab ID to check'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabId, windowId } = request.params as { - tabId: number - windowId?: number - } + const { tabId } = request.params as { tabId: number } const result = await context.executeAction('getPageLoadStatus', { tabId, - windowId, }) const data = result as { tabId: number @@ -218,12 +197,9 @@ export const listTabGroups = defineTool({ category: ToolCategories.TAB_MANAGEMENT, readOnlyHint: true, }, - schema: { - windowId: z.number().optional().describe('Window ID (injected by agent)'), - }, - handler: async (request, response, context) => { - const params = request.params as { windowId?: number } - const result = await context.executeAction('listTabGroups', params) + schema: {}, + handler: async (_request, response, context) => { + const result = await context.executeAction('listTabGroups', {}) const data = result as { groups: Array<{ id: number @@ -291,15 +267,13 @@ export const groupTabs = defineTool({ .number() .optional() .describe('Existing group ID to add tabs to'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabIds, title, color, groupId, windowId } = request.params as { + const { tabIds, title, color, groupId } = request.params as { tabIds: number[] title?: string color?: string groupId?: number - windowId?: number } const result = await context.executeAction('groupTabs', { @@ -307,7 +281,6 @@ export const groupTabs = defineTool({ title, color, groupId, - windowId, }) const data = result as { groupId: number @@ -356,15 +329,13 @@ export const updateTabGroup = defineTool({ .boolean() .optional() .describe('Whether to collapse (hide) the group tabs'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { groupId, title, color, collapsed, windowId } = request.params as { + const { groupId, title, color, collapsed } = request.params as { groupId: number title?: string color?: string collapsed?: boolean - windowId?: number } const result = await context.executeAction('updateTabGroup', { @@ -372,7 +343,6 @@ export const updateTabGroup = defineTool({ title, color, collapsed, - windowId, }) const data = result as { groupId: number @@ -403,17 +373,12 @@ export const ungroupTabs = defineTool({ tabIds: z .array(z.coerce.number()) .describe('Array of tab IDs to remove from their groups'), - windowId: z.number().optional().describe('Window ID for routing'), }, handler: async (request, response, context) => { - const { tabIds, windowId } = request.params as { - tabIds: number[] - windowId?: number - } + const { tabIds } = request.params as { tabIds: number[] } const result = await context.executeAction('ungroupTabs', { tabIds, - windowId, }) const data = result as { ungroupedCount: number } diff --git a/apps/server/src/tools/controller-based/types/response.ts b/apps/server/src/tools/controller-based/types/response.ts index db743e68..b8589066 100644 --- a/apps/server/src/tools/controller-based/types/response.ts +++ b/apps/server/src/tools/controller-based/types/response.ts @@ -40,4 +40,16 @@ export interface Response { * Add a key-value pair to structured content (flat, no nesting) */ addStructuredContent(key: string, value: unknown): void + + /** + * Request page content snapshot to be appended after tool execution. + * Only supported by ControllerResponse (no-op on other implementations). + */ + setIncludeSnapshot?(value: boolean): void + + /** + * Request screenshot to be appended after tool execution. + * Only supported by ControllerResponse (no-op on other implementations). + */ + setIncludeScreenshot?(value: boolean): void } diff --git a/apps/server/src/tools/registry.ts b/apps/server/src/tools/registry.ts index dd7152cf..ac189ca8 100644 --- a/apps/server/src/tools/registry.ts +++ b/apps/server/src/tools/registry.ts @@ -7,7 +7,6 @@ */ import type { McpContext } from '../browser/cdp/context' -import type { ControllerContext } from '../browser/extension/context' import { logger } from '../lib/logger' import { allCdpTools } from './cdp-based/registry' @@ -16,34 +15,14 @@ import type { ToolDefinition } from './types/tool-definition' export function createToolRegistry( cdpContext: McpContext | null, - controllerContext: ControllerContext, // biome-ignore lint/suspicious/noExplicitAny: heterogeneous tool registry requires any ): Array> { const cdpTools = cdpContext ? allCdpTools : [] - const wrappedControllerTools = wrapControllerTools( - allControllerTools, - controllerContext, - ) logger.info( - `Total tools available: ${cdpTools.length + wrappedControllerTools.length} ` + - `(${cdpTools.length} CDP + ${wrappedControllerTools.length} extension)`, + `Total tools available: ${cdpTools.length + allControllerTools.length} ` + + `(${cdpTools.length} CDP + ${allControllerTools.length} extension)`, ) - return [...cdpTools, ...wrappedControllerTools] -} - -function wrapControllerTools( - tools: typeof allControllerTools, - controllerContext: ControllerContext, - // biome-ignore lint/suspicious/noExplicitAny: wrapper function for heterogeneous tools -): Array> { - // biome-ignore lint/suspicious/noExplicitAny: tool has heterogeneous schema - return tools.map((tool: any) => ({ - ...tool, - // biome-ignore lint/suspicious/noExplicitAny: handler params are dynamically typed - handler: async (request: any, response: any, _context: any) => { - return tool.handler(request, response, controllerContext) - }, - })) + return [...cdpTools, ...allControllerTools] }