feat: new APIs for eval mode support (#250)

* feat: eval mode

* feat: eval mode
This commit is contained in:
shivammittal274
2026-01-21 02:36:58 +05:30
committed by GitHub
parent a4bae1f0c7
commit fd3cdace91
9 changed files with 236 additions and 4 deletions

View File

@@ -0,0 +1,33 @@
/**
* @license
* Copyright 2025 BrowserOS
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
import { z } from 'zod'
import { ActionHandler } from '../ActionHandler'
/** Field schema: a window ID must be a positive integer. */
const closeWindowIdField = z
  .number()
  .int()
  .positive()
  .describe('ID of the window to close')

/** Input accepted by the close-window action. */
const CloseWindowInputSchema = z.object({ windowId: closeWindowIdField })
type CloseWindowInput = z.infer<typeof CloseWindowInputSchema>

/** Shape of the value the close-window action resolves to. */
const CloseWindowOutputSchema = z.object({
  success: z.boolean().describe('Whether the window was successfully closed'),
})
type CloseWindowOutput = z.infer<typeof CloseWindowOutputSchema>
/**
 * Action handler that closes a browser window via `chrome.windows.remove`.
 */
export class CloseWindowAction extends ActionHandler<
  CloseWindowInput,
  CloseWindowOutput
> {
  readonly inputSchema = CloseWindowInputSchema

  /**
   * Closes the window identified by `input.windowId`.
   *
   * @param input - Object carrying the ID of the window to close.
   * @returns `{ success: true }` once `chrome.windows.remove` has resolved.
   *   Nothing is caught here, so a rejection from the Chrome API reaches the
   *   caller instead of being reported as `success: false`.
   */
  async execute(input: CloseWindowInput): Promise<CloseWindowOutput> {
    const { windowId } = input
    await chrome.windows.remove(windowId)

    const outcome: CloseWindowOutput = { success: true }
    return outcome
  }
}

View File

@@ -0,0 +1,68 @@
/**
* @license
* Copyright 2025 BrowserOS
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
import { z } from 'zod'
import { ActionHandler } from '../ActionHandler'
/** URL field; falls back to a blank page when omitted. */
const createWindowUrlField = z
  .string()
  .optional()
  .default('about:blank')
  .describe('URL to open in the new window')

/** Incognito flag; off unless requested. */
const createWindowIncognitoField = z
  .boolean()
  .optional()
  .default(false)
  .describe('Create an incognito window')

/** Focus flag; on unless explicitly disabled. */
const createWindowFocusedField = z
  .boolean()
  .optional()
  .default(true)
  .describe('Whether to focus the new window')

/** Input accepted by the create-window action. Every field has a default. */
const CreateWindowInputSchema = z.object({
  url: createWindowUrlField,
  incognito: createWindowIncognitoField,
  focused: createWindowFocusedField,
})
type CreateWindowInput = z.infer<typeof CreateWindowInputSchema>

/** Shape of the value the create-window action resolves to. */
const CreateWindowOutputSchema = z.object({
  windowId: z.number().describe('ID of the newly created window'),
  tabId: z.number().describe('ID of the first tab in the new window'),
})
type CreateWindowOutput = z.infer<typeof CreateWindowOutputSchema>
/**
 * Action handler that opens a new browser window via `chrome.windows.create`
 * and reports the IDs of the window and its first tab.
 */
export class CreateWindowAction extends ActionHandler<
  CreateWindowInput,
  CreateWindowOutput
> {
  readonly inputSchema = CreateWindowInputSchema

  /**
   * Creates a window from the given URL, focus and incognito settings.
   *
   * @param input - URL, focus flag and incognito flag for the new window.
   * @returns The new window's ID together with the ID of its first tab.
   * @throws Error if Chrome returns no window, if the window has no ID, or if
   *   the window has no first tab with an ID.
   */
  async execute(input: CreateWindowInput): Promise<CreateWindowOutput> {
    const { url, focused, incognito } = input
    const win = await chrome.windows.create({ url, focused, incognito })

    // Guard clauses run in a fixed order so the first missing piece is the
    // one reported.
    if (!win) {
      throw new Error('Failed to create window')
    }

    const { id: windowId, tabs } = win
    if (windowId === undefined) {
      throw new Error('Created window has no ID')
    }

    const firstTab = tabs?.[0]
    const tabId = firstTab?.id
    if (tabId === undefined) {
      throw new Error('Created window has no tab')
    }

    return { windowId, tabId }
  }
}

View File

@@ -12,6 +12,8 @@ import { CaptureScreenshotPointerAction } from '@/actions/browser/CaptureScreens
import { ClearAction } from '@/actions/browser/ClearAction' import { ClearAction } from '@/actions/browser/ClearAction'
import { ClickAction } from '@/actions/browser/ClickAction' import { ClickAction } from '@/actions/browser/ClickAction'
import { ClickCoordinatesAction } from '@/actions/browser/ClickCoordinatesAction' import { ClickCoordinatesAction } from '@/actions/browser/ClickCoordinatesAction'
import { CloseWindowAction } from '@/actions/browser/CloseWindowAction'
import { CreateWindowAction } from '@/actions/browser/CreateWindowAction'
import { ExecuteJavaScriptAction } from '@/actions/browser/ExecuteJavaScriptAction' import { ExecuteJavaScriptAction } from '@/actions/browser/ExecuteJavaScriptAction'
import { GetAccessibilityTreeAction } from '@/actions/browser/GetAccessibilityTreeAction' import { GetAccessibilityTreeAction } from '@/actions/browser/GetAccessibilityTreeAction'
import { GetInteractiveSnapshotAction } from '@/actions/browser/GetInteractiveSnapshotAction' import { GetInteractiveSnapshotAction } from '@/actions/browser/GetInteractiveSnapshotAction'
@@ -192,6 +194,9 @@ export class BrowserOSController {
this.actionRegistry.register('updateTabGroup', new UpdateTabGroupAction()) this.actionRegistry.register('updateTabGroup', new UpdateTabGroupAction())
this.actionRegistry.register('ungroupTabs', new UngroupTabsAction()) this.actionRegistry.register('ungroupTabs', new UngroupTabsAction())
this.actionRegistry.register('createWindow', new CreateWindowAction())
this.actionRegistry.register('closeWindow', new CloseWindowAction())
this.actionRegistry.register('getBookmarks', new GetBookmarksAction()) this.actionRegistry.register('getBookmarks', new GetBookmarksAction())
this.actionRegistry.register('createBookmark', new CreateBookmarkAction()) this.actionRegistry.register('createBookmark', new CreateBookmarkAction())
this.actionRegistry.register('removeBookmark', new RemoveBookmarkAction()) this.actionRegistry.register('removeBookmark', new RemoveBookmarkAction())

View File

@@ -23,6 +23,18 @@
".": { ".": {
"types": "./src/rpc.ts", "types": "./src/rpc.ts",
"import": "./src/rpc.ts" "import": "./src/rpc.ts"
},
"./agent": {
"types": "./src/agent/gemini-agent.ts",
"default": "./src/agent/gemini-agent.ts"
},
"./agent/types": {
"types": "./src/agent/types.ts",
"default": "./src/agent/types.ts"
},
"./lib/clients/gateway": {
"types": "./src/lib/clients/gateway.ts",
"default": "./src/lib/clients/gateway.ts"
} }
}, },
"dependencies": { "dependencies": {

View File

@@ -26,13 +26,29 @@ import type { HonoSSEStream } from './provider-adapter/types'
import { UIMessageStreamWriter } from './provider-adapter/ui-message-stream' import { UIMessageStreamWriter } from './provider-adapter/ui-message-stream'
import type { ResolvedAgentConfig } from './types' import type { ResolvedAgentConfig } from './types'
interface ToolExecutionResult { export interface ToolExecutionResult {
parts: Part[] parts: Part[]
isError: boolean isError: boolean
errorMessage?: string errorMessage?: string
} }
/**
 * Optional async callbacks for observing the agent's tool calls.
 * Either hook may be omitted.
 */
export interface ToolExecutionHooks {
/**
 * Awaited by the agent before a tool is executed.
 * Receives the tool's name, its raw arguments, and the browser context when
 * one is available.
 */
onBeforeToolCall?: (
toolName: string,
args: unknown,
browserContext?: BrowserContext,
) => Promise<void>
/**
 * Awaited by the agent after a tool has been executed.
 * Receives the tool's name, the execution result (response parts plus error
 * flag/message), and the browser context when one is available.
 */
onAfterToolCall?: (
toolName: string,
result: ToolExecutionResult,
browserContext?: BrowserContext,
) => Promise<void>
}
export class GeminiAgent { export class GeminiAgent {
private toolHooks?: ToolExecutionHooks
private constructor( private constructor(
private client: GeminiClient, private client: GeminiClient,
private geminiConfig: GeminiConfig, private geminiConfig: GeminiConfig,
@@ -40,6 +56,10 @@ export class GeminiAgent {
private conversationId: string, private conversationId: string,
) {} ) {}
/**
 * Stores the given tool-execution hooks on this agent, replacing any hooks
 * that were set earlier.
 *
 * @param hooks - Callbacks to run around tool calls.
 */
setToolHooks(hooks: ToolExecutionHooks): void {
this.toolHooks = hooks
}
/** /**
* Creates a GeminiAgent with pre-resolved config and MCP servers. * Creates a GeminiAgent with pre-resolved config and MCP servers.
* Config resolution and MCP building happens in ChatService (visible there). * Config resolution and MCP building happens in ChatService (visible there).
@@ -81,6 +101,7 @@ export class GeminiAgent {
// Build excluded tools list - always exclude save_memory and google_web_search // Build excluded tools list - always exclude save_memory and google_web_search
// Conditionally exclude screenshot tools if model doesn't support images // Conditionally exclude screenshot tools if model doesn't support images
// Exclude window management tools unless in eval mode
const excludedTools = ['save_memory', 'google_web_search'] const excludedTools = ['save_memory', 'google_web_search']
if (config.supportsImages === false) { if (config.supportsImages === false) {
excludedTools.push( excludedTools.push(
@@ -89,6 +110,9 @@ export class GeminiAgent {
) )
logger.info('Model does not support images, excluding screenshot tools') logger.info('Model does not support images, excluding screenshot tools')
} }
if (config.evalMode !== true) {
excludedTools.push('browser_create_window', 'browser_close_window')
}
const geminiConfig = new GeminiConfig({ const geminiConfig = new GeminiConfig({
sessionId: config.conversationId, sessionId: config.conversationId,
@@ -303,11 +327,24 @@ export class GeminiAgent {
for (const requestInfo of toolCallRequests) { for (const requestInfo of toolCallRequests) {
if (abortSignal.aborted) break if (abortSignal.aborted) break
await this.toolHooks?.onBeforeToolCall?.(
requestInfo.name,
requestInfo.args,
browserContext,
)
const result = await this.handleToolExecution( const result = await this.handleToolExecution(
requestInfo, requestInfo,
abortSignal, abortSignal,
browserContext, browserContext,
) )
await this.toolHooks?.onAfterToolCall?.(
requestInfo.name,
result,
browserContext,
)
toolResponseParts.push(...result.parts) toolResponseParts.push(...result.parts)
if (uiStream) { if (uiStream) {

View File

@@ -35,4 +35,6 @@ export interface ResolvedAgentConfig {
sessionExecutionDir: string sessionExecutionDir: string
/** Whether the model supports image inputs (vision). Defaults to true. */ /** Whether the model supports image inputs (vision). Defaults to true. */
supportsImages?: boolean supportsImages?: boolean
/** Eval mode - enables window management tools. Defaults to false. */
evalMode?: boolean
} }

View File

@@ -47,6 +47,8 @@ export {
ungroupTabs, ungroupTabs,
updateTabGroup, updateTabGroup,
} from './tools/tab-management' } from './tools/tab-management'
// Window Management
export { closeWindow, createWindow } from './tools/window-management'
// Types // Types
export type { Context } from './types/context' export type { Context } from './types/context'
export type { ImageContentData, Response } from './types/response' export type { ImageContentData, Response } from './types/response'
@@ -86,8 +88,9 @@ import {
ungroupTabs, ungroupTabs,
updateTabGroup, updateTabGroup,
} from './tools/tab-management' } from './tools/tab-management'
import { closeWindow, createWindow } from './tools/window-management'
// Array export for convenience (32 tools) // Array export for convenience (33 tools total)
export const allControllerTools = [ export const allControllerTools = [
getActiveTab, getActiveTab,
listTabs, listTabs,
@@ -121,4 +124,6 @@ export const allControllerTools = [
removeBookmark, removeBookmark,
searchHistory, searchHistory,
getRecentHistory, getRecentHistory,
createWindow,
closeWindow,
] ]

View File

@@ -0,0 +1,70 @@
/**
* @license
* Copyright 2025 BrowserOS
*/
import { z } from 'zod'
import { ToolCategories } from '../../types/tool-categories'
import { defineTool } from '../../types/tool-definition'
import type { Context } from '../types/context'
import type { Response } from '../types/response'
/**
 * Tool `browser_create_window`: asks the controller to open a new browser
 * window and reports the resulting window and tab IDs.
 *
 * Missing parameters are filled in before the action is dispatched: `url`
 * falls back to `about:blank` (also when it is an empty string), `incognito`
 * to `false`, and `focused` to `true`.
 */
export const createWindow = defineTool<z.ZodRawShape, Context, Response>({
  name: 'browser_create_window',
  description:
    'Create a new browser window. Returns the windowId and tabId of the created window.',
  annotations: {
    category: ToolCategories.TAB_MANAGEMENT,
    readOnlyHint: false,
  },
  schema: {
    url: z
      .string()
      .optional()
      .describe('URL to open in the new window. Defaults to about:blank'),
    incognito: z.boolean().optional().describe('Create an incognito window'),
    focused: z
      .boolean()
      .optional()
      .describe('Whether to focus the new window. Defaults to true'),
  },
  handler: async (request, response, context) => {
    const { url, incognito, focused } = request.params as {
      url?: string
      incognito?: boolean
      focused?: boolean
    }

    // Resolve defaults up front so the dispatched payload is always complete.
    const actionInput = {
      url: url || 'about:blank',
      incognito: incognito || false,
      focused: focused ?? true,
    }

    const { windowId, tabId } = (await context.executeAction(
      'createWindow',
      actionInput,
    )) as { windowId: number; tabId: number }

    response.appendResponseLine(`Created window ${windowId}`)
    response.appendResponseLine(`Tab ID: ${tabId}`)
    response.addStructuredContent('windowId', windowId)
    response.addStructuredContent('tabId', tabId)
  },
})
/**
 * Tool `browser_close_window`: asks the controller to close the window with
 * the given ID and confirms the closure in the response.
 *
 * The `windowId` parameter is coerced to a number and must be a positive
 * integer. A bare `z.coerce.number()` would turn `null`, `''` and `false`
 * into `0` and let fractional or negative values through. The
 * controller-side close-window input schema requires a positive integer, so
 * such values are rejected here rather than after dispatching the action.
 */
export const closeWindow = defineTool<z.ZodRawShape, Context, Response>({
  name: 'browser_close_window',
  description: 'Close a browser window by its windowId.',
  annotations: {
    category: ToolCategories.TAB_MANAGEMENT,
    readOnlyHint: false,
  },
  schema: {
    windowId: z.coerce
      .number()
      .int()
      .positive()
      .describe('The ID of the window to close'),
  },
  handler: async (request, response, context) => {
    const { windowId } = request.params as { windowId: number }
    // A failure in the action propagates; the confirmation line is only
    // written when the close succeeded.
    await context.executeAction('closeWindow', { windowId })
    response.appendResponseLine(`Closed window ${windowId}`)
  },
})

View File

@@ -124,7 +124,7 @@
}, },
"apps/server": { "apps/server": {
"name": "@browseros/server", "name": "@browseros/server",
"version": "0.0.42", "version": "0.0.44",
"bin": { "bin": {
"browseros-server": "./src/index.ts", "browseros-server": "./src/index.ts",
}, },
@@ -175,7 +175,7 @@
}, },
"packages/agent-sdk": { "packages/agent-sdk": {
"name": "@browseros-ai/agent-sdk", "name": "@browseros-ai/agent-sdk",
"version": "0.0.4", "version": "0.0.5",
"dependencies": { "dependencies": {
"eventsource-parser": "^3.0.6", "eventsource-parser": "^3.0.6",
"zod-to-json-schema": "^3.24.1", "zod-to-json-schema": "^3.24.1",