feat: new APIs for eval mode support (#250)

* feat: eval mode

* feat: eval mode
This commit is contained in:
shivammittal274
2026-01-21 02:36:58 +05:30
committed by GitHub
parent a4bae1f0c7
commit fd3cdace91
9 changed files with 236 additions and 4 deletions

View File

@@ -0,0 +1,33 @@
/**
* @license
* Copyright 2025 BrowserOS
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
import { z } from 'zod'
import { ActionHandler } from '../ActionHandler'
// Input accepted by the close-window action: the ID of the window to close,
// validated as a positive integer.
const CloseWindowInputSchema = z.object({
windowId: z.number().int().positive().describe('ID of the window to close'),
})
// Result reported by the close-window action: a single success flag.
const CloseWindowOutputSchema = z.object({
success: z.boolean().describe('Whether the window was successfully closed'),
})
// Static types inferred from the schemas above so the handler signature and
// the runtime validation cannot drift apart.
type CloseWindowInput = z.infer<typeof CloseWindowInputSchema>
type CloseWindowOutput = z.infer<typeof CloseWindowOutputSchema>
/**
 * Action handler that closes a browser window by ID.
 *
 * The accepted input shape is described by `CloseWindowInputSchema`, which is
 * exposed through `inputSchema`.
 */
export class CloseWindowAction extends ActionHandler<
  CloseWindowInput,
  CloseWindowOutput
> {
  readonly inputSchema = CloseWindowInputSchema

  /**
   * Closes the window identified by `input.windowId`.
   *
   * @param input - Object carrying the ID of the window to close.
   * @returns `{ success: true }` once `chrome.windows.remove` has resolved.
   *   A rejection from that call propagates to the caller unchanged.
   */
  async execute(input: CloseWindowInput): Promise<CloseWindowOutput> {
    const targetWindowId = input.windowId
    await chrome.windows.remove(targetWindowId)

    const outcome: CloseWindowOutput = { success: true }
    return outcome
  }
}

View File

@@ -0,0 +1,68 @@
/**
* @license
* Copyright 2025 BrowserOS
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
import { z } from 'zod'
import { ActionHandler } from '../ActionHandler'
// Input accepted by the create-window action. Every field is optional and
// falls back to the default declared on it.
const CreateWindowInputSchema = z.object({
// Address loaded in the new window; 'about:blank' when omitted.
url: z
.string()
.optional()
.default('about:blank')
.describe('URL to open in the new window'),
// Open the window in incognito mode; false when omitted.
incognito: z
.boolean()
.optional()
.default(false)
.describe('Create an incognito window'),
// Give the new window focus; true when omitted.
focused: z
.boolean()
.optional()
.default(true)
.describe('Whether to focus the new window'),
})
// Result reported by the create-window action: the new window's ID and the ID
// of its first tab.
const CreateWindowOutputSchema = z.object({
windowId: z.number().describe('ID of the newly created window'),
tabId: z.number().describe('ID of the first tab in the new window'),
})
// Static types inferred from the schemas above.
type CreateWindowInput = z.infer<typeof CreateWindowInputSchema>
type CreateWindowOutput = z.infer<typeof CreateWindowOutputSchema>
/**
 * Action handler that opens a new browser window and reports the IDs of the
 * window and of its first tab.
 *
 * The accepted input shape is described by `CreateWindowInputSchema`, which is
 * exposed through `inputSchema`.
 */
export class CreateWindowAction extends ActionHandler<
  CreateWindowInput,
  CreateWindowOutput
> {
  readonly inputSchema = CreateWindowInputSchema

  /**
   * Creates a window via `chrome.windows.create`.
   *
   * @param input - URL, focus flag and incognito flag for the new window.
   * @returns The new window's ID and the ID of its first tab.
   * @throws Error if no window object is returned, if the window has no ID,
   *   or if its first tab has no ID.
   */
  async execute(input: CreateWindowInput): Promise<CreateWindowOutput> {
    const options: chrome.windows.CreateData = {
      url: input.url,
      focused: input.focused,
      incognito: input.incognito,
    }

    const win = await chrome.windows.create(options)
    if (!win) throw new Error('Failed to create window')

    const windowId = win.id
    if (windowId === undefined) throw new Error('Created window has no ID')

    // Only the first tab of the new window is reported back.
    const firstTab = win.tabs?.[0]
    const tabId = firstTab?.id
    if (tabId === undefined) throw new Error('Created window has no tab')

    return { windowId, tabId }
  }
}

View File

@@ -12,6 +12,8 @@ import { CaptureScreenshotPointerAction } from '@/actions/browser/CaptureScreens
import { ClearAction } from '@/actions/browser/ClearAction'
import { ClickAction } from '@/actions/browser/ClickAction'
import { ClickCoordinatesAction } from '@/actions/browser/ClickCoordinatesAction'
import { CloseWindowAction } from '@/actions/browser/CloseWindowAction'
import { CreateWindowAction } from '@/actions/browser/CreateWindowAction'
import { ExecuteJavaScriptAction } from '@/actions/browser/ExecuteJavaScriptAction'
import { GetAccessibilityTreeAction } from '@/actions/browser/GetAccessibilityTreeAction'
import { GetInteractiveSnapshotAction } from '@/actions/browser/GetInteractiveSnapshotAction'
@@ -192,6 +194,9 @@ export class BrowserOSController {
this.actionRegistry.register('updateTabGroup', new UpdateTabGroupAction())
this.actionRegistry.register('ungroupTabs', new UngroupTabsAction())
this.actionRegistry.register('createWindow', new CreateWindowAction())
this.actionRegistry.register('closeWindow', new CloseWindowAction())
this.actionRegistry.register('getBookmarks', new GetBookmarksAction())
this.actionRegistry.register('createBookmark', new CreateBookmarkAction())
this.actionRegistry.register('removeBookmark', new RemoveBookmarkAction())

View File

@@ -23,6 +23,18 @@
".": {
"types": "./src/rpc.ts",
"import": "./src/rpc.ts"
},
"./agent": {
"types": "./src/agent/gemini-agent.ts",
"default": "./src/agent/gemini-agent.ts"
},
"./agent/types": {
"types": "./src/agent/types.ts",
"default": "./src/agent/types.ts"
},
"./lib/clients/gateway": {
"types": "./src/lib/clients/gateway.ts",
"default": "./src/lib/clients/gateway.ts"
}
},
"dependencies": {

View File

@@ -26,13 +26,29 @@ import type { HonoSSEStream } from './provider-adapter/types'
import { UIMessageStreamWriter } from './provider-adapter/ui-message-stream'
import type { ResolvedAgentConfig } from './types'
interface ToolExecutionResult {
/**
* Outcome of executing a single tool call. This is the result type handed to
* `ToolExecutionHooks.onAfterToolCall`.
*/
export interface ToolExecutionResult {
// Parts produced by the tool call; the agent's tool loop appends these to the
// tool response parts it is collecting.
parts: Part[]
// NOTE(review): presumably true when the tool call failed — confirm against
// handleToolExecution.
isError: boolean
// NOTE(review): presumably only populated alongside isError — confirm.
errorMessage?: string
}
/**
* Optional callbacks around each tool call made by the agent, installed with
* `GeminiAgent.setToolHooks`.
*
* Both hooks are awaited by the tool loop, so a slow hook delays the tool call
* (or the handling of its result) by the same amount.
*/
export interface ToolExecutionHooks {
// Awaited immediately before a tool call is executed, with the requested tool
// name and its arguments.
onBeforeToolCall?: (
toolName: string,
args: unknown,
browserContext?: BrowserContext,
) => Promise<void>
// Awaited immediately after a tool call has produced its result, before that
// result's parts are collected.
onAfterToolCall?: (
toolName: string,
result: ToolExecutionResult,
browserContext?: BrowserContext,
) => Promise<void>
}
export class GeminiAgent {
private toolHooks?: ToolExecutionHooks
private constructor(
private client: GeminiClient,
private geminiConfig: GeminiConfig,
@@ -40,6 +56,10 @@ export class GeminiAgent {
private conversationId: string,
) {}
setToolHooks(hooks: ToolExecutionHooks): void {
this.toolHooks = hooks
}
/**
* Creates a GeminiAgent with pre-resolved config and MCP servers.
* Config resolution and MCP building happens in ChatService (visible there).
@@ -81,6 +101,7 @@ export class GeminiAgent {
// Build excluded tools list - always exclude save_memory and google_web_search
// Conditionally exclude screenshot tools if model doesn't support images
// Exclude window management tools unless in eval mode
const excludedTools = ['save_memory', 'google_web_search']
if (config.supportsImages === false) {
excludedTools.push(
@@ -89,6 +110,9 @@ export class GeminiAgent {
)
logger.info('Model does not support images, excluding screenshot tools')
}
if (config.evalMode !== true) {
excludedTools.push('browser_create_window', 'browser_close_window')
}
const geminiConfig = new GeminiConfig({
sessionId: config.conversationId,
@@ -303,11 +327,24 @@ export class GeminiAgent {
for (const requestInfo of toolCallRequests) {
if (abortSignal.aborted) break
await this.toolHooks?.onBeforeToolCall?.(
requestInfo.name,
requestInfo.args,
browserContext,
)
const result = await this.handleToolExecution(
requestInfo,
abortSignal,
browserContext,
)
await this.toolHooks?.onAfterToolCall?.(
requestInfo.name,
result,
browserContext,
)
toolResponseParts.push(...result.parts)
if (uiStream) {

View File

@@ -35,4 +35,6 @@ export interface ResolvedAgentConfig {
sessionExecutionDir: string
/** Whether the model supports image inputs (vision). Defaults to true. */
supportsImages?: boolean
/** Eval mode - enables window management tools. Defaults to false. */
evalMode?: boolean
}

View File

@@ -47,6 +47,8 @@ export {
ungroupTabs,
updateTabGroup,
} from './tools/tab-management'
// Window Management
export { closeWindow, createWindow } from './tools/window-management'
// Types
export type { Context } from './types/context'
export type { ImageContentData, Response } from './types/response'
@@ -86,8 +88,9 @@ import {
ungroupTabs,
updateTabGroup,
} from './tools/tab-management'
import { closeWindow, createWindow } from './tools/window-management'
// Array export for convenience (32 tools)
// Array export for convenience (33 tools total)
export const allControllerTools = [
getActiveTab,
listTabs,
@@ -121,4 +124,6 @@ export const allControllerTools = [
removeBookmark,
searchHistory,
getRecentHistory,
createWindow,
closeWindow,
]

View File

@@ -0,0 +1,70 @@
/**
* @license
* Copyright 2025 BrowserOS
*/
import { z } from 'zod'
import { ToolCategories } from '../../types/tool-categories'
import { defineTool } from '../../types/tool-definition'
import type { Context } from '../types/context'
import type { Response } from '../types/response'
/**
 * Tool definition for `browser_create_window`.
 *
 * Forwards the request to the `createWindow` action and reports the resulting
 * window and tab IDs both as response lines and as structured content.
 */
export const createWindow = defineTool<z.ZodRawShape, Context, Response>({
  name: 'browser_create_window',
  description:
    'Create a new browser window. Returns the windowId and tabId of the created window.',
  annotations: {
    category: ToolCategories.TAB_MANAGEMENT,
    readOnlyHint: false,
  },
  schema: {
    url: z
      .string()
      .optional()
      .describe('URL to open in the new window. Defaults to about:blank'),
    incognito: z.boolean().optional().describe('Create an incognito window'),
    focused: z
      .boolean()
      .optional()
      .describe('Whether to focus the new window. Defaults to true'),
  },
  handler: async (request, response, context) => {
    const args = request.params as {
      url?: string
      incognito?: boolean
      focused?: boolean
    }

    // Fill in defaults here because every schema field is optional.
    const actionInput = {
      url: args.url || 'about:blank',
      incognito: args.incognito || false,
      focused: args.focused ?? true,
    }

    const created = (await context.executeAction(
      'createWindow',
      actionInput,
    )) as { windowId: number; tabId: number }

    // Human-readable lines first, then the same values as structured content.
    response.appendResponseLine(`Created window ${created.windowId}`)
    response.appendResponseLine(`Tab ID: ${created.tabId}`)
    response.addStructuredContent('windowId', created.windowId)
    response.addStructuredContent('tabId', created.tabId)
  },
})
/**
 * Tool definition for `browser_close_window`.
 *
 * Forwards the request to the `closeWindow` action and appends a confirmation
 * line to the response once the action resolves.
 */
export const closeWindow = defineTool<z.ZodRawShape, Context, Response>({
  name: 'browser_close_window',
  description: 'Close a browser window by its windowId.',
  annotations: {
    category: ToolCategories.TAB_MANAGEMENT,
    readOnlyHint: false,
  },
  schema: {
    // Coercion accepts numeric strings, but on its own it also lets through
    // fractional, zero and negative values (e.g. "" coerces to 0). Require a
    // positive integer, matching the closeWindow action's own input schema, so
    // invalid IDs are rejected here rather than after dispatch.
    windowId: z.coerce
      .number()
      .int()
      .positive()
      .describe('The ID of the window to close'),
  },
  handler: async (request, response, context) => {
    const { windowId } = request.params as { windowId: number }
    await context.executeAction('closeWindow', { windowId })
    response.appendResponseLine(`Closed window ${windowId}`)
  },
})

View File

@@ -124,7 +124,7 @@
},
"apps/server": {
"name": "@browseros/server",
"version": "0.0.42",
"version": "0.0.44",
"bin": {
"browseros-server": "./src/index.ts",
},
@@ -175,7 +175,7 @@
},
"packages/agent-sdk": {
"name": "@browseros-ai/agent-sdk",
"version": "0.0.4",
"version": "0.0.5",
"dependencies": {
"eventsource-parser": "^3.0.6",
"zod-to-json-schema": "^3.24.1",