Compare commits

...

2 Commits

Author SHA1 Message Date
Nikhil Sonti
dd73a4f5e1 fix(eval): address executor backend review comments 2026-04-29 17:41:05 -07:00
Nikhil Sonti
f2f8f5cbd2 refactor(eval): split orchestrated executor backends 2026-04-29 17:28:47 -07:00
10 changed files with 321 additions and 351 deletions

View File

@@ -1,22 +1,27 @@
import { randomUUID } from 'node:crypto'
import { MAX_ACTIONS_PER_DELEGATION } from '../../constants'
import { McpClient, type McpToolResult } from '../../utils/mcp-client'
import { sleep } from '../../utils/sleep'
import { MAX_ACTIONS_PER_DELEGATION } from '../../../../constants'
import { McpClient, type McpToolResult } from '../../../../utils/mcp-client'
import { sleep } from '../../../../utils/sleep'
import type {
ExecutorConfig,
ExecutorResult,
} from '../../../orchestrator-executor/types'
import type { ExecutorCallbacks } from '../../executor-backend'
import {
extractCladoThinking,
formatCladoHistory,
getCladoActionSignature,
parseCladoActions,
summarizeCladoPrediction,
} from '../orchestrated/backends/clado/clado-actions'
} from './clado-actions'
import {
normalizeCladoDirection,
normalizeCladoPressKey,
normalizeCladoScrollAmount,
prepareCladoToolArgs,
resolveCladoPoint,
} from '../orchestrated/backends/clado/clado-browser-driver'
import { CladoActionClient } from '../orchestrated/backends/clado/clado-client'
} from './clado-browser-driver'
import { CladoActionClient } from './clado-client'
import {
CLADO_ACTION_PROVIDER,
type CladoAction,
@@ -24,9 +29,7 @@ import {
type CladoActionResponse,
type CladoViewport,
isCladoActionProvider,
} from '../orchestrated/backends/clado/types'
import type { ExecutorCallbacks } from './executor'
import type { ExecutorConfig, ExecutorResult } from './types'
} from './types'
const MAX_CONSECUTIVE_PARSE_FAILURES = 3
@@ -45,10 +48,8 @@ export class CladoActionExecutor {
private currentUrl = ''
constructor(
private readonly config: ExecutorConfig,
config: ExecutorConfig,
serverUrl: string,
readonly _windowId?: number,
readonly _tabId?: number,
initialPageId?: number,
) {
if (!isCladoActionProvider(config.provider)) {

View File

@@ -0,0 +1,56 @@
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import type {
DelegationResult,
ExecutorBackend,
ExecutorCallbacks,
} from '../../executor-backend'
import { CladoActionExecutor } from './clado-action-executor'
/** Construction options for CladoExecutorBackend. */
export interface CladoExecutorBackendOptions {
// Source of the provider, model, apiKey and baseUrl copied into the Clado executor config.
configTemplate: ResolvedAgentConfig
// Passed as the second CladoActionExecutor constructor argument.
// NOTE(review): presumably the BrowserOS server/MCP endpoint — confirm against CladoActionExecutor.
serverUrl: string
// Optional page id passed as the third CladoActionExecutor constructor argument.
initialPageId?: number
// Optional callbacks installed through setCallbacks(); an empty object is used when omitted.
callbacks?: ExecutorCallbacks
}
/**
 * Executor backend that hands delegated goals to the Clado visual action model.
 *
 * The wrapped CladoActionExecutor is built on the first `execute()` call and
 * reused by every later call on the same backend instance.
 */
export class CladoExecutorBackend implements ExecutorBackend {
  readonly kind = 'clado'
  private executor: CladoActionExecutor | null = null

  constructor(private readonly options: CladoExecutorBackendOptions) {}

  /** Runs one delegated instruction, creating the executor on first use. */
  async execute(
    instruction: string,
    signal?: AbortSignal,
  ): Promise<DelegationResult> {
    return await this.getExecutor().execute(instruction, signal)
  }

  /** Closes the wrapped executor if one was ever created. */
  async close(): Promise<void> {
    await this.executor?.close()
  }

  /** Total steps reported by the wrapped executor, or 0 before first use. */
  getTotalSteps(): number {
    const steps = this.executor?.getTotalSteps()
    return steps ?? 0
  }

  /** Returns the cached executor, building and wiring it when absent. */
  private getExecutor(): CladoActionExecutor {
    const existing = this.executor
    if (existing) return existing

    const { configTemplate, serverUrl, initialPageId, callbacks } =
      this.options
    const created = new CladoActionExecutor(
      {
        provider: configTemplate.provider,
        model: configTemplate.model,
        apiKey: configTemplate.apiKey ?? '',
        baseUrl: configTemplate.baseUrl,
      },
      serverUrl,
      initialPageId,
    )
    // Cache before wiring callbacks so the instance is retained either way.
    this.executor = created
    created.setCallbacks(callbacks ?? {})
    return created
  }
}

View File

@@ -1,8 +1,13 @@
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import type { Browser } from '@browseros/server/browser'
import type { ExecutorCallbacks } from '../../orchestrator-executor/executor'
import type { ExecutorBackend, ExecutorBackendKind } from '../executor-backend'
import { ExecutorAdapterBackend } from './tool-loop-backend'
import type {
ExecutorBackend,
ExecutorBackendKind,
ExecutorCallbacks,
} from '../executor-backend'
import { CladoExecutorBackend } from './clado/clado-executor-backend'
import { isCladoActionProvider } from './clado/types'
import { ToolLoopExecutorBackend } from './tool-loop/tool-loop-executor-backend'
export interface CreateExecutorBackendOptions {
backendKind?: ExecutorBackendKind
@@ -18,28 +23,38 @@ export interface CreateExecutorBackendOptions {
}
export function backendKindForProvider(provider: string): ExecutorBackendKind {
return provider === 'clado-action' ? 'clado' : 'tool-loop'
return isCladoActionProvider(provider) ? 'clado' : 'tool-loop'
}
/** Creates the backend used for one orchestrator delegation. */
export function createExecutorBackend(
options: CreateExecutorBackendOptions,
): ExecutorBackend {
if (options.executor) return options.executor
const kind =
options.backendKind ??
backendKindForProvider(
options.provider ?? options.configTemplate?.provider ?? '',
)
return new ExecutorAdapterBackend({
kind,
configTemplate: options.configTemplate,
browser: options.browser,
serverUrl: options.serverUrl,
windowId: options.windowId,
tabId: options.tabId,
initialPageId: options.initialPageId,
if (kind === 'clado') {
return new CladoExecutorBackend({
configTemplate: required(options.configTemplate, 'configTemplate'),
serverUrl: required(options.serverUrl, 'serverUrl'),
initialPageId: options.initialPageId,
callbacks: options.callbacks,
})
}
return new ToolLoopExecutorBackend({
configTemplate: required(options.configTemplate, 'configTemplate'),
browser: options.browser ?? null,
callbacks: options.callbacks,
executor: options.executor,
})
}
/**
 * Unwraps an optional value.
 *
 * @param value - The value to check.
 * @param name - Option name used in the error message.
 * @returns `value` when it is anything other than `undefined`.
 * @throws Error `"<name> is required"` when `value` is `undefined`.
 */
function required<T>(value: T | undefined, name: string): T {
  if (value !== undefined) return value
  throw new Error(`${name} is required`)
}

View File

@@ -1,72 +0,0 @@
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import type { Browser } from '@browseros/server/browser'
import {
Executor,
type ExecutorCallbacks,
} from '../../orchestrator-executor/executor'
import type {
DelegationResult,
ExecutorBackend,
ExecutorBackendKind,
} from '../executor-backend'
/** Minimal executor surface that ExecutorAdapterBackend delegates to. */
interface ExecutorRunner {
// Runs one instruction; the optional signal is forwarded unchanged by the adapter.
execute(instruction: string, signal?: AbortSignal): Promise<DelegationResult>
// Called by the adapter's close().
close(): Promise<void>
// Called by the adapter's getTotalSteps().
getTotalSteps(): number
}
/** Construction options for ExecutorAdapterBackend. */
export interface ExecutorAdapterBackendOptions {
// Exposed as the backend's `kind`; 'clado' also sets isCladoAction on the constructed Executor.
kind: ExecutorBackendKind
// Required (throws "configTemplate is required") unless `executor` is supplied.
configTemplate?: ResolvedAgentConfig
// Forwarded to Executor; null is used when omitted.
browser?: Browser | null
// Required (throws "serverUrl is required") unless `executor` is supplied.
serverUrl?: string
// windowId, tabId, initialPageId and callbacks are forwarded unchanged in the Executor options object.
windowId?: number
tabId?: number
initialPageId?: number
callbacks?: ExecutorCallbacks
// Pre-built runner; when present it is used as-is and no Executor is constructed.
executor?: ExecutorRunner
}
/**
 * Adapts an executor runner to the ExecutorBackend interface.
 *
 * Uses the injected `options.executor` when provided; otherwise constructs an
 * Executor from the remaining options.
 */
export class ExecutorAdapterBackend implements ExecutorBackend {
  readonly kind: ExecutorBackendKind
  private readonly executor: ExecutorRunner

  constructor(options: ExecutorAdapterBackendOptions) {
    this.kind = options.kind
    this.executor =
      options.executor ?? ExecutorAdapterBackend.buildExecutor(options)
  }

  /** Forwards the instruction and abort signal to the wrapped runner. */
  execute(
    instruction: string,
    signal?: AbortSignal,
  ): Promise<DelegationResult> {
    const runner = this.executor
    return runner.execute(instruction, signal)
  }

  /** Forwards to the wrapped runner's close(). */
  close(): Promise<void> {
    const runner = this.executor
    return runner.close()
  }

  /** Forwards to the wrapped runner's step counter. */
  getTotalSteps(): number {
    const runner = this.executor
    return runner.getTotalSteps()
  }

  /** Builds the default Executor; configTemplate and serverUrl must be set. */
  private static buildExecutor(
    options: ExecutorAdapterBackendOptions,
  ): ExecutorRunner {
    const configTemplate = required(options.configTemplate, 'configTemplate')
    const browser = options.browser ?? null
    const serverUrl = required(options.serverUrl, 'serverUrl')
    return new Executor(configTemplate, browser, serverUrl, {
      isCladoAction: options.kind === 'clado',
      windowId: options.windowId,
      tabId: options.tabId,
      initialPageId: options.initialPageId,
      callbacks: options.callbacks,
    })
  }
}
/**
 * Guards an optional option value.
 *
 * @param value - Candidate value; only `undefined` counts as missing.
 * @param name - Label used to build the error message.
 * @returns The same `value`, typed without `undefined`.
 * @throws Error `"<name> is required"` when `value` is `undefined`.
 */
function required<T>(value: T | undefined, name: string): T {
  const missing = value === undefined
  if (missing) {
    throw new Error(`${name} is required`)
  }
  return value as T
}

View File

@@ -0,0 +1,144 @@
import { randomUUID } from 'node:crypto'
import { AiSdkAgent } from '@browseros/server/agent/tool-loop'
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import type { Browser } from '@browseros/server/browser'
import { registry } from '@browseros/server/tools/registry'
import type { BrowserContext } from '@browseros/shared/schemas/browser-context'
import type {
DelegationResult,
ExecutorBackend,
ExecutorCallbacks,
} from '../../executor-backend'
import { TOOL_LOOP_EXECUTOR_SYSTEM_PROMPT } from './tool-loop-executor-prompt'
/** Construction options for ToolLoopExecutorBackend. */
export interface ToolLoopExecutorBackendOptions {
// Base agent config; spread into a per-call config that overrides conversationId,
// userSystemPrompt, evalMode and workingDir.
configTemplate: ResolvedAgentConfig
// Browser the agent drives; execute() throws when this is null.
browser: Browser | null
// Optional hooks invoked on tool-call start, tool-call finish and step finish.
callbacks?: ExecutorCallbacks
}
/**
 * Executes delegated goals through the BrowserOS ToolLoopAgent.
 *
 * A fresh AiSdkAgent is created for every `execute()` call and disposed when
 * the call ends. Only the step counter and the last URL seen in a tool input
 * persist across calls on the same backend instance.
 */
export class ToolLoopExecutorBackend implements ExecutorBackend {
  readonly kind = 'tool-loop'
  private stepsUsed = 0
  private currentUrl = ''

  constructor(private readonly options: ToolLoopExecutorBackendOptions) {}

  /**
   * Runs one delegated instruction to completion.
   *
   * Agent failures never reject: they are reported through `status`
   * ('timeout' when the signal is aborted, otherwise 'blocked'). When the
   * model produced no text, the failure reason becomes the observation so the
   * caller is not told the run "completed".
   *
   * @param instruction - Goal-level instruction used as the agent prompt.
   * @param signal - Optional abort signal forwarded to the agent.
   * @returns The observation text, final status, last known URL, number of
   *   tool calls finished during this call, and the distinct tool names used.
   * @throws Error when no browser instance was supplied in the options.
   */
  async execute(
    instruction: string,
    signal?: AbortSignal,
  ): Promise<DelegationResult> {
    const browser = this.options.browser
    if (!browser) {
      throw new Error('Browser instance is required for tool-loop executor')
    }

    const stepsAtStart = this.stepsUsed
    const toolsUsed: string[] = []
    let status: DelegationResult['status'] = 'done'
    let resultText = ''
    let failureReason = ''

    const conversationId = randomUUID()
    const agentConfig: ResolvedAgentConfig = {
      ...this.options.configTemplate,
      conversationId,
      userSystemPrompt: TOOL_LOOP_EXECUTOR_SYSTEM_PROMPT,
      evalMode: true,
      workingDir: `/tmp/browseros-eval-executor-${conversationId}`,
    }
    const browserContext = await this.browserContext(browser)

    let agent: AiSdkAgent | null = null
    try {
      agent = await AiSdkAgent.create({
        resolvedConfig: agentConfig,
        browser,
        registry,
        browserContext,
      })
      await agent.toolLoopAgent.generate({
        prompt: instruction,
        abortSignal: signal,
        experimental_onToolCallStart: ({ toolCall }) => {
          // Track the most recent URL any tool was asked to act on.
          const input = toolCall.input as Record<string, unknown> | undefined
          if (input && typeof input.url === 'string' && input.url.length > 0) {
            this.currentUrl = input.url
          }
          this.options.callbacks?.onToolCallStart?.({
            toolCallId: toolCall.toolCallId,
            toolName: toolCall.toolName,
            input: toolCall.input,
          })
        },
        experimental_onToolCallFinish: async () => {
          this.stepsUsed++
          await this.options.callbacks?.onToolCallFinish?.()
        },
        onStepFinish: async ({ toolCalls, toolResults, text }) => {
          if (toolCalls) {
            for (const toolCall of toolCalls) {
              if (!toolsUsed.includes(toolCall.toolName)) {
                toolsUsed.push(toolCall.toolName)
              }
            }
          }
          // Keep only the latest non-empty text as the observation.
          if (text) resultText = text
          await this.options.callbacks?.onStepFinish?.({
            toolCalls,
            toolResults,
            text,
          })
        },
      })
    } catch (error: unknown) {
      status = signal?.aborted ? 'timeout' : 'blocked'
      failureReason = ToolLoopExecutorBackend.describeFailure(error)
    } finally {
      // Disposal is best-effort; a dispose failure must not mask the result.
      if (agent) await agent.dispose().catch(() => {})
    }

    // generate() may resolve normally even though the signal fired.
    if (status === 'done' && signal?.aborted) {
      status = 'timeout'
    }

    const fallbackObservation = failureReason
      ? `Execution ${status === 'timeout' ? 'timed out' : 'failed'}: ${failureReason}`
      : 'Execution completed with no actions taken.'

    return {
      observation: resultText || fallbackObservation,
      status,
      url: this.currentUrl,
      actionsPerformed: this.stepsUsed - stepsAtStart,
      toolsUsed,
    }
  }

  async close(): Promise<void> {
    // No persistent resources; AiSdkAgent is disposed at the end of each execute() call.
  }

  /** Number of tool calls finished across all execute() calls so far. */
  getTotalSteps(): number {
    return this.stepsUsed
  }

  /** Builds the browser context from the first page returned by listPages(), if any. */
  private async browserContext(
    browser: Browser,
  ): Promise<BrowserContext | undefined> {
    const pages = await browser.listPages()
    const activePage = pages[0]
    if (!activePage) return undefined
    return {
      activeTab: {
        id: activePage.tabId,
        pageId: activePage.pageId,
        url: activePage.url,
        title: activePage.title,
      },
    }
  }

  /** Turns a caught value into a short human-readable reason. */
  private static describeFailure(error: unknown): string {
    if (error instanceof Error) return error.message || error.name
    return String(error)
  }
}

View File

@@ -0,0 +1,21 @@
/**
 * System prompt for the tool-loop executor agent.
 *
 * The text is sent to the model verbatim, so wording changes alter agent behavior.
 */
export const TOOL_LOOP_EXECUTOR_SYSTEM_PROMPT = `You are a browser executor. You receive a single goal-level instruction and execute it using browser tools.
## Your Job
1. Execute browser actions to achieve the given goal
2. Stop as soon as the goal is accomplished -- do NOT perform extra actions
3. Write a final observation describing the result
## Final Response Format
When done, your response MUST include:
- What you accomplished (or what went wrong)
- What the page currently shows: key headings, links, data, or content visible
- The current URL from the address bar
- If you got stuck, what is blocking progress
## Rules
- Only do what was asked. Do not navigate away, open extra tabs, or reorganize the browser.
- If the goal is to navigate somewhere, confirm you arrived by describing what you see.
- If the goal is to click something, confirm the result of the click.
- If you cannot find what was asked for, say so clearly -- do not guess or improvise.
- Prefer browser_navigate over browser_open_tab for going to URLs.
- Do NOT call browser_group_tabs or other organizational tools.`

View File

@@ -3,6 +3,28 @@ import type { ExecutorResult } from '../orchestrator-executor/types'
export type ExecutorBackendKind = 'tool-loop' | 'clado'
export type DelegationResult = ExecutorResult
/** Description of a single tool call as reported to executor callbacks. */
export interface ToolCallInfo {
// Identifier of the call.
toolCallId: string
// Name of the tool being invoked.
toolName: string
// Raw tool input; untyped, so consumers must narrow before use.
input: unknown
}
/** Description of a single tool result as reported to executor callbacks. */
export interface ToolResultInfo {
// Identifier of the call. NOTE(review): presumably matches ToolCallInfo.toolCallId — confirm.
toolCallId: string
// Name of the tool that produced the output.
toolName: string
// Raw tool output; untyped, so consumers must narrow before use.
output: unknown
}
/** Optional hooks an executor backend invokes while running an instruction. */
export interface ExecutorCallbacks {
// Synchronous hook receiving the tool call's id, name and input.
onToolCallStart?: (toolCall: ToolCallInfo) => void
// Async hook taking no arguments; returns a promise for the caller to await.
onToolCallFinish?: () => Promise<void>
// Async hook receiving a step's tool calls, tool results and text, each optional.
onStepFinish?: (step: {
toolCalls?: ReadonlyArray<ToolCallInfo>
toolResults?: ReadonlyArray<ToolResultInfo>
text?: string
}) => Promise<void>
}
export interface ExecutorBackend {
readonly kind: ExecutorBackendKind
execute(instruction: string, signal?: AbortSignal): Promise<DelegationResult>

View File

@@ -1,243 +0,0 @@
/**
* Executor - Wraps AiSdkAgent for page-level browser actions (direct CDP)
*
* The executor:
* - Receives goal-level instructions from orchestrator
* - Executes browser actions until the goal is accomplished
* - Returns observation to orchestrator (not full history)
*/
import { randomUUID } from 'node:crypto'
import { AiSdkAgent } from '@browseros/server/agent/tool-loop'
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
import type { Browser } from '@browseros/server/browser'
import { registry } from '@browseros/server/tools/registry'
import type { BrowserContext } from '@browseros/shared/schemas/browser-context'
import { CladoActionExecutor } from './clado-action-executor'
import type { ExecutorResult } from './types'
/**
 * System prompt used as `userSystemPrompt` for the executor agent.
 *
 * The text is sent to the model verbatim, so wording changes alter agent behavior.
 */
const EXECUTOR_SYSTEM_PROMPT = `You are a browser executor. You receive a single goal-level instruction and execute it using browser tools.
## Your Job
1. Execute browser actions to achieve the given goal
2. Stop as soon as the goal is accomplished — do NOT perform extra actions
3. Write a final observation describing the result
## Final Response Format
When done, your response MUST include:
- What you accomplished (or what went wrong)
- What the page currently shows: key headings, links, data, or content visible
- The current URL from the address bar
- If you got stuck, what is blocking progress
## Rules
- Only do what was asked. Do not navigate away, open extra tabs, or reorganize the browser.
- If the goal is to navigate somewhere, confirm you arrived by describing what you see.
- If the goal is to click something, confirm the result of the click.
- If you cannot find what was asked for, say so clearly — do not guess or improvise.
- Prefer browser_navigate over browser_open_tab for going to URLs.
- Do NOT call browser_group_tabs or other organizational tools.`
/** Description of a single tool call as passed to ExecutorCallbacks.onToolCallStart. */
export interface ToolCallInfo {
// Identifier of the call, copied from the agent's toolCall.toolCallId.
toolCallId: string
// Name of the tool being invoked.
toolName: string
// Raw tool input; untyped, so consumers must narrow before use.
input: unknown
}
/** Description of a single tool result as passed to ExecutorCallbacks.onStepFinish. */
export interface ToolResultInfo {
// Identifier of the call. NOTE(review): presumably matches ToolCallInfo.toolCallId — confirm.
toolCallId: string
// Name of the tool that produced the output.
toolName: string
// Raw tool output; untyped, so consumers must narrow before use.
output: unknown
}
/** Optional hooks the Executor invokes while running an instruction. */
export interface ExecutorCallbacks {
// Called synchronously when a tool call starts, with its id, name and input.
onToolCallStart?: (toolCall: ToolCallInfo) => void
// Awaited after each finished tool call, once the step counter has been incremented.
onToolCallFinish?: () => Promise<void>
// Awaited at the end of each agent step with that step's tool calls, results and text.
onStepFinish?: (step: {
toolCalls?: ReadonlyArray<ToolCallInfo>
toolResults?: ReadonlyArray<ToolResultInfo>
text?: string
}) => Promise<void>
}
/**
 * Runs goal-level instructions either through the Clado action executor or
 * through a per-call AiSdkAgent tool loop, depending on `isCladoAction`.
 *
 * On the tool-loop path a fresh agent is created and disposed per `execute()`
 * call; the step counter and last known URL persist on the instance. On the
 * Clado path a single CladoActionExecutor is created on first use and reused.
 */
export class Executor {
  private cladoExecutor: CladoActionExecutor | null = null
  private stepsUsed = 0
  private currentUrl = ''
  private readonly configTemplate: ResolvedAgentConfig
  private readonly isCladoAction: boolean
  private readonly browser: Browser | null
  private readonly serverUrl: string
  private readonly windowId?: number
  private readonly tabId?: number
  private readonly initialPageId?: number
  private readonly callbacks: ExecutorCallbacks

  /**
   * @param configTemplate - Base agent config; also the source of the Clado
   *   provider/model/apiKey/baseUrl.
   * @param browser - Browser used by the tool-loop path; may be null for Clado.
   * @param serverUrl - Passed to CladoActionExecutor.
   * @param options - Path selection, window/tab/page ids and callbacks.
   */
  constructor(
    configTemplate: ResolvedAgentConfig,
    browser: Browser | null,
    serverUrl: string,
    options?: {
      isCladoAction?: boolean
      windowId?: number
      tabId?: number
      initialPageId?: number
      callbacks?: ExecutorCallbacks
    },
  ) {
    this.configTemplate = configTemplate
    this.isCladoAction = options?.isCladoAction ?? false
    this.browser = browser
    this.serverUrl = serverUrl
    this.windowId = options?.windowId
    this.tabId = options?.tabId
    this.initialPageId = options?.initialPageId
    this.callbacks = options?.callbacks ?? {}
  }

  /**
   * Runs one instruction to completion.
   *
   * On the tool-loop path agent failures never reject: they are reported via
   * `status` ('timeout' when the signal is aborted, otherwise 'blocked'). When
   * the model produced no text, the failure reason becomes the observation so
   * the caller is not told the run "completed".
   *
   * @param instruction - Goal-level instruction.
   * @param signal - Optional abort signal forwarded to the underlying executor.
   * @returns The executor result for this instruction.
   * @throws Error when the tool-loop path is selected without a browser.
   */
  async execute(
    instruction: string,
    signal?: AbortSignal,
  ): Promise<ExecutorResult> {
    if (this.isCladoAction) {
      return this.executeWithClado(instruction, signal)
    }

    const browser = this.browser
    if (!browser) {
      throw new Error('Browser instance is required for standard executor path')
    }

    const stepsAtStart = this.stepsUsed
    const toolsUsed: string[] = []
    let status: 'done' | 'blocked' | 'timeout' = 'done'
    let resultText = ''
    let failureReason = ''

    const conversationId = randomUUID()
    const agentConfig: ResolvedAgentConfig = {
      ...this.configTemplate,
      conversationId,
      userSystemPrompt: EXECUTOR_SYSTEM_PROMPT,
      evalMode: true,
      workingDir: `/tmp/browseros-eval-executor-${conversationId}`,
    }

    // Build browser context so executor agent knows the correct page ID
    let browserContext: BrowserContext | undefined
    const pages = await browser.listPages()
    const activePage = pages[0]
    if (activePage) {
      browserContext = {
        activeTab: {
          id: activePage.tabId,
          pageId: activePage.pageId,
          url: activePage.url,
          title: activePage.title,
        },
      }
    }

    let agent: AiSdkAgent | null = null
    try {
      agent = await AiSdkAgent.create({
        resolvedConfig: agentConfig,
        browser,
        registry,
        browserContext,
      })
      await agent.toolLoopAgent.generate({
        prompt: instruction,
        abortSignal: signal,
        experimental_onToolCallStart: ({ toolCall }) => {
          // Track the most recent URL any tool was asked to act on.
          const input = toolCall.input as Record<string, unknown> | undefined
          if (input && typeof input.url === 'string' && input.url.length > 0) {
            this.currentUrl = input.url
          }
          this.callbacks.onToolCallStart?.({
            toolCallId: toolCall.toolCallId,
            toolName: toolCall.toolName,
            input: toolCall.input,
          })
        },
        experimental_onToolCallFinish: async () => {
          this.stepsUsed++
          await this.callbacks.onToolCallFinish?.()
        },
        onStepFinish: async ({ toolCalls, toolResults, text }) => {
          if (toolCalls) {
            for (const tc of toolCalls) {
              if (!toolsUsed.includes(tc.toolName)) {
                toolsUsed.push(tc.toolName)
              }
            }
          }
          // Keep only the latest non-empty text as the observation.
          if (text) {
            resultText = text
          }
          await this.callbacks.onStepFinish?.({ toolCalls, toolResults, text })
        },
      })
    } catch (error: unknown) {
      status = signal?.aborted ? 'timeout' : 'blocked'
      failureReason = Executor.describeFailure(error)
    } finally {
      // Disposal is best-effort; a dispose failure must not mask the result.
      if (agent) await agent.dispose().catch(() => {})
    }

    // generate() may resolve normally even though the signal fired.
    if (status === 'done' && signal?.aborted) {
      status = 'timeout'
    }

    const fallbackObservation = failureReason
      ? `Execution ${status === 'timeout' ? 'timed out' : 'failed'}: ${failureReason}`
      : 'Execution completed with no actions taken.'

    return {
      observation: resultText || fallbackObservation,
      status,
      url: this.currentUrl,
      actionsPerformed: this.stepsUsed - stepsAtStart,
      toolsUsed,
    }
  }

  /** Closes the Clado executor if one was created; the tool-loop path holds nothing. */
  async close(): Promise<void> {
    await this.cladoExecutor?.close()
  }

  /** Total steps so far: from the Clado executor on that path, else the local counter. */
  getTotalSteps(): number {
    if (this.isCladoAction) {
      return this.cladoExecutor?.getTotalSteps() ?? 0
    }
    return this.stepsUsed
  }

  /** Runs the instruction through the lazily created CladoActionExecutor. */
  private async executeWithClado(
    instruction: string,
    signal?: AbortSignal,
  ): Promise<ExecutorResult> {
    if (!this.cladoExecutor) {
      this.cladoExecutor = new CladoActionExecutor(
        {
          provider: this.configTemplate.provider,
          model: this.configTemplate.model,
          apiKey: this.configTemplate.apiKey ?? '',
          baseUrl: this.configTemplate.baseUrl,
        },
        this.serverUrl,
        this.windowId,
        this.tabId,
        this.initialPageId,
      )
      this.cladoExecutor.setCallbacks(this.callbacks)
    }
    const result = await this.cladoExecutor.execute(instruction, signal)
    // Mirror the Clado executor's counters so instance state stays in sync.
    this.stepsUsed = this.cladoExecutor.getTotalSteps()
    this.currentUrl = result.url || this.currentUrl
    return result
  }

  /** Turns a caught value into a short human-readable reason. */
  private static describeFailure(error: unknown): string {
    if (error instanceof Error) return error.message || error.name
    return String(error)
  }
}

View File

@@ -24,16 +24,16 @@ import {
resolveProviderConfig,
} from '../../utils/resolve-provider-config'
import { withEvalTimeout } from '../../utils/with-eval-timeout'
import { isCladoActionProvider } from '../orchestrated/backends/clado/types'
import { createExecutorBackend } from '../orchestrated/backends/create-executor-backend'
import type { ExecutorCallbacks } from '../orchestrated/executor-backend'
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
import type { ExecutorCallbacks } from './executor'
import { OrchestratorAgent } from './orchestrator-agent'
import type { ExecutorFactory, ExecutorResult } from './types'
interface ResolvedConfigs {
orchestratorConfig: ResolvedAgentConfig & { maxTurns?: number }
executorConfig: ResolvedAgentConfig
isCladoAction: boolean
}
function toResolvedAgentConfig(
@@ -68,7 +68,10 @@ async function resolveAgentConfig(
if (!executorModel) {
throw new Error('executor.model is required in config')
}
if (config.executor.provider === 'clado-action' && !config.executor.baseUrl) {
if (
isCladoActionProvider(config.executor.provider) &&
!config.executor.baseUrl
) {
throw new Error(
'executor.baseUrl is required in config for clado-action provider',
)
@@ -76,10 +79,8 @@ async function resolveAgentConfig(
const resolvedOrchestrator = await resolveProviderConfig(config.orchestrator)
const isCladoAction = config.executor.provider === 'clado-action'
let executorConfig: ResolvedAgentConfig
if (isCladoAction) {
if (isCladoActionProvider(config.executor.provider)) {
executorConfig = {
conversationId: crypto.randomUUID(),
provider: config.executor.provider as ResolvedAgentConfig['provider'],
@@ -108,7 +109,7 @@ async function resolveAgentConfig(
maxTurns: config.orchestrator.maxTurns,
}
return { orchestratorConfig, executorConfig, isCladoAction }
return { orchestratorConfig, executorConfig }
}
export class OrchestratorExecutorEvaluator implements AgentEvaluator {
@@ -128,7 +129,7 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
}
const agentConfig = config.agent as OrchestratorExecutorConfig
const { orchestratorConfig, executorConfig, isCladoAction } =
const { orchestratorConfig, executorConfig } =
await resolveAgentConfig(agentConfig)
// Connect to Chrome via CDP — same per-worker offset used by app-manager.
@@ -237,7 +238,6 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
capture.emitEvent(task.query_id, delegateInputEvent)
const executor = createExecutorBackend({
backendKind: isCladoAction ? 'clado' : 'tool-loop',
configTemplate: executorConfig,
browser,
serverUrl: config.browseros.server_url,
@@ -331,6 +331,5 @@ export class OrchestratorExecutorEvaluator implements AgentEvaluator {
}
}
export { Executor } from './executor'
export { OrchestratorAgent } from './orchestrator-agent'
export * from './types'

View File

@@ -1,8 +1,10 @@
import { describe, expect, it } from 'bun:test'
import { CladoExecutorBackend } from '../../src/agents/orchestrated/backends/clado/clado-executor-backend'
import {
backendKindForProvider,
createExecutorBackend,
} from '../../src/agents/orchestrated/backends/create-executor-backend'
import { ToolLoopExecutorBackend } from '../../src/agents/orchestrated/backends/tool-loop/tool-loop-executor-backend'
import type { ExecutorBackend } from '../../src/agents/orchestrated/executor-backend'
describe('executor backend boundary', () => {
@@ -11,6 +13,32 @@ describe('executor backend boundary', () => {
expect(backendKindForProvider('openai-compatible')).toBe('tool-loop')
})
it('creates concrete backend classes for each executor path', () => {
expect(
createExecutorBackend({
backendKind: 'tool-loop',
configTemplate: {
provider: 'openai-compatible',
model: 'tool-loop-model',
},
browser: null,
serverUrl: 'http://127.0.0.1:9110',
}),
).toBeInstanceOf(ToolLoopExecutorBackend)
expect(
createExecutorBackend({
backendKind: 'clado',
configTemplate: {
provider: 'clado-action',
model: 'clado-model',
baseUrl: 'https://clado.example.test',
},
serverUrl: 'http://127.0.0.1:9110',
}),
).toBeInstanceOf(CladoExecutorBackend)
})
it('forwards execution and step state through the backend interface', async () => {
const signal = new AbortController().signal
const fakeBackend: ExecutorBackend = {
@@ -33,7 +61,6 @@ describe('executor backend boundary', () => {
}
const backend = createExecutorBackend({
backendKind: 'tool-loop',
executor: fakeBackend,
})
const result = await backend.execute('Click checkout', signal)