Compare commits

...

13 Commits

Author SHA1 Message Date
Neel Gupta
0f7e2098c4 fix(agent): improve gui eval reliability 2026-05-07 20:44:46 +01:00
Neel Gupta
283f76eba7 fix(eval): continue after empty tool-result stop 2026-05-07 12:31:49 +01:00
Neel Gupta
758445fbbe fix: avoid wheel dispatch for page scroll 2026-05-06 17:38:39 +01:00
Neel Gupta
9031062676 fix(eval): route opus via openrouter bedrock 2026-05-06 17:28:37 +01:00
Neel Gupta
0eddab8499 fix: tune opus eval provider configs 2026-05-06 13:59:28 +01:00
Neel Gupta
a62f9771a2 feat: return GUI click hit element feedback 2026-05-06 13:59:28 +01:00
Neel Gupta
0816e217e8 feat(server): log agent finish reasons 2026-05-06 13:59:28 +01:00
Neel Gupta
fc2f669445 feat(eval): retry provider errors before failing 2026-05-06 13:59:28 +01:00
Neel Gupta
daeb33fc6c feat(eval): explain AGISDK state-diff failures 2026-05-06 13:59:27 +01:00
Neel Gupta
b444afa117 feat: enabled reasoning and added configs 2026-05-06 13:59:27 +01:00
Neel Gupta
3c66cbabae fix: evals & timeout
fix: extend molmo point timeout

feat: modal endpoint
2026-05-06 13:59:27 +01:00
Neel Gupta
940ef6dbd5 feat: added screenshot for agent perception 2026-05-06 13:59:26 +01:00
Neel Gupta
3bc399f94c feat(agent): Initial click model integration 2026-05-06 13:59:26 +01:00
50 changed files with 2271 additions and 277 deletions

View File

@@ -36,6 +36,7 @@ const VERB_OVERRIDES: Record<string, string> = {
hover: 'Hovered',
hover_at: 'Hovered at coordinates',
type_at: 'Typed at coordinates',
type_text: 'Typed text',
drag_at: 'Dragged',
focus: 'Focused element',
fill: 'Filled field',
@@ -186,8 +187,8 @@ const SUBJECT_EXTRACTORS: Record<string, SubjectExtractor> = {
find_files: (i) => quote(stringField(i, 'pattern', 'query')),
// Element interactions
click: (i) => stringField(i, 'element'),
hover: (i) => stringField(i, 'element'),
click: (i) => stringField(i, 'prompt'),
hover: (i) => stringField(i, 'prompt', 'element'),
focus: (i) => stringField(i, 'element'),
clear: (i) => stringField(i, 'element'),
check: (i) => stringField(i, 'element'),
@@ -199,6 +200,7 @@ const SUBJECT_EXTRACTORS: Record<string, SubjectExtractor> = {
return target ?? truncate(text, 40)
},
press_key: (i) => stringField(i, 'key'),
type_text: (i) => truncate(stringField(i, 'text'), 40),
// Coordinate-based input
click_at: (i) => coords(i.x, i.y),

View File

@@ -0,0 +1,26 @@
{
"agent": {
"type": "single",
"provider": "openai-compatible",
"model": "openai/gpt-5.5",
"apiKey": "OPENROUTER_API_KEY",
"baseUrl": "https://openrouter.ai/api/v1",
"supportsImages": true
},
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 20,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": false
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["agisdk_state_diff"],
"timeout_ms": 1800000
}

View File

@@ -8,7 +8,7 @@
"supportsImages": true
},
"dataset": "../../data/agisdk-real-smoke.jsonl",
"num_workers": 1,
"num_workers": 20,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",

View File

@@ -8,7 +8,7 @@
"supportsImages": true
},
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 4,
"num_workers": 20,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",

View File

@@ -1,15 +1,21 @@
{
"agent": {
"type": "single",
"provider": "bedrock",
"model": "global.anthropic.claude-opus-4-6-v1",
"region": "AWS_REGION",
"accessKeyId": "AWS_ACCESS_KEY_ID",
"secretAccessKey": "AWS_SECRET_ACCESS_KEY",
"supportsImages": true
"provider": "openrouter",
"model": "anthropic/claude-opus-4.6",
"apiKey": "OPENROUTER_API_KEY",
"supportsImages": true,
"reasoning": {
"enabled": true
},
"verbosity": "high",
"providerRouting": {
"only": ["amazon-bedrock"],
"allowFallbacks": false
}
},
"dataset": "../../data/agisdk-real.jsonl",
"num_workers": 2,
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",

View File

@@ -5,7 +5,7 @@
"type": "single"
},
"graders": ["agisdk_state_diff"],
"workers": 1,
"workers": 20,
"restartBrowserPerTask": true,
"timeoutMs": 1800000,
"browseros": {

View File

@@ -5,7 +5,7 @@
"type": "single"
},
"graders": ["agisdk_state_diff"],
"workers": 1,
"workers": 20,
"restartBrowserPerTask": true,
"timeoutMs": 1800000,
"browseros": {

View File

@@ -5,7 +5,7 @@
"type": "single"
},
"graders": ["agisdk_state_diff"],
"workers": 1,
"workers": 20,
"restartBrowserPerTask": true,
"timeoutMs": 1800000,
"browseros": {

View File

@@ -9,11 +9,52 @@ import { CdpBackend } from '@browseros/server/browser/backends/cdp'
import { registry } from '@browseros/server/tools/registry'
import { CaptchaWaiter } from '../capture/captcha-waiter'
import { DEFAULT_TIMEOUT_MS } from '../constants'
import type { TaskMetadata } from '../types'
import type { TaskMetadata, UIMessageStreamEvent } from '../types'
import {
isProviderExecutionError,
retryProviderErrors,
} from '../utils/provider-error-retry'
import { resolveProviderConfig } from '../utils/resolve-provider-config'
import { withEvalTimeout } from '../utils/with-eval-timeout'
import type { AgentContext, AgentEvaluator, AgentResult } from './types'
const EMPTY_TOOL_RESULT_STOP_CONTINUATION_LIMIT = 2
/** Minimal slice of a tool-loop generate result needed to detect an empty stop. */
interface ToolLoopResultShape {
  text: string
  finishReason: string
  toolCalls: readonly unknown[]
  steps: ReadonlyArray<{
    toolResults: readonly unknown[]
  }>
}

/**
 * Reports whether the run ended with an empty "stop" directly after a step
 * that produced tool results, i.e. the model went silent instead of either
 * calling another tool or writing a final answer.
 *
 * @param result - Final result of a tool-loop run.
 * @returns `true` only when the finish reason is `stop`, the final text is
 *   blank, no tool calls are pending, and the second-to-last step has at
 *   least one tool result.
 */
export function shouldContinueAfterEmptyToolResultStop(
  result: ToolLoopResultShape,
): boolean {
  if (result.finishReason !== 'stop') return false
  if (result.text.trim().length !== 0) return false
  if (result.toolCalls.length !== 0) return false

  // The last step is the empty stop itself; the one before it is what matters.
  const stepBeforeStop = result.steps[result.steps.length - 2]
  const precedingToolResults = stepBeforeStop
    ? stepBeforeStop.toolResults.length
    : 0
  return precedingToolResults > 0
}
/**
 * Builds the follow-up prompt sent when the model stopped with empty output
 * right after a tool result.
 *
 * @param taskQuery - The original task text, appended verbatim at the end.
 * @returns A newline-separated prompt: a continue instruction, a blank line,
 *   the explanation paragraph, a blank line, and the original task.
 */
export function buildEmptyToolResultStopContinuationPrompt(
  taskQuery: string,
): string {
  const header = 'Continue the eval task from the current browser state.'
  const guidance =
    'The previous model response stopped immediately after a tool result without issuing another tool call or a final answer. Do not stop after routine tool results. If the requested workflow is complete, respond with a brief completion message. Otherwise, inspect the page if needed and continue using tools.'
  return `${header}\n\n${guidance}\n\nOriginal task:\n${taskQuery}`
}
export class SingleAgentEvaluator implements AgentEvaluator {
constructor(private ctx: AgentContext) {}
@@ -89,87 +130,128 @@ export class SingleAgentEvaluator implements AgentEvaluator {
capture,
async (signal) => {
if (!agent) throw new Error('Agent was not initialized')
// Format prompt with browser context so the agent knows what page it's on
// (same formatting as chat-service.ts → formatUserMessage)
const prompt = formatUserMessage(task.query, browserContext)
const result = await agent.toolLoopAgent.generate({
prompt,
abortSignal: signal,
const activeAgent = agent
experimental_onToolCallStart: ({ toolCall }) => {
const input = toolCall.input as
| Record<string, unknown>
| undefined
if (input && typeof input.page === 'number') {
capture.setActivePageId(input.page)
}
},
let continuationCount = 0
let currentQuery = task.query
experimental_onToolCallFinish: async () => {
try {
if (captchaWaiter) {
await captchaWaiter.waitIfCaptchaPresent(
browser,
capture.getActivePageId(),
)
}
const screenshotNum = await capture.screenshot.capture(
capture.getActivePageId(),
)
capture.emitEvent(task.query_id, {
type: 'screenshot-captured',
screenshot: screenshotNum,
})
} catch {
// Screenshot failures are non-fatal
}
},
for (;;) {
// Format prompt with browser context so the agent knows what page it's on
// (same formatting as chat-service.ts → formatUserMessage)
const prompt = formatUserMessage(currentQuery, browserContext)
const result = await retryProviderErrors(
() =>
activeAgent.toolLoopAgent.generate({
prompt,
abortSignal: signal,
onStepFinish: async ({ toolCalls, toolResults, text }) => {
if (toolCalls) {
for (const tc of toolCalls) {
const inputEvent = {
type: 'tool-input-available',
toolCallId: tc.toolCallId,
toolName: tc.toolName,
input: tc.input,
} as any
await capture.messageLogger.logStreamEvent(inputEvent)
capture.emitEvent(task.query_id, inputEvent)
}
}
experimental_onToolCallStart: ({ toolCall }) => {
const input = toolCall.input as
| Record<string, unknown>
| undefined
if (input && typeof input.page === 'number') {
capture.setActivePageId(input.page)
}
},
if (toolResults) {
for (const tr of toolResults) {
const outputEvent = {
type: 'tool-output-available',
toolCallId: tr.toolCallId,
output: tr.output,
} as any
await capture.messageLogger.logStreamEvent(outputEvent)
capture.emitEvent(task.query_id, outputEvent)
}
}
experimental_onToolCallFinish: async () => {
try {
if (captchaWaiter) {
await captchaWaiter.waitIfCaptchaPresent(
browser,
capture.getActivePageId(),
)
}
const screenshotNum = await capture.screenshot.capture(
capture.getActivePageId(),
)
capture.emitEvent(task.query_id, {
type: 'screenshot-captured',
screenshot: screenshotNum,
})
} catch {
// Screenshot failures are non-fatal
}
},
if (text) {
const textId = randomUUID()
const startEvent = { type: 'text-start', id: textId } as any
const deltaEvent = {
type: 'text-delta',
id: textId,
delta: text,
} as any
const endEvent = { type: 'text-end', id: textId } as any
await capture.messageLogger.logStreamEvent(startEvent)
await capture.messageLogger.logStreamEvent(deltaEvent)
await capture.messageLogger.logStreamEvent(endEvent)
capture.emitEvent(task.query_id, deltaEvent)
}
},
})
onStepFinish: async ({ toolCalls, toolResults, text }) => {
if (toolCalls) {
for (const tc of toolCalls) {
const inputEvent: UIMessageStreamEvent = {
type: 'tool-input-available',
toolCallId: tc.toolCallId,
toolName: tc.toolName,
input: tc.input,
}
await capture.messageLogger.logStreamEvent(inputEvent)
capture.emitEvent(task.query_id, inputEvent)
}
}
finalText = result.text || null
if (toolResults) {
for (const tr of toolResults) {
const outputEvent: UIMessageStreamEvent = {
type: 'tool-output-available',
toolCallId: tr.toolCallId,
output: tr.output,
}
await capture.messageLogger.logStreamEvent(outputEvent)
capture.emitEvent(task.query_id, outputEvent)
}
}
if (text) {
const textId = randomUUID()
const startEvent: UIMessageStreamEvent = {
type: 'text-start',
id: textId,
}
const deltaEvent: UIMessageStreamEvent = {
type: 'text-delta',
id: textId,
delta: text,
}
const endEvent: UIMessageStreamEvent = {
type: 'text-end',
id: textId,
}
await capture.messageLogger.logStreamEvent(startEvent)
await capture.messageLogger.logStreamEvent(deltaEvent)
await capture.messageLogger.logStreamEvent(endEvent)
capture.emitEvent(task.query_id, deltaEvent)
}
},
}),
{
label: `single-agent ${task.query_id}`,
signal,
},
)
if (!shouldContinueAfterEmptyToolResultStop(result)) {
finalText = result.text || null
break
}
if (
continuationCount >= EMPTY_TOOL_RESULT_STOP_CONTINUATION_LIMIT
) {
throw new Error(
`Model stopped with empty output immediately after a tool result ${continuationCount + 1} times`,
)
}
continuationCount++
capture.addWarning(
'agent_execution',
`Model stopped with empty output immediately after a tool result; continuing task (${continuationCount}/${EMPTY_TOOL_RESULT_STOP_CONTINUATION_LIMIT})`,
)
currentQuery = buildEmptyToolResultStopContinuationPrompt(
task.query,
)
}
},
{ rethrowError: isProviderExecutionError },
)
const endTime = Date.now()

View File

@@ -18,6 +18,9 @@ export interface SuiteCliArgs {
apiKey?: string
baseUrl?: string
publishTarget?: PublishTarget
query?: string
startUrl?: string
outputDir?: string
}
export interface RunCliArgs
@@ -83,6 +86,9 @@ function parseSuiteLikeArgs(
'api-key': { type: 'string' },
'base-url': { type: 'string' },
publish: { type: 'string' },
query: { type: 'string' },
'start-url': { type: 'string' },
'output-dir': { type: 'string' },
},
})
@@ -104,6 +110,12 @@ function parseSuiteLikeArgs(
if (apiKey) parsed.apiKey = apiKey
const baseUrl = stringValue(values['base-url'])
if (baseUrl) parsed.baseUrl = baseUrl
const query = stringValue(values.query)
if (query) parsed.query = query
const startUrl = stringValue(values['start-url'])
if (startUrl) parsed.startUrl = startUrl
const outputDir = stringValue(values['output-dir'])
if (outputDir) parsed.outputDir = outputDir
if (command === 'suite') {
const target = publishTarget(stringValue(values.publish))

View File

@@ -15,6 +15,9 @@ export async function runRunCommand(
model: args.model,
apiKey: args.apiKey,
baseUrl: args.baseUrl,
query: args.query,
startUrl: args.startUrl,
outputDir: args.outputDir,
},
deps,
)

View File

@@ -21,6 +21,9 @@ export interface SuiteCommandOptions {
apiKey?: string
baseUrl?: string
publishTarget?: PublishTarget
query?: string
startUrl?: string
outputDir?: string
env?: Env
}
@@ -179,11 +182,19 @@ export async function runSuiteCommand(
const resolved = await resolveSuiteCommand(options)
const runOptions: RunEvalOptions =
resolved.kind === 'config'
? { configPath: resolved.configPath }
? {
configPath: resolved.configPath,
query: options.query,
startUrl: options.startUrl,
outputDir: options.outputDir,
}
: {
configPath: resolved.suitePath,
dataPath: resolved.datasetPath,
config: resolved.evalConfig,
query: options.query,
startUrl: options.startUrl,
outputDir: options.outputDir,
}
const result = await runEval(runOptions)

View File

@@ -15,6 +15,7 @@ Usage:
bun run eval suite --suite <suite.json> --variant <id> [--publish r2]
bun run eval run --config <config.json>
bun run eval run --suite <suite.json> --variant <id>
bun run eval run --config <config.json> --query "..." --start-url <url>
bun run eval grade --run <results/run-dir>
bun run eval publish --run <results/run-dir> --target r2
bun run eval -c <config.json>

View File

@@ -31,6 +31,16 @@ interface AgisdkEvaluatorOutput {
per_criterion: unknown[]
}
interface FailedAgisdkCriterion {
index: number
detail: string
expected?: unknown
actual?: unknown
}
const MAX_REASONING_CRITERIA = 8
const MAX_REASONING_DETAIL_CHARS = 700
export class AgisdkStateDiffGrader implements Grader {
name = 'agisdk_state_diff'
@@ -99,15 +109,23 @@ export class AgisdkStateDiffGrader implements Grader {
'stderr.txt',
evaluation.stderr,
)
const failedCriteria = this.extractFailedCriteria(result.per_criterion)
if (failedCriteria.length > 0) {
await writeGraderJsonArtifact(
input,
this.name,
'failed-criteria.json',
failedCriteria,
)
}
return {
score: result.reward,
pass: result.pass,
reasoning:
result.message ||
(result.pass ? 'All criteria passed' : 'Some criteria failed'),
reasoning: this.buildReasoning(result, failedCriteria),
details: {
reward: result.reward,
per_criterion: result.per_criterion,
failed_criteria: failedCriteria,
origin,
agisdk_task_id: taskId,
},
@@ -148,6 +166,69 @@ export class AgisdkStateDiffGrader implements Grader {
return null
}
/**
 * Produces the human-readable grader reasoning.
 *
 * Starts from the evaluator's message (or a generic pass/fail sentence when
 * the message is empty). For failing results with extracted criteria it
 * appends a numbered list of at most MAX_REASONING_CRITERIA entries, plus a
 * trailing count of the entries that were left out.
 */
private buildReasoning(
  result: AgisdkEvaluatorOutput,
  failedCriteria: FailedAgisdkCriterion[],
): string {
  let summary = result.message
  if (!summary) {
    summary = result.pass ? 'All criteria passed' : 'Some criteria failed'
  }
  if (result.pass || failedCriteria.length === 0) return summary

  const report: string[] = []
  for (const failure of failedCriteria.slice(0, MAX_REASONING_CRITERIA)) {
    // `index` is zero-based; the list shown to humans is one-based.
    report.push(`${failure.index + 1}. ${this.formatFailedCriterion(failure)}`)
  }

  const omitted = failedCriteria.length - report.length
  if (omitted > 0) {
    report.push(`... ${omitted} more failed criteria`)
  }

  return `${summary}\nFailed criteria:\n${report.join('\n')}`
}
/**
 * Picks the failing entries out of the evaluator's per-criterion list.
 *
 * Non-object entries are ignored, and so is any entry whose `passed` field is
 * exactly `true`. The returned `index` is the entry's position in the input.
 * `detail` is the entry's string `detail` when present; otherwise it is the
 * stringified `raw_detail`, or the stringified entry itself.
 * `expected`/`actual` are copied only when the corresponding
 * `expected_value`/`actual_value` key exists on the entry.
 */
private extractFailedCriteria(
  perCriterion: unknown[],
): FailedAgisdkCriterion[] {
  const failures: FailedAgisdkCriterion[] = []

  perCriterion.forEach((entry, index) => {
    if (!entry || typeof entry !== 'object') return
    const fields = entry as Record<string, unknown>
    if (fields.passed === true) return

    const failure: FailedAgisdkCriterion = {
      index,
      detail:
        typeof fields.detail === 'string'
          ? fields.detail
          : this.stringifyCriterionValue(fields.raw_detail ?? fields),
    }
    if ('expected_value' in fields) failure.expected = fields.expected_value
    if ('actual_value' in fields) failure.actual = fields.actual_value
    failures.push(failure)
  })

  return failures
}
/**
 * Renders one failed criterion as a single line:
 * `detail | expected=… | actual=…`, where the expected/actual segments appear
 * only if those keys exist on the criterion. Lines longer than
 * MAX_REASONING_DETAIL_CHARS are cut and suffixed with the number of dropped
 * characters.
 */
private formatFailedCriterion(criterion: FailedAgisdkCriterion): string {
  const segments = [
    criterion.detail,
    ...('expected' in criterion
      ? [`expected=${this.stringifyCriterionValue(criterion.expected)}`]
      : []),
    ...('actual' in criterion
      ? [`actual=${this.stringifyCriterionValue(criterion.actual)}`]
      : []),
  ]
  const line = segments.join(' | ')

  const overflow = line.length - MAX_REASONING_DETAIL_CHARS
  return overflow > 0
    ? `${line.slice(0, MAX_REASONING_DETAIL_CHARS)}... (+${overflow} chars)`
    : line
}
/**
 * Converts a criterion value to display text. Strings are returned unchanged;
 * everything else is JSON-encoded.
 *
 * `JSON.stringify` yields `undefined` (not a string) for `undefined`,
 * functions and symbols, so those fall back to `String(value)` to honor the
 * declared `string` return type.
 */
private stringifyCriterionValue(value: unknown): string {
  if (typeof value === 'string') return value
  return JSON.stringify(value) ?? String(value)
}
private async fetchFinishState(
origin: string,
mcpEndpoint: string,

View File

@@ -26,6 +26,18 @@ import sys
_STRICT = os.environ.get("AGISDK_STRICT_STRINGS", "").lower() in ("1", "true", "yes")
def _json_safe(value: object) -> object:
try:
json.dumps(value)
return value
except TypeError:
if isinstance(value, dict):
return {str(k): _json_safe(v) for k, v in value.items()}
if isinstance(value, (list, tuple)):
return [_json_safe(v) for v in value]
return str(value)
def _soft_string_match(detail: object) -> bool:
"""Return True iff `detail` is `{actual_value, expected_value}` with both
strings and a non-empty `expected_value` that is contained in `actual_value`
@@ -87,7 +99,16 @@ def main():
for r in results:
passed = bool(r[0])
detail = r[1] if len(r) > 1 else ""
entry: dict = {"passed": passed, "detail": str(detail)}
entry: dict = {
"passed": passed,
"detail": str(detail),
"raw_detail": _json_safe(detail),
}
if isinstance(detail, dict):
if "actual_value" in detail:
entry["actual_value"] = _json_safe(detail.get("actual_value"))
if "expected_value" in detail:
entry["expected_value"] = _json_safe(detail.get("expected_value"))
if not _STRICT and not passed and _soft_string_match(detail):
entry["passed"] = True
entry["softened"] = True

View File

@@ -62,6 +62,12 @@ const CAPTCHA_EXT_DIR = join(
'../../extensions/nopecha',
)
/**
 * Chooses the package script used to launch the eval server.
 *
 * @param env - Environment to read from; defaults to `process.env`.
 * @returns The value of `BROWSEROS_EVAL_SERVER_START_SCRIPT` when it is set
 *   to a non-empty string, otherwise `'start:ci'`.
 */
export function resolveServerStartScript(
  env: Record<string, string | undefined> = process.env,
): string {
  const override = env.BROWSEROS_EVAL_SERVER_START_SCRIPT
  return override ? override : 'start:ci'
}
export class BrowserOSAppManager {
private ports: EvalPorts
private chromeProc: Subprocess | null = null
@@ -215,9 +221,10 @@ export class BrowserOSAppManager {
}
this.serverLogFd = logFd
// `start:ci` skips `--watch` (no file-watcher overhead in CI). Falls back
// to the regular `start` script outside CI for the dev-watch experience.
const startScript = process.env.CI ? 'start:ci' : 'start'
// Eval servers must not use `start` because it runs Bun in watch mode; a
// source edit during a long eval can restart the worker server before the
// grader fetches /finish. Keep an escape hatch for local debugging.
const startScript = resolveServerStartScript()
this.serverProc = spawn({
cmd: ['bun', 'run', '--filter', '@browseros/server', startScript],
cwd: MONOREPO_ROOT,

View File

@@ -256,10 +256,27 @@ function printTaskProgress(
for (const [name, gr] of Object.entries(result.graderResults)) {
const icon = gr.pass ? 'PASS' : 'FAIL'
console.log(` ${name}: ${icon}`)
if (!gr.pass && gr.reasoning) {
printIndentedReasoning(gr.reasoning, ' ')
}
}
}
}
/**
 * Prints grader reasoning to the console, one line at a time, each prefixed
 * with `indent`.
 *
 * @param reasoning - Text to print; surrounding whitespace is trimmed first.
 * @param indent - Prefix placed before every printed line.
 * @param maxLines - Maximum number of reasoning lines to print (default 12);
 *   when the text is longer, a final line reports how many were left out.
 */
function printIndentedReasoning(
  reasoning: string,
  indent: string,
  maxLines = 12,
): void {
  const allLines = reasoning.trim().split('\n')
  allLines.slice(0, maxLines).forEach((text) => {
    console.log(`${indent}${text}`)
  })

  const hidden = allLines.length - maxLines
  if (hidden > 0) {
    console.log(`${indent}... ${hidden} more lines`)
  }
}
// ============================================================================
// Summary
// ============================================================================

View File

@@ -0,0 +1,211 @@
import { sleep } from './sleep'
const DEFAULT_PROVIDER_ERROR_RETRIES = 5
const DEFAULT_PROVIDER_ERROR_RETRY_WINDOW_MS = 10_000
const DEFAULT_RATE_LIMIT_RETRIES = 8
const DEFAULT_RATE_LIMIT_RETRY_WINDOW_MS = 120_000
const PROVIDER_ERROR_LOG_MAX_STRING_CHARS = 10_000
const PROVIDER_ERROR_LOG_MAX_DEPTH = 5
const REDACTED_KEYS = /authorization|api[-_]?key|token|secret|cookie/i
/** Payload handed to `onRetry` just before the retry delay starts. */
export interface ProviderErrorRetryEvent {
  /** One-based number of the retry about to be made. */
  retryNumber: number
  /** Retry limit that applies to the error that triggered this retry. */
  maxRetries: number
  /** Milliseconds that will be waited before the next attempt. */
  delayMs: number
  /** The error thrown by the failed attempt. */
  error: unknown
}
/** Settings for `retryProviderErrors`. */
export interface ProviderErrorRetryOptions {
  /** Prefix used in the retry and final-failure log lines. */
  label: string
  /** When aborted, errors are rethrown immediately; also passed to the retry sleep. */
  signal?: AbortSignal
  /** Retry limit for non-rate-limit provider errors (default DEFAULT_PROVIDER_ERROR_RETRIES). */
  retries?: number
  /** Total wait budget for provider errors; each delay is `floor(windowMs / retries)`. */
  windowMs?: number
  /** Retry limit for rate-limit errors (default DEFAULT_RATE_LIMIT_RETRIES). */
  rateLimitRetries?: number
  /** Total wait budget for rate-limit errors; each delay is `floor(rateLimitWindowMs / rateLimitRetries)`. */
  rateLimitWindowMs?: number
  /** Called before each retry delay with details of the upcoming retry. */
  onRetry?: (event: ProviderErrorRetryEvent) => void
}
/**
 * Reads `value[key]` when `value` is a non-null object and the property holds
 * a string.
 *
 * @returns The string, or `undefined` for non-objects and non-string values.
 */
function readStringProperty(value: unknown, key: string): string | undefined {
  if (typeof value === 'object' && value) {
    const candidate = (value as Record<string, unknown>)[key]
    if (typeof candidate === 'string') return candidate
  }
  return undefined
}
/**
 * Reads `value[key]` when `value` is a non-null object and the property holds
 * an array.
 *
 * @returns The array itself, or an empty array in every other case.
 */
function readArrayProperty(value: unknown, key: string): unknown[] {
  if (typeof value === 'object' && value) {
    const candidate = (value as Record<string, unknown>)[key]
    if (Array.isArray(candidate)) return candidate
  }
  return []
}
/**
 * Collects the strings used to classify an error: its `name`, its message
 * (Error instances only), the literal key names `isRetryable` / `statusCode` /
 * `responseBody` when those properties exist, and the stringified
 * `statusCode` / `responseBody` values. Recurses into `cause` and into every
 * element of an `errors` array.
 *
 * @param error - Any thrown value.
 * @param seen - Values already visited; guards against reference cycles.
 * @returns Markers in discovery order; empty for falsy or already-seen input.
 */
function errorMarkers(error: unknown, seen = new Set<unknown>()): string[] {
  if (!error || seen.has(error)) return []
  seen.add(error)

  const collected: string[] = []
  const name = readStringProperty(error, 'name')
  if (name) collected.push(name)
  if (error instanceof Error && error.message) collected.push(error.message)

  if (typeof error === 'object') {
    const fields = error as Record<string, unknown>
    if ('isRetryable' in fields) collected.push('isRetryable')
    if ('statusCode' in fields) {
      collected.push('statusCode', String(fields.statusCode))
    }
    if ('responseBody' in fields) {
      collected.push('responseBody', String(fields.responseBody))
    }
    if ('cause' in fields) {
      collected.push(...errorMarkers(fields.cause, seen))
    }
  }

  for (const nested of readArrayProperty(error, 'errors')) {
    collected.push(...errorMarkers(nested, seen))
  }
  return collected
}
/**
 * Decides whether an error looks like it came from the model provider.
 *
 * True for any rate-limit error, or when the error's markers contain one of
 * the provider/SDK marker substrings checked below.
 */
export function isProviderExecutionError(error: unknown): boolean {
  const haystack = errorMarkers(error).join('\n')
  if (isProviderRateLimitError(error)) return true

  const providerMarkers = [
    'Provider returned error',
    'APICallError',
    'AI_RetryError',
    'RetryError',
    'isRetryable',
    'statusCode',
    'responseBody',
  ]
  return providerMarkers.some((marker) => haystack.includes(marker))
}
/**
 * Decides whether an error is a provider rate limit.
 *
 * Matches case-insensitively on rate-limit wording, or on a 429 marker
 * (either directly after the `statusCode` marker or as a marker of its own
 * between two others).
 */
export function isProviderRateLimitError(error: unknown): boolean {
  const haystack = errorMarkers(error).join('\n').toLowerCase()
  const rateLimitMarkers = [
    'rate limit',
    'rate-limit',
    'too many requests',
    'statuscode\n429',
    '\n429\n',
  ]
  return rateLimitMarkers.some((marker) => haystack.includes(marker))
}
/** Returns an Error's message, or the string form of any other thrown value. */
function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message
  }
  return String(error)
}
/**
 * Caps a string at PROVIDER_ERROR_LOG_MAX_STRING_CHARS characters for logging.
 * Longer input is cut and suffixed with the number of characters dropped.
 */
function truncateString(value: string): string {
  const overflow = value.length - PROVIDER_ERROR_LOG_MAX_STRING_CHARS
  return overflow > 0
    ? `${value.slice(0, PROVIDER_ERROR_LOG_MAX_STRING_CHARS)}... (+${overflow} chars)`
    : value
}
/**
 * Converts an arbitrary thrown value into a plain, JSON-serializable structure
 * for the final provider-error log.
 *
 * - Strings are truncated via `truncateString`; bigints become decimal strings
 *   (`JSON.stringify` throws on bigint, which would mask the real error).
 * - Errors are expanded to `name`, `message`, `stack`, every other own
 *   property, and `cause`.
 * - Any property whose key matches REDACTED_KEYS is replaced by `'[Redacted]'`.
 * - Nesting deeper than PROVIDER_ERROR_LOG_MAX_DEPTH yields `'[MaxDepth]'`.
 * - A value that contains itself yields `'[Circular]'`.
 *
 * @param value - Value to serialize.
 * @param depth - Current nesting depth (internal).
 * @param seen - Ancestors of `value` on the current path (internal).
 */
function serializeForLog(
  value: unknown,
  depth = 0,
  seen = new Set<unknown>(),
): unknown {
  if (typeof value === 'string') return truncateString(value)
  if (typeof value === 'bigint') return value.toString()
  if (value === null || typeof value !== 'object') return value
  if (seen.has(value)) return '[Circular]'
  if (depth >= PROVIDER_ERROR_LOG_MAX_DEPTH) return '[MaxDepth]'

  // `seen` tracks only the current ancestor chain: the value is removed again
  // in `finally`, so an object referenced twice without a cycle is serialized
  // both times instead of being mislabeled '[Circular]'.
  seen.add(value)
  try {
    if (value instanceof Error) {
      const serialized: Record<string, unknown> = {
        name: value.name,
        // Routed through serializeForLog so long messages/stacks are truncated
        // like every other string.
        message: serializeForLog(value.message, depth + 1, seen),
        stack: serializeForLog(value.stack, depth + 1, seen),
      }
      for (const key of Object.getOwnPropertyNames(value)) {
        if (key in serialized) continue
        serialized[key] = REDACTED_KEYS.test(key)
          ? '[Redacted]'
          : serializeForLog(
              (value as unknown as Record<string, unknown>)[key],
              depth + 1,
              seen,
            )
      }
      // An own `cause` was already captured by the loop above; only read it
      // here when it is inherited. Serializing it a second time is what used
      // to overwrite the real cause with '[Circular]'.
      if (!('cause' in serialized) && 'cause' in value) {
        serialized.cause = serializeForLog(value.cause, depth + 1, seen)
      }
      return serialized
    }

    if (Array.isArray(value)) {
      return value.map((item) => serializeForLog(item, depth + 1, seen))
    }

    const serialized: Record<string, unknown> = {}
    for (const [key, item] of Object.entries(value)) {
      serialized[key] = REDACTED_KEYS.test(key)
        ? '[Redacted]'
        : serializeForLog(item, depth + 1, seen)
    }
    return serialized
  } finally {
    seen.delete(value)
  }
}
/**
 * Writes the give-up message to `console.error`, followed by the serialized
 * error as pretty-printed JSON.
 *
 * @param label - Log prefix identifying the operation.
 * @param error - The last error thrown by the operation.
 * @param attempts - Total number of attempts that were made.
 */
function logFinalProviderError(
  label: string,
  error: unknown,
  attempts: number,
): void {
  const details = JSON.stringify(serializeForLog(error), null, 2)
  console.error(
    `[provider-retry] ${label}: provider error persisted after ${attempts} attempts. Final error:\n${details}`,
  )
}
/**
 * Runs `operation`, retrying while it fails with a provider error.
 *
 * Errors that are not provider errors, or that occur after `options.signal`
 * has been aborted, are rethrown at once. Rate-limit errors use the
 * rate-limit retry count and window; all other provider errors use the
 * general ones. The wait before each retry is `floor(window / maxRetries)`.
 * A single attempt counter is shared by both error classes.
 *
 * @param operation - Async work to attempt; called again from scratch on retry.
 * @param options - Label, abort signal, retry limits and optional callback.
 * @returns The first successful result of `operation`.
 * @throws The last error once the applicable retry limit has been reached.
 */
export async function retryProviderErrors<T>(
  operation: () => Promise<T>,
  options: ProviderErrorRetryOptions,
): Promise<T> {
  const generalPolicy = {
    maxRetries: options.retries ?? DEFAULT_PROVIDER_ERROR_RETRIES,
    windowMs: options.windowMs ?? DEFAULT_PROVIDER_ERROR_RETRY_WINDOW_MS,
    reason: 'provider error',
  }
  const rateLimitPolicy = {
    maxRetries: options.rateLimitRetries ?? DEFAULT_RATE_LIMIT_RETRIES,
    windowMs: options.rateLimitWindowMs ?? DEFAULT_RATE_LIMIT_RETRY_WINDOW_MS,
    reason: 'rate limit',
  }

  let attempt = 0
  while (true) {
    try {
      return await operation()
    } catch (error) {
      const retryable = isProviderExecutionError(error)
      if (options.signal?.aborted || !retryable) {
        throw error
      }

      const { maxRetries, windowMs, reason } = isProviderRateLimitError(error)
        ? rateLimitPolicy
        : generalPolicy
      const delayMs = maxRetries > 0 ? Math.floor(windowMs / maxRetries) : 0

      if (attempt >= maxRetries) {
        logFinalProviderError(options.label, error, attempt + 1)
        throw error
      }

      const event = {
        retryNumber: attempt + 1,
        maxRetries,
        delayMs,
        error,
      }
      options.onRetry?.(event)
      console.warn(
        `[provider-retry] ${options.label}: retry ${event.retryNumber}/${maxRetries} in ${delayMs}ms after ${reason}: ${errorMessage(error)}`,
      )
      await sleep(delayMs, options.signal)
    }
    attempt++
  }
}

View File

@@ -13,10 +13,15 @@ export interface TimeoutResult<T> {
terminationReason: TerminationReason
}
export interface EvalTimeoutOptions {
rethrowError?: (error: Error) => boolean
}
export async function withEvalTimeout<T>(
timeoutMs: number,
capture: CaptureContext,
fn: (signal: AbortSignal) => Promise<T>,
options: EvalTimeoutOptions = {},
): Promise<TimeoutResult<T>> {
const abortController = new AbortController()
const timeoutHandle = setTimeout(() => abortController.abort(), timeoutMs)
@@ -39,6 +44,9 @@ export async function withEvalTimeout<T>(
capture.addError('agent_execution', error.message, {
stack: error.stack,
})
if (options.rethrowError?.(error)) {
throw error
}
}
return { terminationReason }

View File

@@ -0,0 +1,50 @@
import { describe, expect, it } from 'bun:test'
import {
buildEmptyToolResultStopContinuationPrompt,
shouldContinueAfterEmptyToolResultStop,
} from '../../src/agents/single-agent'
describe('single-agent empty tool-result stop handling', () => {
it('continues when the model emits an empty stop after a tool result', () => {
expect(
shouldContinueAfterEmptyToolResultStop({
text: '',
finishReason: 'stop',
toolCalls: [],
steps: [{ toolResults: [{ ok: true }] }, { toolResults: [] }],
}),
).toBe(true)
})
it('does not continue for normal final text', () => {
expect(
shouldContinueAfterEmptyToolResultStop({
text: 'Done',
finishReason: 'stop',
toolCalls: [],
steps: [{ toolResults: [{ ok: true }] }, { toolResults: [] }],
}),
).toBe(false)
})
it('does not continue when there was no previous tool result', () => {
expect(
shouldContinueAfterEmptyToolResultStop({
text: '',
finishReason: 'stop',
toolCalls: [],
steps: [{ toolResults: [] }],
}),
).toBe(false)
})
it('builds a continuation prompt with the original task', () => {
const prompt = buildEmptyToolResultStopContinuationPrompt(
'Delete the target email.',
)
expect(prompt).toContain('Continue the eval task')
expect(prompt).toContain('Do not stop after routine tool results')
expect(prompt).toContain('Delete the target email.')
})
})

View File

@@ -52,6 +52,28 @@ describe('parseEvalCliArgs', () => {
})
})
it('parses one-off query overrides for a single eval run', () => {
expect(
parseEvalCliArgs([
'run',
'--config',
'configs/legacy/gui-click-amazon-smoke.json',
'--query',
'open the cart',
'--start-url',
'https://www.amazon.com/',
'--output-dir',
'/tmp/gui-click-eval',
]),
).toEqual({
command: 'run',
configPath: 'configs/legacy/gui-click-amazon-smoke.json',
query: 'open the cart',
startUrl: 'https://www.amazon.com/',
outputDir: '/tmp/gui-click-eval',
})
})
it('rejects missing required command options with targeted errors', () => {
expect(() => parseEvalCliArgs(['run'])).toThrow(
'run requires --config or --suite',

View File

@@ -116,6 +116,9 @@ describe('suite command', () => {
suitePath,
model: 'moonshotai/kimi-k2.5',
provider: 'openai-compatible',
query: 'Open Amazon cart',
startUrl: 'https://www.amazon.com/',
outputDir: '/tmp/gui-click-eval',
env: {},
},
{
@@ -132,5 +135,8 @@ describe('suite command', () => {
expect(basename(calls[1].configPath)).toBe('agisdk-daily-10.json')
expect(calls[1].config).toBeDefined()
expect(calls[1].dataPath?.endsWith('tasks.jsonl')).toBe(true)
expect(calls[1].query).toBe('Open Amazon cart')
expect(calls[1].startUrl).toBe('https://www.amazon.com/')
expect(calls[1].outputDir).toBe('/tmp/gui-click-eval')
})
})

View File

@@ -31,7 +31,14 @@ describe('AgisdkStateDiffGrader artifacts', () => {
reward: 0,
pass: false,
message: 'Missing entree',
per_criterion: [{ passed: false, detail: 'entree missing' }],
per_criterion: [
{
passed: false,
detail: 'cart item mismatch',
expected_value: 'Entree',
actual_value: 'Soup',
},
],
},
stderr: 'criterion log',
})
@@ -53,6 +60,17 @@ describe('AgisdkStateDiffGrader artifacts', () => {
const result = await grader.grade(input)
expect(result.pass).toBe(false)
expect(result.reasoning).toContain('Failed criteria:')
expect(result.reasoning).toContain('expected=Entree')
expect(result.reasoning).toContain('actual=Soup')
expect(result.details?.failed_criteria).toEqual([
{
index: 0,
detail: 'cart item mismatch',
expected: 'Entree',
actual: 'Soup',
},
])
expect(
JSON.parse(
await readFile(
@@ -69,6 +87,21 @@ describe('AgisdkStateDiffGrader artifacts', () => {
),
),
).toMatchObject({ message: 'Missing entree' })
expect(
JSON.parse(
await readFile(
join(dir, 'grader-artifacts/agisdk_state_diff/failed-criteria.json'),
'utf-8',
),
),
).toEqual([
{
index: 0,
detail: 'cart item mismatch',
expected: 'Entree',
actual: 'Soup',
},
])
expect(
await readFile(
join(dir, 'grader-artifacts/agisdk_state_diff/stderr.txt'),

View File

@@ -42,7 +42,7 @@ describe('adaptEvalConfigFile', () => {
const kimi = await adaptEvalConfigFile(
'apps/eval/configs/legacy/browseros-agent-kimi-k2-5-agisdk-real.json',
)
const opus = await adaptEvalConfigFile(
const openrouterBedrockOpus = await adaptEvalConfigFile(
'apps/eval/configs/legacy/browseros-agent-opus-4-6-agisdk-real.json',
)
@@ -54,16 +54,22 @@ describe('adaptEvalConfigFile', () => {
})
expect(kimi.evalConfig.num_workers).toBe(3)
expect(opus.suite.id).toBe('browseros-agent-opus-4-6-agisdk-real')
expect(opus.evalConfig.agent).toMatchObject({
expect(openrouterBedrockOpus.suite.id).toBe(
'browseros-agent-opus-4-6-agisdk-real',
)
expect(openrouterBedrockOpus.evalConfig.agent).toMatchObject({
type: 'single',
provider: 'bedrock',
model: 'global.anthropic.claude-opus-4-6-v1',
region: 'AWS_REGION',
accessKeyId: 'AWS_ACCESS_KEY_ID',
secretAccessKey: 'AWS_SECRET_ACCESS_KEY',
provider: 'openrouter',
model: 'anthropic/claude-opus-4.6',
apiKey: 'OPENROUTER_API_KEY',
reasoning: { enabled: true },
verbosity: 'high',
providerRouting: {
only: ['amazon-bedrock'],
allowFallbacks: false,
},
})
expect(opus.evalConfig.num_workers).toBe(2)
expect(openrouterBedrockOpus.evalConfig.num_workers).toBe(2)
})
it('adapts claude-code configs without provider credentials', async () => {

View File

@@ -0,0 +1,137 @@
import { describe, expect, it } from 'bun:test'
import {
isProviderExecutionError,
isProviderRateLimitError,
retryProviderErrors,
} from '../../src/utils/provider-error-retry'
/** Test fixture: an SDK-style API call error with a 500 status and a JSON body. */
function providerError(message = 'Provider returned error'): Error {
  return Object.assign(new Error(message), {
    name: 'APICallError',
    statusCode: 500,
    responseBody: '{"error":"upstream failed"}',
  })
}
/**
 * Creates a fake rate-limit failure for the retry tests.
 *
 * @returns An Error named `AI_APICallError` with `statusCode` 429 and a
 *   "rate limit exceeded" message.
 */
function rateLimitError(): Error {
  return Object.assign(
    new Error('rate limit exceeded, please try again later'),
    { name: 'AI_APICallError', statusCode: 429 },
  )
}
/**
 * Runs `fn` with `console.warn` and `console.error` replaced by no-ops, then
 * restores both, whether `fn` resolves or rejects.
 *
 * @param fn Async callback to run while console output is silenced.
 * @returns Whatever `fn` resolves to.
 */
async function withoutRetryWarnings<T>(fn: () => Promise<T>): Promise<T> {
  const saved = { warn: console.warn, error: console.error }
  console.warn = () => {}
  console.error = () => {}
  try {
    const outcome = await fn()
    return outcome
  } finally {
    // Always put the real console methods back so later tests are unaffected.
    console.warn = saved.warn
    console.error = saved.error
  }
}
// Test suite for the provider-error classification helpers and the
// retryProviderErrors wrapper.
describe('provider error retries', () => {
// Classification: both fake SDK errors count as provider errors, a plain Error
// with a rate-limit message also counts, and an unrelated Error does not.
it('detects provider errors from SDK-style markers', () => {
expect(isProviderExecutionError(providerError())).toBe(true)
expect(isProviderExecutionError(rateLimitError())).toBe(true)
expect(isProviderRateLimitError(rateLimitError())).toBe(true)
expect(
isProviderExecutionError(
new Error('rate limit exceeded, please try again later'),
),
).toBe(true)
expect(isProviderExecutionError(new Error('regular tool failure'))).toBe(
false,
)
})
// Three failures followed by a success: with retries: 5 the fourth call wins.
it('retries provider errors and returns a later success', async () => {
await withoutRetryWarnings(async () => {
let calls = 0
const result = await retryProviderErrors(
async () => {
calls++
if (calls <= 3) throw providerError()
return 'ok'
},
{ label: 'test', retries: 5, windowMs: 0 },
)
expect(result).toBe('ok')
expect(calls).toBe(4)
})
})
// The general budget is zero (retries: 0), so the two retries observed here
// can only come from the rate-limit specific options.
it('uses the rate-limit retry policy for provider rate limits', async () => {
await withoutRetryWarnings(async () => {
let calls = 0
const result = await retryProviderErrors(
async () => {
calls++
if (calls <= 2) throw rateLimitError()
return 'ok'
},
{
label: 'test',
retries: 0,
windowMs: 0,
rateLimitRetries: 2,
rateLimitWindowMs: 0,
},
)
expect(result).toBe('ok')
expect(calls).toBe(3)
})
})
// Patches the console by hand (instead of withoutRetryWarnings) because the
// console.error output is captured and asserted on below.
it('throws the final provider error after retries are exhausted', async () => {
const originalWarn = console.warn
const originalError = console.error
const errorLogs: string[] = []
console.warn = () => {}
console.error = (message?: unknown) => {
errorLogs.push(String(message))
}
try {
let calls = 0
await expect(
retryProviderErrors(
async () => {
calls++
throw providerError()
},
{ label: 'test', retries: 5, windowMs: 0 },
),
).rejects.toThrow('Provider returned error')
// 1 initial attempt + 5 retries.
expect(calls).toBe(6)
expect(errorLogs.join('\n')).toContain(
'provider error persisted after 6 attempts',
)
// The final log is expected to include the provider's response body.
expect(errorLogs.join('\n')).toContain('responseBody')
expect(errorLogs.join('\n')).toContain('upstream failed')
} finally {
console.warn = originalWarn
console.error = originalError
}
})
// A plain Error without provider markers must be rethrown after one call.
it('does not retry non-provider errors', async () => {
let calls = 0
await expect(
retryProviderErrors(
async () => {
calls++
throw new Error('tool failed')
},
{ label: 'test', retries: 5, windowMs: 0 },
),
).rejects.toThrow('tool failed')
expect(calls).toBe(1)
})
})

View File

@@ -32,6 +32,11 @@ import { buildMemoryToolSet } from '../tools/memory/build-toolset'
import type { ToolRegistry } from '../tools/tool-registry'
import { CHAT_MODE_ALLOWED_TOOLS } from './chat-mode'
import { createCompactionPrepareStep, type StepWithUsage } from './compaction'
import {
GUI_CLICK_ONLY_BROWSER_TOOL_NAMES,
GUI_CLICK_ONLY_MODE,
isGuiClickOnlyBrowserToolAllowed,
} from './gui-click-only'
import { buildMcpServerSpecs, createMcpClients } from './mcp-builder'
import {
getMessageNormalizationOptions,
@@ -101,6 +106,7 @@ export class AiSdkAgent {
session: {
origin: config.resolvedConfig.origin,
originPageId,
suppressSnapshotOutputs: GUI_CLICK_ONLY_MODE,
},
aclRules: config.aclRules,
}
@@ -109,32 +115,48 @@ export class AiSdkAgent {
toolContext,
config.resolvedConfig.toolApprovalConfig,
)
const browserTools = config.resolvedConfig.chatMode
? Object.fromEntries(
Object.entries(allBrowserTools).filter(([name]) =>
CHAT_MODE_ALLOWED_TOOLS.has(name),
),
)
: allBrowserTools
let browserTools = allBrowserTools
if (GUI_CLICK_ONLY_MODE) {
browserTools = Object.fromEntries(
Object.entries(allBrowserTools).filter(([name]) =>
isGuiClickOnlyBrowserToolAllowed(name),
),
)
} else if (config.resolvedConfig.chatMode) {
browserTools = Object.fromEntries(
Object.entries(allBrowserTools).filter(([name]) =>
CHAT_MODE_ALLOWED_TOOLS.has(name),
),
)
}
if (config.resolvedConfig.chatMode) {
logger.info('Chat mode enabled, restricting to read-only browser tools', {
allowedTools: Array.from(CHAT_MODE_ALLOWED_TOOLS),
})
}
if (GUI_CLICK_ONLY_MODE) {
logger.info('GUI click-only mode enabled, restricting browser tools', {
allowedTools: Array.from(GUI_CLICK_ONLY_BROWSER_TOOL_NAMES),
})
}
// Get Klavis tools from shared background handle (no per-session connection).
// Only expose when user has enabled servers — matches old per-session gating.
const klavisTools =
!GUI_CLICK_ONLY_MODE &&
config.klavisRef?.handle &&
config.browserContext?.enabledMcpServers?.length
? buildKlavisToolSet(config.klavisRef.handle)
: {}
// Connect custom (non-Klavis) MCP servers per-session
const specs = await buildMcpServerSpecs({
browserContext: config.browserContext,
})
const { clients, tools: customMcpTools } = await createMcpClients(specs)
const { clients, tools: customMcpTools } = GUI_CLICK_ONLY_MODE
? { clients: [] as Array<{ close(): Promise<void> }>, tools: {} }
: await createMcpClients(
await buildMcpServerSpecs({
browserContext: config.browserContext,
}),
)
const collidingToolNames = Object.keys(customMcpTools).filter(
(name) => name in klavisTools,
)
@@ -183,12 +205,15 @@ export class AiSdkAgent {
// Add filesystem tools — skip in chat mode (read-only) and when no workspace is selected
const filesystemTools =
!config.resolvedConfig.chatMode && config.resolvedConfig.workingDir
!GUI_CLICK_ONLY_MODE &&
!config.resolvedConfig.chatMode &&
config.resolvedConfig.workingDir
? buildFilesystemToolSet(config.resolvedConfig.workingDir)
: {}
const memoryTools = config.resolvedConfig.chatMode
? {}
: buildMemoryToolSet()
const memoryTools =
config.resolvedConfig.chatMode || GUI_CLICK_ONLY_MODE
? {}
: buildMemoryToolSet()
const tools = {
...browserTools,
...externalMcpTools,
@@ -212,6 +237,15 @@ export class AiSdkAgent {
) {
excludeSections.push('nudges')
}
if (GUI_CLICK_ONLY_MODE) {
excludeSections.push(
'external-integrations',
'memory-and-identity',
'workspace',
'skills',
'nudges',
)
}
const soulContent = await readSoul()
const isBootstrap = await isSoulBootstrap()
@@ -233,6 +267,8 @@ export class AiSdkAgent {
declinedApps: config.resolvedConfig.declinedApps,
skillsCatalog,
origin: config.resolvedConfig.origin,
guiClickOnly: GUI_CLICK_ONLY_MODE,
evalMode: config.resolvedConfig.evalMode,
})
// Configure compaction for context window management
@@ -267,6 +303,45 @@ export class AiSdkAgent {
tools,
stopWhen: [stepCountIs(AGENT_LIMITS.MAX_TURNS)],
prepareStep,
onFinish: (event) => {
const previousStep = event.steps.at(-2)
const totalToolCalls = event.steps.reduce(
(sum, step) => sum + step.toolCalls.length,
0,
)
const totalToolResults = event.steps.reduce(
(sum, step) => sum + step.toolResults.length,
0,
)
logger.info('Agent tool loop finished', {
conversationId: config.resolvedConfig.conversationId,
provider: config.resolvedConfig.provider,
model: config.resolvedConfig.model,
finishReason: event.finishReason,
rawFinishReason: event.rawFinishReason,
stepNumber: event.stepNumber,
stepCount: event.steps.length,
finalTextLength: event.text.length,
emptyFinalText: event.text.trim().length === 0,
lastStepToolCallCount: event.toolCalls.length,
lastStepToolResultCount: event.toolResults.length,
previousStepFinishReason: previousStep?.finishReason,
previousStepToolCallCount: previousStep?.toolCalls.length,
previousStepToolResultCount: previousStep?.toolResults.length,
endedAfterToolResult:
event.toolCalls.length === 0 &&
(previousStep?.toolResults.length ?? 0) > 0,
totalToolCalls,
totalToolResults,
totalInputTokens: event.totalUsage.inputTokens,
totalOutputTokens: event.totalUsage.outputTokens,
totalTokens: event.totalUsage.totalTokens,
finalStepInputTokens: event.usage.inputTokens,
finalStepOutputTokens: event.usage.outputTokens,
finalStepTotalTokens: event.usage.totalTokens,
})
},
...(isChatGPTPro && {
providerOptions: {
openai: {

View File

@@ -0,0 +1,18 @@
/**
 * Switch for the experimental GUI click-only agent mode.
 *
 * NOTE(review): this is a hard-coded `true`, so any code gated on it always
 * takes the click-only branch. Confirm this is intended outside eval runs.
 */
export const GUI_CLICK_ONLY_MODE = true

/** Names of the browser tools that remain available in GUI click-only mode. */
export const GUI_CLICK_ONLY_BROWSER_TOOL_NAMES = new Set<string>([
  // Visual interaction
  'click',
  'hover',
  'scroll',
  'type_text',
  'take_screenshot',
  // Page management
  'get_active_page',
  'list_pages',
  'navigate_page',
  'new_page',
  'close_page',
])

/**
 * Reports whether a browser tool may be exposed in GUI click-only mode.
 *
 * @param name Tool name to check.
 * @returns `true` when `name` is in {@link GUI_CLICK_ONLY_BROWSER_TOOL_NAMES}.
 */
export function isGuiClickOnlyBrowserToolAllowed(name: string): boolean {
  const allowed = GUI_CLICK_ONLY_BROWSER_TOOL_NAMES
  return allowed.has(name)
}

View File

@@ -19,6 +19,10 @@ type ToolResultContentPart = Extract<
type UserMessagePart = Exclude<UserContent, string>[number]
type UserMediaPart = Extract<UserMessagePart, ImagePart | FilePart>
/** Number of most-recent screenshot images kept in the messages sent to the model. */
const MAX_SCREENSHOTS_IN_MODEL_HISTORY = 3
/**
 * Text that replaces older screenshot images. The count is interpolated from
 * the limit above so the message cannot drift if the limit changes.
 */
const SCREENSHOT_HISTORY_PLACEHOLDER = `<screenshot omitted from context: keeping latest ${MAX_SCREENSHOTS_IN_MODEL_HISTORY} screenshots>`
export interface MessageNormalizationOptions {
supportsImages: boolean
supportsMediaInToolResults: boolean
@@ -113,6 +117,90 @@ function toolResultContentPartToUserMedia(
}
}
/**
 * Tells whether a tool-result part came from a screenshot-like tool.
 *
 * A part matches when it is a `tool-result` whose tool name is a string that
 * either contains `screenshot` or is exactly `snapshot`.
 */
function isScreenshotToolResult(part: ToolResultPart): boolean {
  if (part.type !== 'tool-result') return false
  const { toolName } = part
  if (typeof toolName !== 'string') return false
  return toolName === 'snapshot' || toolName.includes('screenshot')
}
/**
 * Tells whether one content part of a tool result is an image.
 *
 * `image-url` and `image-file-id` parts are always images. `media`,
 * `image-data` and `file-data` parts are images only when their media type
 * starts with `image/`. Every other part type is not an image.
 */
function isImageToolResultContentPart(part: ToolResultContentPart): boolean {
  if (part.type === 'image-url' || part.type === 'image-file-id') {
    return true
  }
  if (
    part.type === 'media' ||
    part.type === 'image-data' ||
    part.type === 'file-data'
  ) {
    return part.mediaType.startsWith('image/')
  }
  return false
}
/**
 * Limits how many screenshot images remain in the message history.
 *
 * Walks messages, parts and content values from newest to oldest. The first
 * MAX_SCREENSHOTS_IN_MODEL_HISTORY images found in screenshot tool results are
 * kept; every older image is replaced by a text part holding
 * SCREENSHOT_HISTORY_PLACEHOLDER.
 *
 * Inputs are never mutated: only messages, parts and value arrays that change
 * are copied, and the original `messages` array is returned when nothing was
 * replaced.
 *
 * @param messages Conversation history, oldest first.
 * @returns The pruned history, or `messages` itself if no image was replaced.
 */
function pruneScreenshotHistory(messages: ModelMessage[]): ModelMessage[] {
// Budget of images that may still be kept; consumed newest-first.
let remainingScreenshots = MAX_SCREENSHOTS_IN_MODEL_HISTORY
let changed = false
// Shallow copy so individual messages can be swapped without touching the input.
const pruned = [...messages]
for (
let messageIndex = messages.length - 1;
messageIndex >= 0;
messageIndex--
) {
const message = messages[messageIndex]
// Only tool messages are inspected here.
if (message.role !== 'tool') continue
let messageChanged = false
const content = [...message.content]
for (let partIndex = content.length - 1; partIndex >= 0; partIndex--) {
const part = content[partIndex]
// Skip anything that is not a screenshot tool result with structured content.
if (
part.type !== 'tool-result' ||
part.output.type !== 'content' ||
!isScreenshotToolResult(part)
) {
continue
}
let partChanged = false
const value = [...part.output.value]
for (let valueIndex = value.length - 1; valueIndex >= 0; valueIndex--) {
if (!isImageToolResultContentPart(value[valueIndex])) continue
// Still within budget: keep this image and count it.
if (remainingScreenshots > 0) {
remainingScreenshots--
continue
}
// Budget exhausted: swap the image for the text placeholder.
value[valueIndex] = {
type: 'text',
text: SCREENSHOT_HISTORY_PLACEHOLDER,
}
partChanged = true
}
if (!partChanged) continue
content[partIndex] = {
...part,
output: { ...part.output, value },
}
messageChanged = true
}
if (!messageChanged) continue
pruned[messageIndex] = { ...message, content }
changed = true
}
// Hand back the original array when untouched so callers can compare by identity.
return changed ? pruned : messages
}
function normalizeToolMessageForModel(
message: ToolModelMessage,
supportsImages: boolean,
@@ -178,14 +266,16 @@ export function normalizeMessagesForModel(
messages: ModelMessage[],
options: MessageNormalizationOptions,
): ModelMessage[] {
const screenshotPrunedMessages = pruneScreenshotHistory(messages)
if (options.supportsMediaInToolResults) {
return messages
return screenshotPrunedMessages
}
let changed = false
const normalized: ModelMessage[] = []
for (const message of messages) {
for (const message of screenshotPrunedMessages) {
if (message.role !== 'tool') {
normalized.push(message)
continue
@@ -201,5 +291,5 @@ export function normalizeMessagesForModel(
normalized.push(...replacement)
}
return changed ? normalized : messages
return changed ? normalized : screenshotPrunedMessages
}

View File

@@ -31,6 +31,14 @@ function getRoleAndMode(
_exclude: Set<string>,
options?: BuildSystemPromptOptions,
): string {
if (options?.guiClickOnly) {
return `<role>
You are BrowserOS running in an experimental GUI click model mode. Page clicks are mediated through the \`click\` tool, which uses a visual model to choose coordinates from the current screenshot.
Your tool surface is intentionally small: open or manage pages, then interact with visible page targets through GUI-backed clicks. You cannot read page content or inspect elements through DOM, accessibility-tree, snapshot, page-text, link-extraction, or script-evaluation tools in this mode.
</role>`
}
const hasWorkspace = !!options?.workspaceDir
let role: string
@@ -124,6 +132,23 @@ function getCapabilities(
): string {
const hasWorkspace = !!options?.workspaceDir
if (options?.guiClickOnly) {
return `<capabilities>
## Your Capabilities
### Browser Control
Use these browser tools under the GUI click model constraint:
- \`click\` captures the current page screenshot internally, asks the GUI click model where to click based on your prompt, then executes that coordinate click. Make sure to be brief, concise and capture the semantic essence of where you want to click.
- \`hover\` captures the current page screenshot internally, asks the GUI model where to hover based on your prompt, then moves the cursor there.
- \`type_text\` types into the currently focused element. Use it after \`click\` focuses a text field.
- \`scroll\` scrolls the page viewport.
- \`take_screenshot\` returns a visual screenshot for feedback. It does not expose DOM, accessibility tree, page text, links, or scripts.
- \`get_active_page\`, \`list_pages\`, \`navigate_page\`, \`new_page\`, and \`close_page\` are available for opening and managing pages.
You cannot inspect the DOM, accessibility tree, snapshots, page text, links, or scripts. Use the Page ID from Browser Context directly and issue concise visual click prompts for page targets.
</capabilities>`
}
let capabilities = `<capabilities>
## Your Capabilities
@@ -195,6 +220,22 @@ function getExecution(
_exclude: Set<string>,
options?: BuildSystemPromptOptions,
): string {
if (options?.guiClickOnly) {
return `<execution>
## Execution
- Use \`click\` for visible page targets. It is the only click path that should choose page coordinates.
- Use \`hover\` for visible hover targets, \`type_text\` after focusing a field, and \`scroll\` to move the viewport.
- Use \`take_screenshot\` when you need explicit visual feedback about the current page before choosing the next action.
- After each \`click\` or \`hover\`, inspect the returned \`hitElement\` before choosing the next action. If it is null or does not match the intended target, use \`take_screenshot\` or retry with a more specific visual prompt.
- Use \`new_page\` or \`navigate_page\` to open websites. Use \`get_active_page\`, \`list_pages\`, and \`close_page\` only when needed for page management.
- Use the Page ID from Browser Context directly.
- Do not try to observe the page with snapshots, DOM, accessibility trees, scripts, link extraction, or text extraction.
- You are blind to page content except for explicit \`take_screenshot\` results. Make one concise visual click prompt at a time, then continue from your best estimate of the resulting page state.
- If the task clearly cannot proceed without page observation, say what blocked you.
</execution>`
}
const isNewTab = options?.origin === 'newtab'
let executionContent = `<execution>
@@ -283,6 +324,21 @@ function getToolSelection(
_exclude: Set<string>,
options?: BuildSystemPromptOptions,
): string {
if (options?.guiClickOnly) {
return `<tool_selection>
## Tool Selection
- Use \`click\` for visible page targets.
- Use \`hover\` for hover menus or targets that reveal content.
- Use \`type_text\` only after a prior GUI click likely focused a text input. Include a newline in \`text\` when you need to submit with Enter.
- Use \`scroll\` to move the page viewport when the target is likely below or above the visible area.
- Use \`take_screenshot\` when you need visual feedback before deciding what to click, type, hover, or scroll next.
- The \`prompt\` argument should describe the visible target to click, for example: "click the search box", "click the Add to Cart button", or "click the first product result".
- Use page-opening and page-management tools only to get to the website or manage tabs; they do not replace visual page clicking.
- Do not request or rely on element IDs, snapshots, DOM nodes, page text, scripts, link extraction, or coordinate click tools.
</tool_selection>`
}
const isNewTab = options?.origin === 'newtab'
const navTable = isNewTab
@@ -413,6 +469,19 @@ function getErrorRecovery(
): string {
const hasWorkspace = !!options?.workspaceDir
if (options?.guiClickOnly) {
return `<error_recovery>
## Error Recovery
### Browser interaction errors
- If a click does not appear to make progress, try one more click with a more specific visual prompt.
- After 2 failed attempts, describe the blocker and ask the user for guidance.
### Page errors
- If you infer that a site is blocked by login, CAPTCHA, 2FA, geo-blocking, or payment confirmation, pause and ask the user to handle it.
</error_recovery>`
}
let recovery = `<error_recovery>
## Error Recovery
@@ -455,6 +524,28 @@ function getErrorRecovery(
return recovery
}
// -----------------------------------------------------------------------------
// section: eval-mode
// -----------------------------------------------------------------------------
/**
 * Builds the `<eval_mode>` system-prompt section.
 *
 * @param _exclude Unused; present to match the other prompt section functions.
 * @param options Prompt options; only `evalMode` is read.
 * @returns An empty string unless `options.evalMode` is truthy, otherwise the
 *   eval-mode instructions below.
 */
function getEvalMode(
_exclude: Set<string>,
options?: BuildSystemPromptOptions,
): string {
// Outside eval mode this section contributes nothing.
if (!options?.evalMode) return ''
return `<eval_mode>
## Eval Mode
You are running inside a benchmark environment. The user's task is pre-authorized for the eval clone site.
- Do not ask the user for confirmation before clicking final submit, place-order, book, send, save, create, or similar completion buttons required by the task.
- If the task asks you to complete a workflow and the visible page asks for fields the user did not specify, choose reasonable dummy values.
- Complete the task end-to-end using the available tools, then report the outcome.
- Only pause for credentials, CAPTCHA, 2FA, real payment confirmation, or blocked site access.
</eval_mode>`
}
// -----------------------------------------------------------------------------
// section: memory-and-identity
// -----------------------------------------------------------------------------
@@ -601,6 +692,20 @@ function getStyle(
_exclude: Set<string>,
options?: BuildSystemPromptOptions,
): string {
if (options?.guiClickOnly) {
return `<style_rules>
## Style
<tool_call_style>
- Keep click prompts concise and visual.
- Do not narrate routine clicks before calling the tool.
</tool_call_style>
- Be concise.
- Report blockers plainly when GUI clicks and page opening are insufficient.
</style_rules>`
}
const hasWorkspace = !!options?.workspaceDir
let style = `<style_rules>
@@ -664,8 +769,9 @@ function getUserContext(
'\nYou are running as a **scheduled background task** on a system-managed hidden page.'
}
pageCtx +=
'\n\n**CRITICAL RULES:**\n1. **Do NOT call `get_active_page` or `list_pages` to find your starting page.** Use the **page ID from the Browser Context** directly.'
pageCtx += options?.guiClickOnly
? '\n\n**CRITICAL RULE:** Use the **page ID from the Browser Context** directly when calling `click`.'
: '\n\n**CRITICAL RULES:**\n1. **Do NOT call `get_active_page` or `list_pages` to find your starting page.** Use the **page ID from the Browser Context** directly.'
if (options?.isScheduledTask) {
const pageRef = options.scheduledTaskPageId
@@ -725,6 +831,7 @@ const promptSections: Record<string, PromptSectionFn> = {
) => getToolSelection(_exclude, options),
'external-integrations': getExternalIntegrations,
'error-recovery': getErrorRecovery,
'eval-mode': getEvalMode,
'memory-and-identity': getMemoryAndIdentity,
workspace: getWorkspace,
skills: (_exclude: Set<string>, options?: BuildSystemPromptOptions) =>
@@ -751,10 +858,21 @@ export interface BuildSystemPromptOptions {
skillsCatalog?: string
/** Where the chat session originates from — determines navigation behavior. */
origin?: 'sidepanel' | 'newtab'
/** Experimental mode: browser control is limited to GUI-backed click only. */
guiClickOnly?: boolean
/** Eval mode: benchmark tasks are pre-authorized within clone sites. */
evalMode?: boolean
}
export function buildSystemPrompt(options?: BuildSystemPromptOptions): string {
const exclude = new Set(options?.exclude)
if (options?.guiClickOnly) {
exclude.add('external-integrations')
exclude.add('memory-and-identity')
exclude.add('workspace')
exclude.add('skills')
exclude.add('nudges')
}
const sections = Object.entries(promptSections)
.filter(([key]) => !exclude.has(key))

View File

@@ -44,13 +44,46 @@ function createGoogleFactory(
return createGoogleGenerativeAI({ apiKey: config.apiKey })
}
/**
 * Builds the OpenRouter-specific fields merged into every request body.
 *
 * Only fields that are explicitly configured are emitted, so an unconfigured
 * agent produces an empty object.
 *
 * - `reasoning`: forwards `enabled`, `effort` and `maxTokens` (sent as
 *   `max_tokens`). Previously only `enabled` was forwarded, so a config that
 *   set just `effort` or `maxTokens` was silently ignored.
 * - `verbosity`: forwarded unchanged.
 * - `provider`: routing preferences, with camelCase keys mapped to the
 *   snake_case names (`allow_fallbacks`, `require_parameters`).
 *
 * @param config Resolved agent configuration.
 * @returns Extra body fields; may be empty.
 */
function buildOpenRouterExtraBody(
  config: ResolvedAgentConfig,
): Record<string, unknown> {
  const body: Record<string, unknown> = {}

  const reasoning: Record<string, unknown> = {}
  if (config.reasoning?.enabled !== undefined) {
    reasoning.enabled = config.reasoning.enabled
  }
  if (config.reasoning?.effort !== undefined) {
    reasoning.effort = config.reasoning.effort
  }
  if (config.reasoning?.maxTokens !== undefined) {
    reasoning.max_tokens = config.reasoning.maxTokens
  }
  // Omit the key entirely when nothing was configured.
  if (Object.keys(reasoning).length > 0) body.reasoning = reasoning

  if (config.verbosity !== undefined) body.verbosity = config.verbosity

  if (config.providerRouting !== undefined) {
    body.provider = {
      ...(config.providerRouting.order !== undefined && {
        order: config.providerRouting.order,
      }),
      ...(config.providerRouting.only !== undefined && {
        only: config.providerRouting.only,
      }),
      ...(config.providerRouting.ignore !== undefined && {
        ignore: config.providerRouting.ignore,
      }),
      ...(config.providerRouting.allowFallbacks !== undefined && {
        allow_fallbacks: config.providerRouting.allowFallbacks,
      }),
      ...(config.providerRouting.requireParameters !== undefined && {
        require_parameters: config.providerRouting.requireParameters,
      }),
    }
  }
  return body
}
function createOpenRouterFactory(
config: ResolvedAgentConfig,
): (modelId: string) => unknown {
if (!config.apiKey) throw new Error('OpenRouter provider requires apiKey')
const extraBody = buildOpenRouterExtraBody(config)
return createOpenRouter({
apiKey: config.apiKey,
extraBody: { reasoning: {} },
...(Object.keys(extraBody).length > 0 ? { extraBody } : {}),
fetch: createOpenRouterCompatibleFetch(),
})
}

View File

@@ -79,6 +79,7 @@ export function buildBrowserToolSet(
content: result.content,
isError: result.isError ?? false,
metadata: result.metadata,
structuredContent: result.structuredContent,
}
} catch (error) {
const errorText =

View File

@@ -4,7 +4,10 @@
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
import type { ToolApprovalConfig } from '@browseros/shared/constants/tool-approval'
import type { LLMProvider } from '@browseros/shared/schemas/llm'
import type {
LLMProvider,
OpenRouterProviderRouting,
} from '@browseros/shared/schemas/llm'
export interface ProviderConfig {
provider: LLMProvider
@@ -34,6 +37,9 @@ export interface ResolvedAgentConfig {
accountId?: string
reasoningEffort?: string
reasoningSummary?: string
reasoning?: { enabled?: boolean; maxTokens?: number; effort?: string }
verbosity?: 'low' | 'medium' | 'high' | 'xhigh' | 'max'
providerRouting?: OpenRouterProviderRouting
contextWindowSize?: number
userSystemPrompt?: string
workingDir?: string

View File

@@ -4,7 +4,11 @@
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
import { createAgentUIStreamResponse, type UIMessage } from 'ai'
import {
createAgentUIStreamResponse,
type FinishReason,
type UIMessage,
} from 'ai'
import { AiSdkAgent } from '../../agent/ai-sdk-agent'
import { formatUserMessage } from '../../agent/format-message'
import {
@@ -272,6 +276,9 @@ export class ChatService {
// Handle tool approval responses: patch the agent's messages and re-run
if (request.toolApprovalResponses?.length) {
let finishReason: FinishReason | undefined
let rawFinishReason: string | undefined
let stepNumber: number | undefined
this.applyToolApprovalResponses(
session.agent.messages,
request.toolApprovalResponses,
@@ -284,8 +291,21 @@ export class ChatService {
agent: session.agent.toolLoopAgent,
uiMessages: filterValidMessages(session.agent.messages),
abortSignal,
onStepFinish: (step) => {
finishReason = step.finishReason
rawFinishReason = step.rawFinishReason
stepNumber = step.stepNumber
},
onFinish: async ({ messages }: { messages: UIMessage[] }) => {
session.agent.messages = filterValidMessages(messages)
logger.info('Agent execution complete', {
conversationId: request.conversationId,
totalMessages: messages.length,
finishReason,
rawFinishReason,
stepNumber,
isToolApprovalResponse: true,
})
},
})
}
@@ -333,10 +353,19 @@ export class ChatService {
: msg,
)
let finishReason: FinishReason | undefined
let rawFinishReason: string | undefined
let stepNumber: number | undefined
return createAgentUIStreamResponse({
agent: session.agent.toolLoopAgent,
uiMessages: promptUiMessages,
abortSignal,
onStepFinish: (step) => {
finishReason = step.finishReason
rawFinishReason = step.rawFinishReason
stepNumber = step.stepNumber
},
onFinish: async ({ messages }: { messages: UIMessage[] }) => {
// The agent loop returns `messages` containing the prompt-
// wrapped user text. Restore the raw form before persisting
@@ -354,6 +383,9 @@ export class ChatService {
logger.info('Agent execution complete', {
conversationId: request.conversationId,
totalMessages: restored.length,
finishReason,
rawFinishReason,
stepNumber,
})
if (session?.hiddenPageId) {

View File

@@ -364,6 +364,31 @@ export class Browser {
}
}
/**
 * Resolves the page's currently focused element to an actionable element id.
 *
 * Evaluates a script that follows `activeElement` through nested shadow roots,
 * describes the resulting node to get its backend node id, and passes that to
 * `resolveActionableElement`.
 *
 * @param pageId Page whose focused element should be resolved.
 * @returns The resolved id, or `null` when nothing is focused, the node has no
 *   backend id, or any protocol call throws.
 */
async resolveFocusedElement(pageId: number): Promise<number | null> {
  const session = await this.resolveSession(pageId)
  // Follows activeElement through shadow roots to the innermost focused element.
  const focusedElementExpression = `(() => {
let element = document.activeElement;
while (element?.shadowRoot?.activeElement) {
element = element.shadowRoot.activeElement;
}
return element instanceof Element ? element : null;
})()`
  try {
    const { result: remoteObject } = await session.Runtime.evaluate({
      expression: focusedElementExpression,
    })
    const objectId = remoteObject?.objectId
    if (!objectId) return null

    const { node } = await session.DOM.describeNode({ objectId })
    const backendNodeId = node?.backendNodeId
    if (!backendNodeId) return null

    return await this.resolveActionableElement(pageId, backendNodeId)
  } catch {
    // Best effort: any failure is reported as "no focused element".
    return null
  }
}
async resolveElementProperties(
pageId: number,
backendNodeId: number,
@@ -911,6 +936,16 @@ export class Browser {
}
}
/**
 * Reads the page's viewport size from `Page.getLayoutMetrics`.
 *
 * Uses `cssVisualViewport` when present and `cssLayoutViewport` otherwise.
 *
 * @param page Page id to measure.
 * @returns The viewport's `clientWidth` and `clientHeight` as `width`/`height`.
 */
async viewportSize(page: number): Promise<{ width: number; height: number }> {
  const session = await this.resolveSession(page)
  const layout = await session.Page.getLayoutMetrics()
  const { clientWidth: width, clientHeight: height } =
    layout.cssVisualViewport ?? layout.cssLayoutViewport
  return { width, height }
}
async evaluate(
page: number,
expression: string,
@@ -1099,6 +1134,11 @@ export class Browser {
await keyboard.typeText(session, text)
}
/**
 * Types `text` on the given page via `keyboard.typeText`, without targeting an
 * element first.
 *
 * @param page Page id to type into.
 * @param text Text to type.
 */
async typeText(page: number, text: string): Promise<void> {
  const pageSession = await this.resolveSession(page)
  return keyboard.typeText(pageSession, text)
}
async dragAt(
page: number,
from: { x: number; y: number },
@@ -1212,68 +1252,16 @@ export class Browser {
if (deltaX === 0 && deltaY === 0) return
let x: number
let y: number
if (element !== undefined) {
const center = await elements.getElementCenter(session, element)
x = center.x
y = center.y
} else {
const metrics = await session.Page.getLayoutMetrics()
x = metrics.layoutViewport.clientWidth / 2
y = metrics.layoutViewport.clientHeight / 2
await mouse.dispatchScroll(session, center.x, center.y, deltaX, deltaY)
return
}
const beforeWindowPosition =
element === undefined
? await this.getWindowScrollPosition(session)
: undefined
await mouse.dispatchScroll(session, x, y, deltaX, deltaY)
if (beforeWindowPosition === undefined) return
const afterWindowPosition = await this.getWindowScrollPosition(session)
const moved = this.didScrollInExpectedDirection(
beforeWindowPosition,
afterWindowPosition,
deltaX,
deltaY,
)
if (moved) return
await this.fallbackWindowScroll(session, deltaX, deltaY)
await this.scrollWindow(session, deltaX, deltaY)
}
private async getWindowScrollPosition(
session: ProtocolApi,
): Promise<{ x: number; y: number }> {
const result = await session.Runtime.evaluate({
expression:
'({ x: window.scrollX ?? window.pageXOffset ?? 0, y: window.scrollY ?? window.pageYOffset ?? 0 })',
returnByValue: true,
})
const value = (result.result?.value ?? {}) as { x?: number; y?: number }
return {
x: typeof value.x === 'number' ? value.x : 0,
y: typeof value.y === 'number' ? value.y : 0,
}
}
private didScrollInExpectedDirection(
before: { x: number; y: number },
after: { x: number; y: number },
deltaX: number,
deltaY: number,
): boolean {
if (deltaX > 0 && after.x > before.x) return true
if (deltaX < 0 && after.x < before.x) return true
if (deltaY > 0 && after.y > before.y) return true
if (deltaY < 0 && after.y < before.y) return true
return false
}
private async fallbackWindowScroll(
private async scrollWindow(
session: ProtocolApi,
deltaX: number,
deltaY: number,

View File

@@ -44,11 +44,44 @@ function createGoogleModel(config: ResolvedLLMConfig): LanguageModel {
return createGoogleGenerativeAI({ apiKey: config.apiKey })(config.model)
}
/**
 * Builds the OpenRouter-specific fields merged into the request body.
 *
 * Emits `reasoning.enabled`, `verbosity` and `provider` routing preferences,
 * each only when configured. Routing keys `allowFallbacks` and
 * `requireParameters` are sent as `allow_fallbacks` and `require_parameters`.
 *
 * @param config Resolved LLM configuration.
 * @returns Extra body fields; empty when nothing is configured.
 */
function buildOpenRouterExtraBody(
  config: ResolvedLLMConfig,
): Record<string, unknown> {
  const extra: Record<string, unknown> = {}

  const reasoningEnabled = config.reasoning?.enabled
  if (reasoningEnabled !== undefined) {
    extra.reasoning = { enabled: reasoningEnabled }
  }

  if (config.verbosity !== undefined) {
    extra.verbosity = config.verbosity
  }

  const routing = config.providerRouting
  if (routing === undefined) return extra

  // Copy only the routing preferences that were actually set.
  const provider: Record<string, unknown> = {}
  if (routing.order !== undefined) provider.order = routing.order
  if (routing.only !== undefined) provider.only = routing.only
  if (routing.ignore !== undefined) provider.ignore = routing.ignore
  if (routing.allowFallbacks !== undefined) {
    provider.allow_fallbacks = routing.allowFallbacks
  }
  if (routing.requireParameters !== undefined) {
    provider.require_parameters = routing.requireParameters
  }
  extra.provider = provider
  return extra
}
function createOpenRouterModel(config: ResolvedLLMConfig): LanguageModel {
if (!config.apiKey) throw new Error('OpenRouter provider requires apiKey')
const extraBody = buildOpenRouterExtraBody(config)
return createOpenRouter({
apiKey: config.apiKey,
extraBody: { reasoning: {} },
...(Object.keys(extraBody).length > 0 ? { extraBody } : {}),
fetch: createOpenRouterCompatibleFetch(),
})(config.model)
}

View File

@@ -1,5 +1,80 @@
import { APICallError } from '@ai-sdk/provider'
/** Narrows `value` to a plain (non-null, non-array) object. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value)
}

/**
 * Builds a readable error message from an OpenRouter-style error payload.
 *
 * Expects the shape `{ error: { message?, code?, metadata?: { raw? } } }`.
 * The result is `[code] message (raw-as-JSON)`, with the code prefix and raw
 * suffix included only when those fields carry a value.
 *
 * `null` counts as absent for `code` and `metadata.raw`, and an empty-string
 * message falls back to `fallback`, so the output never contains `[null]`,
 * `(null)` or a blank message.
 *
 * @param parsed Parsed response body of unknown shape.
 * @param fallback Message used when the payload has no usable message.
 * @returns The formatted message, or `fallback` when `parsed.error` is not an object.
 */
function getOpenRouterErrorMessage(parsed: unknown, fallback: string): string {
  if (!isRecord(parsed) || !isRecord(parsed.error)) return fallback

  const { message: rawMessage, code, metadata } = parsed.error
  let message =
    typeof rawMessage === 'string' && rawMessage.length > 0
      ? rawMessage
      : fallback

  // `!= null` skips both undefined and null.
  if (code != null) {
    message = `[${String(code)}] ${message}`
  }
  if (isRecord(metadata) && metadata.raw != null) {
    message += ` (${JSON.stringify(metadata.raw)})`
  }
  return message
}
/**
 * Extracts a numeric status from an OpenRouter-style error payload.
 *
 * @param parsed Parsed response body of unknown shape.
 * @param fallback Status used when `parsed.error.code` is not a number.
 * @returns `parsed.error.code` when it is a number, otherwise `fallback`.
 */
function getOpenRouterErrorStatus(parsed: unknown, fallback: number): number {
  if (isRecord(parsed) && isRecord(parsed.error)) {
    const { code } = parsed.error
    if (typeof code === 'number') return code
  }
  return fallback
}
/**
 * Removes duplicated reasoning fields from assistant messages in an outgoing
 * request body.
 *
 * For each assistant message that has a non-empty `reasoning_details` array
 * and also a `reasoning` or `reasoning_content` key, those two keys are
 * dropped and `reasoning_details` is kept. All other messages pass through.
 *
 * The original `options` object is returned unchanged when the body is not a
 * string, is not valid JSON, has no `messages` array, or needed no change.
 *
 * @param options Fetch init whose `body` may be a JSON string.
 * @returns `options`, or a copy of it with a re-serialised body.
 */
function sanitizeReasoningReplay(
  options?: RequestInit,
): RequestInit | undefined {
  const rawBody = options?.body
  if (typeof rawBody !== 'string') return options

  let payload: unknown
  try {
    payload = JSON.parse(rawBody)
  } catch {
    return options
  }
  if (!isRecord(payload) || !Array.isArray(payload.messages)) return options

  // True for assistant messages that carry reasoning in both forms.
  const needsStripping = (
    message: unknown,
  ): message is Record<string, unknown> =>
    isRecord(message) &&
    message.role === 'assistant' &&
    Array.isArray(message.reasoning_details) &&
    message.reasoning_details.length > 0 &&
    ('reasoning' in message || 'reasoning_content' in message)

  let strippedAny = false
  const messages = payload.messages.map((message: unknown) => {
    if (!needsStripping(message)) return message
    const {
      reasoning: _reasoning,
      reasoning_content: _reasoningContent,
      ...rest
    } = message
    strippedAny = true
    return rest
  })

  if (!strippedAny) return options
  return {
    ...options,
    body: JSON.stringify({ ...payload, messages }),
  }
}
/**
* Creates a fetch function that extracts detailed error messages from OpenRouter-style APIs.
*
@@ -13,25 +88,24 @@ import { APICallError } from '@ai-sdk/provider'
*/
export function createOpenRouterCompatibleFetch(): typeof fetch {
return (async (url: RequestInfo | URL, options?: RequestInit) => {
const response = await globalThis.fetch(url, options)
const response = await globalThis.fetch(
url,
sanitizeReasoningReplay(options),
)
let responseBody: string | undefined
let parsedResponseBody: unknown
if (!response.ok) {
const statusCode = response.status
let errorMessage = `HTTP ${statusCode}: ${response.statusText}`
let responseBody: string | undefined
try {
responseBody = await response.clone().text()
const parsed = JSON.parse(responseBody)
if (parsed.error?.message) {
errorMessage = parsed.error.message
if (parsed.error.code) {
errorMessage = `[${parsed.error.code}] ${errorMessage}`
}
if (parsed.error.metadata?.raw) {
errorMessage += ` (${JSON.stringify(parsed.error.metadata.raw)})`
}
}
parsedResponseBody = JSON.parse(responseBody)
errorMessage = getOpenRouterErrorMessage(
parsedResponseBody,
errorMessage,
)
} catch {
// Keep default error message if parsing fails
}
@@ -45,6 +119,28 @@ export function createOpenRouterCompatibleFetch(): typeof fetch {
})
}
if (response.headers.get('content-type')?.includes('application/json')) {
try {
responseBody = await response.clone().text()
parsedResponseBody = JSON.parse(responseBody)
} catch {
parsedResponseBody = undefined
}
if (isRecord(parsedResponseBody) && isRecord(parsedResponseBody.error)) {
throw new APICallError({
message: getOpenRouterErrorMessage(
parsedResponseBody,
'Provider returned error',
),
url: typeof url === 'string' ? url : url.toString(),
requestBodyValues: {},
statusCode: getOpenRouterErrorStatus(parsedResponseBody, 400),
responseBody,
})
}
}
return response
}) as typeof fetch
}

View File

@@ -19,6 +19,7 @@ const GUARDED_TOOLS = new Set([
'uncheck',
'select_option',
'press_key',
'type_text',
'upload_file',
])
@@ -39,6 +40,9 @@ async function resolveTargetElementId(
if (toolName === 'drag' && typeof args.sourceElement === 'number') {
return args.sourceElement
}
if (toolName === 'type_text') {
return (await browser.resolveFocusedElement(pageId)) ?? undefined
}
if (typeof args.x === 'number' && typeof args.y === 'number') {
return (

View File

@@ -29,6 +29,7 @@ export interface ToolDirectories {
/** Optional per-session settings, read by tool execution as `ctx.session`. */
export interface ToolSessionContext {
origin?: 'sidepanel' | 'newtab'
originPageId?: number
// When true, executeTool forwards this as `suppressSnapshots` to
// `response.build`, which then skips snapshot post-actions.
suppressSnapshotOutputs?: boolean
}
export type ToolContext = {
@@ -135,7 +136,9 @@ export async function executeTool(
response.error(`Internal error in ${tool.name}: ${message}`)
}
const result = await response.build(ctx.browser)
const result = await response.build(ctx.browser, {
suppressSnapshots: ctx.session?.suppressSnapshotOutputs,
})
const pageId = (args as Record<string, unknown>).page
if (typeof pageId === 'number') {

View File

@@ -0,0 +1,108 @@
import type { ToolContext } from './framework'
import {
getPngDimensionsFromBase64,
requestMolmoPoint,
} from './molmo-point-client'
/** Maximum number of characters of any free-text field kept in GUI point logs. */
const LOG_TEXT_MAX_CHARS = 500

/**
 * Caps a string at `LOG_TEXT_MAX_CHARS` characters for logging.
 *
 * @param value - Text to cap; `undefined` and empty strings pass through.
 * @returns The input unchanged when short enough, otherwise its first
 *   `LOG_TEXT_MAX_CHARS` characters followed by `... (+N chars)`.
 */
function truncateForLog(value: string | undefined): string | undefined {
  const overflow = (value?.length ?? 0) - LOG_TEXT_MAX_CHARS
  if (value === undefined || overflow <= 0) return value
  return `${value.slice(0, LOG_TEXT_MAX_CHARS)}... (+${overflow} chars)`
}
/** Result of resolving a visual prompt to a point on the page (see resolveGuiPoint). */
export interface GuiPointResult {
// Model point divided by the screenshot-to-viewport scale.
x: number
y: number
// Summary of the element found at (x, y), or null when none could be resolved.
hitElement: GuiHitElement | null
// Diagnostic fields (prompt, scale, screenshot size, ...) intended for logging.
log: Record<string, unknown>
}
/**
 * Summary of the element located at a resolved GUI point.
 * The text fields are length-capped via truncateForLog when built by summarizeHitElement.
 */
export interface GuiHitElement {
tagName: string
role?: string
ariaLabel?: string
labelText?: string
textContent?: string
}
/**
 * Reduces the browser's element-properties result to the fields reported
 * back to the agent, capping the free-text fields for log/output size.
 *
 * @param hitElement - Result of `browser.resolveElementProperties`, possibly empty.
 * @returns A compact summary, or `null` when no element was resolved.
 */
function summarizeHitElement(
  hitElement: Awaited<
    ReturnType<ToolContext['browser']['resolveElementProperties']>
  >,
): GuiHitElement | null {
  if (!hitElement) return null

  const { tagName, role, ariaLabel, labelText, textContent } = hitElement
  return {
    tagName,
    role,
    ariaLabel: truncateForLog(ariaLabel),
    labelText: truncateForLog(labelText),
    textContent: truncateForLog(textContent),
  }
}
/**
 * Resolves a visual prompt to viewport coordinates using the Molmo point model.
 *
 * Steps: take a viewport PNG screenshot, ask the point model for a pixel
 * location, convert that location from screenshot pixels to viewport
 * coordinates, then look up which element sits at that point.
 *
 * @param ctx - Tool context providing the browser.
 * @param page - Page ID to operate on.
 * @param prompt - Visual instruction describing the target.
 * @returns The resolved point, a summary of the hit element (or `null`), and
 *   a record of diagnostic values for logging.
 * @throws Whatever `browser.screenshot` or `requestMolmoPoint` throw; the
 *   viewport, page-info and hit-element lookups are best-effort and fall back
 *   to `null` on failure.
 */
export async function resolveGuiPoint(
  ctx: ToolContext,
  page: number,
  prompt: string,
): Promise<GuiPointResult> {
  const screenshot = await ctx.browser.screenshot(page, {
    format: 'png',
    fullPage: false,
  })
  const modelPoint = await requestMolmoPoint({
    instruction: prompt,
    imageB64: screenshot.data,
  })

  // Prefer the measured screenshot/viewport ratio; fall back to the reported
  // device pixel ratio when either measurement is unavailable.
  const dimensions = getPngDimensionsFromBase64(screenshot.data)
  const viewport = await ctx.browser.viewportSize(page).catch(() => null)
  const fallbackScale = screenshot.devicePixelRatio
  const scaleX =
    dimensions && viewport?.width
      ? dimensions.width / viewport.width
      : fallbackScale
  const scaleY =
    dimensions && viewport?.height
      ? dimensions.height / viewport.height
      : fallbackScale

  // A missing or zero scale is treated as 1 (no conversion).
  const x = modelPoint.x / (scaleX || 1)
  const y = modelPoint.y / (scaleY || 1)

  const pageInfo = await ctx.browser.refreshPageInfo(page).catch(() => null)
  const hitElementId = await ctx.browser
    .resolveElementAtPoint(page, x, y)
    .catch(() => null)

  let hitElement: Awaited<
    ReturnType<ToolContext['browser']['resolveElementProperties']>
  > | null = null
  if (hitElementId !== null) {
    hitElement = await ctx.browser
      .resolveElementProperties(page, hitElementId)
      .catch(() => null)
  }
  const hitElementSummary = summarizeHitElement(hitElement)

  const log: Record<string, unknown> = {
    page,
    pageUrl: truncateForLog(pageInfo?.url),
    pageTitle: truncateForLog(pageInfo?.title),
    prompt: truncateForLog(prompt),
    promptLength: prompt.length,
    promptTruncated: prompt.length > LOG_TEXT_MAX_CHARS,
    modelPoint,
    resolvedPoint: { x, y },
    scale: { x: scaleX, y: scaleY },
    screenshot: {
      width: dimensions?.width,
      height: dimensions?.height,
      devicePixelRatio: screenshot.devicePixelRatio,
    },
    viewport,
    hitElementId,
    hitElement: hitElementSummary,
  }

  return { x, y, hitElement: hitElementSummary, log }
}

View File

@@ -1,18 +1,88 @@
import { z } from 'zod'
import { defineToolWithCategory } from './framework'
import { logger } from '../lib/logger'
import { defineToolWithCategory, type ToolContext } from './framework'
import { type GuiHitElement, resolveGuiPoint } from './gui-click-resolver'
import type { ToolResponse } from './response'
const pageParam = z.number().describe('Page ID (from list_pages)')
const defineInputTool = defineToolWithCategory('input')
const elementParam = z
.number()
.describe('Element ID from snapshot (the number in [N])')
// Output schema for the element hit by a GUI click; same fields as
// GuiHitElement, and null when no element could be resolved at the point.
const guiHitElementOutput = z
.object({
tagName: z.string(),
role: z.string().optional(),
ariaLabel: z.string().optional(),
labelText: z.string().optional(),
textContent: z.string().optional(),
})
.nullable()
/**
 * Renders a string as a JSON string literal (quoted and escaped) so page
 * text embedded in agent-facing messages stays unambiguous.
 */
function quoteForAgent(value: string): string {
  const quoted = JSON.stringify(value)
  return quoted
}
/**
 * Builds the agent-facing sentence describing what a GUI click landed on.
 *
 * @param hitElement - Summary of the element at the click point, or `null`.
 * @returns A sentence listing `tagName` plus every non-empty optional field
 *   as `name="value"` pairs, or a fixed notice when no element was resolved.
 */
function formatHitElementForAgent(hitElement: GuiHitElement | null): string {
  if (!hitElement) {
    return 'The click was successful, but no hit element could be resolved at the click point.'
  }

  // Optional fields are reported in this fixed order, skipping empty ones.
  const optionalFields: Array<[string, string | undefined]> = [
    ['role', hitElement.role],
    ['ariaLabel', hitElement.ariaLabel],
    ['labelText', hitElement.labelText],
    ['textContent', hitElement.textContent],
  ]

  const details = [`tagName=${quoteForAgent(hitElement.tagName)}`]
  for (const [label, value] of optionalFields) {
    if (value) details.push(`${label}=${quoteForAgent(value)}`)
  }

  return `The click was successful and hit the element: ${details.join(', ')}.`
}
/**
 * Runs the ACL check for a tool call and reports a block on the response.
 *
 * @param toolName - Name of the tool being executed.
 * @param args - Tool arguments, including any resolved `x` / `y` coordinates.
 * @param ctx - Tool context; no check is performed when it has no ACL rules.
 * @param response - Receives the error message when the action is blocked.
 * @returns `true` when the action was blocked (caller must stop), else `false`.
 */
async function enforceAcl(
  toolName: string,
  args: Record<string, unknown>,
  ctx: ToolContext,
  response: ToolResponse,
): Promise<boolean> {
  if (!ctx.aclRules?.length) return false

  // Loaded lazily so the guard module is only pulled in when rules exist.
  const { checkAcl } = await import('./acl/acl-guard')
  const verdict = await checkAcl(toolName, args, ctx.browser, ctx.aclRules)
  if (!verdict.blocked) return false

  const { rule, pageId, elementId } = verdict
  const desc =
    rule?.description ?? rule?.textMatch ?? rule?.sitePattern ?? 'ACL rule'

  if (pageId !== undefined && elementId !== undefined) {
    await ctx.browser.highlightBlockedElement(pageId, elementId, desc)
  }

  response.error(
    `Action blocked by ACL rule: "${desc}". The element on this page is restricted. Choose a different action or skip this step.`,
  )
  return true
}
export const click = defineInputTool({
name: 'click',
description: 'Click an element by its ID from the last snapshot',
description:
'Click a visible page target using the GUI click model. Provide a concise visual prompt for what to click.',
input: z.object({
page: pageParam,
element: elementParam,
prompt: z
.string()
.min(1)
.describe('Visual click instruction, e.g. "click the search box"'),
button: z
.enum(['left', 'right', 'middle'])
.default('left')
@@ -25,27 +95,49 @@ export const click = defineInputTool({
output: z.object({
action: z.literal('click'),
page: z.number(),
element: z.number(),
prompt: z.string(),
button: z.enum(['left', 'right', 'middle']),
clickCount: z.number(),
x: z.number(),
y: z.number(),
hitElement: guiHitElementOutput,
guiPointDebug: z.record(z.unknown()).optional(),
}),
handler: async (args, ctx, response) => {
const coords = await ctx.browser.click(args.page, args.element, {
const { x, y, hitElement, log } = await resolveGuiPoint(
ctx,
args.page,
args.prompt,
)
const clickLog = {
...log,
clickPoint: { x, y },
button: args.button,
clickCount: args.clickCount,
}
const blocked = await enforceAcl('click', { ...args, x, y }, ctx, response)
if (blocked) {
logger.info('GUI click blocked by ACL', clickLog)
return
}
await ctx.browser.clickAt(args.page, x, y, {
button: args.button,
clickCount: args.clickCount,
})
const coordText = coords
? ` at (${Math.round(coords.x)}, ${Math.round(coords.y)})`
: ''
response.text(`Clicked [${args.element}]${coordText}`)
response.text(formatHitElementForAgent(hitElement))
response.data({
action: 'click',
page: args.page,
element: args.element,
prompt: args.prompt,
button: args.button,
clickCount: args.clickCount,
x,
y,
hitElement,
guiPointDebug: clickLog,
})
response.includeSnapshot(args.page)
},
})
@@ -146,22 +238,43 @@ export const drag_at = defineInputTool({
export const hover = defineInputTool({
name: 'hover',
description: 'Hover over an element by its ID',
description:
'Hover over a visible page target using the GUI click model. Provide a concise visual prompt for what to hover.',
input: z.object({
page: pageParam,
element: elementParam,
prompt: z
.string()
.min(1)
.describe('Visual hover instruction, e.g. "hover the account menu"'),
}),
output: z.object({
action: z.literal('hover'),
page: z.number(),
element: z.number(),
prompt: z.string(),
x: z.number(),
y: z.number(),
guiPointDebug: z.record(z.unknown()).optional(),
}),
handler: async (args, ctx, response) => {
const coords = await ctx.browser.hover(args.page, args.element)
response.text(
`Hovered over [${args.element}] at (${Math.round(coords.x)}, ${Math.round(coords.y)})`,
)
response.data({ action: 'hover', page: args.page, element: args.element })
const { x, y, log } = await resolveGuiPoint(ctx, args.page, args.prompt)
const hoverLog = { ...log, hoverPoint: { x, y } }
const blocked = await enforceAcl('hover', { ...args, x, y }, ctx, response)
if (blocked) {
logger.info('GUI hover blocked by ACL', hoverLog)
return
}
await ctx.browser.hoverAt(args.page, x, y)
response.text('tool call executed successfully')
response.data({
action: 'hover',
page: args.page,
prompt: args.prompt,
x,
y,
guiPointDebug: hoverLog,
})
},
})
@@ -251,6 +364,32 @@ export const press_key = defineInputTool({
},
})
/**
 * Input tool that types text into whichever element currently has focus.
 * The structured output reports only the text length, not the text itself.
 */
export const type_text = defineInputTool({
  name: 'type_text',
  description:
    'Type text into the currently focused element. Use after GUI click focuses a text field.',
  input: z.object({
    page: pageParam,
    text: z
      .string()
      .describe('Text to type into the currently focused element'),
  }),
  output: z.object({
    action: z.literal('type_text'),
    page: z.number(),
    textLength: z.number(),
  }),
  handler: async (args, ctx, response) => {
    const { page, text } = args
    await ctx.browser.typeText(page, text)

    response.text('tool call executed successfully')
    response.data({
      action: 'type_text',
      page,
      textLength: text.length,
    })
  },
})
export const drag = defineInputTool({
name: 'drag',
description:
@@ -303,7 +442,7 @@ export const drag = defineInputTool({
export const scroll = defineInputTool({
name: 'scroll',
description: 'Scroll the page or a specific element',
description: 'Scroll the page viewport',
input: z.object({
page: pageParam,
direction: z
@@ -311,32 +450,21 @@ export const scroll = defineInputTool({
.default('down')
.describe('Scroll direction'),
amount: z.number().default(3).describe('Number of scroll ticks'),
element: z
.number()
.optional()
.describe('Element ID to scroll at (scrolls page center if omitted)'),
}),
output: z.object({
action: z.literal('scroll'),
page: z.number(),
direction: z.enum(['up', 'down', 'left', 'right']),
amount: z.number(),
element: z.number().optional(),
}),
handler: async (args, ctx, response) => {
await ctx.browser.scroll(
args.page,
args.direction,
args.amount,
args.element,
)
await ctx.browser.scroll(args.page, args.direction, args.amount)
response.text(`Scrolled ${args.direction} by ${args.amount}`)
response.data({
action: 'scroll',
page: args.page,
direction: args.direction,
amount: args.amount,
element: args.element,
})
},
})

View File

@@ -0,0 +1,146 @@
import { Buffer } from 'node:buffer'
import { logger } from '../lib/logger'
import {
MOLMO_POINT_ENDPOINT,
MOLMO_POINT_MAX_NEW_TOKENS,
MOLMO_POINT_TIMEOUT_MS,
} from './molmo-point-config'
// One entry of the `points` array in the point service response. All fields
// are `unknown` because the payload is untrusted; only `x` and `y` are read
// (and validated) by firstValidPoint.
interface MolmoPoint {
object_id?: unknown
image_num?: unknown
x?: unknown
y?: unknown
}
// Untrusted JSON body returned by the point service. Only `points` is
// consumed in this module.
interface MolmoPointResponse {
text?: unknown
points?: unknown
}
/** A validated point (both coordinates are finite numbers) returned by requestMolmoPoint. */
export interface ClickPoint {
x: number
y: number
}
/** Pixel width and height read from a PNG header by getPngDimensionsFromBase64. */
export interface PngDimensions {
width: number
height: number
}
// Cap on the raw response text written to warning logs.
const MOLMO_POINT_RESPONSE_LOG_MAX_CHARS = 2_000
// Cap on the response body appended to the thrown error message.
const MOLMO_POINT_ERROR_BODY_MAX_CHARS = 500
// Cap on the instruction text written to warning logs (the request itself
// always carries the full instruction).
const MOLMO_POINT_INSTRUCTION_LOG_MAX_CHARS = 1_000
/** Resolves the absolute `/point` path against the configured Molmo endpoint. */
function pointUrl(): string {
  const url = new URL('/point', MOLMO_POINT_ENDPOINT)
  return url.href
}
/**
 * Caps `text` at `maxChars` characters.
 *
 * @param text - Text to cap.
 * @param maxChars - Maximum number of characters to keep.
 * @returns `text` unchanged when it fits, otherwise the first `maxChars`
 *   characters followed by `... (+N chars)` where N is the dropped length.
 */
function truncateText(text: string, maxChars: number): string {
  const overflow = text.length - maxChars
  return overflow > 0
    ? `${text.slice(0, maxChars)}... (+${overflow} chars)`
    : text
}
/**
 * Returns the first usable point from an untrusted `points` payload.
 *
 * Entries that are not objects (including `null` and `undefined`), or whose
 * `x` / `y` are not finite numbers, are skipped rather than dereferenced, so
 * a malformed entry never throws.
 *
 * @param points - The `points` field of the service response (any shape).
 * @returns The first entry with finite numeric `x` and `y`, or `null` when
 *   `points` is not an array or contains no such entry.
 */
function firstValidPoint(points: unknown): ClickPoint | null {
  if (!Array.isArray(points)) return null
  for (const rawPoint of points) {
    // Guard before property access: `null.x` / `undefined.x` would throw.
    if (typeof rawPoint !== 'object' || rawPoint === null) continue
    const { x, y } = rawPoint as MolmoPoint
    if (typeof x !== 'number' || typeof y !== 'number') continue
    if (!Number.isFinite(x) || !Number.isFinite(y)) continue
    return { x, y }
  }
  return null
}
/**
 * Asks the Molmo point service where to click for a visual instruction.
 *
 * Sends the instruction and a base64 screenshot to the `/point` endpoint and
 * returns the first valid point in the response. The request is aborted
 * after `MOLMO_POINT_TIMEOUT_MS`.
 *
 * @param args.instruction - Visual instruction; sent in full, truncated only in logs.
 * @param args.imageB64 - Base64-encoded screenshot.
 * @returns The first point whose `x` and `y` are finite numbers.
 * @throws Error when the service responds with a non-2xx status, when the
 *   body is not valid JSON (the parse error is rethrown after logging), or
 *   when the body contains no valid point. Network/timeout errors from
 *   `fetch` propagate unchanged.
 */
export async function requestMolmoPoint(args: {
  instruction: string
  imageB64: string
}): Promise<ClickPoint> {
  const endpoint = pointUrl()

  // Fields shared by every warning logged for this request.
  const requestLog = {
    endpoint,
    instruction: truncateText(
      args.instruction,
      MOLMO_POINT_INSTRUCTION_LOG_MAX_CHARS,
    ),
    instructionLength: args.instruction.length,
    instructionTruncated:
      args.instruction.length > MOLMO_POINT_INSTRUCTION_LOG_MAX_CHARS,
  }
  // Log-safe description of a raw response body.
  const describeBody = (body: string) => ({
    rawResponseText: truncateText(body, MOLMO_POINT_RESPONSE_LOG_MAX_CHARS),
    rawResponseTextLength: body.length,
    rawResponseTextTruncated: body.length > MOLMO_POINT_RESPONSE_LOG_MAX_CHARS,
  })

  const response = await fetch(endpoint, {
    method: 'POST',
    headers: { 'content-type': 'application/json' },
    body: JSON.stringify({
      instruction: args.instruction,
      image_b64: args.imageB64,
      max_new_tokens: MOLMO_POINT_MAX_NEW_TOKENS,
    }),
    signal: AbortSignal.timeout(MOLMO_POINT_TIMEOUT_MS),
  })
  const responseLog = {
    ...requestLog,
    status: response.status,
    statusText: response.statusText,
  }

  if (!response.ok) {
    // Reading the error body is best-effort; an unreadable body becomes ''.
    const body = await response.text().catch(() => '')
    logger.warn('Molmo point request failed', {
      ...responseLog,
      ...describeBody(body),
    })
    const suffix = body
      ? `: ${truncateText(body, MOLMO_POINT_ERROR_BODY_MAX_CHARS)}`
      : ''
    throw new Error(`Molmo point request failed (${response.status})${suffix}`)
  }

  const rawResponseText = await response.text()
  let payload: unknown
  try {
    payload = JSON.parse(rawResponseText)
  } catch (error) {
    logger.warn('Molmo point response parse failed', {
      ...responseLog,
      ...describeBody(rawResponseText),
      error: error instanceof Error ? error.message : String(error),
    })
    throw error
  }

  // Valid JSON is not necessarily an object (e.g. the literal `null`), so
  // narrow before reading `points` instead of dereferencing blindly.
  const points =
    typeof payload === 'object' && payload !== null
      ? (payload as MolmoPointResponse).points
      : undefined
  const point = firstValidPoint(points)
  if (!point) {
    throw new Error('Molmo point response did not include a valid point')
  }
  return point
}
/**
 * Reads the pixel dimensions from a base64-encoded PNG.
 *
 * Only the first 24 bytes are inspected: the 8-byte PNG signature followed by
 * the big-endian width and height stored at byte offsets 16 and 20.
 *
 * @param imageB64 - Base64-encoded image data.
 * @returns The width and height, or `null` when the data is shorter than 24
 *   bytes or does not start with the PNG signature.
 */
export function getPngDimensionsFromBase64(
  imageB64: string,
): PngDimensions | null {
  const bytes = Buffer.from(imageB64, 'base64')
  if (bytes.length < 24) return null

  const expectedSignature = Buffer.from('89504e470d0a1a0a', 'hex')
  const hasPngSignature = bytes.subarray(0, 8).equals(expectedSignature)
  if (!hasPngSignature) return null

  const width = bytes.readUInt32BE(16)
  const height = bytes.readUInt32BE(20)
  return { width, height }
}

View File

@@ -0,0 +1,5 @@
// Base URL of the Molmo point service; the client resolves `/point` against it.
export const MOLMO_POINT_ENDPOINT =
'https://browseros--molmopoint-gui-molmopointserver-web.modal.run/'
// Sent as `max_new_tokens` in each point request.
export const MOLMO_POINT_MAX_NEW_TOKENS = 64
// Abort timeout for a single point request, in milliseconds.
export const MOLMO_POINT_TIMEOUT_MS = 60_000

View File

@@ -31,6 +31,7 @@ import {
scroll,
select_option,
type_at,
type_text,
uncheck,
upload_file,
} from './input'
@@ -95,12 +96,13 @@ export const registry = createRegistry([
evaluate_script,
get_console_logs,
// Input (17)
// Input (18)
click,
click_at,
hover,
hover_at,
type_at,
type_text,
drag_at,
focus,
clear,

View File

@@ -25,6 +25,10 @@ interface ToolResponseOptions {
postActionTimeoutMs?: number
}
// Options accepted by ToolResponse.build.
interface ToolResponseBuildOptions {
// When true, post-actions of type 'snapshot' are skipped during build.
suppressSnapshots?: boolean
}
export class ToolResponse {
private content: ContentItem[] = []
private hasError = false
@@ -123,12 +127,19 @@ export class ToolResponse {
}
}
async build(browser: Browser): Promise<ToolResult> {
if (this.postActions.length > 0) {
async build(
browser: Browser,
options: ToolResponseBuildOptions = {},
): Promise<ToolResult> {
const postActions = options.suppressSnapshots
? this.postActions.filter((action) => action.type !== 'snapshot')
: this.postActions
if (postActions.length > 0) {
this.text('\n--- Additional context (auto-included) ---')
}
for (const action of this.postActions) {
for (const action of postActions) {
try {
await this.withTimeout(this.runPostAction(action, browser))
} catch {

View File

@@ -36,6 +36,7 @@ const VERB_OVERRIDES: Record<string, string> = {
hover: 'Hovered',
hover_at: 'Hovered at coordinates',
type_at: 'Typed at coordinates',
type_text: 'Typed text',
drag_at: 'Dragged',
focus: 'Focused element',
fill: 'Filled field',
@@ -186,8 +187,8 @@ const SUBJECT_EXTRACTORS: Record<string, SubjectExtractor> = {
find_files: (i) => quote(stringField(i, 'pattern', 'query')),
// Element interactions
click: (i) => stringField(i, 'element'),
hover: (i) => stringField(i, 'element'),
click: (i) => stringField(i, 'prompt'),
hover: (i) => stringField(i, 'prompt', 'element'),
focus: (i) => stringField(i, 'element'),
clear: (i) => stringField(i, 'element'),
check: (i) => stringField(i, 'element'),
@@ -199,6 +200,7 @@ const SUBJECT_EXTRACTORS: Record<string, SubjectExtractor> = {
return target ?? truncate(text, 40)
},
press_key: (i) => stringField(i, 'key'),
type_text: (i) => truncate(stringField(i, 'text'), 40),
// Coordinate-based input
click_at: (i) => coords(i.x, i.y),

View File

@@ -0,0 +1,30 @@
import { describe, expect, it } from 'bun:test'
import { isGuiClickOnlyBrowserToolAllowed } from '../../src/agent/gui-click-only'
describe('GUI click-only browser tool gating', () => {
  // Asserts that every listed tool name gets the same gating verdict.
  const expectGate = (toolNames: readonly string[], allowed: boolean): void => {
    for (const toolName of toolNames) {
      expect(isGuiClickOnlyBrowserToolAllowed(toolName)).toBe(allowed)
    }
  }

  it('keeps GUI click and basic page-opening tools available', () => {
    expectGate(
      [
        'click',
        'hover',
        'scroll',
        'type_text',
        'take_screenshot',
        'new_page',
        'navigate_page',
        'list_pages',
        'get_active_page',
        'close_page',
      ],
      true,
    )
  })

  it('blocks page observation and legacy interaction tools', () => {
    expectGate(
      [
        'take_snapshot',
        'take_enhanced_snapshot',
        'get_dom',
        'get_page_content',
        'click_at',
        'fill',
        'press_key',
        'type_at',
      ],
      false,
    )
  })
})

View File

@@ -92,6 +92,17 @@ function buildScheduled(overrides?: Partial<BuildSystemPromptOptions>): string {
})
}
/**
 * Build a system prompt with GUI click-only mode enabled.
 * Defaults to the sidepanel origin; `overrides` win over both defaults.
 */
function buildGuiClickOnly(
  overrides?: Partial<BuildSystemPromptOptions>,
): string {
  const options: BuildSystemPromptOptions = {
    guiClickOnly: true,
    origin: 'sidepanel',
    ...overrides,
  }
  return buildSystemPrompt(options)
}
// ---------------------------------------------------------------------------
// 1. SECTION PRESENCE
//
@@ -284,6 +295,23 @@ describe('mode-aware framing', () => {
expect(prompt).toContain('cannot interact with them')
})
it('GUI click-only mode exposes only GUI click and page-opening guidance', () => {
// The prompt must mention each tool that stays available in this mode...
const prompt = buildGuiClickOnly()
expect(prompt).toContain('experimental GUI click model mode')
expect(prompt).toContain('Use `click` for visible page targets')
expect(prompt).toContain('Use `hover` for hover menus')
expect(prompt).toContain('Use `type_text` only after')
expect(prompt).toContain('Use `scroll` to move the page viewport')
expect(prompt).toContain('Use `take_screenshot` sparingly')
expect(prompt).toContain('`new_page`')
expect(prompt).toContain('`navigate_page`')
expect(prompt).toContain('`close_page`')
// ...and must not mention snapshot/DOM tools, `press_key`, or the
// external integrations section.
expect(prompt).not.toContain('take_snapshot')
expect(prompt).not.toContain('get_dom')
expect(prompt).not.toContain('`press_key`')
expect(prompt).not.toContain('<external_integrations>')
})
it('chat mode excludes memory-and-identity section', () => {
// Why: chat mode is read-only — no memory writes, no soul updates.
// The agent shouldn't even see memory tool instructions.

View File

@@ -13,8 +13,13 @@ import {
scroll,
select_option,
type_at,
type_text,
uncheck,
} from '../../src/tools/input'
import {
type ClickPoint,
getPngDimensionsFromBase64,
} from '../../src/tools/molmo-point-client'
import { close_page, navigate_page, new_page } from '../../src/tools/navigation'
import { evaluate_script, take_snapshot } from '../../src/tools/snapshot'
import { cleanupWithBrowser, withBrowser } from '../__helpers__/with-browser'
@@ -121,6 +126,39 @@ async function pointInsideElement(
return { x: point.x, y: point.y }
}
/**
 * Runs `fn` with `globalThis.fetch` replaced by a stub that answers every
 * request with a point-model response targeting `viewportPoint`.
 *
 * The viewport point is scaled up to screenshot pixels (using the PNG size
 * over the viewport size, or the device pixel ratio when the PNG size is
 * unavailable) so the tool's own down-scaling lands back on the same spot.
 * The real `fetch` is restored even when `fn` throws.
 */
async function withMockedGuiPoint(
  browser: Browser,
  pageId: number,
  viewportPoint: ClickPoint,
  fn: () => Promise<void>,
): Promise<void> {
  const screenshot = await browser.screenshot(pageId, {
    format: 'png',
    fullPage: false,
  })
  const dimensions = getPngDimensionsFromBase64(screenshot.data)
  const viewport = await browser.viewportSize(pageId)

  let scaleX = screenshot.devicePixelRatio
  let scaleY = screenshot.devicePixelRatio
  if (dimensions) {
    scaleX = dimensions.width / viewport.width
    scaleY = dimensions.height / viewport.height
  }

  const mockedBody = JSON.stringify({
    points: [{ x: viewportPoint.x * scaleX, y: viewportPoint.y * scaleY }],
  })
  const stubFetch = (async () =>
    new Response(mockedBody, {
      status: 200,
      headers: { 'content-type': 'application/json' },
    })) as typeof fetch

  const realFetch = globalThis.fetch
  globalThis.fetch = stubFetch
  try {
    await fn()
  } finally {
    globalThis.fetch = realFetch
  }
}
const FORM_PAGE = `data:text/html,${encodeURIComponent(`<!DOCTYPE html>
<html><body>
<h1>Test Form</h1>
@@ -196,7 +234,7 @@ describe('input tools', () => {
}, 60_000)
it('click triggers a button', async () => {
await withBrowser(async ({ execute }) => {
await withBrowser(async ({ browser, execute }) => {
const newResult = await execute(new_page, { url: FORM_PAGE })
const pageId = pageIdOf(newResult)
@@ -206,18 +244,31 @@ describe('input tools', () => {
const inputId = findElementId(snapText, 'Enter name')
await execute(fill, { page: pageId, element: inputId, text: 'Alice' })
// Click submit
const btnId = findElementId(snapText, 'Submit')
const clickResult = await execute(click, {
page: pageId,
element: btnId,
})
assert.ok(!clickResult.isError, textOf(clickResult))
const clickData = structuredOf<{ action: string; element: number }>(
clickResult,
// Click submit via the GUI point model response.
const buttonPoint = await pointInsideElement(
{ browser, directories: { workingDir: process.cwd() } },
pageId,
'submit-btn',
)
assert.strictEqual(clickData.action, 'click')
assert.strictEqual(clickData.element, btnId)
await withMockedGuiPoint(browser, pageId, buttonPoint, async () => {
const clickResult = await execute(click, {
page: pageId,
prompt: 'click the Submit button',
})
assert.ok(!clickResult.isError, textOf(clickResult))
assert.match(
textOf(clickResult),
/The click was successful and hit the element: .*tagName="button".*textContent="Submit"/,
)
const clickData = structuredOf<{
action: string
prompt: string
hitElement: { tagName: string; textContent?: string } | null
}>(clickResult)
assert.strictEqual(clickData.action, 'click')
assert.strictEqual(clickData.prompt, 'click the Submit button')
assert.strictEqual(clickData.hitElement?.tagName, 'button')
})
const output = await execute(evaluate_script, {
page: pageId,
@@ -229,6 +280,61 @@ describe('input tools', () => {
})
}, 60_000)
it('click is blocked by ACL after the GUI point resolves', async () => {
// The ACL rule matches the text 'Submit'; the mocked model point is placed
// inside the submit button, so the click tool should report an ACL block.
await withBrowser(async ({ browser }) => {
const ctx: ToolContext = {
browser,
directories: { workingDir: process.cwd() },
aclRules: [
{
id: 'submit-rule',
sitePattern: '*',
textMatch: 'Submit',
enabled: true,
},
],
}
const newResult = await executeTool(
new_page,
{ url: FORM_PAGE },
ctx,
AbortSignal.timeout(30_000),
)
const pageId = pageIdOf(newResult)
const buttonPoint = await pointInsideElement(ctx, pageId, 'submit-btn')
await withMockedGuiPoint(browser, pageId, buttonPoint, async () => {
const clickResult = await executeTool(
click,
{ page: pageId, prompt: 'click the Submit button' },
ctx,
AbortSignal.timeout(30_000),
)
assert.ok(clickResult.isError, 'Expected ACL to block GUI click')
assert.ok(textOf(clickResult).includes('Action blocked by ACL rule'))
})
// An empty #output is taken as evidence the click never reached the page
// (presumably the form writes to #output on submit — FORM_PAGE body not shown here).
const output = await executeTool(
evaluate_script,
{
page: pageId,
expression: 'document.getElementById("output").textContent',
},
ctx,
AbortSignal.timeout(30_000),
)
assert.strictEqual(textOf(output), '')
await executeTool(
close_page,
{ page: pageId },
ctx,
AbortSignal.timeout(30_000),
)
})
}, 60_000)
it('check and uncheck toggle a checkbox', async () => {
await withBrowser(async ({ execute }) => {
const newResult = await execute(new_page, { url: FORM_PAGE })
@@ -397,6 +503,104 @@ describe('input tools', () => {
})
}, 60_000)
it('type_text types into the focused element', async () => {
await withBrowser(async ({ execute }) => {
const newResult = await execute(new_page, { url: FORM_PAGE })
const pageId = pageIdOf(newResult)
const snap = await execute(take_snapshot, { page: pageId })
const inputId = findElementId(textOf(snap), 'Enter name')
// Seed the field with 'hello'; the test relies on the field still being
// focused afterwards so type_text appends to it.
await execute(fill, { page: pageId, element: inputId, text: 'hello' })
const typeResult = await execute(type_text, {
page: pageId,
text: ' world',
})
assert.ok(!typeResult.isError, textOf(typeResult))
assert.strictEqual(textOf(typeResult), 'tool call executed successfully')
// Structured output carries the typed length, not the text.
assert.deepStrictEqual(structuredOf(typeResult), {
action: 'type_text',
page: pageId,
textLength: ' world'.length,
})
const val = await execute(evaluate_script, {
page: pageId,
expression: 'document.getElementById("name").value',
})
assert.strictEqual(textOf(val), 'hello world')
await execute(close_page, { page: pageId })
})
}, 60_000)
it('type_text is blocked by ACL on the focused element', async () => {
await withBrowser(async ({ browser }) => {
// Starts without ACL rules so the initial fill is allowed.
const ctx: ToolContext = {
browser,
directories: { workingDir: process.cwd() },
}
const newResult = await executeTool(
new_page,
{ url: FORM_PAGE },
ctx,
AbortSignal.timeout(30_000),
)
const pageId = pageIdOf(newResult)
const snap = await executeTool(
take_snapshot,
{ page: pageId },
ctx,
AbortSignal.timeout(30_000),
)
const inputId = findElementId(textOf(snap), 'Enter name')
await executeTool(
fill,
{ page: pageId, element: inputId, text: 'hello' },
ctx,
AbortSignal.timeout(30_000),
)
// Rules are added on the same context object after the fill, so the
// following type_text call is the first one subject to the ACL.
ctx.aclRules = [
{
id: 'name-rule',
sitePattern: '*',
textMatch: 'Enter name',
enabled: true,
},
]
const typeResult = await executeTool(
type_text,
{ page: pageId, text: ' blocked' },
ctx,
AbortSignal.timeout(30_000),
)
assert.ok(typeResult.isError, 'Expected ACL to block focused typing')
assert.ok(textOf(typeResult).includes('Action blocked by ACL rule'))
// The field must still hold only the pre-ACL value.
const val = await executeTool(
evaluate_script,
{
page: pageId,
expression: 'document.getElementById("name").value',
},
ctx,
AbortSignal.timeout(30_000),
)
assert.strictEqual(textOf(val), 'hello')
await executeTool(
close_page,
{ page: pageId },
ctx,
AbortSignal.timeout(30_000),
)
})
}, 60_000)
it('scroll dispatches without error', async () => {
const calls: Array<{
page: number
@@ -437,24 +641,62 @@ describe('input tools', () => {
page: 7,
direction: 'down',
amount: 5,
element: undefined,
})
})
it('hover moves cursor over element', async () => {
it('scroll moves the page viewport', async () => {
await withBrowser(async ({ execute }) => {
const newResult = await execute(new_page, { url: FORM_PAGE })
const pageId = pageIdOf(newResult)
const snap = await execute(take_snapshot, { page: pageId })
const btnId = findElementId(textOf(snap), 'Submit')
const hoverResult = await execute(hover, {
const result = await execute(scroll, {
page: pageId,
element: btnId,
direction: 'down',
amount: 5,
})
assert.ok(!result.isError, textOf(result))
assert.ok(textOf(result).includes('Scrolled down'))
const position = await execute(evaluate_script, {
page: pageId,
expression: 'window.scrollY',
})
const data = structuredOf<{ value?: unknown }>(position)
const value = data.value
assert.strictEqual(typeof value, 'number')
assert.ok(value > 0)
await execute(close_page, { page: pageId })
})
}, 60_000)
it('hover moves cursor via the GUI point model response', async () => {
await withBrowser(async ({ browser, execute }) => {
const newResult = await execute(new_page, { url: FORM_PAGE })
const pageId = pageIdOf(newResult)
const buttonPoint = await pointInsideElement(
{ browser, directories: { workingDir: process.cwd() } },
pageId,
'submit-btn',
)
await withMockedGuiPoint(browser, pageId, buttonPoint, async () => {
const hoverResult = await execute(hover, {
page: pageId,
prompt: 'hover the Submit button',
})
assert.ok(!hoverResult.isError, textOf(hoverResult))
assert.strictEqual(
textOf(hoverResult),
'tool call executed successfully',
)
const hoverData = structuredOf<{ action: string; prompt: string }>(
hoverResult,
)
assert.strictEqual(hoverData.action, 'hover')
assert.strictEqual(hoverData.prompt, 'hover the Submit button')
})
assert.ok(!hoverResult.isError, textOf(hoverResult))
assert.ok(textOf(hoverResult).includes('Hovered'))
await execute(close_page, { page: pageId })
})
@@ -467,7 +709,7 @@ describe('input tools', () => {
directories: { workingDir: process.cwd() },
}
const run =
(tool: typeof new_page | typeof take_snapshot | typeof click) =>
(tool: typeof new_page | typeof take_snapshot | typeof fill) =>
(args: unknown) =>
executeTool(tool, args, ctx, AbortSignal.timeout(30_000))
@@ -475,21 +717,29 @@ describe('input tools', () => {
const pageId = pageIdOf(newResult)
const snap = await run(take_snapshot)({ page: pageId })
const btnId = findElementId(textOf(snap), 'Submit')
const inputId = findElementId(textOf(snap), 'Enter name')
const beforeBlock = await run(click)({ page: pageId, element: btnId })
const beforeBlock = await run(fill)({
page: pageId,
element: inputId,
text: 'Allowed',
})
assert.ok(!beforeBlock.isError, textOf(beforeBlock))
ctx.aclRules = [
{
id: 'submit-rule',
id: 'name-rule',
sitePattern: '*',
textMatch: 'Submit',
textMatch: 'Enter name',
enabled: true,
},
]
const afterBlock = await run(click)({ page: pageId, element: btnId })
const afterBlock = await run(fill)({
page: pageId,
element: inputId,
text: 'Blocked',
})
assert.ok(afterBlock.isError, 'Expected ACL block after updating rules')
assert.ok(textOf(afterBlock).includes('Action blocked by ACL rule'))
@@ -598,19 +848,9 @@ describe('input tools', () => {
)
assert.ok(!navResult.isError, textOf(navResult))
const snap = await executeTool(
take_snapshot,
{ page: pageId },
ctx,
AbortSignal.timeout(30_000),
)
const linkMatch = textOf(snap).match(/\[(\d+)\]\s*link/)
assert.ok(linkMatch, `Expected a link in snapshot:\n${textOf(snap)}`)
const linkId = Number(linkMatch?.[1])
const blockedClick = await executeTool(
click,
{ page: pageId, element: linkId },
click_at,
{ page: pageId, x: 10, y: 10 },
ctx,
AbortSignal.timeout(30_000),
)

View File

@@ -73,4 +73,26 @@ describe('ToolResponse', () => {
assert.ok(text.includes('[Page 1 snapshot]'))
assert.ok(text.includes('[42] button "Submit"'))
})
it('suppresses snapshot post-actions when requested', async () => {
  // Stub browser that records whether a snapshot was ever requested.
  let snapshotRequested = false
  const stubBrowser = {
    async snapshot() {
      snapshotRequested = true
      return '[42] button "Submit"'
    },
  } as unknown as Browser

  // Queue a text payload plus a snapshot post-action for page 1.
  const toolResponse = new ToolResponse({ postActionTimeoutMs: 200 })
  toolResponse.text('ok')
  toolResponse.includeSnapshot(1)

  // Build with suppression on: the queued snapshot must not be taken.
  const built = await toolResponse.build(stubBrowser, {
    suppressSnapshots: true,
  })
  const output = textOf(built)

  assert.equal(snapshotRequested, false)
  // The plain text is still present, but no snapshot section is appended.
  assert.ok(output.includes('ok'))
  assert.ok(!output.includes('Additional context'))
  assert.ok(!output.includes('[Page 1 snapshot]'))
})
})

View File

@@ -68,6 +68,18 @@ export const LLMProviderSchema: z.ZodEnum<
export type LLMProvider = z.infer<typeof LLMProviderSchema>
/**
 * Optional provider-routing preferences that can be attached to an LLM
 * config. Every field may be omitted.
 *
 * NOTE(review): the field names look like camelCase counterparts of
 * OpenRouter's `provider` request options — confirm against the code that
 * builds the outgoing request.
 */
const OpenRouterProviderRoutingSchema = z.object({
  // String lists (presumably provider identifiers — TODO confirm).
  order: z.string().array().optional(),
  only: z.string().array().optional(),
  ignore: z.string().array().optional(),
  // Boolean toggles.
  allowFallbacks: z.boolean().optional(),
  requireParameters: z.boolean().optional(),
})

/** Static type inferred from `OpenRouterProviderRoutingSchema`. */
export type OpenRouterProviderRouting = z.infer<
  typeof OpenRouterProviderRoutingSchema
>
/**
* LLM configuration schema
* Used by SDK endpoints and agent configuration
@@ -84,6 +96,17 @@ export const LLMConfigSchema: z.ZodObject<{
sessionToken: z.ZodOptional<z.ZodString>
reasoningEffort: z.ZodOptional<z.ZodEnum<['none', 'low', 'medium', 'high']>>
reasoningSummary: z.ZodOptional<z.ZodEnum<['auto', 'concise', 'detailed']>>
reasoning: z.ZodOptional<
z.ZodObject<{
enabled: z.ZodOptional<z.ZodBoolean>
maxTokens: z.ZodOptional<z.ZodNumber>
effort: z.ZodOptional<
z.ZodEnum<['minimal', 'low', 'medium', 'high', 'xhigh']>
>
}>
>
verbosity: z.ZodOptional<z.ZodEnum<['low', 'medium', 'high', 'xhigh', 'max']>>
providerRouting: z.ZodOptional<typeof OpenRouterProviderRoutingSchema>
}> = z.object({
provider: LLMProviderSchema,
model: z.string().optional(),
@@ -99,6 +122,16 @@ export const LLMConfigSchema: z.ZodObject<{
// ChatGPT Pro (Codex)
reasoningEffort: z.enum(['none', 'low', 'medium', 'high']).optional(),
reasoningSummary: z.enum(['auto', 'concise', 'detailed']).optional(),
// Provider-specific reasoning controls.
reasoning: z
.object({
enabled: z.boolean().optional(),
maxTokens: z.number().optional(),
effort: z.enum(['minimal', 'low', 'medium', 'high', 'xhigh']).optional(),
})
.optional(),
verbosity: z.enum(['low', 'medium', 'high', 'xhigh', 'max']).optional(),
providerRouting: OpenRouterProviderRoutingSchema.optional(),
})
/** Static type of an LLM configuration, inferred from `LLMConfigSchema`. */
export type LLMConfig = z.infer<typeof LLMConfigSchema>