mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-21 12:55:09 +00:00
chore(eval): refresh test-clado-api script for new Clado contract
Updated the local smoke test to match the new Clado endpoint and response contract:
- New action and health URLs (000159-merged checkpoint).
- Dropped the grounding-model branch (orchestrator-executor doesn't use it; the README David shared documents only the action model).
- The health check waits up to 6 minutes for a cold start and prints a warning after 30 seconds so the operator knows the model is spinning up.
- Prints every documented response field (action, x/y, text, key, direction, amount, drag start/end, time, final_answer, thinking, parse_error, inference_time_seconds).
- Three-step run that exercises a click, a typing continuation with formatted history, and an end + final_answer probe.
This commit is contained in:
@@ -1,34 +1,73 @@
|
||||
/**
|
||||
* Test script for Clado API endpoints (grounding + action models)
|
||||
* Smoke-test for the Clado BrowserOS Action endpoint.
|
||||
*
|
||||
* Health-checks the model, then runs a generate call and prints every
|
||||
* field the new contract documents (action, coordinates, text, key,
|
||||
* direction, scroll/drag fields, wait, end+final_answer, thinking,
|
||||
* parse_error, inference_time_seconds); raw_response is not printed.
|
||||
*
|
||||
* Usage:
|
||||
* bun apps/eval/scripts/test-clado-api.ts [screenshot-path]
|
||||
*
|
||||
* If no screenshot provided, captures one from a running BrowserOS server.
|
||||
* If no screenshot path is given, captures one over MCP from a
|
||||
* running BrowserOS server (default http://127.0.0.1:9110, override
|
||||
* with BROWSEROS_URL).
|
||||
*
|
||||
* Cold start can take ~5 minutes; the script waits up to 6.
|
||||
*/
|
||||
|
||||
import { readFile } from 'node:fs/promises'
|
||||
import { resolve } from 'node:path'
|
||||
|
||||
const ACTION_URL =
|
||||
'https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run'
|
||||
'https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run'
|
||||
const ACTION_HEALTH_URL =
|
||||
'https://clado-ai--clado-browseros-action-actionmodel-health.modal.run'
|
||||
const GROUNDING_URL =
|
||||
'https://clado-ai--clado-browseros-grounding-groundingmodel-generate.modal.run'
|
||||
const GROUNDING_HEALTH_URL =
|
||||
'https://clado-ai--clado-browseros-grounding-groundingmodel-health.modal.run'
|
||||
'https://clado-ai--clado-browseros-action-000159-merged-actionmod-5e5033.modal.run'
|
||||
|
||||
async function checkHealth(name: string, url: string): Promise<boolean> {
|
||||
console.log(`\n--- ${name} health check ---`)
|
||||
console.log(` URL: ${url}`)
|
||||
/** Upper bound, in milliseconds, for a single request: 6 min — Clado cold start is ~5 min. */
const COLD_START_BUDGET_MS = 360_000
/** Delay, in milliseconds, before the health check prints its "still waiting" notice. */
const COLD_START_WARN_MS = 30_000
|
||||
|
||||
/**
 * Shape of the JSON body this script reads from the Clado action endpoint.
 *
 * Every field is optional: the script checks each one before printing it,
 * so a response may carry any subset of them.
 */
interface CladoResponse {
  /** Name of the chosen action (e.g. `click`, `type`, `end`). */
  action?: string | null
  /** Free-form reasoning text returned alongside the action. */
  thinking?: string | null
  /** Raw model output string. */
  raw_response?: string
  /** Parser error message, printed by the script when set. */
  parse_error?: string | null
  /** Inference duration reported by the endpoint. */
  inference_time_seconds?: number
  /** Pointer coordinates for click/hover style actions. */
  x?: number
  y?: number
  /** Text payload for a `type` action. */
  text?: string
  /** Key name for a `press_key` action. */
  key?: string
  /** Direction for a `scroll` action. */
  direction?: string
  /** Numeric amount; presumably the scroll distance — confirm against the contract. */
  amount?: number
  /** Start and end coordinates for a `drag` action. */
  startX?: number
  startY?: number
  endX?: number
  endY?: number
  /** Duration for a `wait` action; the script prints it with an `s` suffix. */
  time?: number
  /** Final answer text, exercised by the script's `end` probe. */
  final_answer?: string | null
}
|
||||
|
||||
async function checkHealth(): Promise<boolean> {
|
||||
console.log(`\n--- Action model health ---`)
|
||||
console.log(` URL: ${ACTION_HEALTH_URL}`)
|
||||
console.log(
|
||||
` Note: cold start can take ~5 min; waiting up to ${COLD_START_BUDGET_MS / 1000}s.`,
|
||||
)
|
||||
const start = performance.now()
|
||||
const warn = setTimeout(() => {
|
||||
console.log(
|
||||
` ...still waiting (${COLD_START_WARN_MS / 1000}s in) — model is likely cold-starting on Modal.`,
|
||||
)
|
||||
}, COLD_START_WARN_MS)
|
||||
|
||||
try {
|
||||
const resp = await fetch(url, { signal: AbortSignal.timeout(30_000) })
|
||||
const resp = await fetch(ACTION_HEALTH_URL, {
|
||||
signal: AbortSignal.timeout(COLD_START_BUDGET_MS),
|
||||
})
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
const body = await resp.text()
|
||||
console.log(` Status: ${resp.status} (${elapsed}s)`)
|
||||
console.log(` Body: ${body.slice(0, 200)}`)
|
||||
console.log(` Body: ${body.slice(0, 400)}`)
|
||||
return resp.ok
|
||||
} catch (err) {
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
@@ -36,63 +75,34 @@ async function checkHealth(name: string, url: string): Promise<boolean> {
|
||||
` FAILED (${elapsed}s): ${err instanceof Error ? err.message : err}`,
|
||||
)
|
||||
return false
|
||||
} finally {
|
||||
clearTimeout(warn)
|
||||
}
|
||||
}
|
||||
|
||||
async function testGenerate(
|
||||
name: string,
|
||||
url: string,
|
||||
async function generate(
|
||||
label: string,
|
||||
payload: Record<string, unknown>,
|
||||
): Promise<Record<string, unknown> | null> {
|
||||
console.log(`\n--- ${name} generate ---`)
|
||||
console.log(` URL: ${url}`)
|
||||
): Promise<CladoResponse | null> {
|
||||
console.log(`\n--- ${label} ---`)
|
||||
console.log(` URL: ${ACTION_URL}`)
|
||||
console.log(` Instruction: ${payload.instruction}`)
|
||||
console.log(
|
||||
` Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`,
|
||||
` Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`,
|
||||
)
|
||||
if (payload.history) console.log(` History: ${payload.history}`)
|
||||
if (payload.history && payload.history !== 'None') {
|
||||
console.log(` History: ${payload.history}`)
|
||||
}
|
||||
|
||||
const start = performance.now()
|
||||
let resp: Response
|
||||
try {
|
||||
const resp = await fetch(url, {
|
||||
resp = await fetch(ACTION_URL, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify(payload),
|
||||
signal: AbortSignal.timeout(120_000),
|
||||
signal: AbortSignal.timeout(COLD_START_BUDGET_MS),
|
||||
})
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
|
||||
if (!resp.ok) {
|
||||
const body = await resp.text()
|
||||
console.log(` FAILED: HTTP ${resp.status} (${elapsed}s)`)
|
||||
console.log(` Body: ${body.slice(0, 400)}`)
|
||||
return null
|
||||
}
|
||||
|
||||
const result = (await resp.json()) as Record<string, unknown>
|
||||
console.log(` Status: ${resp.status} (${elapsed}s)`)
|
||||
console.log(` Action: ${result.action}`)
|
||||
if (result.x !== null && result.x !== undefined)
|
||||
console.log(` Coordinates: (${result.x}, ${result.y})`)
|
||||
if (result.text)
|
||||
console.log(` Text: ${(result.text as string).slice(0, 100)}`)
|
||||
if (result.key) console.log(` Key: ${result.key}`)
|
||||
if (result.inference_time_seconds)
|
||||
console.log(` Inference: ${result.inference_time_seconds}s`)
|
||||
|
||||
// Show thinking if present
|
||||
const raw = result.raw_response as string | undefined
|
||||
if (raw) {
|
||||
const thinkMatch = raw.match(/<thinking>([\s\S]*?)<\/thinking>/)
|
||||
if (thinkMatch) {
|
||||
const thinking = thinkMatch[1].trim()
|
||||
console.log(
|
||||
` Thinking: ${thinking.slice(0, 200)}${thinking.length > 200 ? '...' : ''}`,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
} catch (err) {
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
console.log(
|
||||
@@ -100,6 +110,50 @@ async function testGenerate(
|
||||
)
|
||||
return null
|
||||
}
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
|
||||
if (!resp.ok) {
|
||||
const body = await resp.text()
|
||||
console.log(` HTTP ${resp.status} ${resp.statusText} (${elapsed}s)`)
|
||||
console.log(` Body: ${body.slice(0, 400)}`)
|
||||
return null
|
||||
}
|
||||
|
||||
const result = (await resp.json()) as CladoResponse
|
||||
console.log(` HTTP ${resp.status} (${elapsed}s)`)
|
||||
console.log(` action: ${result.action ?? 'null'}`)
|
||||
if (result.parse_error) {
|
||||
console.log(` parse_error: ${result.parse_error}`)
|
||||
}
|
||||
if (result.thinking) {
|
||||
const trimmed = result.thinking.replace(/\s+/g, ' ').trim()
|
||||
console.log(
|
||||
` thinking: ${trimmed.slice(0, 240)}${trimmed.length > 240 ? '…' : ''}`,
|
||||
)
|
||||
}
|
||||
if (typeof result.x === 'number' || typeof result.y === 'number') {
|
||||
console.log(` x, y: ${result.x}, ${result.y}`)
|
||||
}
|
||||
if (typeof result.text === 'string')
|
||||
console.log(` text: ${result.text.slice(0, 120)}`)
|
||||
if (typeof result.key === 'string')
|
||||
console.log(` key: ${result.key}`)
|
||||
if (typeof result.direction === 'string')
|
||||
console.log(` direction: ${result.direction}`)
|
||||
if (typeof result.amount === 'number')
|
||||
console.log(` amount: ${result.amount}`)
|
||||
if (typeof result.startX === 'number' || typeof result.endX === 'number') {
|
||||
console.log(
|
||||
` drag: (${result.startX}, ${result.startY}) → (${result.endX}, ${result.endY})`,
|
||||
)
|
||||
}
|
||||
if (typeof result.time === 'number')
|
||||
console.log(` time: ${result.time}s`)
|
||||
if (result.final_answer)
|
||||
console.log(` final_answer: ${result.final_answer.slice(0, 240)}`)
|
||||
if (typeof result.inference_time_seconds === 'number')
|
||||
console.log(` inference_time_seconds: ${result.inference_time_seconds}`)
|
||||
return result
|
||||
}
|
||||
|
||||
async function loadScreenshot(path?: string): Promise<string> {
|
||||
@@ -110,10 +164,9 @@ async function loadScreenshot(path?: string): Promise<string> {
|
||||
return data.toString('base64')
|
||||
}
|
||||
|
||||
// Try to capture from a running BrowserOS server
|
||||
const serverUrl = process.env.BROWSEROS_URL || 'http://127.0.0.1:9110'
|
||||
console.log(
|
||||
`No screenshot path provided. Trying to capture from ${serverUrl}...`,
|
||||
`No screenshot path provided. Capturing from ${serverUrl} via MCP...`,
|
||||
)
|
||||
|
||||
const { Client } = await import('@modelcontextprotocol/sdk/client/index.js')
|
||||
@@ -134,82 +187,101 @@ async function loadScreenshot(path?: string): Promise<string> {
|
||||
arguments: { format: 'png', page: 1 },
|
||||
})) as { content: Array<{ type: string; data?: string }> }
|
||||
|
||||
const imageContent = result.content?.find((c) => c.type === 'image')
|
||||
if (!imageContent?.data)
|
||||
throw new Error('No image data in screenshot response')
|
||||
const image = result.content?.find((c) => c.type === 'image')
|
||||
if (!image?.data)
|
||||
throw new Error('No image data in take_screenshot response')
|
||||
|
||||
console.log(
|
||||
`Captured screenshot (${(imageContent.data.length / 1024).toFixed(0)} KB base64)`,
|
||||
`Captured screenshot (${(image.data.length / 1024).toFixed(0)} KB base64)`,
|
||||
)
|
||||
return imageContent.data
|
||||
return image.data
|
||||
} finally {
|
||||
try {
|
||||
await transport.close()
|
||||
} catch {}
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Renders previously returned actions as a single history string.
 *
 * Each response becomes one call-like token (for example `click(10, 20)`
 * or `type("hello")`), and the tokens are joined with ` -> `.
 *
 * @param history - Responses to summarize, in the order they were received.
 * @returns The joined history string, or the literal `'None'` when
 *   `history` is empty.
 */
function summarize(history: CladoResponse[]): string {
  if (history.length === 0) return 'None'
  return history
    .map((h) => {
      switch (h.action) {
        // Pointer actions share one format: the action name plus coordinates.
        case 'click':
        case 'double_click':
        case 'right_click':
        case 'hover':
          return `${h.action}(${h.x}, ${h.y})`
        // Text and key names are JSON-quoted so embedded quotes are escaped.
        case 'type':
          return `type(${JSON.stringify(h.text ?? '')})`
        case 'press_key':
          return `press_key(${JSON.stringify(h.key ?? '')})`
        // A scroll without a direction is rendered as scrolling down.
        case 'scroll':
          return `scroll(${h.direction ?? 'down'})`
        case 'drag':
          return `drag(${h.startX},${h.startY} -> ${h.endX},${h.endY})`
        // A wait without a duration is rendered as one second.
        case 'wait':
          return `wait(${h.time ?? 1}s)`
        case 'end':
          return 'end()'
        // Unrecognized actions pass through by name; a missing action
        // becomes the placeholder 'invalid'.
        default:
          return h.action ?? 'invalid'
      }
    })
    .join(' -> ')
}
|
||||
|
||||
async function main() {
|
||||
const screenshotPath = process.argv[2]
|
||||
console.log('=== Clado action endpoint smoke test ===')
|
||||
|
||||
console.log('=== Clado API Test ===\n')
|
||||
|
||||
// Health checks (parallel)
|
||||
const [actionHealthy, groundingHealthy] = await Promise.all([
|
||||
checkHealth('Action Model', ACTION_HEALTH_URL),
|
||||
checkHealth('Grounding Model', GROUNDING_HEALTH_URL),
|
||||
])
|
||||
|
||||
if (!actionHealthy && !groundingHealthy) {
|
||||
console.log('\nBoth endpoints are down. Exiting.')
|
||||
const healthy = await checkHealth()
|
||||
if (!healthy) {
|
||||
console.log('\nHealth check failed. Exiting.')
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
// Load screenshot
|
||||
let imageBase64: string
|
||||
try {
|
||||
imageBase64 = await loadScreenshot(screenshotPath)
|
||||
imageBase64 = await loadScreenshot(process.argv[2])
|
||||
} catch (err) {
|
||||
console.log(
|
||||
`\nFailed to load screenshot: ${err instanceof Error ? err.message : err}`,
|
||||
)
|
||||
console.log(
|
||||
'Provide a screenshot path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
|
||||
'Pass a path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
|
||||
)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
const instruction = 'Click on the search button or search bar'
|
||||
const history: CladoResponse[] = []
|
||||
|
||||
// Test grounding model
|
||||
if (groundingHealthy) {
|
||||
await testGenerate('Grounding Model', GROUNDING_URL, {
|
||||
instruction,
|
||||
// Step 1: open task — let the model decide what to do.
|
||||
const step1 = await generate('Step 1: cold task', {
|
||||
instruction: 'Find the search bar and click it',
|
||||
image_base64: imageBase64,
|
||||
history: 'None',
|
||||
})
|
||||
if (step1?.action) history.push(step1)
|
||||
|
||||
// Step 2: continuation with history, asks for typing.
|
||||
if (step1?.action) {
|
||||
const step2 = await generate('Step 2: with history', {
|
||||
instruction: 'Type "hello world" into the search bar',
|
||||
image_base64: imageBase64,
|
||||
history: summarize(history),
|
||||
})
|
||||
} else {
|
||||
console.log('\nSkipping grounding model (unhealthy)')
|
||||
if (step2?.action) history.push(step2)
|
||||
}
|
||||
|
||||
// Test action model (no history)
|
||||
if (actionHealthy) {
|
||||
const result = await testGenerate('Action Model (step 1)', ACTION_URL, {
|
||||
instruction,
|
||||
image_base64: imageBase64,
|
||||
history: 'None',
|
||||
})
|
||||
|
||||
// Test action model with history (simulate multi-turn)
|
||||
if (result && result.action === 'click') {
|
||||
await testGenerate('Action Model (step 2, with history)', ACTION_URL, {
|
||||
instruction: 'Type "hello world" in the search bar',
|
||||
image_base64: imageBase64,
|
||||
history: `click(${result.x}, ${result.y})`,
|
||||
})
|
||||
}
|
||||
} else {
|
||||
console.log('\nSkipping action model (unhealthy)')
|
||||
}
|
||||
// Step 3: ask for end with a final answer to exercise that field.
|
||||
await generate('Step 3: ask for end+final_answer', {
|
||||
instruction:
|
||||
'You have completed the task. Reply with end() and final_answer="done".',
|
||||
image_base64: imageBase64,
|
||||
history: summarize(history),
|
||||
})
|
||||
|
||||
console.log('\n=== Done ===')
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user