diff --git a/packages/browseros-agent/apps/eval/scripts/test-clado-api.ts b/packages/browseros-agent/apps/eval/scripts/test-clado-api.ts index 9c3586a15..7c6cd4faf 100644 --- a/packages/browseros-agent/apps/eval/scripts/test-clado-api.ts +++ b/packages/browseros-agent/apps/eval/scripts/test-clado-api.ts @@ -1,34 +1,73 @@ /** - * Test script for Clado API endpoints (grounding + action models) + * Smoke-test for the Clado BrowserOS Action endpoint. + * + * Health-checks the model, then runs a generate call and prints every + * field the new contract documents (action, coordinates, text, key, + * direction, scroll/drag fields, wait, end+final_answer, thinking, + * parse_error, raw_response). * * Usage: * bun apps/eval/scripts/test-clado-api.ts [screenshot-path] * - * If no screenshot provided, captures one from a running BrowserOS server. + * If no screenshot path is given, captures one over MCP from a + * running BrowserOS server (default http://127.0.0.1:9110, override + * with BROWSEROS_URL). + * + * Cold start can take ~5 minutes; the script waits up to 6. */ import { readFile } from 'node:fs/promises' import { resolve } from 'node:path' const ACTION_URL = - 'https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run' + 'https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run' const ACTION_HEALTH_URL = - 'https://clado-ai--clado-browseros-action-actionmodel-health.modal.run' -const GROUNDING_URL = - 'https://clado-ai--clado-browseros-grounding-groundingmodel-generate.modal.run' -const GROUNDING_HEALTH_URL = - 'https://clado-ai--clado-browseros-grounding-groundingmodel-health.modal.run' + 'https://clado-ai--clado-browseros-action-000159-merged-actionmod-5e5033.modal.run' -async function checkHealth(name: string, url: string): Promise { - console.log(`\n--- ${name} health check ---`) - console.log(` URL: ${url}`) +const COLD_START_BUDGET_MS = 360_000 // 6 min — Clado cold start is ~5 min +const COLD_START_WARN_MS = 30_000 + +interface CladoResponse { + action?: string | null + thinking?: string | null + raw_response?: string + parse_error?: string | null + inference_time_seconds?: number + x?: number + y?: number + text?: string + key?: string + direction?: string + amount?: number + startX?: number + startY?: number + endX?: number + endY?: number + time?: number + final_answer?: string | null +} + +async function checkHealth(): Promise { + console.log(`\n--- Action model health ---`) + console.log(` URL: ${ACTION_HEALTH_URL}`) + console.log( + ` Note: cold start can take ~5 min; waiting up to ${COLD_START_BUDGET_MS / 1000}s.`, + ) const start = performance.now() + const warn = setTimeout(() => { + console.log( + ` ...still waiting (${COLD_START_WARN_MS / 1000}s in) — model is likely cold-starting on Modal.`, + ) + }, COLD_START_WARN_MS) + try { - const resp = await fetch(url, { signal: AbortSignal.timeout(30_000) }) + const resp = await fetch(ACTION_HEALTH_URL, { + signal: AbortSignal.timeout(COLD_START_BUDGET_MS), + }) const elapsed = ((performance.now() - start) / 1000).toFixed(2) const body = await resp.text() console.log(` Status: ${resp.status} (${elapsed}s)`) - console.log(` Body: ${body.slice(0, 200)}`) + console.log(` Body: ${body.slice(0, 400)}`) return resp.ok } catch (err) { const elapsed = ((performance.now() - start) / 1000).toFixed(2) @@ -36,63 +75,34 @@ async function checkHealth(name: string, url: string): Promise { ` FAILED (${elapsed}s): ${err instanceof Error ? err.message : err}`, ) return false + } finally { + clearTimeout(warn) } } -async function testGenerate( - name: string, - url: string, +async function generate( + label: string, payload: Record, -): Promise | null> { - console.log(`\n--- ${name} generate ---`) - console.log(` URL: ${url}`) +): Promise { + console.log(`\n--- ${label} ---`) + console.log(` URL: ${ACTION_URL}`) console.log(` Instruction: ${payload.instruction}`) console.log( - ` Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`, + ` Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`, ) - if (payload.history) console.log(` History: ${payload.history}`) + if (payload.history && payload.history !== 'None') { + console.log(` History: ${payload.history}`) + } const start = performance.now() + let resp: Response try { - const resp = await fetch(url, { + resp = await fetch(ACTION_URL, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(payload), - signal: AbortSignal.timeout(120_000), + signal: AbortSignal.timeout(COLD_START_BUDGET_MS), }) - const elapsed = ((performance.now() - start) / 1000).toFixed(2) - - if (!resp.ok) { - const body = await resp.text() - console.log(` FAILED: HTTP ${resp.status} (${elapsed}s)`) - console.log(` Body: ${body.slice(0, 400)}`) - return null - } - - const result = (await resp.json()) as Record - console.log(` Status: ${resp.status} (${elapsed}s)`) - console.log(` Action: ${result.action}`) - if (result.x !== null && result.x !== undefined) - console.log(` Coordinates: (${result.x}, ${result.y})`) - if (result.text) - console.log(` Text: ${(result.text as string).slice(0, 100)}`) - if (result.key) console.log(` Key: ${result.key}`) - if (result.inference_time_seconds) - console.log(` Inference: ${result.inference_time_seconds}s`) - - // Show thinking if present - const raw = result.raw_response as string | undefined - if (raw) { - const thinkMatch = raw.match(/([\s\S]*?)<\/thinking>/) - if (thinkMatch) { - const thinking = thinkMatch[1].trim() - console.log( - ` Thinking: ${thinking.slice(0, 200)}${thinking.length > 200 ? '...' : ''}`, - ) - } - } - - return result } catch (err) { const elapsed = ((performance.now() - start) / 1000).toFixed(2) console.log( @@ -100,6 +110,50 @@ async function testGenerate( ) return null } + const elapsed = ((performance.now() - start) / 1000).toFixed(2) + + if (!resp.ok) { + const body = await resp.text() + console.log(` HTTP ${resp.status} ${resp.statusText} (${elapsed}s)`) + console.log(` Body: ${body.slice(0, 400)}`) + return null + } + + const result = (await resp.json()) as CladoResponse + console.log(` HTTP ${resp.status} (${elapsed}s)`) + console.log(` action: ${result.action ?? 'null'}`) + if (result.parse_error) { + console.log(` parse_error: ${result.parse_error}`) + } + if (result.thinking) { + const trimmed = result.thinking.replace(/\s+/g, ' ').trim() + console.log( + ` thinking: ${trimmed.slice(0, 240)}${trimmed.length > 240 ? '…' : ''}`, + ) + } + if (typeof result.x === 'number' || typeof result.y === 'number') { + console.log(` x, y: ${result.x}, ${result.y}`) + } + if (typeof result.text === 'string') + console.log(` text: ${result.text.slice(0, 120)}`) + if (typeof result.key === 'string') + console.log(` key: ${result.key}`) + if (typeof result.direction === 'string') + console.log(` direction: ${result.direction}`) + if (typeof result.amount === 'number') + console.log(` amount: ${result.amount}`) + if (typeof result.startX === 'number' || typeof result.endX === 'number') { + console.log( + ` drag: (${result.startX}, ${result.startY}) → (${result.endX}, ${result.endY})`, + ) + } + if (typeof result.time === 'number') + console.log(` time: ${result.time}s`) + if (result.final_answer) + console.log(` final_answer: ${result.final_answer.slice(0, 240)}`) + if (typeof result.inference_time_seconds === 'number') + console.log(` inference_time_seconds: ${result.inference_time_seconds}`) + return result } async function loadScreenshot(path?: string): Promise { @@ -110,10 +164,9 @@ async function loadScreenshot(path?: string): Promise { return data.toString('base64') } - // Try to capture from a running BrowserOS server const serverUrl = process.env.BROWSEROS_URL || 'http://127.0.0.1:9110' console.log( - `No screenshot path provided. Trying to capture from ${serverUrl}...`, + `No screenshot path provided. Capturing from ${serverUrl} via MCP...`, ) const { Client } = await import('@modelcontextprotocol/sdk/client/index.js') @@ -134,82 +187,101 @@ async function loadScreenshot(path?: string): Promise { arguments: { format: 'png', page: 1 }, })) as { content: Array<{ type: string; data?: string }> } - const imageContent = result.content?.find((c) => c.type === 'image') - if (!imageContent?.data) - throw new Error('No image data in screenshot response') + const image = result.content?.find((c) => c.type === 'image') + if (!image?.data) + throw new Error('No image data in take_screenshot response') console.log( - `Captured screenshot (${(imageContent.data.length / 1024).toFixed(0)} KB base64)`, + `Captured screenshot (${(image.data.length / 1024).toFixed(0)} KB base64)`, ) - return imageContent.data + return image.data } finally { try { await transport.close() - } catch {} + } catch { + /* ignore */ + } } } +function summarize(history: CladoResponse[]): string { + if (history.length === 0) return 'None' + return history + .map((h) => { + switch (h.action) { + case 'click': + case 'double_click': + case 'right_click': + case 'hover': + return `${h.action}(${h.x}, ${h.y})` + case 'type': + return `type(${JSON.stringify(h.text ?? '')})` + case 'press_key': + return `press_key(${JSON.stringify(h.key ?? '')})` + case 'scroll': + return `scroll(${h.direction ?? 'down'})` + case 'drag': + return `drag(${h.startX},${h.startY} -> ${h.endX},${h.endY})` + case 'wait': + return `wait(${h.time ?? 1}s)` + case 'end': + return 'end()' + default: + return h.action ?? 'invalid' + } + }) + .join(' -> ') +} + async function main() { - const screenshotPath = process.argv[2] + console.log('=== Clado action endpoint smoke test ===') - console.log('=== Clado API Test ===\n') - - // Health checks (parallel) - const [actionHealthy, groundingHealthy] = await Promise.all([ - checkHealth('Action Model', ACTION_HEALTH_URL), - checkHealth('Grounding Model', GROUNDING_HEALTH_URL), - ]) - - if (!actionHealthy && !groundingHealthy) { - console.log('\nBoth endpoints are down. Exiting.') + const healthy = await checkHealth() + if (!healthy) { + console.log('\nHealth check failed. Exiting.') process.exit(1) } - // Load screenshot let imageBase64: string try { - imageBase64 = await loadScreenshot(screenshotPath) + imageBase64 = await loadScreenshot(process.argv[2]) } catch (err) { console.log( `\nFailed to load screenshot: ${err instanceof Error ? err.message : err}`, ) console.log( - 'Provide a screenshot path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png', + 'Pass a path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png', ) process.exit(1) } - const instruction = 'Click on the search button or search bar' + const history: CladoResponse[] = [] - // Test grounding model - if (groundingHealthy) { - await testGenerate('Grounding Model', GROUNDING_URL, { - instruction, + // Step 1: open task — let the model decide what to do. + const step1 = await generate('Step 1: cold task', { + instruction: 'Find the search bar and click it', + image_base64: imageBase64, + history: 'None', + }) + if (step1?.action) history.push(step1) + + // Step 2: continuation with history, asks for typing. + if (step1?.action) { + const step2 = await generate('Step 2: with history', { + instruction: 'Type "hello world" into the search bar', image_base64: imageBase64, + history: summarize(history), }) - } else { - console.log('\nSkipping grounding model (unhealthy)') + if (step2?.action) history.push(step2) } - // Test action model (no history) - if (actionHealthy) { - const result = await testGenerate('Action Model (step 1)', ACTION_URL, { - instruction, - image_base64: imageBase64, - history: 'None', - }) - - // Test action model with history (simulate multi-turn) - if (result && result.action === 'click') { - await testGenerate('Action Model (step 2, with history)', ACTION_URL, { - instruction: 'Type "hello world" in the search bar', - image_base64: imageBase64, - history: `click(${result.x}, ${result.y})`, - }) - } - } else { - console.log('\nSkipping action model (unhealthy)') - } + // Step 3: ask for end with a final answer to exercise that field. + await generate('Step 3: ask for end+final_answer', { + instruction: + 'You have completed the task. Reply with end() and final_answer="done".', + image_base64: imageBase64, + history: summarize(history), + }) console.log('\n=== Done ===') }