chore(eval): refresh test-clado-api script for new Clado contract

Updated the local smoke test to match the new Clado endpoint and response contract:

- New action + health URLs (000159-merged checkpoint).
- Drop the grounding-model branch (orchestrator-executor doesn't use it; the README David shared only documents the action model).
- Health check waits up to 6 minutes for cold start, with a warning after 30s so the operator knows the model is spinning up.
- Print every documented response field (action, x/y, text, key, direction, amount, drag start/end, time, final_answer, thinking, parse_error, inference_time_seconds).
- Three-step run that exercises a click, a typing continuation with formatted history, and an end+final_answer probe.
2026-05-21 12:55:09 +00:00 · 2026-04-29 16:28:29 +05:30
parent 34fdf08521
commit 72cbffe2bb
1 changed files with 176 additions and 104 deletions
--- a/packages/browseros-agent/apps/eval/scripts/test-clado-api.ts
+++ b/packages/browseros-agent/apps/eval/scripts/test-clado-api.ts
@@ -1,34 +1,73 @@
 /**
- * Test script for Clado API endpoints (grounding + action models)
+ * Smoke-test for the Clado BrowserOS Action endpoint.
+ *
+ * Health-checks the model, then runs up to three generate calls and
+ * prints the documented response fields (action, coordinates, text,
+ * key, direction, scroll/drag fields, wait, end+final_answer, thinking,
+ * parse_error, inference_time_seconds); raw_response is not printed.
 *
 * Usage:
 *   bun apps/eval/scripts/test-clado-api.ts [screenshot-path]
 *
- * If no screenshot provided, captures one from a running BrowserOS server.
+ * If no screenshot path is given, captures one over MCP from a
+ * running BrowserOS server (default http://127.0.0.1:9110, override
+ * with BROWSEROS_URL).
+ *
+ * Cold start can take ~5 minutes; the script waits up to 6.
 */

 import { readFile } from 'node:fs/promises'
 import { resolve } from 'node:path'

 const ACTION_URL =
-  'https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run'
+  'https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run'
 const ACTION_HEALTH_URL =
-  'https://clado-ai--clado-browseros-action-actionmodel-health.modal.run'
-const GROUNDING_URL =
-  'https://clado-ai--clado-browseros-grounding-groundingmodel-generate.modal.run'
-const GROUNDING_HEALTH_URL =
-  'https://clado-ai--clado-browseros-grounding-groundingmodel-health.modal.run'
+  'https://clado-ai--clado-browseros-action-000159-merged-actionmod-5e5033.modal.run'

-async function checkHealth(name: string, url: string): Promise<boolean> {
-  console.log(`\n--- ${name} health check ---`)
-  console.log(`  URL: ${url}`)
+const COLD_START_BUDGET_MS = 360_000 // 6 min — Clado cold start is ~5 min
+const COLD_START_WARN_MS = 30_000
+
+interface CladoResponse {
+  action?: string | null
+  thinking?: string | null
+  raw_response?: string
+  parse_error?: string | null
+  inference_time_seconds?: number
+  x?: number
+  y?: number
+  text?: string
+  key?: string
+  direction?: string
+  amount?: number
+  startX?: number
+  startY?: number
+  endX?: number
+  endY?: number
+  time?: number
+  final_answer?: string | null
+}
+
+async function checkHealth(): Promise<boolean> {
+  console.log(`\n--- Action model health ---`)
+  console.log(`  URL:   ${ACTION_HEALTH_URL}`)
+  console.log(
+    `  Note:  cold start can take ~5 min; waiting up to ${COLD_START_BUDGET_MS / 1000}s.`,
+  )
  const start = performance.now()
+  const warn = setTimeout(() => {
+    console.log(
+      `  ...still waiting (${COLD_START_WARN_MS / 1000}s in) — model is likely cold-starting on Modal.`,
+    )
+  }, COLD_START_WARN_MS)
+
  try {
-    const resp = await fetch(url, { signal: AbortSignal.timeout(30_000) })
+    const resp = await fetch(ACTION_HEALTH_URL, {
+      signal: AbortSignal.timeout(COLD_START_BUDGET_MS),
+    })
    const elapsed = ((performance.now() - start) / 1000).toFixed(2)
    const body = await resp.text()
    console.log(`  Status: ${resp.status} (${elapsed}s)`)
-    console.log(`  Body: ${body.slice(0, 200)}`)
+    console.log(`  Body:   ${body.slice(0, 400)}`)
    return resp.ok
  } catch (err) {
    const elapsed = ((performance.now() - start) / 1000).toFixed(2)
@@ -36,63 +75,34 @@ async function checkHealth(name: string, url: string): Promise<boolean> {
      `  FAILED (${elapsed}s): ${err instanceof Error ? err.message : err}`,
    )
    return false
+  } finally {
+    clearTimeout(warn)
  }
 }

-async function testGenerate(
-  name: string,
-  url: string,
+async function generate(
+  label: string,
  payload: Record<string, unknown>,
-): Promise<Record<string, unknown> | null> {
-  console.log(`\n--- ${name} generate ---`)
-  console.log(`  URL: ${url}`)
+): Promise<CladoResponse | null> {
+  console.log(`\n--- ${label} ---`)
+  console.log(`  URL:         ${ACTION_URL}`)
  console.log(`  Instruction: ${payload.instruction}`)
  console.log(
-    `  Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`,
+    `  Image size:  ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`,
  )
-  if (payload.history) console.log(`  History: ${payload.history}`)
+  if (payload.history && payload.history !== 'None') {
+    console.log(`  History:     ${payload.history}`)
+  }

  const start = performance.now()
+  let resp: Response
  try {
-    const resp = await fetch(url, {
+    resp = await fetch(ACTION_URL, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
-      signal: AbortSignal.timeout(120_000),
+      signal: AbortSignal.timeout(COLD_START_BUDGET_MS),
    })
-    const elapsed = ((performance.now() - start) / 1000).toFixed(2)
-
-    if (!resp.ok) {
-      const body = await resp.text()
-      console.log(`  FAILED: HTTP ${resp.status} (${elapsed}s)`)
-      console.log(`  Body: ${body.slice(0, 400)}`)
-      return null
-    }
-
-    const result = (await resp.json()) as Record<string, unknown>
-    console.log(`  Status: ${resp.status} (${elapsed}s)`)
-    console.log(`  Action: ${result.action}`)
-    if (result.x !== null && result.x !== undefined)
-      console.log(`  Coordinates: (${result.x}, ${result.y})`)
-    if (result.text)
-      console.log(`  Text: ${(result.text as string).slice(0, 100)}`)
-    if (result.key) console.log(`  Key: ${result.key}`)
-    if (result.inference_time_seconds)
-      console.log(`  Inference: ${result.inference_time_seconds}s`)
-
-    // Show thinking if present
-    const raw = result.raw_response as string | undefined
-    if (raw) {
-      const thinkMatch = raw.match(/<thinking>([\s\S]*?)<\/thinking>/)
-      if (thinkMatch) {
-        const thinking = thinkMatch[1].trim()
-        console.log(
-          `  Thinking: ${thinking.slice(0, 200)}${thinking.length > 200 ? '...' : ''}`,
-        )
-      }
-    }
-
-    return result
  } catch (err) {
    const elapsed = ((performance.now() - start) / 1000).toFixed(2)
    console.log(
@@ -100,6 +110,50 @@ async function testGenerate(
    )
    return null
  }
+  const elapsed = ((performance.now() - start) / 1000).toFixed(2)
+
+  if (!resp.ok) {
+    const body = await resp.text()
+    console.log(`  HTTP ${resp.status} ${resp.statusText} (${elapsed}s)`)
+    console.log(`  Body: ${body.slice(0, 400)}`)
+    return null
+  }
+
+  const result = (await resp.json()) as CladoResponse
+  console.log(`  HTTP ${resp.status} (${elapsed}s)`)
+  console.log(`  action:                ${result.action ?? 'null'}`)
+  if (result.parse_error) {
+    console.log(`  parse_error:           ${result.parse_error}`)
+  }
+  if (result.thinking) {
+    const trimmed = result.thinking.replace(/\s+/g, ' ').trim()
+    console.log(
+      `  thinking:              ${trimmed.slice(0, 240)}${trimmed.length > 240 ? '…' : ''}`,
+    )
+  }
+  if (typeof result.x === 'number' || typeof result.y === 'number') {
+    console.log(`  x, y:                  ${result.x}, ${result.y}`)
+  }
+  if (typeof result.text === 'string')
+    console.log(`  text:                  ${result.text.slice(0, 120)}`)
+  if (typeof result.key === 'string')
+    console.log(`  key:                   ${result.key}`)
+  if (typeof result.direction === 'string')
+    console.log(`  direction:             ${result.direction}`)
+  if (typeof result.amount === 'number')
+    console.log(`  amount:                ${result.amount}`)
+  if (typeof result.startX === 'number' || typeof result.endX === 'number') {
+    console.log(
+      `  drag:                  (${result.startX}, ${result.startY}) → (${result.endX}, ${result.endY})`,
+    )
+  }
+  if (typeof result.time === 'number')
+    console.log(`  time:                  ${result.time}s`)
+  if (result.final_answer)
+    console.log(`  final_answer:          ${result.final_answer.slice(0, 240)}`)
+  if (typeof result.inference_time_seconds === 'number')
+    console.log(`  inference_time_seconds: ${result.inference_time_seconds}`)
+  return result
 }

 async function loadScreenshot(path?: string): Promise<string> {
@@ -110,10 +164,9 @@ async function loadScreenshot(path?: string): Promise<string> {
    return data.toString('base64')
  }

-  // Try to capture from a running BrowserOS server
  const serverUrl = process.env.BROWSEROS_URL || 'http://127.0.0.1:9110'
  console.log(
-    `No screenshot path provided. Trying to capture from ${serverUrl}...`,
+    `No screenshot path provided. Capturing from ${serverUrl} via MCP...`,
  )

  const { Client } = await import('@modelcontextprotocol/sdk/client/index.js')
@@ -134,82 +187,101 @@ async function loadScreenshot(path?: string): Promise<string> {
      arguments: { format: 'png', page: 1 },
    })) as { content: Array<{ type: string; data?: string }> }

-    const imageContent = result.content?.find((c) => c.type === 'image')
-    if (!imageContent?.data)
-      throw new Error('No image data in screenshot response')
+    const image = result.content?.find((c) => c.type === 'image')
+    if (!image?.data)
+      throw new Error('No image data in take_screenshot response')

    console.log(
-      `Captured screenshot (${(imageContent.data.length / 1024).toFixed(0)} KB base64)`,
+      `Captured screenshot (${(image.data.length / 1024).toFixed(0)} KB base64)`,
    )
-    return imageContent.data
+    return image.data
  } finally {
    try {
      await transport.close()
-    } catch {}
+    } catch {
+      /* ignore */
+    }
  }
 }

+function summarize(history: CladoResponse[]): string {
+  if (history.length === 0) return 'None'
+  return history
+    .map((h) => {
+      switch (h.action) {
+        case 'click':
+        case 'double_click':
+        case 'right_click':
+        case 'hover':
+          return `${h.action}(${h.x}, ${h.y})`
+        case 'type':
+          return `type(${JSON.stringify(h.text ?? '')})`
+        case 'press_key':
+          return `press_key(${JSON.stringify(h.key ?? '')})`
+        case 'scroll':
+          return `scroll(${h.direction ?? 'down'})`
+        case 'drag':
+          return `drag(${h.startX},${h.startY} -> ${h.endX},${h.endY})`
+        case 'wait':
+          return `wait(${h.time ?? 1}s)`
+        case 'end':
+          return 'end()'
+        default:
+          return h.action ?? 'invalid'
+      }
+    })
+    .join(' -> ')
+}
+
 async function main() {
-  const screenshotPath = process.argv[2]
+  console.log('=== Clado action endpoint smoke test ===')

-  console.log('=== Clado API Test ===\n')
-
-  // Health checks (parallel)
-  const [actionHealthy, groundingHealthy] = await Promise.all([
-    checkHealth('Action Model', ACTION_HEALTH_URL),
-    checkHealth('Grounding Model', GROUNDING_HEALTH_URL),
-  ])
-
-  if (!actionHealthy && !groundingHealthy) {
-    console.log('\nBoth endpoints are down. Exiting.')
+  const healthy = await checkHealth()
+  if (!healthy) {
+    console.log('\nHealth check failed. Exiting.')
    process.exit(1)
  }

-  // Load screenshot
  let imageBase64: string
  try {
-    imageBase64 = await loadScreenshot(screenshotPath)
+    imageBase64 = await loadScreenshot(process.argv[2])
  } catch (err) {
    console.log(
      `\nFailed to load screenshot: ${err instanceof Error ? err.message : err}`,
    )
    console.log(
-      'Provide a screenshot path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
+      'Pass a path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
    )
    process.exit(1)
  }

-  const instruction = 'Click on the search button or search bar'
+  const history: CladoResponse[] = []

-  // Test grounding model
-  if (groundingHealthy) {
-    await testGenerate('Grounding Model', GROUNDING_URL, {
-      instruction,
+  // Step 1: first call with no history — ask the model to click the search bar.
+  const step1 = await generate('Step 1: cold task', {
+    instruction: 'Find the search bar and click it',
+    image_base64: imageBase64,
+    history: 'None',
+  })
+  if (step1?.action) history.push(step1)
+
+  // Step 2: continuation with history, asks for typing.
+  if (step1?.action) {
+    const step2 = await generate('Step 2: with history', {
+      instruction: 'Type "hello world" into the search bar',
      image_base64: imageBase64,
+      history: summarize(history),
    })
-  } else {
-    console.log('\nSkipping grounding model (unhealthy)')
+    if (step2?.action) history.push(step2)
  }

-  // Test action model (no history)
-  if (actionHealthy) {
-    const result = await testGenerate('Action Model (step 1)', ACTION_URL, {
-      instruction,
-      image_base64: imageBase64,
-      history: 'None',
-    })
-
-    // Test action model with history (simulate multi-turn)
-    if (result && result.action === 'click') {
-      await testGenerate('Action Model (step 2, with history)', ACTION_URL, {
-        instruction: 'Type "hello world" in the search bar',
-        image_base64: imageBase64,
-        history: `click(${result.x}, ${result.y})`,
-      })
-    }
-  } else {
-    console.log('\nSkipping action model (unhealthy)')
-  }
+  // Step 3: ask for end with a final answer to exercise that field.
+  await generate('Step 3: ask for end+final_answer', {
+    instruction:
+      'You have completed the task. Reply with end() and final_answer="done".',
+    image_base64: imageBase64,
+    history: summarize(history),
+  })

  console.log('\n=== Done ===')
 }