mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 23:53:25 +00:00
Compare commits
13 Commits
fix/clean-
...
feat/eval-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7ee8dedd53 | ||
|
|
a3b5ef4da3 | ||
|
|
3333728e4e | ||
|
|
5c6fd34d3e | ||
|
|
1a1220dff5 | ||
|
|
dc98858cc3 | ||
|
|
72cbffe2bb | ||
|
|
34fdf08521 | ||
|
|
be6858d589 | ||
|
|
33f68a0d74 | ||
|
|
5cf7b765d0 | ||
|
|
5ed0879d31 | ||
|
|
e136094305 |
11
.github/workflows/eval-weekly.yml
vendored
11
.github/workflows/eval-weekly.yml
vendored
@@ -71,6 +71,9 @@ jobs:
|
||||
NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
|
||||
BROWSEROS_BINARY: /usr/bin/browseros
|
||||
WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
|
||||
# OpenClaw container runtime is macOS-only; opt the Linux runner
|
||||
# into the no-op stub so the server can boot and the eval can run.
|
||||
BROWSEROS_SKIP_OPENCLAW: '1'
|
||||
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
|
||||
run: |
|
||||
echo "Running eval with config: $EVAL_CONFIG"
|
||||
@@ -109,3 +112,11 @@ jobs:
|
||||
with:
|
||||
name: eval-report-${{ github.run_id }}
|
||||
path: /tmp/eval-report.html
|
||||
|
||||
- name: Upload server stderr logs (for post-mortem on startup failures)
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: browseros-server-logs-${{ github.run_id }}
|
||||
path: /tmp/browseros-server-logs/
|
||||
if-no-files-found: ignore
|
||||
|
||||
4
packages/browseros-agent/apps/eval/README.md
vendored
4
packages/browseros-agent/apps/eval/README.md
vendored
@@ -66,9 +66,9 @@ The orchestrator works with any LLM provider. The executor can be another LLM, o
|
||||
},
|
||||
"executor": {
|
||||
"provider": "clado-action",
|
||||
"model": "qwen3-vl-30b-a3b-instruct",
|
||||
"model": "Qwen3.5-35B-A3B-action-000159-merged",
|
||||
"apiKey": "",
|
||||
"baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run"
|
||||
"baseUrl": "https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,13 +2,13 @@
|
||||
"agent": {
|
||||
"type": "single",
|
||||
"provider": "openai-compatible",
|
||||
"model": "moonshotai/kimi-k2.5",
|
||||
"apiKey": "OPENROUTER_API_KEY",
|
||||
"baseUrl": "https://openrouter.ai/api/v1",
|
||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||
"apiKey": "FIREWORKS_API_KEY",
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1",
|
||||
"supportsImages": true
|
||||
},
|
||||
"dataset": "../data/agisdk-real.jsonl",
|
||||
"num_workers": 10,
|
||||
"num_workers": 4,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
|
||||
@@ -9,12 +9,12 @@
|
||||
},
|
||||
"executor": {
|
||||
"provider": "clado-action",
|
||||
"model": "qwen3-vl-30b-a3b-instruct",
|
||||
"model": "Qwen3.5-35B-A3B-action-000159-merged",
|
||||
"apiKey": "",
|
||||
"baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run"
|
||||
"baseUrl": "https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run"
|
||||
}
|
||||
},
|
||||
"dataset": "../data/webbench-2of4-50.jsonl",
|
||||
"dataset": "../data/agisdk-real.jsonl",
|
||||
"num_workers": 10,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
@@ -23,11 +23,11 @@
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": false
|
||||
"headless": true
|
||||
},
|
||||
"captcha": {
|
||||
"api_key_env": "NOPECHA_API_KEY"
|
||||
},
|
||||
"graders": ["performance_grader"],
|
||||
"graders": ["agisdk_state_diff"],
|
||||
"timeout_ms": 1800000
|
||||
}
|
||||
|
||||
@@ -81,13 +81,30 @@ def main():
|
||||
|
||||
reward_val = float(reward_val) if reward_val is not None else 0.0
|
||||
results = info.get("results", [])
|
||||
# `info["results"]` aligns 1:1 with `tc.task.evals` — zip them so we can
|
||||
# surface the human-readable description and JMESPath query alongside
|
||||
# the pass/fail. Without this the only feedback was a stringified dict.
|
||||
evals = list(getattr(tc.task, "evals", []))
|
||||
|
||||
per_criterion = []
|
||||
softened_count = 0
|
||||
for r in results:
|
||||
for idx, r in enumerate(results):
|
||||
passed = bool(r[0])
|
||||
detail = r[1] if len(r) > 1 else ""
|
||||
entry: dict = {"passed": passed, "detail": str(detail)}
|
||||
detail = r[1] if len(r) > 1 else {}
|
||||
ev = evals[idx] if idx < len(evals) else None
|
||||
|
||||
actual_value = expected_value = None
|
||||
if isinstance(detail, dict):
|
||||
actual_value = detail.get("actual_value")
|
||||
expected_value = detail.get("expected_value")
|
||||
|
||||
entry: dict = {
|
||||
"passed": passed,
|
||||
"description": getattr(ev, "description", "") or "",
|
||||
"query": getattr(ev, "query", "") or "",
|
||||
"expected_value": expected_value,
|
||||
"actual_value": actual_value,
|
||||
}
|
||||
if not _STRICT and not passed and _soft_string_match(detail):
|
||||
entry["passed"] = True
|
||||
entry["softened"] = True
|
||||
@@ -100,9 +117,43 @@ def main():
|
||||
if all_pass and reward_val != 1.0:
|
||||
reward_val = 1.0
|
||||
|
||||
out_message = str(message)
|
||||
if softened_count and all_pass:
|
||||
out_message = f"Task passed (with {softened_count} softened string criterion/criteria)."
|
||||
# Build a useful message: list every criterion with a pass/fail icon
|
||||
# so the viewer's grader pill shows the full check-list, not just
|
||||
# failures. This becomes the `reasoning` shown in the viewer.
|
||||
if not per_criterion:
|
||||
# Defensive: agisdk returned no criteria — fall back to its message.
|
||||
out_message = str(message)
|
||||
else:
|
||||
failures = [c for c in per_criterion if not c["passed"]]
|
||||
if all_pass:
|
||||
header = (
|
||||
f"All {len(per_criterion)} criteria passed"
|
||||
+ (
|
||||
f" ({softened_count} softened)."
|
||||
if softened_count
|
||||
else "."
|
||||
)
|
||||
)
|
||||
else:
|
||||
header = (
|
||||
f"{len(failures)} of {len(per_criterion)} criteria failed:"
|
||||
)
|
||||
|
||||
lines = []
|
||||
for c in per_criterion:
|
||||
icon = "✓" if c["passed"] else "✗"
|
||||
desc = c["description"] or c["query"] or "<unknown>"
|
||||
soft = " (softened)" if c.get("softened") else ""
|
||||
if c["passed"]:
|
||||
lines.append(f"{icon} {desc}{soft}")
|
||||
else:
|
||||
exp_s = repr(c["expected_value"])
|
||||
act_s = repr(c["actual_value"])
|
||||
lines.append(
|
||||
f"{icon} {desc}: expected {exp_s}, got {act_s}"
|
||||
)
|
||||
|
||||
out_message = header + "\n" + "\n".join(lines)
|
||||
|
||||
print(
|
||||
json.dumps(
|
||||
|
||||
@@ -1,34 +1,73 @@
|
||||
/**
|
||||
* Test script for Clado API endpoints (grounding + action models)
|
||||
* Smoke-test for the Clado BrowserOS Action endpoint.
|
||||
*
|
||||
* Health-checks the model, then runs a generate call and prints every
|
||||
* field the new contract documents (action, coordinates, text, key,
|
||||
* direction, scroll/drag fields, wait, end+final_answer, thinking,
|
||||
* parse_error, raw_response).
|
||||
*
|
||||
* Usage:
|
||||
* bun apps/eval/scripts/test-clado-api.ts [screenshot-path]
|
||||
*
|
||||
* If no screenshot provided, captures one from a running BrowserOS server.
|
||||
* If no screenshot path is given, captures one over MCP from a
|
||||
* running BrowserOS server (default http://127.0.0.1:9110, override
|
||||
* with BROWSEROS_URL).
|
||||
*
|
||||
* Cold start can take ~5 minutes; the script waits up to 6.
|
||||
*/
|
||||
|
||||
import { readFile } from 'node:fs/promises'
|
||||
import { resolve } from 'node:path'
|
||||
|
||||
const ACTION_URL =
|
||||
'https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run'
|
||||
'https://clado-ai--clado-browseros-action-000159-merged-actionmod-f4a6ef.modal.run'
|
||||
const ACTION_HEALTH_URL =
|
||||
'https://clado-ai--clado-browseros-action-actionmodel-health.modal.run'
|
||||
const GROUNDING_URL =
|
||||
'https://clado-ai--clado-browseros-grounding-groundingmodel-generate.modal.run'
|
||||
const GROUNDING_HEALTH_URL =
|
||||
'https://clado-ai--clado-browseros-grounding-groundingmodel-health.modal.run'
|
||||
'https://clado-ai--clado-browseros-action-000159-merged-actionmod-5e5033.modal.run'
|
||||
|
||||
async function checkHealth(name: string, url: string): Promise<boolean> {
|
||||
console.log(`\n--- ${name} health check ---`)
|
||||
console.log(` URL: ${url}`)
|
||||
const COLD_START_BUDGET_MS = 360_000 // 6 min — Clado cold start is ~5 min
|
||||
const COLD_START_WARN_MS = 30_000
|
||||
|
||||
interface CladoResponse {
|
||||
action?: string | null
|
||||
thinking?: string | null
|
||||
raw_response?: string
|
||||
parse_error?: string | null
|
||||
inference_time_seconds?: number
|
||||
x?: number
|
||||
y?: number
|
||||
text?: string
|
||||
key?: string
|
||||
direction?: string
|
||||
amount?: number
|
||||
startX?: number
|
||||
startY?: number
|
||||
endX?: number
|
||||
endY?: number
|
||||
time?: number
|
||||
final_answer?: string | null
|
||||
}
|
||||
|
||||
async function checkHealth(): Promise<boolean> {
|
||||
console.log(`\n--- Action model health ---`)
|
||||
console.log(` URL: ${ACTION_HEALTH_URL}`)
|
||||
console.log(
|
||||
` Note: cold start can take ~5 min; waiting up to ${COLD_START_BUDGET_MS / 1000}s.`,
|
||||
)
|
||||
const start = performance.now()
|
||||
const warn = setTimeout(() => {
|
||||
console.log(
|
||||
` ...still waiting (${COLD_START_WARN_MS / 1000}s in) — model is likely cold-starting on Modal.`,
|
||||
)
|
||||
}, COLD_START_WARN_MS)
|
||||
|
||||
try {
|
||||
const resp = await fetch(url, { signal: AbortSignal.timeout(30_000) })
|
||||
const resp = await fetch(ACTION_HEALTH_URL, {
|
||||
signal: AbortSignal.timeout(COLD_START_BUDGET_MS),
|
||||
})
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
const body = await resp.text()
|
||||
console.log(` Status: ${resp.status} (${elapsed}s)`)
|
||||
console.log(` Body: ${body.slice(0, 200)}`)
|
||||
console.log(` Body: ${body.slice(0, 400)}`)
|
||||
return resp.ok
|
||||
} catch (err) {
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
@@ -36,63 +75,34 @@ async function checkHealth(name: string, url: string): Promise<boolean> {
|
||||
` FAILED (${elapsed}s): ${err instanceof Error ? err.message : err}`,
|
||||
)
|
||||
return false
|
||||
} finally {
|
||||
clearTimeout(warn)
|
||||
}
|
||||
}
|
||||
|
||||
async function testGenerate(
|
||||
name: string,
|
||||
url: string,
|
||||
async function generate(
|
||||
label: string,
|
||||
payload: Record<string, unknown>,
|
||||
): Promise<Record<string, unknown> | null> {
|
||||
console.log(`\n--- ${name} generate ---`)
|
||||
console.log(` URL: ${url}`)
|
||||
): Promise<CladoResponse | null> {
|
||||
console.log(`\n--- ${label} ---`)
|
||||
console.log(` URL: ${ACTION_URL}`)
|
||||
console.log(` Instruction: ${payload.instruction}`)
|
||||
console.log(
|
||||
` Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`,
|
||||
` Image size: ${((payload.image_base64 as string).length / 1024).toFixed(0)} KB (base64)`,
|
||||
)
|
||||
if (payload.history) console.log(` History: ${payload.history}`)
|
||||
if (payload.history && payload.history !== 'None') {
|
||||
console.log(` History: ${payload.history}`)
|
||||
}
|
||||
|
||||
const start = performance.now()
|
||||
let resp: Response
|
||||
try {
|
||||
const resp = await fetch(url, {
|
||||
resp = await fetch(ACTION_URL, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify(payload),
|
||||
signal: AbortSignal.timeout(120_000),
|
||||
signal: AbortSignal.timeout(COLD_START_BUDGET_MS),
|
||||
})
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
|
||||
if (!resp.ok) {
|
||||
const body = await resp.text()
|
||||
console.log(` FAILED: HTTP ${resp.status} (${elapsed}s)`)
|
||||
console.log(` Body: ${body.slice(0, 400)}`)
|
||||
return null
|
||||
}
|
||||
|
||||
const result = (await resp.json()) as Record<string, unknown>
|
||||
console.log(` Status: ${resp.status} (${elapsed}s)`)
|
||||
console.log(` Action: ${result.action}`)
|
||||
if (result.x !== null && result.x !== undefined)
|
||||
console.log(` Coordinates: (${result.x}, ${result.y})`)
|
||||
if (result.text)
|
||||
console.log(` Text: ${(result.text as string).slice(0, 100)}`)
|
||||
if (result.key) console.log(` Key: ${result.key}`)
|
||||
if (result.inference_time_seconds)
|
||||
console.log(` Inference: ${result.inference_time_seconds}s`)
|
||||
|
||||
// Show thinking if present
|
||||
const raw = result.raw_response as string | undefined
|
||||
if (raw) {
|
||||
const thinkMatch = raw.match(/<thinking>([\s\S]*?)<\/thinking>/)
|
||||
if (thinkMatch) {
|
||||
const thinking = thinkMatch[1].trim()
|
||||
console.log(
|
||||
` Thinking: ${thinking.slice(0, 200)}${thinking.length > 200 ? '...' : ''}`,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
} catch (err) {
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
console.log(
|
||||
@@ -100,6 +110,50 @@ async function testGenerate(
|
||||
)
|
||||
return null
|
||||
}
|
||||
const elapsed = ((performance.now() - start) / 1000).toFixed(2)
|
||||
|
||||
if (!resp.ok) {
|
||||
const body = await resp.text()
|
||||
console.log(` HTTP ${resp.status} ${resp.statusText} (${elapsed}s)`)
|
||||
console.log(` Body: ${body.slice(0, 400)}`)
|
||||
return null
|
||||
}
|
||||
|
||||
const result = (await resp.json()) as CladoResponse
|
||||
console.log(` HTTP ${resp.status} (${elapsed}s)`)
|
||||
console.log(` action: ${result.action ?? 'null'}`)
|
||||
if (result.parse_error) {
|
||||
console.log(` parse_error: ${result.parse_error}`)
|
||||
}
|
||||
if (result.thinking) {
|
||||
const trimmed = result.thinking.replace(/\s+/g, ' ').trim()
|
||||
console.log(
|
||||
` thinking: ${trimmed.slice(0, 240)}${trimmed.length > 240 ? '…' : ''}`,
|
||||
)
|
||||
}
|
||||
if (typeof result.x === 'number' || typeof result.y === 'number') {
|
||||
console.log(` x, y: ${result.x}, ${result.y}`)
|
||||
}
|
||||
if (typeof result.text === 'string')
|
||||
console.log(` text: ${result.text.slice(0, 120)}`)
|
||||
if (typeof result.key === 'string')
|
||||
console.log(` key: ${result.key}`)
|
||||
if (typeof result.direction === 'string')
|
||||
console.log(` direction: ${result.direction}`)
|
||||
if (typeof result.amount === 'number')
|
||||
console.log(` amount: ${result.amount}`)
|
||||
if (typeof result.startX === 'number' || typeof result.endX === 'number') {
|
||||
console.log(
|
||||
` drag: (${result.startX}, ${result.startY}) → (${result.endX}, ${result.endY})`,
|
||||
)
|
||||
}
|
||||
if (typeof result.time === 'number')
|
||||
console.log(` time: ${result.time}s`)
|
||||
if (result.final_answer)
|
||||
console.log(` final_answer: ${result.final_answer.slice(0, 240)}`)
|
||||
if (typeof result.inference_time_seconds === 'number')
|
||||
console.log(` inference_time_seconds: ${result.inference_time_seconds}`)
|
||||
return result
|
||||
}
|
||||
|
||||
async function loadScreenshot(path?: string): Promise<string> {
|
||||
@@ -110,10 +164,9 @@ async function loadScreenshot(path?: string): Promise<string> {
|
||||
return data.toString('base64')
|
||||
}
|
||||
|
||||
// Try to capture from a running BrowserOS server
|
||||
const serverUrl = process.env.BROWSEROS_URL || 'http://127.0.0.1:9110'
|
||||
console.log(
|
||||
`No screenshot path provided. Trying to capture from ${serverUrl}...`,
|
||||
`No screenshot path provided. Capturing from ${serverUrl} via MCP...`,
|
||||
)
|
||||
|
||||
const { Client } = await import('@modelcontextprotocol/sdk/client/index.js')
|
||||
@@ -134,82 +187,101 @@ async function loadScreenshot(path?: string): Promise<string> {
|
||||
arguments: { format: 'png', page: 1 },
|
||||
})) as { content: Array<{ type: string; data?: string }> }
|
||||
|
||||
const imageContent = result.content?.find((c) => c.type === 'image')
|
||||
if (!imageContent?.data)
|
||||
throw new Error('No image data in screenshot response')
|
||||
const image = result.content?.find((c) => c.type === 'image')
|
||||
if (!image?.data)
|
||||
throw new Error('No image data in take_screenshot response')
|
||||
|
||||
console.log(
|
||||
`Captured screenshot (${(imageContent.data.length / 1024).toFixed(0)} KB base64)`,
|
||||
`Captured screenshot (${(image.data.length / 1024).toFixed(0)} KB base64)`,
|
||||
)
|
||||
return imageContent.data
|
||||
return image.data
|
||||
} finally {
|
||||
try {
|
||||
await transport.close()
|
||||
} catch {}
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function summarize(history: CladoResponse[]): string {
|
||||
if (history.length === 0) return 'None'
|
||||
return history
|
||||
.map((h) => {
|
||||
switch (h.action) {
|
||||
case 'click':
|
||||
case 'double_click':
|
||||
case 'right_click':
|
||||
case 'hover':
|
||||
return `${h.action}(${h.x}, ${h.y})`
|
||||
case 'type':
|
||||
return `type(${JSON.stringify(h.text ?? '')})`
|
||||
case 'press_key':
|
||||
return `press_key(${JSON.stringify(h.key ?? '')})`
|
||||
case 'scroll':
|
||||
return `scroll(${h.direction ?? 'down'})`
|
||||
case 'drag':
|
||||
return `drag(${h.startX},${h.startY} -> ${h.endX},${h.endY})`
|
||||
case 'wait':
|
||||
return `wait(${h.time ?? 1}s)`
|
||||
case 'end':
|
||||
return 'end()'
|
||||
default:
|
||||
return h.action ?? 'invalid'
|
||||
}
|
||||
})
|
||||
.join(' -> ')
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const screenshotPath = process.argv[2]
|
||||
console.log('=== Clado action endpoint smoke test ===')
|
||||
|
||||
console.log('=== Clado API Test ===\n')
|
||||
|
||||
// Health checks (parallel)
|
||||
const [actionHealthy, groundingHealthy] = await Promise.all([
|
||||
checkHealth('Action Model', ACTION_HEALTH_URL),
|
||||
checkHealth('Grounding Model', GROUNDING_HEALTH_URL),
|
||||
])
|
||||
|
||||
if (!actionHealthy && !groundingHealthy) {
|
||||
console.log('\nBoth endpoints are down. Exiting.')
|
||||
const healthy = await checkHealth()
|
||||
if (!healthy) {
|
||||
console.log('\nHealth check failed. Exiting.')
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
// Load screenshot
|
||||
let imageBase64: string
|
||||
try {
|
||||
imageBase64 = await loadScreenshot(screenshotPath)
|
||||
imageBase64 = await loadScreenshot(process.argv[2])
|
||||
} catch (err) {
|
||||
console.log(
|
||||
`\nFailed to load screenshot: ${err instanceof Error ? err.message : err}`,
|
||||
)
|
||||
console.log(
|
||||
'Provide a screenshot path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
|
||||
'Pass a path: bun apps/eval/scripts/test-clado-api.ts path/to/screenshot.png',
|
||||
)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
const instruction = 'Click on the search button or search bar'
|
||||
const history: CladoResponse[] = []
|
||||
|
||||
// Test grounding model
|
||||
if (groundingHealthy) {
|
||||
await testGenerate('Grounding Model', GROUNDING_URL, {
|
||||
instruction,
|
||||
// Step 1: open task — let the model decide what to do.
|
||||
const step1 = await generate('Step 1: cold task', {
|
||||
instruction: 'Find the search bar and click it',
|
||||
image_base64: imageBase64,
|
||||
history: 'None',
|
||||
})
|
||||
if (step1?.action) history.push(step1)
|
||||
|
||||
// Step 2: continuation with history, asks for typing.
|
||||
if (step1?.action) {
|
||||
const step2 = await generate('Step 2: with history', {
|
||||
instruction: 'Type "hello world" into the search bar',
|
||||
image_base64: imageBase64,
|
||||
history: summarize(history),
|
||||
})
|
||||
} else {
|
||||
console.log('\nSkipping grounding model (unhealthy)')
|
||||
if (step2?.action) history.push(step2)
|
||||
}
|
||||
|
||||
// Test action model (no history)
|
||||
if (actionHealthy) {
|
||||
const result = await testGenerate('Action Model (step 1)', ACTION_URL, {
|
||||
instruction,
|
||||
image_base64: imageBase64,
|
||||
history: 'None',
|
||||
})
|
||||
|
||||
// Test action model with history (simulate multi-turn)
|
||||
if (result && result.action === 'click') {
|
||||
await testGenerate('Action Model (step 2, with history)', ACTION_URL, {
|
||||
instruction: 'Type "hello world" in the search bar',
|
||||
image_base64: imageBase64,
|
||||
history: `click(${result.x}, ${result.y})`,
|
||||
})
|
||||
}
|
||||
} else {
|
||||
console.log('\nSkipping action model (unhealthy)')
|
||||
}
|
||||
// Step 3: ask for end with a final answer to exercise that field.
|
||||
await generate('Step 3: ask for end+final_answer', {
|
||||
instruction:
|
||||
'You have completed the task. Reply with end() and final_answer="done".',
|
||||
image_base64: imageBase64,
|
||||
history: summarize(history),
|
||||
})
|
||||
|
||||
console.log('\n=== Done ===')
|
||||
}
|
||||
|
||||
@@ -31,7 +31,7 @@ const PAGE_SCOPED_TOOLS = new Set<string>([
|
||||
])
|
||||
|
||||
interface CladoActionResponse {
|
||||
action?: string
|
||||
action?: string | null
|
||||
x?: number
|
||||
y?: number
|
||||
text?: string
|
||||
@@ -43,8 +43,11 @@ interface CladoActionResponse {
|
||||
endY?: number
|
||||
amount?: number
|
||||
time?: number
|
||||
final_answer?: string | null
|
||||
inference_time_seconds?: number
|
||||
raw_response?: string
|
||||
thinking?: string | null
|
||||
parse_error?: string | null
|
||||
}
|
||||
|
||||
interface Viewport {
|
||||
@@ -65,9 +68,14 @@ interface CladoAction {
|
||||
endY?: number
|
||||
amount?: number
|
||||
time?: number
|
||||
final_answer?: string
|
||||
}
|
||||
|
||||
type RawActionPayload = Partial<CladoAction>
|
||||
type RawActionPayload = Partial<Omit<CladoAction, 'final_answer'>> & {
|
||||
final_answer?: string | null
|
||||
}
|
||||
|
||||
const MAX_CONSECUTIVE_PARSE_FAILURES = 3
|
||||
|
||||
interface ActionPoint {
|
||||
x: number
|
||||
@@ -135,6 +143,8 @@ export class CladoActionExecutor {
|
||||
const actionHistory: CladoAction[] = []
|
||||
let predictionCalls = 0
|
||||
const thinkingTrace: string[] = []
|
||||
let consecutiveParseFailures = 0
|
||||
let finalAnswer: string | undefined
|
||||
|
||||
let status: ExecutorResult['status'] = 'done'
|
||||
let reason = 'Goal executed.'
|
||||
@@ -209,6 +219,17 @@ export class CladoActionExecutor {
|
||||
|
||||
const predictedActions = this.parseActions(prediction)
|
||||
if (predictedActions.length === 0) {
|
||||
// Per Clado contract: HTTP 200 with action=null on parse failure.
|
||||
// Count as an invalid step so the model can self-correct on the
|
||||
// next call instead of dropping the trajectory.
|
||||
consecutiveParseFailures++
|
||||
const parseError =
|
||||
prediction.parse_error ?? 'no parsable <answer> in raw_response'
|
||||
actionHistory.push({
|
||||
action: 'invalid',
|
||||
text: `parse_error: ${parseError}`,
|
||||
})
|
||||
this.stepsUsed++
|
||||
await this.callbacks.onStepFinish?.({
|
||||
toolCalls: [
|
||||
{
|
||||
@@ -224,14 +245,21 @@ export class CladoActionExecutor {
|
||||
output: {
|
||||
prediction: this.summarizePrediction(prediction),
|
||||
parsedActions: [],
|
||||
parseError,
|
||||
consecutiveParseFailures,
|
||||
},
|
||||
},
|
||||
],
|
||||
})
|
||||
status = 'blocked'
|
||||
reason = 'Clado action response did not contain a valid action.'
|
||||
break
|
||||
|
||||
if (consecutiveParseFailures >= MAX_CONSECUTIVE_PARSE_FAILURES) {
|
||||
status = 'blocked'
|
||||
reason = `Clado returned ${consecutiveParseFailures} consecutive unparseable responses.`
|
||||
break
|
||||
}
|
||||
continue
|
||||
}
|
||||
consecutiveParseFailures = 0
|
||||
|
||||
let requestedStop = false
|
||||
const executionNotes: string[] = []
|
||||
@@ -272,7 +300,12 @@ export class CladoActionExecutor {
|
||||
|
||||
actionHistory.push(predictedAction)
|
||||
if (predictedAction.action === 'end') {
|
||||
reason = 'Model requested end() and marked task complete.'
|
||||
if (predictedAction.final_answer) {
|
||||
finalAnswer = predictedAction.final_answer
|
||||
reason = `Model requested end() with final_answer: ${predictedAction.final_answer.slice(0, 240)}`
|
||||
} else {
|
||||
reason = 'Model requested end() and marked task complete.'
|
||||
}
|
||||
requestedStop = true
|
||||
break
|
||||
}
|
||||
@@ -327,6 +360,7 @@ export class CladoActionExecutor {
|
||||
actions: actionHistory,
|
||||
url: this.currentUrl,
|
||||
thinkingTrace,
|
||||
finalAnswer,
|
||||
})
|
||||
|
||||
return {
|
||||
@@ -440,6 +474,10 @@ export class CladoActionExecutor {
|
||||
endY: typeof payload.endY === 'number' ? payload.endY : undefined,
|
||||
amount: typeof payload.amount === 'number' ? payload.amount : undefined,
|
||||
time: typeof payload.time === 'number' ? payload.time : undefined,
|
||||
final_answer:
|
||||
typeof payload.final_answer === 'string'
|
||||
? payload.final_answer
|
||||
: undefined,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -578,7 +616,9 @@ export class CladoActionExecutor {
|
||||
}
|
||||
|
||||
case 'end': {
|
||||
return 'Model requested end().'
|
||||
return action.final_answer
|
||||
? `Model requested end() with final_answer: ${action.final_answer.slice(0, 240)}`
|
||||
: 'Model requested end().'
|
||||
}
|
||||
|
||||
default: {
|
||||
@@ -588,9 +628,10 @@ export class CladoActionExecutor {
|
||||
}
|
||||
|
||||
private async captureScreenshotBase64(signal?: AbortSignal): Promise<string> {
|
||||
// Clado contract is PNG or JPEG; use PNG for lossless input.
|
||||
const result = await this.runTool(
|
||||
'take_screenshot',
|
||||
{ format: 'webp', quality: 80 },
|
||||
{ format: 'png' },
|
||||
signal,
|
||||
)
|
||||
|
||||
@@ -754,6 +795,11 @@ export class CladoActionExecutor {
|
||||
'C-S-tab': 'Control+Shift+Tab',
|
||||
'C-S-n': 'Control+Shift+N',
|
||||
'C-down': 'Control+ArrowDown',
|
||||
// macOS Cmd shortcuts (Meta in CDP).
|
||||
'M-a': 'Meta+A',
|
||||
'M-c': 'Meta+C',
|
||||
'M-v': 'Meta+V',
|
||||
'M-x': 'Meta+X',
|
||||
'M-f4': 'Alt+F4',
|
||||
}
|
||||
return map[raw] ?? raw
|
||||
@@ -841,7 +887,11 @@ export class CladoActionExecutor {
|
||||
case 'wait':
|
||||
return `${action.action}:${action.time ?? 1}`
|
||||
case 'end':
|
||||
return 'end()'
|
||||
return action.final_answer
|
||||
? `end(${action.final_answer.slice(0, 32)})`
|
||||
: 'end()'
|
||||
case 'invalid':
|
||||
return `invalid(${(action.text ?? '').slice(0, 40)})`
|
||||
default:
|
||||
return action.action
|
||||
}
|
||||
@@ -871,6 +921,8 @@ export class CladoActionExecutor {
|
||||
return `wait(${Math.round(action.time ?? 1)}s)`
|
||||
case 'end':
|
||||
return 'end()'
|
||||
case 'invalid':
|
||||
return 'invalid()'
|
||||
default:
|
||||
return action.action
|
||||
}
|
||||
@@ -885,8 +937,9 @@ export class CladoActionExecutor {
|
||||
actions: CladoAction[]
|
||||
url: string
|
||||
thinkingTrace: string[]
|
||||
finalAnswer?: string
|
||||
}): string {
|
||||
const { status, reason, actions, url, thinkingTrace } = params
|
||||
const { status, reason, actions, url, thinkingTrace, finalAnswer } = params
|
||||
const actionSummary =
|
||||
actions.length === 0
|
||||
? 'No actions were executed.'
|
||||
@@ -907,6 +960,7 @@ export class CladoActionExecutor {
|
||||
`Status: ${status}`,
|
||||
`Reason: ${reason}`,
|
||||
`URL: ${url || 'unknown'}`,
|
||||
finalAnswer ? `Final answer: ${finalAnswer}` : '',
|
||||
'',
|
||||
'Recent actions:',
|
||||
actionSummary,
|
||||
|
||||
@@ -5,4 +5,5 @@
|
||||
export const DEFAULT_TIMEOUT_MS = 30 * 60 * 1000 // 30 minutes
|
||||
export const SCREENSHOT_TIMEOUT_MS = 65_000 // 65s — ensures we get extension's error (60s)
|
||||
export const MAX_ACTIONS_PER_DELEGATION = 15
|
||||
export const CLADO_REQUEST_TIMEOUT_MS = 120_000
|
||||
// Cold start can take ~5 minutes per Clado; 6 minutes leaves headroom.
|
||||
export const CLADO_REQUEST_TIMEOUT_MS = 360_000
|
||||
|
||||
@@ -14,8 +14,11 @@
|
||||
*/
|
||||
|
||||
import {
|
||||
closeSync,
|
||||
existsSync,
|
||||
mkdirSync,
|
||||
mkdtempSync,
|
||||
openSync,
|
||||
readFileSync,
|
||||
rmSync,
|
||||
writeFileSync,
|
||||
@@ -33,7 +36,17 @@ export interface EvalPorts {
|
||||
|
||||
const MAX_RESTART_ATTEMPTS = 3
|
||||
const CDP_WAIT_TIMEOUT_MS = 30_000
|
||||
const SERVER_HEALTH_TIMEOUT_MS = 30_000
|
||||
// Bumped from 30s → 90s while debugging dev-CI startup. Dev's server module
|
||||
// graph is ~108 files larger than main's; cold-cache module load on a CI
|
||||
// runner can take much longer than the original 30s budget allowed.
|
||||
const SERVER_HEALTH_TIMEOUT_MS = 90_000
|
||||
|
||||
// Where per-worker server stderr is written. Captured (rather than ignored)
|
||||
// so eval-weekly.yml can upload these as workflow artifacts on failure for
|
||||
// post-mortem debugging. Path is also referenced in the workflow's artifact
|
||||
// upload step.
|
||||
const SERVER_LOG_DIR =
|
||||
process.env.BROWSEROS_SERVER_LOG_DIR || '/tmp/browseros-server-logs'
|
||||
|
||||
const MONOREPO_ROOT = join(
|
||||
dirname(fileURLToPath(import.meta.url)),
|
||||
@@ -53,6 +66,7 @@ export class BrowserOSAppManager {
|
||||
private ports: EvalPorts
|
||||
private chromeProc: Subprocess | null = null
|
||||
private serverProc: Subprocess | null = null
|
||||
private serverLogFd: number | null = null
|
||||
private tempDir: string | null = null
|
||||
private readonly workerIndex: number
|
||||
private readonly loadExtensions: boolean
|
||||
@@ -183,15 +197,36 @@ export class BrowserOSAppManager {
|
||||
VITE_BROWSEROS_SERVER_PORT: String(server),
|
||||
}
|
||||
|
||||
// Capture both stdout and stderr to a per-worker file so we can
|
||||
// post-mortem startup hangs. The server uses pino which writes logs to
|
||||
// stdout by default — capturing stderr alone misses everything. The
|
||||
// eval-weekly workflow uploads /tmp/browseros-server-logs/ as a workflow
|
||||
// artifact on failure.
|
||||
// Open the per-worker log file under SERVER_LOG_DIR. If the directory
|
||||
// can't be created or the file can't be opened (e.g. unwritable custom
|
||||
// BROWSEROS_SERVER_LOG_DIR), fall back to /dev/null so spawn still works.
|
||||
const logPath = join(SERVER_LOG_DIR, `server-W${this.workerIndex}.log`)
|
||||
let logFd: number
|
||||
try {
|
||||
mkdirSync(SERVER_LOG_DIR, { recursive: true })
|
||||
logFd = openSync(logPath, 'a')
|
||||
} catch {
|
||||
logFd = openSync('/dev/null', 'w')
|
||||
}
|
||||
this.serverLogFd = logFd
|
||||
|
||||
// `start:ci` skips `--watch` (no file-watcher overhead in CI). Falls back
|
||||
// to the regular `start` script outside CI for the dev-watch experience.
|
||||
const startScript = process.env.CI ? 'start:ci' : 'start'
|
||||
this.serverProc = spawn({
|
||||
cmd: ['bun', 'run', '--filter', '@browseros/server', 'start'],
|
||||
cmd: ['bun', 'run', '--filter', '@browseros/server', startScript],
|
||||
cwd: MONOREPO_ROOT,
|
||||
stdout: 'ignore',
|
||||
stderr: 'ignore',
|
||||
stdout: logFd,
|
||||
stderr: logFd,
|
||||
env: serverEnv,
|
||||
})
|
||||
console.log(
|
||||
` [W${this.workerIndex}] Server started (PID: ${this.serverProc.pid})`,
|
||||
` [W${this.workerIndex}] Server started (PID: ${this.serverProc.pid}, logs → ${logPath})`,
|
||||
)
|
||||
|
||||
// --- Wait for Server Health ---
|
||||
@@ -244,6 +279,18 @@ export class BrowserOSAppManager {
|
||||
await this.killProcess(this.serverProc)
|
||||
this.serverProc = null
|
||||
|
||||
// Close the parent's copy of the server log fd. Child kept its own dup
|
||||
// until it exited above, so closing here doesn't truncate any output.
|
||||
// Without this we'd leak one fd per restart attempt across all workers.
|
||||
if (this.serverLogFd !== null) {
|
||||
try {
|
||||
closeSync(this.serverLogFd)
|
||||
} catch {
|
||||
// already closed or invalid — ignore
|
||||
}
|
||||
this.serverLogFd = null
|
||||
}
|
||||
|
||||
// Kill Chrome (graceful → force)
|
||||
await this.killProcess(this.chromeProc)
|
||||
this.chromeProc = null
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
},
|
||||
"scripts": {
|
||||
"start": "bun --watch --env-file=.env.development src/index.ts",
|
||||
"start:ci": "bun --env-file=.env.development src/index.ts",
|
||||
"build": "bun ../../scripts/build/server.ts --target=all",
|
||||
"test": "bun run test:all",
|
||||
"test:all": "bun run ./tests/__helpers__/run-test-group.ts all",
|
||||
|
||||
@@ -146,7 +146,7 @@ export async function createHttpServer(config: HttpServerConfig) {
|
||||
getVmName: () => VM_NAME,
|
||||
},
|
||||
openclawGatewayChat: new OpenClawGatewayChatClient(
|
||||
getOpenClawService().getPort(),
|
||||
() => getOpenClawService().getPort(),
|
||||
async () => getOpenClawService().getGatewayToken(),
|
||||
),
|
||||
openclawProvisioner: {
|
||||
|
||||
@@ -48,7 +48,14 @@ export function buildContainerRuntime(
|
||||
): ContainerRuntime {
|
||||
const platform = input.platform ?? process.platform
|
||||
if (platform !== 'darwin') {
|
||||
if (process.env.NODE_ENV === 'test') {
|
||||
// BROWSEROS_SKIP_OPENCLAW=1 is the explicit opt-in for non-darwin hosts
|
||||
// (e.g. Linux CI runners) where OpenClaw can't actually run but the rest
|
||||
// of the server should still come up. Returns a no-op runtime — any
|
||||
// OpenClaw API call hitting it will fail loudly at request time.
|
||||
if (
|
||||
process.env.NODE_ENV === 'test' ||
|
||||
process.env.BROWSEROS_SKIP_OPENCLAW === '1'
|
||||
) {
|
||||
return new UnsupportedPlatformTestRuntime(input.projectDir)
|
||||
}
|
||||
throw unsupportedPlatformError()
|
||||
|
||||
@@ -37,7 +37,7 @@ export interface GatewayChatTurnInput {
|
||||
|
||||
export class OpenClawGatewayChatClient {
|
||||
constructor(
|
||||
private readonly hostPort: number,
|
||||
private readonly getHostPort: () => number,
|
||||
private readonly getToken: () => Promise<string>,
|
||||
) {}
|
||||
|
||||
@@ -46,7 +46,7 @@ export class OpenClawGatewayChatClient {
|
||||
): Promise<ReadableStream<AgentStreamEvent>> {
|
||||
const token = await this.getToken()
|
||||
const response = await fetch(
|
||||
`http://127.0.0.1:${this.hostPort}/v1/chat/completions`,
|
||||
`http://127.0.0.1:${this.getHostPort()}/v1/chat/completions`,
|
||||
{
|
||||
method: 'POST',
|
||||
headers: {
|
||||
|
||||
@@ -126,17 +126,28 @@ export class Application {
|
||||
this.logStartupSummary()
|
||||
startSkillSync()
|
||||
|
||||
configureOpenClawService({
|
||||
browserosServerPort: this.config.serverPort,
|
||||
resourcesDir,
|
||||
vmCache: this.vmCacheConfig(),
|
||||
})
|
||||
.tryAutoStart()
|
||||
.catch((err) =>
|
||||
logger.warn('OpenClaw auto-start failed', {
|
||||
error: err instanceof Error ? err.message : String(err),
|
||||
}),
|
||||
)
|
||||
// OpenClaw is best-effort — a failure here must not crash the server.
|
||||
// The container runtime constructor throws synchronously on non-darwin
|
||||
// (e.g. Linux CI runners), and the .catch() on tryAutoStart() only
|
||||
// handles async throws inside auto-start. Wrap both in try/catch so the
|
||||
// process keeps running even when OpenClaw can't initialize at all.
|
||||
try {
|
||||
configureOpenClawService({
|
||||
browserosServerPort: this.config.serverPort,
|
||||
resourcesDir,
|
||||
vmCache: this.vmCacheConfig(),
|
||||
})
|
||||
.tryAutoStart()
|
||||
.catch((err) =>
|
||||
logger.warn('OpenClaw auto-start failed', {
|
||||
error: err instanceof Error ? err.message : String(err),
|
||||
}),
|
||||
)
|
||||
} catch (err) {
|
||||
logger.warn('OpenClaw configuration failed, continuing without it', {
|
||||
error: err instanceof Error ? err.message : String(err),
|
||||
})
|
||||
}
|
||||
|
||||
metrics.log('http_server.started', { version: VERSION })
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user