mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
feat(eval): switch to Linux GitHub-hosted runner (#519)
* feat(eval): switch to ubuntu-latest runner, add OE-Clado config
  - Switch workflow from self-hosted Mac Studio to ubuntu-latest
  - Install BrowserOS Linux .deb in CI (no self-hosted runner needed)
  - Add browseros-oe-clado-weekly.json config for orchestrator-executor
  - Fix report chart to show date+time (not just date)
  - Make BROWSEROS_BINARY configurable via env var
* feat(eval): add NopeCHA captcha solver extension to eval runs
  - Auto-load NopeCHA extension in eval Chrome instances
  - Works in incognito + headless mode
  - CI workflow downloads NopeCHA before eval
  - extensions/ directory gitignored (downloaded at runtime)
* feat(eval): per-config concurrency — different configs run in parallel
* feat(eval): remove concurrency limit — all runs execute in parallel
This commit is contained in:
23
.github/workflows/eval-weekly.yml
vendored
23
.github/workflows/eval-weekly.yml
vendored
@@ -11,22 +11,24 @@ on:
|
||||
required: false
|
||||
default: 'configs/browseros-agent-weekly.json'
|
||||
|
||||
concurrency:
|
||||
group: eval-runner
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
eval:
|
||||
runs-on: self-hosted
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 360
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install BrowserOS
|
||||
run: |
|
||||
wget -q https://github.com/browseros-ai/BrowserOS/releases/download/v0.44.0.1/BrowserOS_v0.44.0.1_amd64.deb
|
||||
sudo dpkg -i BrowserOS_v0.44.0.1_amd64.deb
|
||||
browseros --version || echo "BrowserOS installed at $(which browseros)"
|
||||
|
||||
- name: Install Bun
|
||||
uses: oven-sh/setup-bun@v2
|
||||
with:
|
||||
@@ -34,7 +36,14 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
working-directory: packages/browseros-agent
|
||||
run: bun install
|
||||
run: bun install --ignore-scripts && bun run build:agent-sdk
|
||||
|
||||
- name: Install captcha solver extension
|
||||
working-directory: packages/browseros-agent/apps/eval
|
||||
run: |
|
||||
mkdir -p extensions
|
||||
curl -sL -o /tmp/nopecha.zip https://github.com/NopeCHALLC/nopecha-extension/releases/latest/download/chromium_automation.zip
|
||||
unzip -qo /tmp/nopecha.zip -d extensions/nopecha
|
||||
|
||||
- name: Run eval
|
||||
working-directory: packages/browseros-agent/apps/eval
|
||||
@@ -43,7 +52,7 @@ jobs:
|
||||
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
BROWSEROS_BINARY: ${{ secrets.BROWSEROS_BINARY }}
|
||||
BROWSEROS_BINARY: /usr/bin/browseros
|
||||
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
|
||||
run: |
|
||||
echo "Running eval with config: $EVAL_CONFIG"
|
||||
|
||||
@@ -1,2 +1,3 @@
|
||||
data/raw/
|
||||
results/
|
||||
extensions/
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"agent": {
|
||||
"type": "orchestrator-executor",
|
||||
"orchestrator": {
|
||||
"provider": "openai-compatible",
|
||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||
"apiKey": "FIREWORKS_API_KEY",
|
||||
"baseUrl": "https://api.fireworks.ai/inference/v1"
|
||||
},
|
||||
"executor": {
|
||||
"provider": "clado-action",
|
||||
"model": "qwen3-vl-30b-a3b-instruct",
|
||||
"apiKey": "",
|
||||
"baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run"
|
||||
}
|
||||
},
|
||||
"dataset": "../data/webbench-2of4-50.jsonl",
|
||||
"num_workers": 10,
|
||||
"restart_server_per_task": true,
|
||||
"browseros": {
|
||||
"server_url": "http://127.0.0.1:9110",
|
||||
"base_cdp_port": 9010,
|
||||
"base_server_port": 9110,
|
||||
"base_extension_port": 9310,
|
||||
"load_extensions": false,
|
||||
"headless": true
|
||||
},
|
||||
"graders": ["performance_grader"],
|
||||
"grader_api_key_env": "OPENROUTER_API_KEY",
|
||||
"grader_base_url": "https://openrouter.ai/api/v1",
|
||||
"grader_model": "openai/gpt-4.1",
|
||||
"timeout_ms": 1800000
|
||||
}
|
||||
@@ -158,8 +158,8 @@ const runs: RunSummary[] = manifests
|
||||
: 0
|
||||
|
||||
const date = m.uploadedAt
|
||||
? m.uploadedAt.split('T')[0]
|
||||
: m.runId.slice(0, 10)
|
||||
? `${m.uploadedAt.split('T')[0]} ${m.uploadedAt.split('T')[1]?.slice(0, 5) || ''}`
|
||||
: m.runId.slice(0, 15)
|
||||
|
||||
const model = m.agentConfig?.model || 'unknown'
|
||||
const dataset = m.dataset || m.runId
|
||||
|
||||
@@ -35,6 +35,10 @@ const BROWSEROS_BINARY =
|
||||
'/Applications/BrowserOS.app/Contents/MacOS/BrowserOS'
|
||||
|
||||
const CONTROLLER_EXT_DIR = join(MONOREPO_ROOT, 'apps/controller-ext/dist')
|
||||
const CAPTCHA_EXT_DIR = join(
|
||||
dirname(fileURLToPath(import.meta.url)),
|
||||
'../../../extensions/nopecha',
|
||||
)
|
||||
|
||||
export class BrowserOSAppManager {
|
||||
private ports: EvalPorts
|
||||
@@ -154,8 +158,15 @@ export class BrowserOSAppManager {
|
||||
`--user-data-dir=${this.tempDir}`,
|
||||
]
|
||||
|
||||
const extensions: string[] = []
|
||||
if (this.loadExtensions && existsSync(CONTROLLER_EXT_DIR)) {
|
||||
chromeArgs.push(`--load-extension=${CONTROLLER_EXT_DIR}`)
|
||||
extensions.push(CONTROLLER_EXT_DIR)
|
||||
}
|
||||
if (existsSync(CAPTCHA_EXT_DIR)) {
|
||||
extensions.push(CAPTCHA_EXT_DIR)
|
||||
}
|
||||
if (extensions.length > 0) {
|
||||
chromeArgs.push(`--load-extension=${extensions.join(',')}`)
|
||||
}
|
||||
|
||||
chromeArgs.push('about:blank')
|
||||
|
||||
Reference in New Issue
Block a user