feat(eval): switch to Linux GitHub-hosted runner (#519)

* feat(eval): switch to ubuntu-latest runner, add OE-Clado config

- Switch workflow from self-hosted Mac Studio to ubuntu-latest
- Install BrowserOS Linux .deb in CI (no self-hosted runner needed)
- Add browseros-oe-clado-weekly.json config for orchestrator-executor
- Fix report chart to show date+time (not just date)
- Make BROWSEROS_BINARY configurable via env var

* feat(eval): add NopeCHA captcha solver extension to eval runs

- Auto-load NopeCHA extension in eval Chrome instances
- The extension works in both incognito and headless modes
- CI workflow downloads NopeCHA before eval
- extensions/ directory gitignored (downloaded at runtime)

* feat(eval): per-config concurrency — different configs run in parallel

* feat(eval): remove concurrency limit — all runs execute in parallel
This commit is contained in:
shivammittal274
2026-03-21 23:04:45 +05:30
committed by GitHub
parent ba7892322b
commit f157436e7d
5 changed files with 64 additions and 10 deletions

View File

@@ -11,22 +11,24 @@ on:
required: false
default: 'configs/browseros-agent-weekly.json'
concurrency:
group: eval-runner
cancel-in-progress: false
permissions:
contents: read
jobs:
eval:
runs-on: self-hosted
runs-on: ubuntu-latest
timeout-minutes: 360
steps:
- name: Checkout
uses: actions/checkout@v4
# Downloads the pinned BrowserOS Linux release and installs it on the runner.
# The version is declared once so the download URL and the package filename stay in sync.
- name: Install BrowserOS
  env:
    BROWSEROS_VERSION: v0.44.0.1
  run: |
    deb="BrowserOS_${BROWSEROS_VERSION}_amd64.deb"
    wget -q "https://github.com/browseros-ai/BrowserOS/releases/download/${BROWSEROS_VERSION}/${deb}"
    # Install through apt-get rather than plain `dpkg -i` so the package's
    # declared dependencies are resolved and installed as well.
    sudo apt-get update -qq
    sudo apt-get install -y "./${deb}"
    # Fail the step right here if the binary did not land on PATH.
    command -v browseros
    # The version probe itself stays best-effort, as before.
    browseros --version || echo "BrowserOS installed at $(command -v browseros)"
- name: Install Bun
uses: oven-sh/setup-bun@v2
with:
@@ -34,7 +36,14 @@ jobs:
- name: Install dependencies
working-directory: packages/browseros-agent
run: bun install
run: bun install --ignore-scripts && bun run build:agent-sdk
# Downloads the NopeCHA automation build and unpacks it into extensions/nopecha
# under the eval app directory.
# NOTE(review): the URL tracks `releases/latest`, so the extension version is not
# pinned and may differ between runs — consider pinning a release tag.
- name: Install captcha solver extension
  working-directory: packages/browseros-agent/apps/eval
  run: |
    mkdir -p extensions
    # -f makes curl exit non-zero on an HTTP error instead of saving the error
    # body as the "zip"; --retry covers transient network failures.
    curl -fsSL --retry 3 -o /tmp/nopecha.zip https://github.com/NopeCHALLC/nopecha-extension/releases/latest/download/chromium_automation.zip
    unzip -qo /tmp/nopecha.zip -d extensions/nopecha
- name: Run eval
working-directory: packages/browseros-agent/apps/eval
@@ -43,7 +52,7 @@ jobs:
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
BROWSEROS_BINARY: ${{ secrets.BROWSEROS_BINARY }}
BROWSEROS_BINARY: /usr/bin/browseros
EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
run: |
echo "Running eval with config: $EVAL_CONFIG"

View File

@@ -1,2 +1,3 @@
data/raw/
results/
extensions/

View File

@@ -0,0 +1,33 @@
{
"agent": {
"type": "orchestrator-executor",
"orchestrator": {
"provider": "openai-compatible",
"model": "accounts/fireworks/models/kimi-k2p5",
"apiKey": "FIREWORKS_API_KEY",
"baseUrl": "https://api.fireworks.ai/inference/v1"
},
"executor": {
"provider": "clado-action",
"model": "qwen3-vl-30b-a3b-instruct",
"apiKey": "",
"baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run"
}
},
"dataset": "../data/webbench-2of4-50.jsonl",
"num_workers": 10,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": false,
"headless": true
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1800000
}

View File

@@ -158,8 +158,8 @@ const runs: RunSummary[] = manifests
: 0
const date = m.uploadedAt
? m.uploadedAt.split('T')[0]
: m.runId.slice(0, 10)
? `${m.uploadedAt.split('T')[0]} ${m.uploadedAt.split('T')[1]?.slice(0, 5) || ''}`
: m.runId.slice(0, 15)
const model = m.agentConfig?.model || 'unknown'
const dataset = m.dataset || m.runId

View File

@@ -35,6 +35,10 @@ const BROWSEROS_BINARY =
'/Applications/BrowserOS.app/Contents/MacOS/BrowserOS'
const CONTROLLER_EXT_DIR = join(MONOREPO_ROOT, 'apps/controller-ext/dist')
// Directory of the unpacked NopeCHA captcha-solver extension, resolved relative
// to this module's own file location (three levels up, then extensions/nopecha).
// The directory may be absent; the launch code checks for it before loading it.
const CAPTCHA_EXT_DIR = join(
dirname(fileURLToPath(import.meta.url)),
'../../../extensions/nopecha',
)
export class BrowserOSAppManager {
private ports: EvalPorts
@@ -154,8 +158,15 @@ export class BrowserOSAppManager {
`--user-data-dir=${this.tempDir}`,
]
const extensions: string[] = []
if (this.loadExtensions && existsSync(CONTROLLER_EXT_DIR)) {
chromeArgs.push(`--load-extension=${CONTROLLER_EXT_DIR}`)
extensions.push(CONTROLLER_EXT_DIR)
}
if (existsSync(CAPTCHA_EXT_DIR)) {
extensions.push(CAPTCHA_EXT_DIR)
}
if (extensions.length > 0) {
chromeArgs.push(`--load-extension=${extensions.join(',')}`)
}
chromeArgs.push('about:blank')