From f157436e7ddfc19497ba40dcb13ec048d910900c Mon Sep 17 00:00:00 2001 From: shivammittal274 <56757235+shivammittal274@users.noreply.github.com> Date: Sat, 21 Mar 2026 23:04:45 +0530 Subject: [PATCH] feat(eval): switch to Linux GitHub-hosted runner (#519) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(eval): switch to ubuntu-latest runner, add OE-Clado config - Switch workflow from self-hosted Mac Studio to ubuntu-latest - Install BrowserOS Linux .deb in CI (no self-hosted runner needed) - Add browseros-oe-clado-weekly.json config for orchestrator-executor - Fix report chart to show date+time (not just date) - Make BROWSEROS_BINARY configurable via env var * feat(eval): add NopeCHA captcha solver extension to eval runs - Auto-load NopeCHA extension in eval Chrome instances - Works in incognito + headless mode - CI workflow downloads NopeCHA before eval - extensions/ directory gitignored (downloaded at runtime) * feat(eval): per-config concurrency — different configs run in parallel * feat(eval): remove concurrency limit — all runs execute in parallel --- .github/workflows/eval-weekly.yml | 23 +++++++++---- packages/browseros-agent/apps/eval/.gitignore | 1 + .../configs/browseros-oe-clado-weekly.json | 33 +++++++++++++++++++ .../apps/eval/scripts/weekly-report.ts | 4 +-- .../eval/src/runner/browseros-app-manager.ts | 13 +++++++- 5 files changed, 64 insertions(+), 10 deletions(-) create mode 100644 packages/browseros-agent/apps/eval/configs/browseros-oe-clado-weekly.json diff --git a/.github/workflows/eval-weekly.yml b/.github/workflows/eval-weekly.yml index 4271cd3b4..bb69f26cc 100644 --- a/.github/workflows/eval-weekly.yml +++ b/.github/workflows/eval-weekly.yml @@ -11,22 +11,24 @@ on: required: false default: 'configs/browseros-agent-weekly.json' -concurrency: - group: eval-runner - cancel-in-progress: false - permissions: contents: read jobs: eval: - runs-on: self-hosted + runs-on: ubuntu-latest timeout-minutes: 360 steps: - name: Checkout uses: actions/checkout@v4 + - name: Install BrowserOS + run: | + wget -q https://github.com/browseros-ai/BrowserOS/releases/download/v0.44.0.1/BrowserOS_v0.44.0.1_amd64.deb + sudo dpkg -i BrowserOS_v0.44.0.1_amd64.deb + browseros --version || echo "BrowserOS installed at $(which browseros)" + - name: Install Bun uses: oven-sh/setup-bun@v2 with: @@ -34,7 +36,14 @@ jobs: - name: Install dependencies working-directory: packages/browseros-agent - run: bun install + run: bun install --ignore-scripts && bun run build:agent-sdk + + - name: Install captcha solver extension + working-directory: packages/browseros-agent/apps/eval + run: | + mkdir -p extensions + curl -sL -o /tmp/nopecha.zip https://github.com/NopeCHALLC/nopecha-extension/releases/latest/download/chromium_automation.zip + unzip -qo /tmp/nopecha.zip -d extensions/nopecha - name: Run eval working-directory: packages/browseros-agent/apps/eval @@ -43,7 +52,7 @@ jobs: OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - BROWSEROS_BINARY: ${{ secrets.BROWSEROS_BINARY }} + BROWSEROS_BINARY: /usr/bin/browseros EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }} run: | echo "Running eval with config: $EVAL_CONFIG" diff --git a/packages/browseros-agent/apps/eval/.gitignore b/packages/browseros-agent/apps/eval/.gitignore index 569b53e48..10ec97e8e 100644 --- a/packages/browseros-agent/apps/eval/.gitignore +++ b/packages/browseros-agent/apps/eval/.gitignore @@ -1,2 +1,3 @@ data/raw/ results/ +extensions/ diff --git a/packages/browseros-agent/apps/eval/configs/browseros-oe-clado-weekly.json b/packages/browseros-agent/apps/eval/configs/browseros-oe-clado-weekly.json new file mode 100644 index 000000000..d58ee680b --- /dev/null +++ b/packages/browseros-agent/apps/eval/configs/browseros-oe-clado-weekly.json @@ -0,0 +1,33 @@ +{ + "agent": { + "type": "orchestrator-executor", + "orchestrator": { + "provider": "openai-compatible", + "model": "accounts/fireworks/models/kimi-k2p5", + "apiKey": "FIREWORKS_API_KEY", + "baseUrl": "https://api.fireworks.ai/inference/v1" + }, + "executor": { + "provider": "clado-action", + "model": "qwen3-vl-30b-a3b-instruct", + "apiKey": "", + "baseUrl": "https://clado-ai--clado-browseros-action-actionmodel-generate.modal.run" + } + }, + "dataset": "../data/webbench-2of4-50.jsonl", + "num_workers": 10, + "restart_server_per_task": true, + "browseros": { + "server_url": "http://127.0.0.1:9110", + "base_cdp_port": 9010, + "base_server_port": 9110, + "base_extension_port": 9310, + "load_extensions": false, + "headless": true + }, + "graders": ["performance_grader"], + "grader_api_key_env": "OPENROUTER_API_KEY", + "grader_base_url": "https://openrouter.ai/api/v1", + "grader_model": "openai/gpt-4.1", + "timeout_ms": 1800000 +} diff --git a/packages/browseros-agent/apps/eval/scripts/weekly-report.ts b/packages/browseros-agent/apps/eval/scripts/weekly-report.ts index 97b196989..20a828038 100644 --- a/packages/browseros-agent/apps/eval/scripts/weekly-report.ts +++ b/packages/browseros-agent/apps/eval/scripts/weekly-report.ts @@ -158,8 +158,8 @@ const runs: RunSummary[] = manifests : 0 const date = m.uploadedAt - ? m.uploadedAt.split('T')[0] - : m.runId.slice(0, 10) + ? `${m.uploadedAt.split('T')[0]} ${m.uploadedAt.split('T')[1]?.slice(0, 5) || ''}` + : m.runId.slice(0, 15) const model = m.agentConfig?.model || 'unknown' const dataset = m.dataset || m.runId diff --git a/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts b/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts index 3a4a6c318..7898297a3 100644 --- a/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts +++ b/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts @@ -35,6 +35,10 @@ const BROWSEROS_BINARY = '/Applications/BrowserOS.app/Contents/MacOS/BrowserOS' const CONTROLLER_EXT_DIR = join(MONOREPO_ROOT, 'apps/controller-ext/dist') +const CAPTCHA_EXT_DIR = join( + dirname(fileURLToPath(import.meta.url)), + '../../../extensions/nopecha', +) export class BrowserOSAppManager { private ports: EvalPorts @@ -154,8 +158,15 @@ export class BrowserOSAppManager { `--user-data-dir=${this.tempDir}`, ] + const extensions: string[] = [] if (this.loadExtensions && existsSync(CONTROLLER_EXT_DIR)) { - chromeArgs.push(`--load-extension=${CONTROLLER_EXT_DIR}`) + extensions.push(CONTROLLER_EXT_DIR) + } + if (existsSync(CAPTCHA_EXT_DIR)) { + extensions.push(CAPTCHA_EXT_DIR) + } + if (extensions.length > 0) { + chromeArgs.push(`--load-extension=${extensions.join(',')}`) } chromeArgs.push('about:blank')