# Weekly Eval
#
# Runs the eval suite in packages/browseros-agent/apps/eval against an
# installed BrowserOS build, generates a per-run report, publishes the run to
# R2, and uploads a trend report plus server logs as workflow artifacts.
#
# Triggers:
#   - schedule: every Saturday at 06:00 UTC
#   - push to main that touches the server agent/tools sources
#   - manual dispatch, optionally with a different eval config
name: Weekly Eval

on:
  schedule:
    # Every Saturday at 06:00 UTC
    - cron: '0 6 * * 6'
  push:
    branches: [main]
    paths:
      - 'packages/browseros-agent/apps/server/src/agent/**'
      - 'packages/browseros-agent/apps/server/src/tools/**'
  workflow_dispatch:
    inputs:
      config:
        description: 'Eval config file (relative to apps/eval/)'
        required: false
        default: 'configs/legacy/browseros-agent-weekly.json'

permissions:
  contents: read

jobs:
  eval:
    runs-on: ubuntu-latest
    # Hard ceiling of 6 hours for the whole job.
    timeout-minutes: 360
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install BrowserOS
        run: |
          # Rolling stable channel — see https://cdn.browseros.com/download/BrowserOS.deb
          wget -q -O BrowserOS.deb https://cdn.browseros.com/download/BrowserOS.deb
          sudo dpkg -i BrowserOS.deb
          # Best-effort version print; falls back to the install path so the
          # step does not fail if `--version` returns non-zero.
          browseros --version || echo "BrowserOS installed at $(which browseros)"

      - name: Install Bun
        uses: oven-sh/setup-bun@v2
        with:
          # NOTE(review): unpinned, unlike agisdk and the Claude Code CLI
          # below — consider pinning if run-to-run reproducibility matters.
          bun-version: latest

      - name: Install dependencies
        working-directory: packages/browseros-agent
        run: bun install --ignore-scripts

      # Installs the Claude Code CLI only when the selected eval config
      # declares `agent.type === 'claude-code'`.
      - name: Install Claude Code CLI
        working-directory: packages/browseros-agent/apps/eval
        env:
          # Passed through env (not interpolated into the script) so the
          # dispatch input is never evaluated as shell code.
          EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
        run: |
          # A missing config would make the probe below exit non-zero and be
          # misreported as "does not use Claude Code"; fail fast instead.
          if [ ! -f "$EVAL_CONFIG" ]; then
            echo "Eval config not found: $EVAL_CONFIG" >&2
            exit 1
          fi

          if bun -e "const config = await Bun.file(process.env.EVAL_CONFIG).json(); process.exit(config.agent?.type === 'claude-code' ? 0 : 1)"; then
            npm install -g @anthropic-ai/claude-code@2.1.119
            echo "Claude Code CLI installed at $(command -v claude)"
            claude --version
          else
            echo "Eval config does not use Claude Code; skipping Claude Code CLI install"
          fi

      - name: Install Python eval dependencies
        # agisdk pinned so silent upstream releases can't shift task definitions
        # or grader behavior. Bump intentionally with a documented re-baseline.
        run: pip install agisdk==0.3.5 requests

      - name: Clone WebArena-Infinity
        run: git clone --depth 1 https://github.com/web-arena-x/webarena-infinity.git /tmp/webarena-infinity

      - name: Install xvfb
        run: sudo apt-get update && sudo apt-get install -y xvfb

      - name: Install captcha solver extension
        working-directory: packages/browseros-agent/apps/eval
        run: |
          mkdir -p extensions
          # -f: fail on HTTP errors instead of saving the error page as the
          # zip; -S: still print the error despite -s.
          # NOTE(review): `releases/latest` is a moving target — consider
          # pinning a release tag.
          curl -fsSL -o /tmp/nopecha.zip https://github.com/NopeCHALLC/nopecha-extension/releases/latest/download/chromium_automation.zip
          unzip -qo /tmp/nopecha.zip -d extensions/nopecha

      # Runs the suite under a virtual X display and exports EVAL_RUN_DIR for
      # the report and publish steps that follow.
      - name: Run eval
        working-directory: packages/browseros-agent/apps/eval
        env:
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          AWS_REGION: ${{ secrets.AWS_REGION || 'us-west-2' }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
          NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
          BROWSEROS_BINARY: /usr/bin/browseros
          WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
          # OpenClaw container runtime is macOS-only; opt the Linux runner
          # into the no-op stub so the server can boot and the eval can run.
          BROWSEROS_SKIP_OPENCLAW: '1'
          EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
        run: |
          echo "Running eval with config: $EVAL_CONFIG"
          xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts suite --config "$EVAL_CONFIG"

          # Capture the run directory so report.html can be generated before the R2 publish step.
          # The lexicographically last summary.json path under results/ is used.
          SUMMARY_PATH="$(find results -name summary.json -type f -print | sort | tail -n 1)"
          if [ -z "$SUMMARY_PATH" ]; then
            echo "No eval run summary found"
            exit 1
          fi
          RUN_DIR="$(dirname "$SUMMARY_PATH")"
          # Writing to GITHUB_ENV makes EVAL_RUN_DIR visible to later steps.
          echo "EVAL_RUN_DIR=$RUN_DIR" >> "$GITHUB_ENV"

      - name: Generate run analysis report
        if: success()
        working-directory: packages/browseros-agent/apps/eval
        env:
          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
        run: |
          echo "Generating run report for $EVAL_RUN_DIR"
          bun scripts/generate-report.ts --input "$EVAL_RUN_DIR" --output "$EVAL_RUN_DIR/report.html"

      - name: Publish eval run to R2
        if: success()
        working-directory: packages/browseros-agent/apps/eval
        env:
          EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
          EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
          EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
          EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
          EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
        run: bun run src/index.ts publish --run "$EVAL_RUN_DIR" --target r2

      # Non-blocking: a failure or 5-minute timeout here does not fail the job.
      - name: Generate trend report
        if: success()
        timeout-minutes: 5
        continue-on-error: true
        working-directory: packages/browseros-agent
        env:
          EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
          EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
          EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
          EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
          EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
        run: bun apps/eval/scripts/weekly-report.ts /tmp/eval-report.html

      - name: Upload trend report as artifact
        if: success()
        uses: actions/upload-artifact@v4
        with:
          name: eval-report-${{ github.run_id }}
          path: /tmp/eval-report.html

      # Runs even when earlier steps failed so the logs are available for
      # debugging; an absent log directory is not treated as an error.
      - name: Upload server stderr logs (for post-mortem on startup failures)
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: browseros-server-logs-${{ github.run_id }}
          path: /tmp/browseros-server-logs/
          if-no-files-found: ignore