mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-18 19:16:22 +00:00
* feat(eval): add claude-generated run report artifact * fix(eval): install claude code cli for CI evals * fix(eval): bypass claude code tool permissions * Eval metrics configs (#932) * feat(eval): add agisdk comparison metrics configs * fix(eval): keep cdp crashes from aborting run
153 lines
6.2 KiB
YAML
153 lines
6.2 KiB
YAML
# Weekly Eval workflow header: display name, triggers, and token permissions.
name: Weekly Eval

# Three triggers: a weekly cron, pushes to main that touch the server's agent
# or tools sources, and manual dispatch with an optional config override.
on:
  schedule:
    # 06:00 UTC every Saturday (day-of-week field = 6).
    - cron: "0 6 * * 6"
  push:
    branches:
      - main
    paths:
      - "packages/browseros-agent/apps/server/src/agent/**"
      - "packages/browseros-agent/apps/server/src/tools/**"
  workflow_dispatch:
    inputs:
      config:
        description: "Eval config file (relative to apps/eval/)"
        required: false
        default: "configs/legacy/browseros-agent-weekly.json"

# The workflow token only gets read access to repository contents.
permissions:
  contents: read
|
|
|
|
jobs:
  # The eval job; its steps are listed under `steps:`.
  eval:
    runs-on: ubuntu-latest
    # Job-level cap: 360 minutes (6 hours).
    timeout-minutes: 360

    steps:
|
|
# Check out the repository. The job token is deliberately not persisted into
# .git/config: the only other git operation visible in this workflow is an
# HTTPS clone of a separate repository, and the job goes on to run an
# autonomous browser agent plus downloaded third-party code on this runner.
- name: Checkout
  uses: actions/checkout@v4
  with:
    persist-credentials: false
|
|
|
|
# Download the BrowserOS .deb and install it on the runner.
- name: Install BrowserOS
  run: |
    # Rolling stable channel — see https://cdn.browseros.com/download/BrowserOS.deb
    wget -q -O BrowserOS.deb https://cdn.browseros.com/download/BrowserOS.deb
    # Install through apt rather than bare `dpkg -i` so that any dependencies
    # declared by the package are resolved and installed as well.
    sudo apt-get update
    sudo apt-get install -y ./BrowserOS.deb
    # Best-effort smoke check: print the version, or fall back to the path.
    browseros --version || echo "BrowserOS installed at $(which browseros)"
|
|
|
|
# Provision the Bun runtime (tracks the `latest` release of Bun).
- name: Install Bun
  uses: oven-sh/setup-bun@v2
  with:
    bun-version: "latest"

# Install the agent workspace's dependencies, passing --ignore-scripts to bun.
- name: Install dependencies
  run: |
    bun install --ignore-scripts
  working-directory: packages/browseros-agent
|
|
|
|
# Install the Claude Code CLI only when the selected eval config declares
# `agent.type === 'claude-code'`.
- name: Install Claude Code CLI
  working-directory: packages/browseros-agent/apps/eval
  env:
    # Manual-dispatch input when present, otherwise the default weekly config.
    EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
  run: |
    # Read agent.type out of the config. A missing or unparsable config is a
    # hard error here instead of being mistaken for "not a claude-code config".
    AGENT_TYPE="$(bun -e "const config = await Bun.file(process.env.EVAL_CONFIG).json(); console.log(config.agent?.type ?? '')")" || {
      echo "::error::Could not read agent.type from eval config: $EVAL_CONFIG"
      exit 1
    }
    if [ "$AGENT_TYPE" = "claude-code" ]; then
      npm install -g @anthropic-ai/claude-code@2.1.119
      echo "Claude Code CLI installed at $(command -v claude)"
      claude --version
    else
      echo "Eval config does not use Claude Code; skipping Claude Code CLI install"
    fi
|
|
|
|
# agisdk stays pinned so that a silent upstream release cannot shift task
# definitions or grader behavior. Bump it intentionally, together with a
# documented re-baseline.
- name: Install Python eval dependencies
  run: |
    pip install agisdk==0.3.5 requests

# Shallow-clone WebArena-Infinity into /tmp/webarena-infinity.
- name: Clone WebArena-Infinity
  run: |
    git clone --depth 1 https://github.com/web-arena-x/webarena-infinity.git /tmp/webarena-infinity

# Install xvfb from the distribution package index.
- name: Install xvfb
  run: |
    sudo apt-get update && sudo apt-get install -y xvfb
|
|
|
|
# Download the NopeCHA automation build and unpack it into
# extensions/nopecha under apps/eval.
- name: Install captcha solver extension
  working-directory: packages/browseros-agent/apps/eval
  run: |
    mkdir -p extensions
    # -f makes curl fail on HTTP errors instead of saving the error page as
    # the "zip"; -S keeps the error message visible despite -s; --retry
    # covers transient download failures.
    curl -fsSL --retry 3 -o /tmp/nopecha.zip https://github.com/NopeCHALLC/nopecha-extension/releases/latest/download/chromium_automation.zip
    unzip -qo /tmp/nopecha.zip -d extensions/nopecha
|
|
|
|
# Run the eval suite under a virtual X display, then export the run directory
# as EVAL_RUN_DIR for later steps. This step does not publish anything; the
# run is published to R2 by a separate step later in the job.
- name: Run eval
  working-directory: packages/browseros-agent/apps/eval
  env:
    FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
    OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
    AWS_REGION: ${{ secrets.AWS_REGION || 'us-west-2' }}
    AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
    CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
    NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
    BROWSEROS_BINARY: /usr/bin/browseros
    WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
    # OpenClaw container runtime is macOS-only; opt the Linux runner
    # into the no-op stub so the server can boot and the eval can run.
    BROWSEROS_SKIP_OPENCLAW: '1'
    # Manual-dispatch input when present, otherwise the default weekly config.
    EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
  run: |
    echo "Running eval with config: $EVAL_CONFIG"
    xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts suite --config "$EVAL_CONFIG"
    # Capture the run directory so report.html can be generated before the R2 publish step.
    # The lexically last summary.json path under results/ is taken as this run's summary.
    SUMMARY_PATH="$(find results -name summary.json -type f -print | sort | tail -n 1)"
    if [ -z "$SUMMARY_PATH" ]; then
      # Emit an error annotation so the failure is visible on the run summary page.
      echo "::error::No eval run summary found"
      exit 1
    fi
    RUN_DIR="$(dirname "$SUMMARY_PATH")"
    # Writing to GITHUB_ENV exposes EVAL_RUN_DIR to the steps that follow.
    echo "EVAL_RUN_DIR=$RUN_DIR" >> "$GITHUB_ENV"
|
|
|
|
# Build report.html inside the run directory named by EVAL_RUN_DIR.
# Runs only when every earlier step succeeded.
- name: Generate run analysis report
  if: success()
  env:
    CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
  working-directory: packages/browseros-agent/apps/eval
  run: |
    echo "Generating run report for $EVAL_RUN_DIR"
    bun scripts/generate-report.ts --input "$EVAL_RUN_DIR" --output "$EVAL_RUN_DIR/report.html"
|
|
|
|
# Publish the run directory (EVAL_RUN_DIR) with the eval CLI's `publish`
# command, targeting r2. Runs only when every earlier step succeeded.
- name: Publish eval run to R2
  if: success()
  working-directory: packages/browseros-agent/apps/eval
  run: |
    bun run src/index.ts publish --run "$EVAL_RUN_DIR" --target r2
  env:
    EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
    EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
    EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
    EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
    EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
|
|
|
|
# Run the weekly-report script, passing /tmp/eval-report.html as its argument.
# Best-effort: capped at 5 minutes, and a failure here does not fail the job.
- name: Generate trend report
  if: success()
  continue-on-error: true
  timeout-minutes: 5
  working-directory: packages/browseros-agent
  run: |
    bun apps/eval/scripts/weekly-report.ts /tmp/eval-report.html
  env:
    EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
    EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
    EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
    EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
    EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
|
|
|
|
# Attach /tmp/eval-report.html to the run as an artifact named after the run id.
# Runs only when no earlier step has failed the job.
- name: Upload trend report as artifact
  uses: actions/upload-artifact@v4
  if: success()
  with:
    path: /tmp/eval-report.html
    name: eval-report-${{ github.run_id }}
|
|
|
|
# Always runs, even after failures, so the logs are available for post-mortem.
# An empty or missing log directory is silently ignored.
- name: Upload server stderr logs (for post-mortem on startup failures)
  uses: actions/upload-artifact@v4
  if: always()
  with:
    path: /tmp/browseros-server-logs/
    name: browseros-server-logs-${{ github.run_id }}
    if-no-files-found: ignore
|