ci: add timeout and continue-on-error for trend report step

chore: restore kimi-k2p5 as default eval config
chore: temp switch to opus 4.6 for eval run
2026-05-18 11:06:19 +00:00 · 2026-04-09 23:16:48 +05:30 · 2026-04-09 20:22:27 +05:30 · 2026-04-09 20:21:45 +05:30 · 2026-04-09 13:23:51 +05:30 · 2026-04-09 12:22:05 +05:30
122 changed files with 3040 additions and 3929 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -9,4 +9,6 @@ packages/browseros/chromium_patches/**/*.py linguist-generated
 scripts/*.py linguist-generated
 # Mark build directories as generated
 build/* linguist-generated
+# Mark eval/test framework as vendored so it's excluded from language stats
+packages/browseros-agent/apps/eval/** linguist-vendored
 docs/videos/** filter=lfs diff=lfs merge=lfs -text
--- a/.github/workflows/eval-weekly.yml
+++ b/.github/workflows/eval-weekly.yml
@@ -43,6 +43,12 @@ jobs:
        working-directory: packages/browseros-agent
        run: bun install --ignore-scripts && bun run build:agent-sdk

+      - name: Install Python eval dependencies
+        run: pip install agisdk requests
+
+      - name: Clone WebArena-Infinity
+        run: git clone --depth 1 https://github.com/web-arena-x/webarena-infinity.git /tmp/webarena-infinity
+
      - name: Install xvfb
        run: sudo apt-get update && sudo apt-get install -y xvfb

@@ -57,9 +63,11 @@ jobs:
        working-directory: packages/browseros-agent/apps/eval
        env:
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
+          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
          NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
          BROWSEROS_BINARY: /usr/bin/browseros
+          WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
          EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/browseros-agent-weekly.json' }}
        run: |
          echo "Running eval with config: $EVAL_CONFIG"
@@ -81,6 +89,8 @@ jobs:

      - name: Generate trend report
        if: success()
+        timeout-minutes: 5
+        continue-on-error: true
        working-directory: packages/browseros-agent
        env:
          EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
--- a/.github/workflows/release-server.yml
+++ b/.github/workflows/release-server.yml
@@ -0,0 +1,147 @@
+name: Release BrowserOS Server
+
+on:
+  workflow_dispatch:  # manual trigger only; the caller supplies the version to release
+    inputs:
+      version:
+        description: "Release version (e.g. 0.0.80)"
+        required: true
+        type: string
+
+concurrency:
+  group: release-server
+  cancel-in-progress: false  # never cancel a release run that is already in progress
+
+jobs:
+  release:
+    if: github.ref == 'refs/heads/main'  # the job is skipped unless dispatched from main
+    runs-on: ubuntu-latest
+    environment: release-core
+    permissions:
+      contents: write  # later steps push a tag and create/update a GitHub release
+    defaults:
+      run:
+        working-directory: packages/browseros-agent  # steps that need the repo root override this
+
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0  # full history and tags; the release-notes step walks tags and commits
+
+      - uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: "1.3.6"
+
+      - name: Install dependencies
+        run: bun ci
+
+      - name: Prepare production env file
+        run: cp apps/server/.env.production.example apps/server/.env.production
+
+      - name: Validate version  # fails the job unless the requested version equals apps/server/package.json
+        id: version  # later steps read steps.version.outputs.package_version / release_sha
+        env:
+          REQUESTED_VERSION: ${{ inputs.version }}
+        run: |
+          PACKAGE_VERSION=$(node -p "require('./apps/server/package.json').version")
+          echo "package_version=$PACKAGE_VERSION" >> "$GITHUB_OUTPUT"
+          echo "release_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"  # commit the release tag will point at
+
+          if [ "$PACKAGE_VERSION" != "$REQUESTED_VERSION" ]; then
+            echo "Requested version $REQUESTED_VERSION does not match apps/server/package.json ($PACKAGE_VERSION)"
+            exit 1
+          fi
+
+      - name: Build release artifacts
+        run: bun run build:server:ci
+
+      # Fail the release early if the build produced no server resource zips.
+      - name: Verify release artifacts
+        run: |
+          mapfile -t ZIP_FILES < <(find dist/prod/server -maxdepth 1 -type f -name 'browseros-server-resources-*.zip' | sort)
+          if [ "${#ZIP_FILES[@]}" -eq 0 ]; then
+            echo "No server release zip files were produced"
+            exit 1
+          fi
+          echo "Found release artifacts:"  # header printed once; printf re-applies its format per argument
+          printf '%s\n' "${ZIP_FILES[@]}"
+
+      # Writes /tmp/release-notes.md from server-related commits since the previous browseros-server tag.
+      - name: Generate release notes
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PACKAGE_VERSION: ${{ steps.version.outputs.package_version }}
+        run: |
+          SERVER_APP_PATH="packages/browseros-agent/apps/server"
+          SERVER_BUILD_DIR="packages/browseros-agent/scripts/build/server"
+          SERVER_BUILD_ENTRY="packages/browseros-agent/scripts/build/server.ts"
+          SERVER_RESOURCE_MANIFEST="packages/browseros-agent/scripts/build/config/server-prod-resources.json"
+          SERVER_WORKSPACE_PKG="packages/browseros-agent/package.json"
+          CURRENT_TAG="browseros-server-v$PACKAGE_VERSION"
+          # -x -F: drop only the exact current tag; dots in the version are literal, not regex wildcards.
+          PREV_TAG=$(git tag -l "browseros-server-v*" --sort=-v:refname | grep -vxF -- "$CURRENT_TAG" | head -n 1)
+          if [ -z "$PREV_TAG" ]; then
+            echo "Initial release of browseros-server." > /tmp/release-notes.md
+          else
+            # Only commits touching the server app, its build scripts/manifest, or the workspace package.json.
+            COMMITS=$(git log "$PREV_TAG"..HEAD --pretty=format:"%H" -- \
+              "$SERVER_APP_PATH" \
+              "$SERVER_BUILD_DIR" \
+              "$SERVER_BUILD_ENTRY" \
+              "$SERVER_RESOURCE_MANIFEST" \
+              "$SERVER_WORKSPACE_PKG")
+            if [ -z "$COMMITS" ]; then
+              echo "No notable changes." > /tmp/release-notes.md
+            else
+              echo "## What's Changed" > /tmp/release-notes.md
+              echo "" >> /tmp/release-notes.md
+              while IFS= read -r SHA; do
+                SUBJECT=$(git log -1 --pretty=format:"%s" "$SHA")
+                # Best-effort PR lookup: `|| true` keeps a failed API call from aborting the step under `bash -e`.
+                PR_NUM=$(gh api "/repos/${GITHUB_REPOSITORY}/commits/${SHA}/pulls" --jq '.[0].number // empty' 2>/dev/null || true)
+                if [ -n "$PR_NUM" ] && ! echo "$SUBJECT" | grep -qF "(#${PR_NUM})"; then
+                  echo "- ${SUBJECT} (#${PR_NUM})" >> /tmp/release-notes.md
+                else
+                  echo "- ${SUBJECT}" >> /tmp/release-notes.md
+                fi
+              done <<< "$COMMITS"
+            fi
+          fi
+        working-directory: ${{ github.workspace }}
+
+      # Tags the validated commit and publishes the zips; existing tags/releases are reused so re-runs succeed.
+      - name: Create GitHub release
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PACKAGE_VERSION: ${{ steps.version.outputs.package_version }}
+          RELEASE_SHA: ${{ steps.version.outputs.release_sha }}
+        run: |
+          TAG="browseros-server-v$PACKAGE_VERSION"
+          TITLE="BrowserOS Server - v$PACKAGE_VERSION"
+          mapfile -t ZIP_FILES < <(find packages/browseros-agent/dist/prod/server -maxdepth 1 -type f -name 'browseros-server-resources-*.zip' | sort)
+
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          if git rev-parse "$TAG" >/dev/null 2>&1; then
+            echo "Tag $TAG already exists, skipping tag creation"
+          else
+            git tag -a "$TAG" -m "browseros-server v$PACKAGE_VERSION" "$RELEASE_SHA"
+          fi
+          # Fixed-string match on the full ref name so dots in the version are not regex wildcards.
+          if git ls-remote --tags origin "$TAG" | grep -qF -- "refs/tags/$TAG"; then
+            echo "Tag $TAG already on remote, skipping push"
+          else
+            git push origin "$TAG"
+          fi
+
+          if gh release view "$TAG" >/dev/null 2>&1; then
+            echo "Release $TAG already exists, updating"
+            gh release edit "$TAG" --title "$TITLE" --notes-file /tmp/release-notes.md
+            gh release upload "$TAG" "${ZIP_FILES[@]}" --clobber
+          else
+            gh release create "$TAG" \
+              --title "$TITLE" \
+              --notes-file /tmp/release-notes.md \
+              "${ZIP_FILES[@]}"
+          fi
+        working-directory: ${{ github.workspace }}
--- a/README.md
+++ b/README.md
@@ -192,7 +192,7 @@ We'd love your help making BrowserOS better! See our [Contributing Guide](CONTRI

 BrowserOS is open source under the [AGPL-3.0 license](LICENSE).

-Copyright &copy; 2025 Felafax, Inc.
+Copyright &copy; 2026 Felafax, Inc.

 ## Stargazers

--- a/docs/features/ad-blocking.mdx
+++ b/docs/features/ad-blocking.mdx
@@ -3,13 +3,17 @@ title: "Ad Blocking"
 description: "BrowserOS supports full ad blocking with uBlock Origin"
 ---

-BrowserOS supports full ad blocking through [uBlock Origin](https://ublockorigin.com/), the most effective open-source ad blocker available.
+BrowserOS supports full ad blocking through [uBlock Origin](https://ublockorigin.com/), the most powerful open-source ad blocker available — the full extension, not the watered-down "Lite" version.

-## How It Works
+## Why BrowserOS?

-Chrome has been [phasing out support](https://developer.chrome.com/docs/extensions/develop/migrate/mv2-deprecation-timeline) for Manifest V2 extensions, which uBlock Origin relies on for its full blocking capabilities. We re-enabled Manifest V2 support in BrowserOS so uBlock Origin can run at full power.
+Chrome [killed support](https://developer.chrome.com/docs/extensions/develop/migrate/mv2-deprecation-timeline) for uBlock Origin by phasing out Manifest V2 extensions. The only option left on Chrome is "uBlock Origin Lite," a significantly weaker version that can't use advanced filtering rules.

-Install it from the Chrome Web Store: [uBlock Origin](https://chromewebstore.google.com/detail/ublock-origin/cjpalhdlnbpafiamejdnhcphjbkeiagm)
+**BrowserOS re-enabled full Manifest V2 support**, so you can install and run the original uBlock Origin at full power — the same extension Chrome no longer allows.
+
+<Card title="Install uBlock Origin" icon="shield-check" href="https://chromewebstore.google.com/detail/ublock-origin/cjpalhdlnbpafiamejdnhcphjbkeiagm">
+  Install the full uBlock Origin extension from the Chrome Web Store. Works on BrowserOS out of the box.
+</Card>

 ## BrowserOS vs Chrome

--- a/docs/onboarding.mdx
+++ b/docs/onboarding.mdx
@@ -42,6 +42,10 @@ Welcome to BrowserOS! Let's get you set up.

 ## You're all set!

+<Tip>
+**Block ads with uBlock Origin** — Chrome dropped support for the full uBlock Origin extension, but BrowserOS brought it back. [Install it from the Chrome Web Store](https://chromewebstore.google.com/detail/ublock-origin/cjpalhdlnbpafiamejdnhcphjbkeiagm) and browse ad-free. [Learn more →](/features/ad-blocking)
+</Tip>
+
 Explore what BrowserOS can do:

 <Columns cols={2}>
--- a/packages/browseros-agent/apps/agent/.env.example
+++ b/packages/browseros-agent/apps/agent/.env.example
@@ -15,9 +15,6 @@ VITE_PUBLIC_SENTRY_DSN=
 # BrowserOS API URL
 VITE_PUBLIC_BROWSEROS_API=https://api.browseros.com

-# Launch feature flags
-VITE_PUBLIC_KIMI_LAUNCH=false
-
 # GraphQL Schema Path (optional — falls back to schema/schema.graphql)
 GRAPHQL_SCHEMA_PATH=

--- a/packages/browseros-agent/apps/agent/CLAUDE.md
+++ b/packages/browseros-agent/apps/agent/CLAUDE.md
@@ -12,7 +12,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 |------|------------|---------|
 | Folders | kebab-case | `ai-settings/`, `jtbd-popup/`, `llm-hub/` |
 | React components (.tsx) | PascalCase | `AISettingsPage.tsx`, `SurveyHeader.tsx` |
-| Hooks (.ts) | camelCase with `use` prefix | `useRunWorkflow.ts`, `useVoiceInput.ts` |
+| Hooks (.ts) | camelCase with `use` prefix | `useVoiceInput.ts`, `useMessageTree.ts` |
 | Non-component files (.ts) | lowercase | `types.ts`, `models.ts`, `storage.ts` |

 ## Project Overview
--- a/packages/browseros-agent/apps/agent/components/sidebar/SettingsSidebar.tsx
+++ b/packages/browseros-agent/apps/agent/components/sidebar/SettingsSidebar.tsx
@@ -4,7 +4,6 @@ import {
  Bot,
  Compass,
  CreditCard,
-  GitBranch,
  MessageSquare,
  Palette,
  RotateCcw,
@@ -86,12 +85,6 @@ const primarySettingsSections: NavSection[] = [
        icon: CreditCard,
        feature: Feature.CREDITS_SUPPORT,
      },
-      {
-        name: 'Workflows',
-        to: '/workflows',
-        icon: GitBranch,
-        feature: Feature.WORKFLOW_SUPPORT,
-      },
    ],
  },
 ]
--- a/packages/browseros-agent/apps/agent/entrypoints/app/App.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/App.tsx
@@ -11,7 +11,6 @@ import { Onboarding } from '../onboarding/index/Onboarding'
 import { StepsLayout } from '../onboarding/steps/StepsLayout'
 import { AISettingsPage } from './ai-settings/AISettingsPage'
 import { ConnectMCP } from './connect-mcp/ConnectMCP'
-import { CreateGraphWrapper } from './create-graph/CreateGraphWrapper'
 import { CustomizationPage } from './customization/CustomizationPage'
 import { SurveyPage } from './jtbd-agent/SurveyPage'
 import { AuthLayout } from './layout/AuthLayout'
@@ -29,7 +28,6 @@ import { SearchProviderPage } from './search-provider/SearchProviderPage'
 import { SkillsPage } from './skills/SkillsPage'
 import { SoulPage } from './soul/SoulPage'
 import { UsagePage } from './usage/UsagePage'
-import { WorkflowsPageWrapper } from './workflows/WorkflowsPageWrapper'

 function getSurveyParams(): { maxTurns?: number; experimentId?: string } {
  const params = new URLSearchParams(window.location.search)
@@ -53,9 +51,7 @@ const OptionsRedirect: FC = () => {
    soul: '/home/soul',
    skills: '/home/skills',
    'jtbd-agent': '/settings/survey',
-    workflows: '/workflows',
    scheduled: '/scheduled',
-    'create-graph': '/workflows/create-graph',
  }

  const newPath = routeMap[path] || '/settings/ai'
@@ -90,7 +86,6 @@ export const App: FC = () => {

          {/* Primary nav routes */}
          <Route path="connect-apps" element={<ConnectMCP />} />
-          <Route path="workflows" element={<WorkflowsPageWrapper />} />
          <Route path="scheduled" element={<ScheduledTasksPage />} />
        </Route>

@@ -108,9 +103,6 @@ export const App: FC = () => {
          </Route>
        </Route>

-        {/* Full-screen without sidebar */}
-        <Route path="workflows/create-graph" element={<CreateGraphWrapper />} />
-
        {/* Onboarding routes - no sidebar, no auth required */}
        <Route path="onboarding">
          <Route index element={<Onboarding />} />
--- a/packages/browseros-agent/apps/agent/entrypoints/app/ai-settings/NewProviderDialog.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/ai-settings/NewProviderDialog.tsx
@@ -8,7 +8,7 @@ import {
  Loader2,
  XCircle,
 } from 'lucide-react'
-import { type FC, useEffect, useMemo, useState } from 'react'
+import { type FC, useEffect, useMemo, useRef, useState } from 'react'
 import { useForm } from 'react-hook-form'
 import { z } from 'zod/v3'
 import { Button } from '@/components/ui/button'
@@ -61,7 +61,6 @@ import {
  KIMI_API_KEY_GUIDE_CLICKED_EVENT,
  MODEL_SELECTED_EVENT,
 } from '@/lib/constants/analyticsEvents'
-import { useKimiLaunch } from '@/lib/feature-flags/useKimiLaunch'
 import {
  getDefaultBaseUrlForProviders,
  getProviderTemplate,
@@ -223,9 +222,9 @@ export const NewProviderDialog: FC<NewProviderDialogProps> = ({
  const [testResult, setTestResult] = useState<TestResult | null>(null)
  const [modelPickerOpen, setModelPickerOpen] = useState(false)
  const [modelSearch, setModelSearch] = useState('')
+  const modelListRef = useRef<HTMLDivElement>(null)
  const { supports } = useCapabilities()
  const { baseUrl: agentServerUrl } = useAgentServerUrl()
-  const kimiLaunch = useKimiLaunch()

  const filteredProviderTypeOptions = providerTypeOptions.filter((opt) => {
    if (opt.value === 'chatgpt-pro')
@@ -233,8 +232,6 @@ export const NewProviderDialog: FC<NewProviderDialogProps> = ({
    if (opt.value === 'github-copilot')
      return supports(Feature.GITHUB_COPILOT_SUPPORT)
    if (opt.value === 'qwen-code') return supports(Feature.QWEN_CODE_SUPPORT)
-    if (opt.value === 'moonshot')
-      return kimiLaunch || initialValues?.type === 'moonshot'
    if (opt.value === 'openai-compatible') {
      return supports(Feature.OPENAI_COMPATIBLE_SUPPORT)
    }
@@ -309,6 +306,9 @@ export const NewProviderDialog: FC<NewProviderDialogProps> = ({
    ? modelFuse.search(modelSearch).map((r) => r.item)
    : modelInfoList

+  const showCustomEntry =
+    modelSearch && !filteredModels.some((m) => m.modelId === modelSearch)
+
  // Handle provider type change (user-initiated via Select)
  const handleTypeChange = (newType: ProviderType) => {
    form.setValue('type', newType)
@@ -894,59 +894,96 @@ export const NewProviderDialog: FC<NewProviderDialogProps> = ({
                          <CommandInput
                            placeholder="Search models..."
                            value={modelSearch}
-                            onValueChange={setModelSearch}
+                            onValueChange={(v) => {
+                              setModelSearch(v)
+                              requestAnimationFrame(() => {
+                                modelListRef.current?.scrollTo(0, 0)
+                              })
+                            }}
                            onKeyDown={(e) => {
-                              if (
-                                e.key === 'Enter' &&
-                                modelSearch &&
-                                filteredModels.length === 0
-                              ) {
+                              if (e.key === 'Enter' && modelSearch) {
                                e.preventDefault()
+                                e.stopPropagation()
                                form.setValue('modelId', modelSearch)
                                track(MODEL_SELECTED_EVENT, {
                                  provider_type: watchedType,
                                  model_id: modelSearch,
-                                  is_custom_model: true,
+                                  is_custom_model: !modelInfoList.some(
+                                    (m) => m.modelId === modelSearch,
+                                  ),
                                })
                                setModelPickerOpen(false)
                                setModelSearch('')
                              }
                            }}
                          />
-                          <CommandList>
+                          <CommandList ref={modelListRef}>
                            <CommandEmpty>
                              No models found. Press Enter to use &quot;
                              {modelSearch}&quot;
                            </CommandEmpty>
-                            <CommandGroup>
-                              {filteredModels.map((model) => (
+                            {showCustomEntry && (
+                              <CommandGroup forceMount>
                                <CommandItem
-                                  key={model.modelId}
-                                  value={model.modelId}
+                                  forceMount
+                                  value={`custom:${modelSearch}`}
                                  onSelect={() => {
-                                    form.setValue('modelId', model.modelId)
+                                    form.setValue('modelId', modelSearch)
                                    track(MODEL_SELECTED_EVENT, {
                                      provider_type: watchedType,
-                                      model_id: model.modelId,
-                                      context_window: model.contextLength,
-                                      is_custom_model: false,
+                                      model_id: modelSearch,
+                                      is_custom_model: true,
                                    })
                                    setModelPickerOpen(false)
                                    setModelSearch('')
                                  }}
                                >
                                  <span className="flex-1 truncate">
-                                    {model.modelId}
+                                    {modelSearch}
                                  </span>
-                                  <span className="ml-2 shrink-0 rounded-md bg-muted px-1.5 py-0.5 font-mono text-[10px] text-muted-foreground">
-                                    {formatContextWindow(model.contextLength)}
-                                  </span>
-                                  {field.value === model.modelId && (
+                                  {field.value === modelSearch && (
                                    <Check className="ml-2 h-4 w-4 shrink-0" />
                                  )}
                                </CommandItem>
-                              ))}
-                            </CommandGroup>
+                              </CommandGroup>
+                            )}
+                            {filteredModels.length > 0 && (
+                              <CommandGroup>
+                                {filteredModels.map((model) => (
+                                  <CommandItem
+                                    key={model.modelId}
+                                    value={model.modelId}
+                                    onSelect={() => {
+                                      form.setValue('modelId', model.modelId)
+                                      track(MODEL_SELECTED_EVENT, {
+                                        provider_type: watchedType,
+                                        model_id: model.modelId,
+                                        context_window: model.contextLength,
+                                        is_custom_model: !modelInfoList.some(
+                                          (m) => m.modelId === model.modelId,
+                                        ),
+                                      })
+                                      setModelPickerOpen(false)
+                                      setModelSearch('')
+                                    }}
+                                  >
+                                    <span className="flex-1 truncate">
+                                      {model.modelId}
+                                    </span>
+                                    {model.contextLength > 0 && (
+                                      <span className="ml-2 shrink-0 rounded-md bg-muted px-1.5 py-0.5 font-mono text-[10px] text-muted-foreground">
+                                        {formatContextWindow(
+                                          model.contextLength,
+                                        )}
+                                      </span>
+                                    )}
+                                    {field.value === model.modelId && (
+                                      <Check className="ml-2 h-4 w-4 shrink-0" />
+                                    )}
+                                  </CommandItem>
+                                ))}
+                              </CommandGroup>
+                            )}
                          </CommandList>
                        </Command>
                      </PopoverContent>
--- a/packages/browseros-agent/apps/agent/entrypoints/app/ai-settings/ProviderCard.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/ai-settings/ProviderCard.tsx
@@ -2,7 +2,6 @@ import { Check, Loader2, Trash2 } from 'lucide-react'
 import type { FC } from 'react'
 import { Badge } from '@/components/ui/badge'
 import { Button } from '@/components/ui/button'
-import { useKimiLaunch } from '@/lib/feature-flags/useKimiLaunch'
 import { BrowserOSIcon, ProviderIcon } from '@/lib/llm-providers/providerIcons'
 import type { LlmProviderConfig } from '@/lib/llm-providers/types'
 import { cn } from '@/lib/utils'
@@ -30,7 +29,6 @@ export const ProviderCard: FC<ProviderCardProps> = ({
  isTesting = false,
 }) => {
  const inputId = `provider-${provider.id}`
-  const kimiLaunch = useKimiLaunch()

  return (
    <label
@@ -79,30 +77,21 @@ export const ProviderCard: FC<ProviderCardProps> = ({
            </Badge>
          )}
        </div>
-        {isBuiltIn && provider.type === 'browseros' && kimiLaunch && (
-          <span className="mb-1 inline-block rounded-full border border-orange-300/60 bg-orange-100/70 px-3 py-0.5 font-semibold text-orange-700 text-xs dark:border-orange-400/40 dark:bg-orange-500/15 dark:text-orange-300">
-            In partnership with Moonshot AI
-          </span>
-        )}
        <p className="truncate text-muted-foreground text-sm">
          {isBuiltIn ? (
-            kimiLaunch ? (
-              'Extended usage limits for the next 2 weeks!'
-            ) : (
-              <>
-                BrowserOS-hosted model with strict rate limits.{' '}
-                <a
-                  href="https://docs.browseros.com/features/bring-your-own-llm"
-                  target="_blank"
-                  rel="noopener noreferrer"
-                  className="underline hover:text-foreground"
-                  onClick={(e) => e.stopPropagation()}
-                >
-                  Bring your own key
-                </a>{' '}
-                for better performance.
-              </>
-            )
+            <>
+              BrowserOS-hosted model with strict rate limits.{' '}
+              <a
+                href="https://docs.browseros.com/features/bring-your-own-llm"
+                target="_blank"
+                rel="noopener noreferrer"
+                className="underline hover:text-foreground"
+                onClick={(e) => e.stopPropagation()}
+              >
+                Bring your own key
+              </a>{' '}
+              for better performance.
+            </>
          ) : provider.baseUrl ? (
            `${provider.modelId} • ${provider.baseUrl}`
          ) : (
--- a/packages/browseros-agent/apps/agent/entrypoints/app/ai-settings/ProviderTemplatesSection.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/ai-settings/ProviderTemplatesSection.tsx
@@ -7,7 +7,6 @@ import {
 } from '@/components/ui/collapsible'
 import { Feature } from '@/lib/browseros/capabilities'
 import { useCapabilities } from '@/lib/browseros/useCapabilities'
-import { useKimiLaunch } from '@/lib/feature-flags/useKimiLaunch'
 import {
  type ProviderTemplate,
  providerTemplates,
@@ -23,7 +22,6 @@ export const ProviderTemplatesSection: FC<ProviderTemplatesSectionProps> = ({
  onUseTemplate,
 }) => {
  const { supports } = useCapabilities()
-  const kimiLaunch = useKimiLaunch()

  const filteredTemplates = providerTemplates.filter((template) => {
    if (template.id === 'chatgpt-pro')
@@ -31,7 +29,6 @@ export const ProviderTemplatesSection: FC<ProviderTemplatesSectionProps> = ({
    if (template.id === 'github-copilot')
      return supports(Feature.GITHUB_COPILOT_SUPPORT)
    if (template.id === 'qwen-code') return supports(Feature.QWEN_CODE_SUPPORT)
-    if (template.id === 'moonshot') return kimiLaunch
    if (template.id === 'openai-compatible') {
      return supports(Feature.OPENAI_COMPATIBLE_SUPPORT)
    }
@@ -67,7 +64,6 @@ export const ProviderTemplatesSection: FC<ProviderTemplatesSectionProps> = ({
                <ProviderTemplateCard
                  key={template.id}
                  template={template}
-                  highlighted={template.id === 'moonshot'}
                  isNew={isNew}
                  onUseTemplate={onUseTemplate}
                />
--- a/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/CreateGraph.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/CreateGraph.tsx
@@ -1,484 +0,0 @@
-import { useChat } from '@ai-sdk/react'
-import { DefaultChatTransport, type UIMessage } from 'ai'
-import { compact } from 'es-toolkit/array'
-import type { FC, FormEvent } from 'react'
-import { useEffect, useRef, useState } from 'react'
-import { useSearchParams } from 'react-router'
-import useDeepCompareEffect from 'use-deep-compare-effect'
-import type { Provider } from '@/components/chat/chatComponentTypes'
-import {
-  AlertDialog,
-  AlertDialogAction,
-  AlertDialogCancel,
-  AlertDialogContent,
-  AlertDialogDescription,
-  AlertDialogFooter,
-  AlertDialogHeader,
-  AlertDialogTitle,
-} from '@/components/ui/alert-dialog'
-import {
-  ResizableHandle,
-  ResizablePanel,
-  ResizablePanelGroup,
-} from '@/components/ui/resizable'
-import { useChatRefs } from '@/entrypoints/sidepanel/index/useChatRefs'
-import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
-import {
-  GRAPH_SAVED_EVENT,
-  GRAPH_UPDATED_EVENT,
-  NEW_GRAPH_CREATED_EVENT,
-} from '@/lib/constants/analyticsEvents'
-import { useLlmProviders } from '@/lib/llm-providers/useLlmProviders'
-import { track } from '@/lib/metrics/track'
-import { useRpcClient } from '@/lib/rpc/RpcClientProvider'
-import { sentry } from '@/lib/sentry/sentry'
-import { useWorkflows } from '@/lib/workflows/workflowStorage'
-import { GraphCanvas } from './GraphCanvas'
-import { GraphChat } from './GraphChat'
-import { WorkflowsChatHeader } from './WorkflowsChatHeader'
-
-type MessageType = 'create-graph' | 'update-graph' | 'run-graph'
-
-type GraphMessageMetadata = {
-  messageType?: MessageType
-  codeId?: string
-  graph?: GraphData
-  window?: chrome.windows.Window
-}
-
-export type GraphData = {
-  nodes: {
-    id: string
-    type: string
-    data: {
-      label: string
-    }
-  }[]
-  edges: {
-    id: string
-    source: string
-    target: string
-  }[]
-}
-
-const getLastMessageText = (messages: UIMessage[]) => {
-  const lastMessage = messages[messages.length - 1]
-  if (!lastMessage) return ''
-  return lastMessage.parts
-    .filter((part) => part.type === 'text')
-    .map((part) => part.text)
-    .join('')
-}
-
-export const CreateGraph: FC = () => {
-  const [searchParams] = useSearchParams()
-  const workflowIdParam = searchParams.get('workflowId')
-
-  const [graphName, setGraphName] = useState('')
-  const [codeId, setCodeId] = useState<string | undefined>(undefined)
-  const [graphData, setGraphData] = useState<GraphData | undefined>(undefined)
-  const [savedWorkflowId, setSavedWorkflowId] = useState<string | undefined>(
-    undefined,
-  )
-  const [savedCodeId, setSavedCodeId] = useState<string | undefined>(undefined)
-  const [isInitialized, setIsInitialized] = useState(!workflowIdParam)
-  const [canvasPanelSize, setCanvasPanelSize] = useState<
-    { asPercentage: number; inPixels: number } | undefined
-  >(undefined)
-
-  const [query, setQuery] = useState('')
-  const [showDiscardDialog, setShowDiscardDialog] = useState(false)
-
-  const { workflows, addWorkflow, editWorkflow } = useWorkflows()
-  const { providers: llmProviders, setDefaultProvider } = useLlmProviders()
-  const rpcClient = useRpcClient()
-
-  // Initialize edit mode when workflowId is provided
-  useDeepCompareEffect(() => {
-    if (!workflowIdParam || isInitialized) return
-
-    const workflow = workflows.find((w) => w.id === workflowIdParam)
-    if (!workflow) return
-
-    const initializeEditMode = async () => {
-      setGraphName(workflow.workflowName)
-      setCodeId(workflow.codeId)
-      setSavedWorkflowId(workflow.id)
-      setSavedCodeId(workflow.codeId)
-
-      try {
-        const response = await rpcClient.graph[':id'].$get({
-          param: { id: workflow.codeId },
-        })
-
-        if (response.ok) {
-          const data = await response.json()
-          if ('graph' in data && data.graph) {
-            setGraphData(data.graph as GraphData)
-          }
-        }
-      } catch (error) {
-        sentry.captureException(error, {
-          extra: {
-            message: 'Failed to fetch graph data from the server',
-            codeId: workflow.codeId,
-          },
-        })
-      }
-
-      setIsInitialized(true)
-    }
-
-    initializeEditMode()
-  }, [workflowIdParam, workflows, isInitialized, rpcClient])
-
-  const updateQuery = (newQuery: string) => {
-    setQuery(newQuery)
-  }
-
-  const onSubmit = (e: FormEvent) => {
-    e.preventDefault()
-    if (codeId) {
-      sendMessage({
-        text: query,
-        metadata: {
-          messageType: 'update-graph' as MessageType,
-          codeId,
-        },
-      })
-      track(GRAPH_UPDATED_EVENT)
-    } else {
-      sendMessage({
-        text: query,
-        metadata: {
-          messageType: 'create-graph' as MessageType,
-        },
-      })
-      track(NEW_GRAPH_CREATED_EVENT)
-    }
-    setQuery('')
-  }
-
-  const {
-    baseUrl: agentServerUrl,
-    isLoading: _isLoadingAgentUrl,
-    error: agentUrlError,
-  } = useAgentServerUrl()
-
-  const {
-    selectedLlmProviderRef,
-    enabledMcpServersRef,
-    enabledCustomServersRef,
-    personalizationRef,
-    selectedLlmProvider,
-    isLoadingProviders,
-  } = useChatRefs()
-
-  const agentUrlRef = useRef(agentServerUrl)
-  const codeIdRef = useRef(codeId)
-
-  useEffect(() => {
-    agentUrlRef.current = agentServerUrl
-    codeIdRef.current = codeId
-  }, [agentServerUrl, codeId])
-
-  const { sendMessage, stop, status, messages, error, setMessages } = useChat({
-    transport: new DefaultChatTransport({
-      prepareSendMessagesRequest: async ({ messages }) => {
-        const lastMessage = messages[messages.length - 1]
-        const lastMessageText = getLastMessageText(messages)
-        const metadata = lastMessage.metadata as
-          | GraphMessageMetadata
-          | undefined
-
-        if (metadata?.messageType === 'create-graph') {
-          return {
-            api: `${agentUrlRef.current}/graph`,
-            body: {
-              query: lastMessageText,
-            },
-          }
-        }
-
-        if (metadata?.messageType === 'update-graph' && codeIdRef.current) {
-          return {
-            api: `${agentUrlRef.current}/graph/${codeIdRef.current}`,
-            body: {
-              query: lastMessageText,
-            },
-          }
-        }
-
-        if (metadata?.messageType === 'run-graph' && codeIdRef.current) {
-          const provider = selectedLlmProviderRef.current
-          const enabledMcpServers = enabledMcpServersRef.current
-          const customMcpServers = enabledCustomServersRef.current
-
-          return {
-            api: `${agentUrlRef.current}/graph/${codeIdRef.current}/run`,
-            body: {
-              provider: provider?.type,
-              providerType: provider?.type,
-              providerName: provider?.name,
-              model: provider?.modelId ?? 'browseros',
-              contextWindowSize: provider?.contextWindow,
-              temperature: provider?.temperature,
-              resourceName: provider?.resourceName,
-              // Bedrock-specific
-              accessKeyId: provider?.accessKeyId,
-              secretAccessKey: provider?.secretAccessKey,
-              region: provider?.region,
-              sessionToken: provider?.sessionToken,
-              apiKey: provider?.apiKey,
-              baseUrl: provider?.baseUrl,
-              browserContext: {
-                windowId: metadata?.window?.id,
-                activeTab: metadata?.window?.tabs?.[0],
-                enabledMcpServers: compact(enabledMcpServers),
-                customMcpServers,
-              },
-              userSystemPrompt: personalizationRef.current,
-            },
-          }
-        }
-
-        return {
-          api: `${agentUrlRef.current}/graph`,
-          body: {
-            query: lastMessageText,
-          },
-        }
-      },
-    }),
-  })
-
-  const lastAssistantMessageWithGraph = messages.findLast((m) => {
-    if (m.role !== 'assistant') return false
-    const metadata = m.metadata as GraphMessageMetadata | undefined
-    return metadata?.graph !== undefined
-  })
-
-  const onClickTest = async () => {
-    let backgroundWindow: chrome.windows.Window | undefined
-    try {
-      backgroundWindow = await chrome.windows.create({
-        url: 'chrome://newtab',
-        focused: true,
-        type: 'normal',
-      })
-    } catch {
-      // Fallback when no window context is available (e.g. all windows closed)
-      const tab = await chrome.tabs.create({
-        url: 'chrome://newtab',
-        active: true,
-      })
-      if (tab.windowId) {
-        backgroundWindow = await chrome.windows.get(tab.windowId)
-      }
-    }
-
-    sendMessage({
-      text: 'Run a test of the graph you just created.',
-      metadata: {
-        messageType: 'run-graph' as MessageType,
-        codeId,
-        window: backgroundWindow,
-      },
-    })
-  }
-
-  const hasUnsavedChanges = savedWorkflowId ? codeId !== savedCodeId : true
-  const shouldBlockNavigation = !!codeId && hasUnsavedChanges
-
-  // Handle browser refresh/close
-  useEffect(() => {
-    const handleBeforeUnload = (e: BeforeUnloadEvent) => {
-      if (shouldBlockNavigation) {
-        e.preventDefault()
-      }
-    }
-
-    window.addEventListener('beforeunload', handleBeforeUnload)
-    return () => window.removeEventListener('beforeunload', handleBeforeUnload)
-  }, [shouldBlockNavigation])
-
-  const onClickSave = async () => {
-    if (!graphName || !codeId) return
-
-    if (savedWorkflowId) {
-      await editWorkflow(savedWorkflowId, {
-        workflowName: graphName,
-        codeId,
-      })
-      setSavedCodeId(codeId)
-    } else {
-      const newWorkflow = await addWorkflow({
-        workflowName: graphName,
-        codeId,
-      })
-      setSavedWorkflowId(newWorkflow.id)
-      setSavedCodeId(codeId)
-    }
-    track(GRAPH_SAVED_EVENT)
-  }
-
-  // Provider data for header
-  const providers: Provider[] = llmProviders.map((p) => ({
-    id: p.id,
-    name: p.name,
-    type: p.type,
-  }))
-
-  const selectedProviderForHeader: Provider | undefined = selectedLlmProvider
-    ? {
-        id: selectedLlmProvider.id,
-        name: selectedLlmProvider.name,
-        type: selectedLlmProvider.type,
-      }
-    : providers[0]
-
-  // Has generated code but can't auto-save (no name)
-  const hasUnsavedWork = codeId && !graphName
-
-  const resetToNewWorkflow = () => {
-    setCodeId(undefined)
-    setGraphData(undefined)
-    setGraphName('')
-    setSavedWorkflowId(undefined)
-    setSavedCodeId(undefined)
-    setMessages([])
-  }
-
-  const handleSelectProvider = (provider: Provider) => {
-    setDefaultProvider(provider.id)
-  }
-
-  const handleNewWorkflow = async () => {
-    // Can auto-save: has name AND code
-    if (graphName && codeId) {
-      await onClickSave()
-      resetToNewWorkflow()
-      return
-    }
-
-    // Has unsaved work that can't be auto-saved: show confirmation
-    if (hasUnsavedWork) {
-      setShowDiscardDialog(true)
-      return
-    }
-
-    // Nothing to save, just reset
-    resetToNewWorkflow()
-  }
-
-  const handleConfirmDiscard = () => {
-    setShowDiscardDialog(false)
-    resetToNewWorkflow()
-  }
-
-  const handleSuggestionClick = (prompt: string) => {
-    sendMessage({
-      text: prompt,
-      metadata: {
-        messageType: 'create-graph' as MessageType,
-      },
-    })
-  }
-
-  useDeepCompareEffect(() => {
-    if (status === 'ready' && lastAssistantMessageWithGraph) {
-      const metadata = lastAssistantMessageWithGraph.metadata as
-        | GraphMessageMetadata
-        | undefined
-      setCodeId(metadata?.codeId)
-      setGraphData(metadata?.graph)
-    }
-  }, [status, lastAssistantMessageWithGraph ?? {}])
-
-  if (!isInitialized || isLoadingProviders || !selectedProviderForHeader) {
-    return (
-      <div className="flex h-screen w-screen items-center justify-center bg-background text-foreground">
-        <div className="fade-in animate-in text-muted-foreground duration-200 [animation-delay:300ms] [animation-fill-mode:backwards]">
-          Loading...
-        </div>
-      </div>
-    )
-  }
-
-  return (
-    <div className="h-screen w-screen bg-background text-foreground">
-      <ResizablePanelGroup orientation="horizontal">
-        <ResizablePanel
-          id="graph-canvas"
-          defaultSize={'70%'}
-          minSize={'30%'}
-          maxSize={'70%'}
-          onResize={(size) => setCanvasPanelSize(size)}
-        >
-          <GraphCanvas
-            graphName={graphName}
-            onGraphNameChange={(val) => setGraphName(val)}
-            graphData={graphData}
-            codeId={codeId}
-            onClickTest={onClickTest}
-            onClickSave={onClickSave}
-            isSaved={!!savedWorkflowId}
-            hasUnsavedChanges={hasUnsavedChanges}
-            shouldBlockNavigation={shouldBlockNavigation}
-            panelSize={canvasPanelSize}
-          />
-        </ResizablePanel>
-
-        <ResizableHandle withHandle />
-
-        <ResizablePanel
-          id="graph-chat"
-          defaultSize={'30%'}
-          maxSize={'70%'}
-          minSize={'30%'}
-        >
-          <div className="flex h-full flex-col">
-            <WorkflowsChatHeader
-              selectedProvider={selectedProviderForHeader}
-              providers={providers}
-              onSelectProvider={handleSelectProvider}
-              onNewWorkflow={handleNewWorkflow}
-              hasMessages={messages.length > 0}
-            />
-            <div className="min-h-0 flex-1">
-              <GraphChat
-                messages={messages}
-                onSubmit={onSubmit}
-                onInputChange={updateQuery}
-                onStop={stop}
-                input={query}
-                status={status}
-                agentUrlError={agentUrlError}
-                chatError={error}
-                onSuggestionClick={handleSuggestionClick}
-              />
-            </div>
-          </div>
-        </ResizablePanel>
-      </ResizablePanelGroup>
-
-      <AlertDialog open={showDiscardDialog} onOpenChange={setShowDiscardDialog}>
-        <AlertDialogContent>
-          <AlertDialogHeader>
-            <AlertDialogTitle>Discard unsaved workflow?</AlertDialogTitle>
-            <AlertDialogDescription>
-              You have an unsaved workflow. Creating a new one will discard your
-              current changes.
-            </AlertDialogDescription>
-          </AlertDialogHeader>
-          <AlertDialogFooter>
-            <AlertDialogCancel>Cancel</AlertDialogCancel>
-            <AlertDialogAction onClick={handleConfirmDiscard}>
-              Discard
-            </AlertDialogAction>
-          </AlertDialogFooter>
-        </AlertDialogContent>
-      </AlertDialog>
-    </div>
-  )
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/CreateGraphWrapper.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/CreateGraphWrapper.tsx
@@ -1,13 +0,0 @@
-import { type FC, Suspense } from 'react'
-import { RpcClientProvider } from '@/lib/rpc/RpcClientProvider'
-import { CreateGraph } from './CreateGraph'
-
-export const CreateGraphWrapper: FC = () => {
-  return (
-    <RpcClientProvider>
-      <Suspense fallback={<div className="h-screen w-screen bg-background" />}>
-        <CreateGraph />
-      </Suspense>
-    </RpcClientProvider>
-  )
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/CustomNode.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/CustomNode.tsx
@@ -1,140 +0,0 @@
-import { Handle, type Node, type NodeProps, Position } from '@xyflow/react'
-import {
-  CheckCircle,
-  Download,
-  GitBranch,
-  GitMerge,
-  MousePointer,
-  Navigation,
-  Play,
-  RotateCw,
-  Split,
-  Square,
-} from 'lucide-react'
-import type React from 'react'
-import { memo } from 'react'
-import { cn } from '@/lib/utils'
-
-const nodeConfig: Record<
-  NodeType,
-  { color: string; icon: React.ElementType; label: string }
-> = {
-  start: {
-    color: 'text-green-600 dark:text-green-400',
-    icon: Play,
-    label: 'Start',
-  },
-  end: {
-    color: 'text-red-600 dark:text-red-400',
-    icon: Square,
-    label: 'End',
-  },
-  nav: {
-    color: 'text-blue-600 dark:text-blue-400',
-    icon: Navigation,
-    label: 'Navigate',
-  },
-  act: {
-    color: 'text-purple-600 dark:text-purple-400',
-    icon: MousePointer,
-    label: 'Action',
-  },
-  extract: {
-    color: 'text-amber-600 dark:text-amber-400',
-    icon: Download,
-    label: 'Extract',
-  },
-  verify: {
-    color: 'text-emerald-600 dark:text-emerald-400',
-    icon: CheckCircle,
-    label: 'Verify',
-  },
-  decision: {
-    color: 'text-pink-600 dark:text-pink-400',
-    icon: GitBranch,
-    label: 'Decision',
-  },
-  loop: {
-    color: 'text-cyan-600 dark:text-cyan-400',
-    icon: RotateCw,
-    label: 'Loop',
-  },
-  fork: {
-    color: 'text-indigo-600 dark:text-indigo-400',
-    icon: Split,
-    label: 'Fork',
-  },
-  join: {
-    color: 'text-lime-600 dark:text-lime-400',
-    icon: GitMerge,
-    label: 'Join',
-  },
-}
-
-export type NodeType =
-  | 'start'
-  | 'end'
-  | 'nav'
-  | 'act'
-  | 'extract'
-  | 'verify'
-  | 'decision'
-  | 'loop'
-  | 'fork'
-  | 'join'
-
-type CustomNodeData = Node<{
-  type: NodeType
-  label: string
-}>
-
-export const CustomNode = memo(
-  ({ data: { label, type } }: NodeProps<CustomNodeData>) => {
-    const config = nodeConfig[type || 'start']
-    const Icon = config.icon
-
-    const showSourceHandle = type !== 'end'
-    const showTargetHandle = type !== 'start'
-
-    return (
-      <div className="min-w-45 rounded-lg border border-border bg-card px-4 py-3 shadow-md transition-all">
-        {showTargetHandle && (
-          <Handle
-            type="target"
-            position={Position.Top}
-            className="h-2 w-2 bg-accent-orange!"
-          />
-        )}
-
-        <div className="flex items-center gap-2">
-          <div className={cn('shrink-0', config.color)}>
-            <Icon className="h-5 w-5" />
-          </div>
-          <div className="min-w-0 flex-1">
-            <div
-              className={cn(
-                'mb-0.5 font-semibold text-xs uppercase tracking-wide',
-                config.color,
-              )}
-            >
-              {config.label}
-            </div>
-            <div className="wrap-break-word font-medium text-foreground text-sm">
-              {label}
-            </div>
-          </div>
-        </div>
-
-        {showSourceHandle && (
-          <Handle
-            type="source"
-            position={Position.Bottom}
-            className="h-2 w-2 bg-accent-orange!"
-          />
-        )}
-      </div>
-    )
-  },
-)
-
-CustomNode.displayName = 'CustomNode'
--- a/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/GraphCanvas.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/GraphCanvas.tsx
@@ -1,514 +0,0 @@
-import cytoscape from 'cytoscape'
-import dagre from 'cytoscape-dagre'
-// @ts-expect-error no types available
-import nodeHtmlLabel from 'cytoscape-node-html-label'
-import DOMPurify from 'dompurify'
-import {
-  ArrowLeft,
-  Maximize,
-  Minus,
-  Pencil,
-  Play,
-  Plus,
-  Save,
-} from 'lucide-react'
-import type { FC } from 'react'
-import { useCallback, useEffect, useRef, useState } from 'react'
-import { useNavigate } from 'react-router'
-import useDeepCompareEffect from 'use-deep-compare-effect'
-import ProductLogo from '@/assets/product_logo.svg'
-import { Button } from '@/components/ui/button'
-import {
-  Tooltip,
-  TooltipContent,
-  TooltipTrigger,
-} from '@/components/ui/tooltip'
-import type { GraphData } from './CreateGraph'
-import type { NodeType } from './CustomNode'
-
-cytoscape.use(dagre)
-nodeHtmlLabel(cytoscape)
-
-const NODE_CONFIG: Record<
-  NodeType,
-  { color: string; bgColor: string; icon: string; label: string }
-> = {
-  start: {
-    color: '#22c55e',
-    bgColor: 'rgba(34, 197, 94, 0.1)',
-    icon: `<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><polygon points="6 3 20 12 6 21 6 3"></polygon></svg>`,
-    label: 'START',
-  },
-  end: {
-    color: '#ef4444',
-    bgColor: 'rgba(239, 68, 68, 0.1)',
-    icon: `<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect width="18" height="18" x="3" y="3" rx="2"></rect></svg>`,
-    label: 'END',
-  },
-  nav: {
-    color: '#3b82f6',
-    bgColor: 'rgba(59, 130, 246, 0.1)',
-    icon: `<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><polygon points="3 11 22 2 13 21 11 13 3 11"></polygon></svg>`,
-    label: 'NAVIGATE',
-  },
-  act: {
-    color: '#8b5cf6',
-    bgColor: 'rgba(139, 92, 246, 0.1)',
-    icon: `<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="m4 4 7.07 17 2.51-7.39L21 11.07z"></path></svg>`,
-    label: 'ACTION',
-  },
-  extract: {
-    color: '#f59e0b',
-    bgColor: 'rgba(245, 158, 11, 0.1)',
-    icon: `<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"></path><polyline points="7 10 12 15 17 10"></polyline><line x1="12" x2="12" y1="15" y2="3"></line></svg>`,
-    label: 'EXTRACT',
-  },
-  verify: {
-    color: '#10b981',
-    bgColor: 'rgba(16, 185, 129, 0.1)',
-    icon: `<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M22 11.08V12a10 10 0 1 1-5.93-9.14"></path><polyline points="22 4 12 14.01 9 11.01"></polyline></svg>`,
-    label: 'VERIFY',
-  },
-  decision: {
-    color: '#ec4899',
-    bgColor: 'rgba(236, 72, 153, 0.1)',
-    icon: `<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><line x1="6" x2="6" y1="3" y2="15"></line><circle cx="18" cy="6" r="3"></circle><circle cx="6" cy="18" r="3"></circle><path d="M18 9a9 9 0 0 1-9 9"></path></svg>`,
-    label: 'DECISION',
-  },
-  loop: {
-    color: '#06b6d4',
-    bgColor: 'rgba(6, 182, 212, 0.1)',
-    icon: `<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21 12a9 9 0 1 1-9-9c2.52 0 4.93 1 6.74 2.74L21 8"></path><path d="M21 3v5h-5"></path></svg>`,
-    label: 'LOOP',
-  },
-  fork: {
-    color: '#6366f1',
-    bgColor: 'rgba(99, 102, 241, 0.1)',
-    icon: `<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M16 3h5v5"></path><path d="M8 3H3v5"></path><path d="M12 22v-8.3a4 4 0 0 0-1.172-2.872L3 3"></path><path d="m15 9 6-6"></path></svg>`,
-    label: 'FORK',
-  },
-  join: {
-    color: '#84cc16',
-    bgColor: 'rgba(132, 204, 22, 0.1)',
-    icon: `<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="18" cy="18" r="3"></circle><circle cx="6" cy="6" r="3"></circle><path d="M6 21V9a9 9 0 0 0 9 9"></path></svg>`,
-    label: 'JOIN',
-  },
-}
-
-const initialData: GraphData = {
-  nodes: [
-    {
-      id: 'start',
-      type: 'start',
-      data: { label: 'Use the Chat to build your workflow!' },
-    },
-  ],
-  edges: [],
-}
-
-const MIN_NODE_WIDTH = 180
-const MAX_NODE_WIDTH = 240
-const BASE_NODE_HEIGHT = 70
-const CHAR_WIDTH = 7
-const ICON_AND_PADDING = 62
-const MAX_ZOOM = 1.2
-
-const calculateNodeDimensions = (
-  label: string,
-): { width: number; height: number } => {
-  const textWidth = label.length * CHAR_WIDTH + ICON_AND_PADDING
-  const width = Math.max(MIN_NODE_WIDTH, Math.min(MAX_NODE_WIDTH, textWidth))
-
-  const maxCharsPerLine = Math.floor((width - ICON_AND_PADDING) / CHAR_WIDTH)
-  const lines = Math.ceil(label.length / maxCharsPerLine)
-  const extraHeight = Math.max(0, lines - 1) * 18
-  const height = BASE_NODE_HEIGHT + extraHeight
-
-  return { width, height }
-}
-
-const createNodeHtml = (type: NodeType, label: string): string => {
-  const config = NODE_CONFIG[type] || NODE_CONFIG.start
-  const sanitizedLabel = DOMPurify.sanitize(label, { ALLOWED_TAGS: [] })
-  return `
-    <div class="graph-node" style="
-      display: flex;
-      align-items: flex-start;
-      gap: 10px;
-      min-width: 160px;
-      max-width: 220px;
-      padding: 12px 16px;
-      background-color: var(--graph-node-bg);
-      border: 1px solid var(--graph-node-border);
-      border-radius: 10px;
-      box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
-      font-family: system-ui, -apple-system, sans-serif;
-    ">
-      <div style="
-        flex-shrink: 0;
-        color: ${config.color};
-        margin-top: 2px;
-      ">
-        ${config.icon}
-      </div>
-      <div style="flex: 1; min-width: 0;">
-        <div style="
-          font-size: 10px;
-          font-weight: 600;
-          letter-spacing: 0.05em;
-          color: ${config.color};
-          margin-bottom: 4px;
-        ">${config.label}</div>
-        <div style="
-          font-size: 13px;
-          font-weight: 500;
-          color: var(--graph-node-text);
-          line-height: 1.4;
-          word-wrap: break-word;
-        ">${sanitizedLabel}</div>
-      </div>
-    </div>
-  `
-}
-
-type GraphCanvasProps = {
-  graphName: string
-  onGraphNameChange: (name: string) => void
-  graphData?: GraphData
-  codeId?: string
-  onClickTest: () => unknown
-  onClickSave: () => unknown
-  isSaved: boolean
-  hasUnsavedChanges: boolean
-  shouldBlockNavigation: boolean
-  panelSize?: { asPercentage: number; inPixels: number }
-}
-
-export const GraphCanvas: FC<GraphCanvasProps> = ({
-  graphName,
-  onGraphNameChange,
-  graphData = initialData,
-  codeId,
-  onClickTest,
-  onClickSave,
-  isSaved,
-  hasUnsavedChanges,
-  shouldBlockNavigation,
-  panelSize,
-}) => {
-  const [isEditingName, setIsEditingName] = useState(false)
-  const navigate = useNavigate()
-  const containerRef = useRef<HTMLDivElement>(null)
-  const cyRef = useRef<cytoscape.Core | null>(null)
-
-  const handleBack = () => {
-    if (shouldBlockNavigation) {
-      const confirmed = window.confirm(
-        'You have unsaved changes. Are you sure you want to leave?',
-      )
-      if (!confirmed) return
-    }
-    navigate(-1)
-  }
-
-  const canTest = !!codeId
-  const canSave = !!graphName && !!codeId && hasUnsavedChanges
-
-  const getTestTooltip = () => {
-    if (!codeId) return 'Create a workflow using the chat first'
-    return 'Run a test of this workflow'
-  }
-
-  const getSaveTooltip = () => {
-    if (!codeId) return 'Create a workflow using the chat first'
-    if (!graphName) return 'Provide a name for the workflow'
-    if (isSaved && !hasUnsavedChanges) return 'Workflow already saved'
-    return isSaved ? 'Save changes to this workflow' : 'Save this workflow'
-  }
-
-  const getSaveButtonLabel = () => {
-    return isSaved ? 'Save Changes' : 'Save Workflow'
-  }
-
-  const zoomIn = useCallback(() => {
-    cyRef.current?.zoom(cyRef.current.zoom() * 1.2)
-    cyRef.current?.center()
-  }, [])
-
-  const zoomOut = useCallback(() => {
-    cyRef.current?.zoom(cyRef.current.zoom() / 1.2)
-    cyRef.current?.center()
-  }, [])
-
-  const fitView = useCallback(() => {
-    cyRef.current?.fit(undefined, 50)
-    cyRef.current?.center()
-  }, [])
-
-  useEffect(() => {
-    if (!containerRef.current) return
-
-    const cy = cytoscape({
-      container: containerRef.current,
-      elements: [],
-      style: [
-        {
-          selector: 'node',
-          style: {
-            width: 'data(nodeWidth)',
-            height: 'data(nodeHeight)',
-            'background-opacity': 0,
-            'border-width': 0,
-          },
-        },
-        {
-          selector: 'edge',
-          style: {
-            width: 2,
-            'line-color': '#f97316',
-            'target-arrow-color': '#f97316',
-            'target-arrow-shape': 'triangle',
-            'curve-style': 'bezier',
-            'arrow-scale': 1.2,
-          },
-        },
-        {
-          selector: 'edge.back-edge',
-          style: {
-            'line-style': 'dashed',
-            'line-dash-pattern': [6, 3],
-            'curve-style': 'unbundled-bezier',
-            'control-point-distances': [100],
-            'control-point-weights': [0.5],
-          },
-        },
-      ],
-      layout: { name: 'preset' },
-      userZoomingEnabled: true,
-      userPanningEnabled: true,
-      boxSelectionEnabled: false,
-      selectionType: 'single',
-      autoungrabify: true,
-      autounselectify: true,
-      maxZoom: MAX_ZOOM,
-      minZoom: 0.2,
-    })
-
-    // @ts-expect-error nodeHtmlLabel extension
-    cy.nodeHtmlLabel([
-      {
-        query: 'node',
-        halign: 'center',
-        valign: 'center',
-        halignBox: 'center',
-        valignBox: 'center',
-        tpl: (data: { type: NodeType; label: string }) => {
-          return createNodeHtml(data.type, data.label)
-        },
-      },
-    ])
-
-    cyRef.current = cy
-
-    return () => {
-      cy.destroy()
-    }
-  }, [])
-
-  const updateGraph = useCallback((data: GraphData) => {
-    const cy = cyRef.current
-    if (!cy) return
-
-    cy.elements().remove()
-
-    const nodes = data.nodes.map((node) => {
-      const dimensions = calculateNodeDimensions(node.data.label)
-      return {
-        data: {
-          id: node.id,
-          label: node.data.label,
-          type: node.type as NodeType,
-          nodeWidth: dimensions.width,
-          nodeHeight: dimensions.height,
-        },
-      }
-    })
-
-    const edges = data.edges.map((edge) => ({
-      data: {
-        id: edge.id,
-        source: edge.source,
-        target: edge.target,
-      },
-    }))
-
-    cy.add([...nodes, ...edges])
-
-    cy.layout({
-      name: 'dagre',
-      rankDir: 'TB',
-      nodeSep: 80,
-      rankSep: 100,
-      padding: 50,
-      animate: true,
-      animationDuration: 300,
-      fit: true,
-    } as cytoscape.LayoutOptions).run()
-
-    setTimeout(() => {
-      cy.edges().forEach((edge) => {
-        const sourceNode = edge.source()
-        const targetNode = edge.target()
-        const sourceY = sourceNode.position('y')
-        const targetY = targetNode.position('y')
-
-        if (sourceY > targetY) {
-          edge.addClass('back-edge')
-        }
-      })
-    }, 350)
-  }, [])
-
-  useDeepCompareEffect(() => {
-    updateGraph(graphData)
-  }, [graphData])
-
-  useEffect(() => {
-    if (panelSize?.inPixels !== undefined) {
-      cyRef.current?.resize()
-      setTimeout(() => fitView(), 100)
-    }
-  }, [panelSize?.inPixels, fitView])
-
-  return (
-    <div className="flex h-full flex-col [--graph-node-bg:rgba(255,255,255,1)] [--graph-node-border:rgba(228,228,231,1)] [--graph-node-text:rgba(24,24,27,1)] dark:[--graph-node-bg:rgba(24,24,27,1)] dark:[--graph-node-border:rgba(63,63,70,1)] dark:[--graph-node-text:rgba(250,250,250,1)]">
-      {/* Graph Header */}
-      <header className="flex h-14 shrink-0 items-center justify-between border-border/40 border-b bg-background/80 px-3 backdrop-blur-md">
-        <div className="flex min-w-0 flex-1 items-center gap-3">
-          <Button
-            variant="ghost"
-            size="icon"
-            className="h-8 w-8 shrink-0"
-            onClick={handleBack}
-          >
-            <ArrowLeft className="h-4 w-4" />
-          </Button>
-          <img src={ProductLogo} alt="BrowserOS" className="h-8 w-8 shrink-0" />
-          {isEditingName ? (
-            <input
-              type="text"
-              value={graphName}
-              onChange={(e) => onGraphNameChange(e.target.value)}
-              onBlur={() => setIsEditingName(false)}
-              onKeyDown={(e) => {
-                if (e.key === 'Enter') setIsEditingName(false)
-              }}
-              // biome-ignore lint/a11y/noAutofocus: needed to autofocus field when edit mode is toggled
-              autoFocus
-              placeholder="Enter workflow name..."
-              className="max-w-64 border-[var(--accent-orange)] border-b bg-transparent font-semibold text-sm outline-none placeholder:font-normal placeholder:text-muted-foreground/60"
-            />
-          ) : (
-            <Button
-              variant="ghost"
-              size="sm"
-              onClick={() => setIsEditingName(true)}
-              className="group min-w-0 gap-2 px-2 py-1"
-            >
-              {graphName ? (
-                <span className="truncate font-semibold text-sm">
-                  {graphName}
-                </span>
-              ) : (
-                <span className="text-muted-foreground/60 text-sm italic">
-                  Untitled workflow
-                </span>
-              )}
-              <Pencil className="h-3.5 w-3.5 shrink-0 text-muted-foreground opacity-0 transition-opacity group-hover:opacity-100" />
-            </Button>
-          )}
-        </div>
-
-        {/* Control Buttons */}
-        <div className="flex items-center gap-2">
-          <Tooltip>
-            <TooltipTrigger asChild>
-              <span>
-                <Button
-                  variant="secondary"
-                  size="sm"
-                  onClick={onClickTest}
-                  disabled={!canTest}
-                >
-                  <Play className="mr-1.5 h-4 w-4" />
-                  Test Workflow
-                </Button>
-              </span>
-            </TooltipTrigger>
-            <TooltipContent>{getTestTooltip()}</TooltipContent>
-          </Tooltip>
-          <Tooltip>
-            <TooltipTrigger asChild>
-              <span>
-                <Button
-                  size="sm"
-                  onClick={onClickSave}
-                  disabled={!canSave}
-                  className="bg-[var(--accent-orange)] shadow-lg shadow-orange-500/20 hover:bg-[var(--accent-orange-bright)] disabled:bg-[var(--accent-orange)]/50"
-                >
-                  <Save className="mr-1.5 h-4 w-4" />
-                  {getSaveButtonLabel()}
-                </Button>
-              </span>
-            </TooltipTrigger>
-            <TooltipContent>{getSaveTooltip()}</TooltipContent>
-          </Tooltip>
-        </div>
-      </header>
-
-      {/* Graph Canvas */}
-      <div className="relative min-h-0 flex-1 overflow-hidden [--dot-color:rgba(0,0,0,0.2)] dark:[--dot-color:rgba(255,255,255,0.15)]">
-        <div
-          ref={containerRef}
-          className="h-full w-full bg-zinc-50 dark:bg-zinc-900"
-          style={{
-            backgroundImage:
-              'radial-gradient(circle, var(--dot-color) 1.5px, transparent 1.5px)',
-            backgroundSize: '20px 20px',
-          }}
-        />
-
-        {/* Zoom Controls */}
-        <div className="absolute bottom-4 left-4 z-10 flex flex-col gap-1 rounded-lg border-2 border-border bg-card p-1">
-          <Button
-            variant="ghost"
-            size="icon"
-            className="h-8 w-8"
-            onClick={zoomIn}
-            title="Zoom in"
-          >
-            <Plus className="h-4 w-4" />
-          </Button>
-          <Button
-            variant="ghost"
-            size="icon"
-            className="h-8 w-8"
-            onClick={zoomOut}
-            title="Zoom out"
-          >
-            <Minus className="h-4 w-4" />
-          </Button>
-          <Button
-            variant="ghost"
-            size="icon"
-            className="h-8 w-8"
-            onClick={fitView}
-            title="Fit view"
-          >
-            <Maximize className="h-4 w-4" />
-          </Button>
-        </div>
-      </div>
-    </div>
-  )
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/GraphChat.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/GraphChat.tsx
@@ -1,194 +0,0 @@
-import type { UIMessage } from 'ai'
-import { Send, SquareStop } from 'lucide-react'
-import type { FC, FormEventHandler, KeyboardEvent } from 'react'
-import { useEffect, useRef, useState } from 'react'
-import { ChatError } from '@/entrypoints/sidepanel/index/ChatError'
-import { ChatMessages } from '@/entrypoints/sidepanel/index/ChatMessages'
-import { getResponseAndQueryFromMessageId } from '@/entrypoints/sidepanel/index/useChatSession'
-import {
-  GRAPH_MESSAGE_DISLIKE_EVENT,
-  GRAPH_MESSAGE_LIKE_EVENT,
-} from '@/lib/constants/analyticsEvents'
-import { useJtbdPopup } from '@/lib/jtbd-popup/useJtbdPopup'
-import { track } from '@/lib/metrics/track'
-import { cn } from '@/lib/utils'
-import { GraphEmptyState } from './GraphEmptyState'
-import { getWorkflowDisplayMessages } from './workflow-tidbit-messages'
-
-interface GraphChatProps {
-  onSubmit: FormEventHandler<HTMLFormElement>
-  onInputChange: (value: string) => void
-  onStop: () => void
-  input: string
-  status: 'streaming' | 'submitted' | 'ready' | 'error'
-  messages: UIMessage[]
-  chatError?: Error
-  agentUrlError?: Error | null
-  onSuggestionClick: (prompt: string) => void
-}
-
-export const GraphChat: FC<GraphChatProps> = ({
-  onSubmit,
-  onInputChange,
-  onStop,
-  input,
-  status,
-  messages,
-  chatError,
-  agentUrlError,
-  onSuggestionClick,
-}) => {
-  const [liked, setLiked] = useState<Record<string, boolean>>({})
-  const [disliked, setDisliked] = useState<Record<string, boolean>>({})
-  const [mounted, setMounted] = useState(false)
-  const displayMessages = getWorkflowDisplayMessages(messages)
-
-  useEffect(() => {
-    setMounted(true)
-  }, [])
-
-  const {
-    popupVisible,
-    recordMessageSent,
-    triggerIfEligible,
-    onTakeSurvey: onTakeSurveyBase,
-    onDismiss: onDismissJtbdPopup,
-  } = useJtbdPopup()
-
-  const onTakeSurvey = () =>
-    onTakeSurveyBase({ experimentId: 'workflow_survey' })
-
-  // Trigger JTBD popup when AI finishes responding
-  const previousChatStatus = useRef(status)
-  // biome-ignore lint/correctness/useExhaustiveDependencies: intentionally only trigger on status change
-  useEffect(() => {
-    const aiWasProcessing =
-      previousChatStatus.current === 'streaming' ||
-      previousChatStatus.current === 'submitted'
-    const aiJustFinished = aiWasProcessing && status === 'ready'
-
-    if (aiJustFinished && messages.length > 0) {
-      triggerIfEligible()
-    }
-    previousChatStatus.current = status
-  }, [status])
-
-  const onClickLike = (messageId: string) => {
-    const { responseText, queryText } = getResponseAndQueryFromMessageId(
-      messages,
-      messageId,
-    )
-
-    track(GRAPH_MESSAGE_LIKE_EVENT, { responseText, queryText, messageId })
-
-    setLiked((prev) => ({
-      ...prev,
-      [messageId]: !prev[messageId],
-    }))
-  }
-
-  const onClickDislike = (messageId: string, comment?: string) => {
-    const { responseText, queryText } = getResponseAndQueryFromMessageId(
-      messages,
-      messageId,
-    )
-
-    track(GRAPH_MESSAGE_DISLIKE_EVENT, {
-      responseText,
-      queryText,
-      messageId,
-      comment,
-    })
-
-    setDisliked((prev) => ({
-      ...prev,
-      [messageId]: !prev[messageId],
-    }))
-  }
-
-  const handleSubmit: FormEventHandler<HTMLFormElement> = (e) => {
-    recordMessageSent()
-    onSubmit(e)
-  }
-
-  const handleKeyDown = (e: KeyboardEvent<HTMLTextAreaElement>) => {
-    if (
-      e.key === 'Enter' &&
-      !e.shiftKey &&
-      !e.metaKey &&
-      !e.ctrlKey &&
-      !e.nativeEvent.isComposing
-    ) {
-      e.preventDefault()
-      if (input.trim()) {
-        e.currentTarget.form?.requestSubmit()
-      }
-    }
-  }
-
-  return (
-    <div className="flex h-full flex-col overflow-hidden">
-      <div className="styled-scrollbar min-h-0 flex-1 overflow-y-auto pb-2">
-        {displayMessages.length === 0 ? (
-          <GraphEmptyState
-            mounted={mounted}
-            onSuggestionClick={onSuggestionClick}
-          />
-        ) : (
-          <ChatMessages
-            liked={liked}
-            disliked={disliked}
-            onClickDislike={onClickDislike}
-            onClickLike={onClickLike}
-            messages={displayMessages}
-            status={status}
-            showJtbdPopup={popupVisible}
-            showDontShowAgain={false}
-            onTakeSurvey={onTakeSurvey}
-            onDismissJtbdPopup={onDismissJtbdPopup}
-          />
-        )}
-      </div>
-      {agentUrlError && <ChatError error={agentUrlError} />}
-      {chatError && <ChatError error={chatError} />}
-      <div className="shrink-0 border-border/40 border-t bg-background/80 p-2 backdrop-blur-md">
-        <form
-          onSubmit={handleSubmit}
-          className="relative flex w-full items-end gap-2"
-        >
-          <textarea
-            className={cn(
-              'field-sizing-content max-h-60 min-h-[42px] flex-1 resize-none overflow-hidden rounded-2xl border border-border/50 bg-muted/50 px-4 py-2.5 pr-11 text-sm outline-none transition-colors placeholder:text-muted-foreground/70 hover:border-border focus:border-[var(--accent-orange)]',
-            )}
-            value={input}
-            onChange={(e) => onInputChange(e.target.value)}
-            onKeyDown={handleKeyDown}
-            placeholder={
-              'Visit Amazon and add sensodyne toothpaste to the cart.'
-            }
-            rows={1}
-          />
-          {status === 'streaming' ? (
-            <button
-              type="button"
-              onClick={onStop}
-              className="absolute right-1.5 bottom-1.5 cursor-pointer rounded-full bg-red-600 p-2 text-white shadow-sm transition-all duration-200 hover:bg-red-900 disabled:cursor-not-allowed disabled:opacity-50"
-            >
-              <SquareStop className="h-3.5 w-3.5" />
-              <span className="sr-only">Stop</span>
-            </button>
-          ) : (
-            <button
-              type="submit"
-              disabled={!input.trim()}
-              className="absolute right-1.5 bottom-1.5 cursor-pointer rounded-full bg-[var(--accent-orange)] p-2 text-white shadow-sm transition-all duration-200 hover:bg-[var(--accent-orange-bright)] disabled:cursor-not-allowed disabled:opacity-50"
-            >
-              <Send className="h-3.5 w-3.5" />
-              <span className="sr-only">Send</span>
-            </button>
-          )}
-        </form>
-      </div>
-    </div>
-  )
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/GraphEmptyState.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/GraphEmptyState.tsx
@@ -1,77 +0,0 @@
-import { Workflow } from 'lucide-react'
-import type { FC } from 'react'
-import { cn } from '@/lib/utils'
-
-interface Suggestion {
-  display: string
-  prompt: string
-  icon: string
-}
-
-const WORKFLOW_SUGGESTIONS: Suggestion[] = [
-  {
-    display: 'Search Amazon and add toothpaste to cart',
-    prompt:
-      'Go to Amazon, search for toothpaste, select 1 pack filter and add the first result to cart',
-    icon: '🛒',
-  },
-  {
-    display: 'Accept LinkedIn connection requests',
-    prompt:
-      'Open LinkedIn and go to my connection requests, accept one by one in a loop for 25 times',
-    icon: '🤝',
-  },
-  {
-    display: 'Unsubscribe from Gmail subscriptions',
-    prompt:
-      'Go to Gmail, navigate to manage subscriptions and unsubscribe from all',
-    icon: '📧',
-  },
-]
-
-interface GraphEmptyStateProps {
-  mounted: boolean
-  onSuggestionClick: (prompt: string) => void
-}
-
-export const GraphEmptyState: FC<GraphEmptyStateProps> = ({
-  mounted,
-  onSuggestionClick,
-}) => {
-  return (
-    <div
-      className={cn(
-        'm-0! flex h-full flex-col items-center justify-center space-y-4 text-center opacity-0 transition-all duration-700',
-        mounted ? 'translate-y-0 opacity-100' : 'translate-y-4 opacity-0',
-      )}
-    >
-      <div className="mb-2 flex h-14 w-14 items-center justify-center rounded-2xl bg-muted/50">
-        <Workflow className="h-7 w-7 text-[var(--accent-orange)]" />
-      </div>
-      <div>
-        <h2 className="mb-1 font-semibold text-lg">
-          Create reliable workflows
-        </h2>
-        <p className="max-w-[240px] text-muted-foreground text-xs">
-          Chat with the agent to create and refine browser automation
-        </p>
-      </div>
-
-      <div className="mt-6 grid w-full max-w-[300px] grid-cols-1 gap-2">
-        {WORKFLOW_SUGGESTIONS.map((suggestion) => (
-          <button
-            type="button"
-            key={suggestion.display}
-            onClick={() => onSuggestionClick(suggestion.prompt)}
-            className="group flex items-center justify-between rounded-lg border border-border/50 bg-card px-3 py-2.5 text-left text-xs transition-all duration-200 hover:border-[var(--accent-orange)]/50 hover:bg-[var(--accent-orange)]/5"
-          >
-            {suggestion.display}
-            <span className="opacity-0 transition-opacity duration-200 group-hover:opacity-100">
-              {suggestion.icon}
-            </span>
-          </button>
-        ))}
-      </div>
-    </div>
-  )
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/WorkflowsChatHeader.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/WorkflowsChatHeader.tsx
@@ -1,92 +0,0 @@
-import { Github, Plus, SettingsIcon } from 'lucide-react'
-import type { FC } from 'react'
-import { ChatProviderSelector } from '@/components/chat/ChatProviderSelector'
-import type { Provider } from '@/components/chat/chatComponentTypes'
-import { ThemeToggle } from '@/components/elements/theme-toggle'
-import { productRepositoryUrl } from '@/lib/constants/productUrls'
-import { BrowserOSIcon, ProviderIcon } from '@/lib/llm-providers/providerIcons'
-import type { ProviderType } from '@/lib/llm-providers/types'
-
-interface WorkflowsChatHeaderProps {
-  selectedProvider: Provider
-  providers: Provider[]
-  onSelectProvider: (provider: Provider) => void
-  onNewWorkflow: () => void
-  hasMessages: boolean
-}
-
-export const WorkflowsChatHeader: FC<WorkflowsChatHeaderProps> = ({
-  selectedProvider,
-  providers,
-  onSelectProvider,
-  onNewWorkflow,
-  hasMessages,
-}) => {
-  return (
-    <header className="flex h-14 shrink-0 items-center justify-between border-border/40 border-b bg-background/80 px-3 backdrop-blur-md">
-      <div className="flex items-center gap-2">
-        <ChatProviderSelector
-          providers={providers}
-          selectedProvider={selectedProvider}
-          onSelectProvider={onSelectProvider}
-        >
-          <button
-            type="button"
-            className="group relative inline-flex cursor-pointer items-center gap-2 rounded-lg p-2 text-muted-foreground transition-colors hover:bg-muted/50 hover:text-foreground data-[state=open]:bg-accent"
-            title="Change AI Provider"
-          >
-            {selectedProvider.type === 'browseros' ? (
-              <BrowserOSIcon size={18} />
-            ) : (
-              <ProviderIcon
-                type={selectedProvider.type as ProviderType}
-                size={18}
-              />
-            )}
-            <span className="font-semibold text-base">
-              {selectedProvider.name}
-            </span>
-          </button>
-        </ChatProviderSelector>
-      </div>
-
-      <div className="flex items-center gap-1">
-        {hasMessages && (
-          <button
-            type="button"
-            onClick={onNewWorkflow}
-            className="cursor-pointer rounded-lg p-2 text-muted-foreground transition-colors hover:bg-muted/50 hover:text-foreground"
-            title="New workflow"
-          >
-            <Plus className="h-4 w-4" />
-          </button>
-        )}
-
-        <a
-          href={productRepositoryUrl}
-          target="_blank"
-          rel="noopener noreferrer"
-          className="cursor-pointer rounded-lg p-2 text-muted-foreground transition-colors hover:bg-muted/50 hover:text-foreground"
-          title="Star on Github"
-        >
-          <Github className="h-4 w-4" />
-        </a>
-
-        <a
-          href="/app.html#/settings"
-          target="_blank"
-          rel="noopener noreferrer"
-          className="cursor-pointer rounded-lg p-2 text-muted-foreground transition-colors hover:bg-muted/50 hover:text-foreground"
-          title="Settings"
-        >
-          <SettingsIcon className="h-4 w-4" />
-        </a>
-
-        <ThemeToggle
-          className="rounded-lg p-2 text-muted-foreground transition-colors hover:bg-muted/50 hover:text-foreground"
-          iconClassName="h-4 w-4"
-        />
-      </div>
-    </header>
-  )
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/workflow-tidbit-messages.ts
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/create-graph/workflow-tidbit-messages.ts
@@ -1,111 +0,0 @@
-import type { UIMessage } from 'ai'
-
-type MessagePart = UIMessage['parts'][number]
-
-const TIDBIT_SUFFIXES = ['...', '\u2026'] as const
-
-const isTextPart = (
-  part: MessagePart,
-): part is MessagePart & { type: 'text' } => part.type === 'text'
-
-const isTidbitLine = (line: string): boolean => {
-  const trimmed = line.trim()
-  if (trimmed.length === 0) return false
-  return TIDBIT_SUFFIXES.some((suffix) => trimmed.endsWith(suffix))
-}
-
-const getNonEmptyLines = (text: string): string[] =>
-  text.split('\n').filter((line) => line.trim().length > 0)
-
-const isAllTidbitText = (text: string): boolean => {
-  const lines = getNonEmptyLines(text)
-  return lines.length > 0 && lines.every((line) => isTidbitLine(line))
-}
-
-export const isWorkflowTidbitMessage = (message: UIMessage): boolean => {
-  if (message.role !== 'assistant') return false
-  if (message.parts.length === 0) return false
-  if (message.parts.some((part) => !isTextPart(part))) return false
-
-  const fullText = message.parts
-    .filter((part) => isTextPart(part))
-    .map((part) => part.text)
-    .join('')
-
-  return isAllTidbitText(fullText)
-}
-
-// within a text part that has multiple tidbit lines, keep only the last line
-const compactTidbitLinesInPart = (part: MessagePart): MessagePart => {
-  if (!isTextPart(part)) return part
-
-  const lines = getNonEmptyLines(part.text)
-  if (lines.length <= 1) return part
-  if (!lines.every((line) => isTidbitLine(line))) return part
-
-  return { ...part, text: lines[lines.length - 1] }
-}
-
-// collapse consecutive tidbit text parts within a single message
-const compactTidbitPartsInMessage = (message: UIMessage): UIMessage => {
-  if (message.role !== 'assistant') return message
-
-  // first compact multi-line tidbit text within each part
-  const lineCompactedParts = message.parts.map(compactTidbitLinesInPart)
-
-  // then collapse consecutive tidbit parts to just the last one
-  const compactedParts: UIMessage['parts'] = []
-  let pendingTidbitPart: (MessagePart & { type: 'text' }) | null = null
-
-  const flushPendingTidbitPart = () => {
-    if (!pendingTidbitPart) return
-    compactedParts.push(pendingTidbitPart)
-    pendingTidbitPart = null
-  }
-
-  for (const part of lineCompactedParts) {
-    if (isTextPart(part) && isAllTidbitText(part.text)) {
-      pendingTidbitPart = part
-      continue
-    }
-
-    flushPendingTidbitPart()
-    compactedParts.push(part)
-  }
-
-  flushPendingTidbitPart()
-
-  const partsChanged =
-    compactedParts.length !== message.parts.length ||
-    compactedParts.some((p, i) => p !== message.parts[i])
-
-  if (!partsChanged) return message
-
-  return { ...message, parts: compactedParts }
-}
-
-export const getWorkflowDisplayMessages = (
-  messages: UIMessage[],
-): UIMessage[] => {
-  // first compact tidbit parts within each message
-  const normalizedMessages = messages.map(compactTidbitPartsInMessage)
-  const compactedMessages: UIMessage[] = []
-
-  // then collapse consecutive tidbit-only messages
-  for (const message of normalizedMessages) {
-    const previousMessage = compactedMessages[compactedMessages.length - 1]
-    const shouldReplacePreviousTidbit =
-      previousMessage &&
-      isWorkflowTidbitMessage(previousMessage) &&
-      isWorkflowTidbitMessage(message)
-
-    if (shouldReplacePreviousTidbit) {
-      compactedMessages[compactedMessages.length - 1] = message
-      continue
-    }
-
-    compactedMessages.push(message)
-  }
-
-  return compactedMessages
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/llm-hub/HubProviderRow.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/llm-hub/HubProviderRow.tsx
@@ -2,8 +2,6 @@ import { Globe2, Trash2 } from 'lucide-react'
 import type { FC } from 'react'
 import { useMemo } from 'react'
 import { Button } from '@/components/ui/button'
-import { useKimiLaunch } from '@/lib/feature-flags/useKimiLaunch'
-import { cn } from '@/lib/utils'
 import { getFaviconUrl, type LlmHubProvider } from './models'

 interface HubProviderRowProps {
@@ -20,20 +18,9 @@ export const HubProviderRow: FC<HubProviderRowProps> = ({
  onDelete,
 }) => {
  const iconUrl = useMemo(() => getFaviconUrl(provider.url), [provider.url])
-  const kimiLaunch = useKimiLaunch()
-  const normalizedName = provider.name.trim().toLowerCase()
-  const normalizedUrl = provider.url.trim().toLowerCase()
-  const isKimi = normalizedName === 'kimi' || normalizedUrl.includes('kimi.com')
-  const showKimiFlare = isKimi && kimiLaunch

  return (
-    <div
-      className={cn(
-        'group flex w-full items-center gap-4 rounded-xl border border-border bg-card p-4 transition-all hover:border-[var(--accent-orange)] hover:shadow-md',
-        showKimiFlare &&
-          'border-orange-300/80 bg-orange-50/20 shadow-sm ring-1 ring-orange-300/45 dark:bg-orange-500/5',
-      )}
-    >
+    <div className="group flex w-full items-center gap-4 rounded-xl border border-border bg-card p-4 transition-all hover:border-[var(--accent-orange)] hover:shadow-md">
      <div className="flex h-10 w-10 shrink-0 items-center justify-center overflow-hidden rounded-lg bg-muted">
        {iconUrl ? (
          <img
@@ -49,16 +36,6 @@ export const HubProviderRow: FC<HubProviderRowProps> = ({
      <div className="min-w-0 flex-1">
        <div className="mb-0.5 flex items-center gap-2">
          <span className="block truncate font-semibold">{provider.name}</span>
-          {showKimiFlare && (
-            <div className="flex flex-wrap items-center gap-1">
-              <span className="rounded-full border border-orange-300/60 bg-orange-100/70 px-2 py-0.5 font-semibold text-[11px] text-orange-700 dark:border-orange-400/40 dark:bg-orange-500/15 dark:text-orange-300">
-                Recommended
-              </span>
-              <span className="rounded-full border border-orange-300/60 bg-orange-100/60 px-2.5 py-0.5 font-medium text-orange-700 text-xs dark:border-orange-400/40 dark:bg-orange-500/15 dark:text-orange-300">
-                Powered by Moonshot AI
-              </span>
-            </div>
-          )}
        </div>
        <p className="truncate text-muted-foreground/70 text-xs">
          {provider.url}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/scheduled-tasks/ScheduledTasksList.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/scheduled-tasks/ScheduledTasksList.tsx
@@ -28,7 +28,7 @@ export const ScheduledTasksList: FC<ScheduledTasksListProps> = ({
      <div className="rounded-xl border border-border bg-card p-6 shadow-sm">
        <div className="rounded-lg border border-border border-dashed py-8 text-center">
          <p className="text-muted-foreground text-sm">
-            No scheduled tasks yet. Create one to automate recurring workflows.
+            No scheduled tasks yet. Create one to automate recurring tasks.
          </p>
        </div>
      </div>
--- a/packages/browseros-agent/apps/agent/entrypoints/app/skills/SkillsPage.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/skills/SkillsPage.tsx
@@ -238,7 +238,7 @@ const EmptyState: FC<{ onCreateClick: () => void }> = ({ onCreateClick }) => (
      <h3 className="mb-1 font-medium text-lg">No skills yet</h3>
      <p className="mb-5 max-w-sm text-muted-foreground text-sm leading-6">
        Skills teach your agent how to handle repeatable tasks like research,
-        extraction, and structured workflows.
+        extraction, and repeatable browser tasks.
      </p>
      <Button onClick={onCreateClick} size="sm">
        <Plus className="mr-1.5 size-4" />
--- a/packages/browseros-agent/apps/agent/entrypoints/app/workflows/RunWorkflowDialog.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/workflows/RunWorkflowDialog.tsx
@@ -1,123 +0,0 @@
-import type { UIMessage } from 'ai'
-import { Loader2, RotateCcw, Square, X } from 'lucide-react'
-import type { FC } from 'react'
-import { Button } from '@/components/ui/button'
-import {
-  Dialog,
-  DialogContent,
-  DialogHeader,
-  DialogTitle,
-} from '@/components/ui/dialog'
-
-interface RunWorkflowDialogProps {
-  open: boolean
-  workflowName: string
-  messages: UIMessage[]
-  status: 'streaming' | 'submitted' | 'ready' | 'error'
-  wasCancelled: boolean
-  error: Error | undefined
-  onStop: () => void
-  onRetry: () => void
-  onClose: () => void
-}
-
-export const RunWorkflowDialog: FC<RunWorkflowDialogProps> = ({
-  open,
-  workflowName,
-  messages,
-  status,
-  wasCancelled,
-  error,
-  onStop,
-  onRetry,
-  onClose,
-}) => {
-  const isProcessing = status === 'streaming' || status === 'submitted'
-  const _isComplete = !isProcessing
-
-  const getStatusText = () => {
-    if (status === 'submitted') return 'Starting workflow...'
-    if (status === 'streaming') return 'Running...'
-    if (wasCancelled) return 'Execution cancelled'
-    if (status === 'error') return 'Error occurred'
-    return 'Completed'
-  }
-
-  const getMessageContent = (message: UIMessage) => {
-    return message.parts
-      .filter((part) => part.type === 'text')
-      .map((part) => part.text)
-      .join('')
-  }
-
-  const assistantMessages = messages.filter((m) => m.role === 'assistant')
-
-  return (
-    <Dialog open={open} onOpenChange={() => {}}>
-      <DialogContent
-        className="max-h-[80vh] max-w-2xl overflow-hidden [&>button]:hidden"
-        onInteractOutside={(e) => e.preventDefault()}
-        onEscapeKeyDown={(e) => e.preventDefault()}
-      >
-        <DialogHeader className="flex-row items-center justify-between space-y-0">
-          <DialogTitle className="flex items-center gap-2">
-            {isProcessing && (
-              <Loader2 className="h-4 w-4 animate-spin text-[var(--accent-orange)]" />
-            )}
-            Running: {workflowName}
-          </DialogTitle>
-          <div className="flex items-center gap-2">
-            {isProcessing ? (
-              <Button variant="destructive" size="sm" onClick={onStop}>
-                <Square className="mr-1.5 h-3 w-3" />
-                Stop
-              </Button>
-            ) : (
-              <>
-                <Button variant="secondary" size="sm" onClick={onRetry}>
-                  <RotateCcw className="mr-1.5 h-3 w-3" />
-                  Retry
-                </Button>
-                <Button variant="outline" size="sm" onClick={onClose}>
-                  <X className="mr-1.5 h-3 w-3" />
-                  Close
-                </Button>
-              </>
-            )}
-          </div>
-        </DialogHeader>
-
-        <div className="flex flex-col gap-2">
-          <div className="text-muted-foreground text-sm">{getStatusText()}</div>
-
-          {error && (
-            <div className="rounded-lg border border-destructive/50 bg-destructive/10 p-3 text-destructive text-sm">
-              <div className="font-medium">Error Details</div>
-              <div className="mt-1 whitespace-pre-wrap font-mono text-xs">
-                {error.message}
-              </div>
-            </div>
-          )}
-
-          <div className="max-h-[50vh] overflow-y-auto rounded-lg border border-border bg-muted/30 p-4">
-            {assistantMessages.length === 0 ? (
-              <div className="text-muted-foreground text-sm">
-                {isProcessing
-                  ? 'Waiting for response...'
-                  : 'No output available.'}
-              </div>
-            ) : (
-              <div className="space-y-4">
-                {assistantMessages.map((message) => (
-                  <div key={message.id} className="whitespace-pre-wrap text-sm">
-                    {getMessageContent(message)}
-                  </div>
-                ))}
-              </div>
-            )}
-          </div>
-        </div>
-      </DialogContent>
-    </Dialog>
-  )
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/workflows/WorkflowCard.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/workflows/WorkflowCard.tsx
@@ -1,51 +0,0 @@
-import { Pencil, Play, Trash2 } from 'lucide-react'
-import type { FC } from 'react'
-import { NavLink } from 'react-router'
-import { Button } from '@/components/ui/button'
-import type { Workflow } from '@/lib/workflows/workflowStorage'
-
-interface WorkflowCardProps {
-  workflow: Workflow
-  onDelete: () => void
-  onRun: () => void
-}
-
-export const WorkflowCard: FC<WorkflowCardProps> = ({
-  workflow,
-  onDelete,
-  onRun,
-}) => {
-  return (
-    <div className="rounded-xl border border-border bg-card p-4 shadow-sm transition-all hover:border-[var(--accent-orange)]/50 hover:shadow-sm">
-      <div className="flex items-center gap-4">
-        <div className="min-w-0 flex-1">
-          <span className="truncate font-semibold">
-            {workflow.workflowName}
-          </span>
-        </div>
-
-        <div className="flex shrink-0 items-center gap-2">
-          <Button variant="outline" size="sm" onClick={onRun}>
-            <Play className="mr-1.5 h-3 w-3" />
-            Run
-          </Button>
-          <Button asChild variant="outline" size="sm">
-            <NavLink to={`/workflows/create-graph?workflowId=${workflow.id}`}>
-              <Pencil className="mr-1.5 h-3 w-3" />
-              Edit
-            </NavLink>
-          </Button>
-          <Button
-            variant="ghost"
-            size="icon-sm"
-            onClick={onDelete}
-            className="text-muted-foreground hover:bg-destructive/10 hover:text-destructive"
-            aria-label={`Delete ${workflow.workflowName}`}
-          >
-            <Trash2 className="h-4 w-4" />
-          </Button>
-        </div>
-      </div>
-    </div>
-  )
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/workflows/WorkflowsHeader.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/workflows/WorkflowsHeader.tsx
@@ -1,56 +0,0 @@
-import { HelpCircle, Plus, Workflow } from 'lucide-react'
-import type { FC } from 'react'
-import { NavLink } from 'react-router'
-import { Button } from '@/components/ui/button'
-import {
-  Tooltip,
-  TooltipContent,
-  TooltipProvider,
-  TooltipTrigger,
-} from '@/components/ui/tooltip'
-import { workflowsHelpUrl } from '@/lib/constants/productUrls'
-
-export const WorkflowsHeader: FC = () => {
-  return (
-    <div className="rounded-xl border border-border bg-card p-6 shadow-sm transition-all hover:shadow-md">
-      <div className="flex items-start gap-4">
-        <div className="flex h-12 w-12 shrink-0 items-center justify-center rounded-xl bg-[var(--accent-orange)]/10">
-          <Workflow className="h-6 w-6 text-[var(--accent-orange)]" />
-        </div>
-        <div className="flex-1">
-          <div className="mb-1 flex items-center gap-2">
-            <h2 className="font-semibold text-xl">Workflows</h2>
-            <TooltipProvider delayDuration={0}>
-              <Tooltip>
-                <TooltipTrigger asChild>
-                  <a
-                    href={workflowsHelpUrl}
-                    target="_blank"
-                    rel="noopener noreferrer"
-                    className="rounded-full p-1 text-muted-foreground transition-colors hover:bg-muted hover:text-foreground"
-                  >
-                    <HelpCircle className="h-4 w-4" />
-                  </a>
-                </TooltipTrigger>
-                <TooltipContent>Learn more about workflows</TooltipContent>
-              </Tooltip>
-            </TooltipProvider>
-          </div>
-          <p className="text-muted-foreground text-sm">
-            Create and manage browser automation workflows
-          </p>
-        </div>
-        <Button
-          asChild
-          className="border-[var(--accent-orange)] bg-[var(--accent-orange)]/10 text-[var(--accent-orange)] hover:bg-[var(--accent-orange)]/20 hover:text-[var(--accent-orange)]"
-          variant="outline"
-        >
-          <NavLink to="/workflows/create-graph">
-            <Plus className="mr-1.5 h-4 w-4" />
-            New Workflow
-          </NavLink>
-        </Button>
-      </div>
-    </div>
-  )
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/workflows/WorkflowsList.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/workflows/WorkflowsList.tsx
@@ -1,40 +0,0 @@
-import type { FC } from 'react'
-import type { Workflow } from '@/lib/workflows/workflowStorage'
-import { WorkflowCard } from './WorkflowCard'
-
-interface WorkflowsListProps {
-  workflows: Workflow[]
-  onDelete: (workflowId: string) => void
-  onRun: (workflowId: string) => void
-}
-
-export const WorkflowsList: FC<WorkflowsListProps> = ({
-  workflows,
-  onDelete,
-  onRun,
-}) => {
-  if (workflows.length === 0) {
-    return (
-      <div className="rounded-xl border border-border bg-card p-6 shadow-sm">
-        <div className="rounded-lg border border-border border-dashed py-8 text-center">
-          <p className="text-muted-foreground text-sm">
-            No workflows yet. Create one to automate browser tasks.
-          </p>
-        </div>
-      </div>
-    )
-  }
-
-  return (
-    <div className="space-y-3">
-      {workflows.map((workflow) => (
-        <WorkflowCard
-          key={workflow.id}
-          workflow={workflow}
-          onDelete={() => onDelete(workflow.id)}
-          onRun={() => onRun(workflow.id)}
-        />
-      ))}
-    </div>
-  )
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/workflows/WorkflowsPage.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/workflows/WorkflowsPage.tsx
@@ -1,127 +0,0 @@
-import { type FC, useState } from 'react'
-import {
-  AlertDialog,
-  AlertDialogAction,
-  AlertDialogCancel,
-  AlertDialogContent,
-  AlertDialogDescription,
-  AlertDialogFooter,
-  AlertDialogHeader,
-  AlertDialogTitle,
-} from '@/components/ui/alert-dialog'
-import {
-  WORKFLOW_DELETED_EVENT,
-  WORKFLOW_RUN_STARTED_EVENT,
-} from '@/lib/constants/analyticsEvents'
-import { track } from '@/lib/metrics/track'
-import { useRpcClient } from '@/lib/rpc/RpcClientProvider'
-import { sentry } from '@/lib/sentry/sentry'
-import { useWorkflows } from '@/lib/workflows/workflowStorage'
-import { RunWorkflowDialog } from './RunWorkflowDialog'
-import { useRunWorkflow } from './useRunWorkflow'
-import { WorkflowsHeader } from './WorkflowsHeader'
-import { WorkflowsList } from './WorkflowsList'
-
-export const WorkflowsPage: FC = () => {
-  const { workflows, removeWorkflow } = useWorkflows()
-  const rpcClient = useRpcClient()
-
-  const [deleteWorkflowId, setDeleteWorkflowId] = useState<string | null>(null)
-
-  const {
-    isRunning,
-    runningWorkflowName,
-    messages,
-    status,
-    wasCancelled,
-    error,
-    runWorkflow,
-    stopRun,
-    retry,
-    closeDialog,
-  } = useRunWorkflow()
-
-  const handleDelete = (workflowId: string) => {
-    setDeleteWorkflowId(workflowId)
-  }
-
-  const confirmDelete = async () => {
-    if (!deleteWorkflowId) return
-
-    const workflow = workflows.find((w) => w.id === deleteWorkflowId)
-    if (!workflow) return
-
-    try {
-      await rpcClient.graph[':id'].$delete({ param: { id: workflow.codeId } })
-    } catch (error) {
-      sentry.captureException(error, {
-        extra: {
-          message: 'Failed to delete graph from server',
-          codeId: workflow.codeId,
-          workflowId: deleteWorkflowId,
-        },
-      })
-    }
-
-    await removeWorkflow(deleteWorkflowId)
-    setDeleteWorkflowId(null)
-    track(WORKFLOW_DELETED_EVENT)
-  }
-
-  const handleRun = (workflowId: string) => {
-    const workflow = workflows.find((w) => w.id === workflowId)
-    if (workflow) {
-      track(WORKFLOW_RUN_STARTED_EVENT)
-      runWorkflow(workflow.codeId, workflow.workflowName)
-    }
-  }
-
-  const workflowToDelete = deleteWorkflowId
-    ? workflows.find((w) => w.id === deleteWorkflowId)
-    : null
-
-  return (
-    <div className="fade-in slide-in-from-bottom-5 animate-in space-y-6 duration-500">
-      <WorkflowsHeader />
-
-      <WorkflowsList
-        workflows={workflows}
-        onDelete={handleDelete}
-        onRun={handleRun}
-      />
-
-      <AlertDialog
-        open={deleteWorkflowId !== null}
-        onOpenChange={(open) => !open && setDeleteWorkflowId(null)}
-      >
-        <AlertDialogContent>
-          <AlertDialogHeader>
-            <AlertDialogTitle>Delete Workflow</AlertDialogTitle>
-            <AlertDialogDescription>
-              Delete "{workflowToDelete?.workflowName}"? This action cannot be
-              undone.
-            </AlertDialogDescription>
-          </AlertDialogHeader>
-          <AlertDialogFooter>
-            <AlertDialogCancel>Cancel</AlertDialogCancel>
-            <AlertDialogAction onClick={confirmDelete}>
-              Delete
-            </AlertDialogAction>
-          </AlertDialogFooter>
-        </AlertDialogContent>
-      </AlertDialog>
-
-      <RunWorkflowDialog
-        open={isRunning}
-        workflowName={runningWorkflowName}
-        messages={messages}
-        status={status}
-        wasCancelled={wasCancelled}
-        error={error}
-        onStop={stopRun}
-        onRetry={retry}
-        onClose={closeDialog}
-      />
-    </div>
-  )
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/workflows/WorkflowsPageWrapper.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/workflows/WorkflowsPageWrapper.tsx
@@ -1,10 +0,0 @@
-import { type FC, Suspense } from 'react'
-import { WorkflowsPage } from './WorkflowsPage'
-
-export const WorkflowsPageWrapper: FC = () => {
-  return (
-    <Suspense fallback={<div className="h-screen w-screen bg-background" />}>
-      <WorkflowsPage />
-    </Suspense>
-  )
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/app/workflows/useRunWorkflow.ts
+++ b/packages/browseros-agent/apps/agent/entrypoints/app/workflows/useRunWorkflow.ts
@@ -1,167 +0,0 @@
-import { useChat } from '@ai-sdk/react'
-import { DefaultChatTransport } from 'ai'
-import { compact } from 'es-toolkit/array'
-import { useEffect, useRef, useState } from 'react'
-import { useChatRefs } from '@/entrypoints/sidepanel/index/useChatRefs'
-import { useAgentServerUrl } from '@/lib/browseros/useBrowserOSProviders'
-import {
-  WORKFLOW_RUN_COMPLETED_EVENT,
-  WORKFLOW_RUN_RETRIED_EVENT,
-  WORKFLOW_RUN_STOPPED_EVENT,
-} from '@/lib/constants/analyticsEvents'
-import { track } from '@/lib/metrics/track'
-
-type WorkflowMessageMetadata = {
-  window?: chrome.windows.Window
-}
-
-export const useRunWorkflow = () => {
-  const [isRunning, setIsRunning] = useState(false)
-  const [runningWorkflowName, setRunningWorkflowName] = useState<string>('')
-  const [wasCancelled, setWasCancelled] = useState(false)
-  const codeIdRef = useRef<string | undefined>(undefined)
-
-  const { baseUrl: agentServerUrl } = useAgentServerUrl()
-
-  const {
-    selectedLlmProviderRef,
-    enabledMcpServersRef,
-    enabledCustomServersRef,
-    personalizationRef,
-  } = useChatRefs()
-
-  const agentUrlRef = useRef(agentServerUrl)
-
-  useEffect(() => {
-    agentUrlRef.current = agentServerUrl
-  }, [agentServerUrl])
-
-  const { sendMessage, stop, status, messages, setMessages, error } = useChat({
-    transport: new DefaultChatTransport({
-      prepareSendMessagesRequest: async ({ messages }) => {
-        const lastMessage = messages[messages.length - 1]
-        const metadata = lastMessage.metadata as
-          | WorkflowMessageMetadata
-          | undefined
-        const provider = selectedLlmProviderRef.current
-        const enabledMcpServers = enabledMcpServersRef.current
-        const customMcpServers = enabledCustomServersRef.current
-
-        return {
-          api: `${agentUrlRef.current}/graph/${codeIdRef.current}/run`,
-          body: {
-            provider: provider?.type,
-            providerType: provider?.type,
-            providerName: provider?.name,
-            model: provider?.modelId ?? 'browseros',
-            contextWindowSize: provider?.contextWindow,
-            temperature: provider?.temperature,
-            resourceName: provider?.resourceName,
-            accessKeyId: provider?.accessKeyId,
-            secretAccessKey: provider?.secretAccessKey,
-            region: provider?.region,
-            sessionToken: provider?.sessionToken,
-            apiKey: provider?.apiKey,
-            baseUrl: provider?.baseUrl,
-            browserContext: {
-              windowId: metadata?.window?.id,
-              activeTab: metadata?.window?.tabs?.[0],
-              enabledMcpServers: compact(enabledMcpServers),
-              customMcpServers,
-            },
-            userSystemPrompt: personalizationRef.current,
-            supportsImages: provider?.supportsImages,
-          },
-        }
-      },
-    }),
-  })
-
-  const previousStatus = useRef(status)
-  useEffect(() => {
-    const wasProcessing =
-      previousStatus.current === 'streaming' ||
-      previousStatus.current === 'submitted'
-    const justFinished =
-      wasProcessing && (status === 'ready' || status === 'error')
-
-    if (justFinished && isRunning) {
-      track(WORKFLOW_RUN_COMPLETED_EVENT, {
-        status: wasCancelled
-          ? 'cancelled'
-          : status === 'error'
-            ? 'failed'
-            : 'completed',
-      })
-    }
-    previousStatus.current = status
-  }, [status, isRunning, wasCancelled])
-
-  const startWorkflowRun = async () => {
-    setMessages([])
-    setWasCancelled(false)
-
-    let backgroundWindow: chrome.windows.Window | undefined
-    try {
-      backgroundWindow = await chrome.windows.create({
-        url: 'chrome://newtab',
-        focused: true,
-        type: 'normal',
-      })
-    } catch {
-      // Fallback when no window context is available (e.g. all windows closed)
-      const tab = await chrome.tabs.create({
-        url: 'chrome://newtab',
-        active: true,
-      })
-      if (tab.windowId) {
-        backgroundWindow = await chrome.windows.get(tab.windowId)
-      }
-    }
-
-    sendMessage({
-      text: 'Run the workflow.',
-      metadata: {
-        window: backgroundWindow,
-      },
-    })
-  }
-
-  const runWorkflow = async (codeId: string, workflowName: string) => {
-    codeIdRef.current = codeId
-    setRunningWorkflowName(workflowName)
-    setIsRunning(true)
-    await startWorkflowRun()
-  }
-
-  const stopRun = () => {
-    track(WORKFLOW_RUN_STOPPED_EVENT)
-    setWasCancelled(true)
-    stop()
-  }
-
-  const retry = async () => {
-    track(WORKFLOW_RUN_RETRIED_EVENT)
-    await startWorkflowRun()
-  }
-
-  const closeDialog = () => {
-    setIsRunning(false)
-    setRunningWorkflowName('')
-    setWasCancelled(false)
-    setMessages([])
-  }
-
-  return {
-    isRunning,
-    runningWorkflowName,
-    messages,
-    status,
-    wasCancelled,
-    error,
-    runWorkflow,
-    stopRun,
-    retry,
-    closeDialog,
-  }
-}
--- a/packages/browseros-agent/apps/agent/entrypoints/newtab/index/tips.ts
+++ b/packages/browseros-agent/apps/agent/entrypoints/newtab/index/tips.ts
@@ -45,7 +45,7 @@ export const TIPS: Tip[] = [
  },
  {
    id: 'mcp-servers',
-    text: 'Add MCP servers for Google Calendar, Gmail, Notion, and more to build multi-service workflows.',
+    text: 'Add MCP servers for Google Calendar, Gmail, Notion, and more to power multi-service automations.',
  },
  {
    id: 'skills',
@@ -75,10 +75,6 @@ export const TIPS: Tip[] = [
    id: 'at-mention-tabs',
    text: 'Type @ in the search bar to mention and attach open tabs as context for your AI queries.',
  },
-  {
-    id: 'workflows',
-    text: 'For complex repeatable tasks, build visual Workflows instead of one-off prompts for consistent results.',
-  },
  {
    id: 'mode-selection',
    text: 'Use Chat mode for read-only operations like questions and summaries, and Agent mode for multi-step browser tasks.',
--- a/packages/browseros-agent/apps/agent/entrypoints/onboarding/features/Features.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/onboarding/features/Features.tsx
@@ -5,7 +5,6 @@ import {
  Bot,
  Code2,
  FolderOpen,
-  GitBranch,
  LinkIcon,
  Plug,
  SplitSquareHorizontal,
@@ -23,7 +22,6 @@ import {
  COWORK_DEMO_URL,
  MCP_SERVER_DEMO_URL,
  SPLIT_VIEW_GIF_URL,
-  WORKFLOWS_DEMO_URL,
 } from '@/lib/constants/mediaUrls'
 import {
  discordUrl,
@@ -44,7 +42,7 @@ const features: Feature[] = [
    description:
      'Describe any task and watch BrowserOS execute it—clicking, typing, and navigating for you.',
    detailedDescription:
-      'The BrowserOS Agent turns your words into browser actions. Describe what you need in plain English—fill out this form, extract data from that page, navigate through these steps—and the agent handles the rest. It clicks buttons, types text, navigates between pages, and completes multi-step workflows automatically. Everything runs locally on your machine with your own API keys, so your data stays private.',
+      'The BrowserOS Agent turns your words into browser actions. Describe what you need in plain English—fill out this form, extract data from that page, navigate through these steps—and the agent handles the rest. It clicks buttons, types text, navigates between pages, and completes multi-step browser tasks automatically. Everything runs locally on your machine with your own API keys, so your data stays private.',
    highlights: [
      'Multi-tab execution — run agents in multiple tabs simultaneously',
      'Smart navigation — automatically finds and interacts with page elements',
@@ -75,24 +73,6 @@ const features: Feature[] = [
    gridClass: 'md:col-span-1',
    videoUrl: MCP_SERVER_DEMO_URL,
  },
-  {
-    id: 'workflows',
-    Icon: GitBranch,
-    tag: 'AUTOMATION',
-    title: 'Visual Workflows',
-    description:
-      'Build reliable, repeatable automations with a visual graph builder.',
-    detailedDescription:
-      'Workflows turn complex browser tasks into reliable, reusable automations. Instead of hoping the agent figures out the right steps each time, you define the exact sequence in a visual graph. Describe what you want in chat, and the workflow agent generates the graph. Add loops, conditionals, and parallel branches. Save workflows and run them on-demand whenever you need.',
-    highlights: [
-      'Chat-to-graph — describe your automation and get a visual workflow',
-      'Parallel execution — run multiple branches simultaneously',
-      'Loops & conditionals — handle complex logic with flow control',
-      'Save & reuse — run saved workflows on-demand, daily, or weekly',
-    ],
-    gridClass: 'md:col-span-1',
-    videoUrl: WORKFLOWS_DEMO_URL || undefined,
-  },
  {
    id: 'cowork',
    Icon: FolderOpen,
--- a/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/ChatError.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/ChatError.tsx
@@ -1,20 +1,18 @@
 import { AlertCircle, RefreshCw } from 'lucide-react'
 import type { FC } from 'react'
-// import { useMemo } from 'react'
+import { useMemo } from 'react'
 import { Button } from '@/components/ui/button'

-// --- Commented out for Kimi partnership launch (restore after) ---
-// const SURVEY_DIRECTIONS = [
-//   'competitor',
-//   'switching',
-//   'workflow',
-//   'activation',
-// ] as const
-//
-// function pickRandomDirection(): string {
-//   return SURVEY_DIRECTIONS[Math.floor(Math.random() * SURVEY_DIRECTIONS.length)]
-// }
-// --- End commented out survey code ---
+const SURVEY_DIRECTIONS = [
+  'competitor',
+  'switching',
+  'workflow',
+  'activation',
+] as const
+
+function pickRandomDirection(): string {
+  return SURVEY_DIRECTIONS[Math.floor(Math.random() * SURVEY_DIRECTIONS.length)]
+}

 interface ChatErrorProps {
  error: Error
@@ -95,13 +93,11 @@ export const ChatError: FC<ChatErrorProps> = ({
  const { text, url, isRateLimit, isCreditsExhausted, isConnectionError } =
    parseErrorMessage(error.message, providerType)

-  // --- Commented out for Kimi partnership launch (restore after) ---
-  // const surveyUrl = useMemo(
-  //   () =>
-  //     `/app.html?page=survey&maxTurns=20&experimentId=daily_limit_${pickRandomDirection()}#/settings/survey`,
-  //   [],
-  // )
-  // --- End commented out survey code ---
+  const surveyUrl = useMemo(
+    () =>
+      `/app.html?page=survey&maxTurns=20&experimentId=daily_limit_${pickRandomDirection()}#/settings/survey`,
+    [],
+  )

  const getTitle = () => {
    if (isRateLimit) return 'Daily limit reached'
@@ -126,8 +122,17 @@ export const ChatError: FC<ChatErrorProps> = ({
          View troubleshooting guide
        </a>
      )}
-      {/* --- Commented out for Kimi partnership launch (restore after) ---
-      {isRateLimit && (
+      {isCreditsExhausted && url && (
+        <a
+          href={url}
+          target="_blank"
+          rel="noopener noreferrer"
+          className="text-muted-foreground text-xs underline hover:text-foreground"
+        >
+          View Usage & Billing
+        </a>
+      )}
+      {isRateLimit && !isCreditsExhausted && (
        <p className="text-muted-foreground text-xs">
          <a
            href={url}
@@ -148,27 +153,6 @@ export const ChatError: FC<ChatErrorProps> = ({
          </a>
        </p>
      )}
-      --- End commented out survey code --- */}
-      {isCreditsExhausted && url && (
-        <a
-          href={url}
-          target="_blank"
-          rel="noopener noreferrer"
-          className="text-muted-foreground text-xs underline hover:text-foreground"
-        >
-          View Usage & Billing
-        </a>
-      )}
-      {isRateLimit && providerType === 'browseros' && (
-        <a
-          href="/app.html#/settings/ai"
-          target="_blank"
-          rel="noopener noreferrer"
-          className="inline-flex items-center gap-1.5 rounded-md border border-[var(--accent-orange)] bg-[var(--accent-orange)]/10 px-3 py-1.5 font-medium text-[var(--accent-orange)] text-xs transition-colors hover:bg-[var(--accent-orange)]/20"
-        >
-          Add your own provider for unlimited usage
-        </a>
-      )}
      {onRetry && (
        <Button
          variant="outline"
--- a/packages/browseros-agent/apps/agent/lib/browseros/capabilities.ts
+++ b/packages/browseros-agent/apps/agent/lib/browseros/capabilities.ts
@@ -31,8 +31,6 @@ export enum Feature {
  WORKSPACE_FOLDER_SUPPORT = 'WORKSPACE_FOLDER_SUPPORT',
  // Proxy server support
  PROXY_SUPPORT = 'PROXY_SUPPORT',
-  // Workflows feature
-  WORKFLOW_SUPPORT = 'WORKFLOW_SUPPORT',
  // previousConversation as structured array (older servers only accept string)
  PREVIOUS_CONVERSATION_ARRAY = 'PREVIOUS_CONVERSATION_ARRAY',
  // Soul page: agent personality viewer and editor
@@ -73,7 +71,6 @@ const FEATURE_CONFIG: { [K in Feature]: FeatureConfig } = {
  [Feature.CUSTOMIZATION_SUPPORT]: { minBrowserOSVersion: '0.36.1.0' },
  [Feature.WORKSPACE_FOLDER_SUPPORT]: { minBrowserOSVersion: '0.36.4.0' },
  [Feature.PROXY_SUPPORT]: { minBrowserOSVersion: '0.39.0.1' },
-  [Feature.WORKFLOW_SUPPORT]: { minServerVersion: '0.0.41' },
  [Feature.PREVIOUS_CONVERSATION_ARRAY]: { minServerVersion: '0.0.64' },
  [Feature.SOUL_SUPPORT]: { minServerVersion: '0.0.67' },
  [Feature.NEWTAB_CHAT_SUPPORT]: { minBrowserOSVersion: '0.40.0.0' },
--- a/packages/browseros-agent/apps/agent/lib/constants/analyticsEvents.ts
+++ b/packages/browseros-agent/apps/agent/lib/constants/analyticsEvents.ts
@@ -1,19 +1,6 @@
 /** @public */
 export const MESSAGE_LIKE_EVENT = 'ui.message.like'

-export const GRAPH_MESSAGE_LIKE_EVENT = 'settings.graph.message.like'
-
-export const GRAPH_MESSAGE_DISLIKE_EVENT = 'settings.graph.message.dislike'
-
-/** @public */
-export const NEW_GRAPH_CREATED_EVENT = 'settings.graph.created'
-
-/** @public */
-export const GRAPH_SAVED_EVENT = 'settings.graph.saved'
-
-/** @public */
-export const GRAPH_UPDATED_EVENT = 'settings.graph.updated'
-
 /** @public */
 export const MESSAGE_DISLIKE_EVENT = 'ui.message.dislike'

@@ -178,21 +165,6 @@ export const NEWTAB_VOICE_TRANSCRIPTION_COMPLETED_EVENT =
 /** @public */
 export const NEWTAB_VOICE_ERROR_EVENT = 'newtab.voice.error'

-/** @public */
-export const WORKFLOW_DELETED_EVENT = 'settings.workflow.deleted'
-
-/** @public */
-export const WORKFLOW_RUN_STARTED_EVENT = 'settings.workflow.run_started'
-
-/** @public */
-export const WORKFLOW_RUN_STOPPED_EVENT = 'settings.workflow.run_stopped'
-
-/** @public */
-export const WORKFLOW_RUN_RETRIED_EVENT = 'settings.workflow.run_retried'
-
-/** @public */
-export const WORKFLOW_RUN_COMPLETED_EVENT = 'settings.workflow.run_completed'
-
 /** @public */
 export const SIDEPANEL_AI_TRIGGERED_EVENT = 'sidepanel.ai.triggered'

@@ -308,14 +280,6 @@ export const KIMI_API_KEY_CONFIGURED_EVENT = 'settings.kimi.api_key_configured'
 export const KIMI_API_KEY_GUIDE_CLICKED_EVENT =
  'settings.kimi.api_key_guide_clicked'

-/** @public */
-export const KIMI_RATE_LIMIT_DOCS_CLICKED_EVENT =
-  'ui.rate_limit.kimi_docs_clicked'
-
-/** @public */
-export const KIMI_RATE_LIMIT_PLATFORM_CLICKED_EVENT =
-  'ui.rate_limit.moonshot_platform_clicked'
-
 /** @public */
 export const SIDEPANEL_VOICE_RECORDING_STARTED_EVENT =
  'sidepanel.voice.recording_started'
--- a/packages/browseros-agent/apps/agent/lib/constants/productUrls.ts
+++ b/packages/browseros-agent/apps/agent/lib/constants/productUrls.ts
@@ -49,11 +49,6 @@ export const productVideoUrl = 'https://youtu.be/J-lFhTP-7is'
 */
 export const productRepositoryShortUrl = 'https://git.new/browseros'

-/**
- * @public
- */
-export const workflowsHelpUrl = 'https://docs.browseros.com/features/workflows'
-
 /**
 * @public
 */
--- a/packages/browseros-agent/apps/agent/lib/env.ts
+++ b/packages/browseros-agent/apps/agent/lib/env.ts
@@ -6,7 +6,6 @@ const EnvSchema = z.object({
  VITE_PUBLIC_POSTHOG_HOST: z.string().optional(),
  VITE_PUBLIC_SENTRY_DSN: z.string().optional(),
  VITE_PUBLIC_BROWSEROS_API: z.string().optional(),
-  VITE_PUBLIC_KIMI_LAUNCH: z.string().optional(),
  PROD: z.boolean(),
 })

--- a/packages/browseros-agent/apps/agent/lib/feature-flags/kimi-launch.ts
+++ b/packages/browseros-agent/apps/agent/lib/feature-flags/kimi-launch.ts
@@ -1,14 +0,0 @@
-import { env } from '@/lib/env'
-
-const ENABLED_VALUES = new Set(['1', 'true', 'yes', 'on'])
-
-function parseKimiLaunchFlag(value: string | undefined): boolean {
-  if (!value) return false
-  return ENABLED_VALUES.has(value.trim().toLowerCase())
-}
-
-const kimiLaunchEnabled = parseKimiLaunchFlag(env.VITE_PUBLIC_KIMI_LAUNCH)
-
-export function isKimiLaunchEnabled(): boolean {
-  return kimiLaunchEnabled
-}
--- a/packages/browseros-agent/apps/agent/lib/feature-flags/useKimiLaunch.ts
+++ b/packages/browseros-agent/apps/agent/lib/feature-flags/useKimiLaunch.ts
@@ -1,5 +0,0 @@
-import { isKimiLaunchEnabled } from './kimi-launch'
-
-export function useKimiLaunch(): boolean {
-  return isKimiLaunchEnabled()
-}
--- a/packages/browseros-agent/apps/agent/lib/llm-hub/storage.ts
+++ b/packages/browseros-agent/apps/agent/lib/llm-hub/storage.ts
@@ -1,6 +1,5 @@
 import { getBrowserOSAdapter } from '@/lib/browseros/adapter'
 import { BROWSEROS_PREFS } from '@/lib/browseros/prefs'
-import { isKimiLaunchEnabled } from '@/lib/feature-flags/kimi-launch'

 /** @public */
 export interface LlmHubProvider {
@@ -8,43 +7,15 @@ export interface LlmHubProvider {
  url: string
 }

-const KIMI_PROVIDER: LlmHubProvider = {
-  name: 'Kimi',
-  url: 'https://www.kimi.com',
-}
-
-function ensureKimiFirst(providers: LlmHubProvider[]): LlmHubProvider[] {
-  if (!isKimiLaunchEnabled()) return providers
-  const hasKimi = providers.some(
-    (p) => p.name === 'Kimi' || p.url.includes('kimi.com'),
-  )
-  return hasKimi ? providers : [KIMI_PROVIDER, ...providers]
-}
-
 export async function loadProviders(): Promise<LlmHubProvider[]> {
  try {
    const adapter = getBrowserOSAdapter()
    const providersPref = await adapter.getPref(
      BROWSEROS_PREFS.THIRD_PARTY_LLM_PROVIDERS,
    )
-    const providers = (providersPref?.value as LlmHubProvider[]) || []
-
-    if (providers.length === 0) {
-      if (isKimiLaunchEnabled()) {
-        const defaults = [KIMI_PROVIDER]
-        await saveProviders(defaults)
-        return defaults
-      }
-      return []
-    }
-
-    const normalized = ensureKimiFirst(providers)
-    if (normalized !== providers) {
-      await saveProviders(normalized)
-    }
-    return normalized
+    return (providersPref?.value as LlmHubProvider[]) || []
  } catch {
-    return isKimiLaunchEnabled() ? [KIMI_PROVIDER] : []
+    return []
  }
 }

--- a/packages/browseros-agent/apps/agent/lib/llm-providers/storage.ts
+++ b/packages/browseros-agent/apps/agent/lib/llm-providers/storage.ts
@@ -2,14 +2,12 @@ import { storage } from '@wxt-dev/storage'
 import { sessionStorage } from '@/lib/auth/sessionStorage'
 import { getBrowserOSAdapter } from '@/lib/browseros/adapter'
 import { BROWSEROS_PREFS } from '@/lib/browseros/prefs'
-import { isKimiLaunchEnabled } from '@/lib/feature-flags/kimi-launch'
 import type { LlmProviderConfig, LlmProvidersBackup } from './types'
 import { uploadLlmProvidersToGraphql } from './uploadLlmProvidersToGraphql'

 /** Default provider ID constant */
 export const DEFAULT_PROVIDER_ID = 'browseros'
 const DEFAULT_PROVIDER_NAME = 'BrowserOS'
-const KIMI_LAUNCH_PROVIDER_NAME = 'Kimi K2.5'

 /** Storage key for LLM providers array */
 export const providersStorage = storage.defineItem<LlmProviderConfig[]>(
@@ -91,7 +89,7 @@ export function setupLlmProvidersSyncToBackend(): () => void {
 /** Load providers from storage */
 export async function loadProviders(): Promise<LlmProviderConfig[]> {
  const providers = (await providersStorage.getValue()) || []
-  const normalizedProviders = normalizeProvidersForLaunch(providers)
+  const normalizedProviders = normalizeProviderNames(providers)

  // Keep storage consistent so every consumer sees the same provider name.
  if (
@@ -109,7 +107,7 @@ export function createDefaultBrowserOSProvider(): LlmProviderConfig {
  return {
    id: DEFAULT_PROVIDER_ID,
    type: 'browseros',
-    name: getBuiltInProviderName(),
+    name: DEFAULT_PROVIDER_NAME,
    baseUrl: 'https://api.browseros.com/v1',
    modelId: 'browseros-auto',
    supportsImages: true,
@@ -125,26 +123,22 @@ export function createDefaultProvidersConfig(): LlmProviderConfig[] {
  return [createDefaultBrowserOSProvider()]
 }

-function getBuiltInProviderName(): string {
-  return isKimiLaunchEnabled()
-    ? KIMI_LAUNCH_PROVIDER_NAME
-    : DEFAULT_PROVIDER_NAME
-}
-
-function normalizeProvidersForLaunch(
+/**
+ * Normalize built-in provider names back to "BrowserOS" (e.g. from "Kimi K2.5"
+ * which was set during a previous partnership launch).
+ */
+function normalizeProviderNames(
  providers: LlmProviderConfig[],
 ): LlmProviderConfig[] {
-  const builtInProviderName = getBuiltInProviderName()
-
  return providers.map((provider) => {
    if (
      provider.id === DEFAULT_PROVIDER_ID &&
      provider.type === 'browseros' &&
-      provider.name !== builtInProviderName
+      provider.name !== DEFAULT_PROVIDER_NAME
    ) {
      return {
        ...provider,
-        name: builtInProviderName,
+        name: DEFAULT_PROVIDER_NAME,
      }
    }
    return provider
--- a/packages/browseros-agent/apps/agent/lib/workflows/workflowStorage.ts
+++ b/packages/browseros-agent/apps/agent/lib/workflows/workflowStorage.ts
@@ -1,54 +0,0 @@
-import { storage } from '@wxt-dev/storage'
-import { useEffect, useState } from 'react'
-
-export interface Workflow {
-  id: string
-  codeId: string
-  workflowName: string
-}
-
-export const workflowStorage = storage.defineItem<Workflow[]>(
-  'local:workflows',
-  {
-    fallback: [],
-  },
-)
-
-export function useWorkflows() {
-  const [workflows, setWorkflows] = useState<Workflow[]>([])
-
-  useEffect(() => {
-    workflowStorage.getValue().then(setWorkflows)
-    const unwatch = workflowStorage.watch((newValue) => {
-      setWorkflows(newValue ?? [])
-    })
-    return unwatch
-  }, [])
-
-  const addWorkflow = async (workflow: Omit<Workflow, 'id'>) => {
-    const newWorkflow: Workflow = {
-      id: crypto.randomUUID(),
-      ...workflow,
-    }
-    const current = (await workflowStorage.getValue()) ?? []
-    await workflowStorage.setValue([...current, newWorkflow])
-    return newWorkflow
-  }
-
-  const removeWorkflow = async (id: string) => {
-    const current = (await workflowStorage.getValue()) ?? []
-    await workflowStorage.setValue(current.filter((w) => w.id !== id))
-  }
-
-  const editWorkflow = async (
-    id: string,
-    updates: Partial<Omit<Workflow, 'id'>>,
-  ) => {
-    const current = (await workflowStorage.getValue()) ?? []
-    await workflowStorage.setValue(
-      current.map((w) => (w.id === id ? { ...w, ...updates } : w)),
-    )
-  }
-
-  return { workflows, addWorkflow, removeWorkflow, editWorkflow }
-}
--- a/packages/browseros-agent/apps/agent/package.json
+++ b/packages/browseros-agent/apps/agent/package.json
@@ -2,7 +2,7 @@
  "name": "@browseros/agent",
  "description": "manifest.json description",
  "private": true,
-  "version": "0.0.98",
+  "version": "0.0.99",
  "type": "module",
  "scripts": {
    "dev": "test -d generated/graphql || bun run codegen; mkdir -p /tmp/browseros-dev; bun --env-file=.env.development wxt",
--- a/packages/browseros-agent/apps/cli/Makefile
+++ b/packages/browseros-agent/apps/cli/Makefile
@@ -2,13 +2,17 @@ BINARY := browseros-cli
 SOURCES := $(shell find . -name '*.go')
 VERSION ?= dev
 POSTHOG_API_KEY ?=
+DIST := dist
 LDFLAGS := -X main.version=$(VERSION) -X browseros-cli/analytics.posthogAPIKey=$(POSTHOG_API_KEY)
+HOST_OS := $(shell go env GOHOSTOS)
+HOST_ARCH := $(shell go env GOHOSTARCH)
+HOST_EXT := $(if $(filter windows,$(HOST_OS)),.exe,)
+HOST_BINARY = $(DIST)/$(BINARY)_$(HOST_OS)_$(HOST_ARCH)$(HOST_EXT)

 $(BINARY): $(SOURCES)
 	go build -ldflags "$(LDFLAGS)" -o $(BINARY) .

 PLATFORMS := darwin/amd64 darwin/arm64 linux/amd64 linux/arm64 windows/amd64 windows/arm64
-DIST := dist

 .PHONY: install clean vet test release

@@ -45,6 +49,11 @@ release:
 		fi; \
 		mv "$(DIST)/$(BINARY)$$EXT" "$(DIST)/$(BINARY)_$${OS}_$${ARCH}$$EXT"; \
 	done
+	@ACTUAL_VERSION=$$($(HOST_BINARY) --version | awk '{print $$3}'); \
+		if [ "$$ACTUAL_VERSION" != "$(VERSION)" ]; then \
+			echo "Error: expected $(HOST_BINARY) to report version $(VERSION), got $$ACTUAL_VERSION" >&2; \
+			exit 1; \
+		fi
 	@cd $(DIST) && (command -v sha256sum >/dev/null 2>&1 && sha256sum *.tar.gz *.zip || shasum -a 256 *.tar.gz *.zip) > checksums.txt
 	@echo "=== Built artifacts ==="
 	@ls -lh $(DIST)
--- a/packages/browseros-agent/apps/cli/cmd/init.go
+++ b/packages/browseros-agent/apps/cli/cmd/init.go
@@ -25,13 +25,17 @@ func init() {
 		Long: `Set up the CLI by providing the MCP server URL from BrowserOS.

 Open BrowserOS → Settings → BrowserOS MCP to find your Server URL.
-The URL looks like: http://127.0.0.1:9004/mcp
+The URL looks like: http://127.0.0.1:9000/mcp

 The port varies per installation, so this step is required on first use.
 Run again if your port changes.

+You can provide the full URL or just the port number:
+  browseros-cli init http://127.0.0.1:9000/mcp
+  browseros-cli init 9000
+
 Three modes:
-  browseros-cli init <url>    Non-interactive, use the provided URL
+  browseros-cli init <url>    Non-interactive (full URL or port number)
  browseros-cli init --auto   Auto-discover from ~/.browseros/server.json
  browseros-cli init          Interactive prompt`,
 		Annotations: map[string]string{"group": "Setup:"},
@@ -65,13 +69,14 @@ Three modes:
 				bold.Println("BrowserOS CLI Setup")
 				fmt.Println()
 				fmt.Println("Open BrowserOS → Settings → BrowserOS MCP")
-				fmt.Println("Copy the Server URL shown there.")
+				fmt.Println("Copy the Server URL or port number shown there.")
 				fmt.Println()
-				dim.Println("It looks like: http://127.0.0.1:9004/mcp")
+				dim.Println("Examples:  http://127.0.0.1:9000/mcp")
+				dim.Println("           9000")
 				fmt.Println()

 				reader := bufio.NewReader(os.Stdin)
-				fmt.Print("Server URL: ")
+				fmt.Print("Server URL or port: ")
 				line, err := reader.ReadString('\n')
 				if err != nil {
 					output.Error("failed to read input", 1)
--- a/packages/browseros-agent/apps/cli/cmd/root.go
+++ b/packages/browseros-agent/apps/cli/cmd/root.go
@@ -34,6 +34,7 @@ const automaticUpdateDrainTimeout = 150 * time.Millisecond

 func SetVersion(v string) {
 	version = v
+	rootCmd.Version = v // keep the root command's Version field in sync with the package-level version
 }

 var (
@@ -338,10 +339,27 @@ func loadBrowserosServerURL() string {

 func normalizeServerURL(raw string) string {
 	normalized := strings.TrimSpace(raw)
+	// A digits-only input (e.g. "9000") is shorthand for that port on 127.0.0.1.
+	if isPortOnly(normalized) {
+		normalized = "http://127.0.0.1:" + normalized
+	}
+
 	normalized = strings.TrimSuffix(normalized, "/mcp")
 	return strings.TrimSuffix(normalized, "/")
 }

+// isPortOnly reports whether s looks like a bare port number such as "9000":
+// a non-empty string made up solely of ASCII digits. The numeric range of
+// the value is not checked here.
+func isPortOnly(s string) bool {
+	for i := 0; i < len(s); i++ {
+		if ch := s[i]; ch < '0' || ch > '9' {
+			return false
+		}
+	}
+	return len(s) > 0
+}
+
 func validateServerURL(raw string) (string, error) {
 	baseURL := normalizeServerURL(raw)
 	if baseURL != "" {
--- a/packages/browseros-agent/apps/cli/cmd/root_test.go
+++ b/packages/browseros-agent/apps/cli/cmd/root_test.go
@@ -5,6 +5,24 @@ import (
 	"time"
 )

+// TestSetVersionUpdatesRootCommand checks that SetVersion writes the given
+// version to both the package-level variable and rootCmd.Version.
+func TestSetVersionUpdatesRootCommand(t *testing.T) {
+	const want = "1.2.3"
+	savedVersion, savedRootVersion := version, rootCmd.Version
+	// Restore the globals afterwards so other tests see the original values.
+	t.Cleanup(func() { version, rootCmd.Version = savedVersion, savedRootVersion })
+
+	SetVersion(want)
+
+	if got := version; got != want {
+		t.Fatalf("version = %q, want %q", got, want)
+	}
+	if got := rootCmd.Version; got != want {
+		t.Fatalf("rootCmd.Version = %q, want %q", got, want)
+	}
+}
+
 func TestCommandName(t *testing.T) {
 	tests := []struct {
 		name string
--- a/packages/browseros-agent/apps/eval/configs/agisdk-real-smoke.json
+++ b/packages/browseros-agent/apps/eval/configs/agisdk-real-smoke.json
@@ -0,0 +1,26 @@
+{
+  "agent": {
+    "type": "single",
+    "provider": "openai-compatible",
+    "model": "accounts/fireworks/models/kimi-k2p5",
+    "apiKey": "FIREWORKS_API_KEY",
+    "baseUrl": "https://api.fireworks.ai/inference/v1",
+    "supportsImages": true
+  },
+  "dataset": "../data/agisdk-real.jsonl",
+  "num_workers": 10,
+  "restart_server_per_task": true,
+  "browseros": {
+    "server_url": "http://127.0.0.1:9110",
+    "base_cdp_port": 9010,
+    "base_server_port": 9110,
+    "base_extension_port": 9310,
+    "load_extensions": false,
+    "headless": false
+  },
+  "captcha": {
+    "api_key_env": "NOPECHA_API_KEY"
+  },
+  "graders": ["agisdk_state_diff"],
+  "timeout_ms": 1800000
+}
--- a/packages/browseros-agent/apps/eval/configs/infinity-hard-50.json
+++ b/packages/browseros-agent/apps/eval/configs/infinity-hard-50.json
@@ -0,0 +1,26 @@
+{
+  "agent": {
+    "type": "single",
+    "provider": "openai-compatible",
+    "model": "accounts/fireworks/models/kimi-k2p5",
+    "apiKey": "FIREWORKS_API_KEY",
+    "baseUrl": "https://api.fireworks.ai/inference/v1",
+    "supportsImages": true
+  },
+  "dataset": "../data/webarena-infinity-hard-50.jsonl",
+  "num_workers": 10,
+  "restart_server_per_task": true,
+  "browseros": {
+    "server_url": "http://127.0.0.1:9110",
+    "base_cdp_port": 9010,
+    "base_server_port": 9110,
+    "base_extension_port": 9310,
+    "load_extensions": false,
+    "headless": false
+  },
+  "captcha": {
+    "api_key_env": "NOPECHA_API_KEY"
+  },
+  "graders": ["infinity_state"],
+  "timeout_ms": 1800000
+}
--- a/packages/browseros-agent/apps/eval/data/agisdk-real.jsonl
+++ b/packages/browseros-agent/apps/eval/data/agisdk-real.jsonl
@@ -0,0 +1,52 @@
+{"query_id": "agisdk-dashdish-10", "dataset": "agisdk-real", "query": "Place an order from \"Souvla\" for a \"Medium Classic Cheeseburger\" and a \"Small Bacon Double Cheeseburger\" with \"Standard Delivery\" as the method with the default charged options.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-10", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
+{"query_id": "agisdk-fly-unified-5", "dataset": "agisdk-real", "query": "Find me the cheapest fare for a flight from Orlando to Milwaukee on December 5th, 2024 and book it.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/25, Security Code: 420 Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-5", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "United Airlines"}}}
+{"query_id": "agisdk-udriver-10", "dataset": "agisdk-real", "query": "Order me a ride for 4pm, I'll be at the de Young muesum headed to the Waterbar, fanciest option possible please.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-10", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
+{"query_id": "agisdk-udriver-9", "dataset": "agisdk-real", "query": "Book me a ride from the thai restaurant I last took a ride to for later today at 2pm, I'll be at 333 Apartments on Fremont", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-9", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-9", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Uber"}}}
+{"query_id": "agisdk-topwork-4", "dataset": "agisdk-real", "query": "Create a job post for a UI/UX Designer with expertise in Figma, Sketch, and Adobe Creative Suite, including project details, timeline, and required skills (Wireframing, Prototyping, Responsive Design).", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-4", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
+{"query_id": "agisdk-gocalendar-4", "dataset": "agisdk-real", "query": "Change the \"Team Check-In\" event on July 18, 2024, name to \"Project Kickoff\" and update the location to \"Zoom\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-4", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
+{"query_id": "agisdk-staynb-6", "dataset": "agisdk-real", "query": "Find and book the stay with the best value for money (cheapest stay with the best reviews) for 1 day. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-6", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-6", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
+{"query_id": "agisdk-omnizon-10", "dataset": "agisdk-real", "query": "Click on \"buy now\" on any product, increase its quantity to the maximum allowed, update the delivery date to the last available, and place the order.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-10", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Amazon"}}}
+{"query_id": "agisdk-fly-unified-9", "dataset": "agisdk-real", "query": "Book me a flight from San Francisco to Chicago in Basic Economy on December 18th at 10:00. Ensure no seat selection is made.\nPassenger: David Lee\nDate of Birth: 07/22/1985\nSex: Male\nSeat Selection: No\nPayment: Credit Card (9999 8888 7777), Exp: 03/30, Address: 987 Cedar St, Chicago, IL, 60601, USA, Phone: 555-987-1234, Email: davidlee@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-9", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-9", "challenge_type": "action", "difficulty": "hard", "similar_to": "United Airlines"}}}
+{"query_id": "agisdk-networkin-9", "dataset": "agisdk-real", "query": "Find a professional who attended Stanford and send them a connection request and a message.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-9", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-9", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
+{"query_id": "agisdk-udriver-11", "dataset": "agisdk-real", "query": "I need to go from Pacific Catch on Chestnut back home to 333 Fremont now. If the fancy version is within ten dollars of the regular one, book that.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-11", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-11", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
+{"query_id": "agisdk-fly-unified-4", "dataset": "agisdk-real", "query": "Book me a round-trip flight from Providence (Rhode Island) to Indianapolis, departing on December 5th, 2024 at 08:00 and returning on December 9th at 14:00.\nPassenger: Jane Smith\nDate of Birth: 02/14/1995\nSex: Female\nSeat Selection: Yes (Window seat)\nPayment: Credit Card (378342143523967), Exp: 06/26, security code: 345 Address: 456 Elm St, Miami, FL, 33101, USA, Phone: 555-987-6543, Email: janesmith@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-4", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "United Airlines"}}}
+{"query_id": "agisdk-networkin-5", "dataset": "agisdk-real", "query": "Send a connection request to John Smith.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-5", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-5", "challenge_type": "action", "difficulty": "easy", "similar_to": "LinkedIn"}}}
+{"query_id": "agisdk-zilloft-6", "dataset": "agisdk-real", "query": "Select a property listed in San Francisco as \"Condos\" within a price range under $300,000 and request a tour for tomorrow at 4:00 PM. Use these contact details: Name: Sarah Brown, Email: sarahbrown@example.com, Phone: 555-987-6543.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-6", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "Zillow"}}}
+{"query_id": "agisdk-topwork-2", "dataset": "agisdk-real", "query": "Create a job posting for a Backend Developer specializing in Python, Django, and Flask to develop a high-performance web application. Include project details such as required skills (PostgreSQL, Docker, AWS, CI/CD), estimated project timeline, and budget.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-2", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-2", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
+{"query_id": "agisdk-gocalendar-3", "dataset": "agisdk-real", "query": "Delete the event titled \"Breakfast Meeting with Client\" scheduled for July 19, 2024", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-3", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "Google Calendar"}}}
+{"query_id": "agisdk-topwork-3", "dataset": "agisdk-real", "query": "Create a job listing for a Full-Stack Developer with expertise in Java, Spring Boot, and Angular, outlining the project scope, estimated duration, and required skills (MySQL, Docker, Kubernetes, and Jenkins). The ideal candidate should have experience in enterprise-level applications and building scalable microservices. After creating the job post, please describe what you included in the job listing.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-3", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-3", "challenge_type": "retrieval", "difficulty": "medium", "similar_to": "Upwork"}}}
+{"query_id": "agisdk-fly-unified-2", "dataset": "agisdk-real", "query": "Book me a one-way flight from Indiana to New York on December 2nd 2024 at 12:00.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/25, Security Code: 245, Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-2", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "United Airlines"}}}
+{"query_id": "agisdk-dashdish-7", "dataset": "agisdk-real", "query": "Select \"Express Delivery\" for an order from \"DragonEats\" of \"Mushroom Swiss Burger\" and complete the checkout with the pre-loaded Visa card.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-7", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-7", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
+{"query_id": "agisdk-networkin-3", "dataset": "agisdk-real", "query": "Write a post inviting users to a networking event, including details about the event's purpose, date, and target audience.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-3", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-3", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
+{"query_id": "agisdk-gomail-7", "dataset": "agisdk-real", "query": "Delete the email with the subject \"New Leadership Articles You Can't Miss\" from the Inbox.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-7", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-7", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Gmail"}}}
+{"query_id": "agisdk-opendining-8", "dataset": "agisdk-real", "query": "Identify and book the restaurant with the lowest rating. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-8", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-8", "challenge_type": "retrieval-action", "difficulty": "easy", "similar_to": "OpenTable"}}}
+{"query_id": "agisdk-omnizon-2", "dataset": "agisdk-real", "query": "Search for \"smartphones\" using the search bar, add the first two to your cart, view the details of the third product, click on \"Buy Now,\" and proceed through the checkout process.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-2", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-2", "challenge_type": "action", "difficulty": "medium", "similar_to": "Amazon"}}}
+{"query_id": "agisdk-udriver-1", "dataset": "agisdk-real", "query": "Book a ride from Fitness Urbano to Pacific Cafe", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-1", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-1", "challenge_type": "action", "difficulty": "easy", "similar_to": "Uber"}}}
+{"query_id": "agisdk-staynb-2", "dataset": "agisdk-real", "query": "Click on one of the stays displayed on the homepage and book it for a family of 4 (2 adults and 2 children). For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-2", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Airbnb"}}}
+{"query_id": "agisdk-opendining-10", "dataset": "agisdk-real", "query": "Check the menus of all restaurants for vegetarian options and make a reservation at the one with the most vegetarian choices. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-10", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-10", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "OpenTable"}}}
+{"query_id": "agisdk-opendining-4", "dataset": "agisdk-real", "query": "Use the search bar to search for a restaurant on September 2nd at 4:30 PM for 7 people, using \"Japanese\" as the search term, and book the first result. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-4", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-4", "challenge_type": "action", "difficulty": "hard", "similar_to": "OpenTable"}}}
+{"query_id": "agisdk-gomail-8", "dataset": "agisdk-real", "query": "Clear all emails from \"GitHub\" in the inbox to trash.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-8", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-8", "challenge_type": "action", "difficulty": "medium", "similar_to": "Gmail"}}}
+{"query_id": "agisdk-dashdish-4", "dataset": "agisdk-real", "query": "Schedule a delivery order from \"Taco Bell\" adding a \"Classic Cheeseburger\" large size for later and add the note \"Leave at the front door\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-4", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Doordash"}}}
+{"query_id": "agisdk-networkin-1", "dataset": "agisdk-real", "query": "Create a new text post for the feed with a professional update about AI trends in 2025, mentioning three key advancements and their impact on the job market.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-1", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
+{"query_id": "agisdk-dashdish-5", "dataset": "agisdk-real", "query": "Add three \"Loaded Bacon Cheese Fries\" to the shopping cart from \"Man vs. Fries\". Proceed to checkout and select \"Pickup\" as the delivery method.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-5", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Doordash"}}}
+{"query_id": "agisdk-opendining-5", "dataset": "agisdk-real", "query": "Scroll through the homepage carousel until \"Ocean Breeze\" is visible, select the second available time slot, and complete the reservation. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-5", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-5", "challenge_type": "action", "difficulty": "medium", "similar_to": "OpenTable"}}}
+{"query_id": "agisdk-topwork-1", "dataset": "agisdk-real", "query": "Create a new job post for a Frontend Developer with expertise in React and TypeScript, specifying project details such as estimated duration, required skills, and budget.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-1", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
+{"query_id": "agisdk-gocalendar-1", "dataset": "agisdk-real", "query": "Create a new event titled \"Team Meeting\" on July 19, 2024, from 2 PM to 2:30 PM, and include \"Conference Room A\" as the location", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-1", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
+{"query_id": "agisdk-gomail-5", "dataset": "agisdk-real", "query": "Schedule an email to jane.doe@example.com with the subject \"Weekly Update\" to be sent next Monday at 9:00 AM.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-5", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Gmail"}}}
+{"query_id": "agisdk-staynb-4", "dataset": "agisdk-real", "query": "Book a stay for 2 children with 1 adult. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-4", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Airbnb"}}}
+{"query_id": "agisdk-omnizon-8", "dataset": "agisdk-real", "query": "Search for \"Automatic Espresso Machine,\" click on the cheapest one, change the quantity to 5, use \"buy now\" to purchase them and complete the checkout.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-8", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-8", "challenge_type": "retrieval-action", "difficulty": "easy", "similar_to": "Amazon"}}}
+{"query_id": "agisdk-networkin-6", "dataset": "agisdk-real", "query": "Choose a random person who you haven't connected with, connect with them, and send them a message saying, 'howdy, partner'.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-6", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
+{"query_id": "agisdk-dashdish-2", "dataset": "agisdk-real", "query": "Add a \"Medium Pepperoni Pizza\" from the restaurant \"Papa Johns Pizza\" to the shopping cart and purchase it.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-2", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Doordash"}}}
+{"query_id": "agisdk-staynb-8", "dataset": "agisdk-real", "query": "Scroll through the homepage and book the last stay located in Paris.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-8", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-8", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
+{"query_id": "agisdk-omnizon-4", "dataset": "agisdk-real", "query": "Search for a \"Marshall Emberton II Portable Bluetooth Speaker\" and add it to your cart, then search for the \"Michael Kors Oversized Slim Runway Men's Watch,\" add it to the cart, and complete the checkout with both items.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-4", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-4", "challenge_type": "action", "difficulty": "hard", "similar_to": "Amazon"}}}
+{"query_id": "agisdk-gomail-2", "dataset": "agisdk-real", "query": "Mark the first email in the Inbox as \"read\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-2", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Gmail"}}}
+{"query_id": "agisdk-networkin-10", "dataset": "agisdk-real", "query": "Generate a polite follow-up message for a previous unanswered chat, starting with \"Following up on\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-10", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-10", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
+{"query_id": "agisdk-gomail-3", "dataset": "agisdk-real", "query": "Compose a new email to jonathan.smith@example.com with the subject \"Meeting Notes\" and body \"Please find the meeting notes attached.\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-3", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "Gmail"}}}
+{"query_id": "agisdk-udriver-6", "dataset": "agisdk-real", "query": "Me and 4 friends need a ride from the Palace Hotel to dinner at Osha Thai leaving now", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-6", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-6", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
+{"query_id": "agisdk-staynb-9", "dataset": "agisdk-real", "query": "Book a stay with the maximum number of guests supported. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-9", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-9", "challenge_type": "action", "difficulty": "hard", "similar_to": "Airbnb"}}}
+{"query_id": "agisdk-zilloft-3", "dataset": "agisdk-real", "query": "Find a home in San Diego priced under $150,000 with at least 2 bedrooms and request a tour. Use these details: Contact Name: John Doe, Email: johndoe@example.com, Phone: 555-123-4567, Tour Time: 2:00 PM, Tour Date: First available.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-3", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-3", "challenge_type": "retrieval-action", "difficulty": "easy", "similar_to": "Zillow"}}}
+{"query_id": "agisdk-fly-unified-6", "dataset": "agisdk-real", "query": "Reserve me a seat for the flight from Austin to Pittsburgh departing on December 11th, 2024 at 8:00 in Basic Economy.\nPassenger: Alice Brown\nDate of Birth: 05/20/1992\nSex: Female\nSeat Selection: Yes (Aisle seat)\nPayment: Credit Card (378342143523967), Exp: 09/27, security code: 332 Address: 789 Pine St, Los Angeles, CA, 90012, USA, Phone: 555-456-7890, Email: alicebrown@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-6", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "United Airlines"}}}
+{"query_id": "agisdk-opendining-3", "dataset": "agisdk-real", "query": "Book a table at \"The Royal Dine\" for a party of 4 on July 20, 2024, at 7 PM. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-3", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "OpenTable"}}}
+{"query_id": "agisdk-omnizon-9", "dataset": "agisdk-real", "query": "Search for \"PlayStation DualSense\", purchase it using the \"buy now\" button after opening the first result and change the default payment method to:\nname: Jack Fulton\ncard number: 9231 3432 8927 7764\nexp date: 1/2029\nsecurity code: 128\n before placing your order. ", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-9", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-9", "challenge_type": "action", "difficulty": "hard", "similar_to": "Amazon"}}}
+{"query_id": "agisdk-gocalendar-7", "dataset": "agisdk-real", "query": "Reschedule the \"Morning Coffee with sister\" event from July 18, 2024, at 9 AM to July 19, 2024, at 10AM using drag-and-drop functionality", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-7", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-7", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
+{"query_id": "agisdk-staynb-5", "dataset": "agisdk-real", "query": "Use the search bar to look for a stay. For the \"Where\" section, use the \"Search by region\" popover and select \"Europe\". Set the check-in date to October 13th and the check-out date to October 23rd. For the \"Who\" section, select 1 infant, 2 children, and 2 adults. Press the search button, select the first stay, and book it.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-5", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-5", "challenge_type": "action", "difficulty": "medium", "similar_to": "Airbnb"}}}
--- a/packages/browseros-agent/apps/eval/data/webarena-infinity-hard-50.jsonl
+++ b/packages/browseros-agent/apps/eval/data/webarena-infinity-hard-50.jsonl
@@ -0,0 +1,50 @@
+{"query_id": "infinity-elation-prescriptions-task_h69", "dataset": "webarena-infinity", "query": "Approve all pending refill requests except for any medication that is involved in a major drug-drug interaction with another of the patient's active medications. Deny those with the reason 'Drug interaction \u2014 needs provider review before renewal'.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h69", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h69.py", "app_base_port": 8020}}}
+{"query_id": "infinity-elation-clinical-records-task_h52", "dataset": "webarena-infinity", "query": "Add the document tag 'Provider-Reviewed' to every visit note template that was created by the current logged-in provider. Do not modify templates created by other providers.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h52", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h52.py", "app_base_port": 8000}}}
+{"query_id": "infinity-gmail-accounts-and-contacts-task_h44", "dataset": "webarena-infinity", "query": "Your sister's husband is one of your contacts. Find him, star his entry, and add the Friends label.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h44", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h44.py", "app_base_port": 8070}}}
+{"query_id": "infinity-gmail-task_h2", "dataset": "webarena-infinity", "query": "Update the Datadog alerts filter to also archive matching emails and forward them to priya.sharma@cloudnine.dev instead of nate.patel@devops.tools.", "graders": ["infinity_state"], "start_url": "http://localhost:8060", "metadata": {"original_task_id": "gmail-task_h2", "website": "gmail", "category": "webarena-infinity", "additional": {"app_name": "gmail", "difficulty": "hard", "verifier_path": "real-tasks/task_h2.py", "app_base_port": 8060}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h58", "dataset": "webarena-infinity", "query": "The Performance Initiative epic has two child epics. For the child epic with more open issues, set the weight of every issue in it to 13. For the other child epic, close all its open issues.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h58", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h58.py", "app_base_port": 8050}}}
+{"query_id": "infinity-figma-slides-task_h46", "dataset": "webarena-infinity", "query": "There are two slides with tables in the deck. Lock the table that compares competitors, and change the font size to 16 on the table that tracks quarterly feature adoption.", "graders": ["infinity_state"], "start_url": "http://localhost:8030", "metadata": {"original_task_id": "figma-slides-task_h46", "website": "figma-slides", "category": "webarena-infinity", "additional": {"app_name": "figma-slides", "difficulty": "hard", "verifier_path": "real-tasks/task_h46.py", "app_base_port": 8030}}}
+{"query_id": "infinity-elation-prescriptions-task_h50", "dataset": "webarena-infinity", "query": "Deny the pending refill for the patient's cholesterol medication because his lipid panel is overdue. Then deny the Lisinopril refill as well \u2014 he needs a follow-up blood pressure check first.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h50", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h50.py", "app_base_port": 8020}}}
+{"query_id": "infinity-elation-prescriptions-task_h19", "dataset": "webarena-infinity", "query": "Discontinue the Omeprazole and prescribe Famotidine 20mg tablet twice daily as a replacement for GERD \u2014 qty 60, 3 refills, send to CVS #4521.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h19", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h19.py", "app_base_port": 8020}}}
+{"query_id": "infinity-paypal-my-wallet-task_h25", "dataset": "webarena-infinity", "query": "Convert all of my Australian dollars to euros.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h25", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h25.py", "app_base_port": 8100}}}
+{"query_id": "infinity-elation-clinical-records-task_h66", "dataset": "webarena-infinity", "query": "Create a new template called 'Anxiety Management' with HPI and Assessment sections, and billing code 99213 with description 'Office visit, established, low complexity'. Then create a visit note for Emily Nakamura using that new template and the Telehealth category, add a Psychological Status block to the note, and sign it.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h66", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h66.py", "app_base_port": 8000}}}
+{"query_id": "infinity-elation-clinical-records-task_h62", "dataset": "webarena-infinity", "query": "Look up which template is assigned to the COVID Vaccine appointment type. Remove all its existing document tags and replace them with the single tag 'COVID-Protocol'. Then also assign that same template to the Urgent Same-Day appointment type.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h62", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h62.py", "app_base_port": 8000}}}
+{"query_id": "infinity-elation-prescriptions-task_h32", "dataset": "webarena-infinity", "query": "The patient has a medication that's being dispensed as written (brand name only). Discontinue that prescription and replace it with a new one \u2014 same medication, same sig, same pharmacy \u2014 but allow generic substitution this time. Qty 30, 3 refills, 30 days supply.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h32", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h32.py", "app_base_port": 8020}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h48", "dataset": "webarena-infinity", "query": "Add the 'breaking-change' label to every open issue in the API v3 Migration epic and remove any existing workflow-scoped labels from those issues.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h48", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h48.py", "app_base_port": 8050}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h77", "dataset": "webarena-infinity", "query": "Rename the 'UX' label to 'user-experience', change its type to 'group', and then add it to every open issue in the Frontend Modernization epic that doesn't already have it.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h77", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h77.py", "app_base_port": 8050}}}
+{"query_id": "infinity-xero-invoicing-task_h15", "dataset": "webarena-infinity", "query": "Create a new invoice for Summit Health Group for an annual software license and 12 months of support with a 10% discount on support.", "graders": ["infinity_state"], "start_url": "http://localhost:8120", "metadata": {"original_task_id": "xero-invoicing-task_h15", "website": "xero-invoicing", "category": "webarena-infinity", "additional": {"app_name": "xero-invoicing", "difficulty": "hard", "verifier_path": "real-tasks/task_h15.py", "app_base_port": 8120}}}
+{"query_id": "infinity-elation-clinical-records-task_h55", "dataset": "webarena-infinity", "query": "Resolve every problem across all patients in the system that currently has a status of Controlled.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h55", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h55.py", "app_base_port": 8000}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h8", "dataset": "webarena-infinity", "query": "Create a confidential issue titled 'Emergency security patch' with priority::critical and the 'security' label, assigned to James O'Brien and Oliver Schmidt, with weight 2 in the Security Hardening milestone.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h8", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h8.py", "app_base_port": 8050}}}
+{"query_id": "infinity-paypal-my-wallet-task_h20", "dataset": "webarena-infinity", "query": "Make a $200 payment on PayPal Credit and change autopay to pay the full balance.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h20", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h20.py", "app_base_port": 8100}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h52", "dataset": "webarena-infinity", "query": "Create a new board called 'Performance Tracker' with lists for the priority::critical, priority::high, and priority::medium labels. Then add the 'priority::high' label to every open issue in the v4.1 milestone that has the 'performance' label.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h52", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h52.py", "app_base_port": 8050}}}
+{"query_id": "infinity-paypal-my-wallet-task_h80", "dataset": "webarena-infinity", "query": "Save all available Food & Drink offers, buy a $25 DoorDash gift card for yourself, and switch currency conversion to use my card issuer.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h80", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h80.py", "app_base_port": 8100}}}
+{"query_id": "infinity-gmail-accounts-and-contacts-task_h50", "dataset": "webarena-infinity", "query": "Add the Emergency label to every contact who is currently listed as a delegate (active, pending, or expired). Then remove all delegates whose status is not 'active'.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h50", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h50.py", "app_base_port": 8070}}}
+{"query_id": "infinity-elation-clinical-records-task_h14", "dataset": "webarena-infinity", "query": "Add the tag 'Flu-Season' to every patient whose primary provider is Dr. Sarah Chen.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h14", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h14.py", "app_base_port": 8000}}}
+{"query_id": "infinity-figma-text-and-typography-task_h7", "dataset": "webarena-infinity", "query": "Remove all list formatting from every layer.", "graders": ["infinity_state"], "start_url": "http://localhost:8040", "metadata": {"original_task_id": "figma-text-and-typography-task_h7", "website": "figma-text-and-typography", "category": "webarena-infinity", "additional": {"app_name": "figma-text-and-typography", "difficulty": "hard", "verifier_path": "real-tasks/task_h7.py", "app_base_port": 8040}}}
+{"query_id": "infinity-paypal-my-wallet-task_h26", "dataset": "webarena-infinity", "query": "Send a $50 Amazon gift card to sarah.chen@email.com with 'Thank you!' as the message, and save the Amazon cashback offer.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h26", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h26.py", "app_base_port": 8100}}}
+{"query_id": "infinity-handshake-career-exploration-task_h97", "dataset": "webarena-infinity", "query": "Find the single most helpful answer across all Q&A questions and mark it helpful. Then find the most-viewed question and submit your own answer to it.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h97", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h97.py", "app_base_port": 8080}}}
+{"query_id": "infinity-figma-slides-task_h79", "dataset": "webarena-infinity", "query": "In the adoption table, find the feature with the highest Target Q4 percentage. In the competitive table, change DesignCraft's entry for that same feature to 'Market Leader'. Then update that feature's Target Q4 to '95%'.", "graders": ["infinity_state"], "start_url": "http://localhost:8030", "metadata": {"original_task_id": "figma-slides-task_h79", "website": "figma-slides", "category": "webarena-infinity", "additional": {"app_name": "figma-slides", "difficulty": "hard", "verifier_path": "real-tasks/task_h79.py", "app_base_port": 8030}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h41", "dataset": "webarena-infinity", "query": "For every open issue in the v4.2 - Security Hardening milestone: if it is already confidential, set its health status to 'at risk'. If it is not confidential, make it confidential and set its health status to 'needs attention'.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h41", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h41.py", "app_base_port": 8050}}}
+{"query_id": "infinity-handshake-career-exploration-task_h90", "dataset": "webarena-infinity", "query": "A student in the feed mentioned attending the NSBE conference. That student also answered a Q&A question about diversity programs in tech. Submit your own answer to that same question sharing your experience, then bookmark that student's feed post.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h90", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h90.py", "app_base_port": 8080}}}
+{"query_id": "infinity-elation-prescriptions-task_h30", "dataset": "webarena-infinity", "query": "The patient has three temporary medications. Discontinue the corticosteroid taper and the penicillin antibiotic \u2014 the patient completed both courses. Move the remaining temporary medication to permanent Rx.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h30", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h30.py", "app_base_port": 8020}}}
+{"query_id": "infinity-linear-account-settings-task_h19", "dataset": "webarena-infinity", "query": "Turn off all desktop application settings: open in desktop app, notification badge, and spell check.", "graders": ["infinity_state"], "start_url": "http://localhost:8090", "metadata": {"original_task_id": "linear-account-settings-task_h19", "website": "linear-account-settings", "category": "webarena-infinity", "additional": {"app_name": "linear-account-settings", "difficulty": "hard", "verifier_path": "real-tasks/task_h19.py", "app_base_port": 8090}}}
+{"query_id": "infinity-elation-prescriptions-task_h39", "dataset": "webarena-infinity", "query": "Change the default pharmacy to Express Scripts Mail Pharmacy for mail-order prescriptions. Then document that the patient takes Magnesium Citrate 400mg tablet as an OTC supplement \u2014 once daily at bedtime, 30-day supply.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h39", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h39.py", "app_base_port": 8020}}}
+{"query_id": "infinity-handshake-career-exploration-task_h136", "dataset": "webarena-infinity", "query": "Your earliest completed appointment was a specific type. Schedule a follow-up appointment of the same category and type with the same staff member, for March 28, 2026 at 9:00 AM, in person.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h136", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h136.py", "app_base_port": 8080}}}
+{"query_id": "infinity-handshake-career-exploration-task_h105", "dataset": "webarena-infinity", "query": "Find the second-most-viewed question in Q&A. It has two answers \u2014 mark the one with fewer helpful votes as helpful.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h105", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h105.py", "app_base_port": 8080}}}
+{"query_id": "infinity-gmail-accounts-and-contacts-task_h22", "dataset": "webarena-infinity", "query": "The Engineering Manager at TechCorp is listed as one of your delegates. Remove her delegation and unstar her contact.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h22", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h22.py", "app_base_port": 8070}}}
+{"query_id": "infinity-elation-patient-communication-task_h9", "dataset": "webarena-infinity", "query": "Acknowledge all unacknowledged reminders in the system.", "graders": ["infinity_state"], "start_url": "http://localhost:8010", "metadata": {"original_task_id": "elation-patient-communication-task_h9", "website": "elation-patient-communication", "category": "webarena-infinity", "additional": {"app_name": "elation-patient-communication", "difficulty": "hard", "verifier_path": "real-tasks/task_h9.py", "app_base_port": 8010}}}
+{"query_id": "infinity-superhuman-general-task_h1", "dataset": "webarena-infinity", "query": "Label the FinancePlus partnership email and the QuantumLab prototype email as 'Clients'.", "graders": ["infinity_state"], "start_url": "http://localhost:8110", "metadata": {"original_task_id": "superhuman-general-task_h1", "website": "superhuman-general", "category": "webarena-infinity", "additional": {"app_name": "superhuman-general", "difficulty": "hard", "verifier_path": "real-tasks/task_h1.py", "app_base_port": 8110}}}
+{"query_id": "infinity-xero-invoicing-task_h79", "dataset": "webarena-infinity", "query": "Change the invoice prefix to 'AUS-' and the next number to 100, then create a new invoice for CloudNine Analytics for 8 hours of UI/UX design work.", "graders": ["infinity_state"], "start_url": "http://localhost:8120", "metadata": {"original_task_id": "xero-invoicing-task_h79", "website": "xero-invoicing", "category": "webarena-infinity", "additional": {"app_name": "xero-invoicing", "difficulty": "hard", "verifier_path": "real-tasks/task_h79.py", "app_base_port": 8120}}}
+{"query_id": "infinity-figma-slides-task_h16", "dataset": "webarena-infinity", "query": "Enable slide numbers on every slide using the 'with total' format and change the aspect ratio to 4:3.", "graders": ["infinity_state"], "start_url": "http://localhost:8030", "metadata": {"original_task_id": "figma-slides-task_h16", "website": "figma-slides", "category": "webarena-infinity", "additional": {"app_name": "figma-slides", "difficulty": "hard", "verifier_path": "real-tasks/task_h16.py", "app_base_port": 8030}}}
+{"query_id": "infinity-linear-account-settings-task_h16", "dataset": "webarena-infinity", "query": "Revoke all API keys that have an expiration date.", "graders": ["infinity_state"], "start_url": "http://localhost:8090", "metadata": {"original_task_id": "linear-account-settings-task_h16", "website": "linear-account-settings", "category": "webarena-infinity", "additional": {"app_name": "linear-account-settings", "difficulty": "hard", "verifier_path": "real-tasks/task_h16.py", "app_base_port": 8090}}}
+{"query_id": "infinity-elation-prescriptions-task_h2", "dataset": "webarena-infinity", "query": "Prescribe Buspirone 10mg for the patient's anxiety \u2014 once daily in the morning, qty 30, 5 refills. Send it to the same pharmacy that fills his Sertraline.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h2", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h2.py", "app_base_port": 8020}}}
+{"query_id": "infinity-handshake-career-exploration-task_h1", "dataset": "webarena-infinity", "query": "Follow all consulting firms on Handshake.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h1", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h1.py", "app_base_port": 8080}}}
+{"query_id": "infinity-handshake-career-exploration-task_h141", "dataset": "webarena-infinity", "query": "Some of your saved jobs are from employers you haven't followed yet. Find and follow each of those employers.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h141", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h141.py", "app_base_port": 8080}}}
+{"query_id": "infinity-figma-text-and-typography-task_h74", "dataset": "webarena-infinity", "query": "Set the spelling language to Japanese, the big nudge amount to 50, and the default horizontal alignment to right.", "graders": ["infinity_state"], "start_url": "http://localhost:8040", "metadata": {"original_task_id": "figma-text-and-typography-task_h74", "website": "figma-text-and-typography", "category": "webarena-infinity", "additional": {"app_name": "figma-text-and-typography", "difficulty": "hard", "verifier_path": "real-tasks/task_h74.py", "app_base_port": 8040}}}
+{"query_id": "infinity-elation-patient-communication-task_h63", "dataset": "webarena-infinity", "query": "Check the visit summaries to find the patient whose BNP level improved. Reply to their most recent message confirming they can resume light activity, then update their emergency contact's phone number to (650) 555-0001.", "graders": ["infinity_state"], "start_url": "http://localhost:8010", "metadata": {"original_task_id": "elation-patient-communication-task_h63", "website": "elation-patient-communication", "category": "webarena-infinity", "additional": {"app_name": "elation-patient-communication", "difficulty": "hard", "verifier_path": "real-tasks/task_h63.py", "app_base_port": 8010}}}
+{"query_id": "infinity-elation-patient-communication-task_h14", "dataset": "webarena-infinity", "query": "Change Dr. Torres's notification timeframe to 'Do not notify me' and remove Dr. Torres from Dr. Chen's General Question routing.", "graders": ["infinity_state"], "start_url": "http://localhost:8010", "metadata": {"original_task_id": "elation-patient-communication-task_h14", "website": "elation-patient-communication", "category": "webarena-infinity", "additional": {"app_name": "elation-patient-communication", "difficulty": "hard", "verifier_path": "real-tasks/task_h14.py", "app_base_port": 8010}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h67", "dataset": "webarena-infinity", "query": "Delete all time entries from the GraphQL gateway issue, add a single new entry of 16 hours with summary 'Complete rewrite estimate', and set its time estimate to 40 hours.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h67", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h67.py", "app_base_port": 8050}}}
+{"query_id": "infinity-gmail-accounts-and-contacts-task_h73", "dataset": "webarena-infinity", "query": "Among the individual people in your other contacts (those with a first and last name), find the one who was saved most recently. Move them to your main contacts, set their company to 'Salesforce', job title to 'Account Executive', and add the Work label.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h73", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h73.py", "app_base_port": 8070}}}
+{"query_id": "infinity-elation-prescriptions-task_h4", "dataset": "webarena-infinity", "query": "Run a medication reconciliation and mark the Calcium+D3 supplement for discontinuation during the review.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h4", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h4.py", "app_base_port": 8020}}}
+{"query_id": "infinity-elation-prescriptions-task_h47", "dataset": "webarena-infinity", "query": "The patient's SSRI is currently dispensed at a different pharmacy than most of his other medications. Prescribe a refill of the same SSRI at the same dose and sig, but send it to CVS #4521 instead \u2014 qty 30, 5 refills, 30 days supply.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h47", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h47.py", "app_base_port": 8020}}}
+{"query_id": "infinity-paypal-my-wallet-task_h89", "dataset": "webarena-infinity", "query": "If your USD PayPal balance is above $2,500, convert $500 to Japanese Yen. If it is $2,500 or below, first add $500 from your Chase bank account, then convert $500 to JPY. Either way, set the debit card cash back category to Fuel.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h89", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h89.py", "app_base_port": 8100}}}
--- a/packages/browseros-agent/apps/eval/scripts/agisdk-evaluate.py
+++ b/packages/browseros-agent/apps/eval/scripts/agisdk-evaluate.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""
+AGI SDK evaluation helper for BrowserOS eval framework.
+
+Reads JSON from stdin with task_id and env_state, runs the agisdk
+evaluator, and outputs the result as JSON to stdout.
+
+Input format:
+    {"task_id": "dashdish-1", "env_state": {...}, "model_response": ""}
+
+Output format:
+    {"reward": 0.0, "pass": false, "message": "...", "per_criterion": [...]}
+"""
+
+import json
+import sys
+
+
+def main():
+    data = json.loads(sys.stdin.read())
+    task_id = data["task_id"]
+    env_state = data["env_state"]
+    model_response = data.get("model_response", "")
+
+    try:
+        from agisdk.REAL.browsergym.webclones.evaluate import WebCloneEvaluator
+        from agisdk.REAL.browsergym.webclones.task_config import TaskConfig
+    except ImportError:
+        print(
+            json.dumps(
+                {
+                    "reward": 0,
+                    "pass": False,
+                    "message": "agisdk package not installed. Run: pip install agisdk",
+                    "per_criterion": [],
+                }
+            )
+        )
+        sys.exit(0)
+
+    try:
+        # Redirect stdout to stderr during evaluation — agisdk's rich logger
+        # prints directly to stdout, which would corrupt our JSON output
+        real_stdout = sys.stdout
+        sys.stdout = sys.stderr
+
+        tc = TaskConfig(task_id)
+        evaluator = WebCloneEvaluator(tc)
+        reward_val, _done, message, info = evaluator.evaluate(
+            env_state=env_state, model_response=model_response
+        )
+
+        sys.stdout = real_stdout
+
+        reward_val = float(reward_val) if reward_val is not None else 0.0
+        results = info.get("results", [])
+        per_criterion = [
+            {"passed": r[0], "detail": str(r[1]) if len(r) > 1 else ""}
+            for r in results
+        ]
+
+        print(
+            json.dumps(
+                {
+                    "reward": reward_val,
+                    "pass": reward_val == 1.0,
+                    "message": str(message),
+                    "per_criterion": per_criterion,
+                }
+            )
+        )
+
+    except Exception as e:
+        sys.stdout = real_stdout if "real_stdout" in dir() else sys.__stdout__
+        print(
+            json.dumps(
+                {
+                    "reward": 0,
+                    "pass": False,
+                    "message": f"Evaluation error: {str(e)}",
+                    "per_criterion": [],
+                }
+            )
+        )
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    main()
--- a/packages/browseros-agent/apps/eval/scripts/build-agisdk-dataset.py
+++ b/packages/browseros-agent/apps/eval/scripts/build-agisdk-dataset.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+"""
+Build JSONL dataset for AGI SDK / REAL Bench evaluation.
+
+Reads task definitions from the agisdk package, filters to feasible
+action-only tasks (excludes llm_boolean evaluators), and outputs JSONL
+to stdout in the BrowserOS eval framework format.
+
+Usage:
+    python scripts/build-agisdk-dataset.py > data/agisdk-real.jsonl
+"""
+
+import json
+import sys
+
+
+def has_llm_eval(task: dict) -> bool:
+    """Return True when any entry of the task's "evals" list has type "llm_boolean"."""
+    for evaluator in task.get("evals", []):
+        if evaluator.get("type") == "llm_boolean":
+            return True
+    return False
+
+
+def main():
+    """Write one JSONL record per usable agisdk task to stdout.
+
+    Tasks whose "possible" flag is falsy and tasks that use an llm_boolean
+    evaluator are counted and skipped; tasks without a URL or goal are
+    skipped with a warning. Warnings and the final summary go to stderr so
+    stdout can be redirected straight into a dataset file. Exits with
+    status 1 when the agisdk package cannot be imported.
+    """
+    try:
+        from agisdk.REAL.tasks import all_tasks
+    except ImportError:
+        print(
+            "Error: agisdk package not installed. Run: pip install agisdk",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    written = 0
+    infeasible = 0
+    llm_graded = 0
+
+    for task in all_tasks:
+        # A missing "possible" flag counts as feasible.
+        if not task.get("possible", True):
+            infeasible += 1
+            continue
+
+        if has_llm_eval(task):
+            llm_graded += 1
+            continue
+
+        task_id = task["id"]
+        site = task.get("website", {})
+        goal = task.get("goal", "")
+        start_url = site.get("url", "")
+
+        # Both a start URL and a goal are required to build a record.
+        if not (start_url and goal):
+            print(f"Warning: Skipping {task_id} — missing url or goal", file=sys.stderr)
+            continue
+
+        extra = {
+            "agisdk_task_id": task_id,
+            "challenge_type": task.get("challengeType", "action"),
+            "difficulty": task.get("difficulty", "unknown"),
+            "similar_to": site.get("similarTo", ""),
+        }
+        metadata = {
+            "original_task_id": task_id,
+            "website": site.get("name", ""),
+            "category": "agisdk-real",
+            "additional": extra,
+        }
+        record = {
+            "query_id": f"agisdk-{task_id}",
+            "dataset": "agisdk-real",
+            "query": goal,
+            "graders": ["agisdk_state_diff"],
+            "start_url": start_url,
+            "metadata": metadata,
+        }
+
+        print(json.dumps(record))
+        written += 1
+
+    print(
+        f"Generated {written} tasks (skipped {infeasible} infeasible, "
+        f"{llm_graded} llm_boolean)",
+        file=sys.stderr,
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/packages/browseros-agent/apps/eval/scripts/build-infinity-dataset.py
+++ b/packages/browseros-agent/apps/eval/scripts/build-infinity-dataset.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""
+Dataset generator for WebArena-Infinity benchmark.
+
+Reads real-tasks.json from each app directory and outputs JSONL
+in the eval framework's TaskSchema format.
+
+Usage:
+    python build-infinity-dataset.py --apps-dir /path/to/webarena-infinity/apps
+    python build-infinity-dataset.py --apps-dir /path/to/apps --apps gmail linear --difficulty medium
+"""
+
+import argparse
+import json
+import os
+import sys
+
+
+def load_tasks(app_dir: str, app_name: str) -> list[dict]:
+    """Load the task definitions stored in ``<app_dir>/real-tasks.json``.
+
+    Args:
+        app_dir: Directory of a single WebArena-Infinity app.
+        app_name: Name of the app. Not used by the lookup; kept so existing
+            callers keep working.
+
+    Returns:
+        The parsed JSON content of real-tasks.json, or an empty list (after a
+        warning on stderr) when the file does not exist.
+    """
+    tasks_file = os.path.join(app_dir, "real-tasks.json")
+    if not os.path.exists(tasks_file):
+        print(f"Warning: No real-tasks.json found in {app_dir}", file=sys.stderr)
+        return []
+    # JSON is UTF-8 by specification; do not depend on the platform's locale
+    # encoding, which can mis-decode non-ASCII task text.
+    with open(tasks_file, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def build_task_entry(
+    app_name: str,
+    task: dict,
+    base_port: int,
+) -> dict:
+    """Build the eval-framework record for one WebArena-Infinity task.
+
+    Args:
+        app_name: App the task belongs to; used in the query id and metadata.
+        task: Raw task dict. Alternative key spellings are accepted:
+            "id"/"task_id", "query"/"instruction"/"task" and
+            "verify"/"verifier_path".
+        base_port: Port used to build the localhost start and reset URLs.
+
+    Returns:
+        A dict ready to be serialized as one JSONL line.
+    """
+    task_id = task.get("id", task.get("task_id", "unknown"))
+    app_url = f"http://localhost:{base_port}"
+
+    # Verifier location defaults to real-tasks/<task_id>.py when the task
+    # does not name one itself.
+    fallback_verifier = f"real-tasks/{task_id}.py"
+    verifier_path = task.get("verify", task.get("verifier_path", fallback_verifier))
+
+    additional = {
+        "app_name": app_name,
+        "difficulty": task.get("difficulty", "unknown"),
+        "verifier_path": verifier_path,
+        "app_port": base_port,
+    }
+    metadata = {
+        "original_task_id": f"{app_name}-{task_id}",
+        "website": app_name,
+        "category": "webarena-infinity",
+        "additional": additional,
+    }
+    return {
+        "query_id": f"infinity-{app_name}-{task_id}",
+        "dataset": "webarena-infinity",
+        "query": task.get("query", task.get("instruction", task.get("task", ""))),
+        "graders": ["infinity_state"],
+        "start_url": app_url,
+        "setup_script": f"POST {app_url}/api/reset",
+        "metadata": metadata,
+    }
+
+
+def main():
+    """Parse CLI options and print one JSONL task record per selected task.
+
+    Entries of --apps-dir are visited in sorted name order. Each app
+    directory gets its own port, counting up from --base-port; the counter
+    advances for every app directory, including ones that yield no tasks.
+    Exits with status 1 when --apps-dir is not a directory.
+    """
+    parser = argparse.ArgumentParser(
+        description="Generate JSONL dataset from WebArena-Infinity apps"
+    )
+    parser.add_argument(
+        "--apps-dir",
+        required=True,
+        help="Path to webarena-infinity/apps/ directory",
+    )
+    parser.add_argument(
+        "--apps",
+        nargs="*",
+        default=None,
+        help="Filter to specific app names (default: all)",
+    )
+    parser.add_argument(
+        "--difficulty",
+        choices=["easy", "medium", "hard"],
+        default=None,
+        help="Filter by difficulty tier",
+    )
+    parser.add_argument(
+        "--base-port",
+        type=int,
+        default=8000,
+        help="Starting port number for apps (default: 8000)",
+    )
+    args = parser.parse_args()
+
+    apps_root = args.apps_dir
+    if not os.path.isdir(apps_root):
+        print(f"Error: {apps_root} is not a directory", file=sys.stderr)
+        sys.exit(1)
+
+    # An omitted or empty --apps list selects every entry.
+    selected = [
+        name
+        for name in sorted(os.listdir(apps_root))
+        if not args.apps or name in args.apps
+    ]
+
+    next_port = args.base_port
+    for app_name in selected:
+        app_path = os.path.join(apps_root, app_name)
+        if not os.path.isdir(app_path):
+            continue
+
+        for task in load_tasks(app_path, app_name):
+            difficulty = task.get("difficulty", "unknown")
+            if args.difficulty and difficulty != args.difficulty:
+                continue
+            print(json.dumps(build_task_entry(app_name, task, next_port)))
+
+        next_port += 1
+
+
+if __name__ == "__main__":
+    main()
--- a/packages/browseros-agent/apps/eval/scripts/infinity-evaluate.py
+++ b/packages/browseros-agent/apps/eval/scripts/infinity-evaluate.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+Evaluation helper for WebArena-Infinity verifier scripts.
+
+Reads JSON from stdin with app_server_url, verifier_path, and task_id.
+Runs the verifier against the app server and outputs a JSON result.
+
+Verifiers have the signature: verify(server_url: str) -> tuple[bool, str]
+They fetch /api/state internally and return (passed, message).
+
+Usage:
+    echo '{"app_server_url": "http://localhost:8000", "verifier_path": "/path/to/verify.py"}' | python infinity-evaluate.py
+"""
+
+import importlib.util
+import json
+import sys
+import traceback
+
+
+def load_verifier(verifier_path: str):
+    """Import the Python file at verifier_path and return it as a module.
+
+    The file's top-level code is executed as part of loading. Raises
+    ImportError when no import spec or loader can be built for the path.
+    """
+    spec = importlib.util.spec_from_file_location("verifier", verifier_path)
+    loader = spec.loader if spec is not None else None
+    if loader is None:
+        raise ImportError(f"Cannot load verifier from {verifier_path}")
+    verifier_module = importlib.util.module_from_spec(spec)
+    loader.exec_module(verifier_module)
+    return verifier_module
+
+
+def main():
+    """Run one verifier and print its verdict as a single JSON object.
+
+    Reads a JSON object with "app_server_url" and "verifier_path" from
+    stdin, loads the verifier module, calls its verify(server_url) function
+    and prints {"pass", "reward", "message"} on stdout. On any input or
+    verifier error a failing result is printed and the process exits with
+    status 1.
+    """
+    # Local import: only this function needs it.
+    import contextlib
+
+    try:
+        data = json.loads(sys.stdin.read())
+    except json.JSONDecodeError as e:
+        print(json.dumps({"pass": False, "reward": 0.0, "message": f"Invalid JSON input: {e}"}))
+        sys.exit(1)
+
+    # Valid JSON that is not an object (list, string, ...) has no .get().
+    if not isinstance(data, dict):
+        print(json.dumps({
+            "pass": False,
+            "reward": 0.0,
+            "message": "Invalid JSON input: expected an object",
+        }))
+        sys.exit(1)
+
+    server_url = data.get("app_server_url", "")
+    verifier_path = data.get("verifier_path", "")
+
+    if not server_url or not verifier_path:
+        print(json.dumps({
+            "pass": False,
+            "reward": 0.0,
+            "message": "Missing app_server_url or verifier_path",
+        }))
+        sys.exit(1)
+
+    try:
+        # Anything the verifier module prints would otherwise land on stdout
+        # and corrupt the single JSON object the caller parses, so route it
+        # to stderr while verifier code runs.
+        with contextlib.redirect_stdout(sys.stderr):
+            verifier = load_verifier(verifier_path)
+            fn = getattr(verifier, "verify", None)
+            if not callable(fn):
+                raise AttributeError(
+                    f"Verifier has no verify() function. "
+                    f"Available: {[a for a in dir(verifier) if not a.startswith('_')]}"
+                )
+
+            # Verifiers take server_url and fetch state internally
+            result = fn(server_url)
+
+        # Expected return is tuple[bool, str]; anything else is judged by
+        # truthiness. Coerce to a real bool so json.dumps always emits
+        # true/false and never fails on a non-serializable truthy value.
+        if isinstance(result, tuple) and len(result) >= 2:
+            passed, message = bool(result[0]), str(result[1])
+        else:
+            passed, message = bool(result), str(result)
+
+    except Exception as e:
+        print(json.dumps({
+            "pass": False,
+            "reward": 0.0,
+            "message": f"Verifier error: {e}\n{traceback.format_exc()}",
+        }))
+        sys.exit(1)
+
+    print(json.dumps({
+        "pass": passed,
+        "reward": 1.0 if passed else 0.0,
+        "message": message,
+    }))
+
+
+if __name__ == "__main__":
+    main()
--- a/packages/browseros-agent/apps/eval/scripts/weekly-report.ts
+++ b/packages/browseros-agent/apps/eval/scripts/weekly-report.ts
@@ -59,6 +59,8 @@ interface RunSummary {
 }

 const PASS_FAIL_GRADER_ORDER = [
+  'agisdk_state_diff',
+  'infinity_state',
  'performance_grader',
  'webvoyager_grader',
  'fara_combined',
--- a/packages/browseros-agent/apps/eval/src/graders/benchmark/agisdk-state-diff.ts
+++ b/packages/browseros-agent/apps/eval/src/graders/benchmark/agisdk-state-diff.ts
@@ -0,0 +1,202 @@
+import { spawn } from 'node:child_process'
+import { join } from 'node:path'
+import type { GraderResult } from '../../types'
+import { callMcpTool } from '../../utils/mcp-client'
+import type { Grader, GraderInput } from '../types'
+
+// Path of the Python helper spawned by runPythonEvaluator, resolved relative
+// to this module's directory (three levels up, then scripts/).
+const EVAL_SCRIPT = join(
+  import.meta.dirname,
+  '..',
+  '..',
+  '..',
+  'scripts',
+  'agisdk-evaluate.py',
+)
+
+/**
+ * Grades AGI SDK / REAL Bench tasks without an LLM judge.
+ *
+ * The grader navigates the browser to the clone site's /finish page, reads
+ * the JSON rendered there and hands it to scripts/agisdk-evaluate.py; the
+ * script's JSON verdict becomes the grader result. Failures along the way
+ * are reported as a failing GraderResult rather than thrown.
+ */
+export class AgisdkStateDiffGrader implements Grader {
+  name = 'agisdk_state_diff'
+
+  /**
+   * Grade one task run.
+   *
+   * @param input Grader input; uses task.query_id, messages, finalAnswer and
+   *   the optional mcpUrl.
+   * @returns The evaluator's reward/pass verdict, or a score-0 failure with
+   *   the reason when the URL, the /finish state or the evaluator fails.
+   */
+  async grade(input: GraderInput): Promise<GraderResult> {
+    const taskId = this.extractTaskId(input.task.query_id)
+    const startUrl = this.extractStartUrl(input)
+    const mcpEndpoint =
+      input.mcpUrl ||
+      `${process.env.BROWSEROS_SERVER_URL || 'http://127.0.0.1:9110'}/mcp`
+
+    if (!startUrl) {
+      return {
+        score: 0,
+        pass: false,
+        reasoning: 'Could not determine clone site URL from task',
+      }
+    }
+
+    // new URL() throws on malformed input; report that as a failed grade
+    // instead of letting the exception escape grade().
+    let origin: string
+    try {
+      origin = new URL(startUrl).origin
+    } catch {
+      return {
+        score: 0,
+        pass: false,
+        reasoning: `Invalid clone site URL derived from task: ${startUrl}`,
+        details: { error: true },
+      }
+    }
+
+    let envState: Record<string, unknown>
+    try {
+      envState = await this.fetchFinishState(origin, mcpEndpoint)
+    } catch (error) {
+      return {
+        score: 0,
+        pass: false,
+        reasoning: `Failed to fetch /finish endpoint: ${error instanceof Error ? error.message : String(error)}`,
+        details: { origin, error: true },
+      }
+    }
+
+    try {
+      const result = await this.runPythonEvaluator(
+        taskId,
+        envState,
+        input.finalAnswer || '',
+      )
+      return {
+        score: result.reward,
+        pass: result.pass,
+        reasoning:
+          result.message ||
+          (result.pass ? 'All criteria passed' : 'Some criteria failed'),
+        details: {
+          reward: result.reward,
+          per_criterion: result.per_criterion,
+          origin,
+          agisdk_task_id: taskId,
+        },
+      }
+    } catch (error) {
+      return {
+        score: 0,
+        pass: false,
+        reasoning: `Python evaluator error: ${error instanceof Error ? error.message : String(error)}`,
+        details: { error: true },
+      }
+    }
+  }
+
+  /** Strip the leading "agisdk-" dataset prefix from a query id. */
+  private extractTaskId(queryId: string): string {
+    return queryId.replace(/^agisdk-/, '')
+  }
+
+  /**
+   * Work out the clone site URL for a task.
+   *
+   * The site id is the task id minus its trailing "-<number>". Only when
+   * that leaves an empty string are the messages searched for a vercel.app
+   * URL. Returns null when neither source yields a URL.
+   */
+  private extractStartUrl(input: GraderInput): string | null {
+    // Derive from task_id: "dashdish-10" → "https://evals-dashdish.vercel.app"
+    // Task IDs are "{site}-{number}" where site may contain hyphens (e.g. "fly-unified-5")
+    const taskId = this.extractTaskId(input.task.query_id)
+    const siteId = taskId.replace(/-\d+$/, '')
+    if (siteId) return `https://evals-${siteId}.vercel.app`
+
+    // Fallback: search messages for vercel.app URLs
+    for (const msg of input.messages) {
+      const text =
+        msg.type === 'user'
+          ? msg.content
+          : msg.type === 'tool-input-available'
+            ? JSON.stringify(msg.input)
+            : ''
+      const urlMatch = text.match(/https?:\/\/[^\s"']+\.vercel\.app/)
+      if (urlMatch) return urlMatch[0]
+    }
+
+    return null
+  }
+
+  /**
+   * Navigate the browser to `${origin}/finish` and return the JSON object
+   * found in the page's first <pre> element.
+   *
+   * NOTE(review): both MCP calls target page 1 — confirm that this is always
+   * the page the agent worked on.
+   *
+   * @throws When the MCP calls fail, the page never shows JSON, no text
+   *   content comes back, or the text is not valid JSON.
+   */
+  private async fetchFinishState(
+    origin: string,
+    mcpEndpoint: string,
+  ): Promise<Record<string, unknown>> {
+    const finishUrl = `${origin}/finish`
+
+    // Navigate browser to /finish page (state diff is rendered client-side)
+    await callMcpTool(mcpEndpoint, 'navigate_page', {
+      url: finishUrl,
+      page: 1,
+    })
+
+    // In-page poll: check for a <pre> whose text starts with "{" every
+    // 500 ms and reject once more than 20 retries have been used up.
+    const result = await callMcpTool(mcpEndpoint, 'evaluate_script', {
+      page: 1,
+      expression: `
+        new Promise((resolve, reject) => {
+          let attempts = 0;
+          const check = () => {
+            const pre = document.querySelector('pre');
+            if (pre && pre.textContent.trim().startsWith('{')) {
+              resolve(pre.textContent);
+            } else if (++attempts > 20) {
+              reject(new Error('Timed out waiting for <pre> JSON on /finish'));
+            } else {
+              setTimeout(check, 500);
+            }
+          };
+          check();
+        })
+      `,
+    })
+
+    const textContent = result.content?.find(
+      (c: { type: string }) => c.type === 'text',
+    )
+    if (!textContent?.text) {
+      throw new Error('No text content returned from /finish page')
+    }
+
+    return JSON.parse(textContent.text) as Record<string, unknown>
+  }
+
+  /**
+   * Run scripts/agisdk-evaluate.py with the task id, environment state and
+   * model response as one JSON object on stdin, and parse the JSON it
+   * prints on stdout.
+   *
+   * Rejects when the process cannot be spawned, exits non-zero, or prints
+   * output that is not valid JSON.
+   */
+  private runPythonEvaluator(
+    taskId: string,
+    envState: Record<string, unknown>,
+    modelResponse: string,
+  ): Promise<{
+    reward: number
+    pass: boolean
+    message: string
+    per_criterion: unknown[]
+  }> {
+    return new Promise((resolve, reject) => {
+      const proc = spawn('python3', [EVAL_SCRIPT], {
+        stdio: ['pipe', 'pipe', 'pipe'],
+      })
+
+      const inputData = JSON.stringify({
+        task_id: taskId,
+        env_state: envState,
+        model_response: modelResponse,
+      })
+
+      let stdout = ''
+      let stderr = ''
+      let stdinError: Error | null = null
+
+      proc.stdout.on('data', (data: Buffer) => {
+        stdout += data.toString()
+      })
+
+      proc.stderr.on('data', (data: Buffer) => {
+        stderr += data.toString()
+      })
+
+      // If the evaluator dies before it has read its input, the write below
+      // fails (EPIPE). A stream 'error' without a listener is thrown as an
+      // uncaught exception, so record it here and let the 'close' / 'error'
+      // handlers settle the promise.
+      proc.stdin.on('error', (err: Error) => {
+        stdinError = err
+      })
+
+      proc.on('close', (code) => {
+        if (code !== 0) {
+          const stdinNote = stdinError
+            ? ` (stdin write failed: ${stdinError.message})`
+            : ''
+          reject(
+            new Error(
+              `Python evaluator exited with code ${code}: ${stderr}${stdinNote}`,
+            ),
+          )
+          return
+        }
+
+        try {
+          const result = JSON.parse(stdout.trim())
+          resolve(result)
+        } catch {
+          reject(new Error(`Failed to parse evaluator output: ${stdout}`))
+        }
+      })
+
+      proc.on('error', (err) => {
+        reject(new Error(`Failed to spawn Python evaluator: ${err.message}`))
+      })
+
+      proc.stdin.write(inputData)
+      proc.stdin.end()
+    })
+  }
+}
--- a/packages/browseros-agent/apps/eval/src/graders/benchmark/infinity-state.ts
+++ b/packages/browseros-agent/apps/eval/src/graders/benchmark/infinity-state.ts
@@ -0,0 +1,134 @@
+import { join, resolve } from 'node:path'
+import type { GraderResult } from '../../types'
+import type { Grader, GraderInput } from '../types'
+
+// JSON payload written to the stdin of scripts/infinity-evaluate.py.
+interface InfinityEvalInput {
+  app_server_url: string
+  verifier_path: string
+  task_id: string
+}
+
+// JSON object parsed from the evaluator script's stdout.
+interface InfinityEvalOutput {
+  pass: boolean
+  reward: number
+  message: string
+}
+
+// Location of the Python evaluator, resolved relative to this module's
+// directory.
+const EVAL_SCRIPT = resolve(
+  import.meta.dir,
+  '../../../scripts/infinity-evaluate.py',
+)
+
+/**
+ * Grades WebArena-Infinity tasks by running the task's Python verifier
+ * against the app server through scripts/infinity-evaluate.py.
+ *
+ * The verifier file is located at
+ * $WEBARENA_INFINITY_DIR/apps/<app>/real-tasks/<task_id>.py, with app and
+ * task id parsed from the query id.
+ */
+export class InfinityStateGrader implements Grader {
+  name = 'infinity_state'
+
+  /**
+   * Grade one task run.
+   *
+   * @param input Grader input; uses task.query_id and infinityAppUrl.
+   * @returns Score 1 / pass when the verifier passes; otherwise score 0 with
+   *   the verifier's message or the reason the verifier could not be run.
+   */
+  async grade(input: GraderInput): Promise<GraderResult> {
+    const parsed = this.parseQueryId(input.task.query_id)
+    if (!parsed) {
+      return {
+        score: 0,
+        pass: false,
+        reasoning: `Cannot parse query_id "${input.task.query_id}" — expected format: infinity-{app}-{task_id}`,
+      }
+    }
+
+    const appServerUrl = this.resolveAppServerUrl(input)
+    if (!appServerUrl) {
+      return {
+        score: 0,
+        pass: false,
+        reasoning: 'Cannot determine app server URL',
+      }
+    }
+
+    const infinityDir = process.env.WEBARENA_INFINITY_DIR
+    if (!infinityDir) {
+      return {
+        score: 0,
+        pass: false,
+        reasoning:
+          'WEBARENA_INFINITY_DIR env var not set. Point it to the webarena-infinity repo root.',
+      }
+    }
+
+    const verifierPath = join(
+      infinityDir,
+      'apps',
+      parsed.appName,
+      'real-tasks',
+      `${parsed.taskId}.py`,
+    )
+
+    const evalInput: InfinityEvalInput = {
+      app_server_url: appServerUrl,
+      verifier_path: verifierPath,
+      task_id: input.task.query_id,
+    }
+
+    try {
+      const result = await this.runPythonEvaluator(evalInput)
+      return {
+        score: result.pass ? 1 : 0,
+        pass: result.pass,
+        reasoning: result.message,
+        details: {
+          reward: result.reward,
+          app_name: parsed.appName,
+          app_server_url: appServerUrl,
+        },
+      }
+    } catch (error) {
+      return {
+        score: 0,
+        pass: false,
+        reasoning: `Evaluator process error: ${error instanceof Error ? error.message : String(error)}`,
+      }
+    }
+  }
+
+  /**
+   * Split "infinity-<app>-<task_id>" into its app name and task id.
+   * Returns null when the query id does not have that shape.
+   */
+  private parseQueryId(
+    queryId: string,
+  ): { appName: string; taskId: string } | null {
+    // Task IDs start with "task_", app names may contain hyphens
+    // e.g. "infinity-elation-prescriptions-task_h69"
+    const match = queryId.match(/^infinity-(.+)-(task_.+)$/)
+    if (!match) return null
+    return { appName: match[1], taskId: match[2] }
+  }
+
+  /**
+   * Pick the app server URL: the one supplied with the grader input wins,
+   * then the INFINITY_APP_URL env var; null when neither is set.
+   */
+  private resolveAppServerUrl(input: GraderInput): string | null {
+    // Passed directly from task executor (started by InfinityAppManager)
+    if (input.infinityAppUrl) return input.infinityAppUrl
+
+    // Fallback: env var for manual testing
+    if (process.env.INFINITY_APP_URL) return process.env.INFINITY_APP_URL
+
+    return null
+  }
+
+  /**
+   * Spawn the Python evaluator, send evalInput as JSON on stdin and parse
+   * the JSON result it prints on stdout.
+   *
+   * @throws When the process exits non-zero (message carries stderr, or
+   *   stdout when stderr is empty) or when stdout is not valid JSON.
+   */
+  private async runPythonEvaluator(
+    evalInput: InfinityEvalInput,
+  ): Promise<InfinityEvalOutput> {
+    const proc = Bun.spawn(['python3', EVAL_SCRIPT], {
+      stdin: 'pipe',
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+
+    const inputJson = JSON.stringify(evalInput)
+    proc.stdin.write(inputJson)
+    proc.stdin.end()
+
+    // Drain both pipes at the same time. Reading stdout to EOF before
+    // touching stderr can stall when the child blocks on a full stderr pipe
+    // and therefore never closes stdout.
+    const [stdout, stderr, exitCode] = await Promise.all([
+      new Response(proc.stdout).text(),
+      new Response(proc.stderr).text(),
+      proc.exited,
+    ])
+
+    if (exitCode !== 0) {
+      throw new Error(
+        `Python evaluator exited with code ${exitCode}: ${stderr || stdout}`,
+      )
+    }
+
+    try {
+      return JSON.parse(stdout.trim()) as InfinityEvalOutput
+    } catch {
+      // Include the raw output; a bare JSON.parse error hides what was printed.
+      throw new Error(`Failed to parse evaluator output: ${stdout}`)
+    }
+  }
+}
--- a/packages/browseros-agent/apps/eval/src/graders/registry.ts
+++ b/packages/browseros-agent/apps/eval/src/graders/registry.ts
@@ -1,4 +1,6 @@
 import type { GraderResult } from '../types'
+import { AgisdkStateDiffGrader } from './benchmark/agisdk-state-diff'
+import { InfinityStateGrader } from './benchmark/infinity-state'
 import { Mind2WebJudgeGrader } from './benchmark/mind2web'
 import { WebVoyagerGrader } from './benchmark/webvoyager'
 import { FaraAlignmentGrader } from './fara/alignment'
@@ -19,7 +21,13 @@ export function createGrader(
  options: GraderOptions | null,
 ): Grader | null {
  switch (name) {
-    // Benchmark graders
+    // Deterministic benchmark graders (no LLM judge)
+    case 'agisdk_state_diff':
+      return new AgisdkStateDiffGrader()
+    case 'infinity_state':
+      return new InfinityStateGrader()
+
+    // LLM-based benchmark graders
    case 'webvoyager_grader':
      if (!options?.apiKey) return null
      return new WebVoyagerGrader(
@@ -107,10 +115,12 @@ export async function runGraders(

 // Export grader classes for direct use
 export {
+  AgisdkStateDiffGrader,
  FaraAlignmentGrader,
  FaraCombinedGrader,
  FaraMultimodalGrader,
  FaraRubricGrader,
+  InfinityStateGrader,
  Mind2WebJudgeGrader,
  PerformanceGrader,
  WebVoyagerGrader,
--- a/packages/browseros-agent/apps/eval/src/graders/types.ts
+++ b/packages/browseros-agent/apps/eval/src/graders/types.ts
@@ -11,6 +11,8 @@ export interface GraderInput {
  finalAnswer: string | null
  expectedAnswer?: string | null
  outputDir: string
+  mcpUrl?: string
+  infinityAppUrl?: string
 }

 export interface Grader {
--- a/packages/browseros-agent/apps/eval/src/runner/infinity-app-manager.ts
+++ b/packages/browseros-agent/apps/eval/src/runner/infinity-app-manager.ts
@@ -0,0 +1,89 @@
+/**
+ * Manages WebArena-Infinity app server lifecycle per task.
+ *
+ * Each worker gets a unique port: base_port + worker_index.
+ * Server is started fresh before each task and killed after,
+ * guaranteeing clean state.
+ */
+
+import { type ChildProcess, spawn } from 'node:child_process'
+import { join } from 'node:path'
+
+/**
+ * Starts and stops one WebArena-Infinity app server process.
+ *
+ * The server listens on basePort + workerIndex and is launched from
+ * $WEBARENA_INFINITY_DIR/apps/<appName>/server.py.
+ */
+export class InfinityAppManager {
+  private proc: ChildProcess | null = null
+  private port: number
+  private infinityDir: string
+
+  /**
+   * @param workerIndex Offset added to basePort to get this manager's port.
+   * @param basePort First port of the range (default 8000).
+   */
+  constructor(
+    private workerIndex: number,
+    private basePort: number = 8000,
+  ) {
+    this.port = basePort + workerIndex
+    this.infinityDir = process.env.WEBARENA_INFINITY_DIR || ''
+  }
+
+  /**
+   * Stop any server this manager still runs, launch the server of `appName`
+   * and wait until it answers HTTP requests.
+   *
+   * @returns The base URL of the running server.
+   * @throws When WEBARENA_INFINITY_DIR is unset, or when the server cannot
+   *   be spawned, exits early or does not become ready in time. In those
+   *   cases the process is stopped before the error is thrown.
+   */
+  async startApp(appName: string): Promise<string> {
+    await this.stop()
+
+    if (!this.infinityDir) {
+      throw new Error('WEBARENA_INFINITY_DIR env var not set')
+    }
+
+    const appDir = join(this.infinityDir, 'apps', appName)
+    const serverScript = join(appDir, 'server.py')
+    const diagnostics: { spawnError: Error | null; stderrTail: string } = {
+      spawnError: null,
+      stderrTail: '',
+    }
+
+    // stdout is discarded rather than piped: a pipe nobody reads fills up
+    // and blocks the server as soon as it logs enough output.
+    const proc = spawn('python3', [serverScript, '--port', String(this.port)], {
+      stdio: ['ignore', 'ignore', 'pipe'],
+      cwd: appDir,
+    })
+    this.proc = proc
+
+    // Without an 'error' listener a failed spawn (e.g. python3 missing) is
+    // thrown as an uncaught exception and takes the whole runner down.
+    proc.on('error', (err) => {
+      diagnostics.spawnError = err
+    })
+
+    // Keep stderr drained and remember only its tail for error messages.
+    proc.stderr?.on('data', (chunk: Buffer) => {
+      diagnostics.stderrTail = (
+        diagnostics.stderrTail + chunk.toString()
+      ).slice(-2000)
+    })
+
+    // Wait for server to be ready
+    const url = this.getUrl()
+    try {
+      await this.waitForReady(url)
+    } catch (error) {
+      // Do not leave a half-started server behind holding the port.
+      await this.stop()
+      const reason = error instanceof Error ? error.message : String(error)
+      const detail = diagnostics.spawnError
+        ? `spawn failed: ${diagnostics.spawnError.message}`
+        : diagnostics.stderrTail.trim()
+      throw new Error(detail ? `${reason}: ${detail}` : reason)
+    }
+    return url
+  }
+
+  /**
+   * Stop the server if one is running: send SIGTERM, then SIGKILL when it
+   * has not exited within 3 seconds. Does nothing when no server is tracked.
+   */
+  async stop(): Promise<void> {
+    const proc = this.proc
+    if (!proc) return
+    this.proc = null
+
+    // Already gone (exited or never spawned): no 'exit' event will come, so
+    // waiting for one would only burn the full timeout.
+    if (proc.exitCode !== null || proc.signalCode !== null) return
+
+    await new Promise<void>((resolve) => {
+      const timeout = setTimeout(() => {
+        proc.kill('SIGKILL')
+        resolve()
+      }, 3000)
+      proc.once('exit', () => {
+        clearTimeout(timeout)
+        resolve()
+      })
+      proc.kill('SIGTERM')
+    })
+  }
+
+  /** Port this manager's server listens on (basePort + workerIndex). */
+  getPort(): number {
+    return this.port
+  }
+
+  /** Base URL of this manager's server on localhost. */
+  getUrl(): string {
+    return `http://localhost:${this.port}`
+  }
+
+  /**
+   * Poll `url` until it answers with an OK status.
+   *
+   * @param maxAttempts Number of polls before giving up (default 30).
+   * @param intervalMs Pause between polls in milliseconds (default 500).
+   * @throws When the server process has exited, or when no OK response
+   *   arrived within maxAttempts polls.
+   */
+  private async waitForReady(
+    url: string,
+    maxAttempts = 30,
+    intervalMs = 500,
+  ): Promise<void> {
+    for (let i = 0; i < maxAttempts; i++) {
+      // A dead process will never become ready; fail now instead of
+      // polling until the attempts run out.
+      const proc = this.proc
+      if (proc && (proc.exitCode !== null || proc.signalCode !== null)) {
+        throw new Error(
+          `Infinity app server exited before becoming ready on port ${this.port}`,
+        )
+      }
+
+      try {
+        const resp = await fetch(url, {
+          signal: AbortSignal.timeout(2000),
+        })
+        if (resp.ok) return
+      } catch {
+        // Server not ready yet
+      }
+      await new Promise((r) => setTimeout(r, intervalMs))
+    }
+    throw new Error(
+      `Infinity app server not ready after ${maxAttempts * intervalMs}ms on port ${this.port}`,
+    )
+  }
+}
--- a/packages/browseros-agent/apps/eval/src/runner/task-executor.ts
+++ b/packages/browseros-agent/apps/eval/src/runner/task-executor.ts
@@ -9,6 +9,7 @@ import {
 import { runGraders } from '../graders/registry'
 import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
 import { callMcpTool } from '../utils/mcp-client'
+import { InfinityAppManager } from './infinity-app-manager'
 import type { GraderOptions, TaskResult } from './types'

 // ============================================================================
@@ -101,6 +102,36 @@ export class TaskExecutor {
    // Resolve page ID once — fresh browser has exactly one page
    const pageId = await this.resolveInitialPageId(mcpUrl)

+    // For Infinity tasks, start a fresh app server per task
+    let infinityManager: InfinityAppManager | null = null
+    let actualStartUrl = task.start_url
+
+    if (task.dataset === 'webarena-infinity') {
+      const appName = (task.metadata?.additional as Record<string, unknown>)
+        ?.app_name as string
+      const appBasePort =
+        ((task.metadata?.additional as Record<string, unknown>)
+          ?.app_base_port as number) || 8000
+      const workerIndex = this.config.browseros.base_server_port - 9110 // derive from port offset
+
+      if (appName && process.env.WEBARENA_INFINITY_DIR) {
+        infinityManager = new InfinityAppManager(workerIndex, appBasePort)
+        try {
+          actualStartUrl = await infinityManager.startApp(appName)
+          console.log(
+            `  Infinity app "${appName}" started on port ${infinityManager.getPort()}`,
+          )
+        } catch (error) {
+          throw new TaskExecutionError(
+            `Failed to start Infinity app: ${error instanceof Error ? error.message : String(error)}`,
+            task,
+            'navigation',
+            error instanceof Error ? error : undefined,
+          )
+        }
+      }
+    }
+
    try {
      // Phase 1: Set viewport + navigate to start URL
      try {
@@ -114,10 +145,10 @@ export class TaskExecutor {
        )
      }

-      if (task.start_url && task.start_url !== 'about:blank') {
+      if (actualStartUrl && actualStartUrl !== 'about:blank') {
        try {
          await callMcpTool(mcpUrl, 'navigate_page', {
-            url: task.start_url,
+            url: actualStartUrl,
            page: pageId,
          })
        } catch (error) {
@@ -134,7 +165,11 @@ export class TaskExecutor {
      const agentResult = await this.executeAgent(task, pageId)

      // Phase 3: Run graders
-      const graderResults = await this.runGraders(task, agentResult)
+      const graderResults = await this.runGraders(
+        task,
+        agentResult,
+        infinityManager?.getUrl(),
+      )

      const status =
        agentResult.metadata.termination_reason === 'timeout'
@@ -169,6 +204,11 @@ export class TaskExecutor {
      } catch {
        // Ignore cleanup errors
      }
+
+      // Stop Infinity app server if running
+      if (infinityManager) {
+        await infinityManager.stop().catch(() => {})
+      }
    }
  }

@@ -209,6 +249,7 @@ export class TaskExecutor {
  private async runGraders(
    task: Task,
    agentResult: AgentResult,
+    infinityAppUrl?: string,
  ): Promise<Record<string, GraderResult>> {
    const configGraders = this.config.graders ?? []
    const taskGraders = task.graders ?? []
@@ -234,6 +275,8 @@ export class TaskExecutor {
          expectedAnswer: (task.metadata?.additional as Record<string, unknown>)
            ?.answer as string | undefined,
          outputDir: join(this.outputDir, task.query_id),
+          mcpUrl: `${this.config.browseros.server_url}/mcp`,
+          infinityAppUrl,
        },
        this.deps.graderOptions,
      )
--- a/packages/browseros-agent/apps/eval/src/runner/types.ts
+++ b/packages/browseros-agent/apps/eval/src/runner/types.ts
@@ -100,6 +100,8 @@ export interface TaskResultSummary {
 // ============================================================================

 export const PASS_FAIL_GRADER_ORDER = [
+  'agisdk_state_diff',
+  'infinity_state',
  'performance_grader',
  'webvoyager_grader',
  'fara_combined',
--- a/packages/browseros-agent/apps/server/README.md
+++ b/packages/browseros-agent/apps/server/README.md
@@ -92,10 +92,6 @@ Skills are custom instruction sets that shape agent behavior:
 - **Loader** (`src/skills/loader.ts`) — loads skills from local and remote sources
 - **Remote sync** (`src/skills/remote-sync.ts`) — syncs skills from the BrowserOS cloud

-## Graph Executor (Workflows)
-
-The graph executor (`src/graph/executor.ts`) runs visual workflow graphs built in the BrowserOS workflow editor. Each node in the graph maps to agent actions, conditionals, or data transformations.
-
 ## Directory Structure

 ```
@@ -120,14 +116,12 @@ apps/server/
 │   │   ├── filesystem/
 │   │   └── ...
 │   ├── skills/                # Skills system
-│   ├── graph/                 # Workflow graph executor
 │   ├── lib/                   # Shared utilities
 │   └── rpc.ts                 # JSON-RPC type definitions
 ├── tests/
 │   ├── tools/                 # Tool-level tests
 │   ├── sdk/                   # SDK integration tests
 │   └── server.integration.test.ts
-├── graph/                     # Workflow graph definitions
 └── package.json
 ```

--- a/packages/browseros-agent/apps/server/package.json
+++ b/packages/browseros-agent/apps/server/package.json
@@ -1,6 +1,6 @@
 {
  "name": "@browseros/server",
-  "version": "0.0.80",
+  "version": "0.0.82",
  "description": "BrowserOS server",
  "type": "module",
  "main": "./src/index.ts",
--- a/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts
+++ b/packages/browseros-agent/apps/server/src/agent/ai-sdk-agent.ts
@@ -209,7 +209,7 @@ export class AiSdkAgent {
      userSystemPrompt: config.resolvedConfig.userSystemPrompt,
      exclude: excludeSections,
      isScheduledTask: config.resolvedConfig.isScheduledTask,
-      scheduledTaskWindowId: config.browserContext?.windowId,
+      scheduledTaskPageId: config.browserContext?.activeTab?.pageId,
      workspaceDir: config.resolvedConfig.workingDir,
      soulContent,
      isSoulBootstrap: isBootstrap,
--- a/packages/browseros-agent/apps/server/src/agent/mcp-builder.ts
+++ b/packages/browseros-agent/apps/server/src/agent/mcp-builder.ts
@@ -2,6 +2,7 @@ import { createMCPClient } from '@ai-sdk/mcp'
 import { TIMEOUTS } from '@browseros/shared/constants/timeouts'
 import type { BrowserContext } from '@browseros/shared/schemas/browser-context'
 import type { ToolSet } from 'ai'
+import { klavisStrataCache } from '../api/services/klavis/strata-cache'
 import type { KlavisClient } from '../lib/clients/klavis/klavis-client'
 import { logger } from '../lib/logger'
 import {
@@ -40,7 +41,8 @@ export async function buildMcpServerSpecs(
    deps.browserContext?.enabledMcpServers?.length
  ) {
    try {
-      const result = await deps.klavisClient.createStrata(
+      const result = await klavisStrataCache.getOrFetch(
+        deps.klavisClient,
        deps.browserosId,
        deps.browserContext.enabledMcpServers,
      )
--- a/packages/browseros-agent/apps/server/src/agent/prompt.ts
+++ b/packages/browseros-agent/apps/server/src/agent/prompt.ts
@@ -49,7 +49,7 @@ You do not have a filesystem workspace in this session. Return all results direc
  // Mode-aware framing
  if (options?.isScheduledTask) {
    role +=
-      '\n\nYou are running as a scheduled background task in a dedicated hidden browser window. Complete the task autonomously and report results.'
+      '\n\nYou are running as a scheduled background task on a system-managed hidden page. Complete the task autonomously and report results.'
  } else if (options?.chatMode) {
    role +=
      '\n\nYou are in read-only chat mode. You can observe pages but cannot interact with them, modify files, or store memories.'
@@ -238,7 +238,7 @@ When a task requires working on multiple pages simultaneously:
 7. **Never force-switch the user's active tab.** If you need user interaction on a background tab (e.g., login, CAPTCHA), tell the user which tab needs attention and let them switch manually.
 8. **Never navigate the user's current tab** during a multi-tab task. The current tab is the user's anchor — use it only for reading (snapshots, content extraction). All navigation should happen on background tabs.

-**Do NOT use \`create_hidden_window\` or \`new_hidden_page\` for user-requested tasks.** Hidden windows are invisible to the user and cannot be screenshotted. Use \`new_page\` (background mode) instead — tabs appear in the user's tab strip and can be inspected. Reserve hidden windows for automated/scheduled runs only.`
+**Do NOT use \`create_hidden_window\` or \`new_hidden_page\` for user-requested tasks.** Hidden pages are invisible to the user and do not appear in the user's tab strip. Use \`new_page\` (background mode) instead — tabs appear in the user's tab strip and can be inspected. Reserve hidden pages for automated/scheduled runs only.`

  if (!isNewTab) {
    executionContent += `
@@ -661,22 +661,24 @@ function getUserContext(

    if (options?.isScheduledTask) {
      pageCtx +=
-        '\nYou are running as a **scheduled background task** in a dedicated hidden browser window.'
+        '\nYou are running as a **scheduled background task** on a system-managed hidden page.'
    }

    pageCtx +=
      '\n\n**CRITICAL RULES:**\n1. **Do NOT call `get_active_page` or `list_pages` to find your starting page.** Use the **page ID from the Browser Context** directly.'

    if (options?.isScheduledTask) {
-      const windowRef = options.scheduledTaskWindowId
-        ? `\`windowId: ${options.scheduledTaskWindowId}\``
-        : 'the `windowId` from the Browser Context'
-      pageCtx += `\n2. **Always pass ${windowRef}** when calling \`new_page\` or \`new_hidden_page\`. Never omit the \`windowId\` parameter.`
+      const pageRef = options.scheduledTaskPageId
+        ? `starting page ID \`${options.scheduledTaskPageId}\``
+        : 'the page ID from the Browser Context'
+      pageCtx += `\n2. **Use ${pageRef} directly.** For additional browsing, prefer \`new_hidden_page\` so the work stays invisible to the user.`
      pageCtx +=
-        '\n3. **Do NOT close your dedicated hidden window** (via `close_window`). It is managed by the system and will be cleaned up automatically.'
+        '\n3. **Do NOT close your starting hidden page** (via `close_page` on that page ID). It is managed by the system and will be cleaned up automatically.'
      pageCtx +=
-        '\n4. **Do NOT create new windows** (via `create_window` or `create_hidden_window`). Use your existing hidden window for all pages.'
-      pageCtx += '\n5. Complete the task end-to-end and report results.'
+        '\n4. **Do NOT create new windows** (via `create_window` or `create_hidden_window`). Use hidden pages instead.'
+      pageCtx +=
+        '\n5. **Close extra hidden pages when you are done with them** unless you explicitly reveal them with `show_page`.'
+      pageCtx += '\n6. Complete the task end-to-end and report results.'
    }

    pageCtx += '\n</page_context>'
@@ -737,7 +739,7 @@ export interface BuildSystemPromptOptions {
  userSystemPrompt?: string
  exclude?: string[]
  isScheduledTask?: boolean
-  scheduledTaskWindowId?: number
+  scheduledTaskPageId?: number
  workspaceDir?: string
  soulContent?: string
  isSoulBootstrap?: boolean
--- a/packages/browseros-agent/apps/server/src/agent/session-store.ts
+++ b/packages/browseros-agent/apps/server/src/agent/session-store.ts
@@ -4,8 +4,8 @@ import type { AiSdkAgent } from './ai-sdk-agent'

 export interface AgentSession {
  agent: AiSdkAgent
-  hiddenWindowId?: number
-  /** Browser context scoped to the hidden window (scheduled tasks only) */
+  hiddenPageId?: number
+  /** Browser context scoped to the scheduled hidden page. */
  browserContext?: BrowserContext
  /** MCP server names used when the session was created, for change detection. */
  mcpServerKey?: string
--- a/packages/browseros-agent/apps/server/src/api/routes/graph.ts
+++ b/packages/browseros-agent/apps/server/src/api/routes/graph.ts
@@ -1,274 +0,0 @@
-/**
- * @license
- * Copyright 2025 BrowserOS
- * SPDX-License-Identifier: AGPL-3.0-or-later
- */
-
-import { PATHS } from '@browseros/shared/constants/paths'
-import { zValidator } from '@hono/zod-validator'
-import type { Context } from 'hono'
-import { Hono } from 'hono'
-import { stream } from 'hono/streaming'
-import { logger } from '../../lib/logger'
-import { GraphService } from '../services/graph-service'
-import {
-  CreateGraphRequestSchema,
-  RunGraphRequestSchema,
-  UpdateGraphRequestSchema,
-} from '../types'
-import {
-  formatUIMessageStreamDone,
-  formatUIMessageStreamEvent,
-} from '../utils/ui-message-stream'
-import { SessionIdParamSchema } from '../utils/validation'
-
-interface SSEStreamOptions {
-  vercelAIStream?: boolean
-  logLabel: string
-}
-
-type SSEStreamCallback = (
-  stream: { write: (data: string) => Promise<unknown> },
-  signal: AbortSignal,
-) => Promise<void>
-
-function createSSEStream(
-  c: Context,
-  options: SSEStreamOptions,
-  callback: SSEStreamCallback,
-) {
-  c.header('Content-Type', 'text/event-stream')
-  c.header('Cache-Control', 'no-cache')
-  c.header('Connection', 'keep-alive')
-
-  if (options.vercelAIStream) {
-    c.header('x-vercel-ai-ui-message-stream', 'v1')
-  }
-
-  const abortController = new AbortController()
-
-  if (c.req.raw.signal) {
-    c.req.raw.signal.addEventListener('abort', () => abortController.abort(), {
-      once: true,
-    })
-  }
-
-  return stream(c, async (honoStream) => {
-    honoStream.onAbort(() => {
-      abortController.abort()
-      logger.debug(`${options.logLabel} stream aborted`)
-    })
-
-    await callback(honoStream, abortController.signal)
-  })
-}
-
-interface GraphRouteDeps {
-  port: number
-  tempDir?: string
-  codegenServiceUrl?: string
-}
-
-export function createGraphRoutes(deps: GraphRouteDeps) {
-  const { port, codegenServiceUrl } = deps
-
-  const serverUrl = `http://127.0.0.1:${port}`
-  const tempDir = deps.tempDir || PATHS.DEFAULT_EXECUTION_DIR
-
-  const graphService = codegenServiceUrl
-    ? new GraphService({ codegenServiceUrl, serverUrl, tempDir })
-    : null
-
-  // Chain route definitions for proper Hono RPC type inference
-  return new Hono()
-    .post('/', zValidator('json', CreateGraphRequestSchema), async (c) => {
-      if (!graphService) {
-        return c.json({ error: 'CODEGEN_SERVICE_URL not configured' }, 503)
-      }
-      const request = c.req.valid('json')
-      logger.info('Graph create request received', { query: request.query })
-
-      return createSSEStream(
-        c,
-        { logLabel: 'Graph create', vercelAIStream: true },
-        async (s, signal) => {
-          try {
-            await graphService.createGraph(
-              request.query,
-              async (event) => {
-                await s.write(formatUIMessageStreamEvent(event))
-              },
-              signal,
-            )
-          } catch (error) {
-            const errorMessage =
-              error instanceof Error ? error.message : String(error)
-            await s.write(
-              formatUIMessageStreamEvent({
-                type: 'error',
-                errorText: errorMessage,
-              }),
-            )
-            await s.write(
-              formatUIMessageStreamEvent({
-                type: 'finish',
-                finishReason: 'error',
-              }),
-            )
-          } finally {
-            await s.write(formatUIMessageStreamDone())
-          }
-        },
-      )
-    })
-    .post(
-      '/:id',
-      zValidator('param', SessionIdParamSchema),
-      zValidator('json', UpdateGraphRequestSchema),
-      async (c) => {
-        if (!graphService) {
-          return c.json({ error: 'CODEGEN_SERVICE_URL not configured' }, 503)
-        }
-        const { id: sessionId } = c.req.valid('param')
-        const request = c.req.valid('json')
-        logger.info('Graph update request received', {
-          sessionId,
-          query: request.query,
-        })
-
-        return createSSEStream(
-          c,
-          { logLabel: 'Graph update', vercelAIStream: true },
-          async (s, signal) => {
-            try {
-              await graphService.updateGraph(
-                sessionId,
-                request.query,
-                async (event) => {
-                  await s.write(formatUIMessageStreamEvent(event))
-                },
-                signal,
-              )
-            } catch (error) {
-              const errorMessage =
-                error instanceof Error ? error.message : String(error)
-              await s.write(
-                formatUIMessageStreamEvent({
-                  type: 'error',
-                  errorText: errorMessage,
-                }),
-              )
-              await s.write(
-                formatUIMessageStreamEvent({
-                  type: 'finish',
-                  finishReason: 'error',
-                }),
-              )
-            } finally {
-              await s.write(formatUIMessageStreamDone())
-            }
-          },
-        )
-      },
-    )
-    .get('/:id', zValidator('param', SessionIdParamSchema), async (c) => {
-      if (!graphService) {
-        return c.json({ error: 'CODEGEN_SERVICE_URL not configured' }, 503)
-      }
-      const { id: sessionId } = c.req.valid('param')
-
-      logger.debug('Graph get request received', { sessionId })
-
-      const session = await graphService.getGraph(sessionId)
-
-      if (!session) {
-        return c.json({ error: 'Graph not found' }, 404)
-      }
-
-      return c.json(session)
-    })
-    .post(
-      '/:id/run',
-      zValidator('param', SessionIdParamSchema),
-      zValidator('json', RunGraphRequestSchema),
-      async (c) => {
-        if (!graphService) {
-          return c.json({ error: 'CODEGEN_SERVICE_URL not configured' }, 503)
-        }
-        const { id: sessionId } = c.req.valid('param')
-        const request = c.req.valid('json')
-        logger.info('Graph run request received', {
-          sessionId,
-          provider: request.provider,
-          model: request.model,
-        })
-
-        return createSSEStream(
-          c,
-          { logLabel: 'Graph run', vercelAIStream: true },
-          async (s, signal) => {
-            try {
-              // Emit start event at route level
-              await s.write(
-                formatUIMessageStreamEvent({
-                  type: 'start',
-                  messageId: sessionId,
-                }),
-              )
-
-              await graphService.runGraph(
-                sessionId,
-                request,
-                async (event) => {
-                  // Agent SDK handles proper event formatting
-                  // Skip start/finish (managed at route level), forward everything else
-                  if (event.type === 'start' || event.type === 'finish') {
-                    return
-                  }
-                  await s.write(formatUIMessageStreamEvent(event))
-                },
-                signal,
-              )
-
-              // Emit finish at route level
-              await s.write(
-                formatUIMessageStreamEvent({
-                  type: 'finish',
-                  finishReason: 'stop',
-                }),
-              )
-            } catch (error) {
-              const errorMessage =
-                error instanceof Error ? error.message : String(error)
-              await s.write(
-                formatUIMessageStreamEvent({
-                  type: 'error',
-                  errorText: errorMessage,
-                }),
-              )
-              await s.write(
-                formatUIMessageStreamEvent({
-                  type: 'finish',
-                  finishReason: 'error',
-                }),
-              )
-            } finally {
-              await s.write(formatUIMessageStreamDone())
-            }
-          },
-        )
-      },
-    )
-    .delete('/:id', zValidator('param', SessionIdParamSchema), async (c) => {
-      if (!graphService) {
-        return c.json({ error: 'CODEGEN_SERVICE_URL not configured' }, 503)
-      }
-      const { id: sessionId } = c.req.valid('param')
-
-      logger.debug('Graph delete request received', { sessionId })
-
-      await graphService.deleteGraph(sessionId)
-
-      return c.json({ success: true, message: `Graph ${sessionId} deleted` })
-    })
-}
--- a/packages/browseros-agent/apps/server/src/api/routes/klavis.ts
+++ b/packages/browseros-agent/apps/server/src/api/routes/klavis.ts
@@ -10,6 +10,7 @@ import { z } from 'zod'
 import { KlavisClient } from '../../lib/clients/klavis/klavis-client'
 import { OAUTH_MCP_SERVERS } from '../../lib/clients/klavis/oauth-mcp-servers'
 import { logger } from '../../lib/logger'
+import { klavisStrataCache } from '../services/klavis/strata-cache'

 const ServerNameSchema = z.object({
  serverName: z.string().min(1),
@@ -125,6 +126,7 @@ export function createKlavisRoutes(deps: KlavisRouteDeps) {
      logger.info('Adding server to strata', { serverName })

      const result = await klavisClient.createStrata(browserosId, [serverName])
+      klavisStrataCache.invalidate(browserosId)

      return c.json({
        success: true,
@@ -184,7 +186,17 @@ export function createKlavisRoutes(deps: KlavisRouteDeps) {

        logger.info('Removing server from strata', { serverName })

-        await klavisClient.removeServer(browserosId, serverName)
+        // The chat hot path keys its cache by the user's full enabled set,
+        // so a single-server lookup here would always miss and immediately
+        // be cleared by invalidate() below — call createStrata directly
+        // to recover the strataId, mirroring the original removeServer flow.
+        const strata = await klavisClient.createStrata(browserosId, [
+          serverName,
+        ])
+        await klavisClient.deleteServersFromStrata(strata.strataId, [
+          serverName,
+        ])
+        klavisStrataCache.invalidate(browserosId)

        return c.json({
          success: true,
--- a/packages/browseros-agent/apps/server/src/api/routes/mcp.ts
+++ b/packages/browseros-agent/apps/server/src/api/routes/mcp.ts
@@ -11,8 +11,8 @@ import { logger } from '../../lib/logger'
 import { metrics } from '../../lib/metrics'
 import { Sentry } from '../../lib/sentry'
 import type { ToolRegistry } from '../../tools/tool-registry'
+import type { KlavisProxyHandle } from '../services/klavis/strata-proxy'
 import { createMcpServer } from '../services/mcp/mcp-server'
-import type { KlavisProxyHandle } from '../services/mcp/register-klavis-mcp'
 import type { Env } from '../types'

 interface McpRouteDeps {
--- a/packages/browseros-agent/apps/server/src/api/server.ts
+++ b/packages/browseros-agent/apps/server/src/api/server.ts
@@ -22,7 +22,6 @@ import { logger } from '../lib/logger'
 import { Sentry } from '../lib/sentry'
 import { createChatRoutes } from './routes/chat'
 import { createCreditsRoutes } from './routes/credits'
-import { createGraphRoutes } from './routes/graph'
 import { createHealthRoute } from './routes/health'
 import { createKlavisRoutes } from './routes/klavis'
 import { createMcpRoutes } from './routes/mcp'
@@ -38,7 +37,7 @@ import { createStatusRoute } from './routes/status'
 import {
  connectKlavisProxy,
  type KlavisProxyHandle,
-} from './services/mcp/register-klavis-mcp'
+} from './services/klavis/strata-proxy'
 import type { Env, HttpServerConfig } from './types'
 import { defaultCorsConfig } from './utils/cors'

@@ -171,14 +170,6 @@ export async function createHttpServer(config: HttpServerConfig) {
        browserosId,
      }),
    )
-    .route(
-      '/graph',
-      createGraphRoutes({
-        port,
-        tempDir: executionDir,
-        codegenServiceUrl: config.codegenServiceUrl,
-      }),
-    )

  // Error handler
  app.onError((err, c) => {
--- a/packages/browseros-agent/apps/server/src/api/services/chat-service.ts
+++ b/packages/browseros-agent/apps/server/src/api/services/chat-service.ts
@@ -146,34 +146,51 @@ export class ChatService {

    if (!session) {
      isNewSession = true
-      let hiddenWindowId: number | undefined
+      let hiddenPageId: number | undefined
      let browserContext = await this.resolvePageIds(request.browserContext)
      if (request.isScheduledTask) {
        try {
-          const win = await this.deps.browser.createWindow({ hidden: true })
-          hiddenWindowId = win.windowId
-          const pageId = await this.deps.browser.newPage('about:blank', {
-            windowId: hiddenWindowId,
+          hiddenPageId = await this.deps.browser.newPage('about:blank', {
+            hidden: true,
+            background: true,
          })
+          let hiddenWindowId: number | undefined
+          try {
+            const hiddenPage = (await this.deps.browser.listPages()).find(
+              (page) => page.pageId === hiddenPageId,
+            )
+            hiddenWindowId = hiddenPage?.windowId
+          } catch (error) {
+            logger.warn('Failed to look up hidden page metadata', {
+              conversationId: request.conversationId,
+              pageId: hiddenPageId,
+              error: error instanceof Error ? error.message : String(error),
+            })
+          }
          browserContext = {
            ...browserContext,
            windowId: hiddenWindowId,
+            selectedTabs: undefined,
+            tabs: undefined,
            activeTab: {
-              id: pageId,
-              pageId,
+              id: hiddenPageId,
+              pageId: hiddenPageId,
              url: 'about:blank',
              title: 'Scheduled Task',
            },
          }
-          logger.info('Created hidden window for scheduled task', {
+          logger.info('Created hidden page for scheduled task', {
            conversationId: request.conversationId,
+            pageId: hiddenPageId,
            windowId: hiddenWindowId,
-            pageId,
          })
        } catch (error) {
-          logger.warn('Failed to create hidden window, using default', {
-            error: error instanceof Error ? error.message : String(error),
-          })
+          logger.warn(
+            'Failed to create hidden page, using default browser context',
+            {
+              error: error instanceof Error ? error.message : String(error),
+            },
+          )
        }
      }

@@ -188,7 +205,7 @@ export class ChatService {
      })
      session = {
        agent,
-        hiddenWindowId,
+        hiddenPageId,
        browserContext,
        mcpServerKey,
        workingDir: request.userWorkingDir,
@@ -245,10 +262,10 @@ export class ChatService {
          totalMessages: messages.length,
        })

-        if (session?.hiddenWindowId) {
-          const windowId = session.hiddenWindowId
-          session.hiddenWindowId = undefined
-          this.closeHiddenWindow(windowId, request.conversationId)
+        if (session?.hiddenPageId) {
+          const pageId = session.hiddenPageId
+          session.hiddenPageId = undefined
+          this.closeHiddenPage(pageId, request.conversationId)
        }
      },
    })
@@ -258,10 +275,10 @@ export class ChatService {
    conversationId: string,
  ): Promise<{ deleted: boolean; sessionCount: number }> {
    const session = this.deps.sessionStore.get(conversationId)
-    if (session?.hiddenWindowId) {
-      const windowId = session.hiddenWindowId
-      session.hiddenWindowId = undefined
-      this.closeHiddenWindow(windowId, conversationId)
+    if (session?.hiddenPageId) {
+      const pageId = session.hiddenPageId
+      session.hiddenPageId = undefined
+      this.closeHiddenPage(pageId, conversationId)
    }
    const deleted = await this.deps.sessionStore.delete(conversationId)
    return { deleted, sessionCount: this.deps.sessionStore.count() }
@@ -309,10 +326,10 @@ export class ChatService {
    }
  }

-  private closeHiddenWindow(windowId: number, conversationId: string): void {
-    this.deps.browser.closeWindow(windowId).catch((error) => {
-      logger.warn('Failed to close hidden window', {
-        windowId,
+  private closeHiddenPage(pageId: number, conversationId: string): void {
+    this.deps.browser.closePage(pageId).catch((error) => {
+      logger.warn('Failed to close hidden page', {
+        pageId,
        conversationId,
        error: error instanceof Error ? error.message : String(error),
      })
@@ -329,7 +346,10 @@ export class ChatService {
    await session.agent.dispose()
    this.deps.sessionStore.remove(request.conversationId)

-    const browserContext = await this.resolvePageIds(request.browserContext)
+    const browserContext = agentConfig.isScheduledTask
+      ? (session.browserContext ??
+        (await this.resolvePageIds(request.browserContext)))
+      : await this.resolvePageIds(request.browserContext)
    const agent = await AiSdkAgent.create({
      resolvedConfig: agentConfig,
      browser: this.deps.browser,
@@ -341,6 +361,7 @@ export class ChatService {
    })
    const newSession: AgentSession = {
      agent,
+      hiddenPageId: session.hiddenPageId,
      browserContext,
      mcpServerKey,
      workingDir: request.userWorkingDir,
--- a/packages/browseros-agent/apps/server/src/api/services/graph-service.ts
+++ b/packages/browseros-agent/apps/server/src/api/services/graph-service.ts
@@ -1,328 +0,0 @@
-/**
- * @license
- * Copyright 2025 BrowserOS
- * SPDX-License-Identifier: AGPL-3.0-or-later
- */
-
-import { UIMessageStreamEventSchema } from '@browseros/shared/schemas/ui-stream'
-import type { LLMConfig, UIMessageStreamEvent } from '@browseros-ai/agent-sdk'
-import { createParser, type EventSourceMessage } from 'eventsource-parser'
-import { cleanupExecution, executeGraph } from '../../graph/executor'
-import { logger } from '../../lib/logger'
-import {
-  CodegenFinishMetadataSchema,
-  CodegenGetResponseSchema,
-  type GraphSession,
-  type RunGraphRequest,
-  type WorkflowGraph,
-} from '../types'
-
-export interface GraphServiceDeps {
-  codegenServiceUrl: string
-  serverUrl: string
-  tempDir: string
-}
-
-interface SessionState {
-  codeId: string | null
-  code: string | null
-  graph: WorkflowGraph | null
-}
-
-export class GraphService {
-  constructor(private deps: GraphServiceDeps) {}
-
-  /**
-   * Create a new graph by proxying to codegen service.
-   * Streams UIMessageStreamEvent events back to caller.
-   */
-  async createGraph(
-    query: string,
-    onEvent: (event: UIMessageStreamEvent) => Promise<void>,
-    signal?: AbortSignal,
-  ): Promise<GraphSession | null> {
-    const url = `${this.deps.codegenServiceUrl}/api/code`
-
-    logger.debug('Creating graph via codegen service', { url, query })
-
-    return this.proxyCodegenRequest(url, 'POST', { query }, onEvent, signal)
-  }
-
-  /**
-   * Update an existing graph by proxying to codegen service.
-   */
-  async updateGraph(
-    sessionId: string,
-    query: string,
-    onEvent: (event: UIMessageStreamEvent) => Promise<void>,
-    signal?: AbortSignal,
-  ): Promise<GraphSession | null> {
-    const url = `${this.deps.codegenServiceUrl}/api/code/${sessionId}`
-
-    logger.debug('Updating graph via codegen service', {
-      url,
-      sessionId,
-      query,
-    })
-
-    return this.proxyCodegenRequest(url, 'PUT', { query }, onEvent, signal)
-  }
-
-  /**
-   * Get graph code and visualization from codegen service.
-   */
-  async getGraph(sessionId: string): Promise<GraphSession | null> {
-    const url = `${this.deps.codegenServiceUrl}/api/code/${sessionId}`
-
-    logger.debug('Fetching graph from codegen service', { url, sessionId })
-
-    try {
-      const response = await fetch(url)
-
-      if (!response.ok) {
-        if (response.status === 404) {
-          return null
-        }
-        throw new Error(`Codegen service error: ${response.status}`)
-      }
-
-      const json = await response.json()
-      const result = CodegenGetResponseSchema.safeParse(json)
-
-      if (!result.success) {
-        logger.error('Invalid codegen response', {
-          issues: result.error.issues,
-        })
-        throw new Error('Invalid response from codegen service')
-      }
-
-      return {
-        id: sessionId,
-        code: result.data.code,
-        graph: result.data.graph,
-        createdAt: new Date(result.data.createdAt || Date.now()),
-      }
-    } catch (error) {
-      const errorMessage =
-        error instanceof Error ? error.message : String(error)
-      logger.error('Failed to fetch graph', { sessionId, error: errorMessage })
-      throw error
-    }
-  }
-
-  /**
-   * Execute a graph by fetching code from codegen and running it.
-   */
-  async runGraph(
-    sessionId: string,
-    request: RunGraphRequest,
-    onProgress: (event: UIMessageStreamEvent) => Promise<void>,
-    signal?: AbortSignal,
-  ): Promise<void> {
-    // Fetch code from codegen service
-    const graph = await this.getGraph(sessionId)
-
-    if (!graph) {
-      throw new Error(`Graph not found: ${sessionId}`)
-    }
-
-    logger.debug('Executing graph', {
-      sessionId,
-      codeLength: graph.code.length,
-    })
-
-    // Build LLM config from request
-    const llmConfig: LLMConfig | undefined = request.provider
-      ? {
-          provider: request.provider,
-          model: request.model,
-          apiKey: request.apiKey,
-          baseUrl: request.baseUrl,
-          resourceName: request.resourceName,
-          region: request.region,
-          accessKeyId: request.accessKeyId,
-          secretAccessKey: request.secretAccessKey,
-          sessionToken: request.sessionToken,
-        }
-      : undefined
-
-    const result = await executeGraph(
-      graph.code,
-      sessionId,
-      this.deps.tempDir,
-      {
-        serverUrl: this.deps.serverUrl,
-        llmConfig,
-        browserContext: request.browserContext,
-        onProgress: (event) => {
-          onProgress(event).catch((err) => {
-            logger.warn('Failed to send progress event', { error: String(err) })
-          })
-        },
-        signal,
-      },
-    )
-
-    if (!result.success) {
-      throw new Error(result.error || 'Graph execution failed')
-    }
-  }
-
-  /**
-   * Delete execution files for a graph.
-   */
-  async deleteGraph(sessionId: string): Promise<void> {
-    await cleanupExecution(sessionId, this.deps.tempDir)
-  }
-
-  /**
-   * Proxy a request to codegen service and stream UIMessageStreamEvent events.
-   */
-  private async proxyCodegenRequest(
-    url: string,
-    method: 'POST' | 'PUT',
-    body: { query: string },
-    onEvent: (event: UIMessageStreamEvent) => Promise<void>,
-    signal?: AbortSignal,
-  ): Promise<GraphSession | null> {
-    try {
-      const response = await this.fetchCodegenService(url, method, body, signal)
-      return await this.parseUIMessageStream(response, onEvent)
-    } catch (error) {
-      const errorMessage =
-        error instanceof Error ? error.message : String(error)
-      logger.error('Codegen proxy request failed', { url, error: errorMessage })
-      throw error
-    }
-  }
-
-  private async fetchCodegenService(
-    url: string,
-    method: 'POST' | 'PUT',
-    body: { query: string },
-    signal?: AbortSignal,
-  ): Promise<Response> {
-    const response = await fetch(url, {
-      method,
-      headers: {
-        'Content-Type': 'application/json',
-        Accept: 'text/event-stream',
-      },
-      body: JSON.stringify(body),
-      signal,
-    })
-
-    if (!response.ok) {
-      throw new Error(`Codegen service error: ${response.status}`)
-    }
-
-    if (!response.body) {
-      throw new Error('No response body from codegen service')
-    }
-
-    return response
-  }
-
-  /**
-   * Parse UIMessageStreamEvent SSE stream from codegen service.
-   * Extracts codeId, code, graph from the finish event's messageMetadata.
-   */
-  private async parseUIMessageStream(
-    response: Response,
-    onEvent: (event: UIMessageStreamEvent) => Promise<void>,
-  ): Promise<GraphSession | null> {
-    if (!response.body) {
-      throw new Error('No response body')
-    }
-
-    const reader = response.body.getReader()
-    const decoder = new TextDecoder()
-    const state: SessionState = { codeId: null, code: null, graph: null }
-    const pendingEvents: UIMessageStreamEvent[] = []
-
-    const parser = createParser({
-      onEvent: (msg: EventSourceMessage) => {
-        if (msg.data === '[DONE]') return
-
-        try {
-          const json = JSON.parse(msg.data)
-          const result = UIMessageStreamEventSchema.safeParse(json)
-
-          if (!result.success) {
-            logger.warn('Invalid UIMessageStream event', {
-              data: msg.data,
-              issues: result.error.issues,
-            })
-            return
-          }
-
-          pendingEvents.push(result.data as UIMessageStreamEvent)
-        } catch {
-          logger.warn('Failed to parse UIMessageStream event', {
-            data: msg.data,
-          })
-        }
-      },
-    })
-
-    try {
-      while (true) {
-        const { done, value } = await reader.read()
-        if (done) break
-
-        const text = decoder.decode(value, { stream: true })
-        parser.feed(text)
-
-        // Process any events that were parsed
-        let event = pendingEvents.shift()
-        while (event) {
-          this.extractSessionData(event, state)
-          await onEvent(event)
-          event = pendingEvents.shift()
-        }
-      }
-
-      // Process any remaining events
-      let remaining = pendingEvents.shift()
-      while (remaining) {
-        this.extractSessionData(remaining, state)
-        await onEvent(remaining)
-        remaining = pendingEvents.shift()
-      }
-
-      if (state.codeId && state.code) {
-        return {
-          id: state.codeId,
-          code: state.code,
-          graph: state.graph,
-          createdAt: new Date(),
-        }
-      }
-
-      return null
-    } finally {
-      reader.releaseLock()
-    }
-  }
-
-  /**
-   * Extract session data (codeId, code, graph) from UIMessageStreamEvent.
-   */
-  private extractSessionData(
-    event: UIMessageStreamEvent,
-    state: SessionState,
-  ): void {
-    if (event.type === 'start' && event.messageId) {
-      state.codeId = event.messageId
-    } else if (event.type === 'finish' && event.messageMetadata) {
-      const result = CodegenFinishMetadataSchema.safeParse(
-        event.messageMetadata,
-      )
-      if (result.success) {
-        if (result.data.codeId) state.codeId = result.data.codeId
-        if (result.data.code) state.code = result.data.code
-        if (result.data.graph !== undefined) state.graph = result.data.graph
-      }
-    }
-  }
-}
--- a/packages/browseros-agent/apps/server/src/api/services/klavis/strata-cache.ts
+++ b/packages/browseros-agent/apps/server/src/api/services/klavis/strata-cache.ts
@@ -0,0 +1,179 @@
+/**
+ * @license
+ * Copyright 2025 BrowserOS
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ *
+ * In-process cache for Klavis Strata `createStrata` responses.
+ *
+ * Conversation creation in `/chat` was blocking on a Worker-proxied
+ * `klavisClient.createStrata` round-trip every time the user had any
+ * managed Klavis app connected. This cache stores the (immutable) JSON
+ * metadata returned by `createStrata` so that subsequent chats with the
+ * same `(browserosId, enabled-server-set)` skip the round-trip entirely.
+ *
+ * It does NOT cache live MCP client connections — only URL/id metadata.
+ * Per-session MCP clients continue to be opened and closed by
+ * `AiSdkAgent.create` / `dispose` exactly as before, which makes the
+ * cache safe across concurrent chats by construction.
+ */
+
+import type {
+  KlavisClient,
+  StrataCreateResponse,
+} from '../../../lib/clients/klavis/klavis-client'
+import { logger } from '../../../lib/logger'
+
+/** Default entry lifetime: one hour. */
+const DEFAULT_TTL_MS = 60 * 60 * 1000
+
+/** Cached `createStrata` metadata plus validation bookkeeping. */
+interface CacheEntry {
+  strataServerUrl: string
+  strataId: string
+  addedServers: string[]
+  /** Normalized server list this entry was fetched for (collision guard). */
+  serverKey: string
+  /** Epoch milliseconds after which the entry is treated as stale. */
+  expiresAt: number
+}
+
+/** Canonical form of a server list: deduplicated, sorted, comma-joined. */
+function normalizeServers(servers: readonly string[]): string {
+  return [...new Set(servers)].sort().join(',')
+}
+
+/** Builds the map key `<browserosId>|<16-hex-char hash of normalized>`. */
+function keyOf(browserosId: string, normalized: string): string {
+  // Bun.hash defaults to 64-bit Wyhash (xxHash64 would be Bun.hash.xxHash64).
+  // Padding keeps the hex digest fixed-width. A 64-bit collision is
+  // improbable at our scale, and reads also verify serverKey, so a collision
+  // costs a refetch rather than returning another server set's entry.
+  const hash = Bun.hash(normalized).toString(16).padStart(16, '0')
+  return `${browserosId}|${hash}`
+}
+
+export class KlavisStrataCache {
+  // key → in-flight or settled fetch. Storing the promise (not the value)
+  // lets concurrent callers for the same key share one round-trip.
+  private entries = new Map<string, Promise<CacheEntry>>()
+
+  /** @param ttlMs Entry lifetime, measured from when its fetch resolves. */
+  constructor(private ttlMs: number = DEFAULT_TTL_MS) {}
+
+  /**
+   * Returns the strata metadata for `(browserosId, servers)`, calling
+   * `client.createStrata` only when no fresh entry exists.
+   *
+   * @param client Klavis client used on a miss.
+   * @param browserosId Identifier the strata is created for; also the key prefix.
+   * @param servers Server names; order and duplicates do not affect the key.
+   * @returns The cached or freshly fetched `createStrata` metadata.
+   * @throws Whatever `client.createStrata` rejects with when this call
+   *   performs the fetch; the failed entry is not kept.
+   */
+  async getOrFetch(
+    client: KlavisClient,
+    browserosId: string,
+    servers: readonly string[],
+  ): Promise<StrataCreateResponse> {
+    const normalized = normalizeServers(servers)
+    const key = keyOf(browserosId, normalized)
+
+    // Re-read the slot after every await: while we were waiting on a stale
+    // or rejected entry, a racing caller may have installed a fresh
+    // in-flight fetch. Looping lets us join that fetch instead of issuing a
+    // duplicate createStrata call and overwriting the newer entry.
+    let existing = this.entries.get(key)
+    while (existing) {
+      const resolved = await existing.catch(() => null)
+      if (
+        resolved &&
+        resolved.serverKey === normalized &&
+        Date.now() < resolved.expiresAt
+      ) {
+        logger.debug('Klavis strata cache hit', { key })
+        return this.toResponse(resolved)
+      }
+      // Stale/collision/rejected — evict, but only if the slot still holds
+      // the entry we just inspected.
+      if (this.entries.get(key) === existing) {
+        this.entries.delete(key)
+      }
+      existing = this.entries.get(key)
+    }
+
+    logger.debug('Klavis strata cache miss', {
+      key,
+      serverCount: servers.length,
+    })
+    const inflight = this.fetch(client, browserosId, servers, normalized)
+    this.entries.set(key, inflight)
+
+    try {
+      return this.toResponse(await inflight)
+    } catch (err) {
+      // Identity-check: only drop OUR entry. A racing invalidate() may have
+      // already removed it, or a racing miss may have inserted a new one
+      // that we must not clobber.
+      if (this.entries.get(key) === inflight) {
+        this.entries.delete(key)
+      }
+      throw err
+    }
+  }
+
+  /** Drops every entry whose key starts with `browserosId|`. */
+  invalidate(browserosId: string): void {
+    const prefix = `${browserosId}|`
+    let dropped = 0
+    for (const key of this.entries.keys()) {
+      if (key.startsWith(prefix)) {
+        this.entries.delete(key)
+        dropped++
+      }
+    }
+    if (dropped > 0) {
+      logger.debug('Klavis strata cache invalidated', {
+        browserosId: browserosId.slice(0, 12),
+        dropped,
+      })
+    }
+  }
+
+  /** Drops all entries. */
+  clear(): void {
+    this.entries.clear()
+  }
+
+  /**
+   * Performs the `createStrata` round-trip and stamps the result with the
+   * normalized key and an expiry computed when the response arrives.
+   */
+  private async fetch(
+    client: KlavisClient,
+    browserosId: string,
+    servers: readonly string[],
+    normalized: string,
+  ): Promise<CacheEntry> {
+    const result = await client.createStrata(browserosId, [...servers])
+    return {
+      strataServerUrl: result.strataServerUrl,
+      strataId: result.strataId,
+      addedServers: result.addedServers,
+      serverKey: normalized,
+      expiresAt: Date.now() + this.ttlMs,
+    }
+  }
+
+  /** Copies the public fields of an entry, omitting cache bookkeeping. */
+  private toResponse(entry: CacheEntry): StrataCreateResponse {
+    return {
+      strataServerUrl: entry.strataServerUrl,
+      strataId: entry.strataId,
+      addedServers: entry.addedServers,
+    }
+  }
+}
+
+/** Shared module-level instance. */
+export const klavisStrataCache = new KlavisStrataCache()
--- a/packages/browseros-agent/apps/server/src/api/services/mcp/register-klavis-mcp.ts
+++ b/packages/browseros-agent/apps/server/src/api/services/mcp/register-klavis-mcp.ts
@@ -14,6 +14,7 @@ import type { KlavisClient } from '../../../lib/clients/klavis/klavis-client'
 import { OAUTH_MCP_SERVERS } from '../../../lib/clients/klavis/oauth-mcp-servers'
 import { logger } from '../../../lib/logger'
 import { metrics } from '../../../lib/metrics'
+import { klavisStrataCache } from './strata-cache'

 function withTimeout<T>(promise: Promise<T>, label: string): Promise<T> {
  let timerId: ReturnType<typeof setTimeout> | undefined
@@ -49,7 +50,8 @@ export async function connectKlavisProxy(
  // even unauthenticated ones (Klavis handles auth prompts on call)
  const allServers = OAUTH_MCP_SERVERS.map((s) => s.name)

-  const strata = await deps.klavisClient.createStrata(
+  const strata = await klavisStrataCache.getOrFetch(
+    deps.klavisClient,
    deps.browserosId,
    allServers,
  )
--- a/packages/browseros-agent/apps/server/src/api/services/mcp/mcp-server.ts
+++ b/packages/browseros-agent/apps/server/src/api/services/mcp/mcp-server.ts
@@ -8,11 +8,11 @@ import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'
 import { SetLevelRequestSchema } from '@modelcontextprotocol/sdk/types.js'
 import type { Browser } from '../../../browser/browser'
 import type { ToolRegistry } from '../../../tools/tool-registry'
-import { MCP_INSTRUCTIONS } from './mcp-prompt'
 import {
  type KlavisProxyHandle,
  registerKlavisTools,
-} from './register-klavis-mcp'
+} from '../klavis/strata-proxy'
+import { MCP_INSTRUCTIONS } from './mcp-prompt'
 import { registerTools } from './register-mcp'

 export interface McpServiceDeps {
--- a/packages/browseros-agent/apps/server/src/api/types.ts
+++ b/packages/browseros-agent/apps/server/src/api/types.ts
@@ -102,87 +102,3 @@ export interface HttpServerConfig {

  onShutdown?: () => void
 }
-
-// Graph request schemas
-export const CreateGraphRequestSchema = z.object({
-  query: z.string().min(1, 'Query cannot be empty'),
-})
-
-export type CreateGraphRequest = z.infer<typeof CreateGraphRequestSchema>
-
-export const UpdateGraphRequestSchema = z.object({
-  query: z.string().min(1, 'Query cannot be empty'),
-})
-
-export type UpdateGraphRequest = z.infer<typeof UpdateGraphRequestSchema>
-
-// Run graph request - similar to ChatRequest, needs provider config for Agent SDK
-export const RunGraphRequestSchema = AgentLLMConfigSchema.extend({
-  browserContext: BrowserContextSchema.optional(),
-})
-
-export type RunGraphRequest = z.infer<typeof RunGraphRequestSchema>
-
-// Workflow graph schemas (matching codegen-service)
-export const WorkflowNodeTypeSchema = z.enum([
-  'start',
-  'end',
-  'nav',
-  'act',
-  'extract',
-  'verify',
-  'decision',
-  'loop',
-  'fork',
-  'join',
-])
-
-export type WorkflowNodeType = z.infer<typeof WorkflowNodeTypeSchema>
-
-export const WorkflowNodeSchema = z.object({
-  id: z.string(),
-  type: WorkflowNodeTypeSchema,
-  data: z.object({ label: z.string() }),
-})
-
-export type WorkflowNode = z.infer<typeof WorkflowNodeSchema>
-
-export const WorkflowEdgeSchema = z.object({
-  id: z.string(),
-  source: z.string(),
-  target: z.string(),
-})
-
-export type WorkflowEdge = z.infer<typeof WorkflowEdgeSchema>
-
-export const WorkflowGraphSchema = z.object({
-  nodes: z.array(WorkflowNodeSchema),
-  edges: z.array(WorkflowEdgeSchema),
-})
-
-export type WorkflowGraph = z.infer<typeof WorkflowGraphSchema>
-
-export interface GraphSession {
-  id: string
-  code: string
-  graph: WorkflowGraph | null
-  createdAt: Date
-}
-
-// Codegen service response schema for GET /api/code/:id
-export const CodegenGetResponseSchema = z.object({
-  code: z.string(),
-  graph: WorkflowGraphSchema.nullable(),
-  createdAt: z.string().optional(),
-})
-
-export type CodegenGetResponse = z.infer<typeof CodegenGetResponseSchema>
-
-// Metadata schema for finish events from codegen service
-export const CodegenFinishMetadataSchema = z.object({
-  codeId: z.string().optional(),
-  code: z.string().optional(),
-  graph: WorkflowGraphSchema.nullable().optional(),
-})
-
-export type CodegenFinishMetadata = z.infer<typeof CodegenFinishMetadataSchema>
--- a/packages/browseros-agent/apps/server/src/graph/executor.ts
+++ b/packages/browseros-agent/apps/server/src/graph/executor.ts
@@ -1,145 +0,0 @@
-/**
- * @license
- * Copyright 2025 BrowserOS
- * SPDX-License-Identifier: AGPL-3.0-or-later
- */
-
-import { mkdir, rm } from 'node:fs/promises'
-import path from 'node:path'
-import type { BrowserContext } from '@browseros/shared/schemas/browser-context'
-import type { LLMConfig, UIMessageStreamEvent } from '@browseros-ai/agent-sdk'
-import { Agent } from '@browseros-ai/agent-sdk'
-import { z } from 'zod'
-import { logger } from '../lib/logger'
-
-//TODO: nikhil - Fix this with new bun package logic
-// Expose zod globally for generated graph code. The codegen service generates code
-// that uses `z` for schema validation, but transformCodeForExecution strips all imports
-// since dependencies can't be resolved in dynamically imported files (especially in
-// compiled binaries where modules are bundled). By exposing `z` as a global, the
-// generated code can reference it without an import statement.
-;(globalThis as unknown as Record<string, unknown>).z = z
-
-export interface ExecutorOptions {
-  serverUrl: string
-  llmConfig?: LLMConfig
-  browserContext?: BrowserContext
-  onProgress: (event: UIMessageStreamEvent) => void
-  signal?: AbortSignal
-}
-
-export interface ExecutorResult {
-  success: boolean
-  result?: unknown
-  error?: string
-}
-
-/**
- * Executes generated graph code using the Agent SDK.
- *
- * @param code - Generated code from codegen service
- * @param sessionId - Unique session ID for this execution
- * @param tempDir - Base temp directory for execution files
- * @param options - Execution options (serverUrl, llmConfig, onProgress, signal)
- */
-export async function executeGraph(
-  code: string,
-  sessionId: string,
-  tempDir: string,
-  options: ExecutorOptions,
-): Promise<ExecutorResult> {
-  const execDir = path.join(tempDir, 'graph', sessionId)
-
-  try {
-    // Check if aborted before starting
-    if (options.signal?.aborted) {
-      return { success: false, error: 'Execution aborted' }
-    }
-
-    // Create execution directory
-    await mkdir(execDir, { recursive: true })
-
-    // Transform code: remove import statements (Agent is passed directly)
-    const transformedCode = transformCodeForExecution(code)
-
-    // Write code to file
-    const codePath = path.join(execDir, 'graph.ts')
-    await Bun.write(codePath, transformedCode)
-
-    logger.debug(`Wrote graph code to ${codePath}`)
-
-    // Create Agent instance with progress callback (auto-disposed on scope exit)
-    await using agent = new Agent({
-      url: options.serverUrl,
-      llm: options.llmConfig,
-      onProgress: options.onProgress,
-      signal: options.signal,
-      browserContext: options.browserContext,
-      stateful: true,
-    })
-
-    // Dynamic import with cache-busting (Bun caches imports by path)
-    const module = await import(`${codePath}?t=${Date.now()}`)
-
-    if (typeof module.run !== 'function') {
-      throw new Error('Generated code must export a "run" function')
-    }
-
-    let abortHandler: (() => void) | undefined
-    try {
-      // Only use Promise.race if we have a signal to listen to
-      const result = options.signal
-        ? await Promise.race([
-            module.run(agent),
-            new Promise<never>((_, reject) => {
-              abortHandler = () => reject(new Error('Execution aborted'))
-              options.signal?.addEventListener('abort', abortHandler, {
-                once: true,
-              })
-            }),
-          ])
-        : await module.run(agent)
-
-      return { success: true, result }
-    } finally {
-      if (abortHandler && options.signal) {
-        options.signal.removeEventListener('abort', abortHandler)
-      }
-    }
-  } catch (error) {
-    const errorMessage = error instanceof Error ? error.message : String(error)
-    logger.error(`Graph execution failed: ${errorMessage}`)
-    return { success: false, error: errorMessage }
-  }
-}
-
-export function transformCodeForExecution(code: string): string {
-  // Remove multi-line imports: import { ... } from 'any-package'
-  let result = code.replace(
-    /^\s*import\s+(?:type\s+)?\{[\s\S]*?\}\s*from\s*['"][^'"\n]*['"].*$/gm,
-    '',
-  )
-
-  // Remove single-line imports: import X from '...', import 'side-effect', etc.
-  result = result.replace(/^\s*import\s+.*['"][^'"\n]*['"].*$/gm, '')
-
-  return result
-}
-
-/**
- * Cleans up execution files for a session.
- */
-export async function cleanupExecution(
-  sessionId: string,
-  tempDir: string,
-): Promise<void> {
-  const execDir = path.join(tempDir, 'graph', sessionId)
-
-  try {
-    await rm(execDir, { recursive: true, force: true })
-    logger.debug(`Cleaned up execution directory: ${execDir}`)
-  } catch (error) {
-    const errorMessage = error instanceof Error ? error.message : String(error)
-    logger.warn(`Failed to cleanup execution directory: ${errorMessage}`)
-  }
-}
--- a/packages/browseros-agent/apps/server/src/lib/clients/klavis/klavis-client.ts
+++ b/packages/browseros-agent/apps/server/src/lib/clients/klavis/klavis-client.ts
@@ -145,16 +145,14 @@ export class KlavisClient {
    })
  }

-  /**
-   * Remove a server from a Strata instance
-   * Flow: createStrata(server) to get strataId → DELETE /strata/{strataId}/servers?servers=X
-   */
-  async removeServer(userId: string, serverName: string): Promise<void> {
-    // createStrata to get strataId (passing same server ensures it exists)
-    const strata = await this.createStrata(userId, [serverName])
+  async deleteServersFromStrata(
+    strataId: string,
+    servers: string[],
+  ): Promise<void> {
+    const query = servers.map(encodeURIComponent).join(',')
    await this.request(
      'DELETE',
-      `/mcp-server/strata/${strata.strataId}/servers?servers=${encodeURIComponent(serverName)}`,
+      `/mcp-server/strata/${strataId}/servers?servers=${query}`,
    )
  }
 }
--- a/packages/browseros-agent/apps/server/src/tools/browseros-info.ts
+++ b/packages/browseros-agent/apps/server/src/tools/browseros-info.ts
@@ -12,8 +12,7 @@ BrowserOS is an AI-native browser built on Chromium that turns plain English int
 ## Modes

 - **Chat Mode** — Ask questions about any webpage: summarize articles, extract data, translate content. Activate with Option+K. Works with any LLM, including local models.
- **Agent Mode** — Describe a task and the agent executes it: clicking, typing, navigating, filling forms, extracting data, multi-step workflows. Best with Claude Opus 4.5 or Kimi K2.5.
- **Graph Mode (Workflows)** — Build visual workflow graphs for repeatable, reliable automations with parallel execution, loops, and conditionals.
+- **Agent Mode** — Describe a task and the agent executes it: clicking, typing, navigating, filling forms, extracting data, and multi-step browser tasks. Best with Claude Opus 4.5 or Kimi K2.5.

 ---

@@ -23,16 +22,12 @@ BrowserOS is an AI-native browser built on Chromium that turns plain English int
 Connect your preferred AI provider or run models locally. Supported providers: Gemini (free tier), Claude/Anthropic, OpenAI, OpenRouter (500+ models). Local options: Ollama, LM Studio. Configure at chrome://browseros/settings.
 Learn more: https://docs.browseros.com/features/bring-your-own-llm

-### Workflows
-Convert complex browser tasks into repeatable visual automations. Describe the task, the agent generates a workflow graph, refine it through conversation, then run it on demand. Ideal for data entry, outreach, price monitoring, bulk operations.
-Learn more: https://docs.browseros.com/features/workflows
-
 ### Scheduled Tasks
 Automate tasks on a schedule — daily, hourly, or every few minutes. Runs in a background window without interrupting your work. Use cases: morning briefings, LinkedIn automation, price monitoring. Requires BrowserOS to be open.
 Learn more: https://docs.browseros.com/features/scheduled-tasks

 ### Filesystem Access
-Grant the agent controlled access to a local folder to read files, write reports, and run shell commands. Sandboxed — cannot access parent directories. Combine web research with local file creation in a single workflow.
+Grant the agent controlled access to a local folder to read files, write reports, and run shell commands. Sandboxed — cannot access parent directories. Combine web research with local file creation in a single task.
 Learn more: https://docs.browseros.com/features/cowork

 ### Connect Apps (MCPs)
@@ -54,7 +49,6 @@ Learn more: https://docs.browseros.com/features/ad-blocking`
 const VALID_TOPICS = [
  'overview',
  'bring-your-own-llm',
-  'workflows',
  'scheduled-tasks',
  'filesystem-access',
  'connect-apps',
@@ -67,9 +61,8 @@ const TOPIC_SECTIONS: Record<string, { start: string; end?: string }> = {
  overview: { start: '# BrowserOS', end: '## Core Features' },
  'bring-your-own-llm': {
    start: '### Bring Your Own LLM',
-    end: '### Workflows',
+    end: '### Scheduled Tasks',
  },
-  workflows: { start: '### Workflows', end: '### Scheduled Tasks' },
  'scheduled-tasks': {
    start: '### Scheduled Tasks',
    end: '### Filesystem Access',
--- a/packages/browseros-agent/apps/server/src/tools/navigation.ts
+++ b/packages/browseros-agent/apps/server/src/tools/navigation.ts
@@ -173,7 +173,7 @@ export const new_page = defineTool({
 export const new_hidden_page = defineTool({
  name: 'new_hidden_page',
  description:
-    'Open a new hidden page (tab) and navigate to a URL. Hidden pages are not visible to the user and useful for background data fetching or automation. Note: take_screenshot is not supported on hidden tabs — use show_page first to make it visible.',
+    'Open a new hidden page (tab) and navigate to a URL. Hidden pages are not visible to the user and useful for background data fetching or automation.',
  input: z.object({
    url: z.string().describe('URL to open'),
    windowId: z.number().optional().describe('Window ID to create tab in'),
@@ -206,7 +206,7 @@ export const new_hidden_page = defineTool({
 export const show_page = defineTool({
  name: 'show_page',
  description:
-    'Restore a hidden page back into a visible browser window. Use after new_hidden_page when you need to make the page visible (e.g. for screenshots). Errors if the page is already visible.',
+    'Restore a hidden page back into a visible browser window. Use after new_hidden_page when you want the user to inspect or interact with it. Errors if the page is already visible.',
  input: z.object({
    page: pageParam,
    windowId: z
--- a/packages/browseros-agent/apps/server/src/tools/windows.ts
+++ b/packages/browseros-agent/apps/server/src/tools/windows.ts
@@ -79,7 +79,7 @@ export const create_window = defineTool({
 export const create_hidden_window = defineTool({
  name: 'create_hidden_window',
  description:
-    'Create a new hidden browser window. Hidden windows are not visible to the user and useful for background automation. Note: take_screenshot is not supported on hidden windows.',
+    'Create a new hidden browser window. Hidden windows are not visible to the user and useful for background automation.',
  input: z.object({}),
  output: z.object({
    window: windowInfoSchema,
--- a/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts
+++ b/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts
@@ -86,7 +86,7 @@ function buildScheduled(overrides?: Partial<BuildSystemPromptOptions>): string {
  return buildSystemPrompt({
    isScheduledTask: true,
    workspaceDir: '/tmp/scheduled',
-    scheduledTaskWindowId: 42,
+    scheduledTaskPageId: 42,
    exclude: ['nudges'],
    ...overrides,
  })
@@ -258,7 +258,7 @@ describe('workspace gating (P11)', () => {
 // from subtle cues (missing sections, restricted tools), which is unreliable.
 //
 // - Regular: no extra framing (default behavior)
-// - Scheduled: must know it's autonomous, in a hidden window, no user interaction
+// - Scheduled: must know it's autonomous, on a hidden page, no user interaction
 // - Chat: must know it's read-only, cannot click/fill/write
 //
 // If mode framing breaks, scheduled tasks may try to ask the user questions,
@@ -310,20 +310,21 @@ describe('mode-aware framing', () => {
    expect(prompt).not.toContain('<page_context>')
  })

-  it('scheduled task includes windowId in page context', () => {
-    const prompt = buildScheduled({ scheduledTaskWindowId: 99 })
-    expect(prompt).toContain('windowId: 99')
+  it('scheduled task includes starting pageId in page context', () => {
+    const prompt = buildScheduled({ scheduledTaskPageId: 99 })
+    expect(prompt).toContain('starting page ID `99`')
  })

-  it('scheduled task without windowId uses Browser Context reference', () => {
-    const prompt = buildScheduled({ scheduledTaskWindowId: undefined })
-    expect(prompt).toContain('the `windowId` from the Browser Context')
+  it('scheduled task without pageId uses Browser Context reference', () => {
+    const prompt = buildScheduled({ scheduledTaskPageId: undefined })
+    expect(prompt).toContain('the page ID from the Browser Context')
  })

-  it('scheduled task includes hidden window management rules', () => {
+  it('scheduled task includes hidden page management rules', () => {
    const prompt = buildScheduled()
-    expect(prompt).toContain('Do NOT close your dedicated hidden window')
+    expect(prompt).toContain('Do NOT close your starting hidden page')
    expect(prompt).toContain('Do NOT create new windows')
+    expect(prompt).toContain('Close extra hidden pages')
  })
 })

@@ -1060,11 +1061,12 @@ describe('execution section', () => {

  it('prohibits hidden windows for user tasks', () => {
    // Why: Run 2 used create_hidden_window instead of background tabs.
-    // Hidden windows are invisible to users and can't be screenshotted.
+    // Hidden pages are invisible to users, so user-requested work must stay on visible tabs.
    const prompt = buildRegular()
    expect(prompt).toContain('Do NOT use')
    expect(prompt).toContain('create_hidden_window')
    expect(prompt).toContain('new_hidden_page')
+    expect(prompt).not.toContain('cannot be screenshotted')
  })

  it('includes tab retry discipline', () => {
--- a/packages/browseros-agent/apps/server/tests/api/routes/klavis.test.ts
+++ b/packages/browseros-agent/apps/server/tests/api/routes/klavis.test.ts
@@ -3,12 +3,17 @@
 * Copyright 2025 BrowserOS
 */

-import { afterEach, describe, it } from 'bun:test'
+import { afterEach, beforeEach, describe, it } from 'bun:test'
 import assert from 'node:assert'
 import { createKlavisRoutes } from '../../../src/api/routes/klavis'
+import { klavisStrataCache } from '../../../src/api/services/klavis/strata-cache'

 const originalFetch = globalThis.fetch

+beforeEach(() => {
+  klavisStrataCache.clear()
+})
+
 afterEach(() => {
  globalThis.fetch = originalFetch
 })
--- a/packages/browseros-agent/apps/server/tests/api/services/chat-service.test.ts
+++ b/packages/browseros-agent/apps/server/tests/api/services/chat-service.test.ts
@@ -0,0 +1,291 @@
+import { describe, expect, it, mock } from 'bun:test'
+
+interface MockMessage {
+  id: string
+  role: 'user' | 'assistant'
+  parts: Array<{ type: 'text'; text: string }>
+}
+
+interface MockAgent {
+  toolLoopAgent: object
+  toolNames: Set<string>
+  messages: MockMessage[]
+  appendUserMessage(text: string): void
+  dispose(): Promise<void>
+}
+
+interface StoredSession {
+  agent: MockAgent
+  hiddenPageId?: number
+}
+
+interface StreamResponseOptions {
+  onFinish(args: { messages: MockMessage[] }): Promise<void>
+}
+
+let agentToReturn: MockAgent | undefined
+let streamResponseHandler:
+  | ((options: StreamResponseOptions) => Promise<Response>)
+  | undefined
+
+const createAgentSpy = mock(async (config: unknown) => {
+  if (!agentToReturn) {
+    throw new Error(`No mock agent configured for ${JSON.stringify(config)}`)
+  }
+  return agentToReturn
+})
+
+const createAgentUIStreamResponseSpy = mock(
+  async (options: StreamResponseOptions) => {
+    if (!streamResponseHandler) {
+      throw new Error('No stream response handler configured')
+    }
+    return await streamResponseHandler(options)
+  },
+)
+
+const resolveLLMConfigSpy = mock(async () => ({
+  provider: 'openai',
+  model: 'gpt-5',
+  apiKey: 'test-key',
+}))
+
+mock.module('ai', () => ({
+  createAgentUIStreamResponse: createAgentUIStreamResponseSpy,
+}))
+
+mock.module('../../../src/agent/ai-sdk-agent', () => ({
+  AiSdkAgent: {
+    create: createAgentSpy,
+  },
+}))
+
+mock.module('../../../src/lib/clients/llm/config', () => ({
+  resolveLLMConfig: resolveLLMConfigSpy,
+}))
+
+mock.module('../../../src/lib/logger', () => ({
+  logger: {
+    info: mock(() => {}),
+    warn: mock(() => {}),
+    debug: mock(() => {}),
+  },
+}))
+
+const { ChatService } = await import('../../../src/api/services/chat-service')
+
+function createSessionStore() {
+  // Minimal in-memory session store keyed by conversation id.
+  const byConversation = new Map<string, StoredSession>()
+
+  // Async removal: disposes the stored agent, then drops the session.
+  const disposeAndDelete = async (conversationId: string) => {
+    const stored = byConversation.get(conversationId)
+    if (!stored) return false
+    await stored.agent.dispose()
+    byConversation.delete(conversationId)
+    return true
+  }
+
+  return {
+    get: (conversationId: string) => byConversation.get(conversationId),
+    set: (conversationId: string, session: StoredSession) => {
+      byConversation.set(conversationId, session)
+    },
+    // Synchronous removal that leaves the agent untouched.
+    remove: (conversationId: string) => byConversation.delete(conversationId),
+    delete: disposeAndDelete,
+    count: () => byConversation.size,
+  }
+}
+
+function createFakeAgent() {
+  // One array backs both `messages` and appendUserMessage, so appended
+  // user turns are visible through the returned `messages` reference.
+  const history: MockMessage[] = []
+  const appendUserMessage = (text: string): void => {
+    const parts: MockMessage['parts'] = [{ type: 'text', text }]
+    history.push({ id: 'user-1', role: 'user', parts })
+  }
+  return {
+    toolLoopAgent: {},
+    toolNames: new Set<string>(),
+    messages: history,
+    appendUserMessage,
+    dispose: mock(async () => {}),
+  }
+}
+
+describe('ChatService scheduled task hidden page lifecycle', () => {
+  it('creates and cleans up a hidden page without creating a hidden window', async () => {
+    const fakeAgent = createFakeAgent()
+    agentToReturn = fakeAgent
+    streamResponseHandler = async ({ onFinish }) => {
+      await onFinish({ messages: fakeAgent.messages })
+      return new Response('ok')
+    }
+
+    const browser = {
+      newPage: mock(async () => 77),
+      listPages: mock(async () => [
+        {
+          pageId: 77,
+          windowId: 11,
+        },
+      ]),
+      closePage: mock(async () => {}),
+      createWindow: mock(async () => ({ windowId: 11 })),
+      closeWindow: mock(async () => {}),
+      resolveTabIds: mock(async () => new Map<number, number>()),
+    }
+    const sessionStore = createSessionStore()
+    const service = new ChatService({
+      sessionStore: sessionStore as never,
+      klavisClient: {} as never,
+      browser: browser as never,
+      registry: {} as never,
+    })
+
+    await service.processMessage(
+      {
+        conversationId: crypto.randomUUID(),
+        message: 'Run the scheduled task',
+        isScheduledTask: true,
+        mode: 'agent',
+        origin: 'sidepanel',
+        browserContext: {
+          windowId: 9,
+          activeTab: {
+            id: 3,
+            url: 'https://example.com',
+            title: 'Example',
+          },
+          selectedTabs: [{ id: 4 }],
+          enabledMcpServers: ['slack'],
+        },
+      } as never,
+      new AbortController().signal,
+    )
+
+    expect(browser.newPage).toHaveBeenCalledWith('about:blank', {
+      hidden: true,
+      background: true,
+    })
+    expect(browser.createWindow).not.toHaveBeenCalled()
+    expect(browser.closePage).toHaveBeenCalledWith(77)
+    expect(browser.closeWindow).not.toHaveBeenCalled()
+
+    const createArgs = createAgentSpy.mock.calls.at(-1)?.[0] as {
+      browserContext?: {
+        windowId?: number
+        selectedTabs?: unknown[]
+        activeTab?: {
+          id: number
+          pageId: number
+          url: string
+          title: string
+        }
+        enabledMcpServers?: string[]
+      }
+    }
+    expect(createArgs.browserContext?.windowId).toBe(11)
+    expect(createArgs.browserContext?.selectedTabs).toBeUndefined()
+    expect(createArgs.browserContext?.activeTab).toEqual({
+      id: 77,
+      pageId: 77,
+      url: 'about:blank',
+      title: 'Scheduled Task',
+    })
+    expect(createArgs.browserContext?.enabledMcpServers).toEqual(['slack'])
+  })
+
+  it('deleteSession closes the tracked hidden page', async () => {
+    const fakeAgent = createFakeAgent()
+    const sessionStore = createSessionStore()
+    const browser = {
+      closePage: mock(async () => {}),
+    }
+    const conversationId = crypto.randomUUID()
+
+    sessionStore.set(conversationId, {
+      agent: fakeAgent,
+      hiddenPageId: 33,
+    })
+
+    const service = new ChatService({
+      sessionStore: sessionStore as never,
+      klavisClient: {} as never,
+      browser: browser as never,
+      registry: {} as never,
+    })
+
+    const result = await service.deleteSession(conversationId)
+
+    expect(result).toEqual({ deleted: true, sessionCount: 0 })
+    expect(browser.closePage).toHaveBeenCalledWith(33)
+    expect(fakeAgent.dispose).toHaveBeenCalledTimes(1)
+  })
+
+  it('keeps the scheduled hidden page context when metadata lookup fails', async () => {
+    const fakeAgent = createFakeAgent()
+    agentToReturn = fakeAgent
+    streamResponseHandler = async ({ onFinish }) => {
+      await onFinish({ messages: fakeAgent.messages })
+      return new Response('ok')
+    }
+
+    const browser = {
+      newPage: mock(async () => 88),
+      listPages: mock(async () => {
+        throw new Error('CDP lookup failed')
+      }),
+      closePage: mock(async () => {}),
+      resolveTabIds: mock(async () => new Map<number, number>()),
+    }
+    const sessionStore = createSessionStore()
+    const service = new ChatService({
+      sessionStore: sessionStore as never,
+      klavisClient: {} as never,
+      browser: browser as never,
+      registry: {} as never,
+    })
+
+    await service.processMessage(
+      {
+        conversationId: crypto.randomUUID(),
+        message: 'Run the scheduled task',
+        isScheduledTask: true,
+        mode: 'agent',
+        origin: 'sidepanel',
+        browserContext: {
+          activeTab: {
+            id: 3,
+            url: 'https://example.com',
+            title: 'Example',
+          },
+        },
+      } as never,
+      new AbortController().signal,
+    )
+
+    const createArgs = createAgentSpy.mock.calls.at(-1)?.[0] as {
+      browserContext?: {
+        windowId?: number
+        activeTab?: {
+          id: number
+          pageId: number
+          url: string
+          title: string
+        }
+      }
+    }
+    expect(createArgs.browserContext?.windowId).toBeUndefined()
+    expect(createArgs.browserContext?.activeTab).toEqual({
+      id: 88,
+      pageId: 88,
+      url: 'about:blank',
+      title: 'Scheduled Task',
+    })
+    expect(browser.closePage).toHaveBeenCalledWith(88)
+  })
+})
--- a/packages/browseros-agent/apps/server/tests/api/services/klavis/strata-cache.test.ts
+++ b/packages/browseros-agent/apps/server/tests/api/services/klavis/strata-cache.test.ts
@@ -0,0 +1,163 @@
+/**
+ * @license
+ * Copyright 2025 BrowserOS
+ */
+
+import { describe, expect, it } from 'bun:test'
+import { KlavisStrataCache } from '../../../../src/api/services/klavis/strata-cache'
+import type {
+  KlavisClient,
+  StrataCreateResponse,
+} from '../../../../src/lib/clients/klavis/klavis-client'
+
+class StubKlavisClient { // test double exposing only createStrata, with knobs for delay and one-shot failure
+  callCount = 0 // number of createStrata invocations so far
+  delayMs = 0 // when > 0, createStrata waits this many ms before resolving
+  shouldThrowOnce = false // when true, the next createStrata call throws 'boom' and clears the flag
+  lastServers: string[] | null = null // servers argument of the most recent createStrata call
+
+  async createStrata(
+    userId: string,
+    servers: string[],
+  ): Promise<StrataCreateResponse> {
+    this.callCount++ // counted before the throw check, so failed calls are included
+    this.lastServers = servers
+    if (this.shouldThrowOnce) {
+      this.shouldThrowOnce = false
+      throw new Error('boom')
+    }
+    if (this.delayMs > 0) {
+      await new Promise((r) => setTimeout(r, this.delayMs))
+    }
+    return { // deterministic response derived only from the arguments
+      strataServerUrl: `https://strata.test/${userId}/${servers.join('-')}`,
+      strataId: `strata_${userId}`,
+      addedServers: servers,
+    }
+  }
+}
+
+const asClient = (stub: StubKlavisClient): KlavisClient => // lets the stub be passed where a KlavisClient is required
+  stub as unknown as KlavisClient // double cast through unknown bypasses structural type checking
+
+describe('KlavisStrataCache', () => { // client.callCount is the probe: each cache miss shows up as one createStrata call
+  it('cache hit returns the same value without re-calling the client', async () => {
+    const cache = new KlavisStrataCache()
+    const client = new StubKlavisClient()
+    const a = await cache.getOrFetch(asClient(client), 'user1', ['Gmail'])
+    const b = await cache.getOrFetch(asClient(client), 'user1', ['Gmail'])
+    expect(client.callCount).toBe(1) // second lookup did not reach the client
+    expect(a.strataServerUrl).toBe(b.strataServerUrl)
+    expect(a.strataId).toBe(b.strataId)
+  })
+
+  it('normalizes server order — [Gmail, Linear] === [Linear, Gmail]', async () => {
+    const cache = new KlavisStrataCache()
+    const client = new StubKlavisClient()
+    await cache.getOrFetch(asClient(client), 'u', ['Gmail', 'Linear'])
+    await cache.getOrFetch(asClient(client), 'u', ['Linear', 'Gmail'])
+    expect(client.callCount).toBe(1)
+  })
+
+  it('dedupes duplicate server names within one call', async () => {
+    const cache = new KlavisStrataCache()
+    const client = new StubKlavisClient()
+    await cache.getOrFetch(asClient(client), 'u', ['Gmail', 'Gmail'])
+    await cache.getOrFetch(asClient(client), 'u', ['Gmail'])
+    expect(client.callCount).toBe(1)
+  })
+
+  it('different user gets a separate cache entry', async () => {
+    const cache = new KlavisStrataCache()
+    const client = new StubKlavisClient()
+    await cache.getOrFetch(asClient(client), 'userA', ['Gmail'])
+    await cache.getOrFetch(asClient(client), 'userB', ['Gmail'])
+    expect(client.callCount).toBe(2)
+  })
+
+  it('different server set (same user) gets a separate cache entry', async () => {
+    const cache = new KlavisStrataCache()
+    const client = new StubKlavisClient()
+    await cache.getOrFetch(asClient(client), 'u', ['Gmail'])
+    await cache.getOrFetch(asClient(client), 'u', ['Gmail', 'Linear'])
+    expect(client.callCount).toBe(2)
+  })
+
+  it('concurrent misses share a single in-flight Promise', async () => {
+    const cache = new KlavisStrataCache()
+    const client = new StubKlavisClient()
+    client.delayMs = 30 // keep the first fetch pending so the other two calls overlap it
+    const [a, b, c] = await Promise.all([
+      cache.getOrFetch(asClient(client), 'u', ['Gmail']),
+      cache.getOrFetch(asClient(client), 'u', ['Gmail']),
+      cache.getOrFetch(asClient(client), 'u', ['Gmail']),
+    ])
+    expect(client.callCount).toBe(1)
+    expect(a.strataId).toBe(b.strataId)
+    expect(b.strataId).toBe(c.strataId)
+  })
+
+  it('TTL expiry triggers a fresh fetch', async () => {
+    const cache = new KlavisStrataCache(10) // 10 ms TTL
+    const client = new StubKlavisClient()
+    await cache.getOrFetch(asClient(client), 'u', ['Gmail'])
+    await new Promise((r) => setTimeout(r, 25)) // real-timer wait, longer than the 10 ms TTL
+    await cache.getOrFetch(asClient(client), 'u', ['Gmail'])
+    expect(client.callCount).toBe(2)
+  })
+
+  it('invalidate(userA) drops only userA entries', async () => {
+    const cache = new KlavisStrataCache()
+    const client = new StubKlavisClient()
+    await cache.getOrFetch(asClient(client), 'userA', ['Gmail'])
+    await cache.getOrFetch(asClient(client), 'userB', ['Gmail'])
+    cache.invalidate('userA')
+    await cache.getOrFetch(asClient(client), 'userA', ['Gmail'])
+    await cache.getOrFetch(asClient(client), 'userB', ['Gmail'])
+    expect(client.callCount).toBe(3) // userA: cold + cold, userB: cold + hit
+  })
+
+  it('invalidate while a fetch is in flight does not store the result', async () => {
+    const cache = new KlavisStrataCache()
+    const client = new StubKlavisClient()
+    client.delayMs = 30 // keep the fetch pending while invalidate runs
+    const inflight = cache.getOrFetch(asClient(client), 'u', ['Gmail'])
+    cache.invalidate('u')
+    const result = await inflight
+    expect(result.strataId).toBe('strata_u') // the original caller still receives its result
+    // Next call should not see the post-invalidate write — must re-fetch.
+    await cache.getOrFetch(asClient(client), 'u', ['Gmail'])
+    expect(client.callCount).toBe(2)
+  })
+
+  it('rejected fetches do not poison the cache', async () => {
+    const cache = new KlavisStrataCache()
+    const client = new StubKlavisClient()
+    client.shouldThrowOnce = true // only the first createStrata call throws 'boom'
+    await expect(
+      cache.getOrFetch(asClient(client), 'u', ['Gmail']),
+    ).rejects.toThrow('boom')
+    await cache.getOrFetch(asClient(client), 'u', ['Gmail'])
+    expect(client.callCount).toBe(2) // failed call plus the successful retry
+  })
+
+  it('clear() drops all entries', async () => {
+    const cache = new KlavisStrataCache()
+    const client = new StubKlavisClient()
+    await cache.getOrFetch(asClient(client), 'userA', ['Gmail'])
+    await cache.getOrFetch(asClient(client), 'userB', ['Linear'])
+    cache.clear()
+    await cache.getOrFetch(asClient(client), 'userA', ['Gmail'])
+    await cache.getOrFetch(asClient(client), 'userB', ['Linear'])
+    expect(client.callCount).toBe(4) // both entries fetched again after clear()
+  })
+
+  it('passes a defensive copy of the servers array to the client', async () => {
+    const cache = new KlavisStrataCache()
+    const client = new StubKlavisClient()
+    const input: readonly string[] = ['Gmail', 'Linear']
+    await cache.getOrFetch(asClient(client), 'u', input)
+    expect(client.lastServers).not.toBe(input) // a different array instance
+    expect(client.lastServers).toEqual(['Gmail', 'Linear']) // with the same contents
+  })
+})
--- a/packages/browseros-agent/apps/server/tests/build.test.ts
+++ b/packages/browseros-agent/apps/server/tests/build.test.ts
@@ -8,11 +8,16 @@

 import { afterAll, describe, it } from 'bun:test'
 import assert from 'node:assert'
-import { mkdtempSync, rmSync, writeFileSync } from 'node:fs'
+import {
+  existsSync,
+  mkdtempSync,
+  readFileSync,
+  rmSync,
+  writeFileSync,
+} from 'node:fs'
 import { tmpdir } from 'node:os'
 import { join, resolve } from 'node:path'

-// Derive the build target from the current platform so the test is portable
 function getNativeTarget(): { id: string; ext: string } {
  const os =
    process.platform === 'darwin'
@@ -24,12 +29,30 @@ function getNativeTarget(): { id: string; ext: string } {
  return { id: `${os}-${cpu}`, ext: process.platform === 'win32' ? '.exe' : '' }
 }

-// Stub values so the build config validation passes without real secrets
-const BUILD_ENV_STUBS: Record<string, string> = {
+const REQUIRED_INLINE_ENV_KEYS = [
+  'BROWSEROS_CONFIG_URL',
+  'CODEGEN_SERVICE_URL',
+  'POSTHOG_API_KEY',
+  'SENTRY_DSN',
+] as const
+
+const R2_ENV_KEYS = [
+  'R2_ACCOUNT_ID',
+  'R2_ACCESS_KEY_ID',
+  'R2_SECRET_ACCESS_KEY',
+  'R2_BUCKET',
+] as const
+
+const PROD_SECRET_KEYS = [...REQUIRED_INLINE_ENV_KEYS, ...R2_ENV_KEYS]
+
+const INLINE_ENV_STUBS: Record<string, string> = {
  BROWSEROS_CONFIG_URL: 'https://stub.test/config',
  CODEGEN_SERVICE_URL: 'https://stub.test/codegen',
  POSTHOG_API_KEY: 'phc_test_stub',
  SENTRY_DSN: 'https://stub@sentry.test/0',
+}
+
+const R2_ENV_STUBS: Record<string, string> = {
  R2_ACCOUNT_ID: 'test',
  R2_ACCESS_KEY_ID: 'test',
  R2_SECRET_ACCESS_KEY: 'test',
@@ -39,23 +62,58 @@ const BUILD_ENV_STUBS: Record<string, string> = {
 describe('server build', () => {
  const rootDir = resolve(import.meta.dir, '../../..')
  const serverPkgPath = resolve(rootDir, 'apps/server/package.json')
+  const prodEnvPath = resolve(rootDir, 'apps/server/.env.production')
+  const prodEnvTemplatePath = resolve(
+    rootDir,
+    'apps/server/.env.production.example',
+  )
+  const originalProdEnv = existsSync(prodEnvPath)
+    ? readFileSync(prodEnvPath, 'utf-8')
+    : null
+  const prodEnvTemplate = readFileSync(prodEnvTemplatePath, 'utf-8')
  const buildScript = resolve(rootDir, 'scripts/build/server.ts')
  const target = getNativeTarget()
  const binaryPath = resolve(
    rootDir,
    `dist/prod/server/.tmp/binaries/browseros-server-${target.id}${target.ext}`,
  )
-
-  // Empty manifest so the build skips R2 resource downloads
+  const zipPath = resolve(
+    rootDir,
+    `dist/prod/server/browseros-server-resources-${target.id}.zip`,
+  )
  const tempDir = mkdtempSync(join(tmpdir(), 'browseros-build-test-'))
  const emptyManifestPath = join(tempDir, 'empty-manifest.json')
  writeFileSync(emptyManifestPath, JSON.stringify({ resources: [] }))

+  function buildEnv( // compose the environment object handed to the spawned build process
+    extraEnv: Record<string, string>, // entries layered on top of process.env
+    omitKeys: readonly string[] = [], // keys deleted from the result, whether inherited or from extraEnv
+  ): NodeJS.ProcessEnv {
+    const env: NodeJS.ProcessEnv = {
+      ...process.env,
+      ...extraEnv,
+    }
+    for (const key of omitKeys) {
+      delete env[key] // deletion runs last, so omitKeys wins over both sources
+    }
+    return env
+  }
+
+  function resetProdEnvToTemplate(): void { // give each test a known .env.production starting point
+    writeFileSync(prodEnvPath, prodEnvTemplate) // overwrite with the contents read from .env.production.example
+  }
+
  afterAll(() => {
    rmSync(tempDir, { recursive: true, force: true })
+    if (originalProdEnv === null) {
+      rmSync(prodEnvPath, { force: true })
+      return
+    }
+    writeFileSync(prodEnvPath, originalProdEnv)
  })

  it('compiles and --version outputs correct version', async () => {
+    resetProdEnvToTemplate()
    const pkg = await Bun.file(serverPkgPath).json()
    const expectedVersion: string = pkg.version

@@ -71,7 +129,7 @@ describe('server build', () => {
        cwd: rootDir,
        stdout: 'pipe',
        stderr: 'pipe',
-        env: { ...process.env, ...BUILD_ENV_STUBS },
+        env: buildEnv({ ...INLINE_ENV_STUBS, ...R2_ENV_STUBS }),
      },
    )
    const buildExit = await build.exited
@@ -97,4 +155,26 @@ describe('server build', () => {
    )
    assert.strictEqual(versionOutput.trim(), expectedVersion)
  }, 300_000)
+
+  it('archives CI builds without R2 config or production env secrets', async () => {
+    resetProdEnvToTemplate()
+    rmSync(zipPath, { force: true }) // remove any stale archive so the existsSync check below is meaningful
+
+    const build = Bun.spawn(
+      ['bun', buildScript, `--target=${target.id}`, '--ci'],
+      {
+        cwd: rootDir,
+        stdout: 'pipe',
+        stderr: 'pipe',
+        env: buildEnv({}, PROD_SECRET_KEYS), // inherit process.env minus the inline-env and R2 keys
+      },
+    )
+    const buildExit = await build.exited
+    if (buildExit !== 0) {
+      const stderr = await new Response(build.stderr).text() // surface the build's stderr in the failure message
+      assert.fail(`CI build failed (exit ${buildExit}):\n${stderr}`)
+    }
+
+    assert.ok(existsSync(zipPath), `Expected archive at ${zipPath}`)
+  }, 300_000)
 })
--- a/packages/browseros-agent/apps/server/tests/graph/executor.test.ts
+++ b/packages/browseros-agent/apps/server/tests/graph/executor.test.ts
@@ -1,285 +0,0 @@
-/**
- * @license
- * Copyright 2025 BrowserOS
- */
-
-import { describe, it } from 'bun:test'
-import assert from 'node:assert'
-
-import { transformCodeForExecution } from '../../src/graph/executor'
-
-describe('transformCodeForExecution', () => {
-  describe('single-line imports', () => {
-    it('removes default import', () => {
-      const code = `import foo from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes named import', () => {
-      const code = `import { foo } from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes multiple named imports', () => {
-      const code = `import { foo, bar, baz } from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes namespace import', () => {
-      const code = `import * as pkg from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes side-effect import', () => {
-      const code = `import 'side-effect'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes default + named import', () => {
-      const code = `import foo, { bar } from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes import with alias', () => {
-      const code = `import { foo as f } from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-  })
-
-  describe('type imports', () => {
-    it('removes type import', () => {
-      const code = `import type { Foo } from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes type default import', () => {
-      const code = `import type Foo from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes inline type specifier', () => {
-      const code = `import { type Foo, bar } from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-  })
-
-  describe('multi-line imports', () => {
-    it('removes multi-line named imports', () => {
-      const code = `import {
-  foo,
-  bar,
-} from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes multi-line type imports', () => {
-      const code = `import type {
-  Foo,
-  Bar,
-} from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes multi-line imports with aliases', () => {
-      const code = `import {
-  foo as f,
-  bar as b,
-} from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes deeply nested multi-line imports', () => {
-      const code = `import {
-  foo,
-  bar,
-  baz,
-  qux,
-} from '@scoped/package-name'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-  })
-
-  describe('quote styles', () => {
-    it('handles single quotes', () => {
-      const code = `import foo from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('handles double quotes', () => {
-      const code = `import foo from "pkg"
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-  })
-
-  describe('multiple imports', () => {
-    it('removes all imports from different packages', () => {
-      const code = `import { z } from 'zod'
-import { Agent } from '@browseros-ai/agent-sdk'
-import type { Config } from './types'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes mixed single and multi-line imports', () => {
-      const code = `import foo from 'foo'
-import {
-  bar,
-  baz,
-} from 'bar'
-import qux from 'qux'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-  })
-
-  describe('indentation', () => {
-    it('removes indented imports', () => {
-      const code = `  import foo from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes tab-indented imports', () => {
-      const code = `\timport foo from 'pkg'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-  })
-
-  describe('preserves non-import code', () => {
-    it('preserves all code after imports', () => {
-      const code = `import foo from 'pkg'
-
-export async function run(agent) {
-  await agent.navigate('https://example.com')
-  return 'done'
-}`
-      const result = transformCodeForExecution(code)
-      assert.ok(result.includes('export async function run(agent)'))
-      assert.ok(result.includes("await agent.navigate('https://example.com')"))
-      assert.ok(result.includes("return 'done'"))
-      assert.ok(!result.includes('import'))
-    })
-
-    it('preserves code with import-like strings', () => {
-      const code = `import foo from 'pkg'
-const str = "import { x } from 'y'"
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.ok(result.includes(`const str = "import { x } from 'y'"`))
-      assert.ok(result.includes('const x = 1'))
-    })
-
-    it('preserves dynamic imports', () => {
-      const code = `import foo from 'pkg'
-const mod = await import('./dynamic')
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.ok(result.includes("const mod = await import('./dynamic')"))
-      assert.ok(result.includes('const x = 1'))
-    })
-  })
-
-  describe('scoped packages', () => {
-    it('removes @scoped/package imports', () => {
-      const code = `import { Agent } from '@browseros-ai/agent-sdk'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes deeply scoped package imports', () => {
-      const code = `import { foo } from '@org/pkg/sub/path'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-  })
-
-  describe('relative imports', () => {
-    it('removes relative imports', () => {
-      const code = `import foo from './foo'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('removes parent directory imports', () => {
-      const code = `import foo from '../foo'
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-  })
-
-  describe('edge cases', () => {
-    it('handles empty code', () => {
-      const result = transformCodeForExecution('')
-      assert.strictEqual(result, '')
-    })
-
-    it('handles code with no imports', () => {
-      const code = `const x = 1
-const y = 2`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result, code)
-    })
-
-    it('handles code with only imports', () => {
-      const code = `import foo from 'foo'
-import bar from 'bar'`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), '')
-    })
-
-    it('handles imports with trailing semicolons', () => {
-      const code = `import foo from 'pkg';
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-
-    it('handles imports with trailing comments', () => {
-      const code = `import foo from 'pkg' // comment
-const x = 1`
-      const result = transformCodeForExecution(code)
-      assert.strictEqual(result.trim(), 'const x = 1')
-    })
-  })
-})
--- a/packages/browseros-agent/apps/server/tests/tools/navigation.test.ts
+++ b/packages/browseros-agent/apps/server/tests/tools/navigation.test.ts
@@ -29,6 +29,13 @@ function structuredOf<T>(result: { structuredContent?: unknown }): T {
 }

 describe('navigation tools', () => {
+  it('hidden-page tool descriptions do not claim screenshots are unsupported', () => { // inspects description strings only; withBrowser is not used
+    assert.ok(
+      !new_hidden_page.description.includes('take_screenshot is not supported'),
+    )
+    assert.ok(!show_page.description.includes('for screenshots'))
+  })
+
  it('list_pages returns at least one page', async () => {
    await withBrowser(async ({ execute }) => {
      const result = await execute(list_pages, {})
--- a/packages/browseros-agent/apps/server/tests/tools/windows.test.ts
+++ b/packages/browseros-agent/apps/server/tests/tools/windows.test.ts
@@ -24,6 +24,14 @@ function structuredOf<T>(result: { structuredContent?: unknown }): T {
 }

 describe('window tools', () => {
+  it('create_hidden_window description does not claim screenshots are unsupported', () => { // inspects the description string only; withBrowser is not used
+    assert.ok(
+      !create_hidden_window.description.includes(
+        'take_screenshot is not supported',
+      ),
+    )
+  })
+
  it('list_windows returns at least one window', async () => {
    await withBrowser(async ({ execute }) => {
      const result = await execute(list_windows, {})
--- a/packages/browseros-agent/bun.lock
+++ b/packages/browseros-agent/bun.lock
@@ -23,7 +23,7 @@
    },
    "apps/agent": {
      "name": "@browseros/agent",
-      "version": "0.0.98",
+      "version": "0.0.99",
      "dependencies": {
        "@ai-sdk/react": "^3.0.96",
        "@browseros/server": "workspace:*",
@@ -152,7 +152,7 @@
    },
    "apps/server": {
      "name": "@browseros/server",
-      "version": "0.0.80",
+      "version": "0.0.82",
      "bin": {
        "browseros-server": "./src/index.ts",
      },
--- a/packages/browseros-agent/docs/events.md
+++ b/packages/browseros-agent/docs/events.md
@@ -141,21 +141,6 @@ Prefix: `browseros.native.extension.`
 | `settings.scheduled_task.cancelled` | — | Running task was cancelled |
 | `settings.scheduled_task.retried` | — | Task run was retried |

-### Settings — Workflows
-
-| Event | Properties | Description |
-|-------|-----------|-------------|
-| `settings.graph.created` | — | New workflow graph created |
-| `settings.graph.saved` | — | Workflow graph saved |
-| `settings.graph.updated` | — | Workflow graph updated |
-| `settings.graph.message.like` | — | Workflow message liked |
-| `settings.graph.message.dislike` | — | Workflow message disliked |
-| `settings.workflow.deleted` | — | Workflow deleted |
-| `settings.workflow.run_started` | — | Workflow run started |
-| `settings.workflow.run_stopped` | — | Workflow run stopped |
-| `settings.workflow.run_retried` | — | Workflow run retried |
-| `settings.workflow.run_completed` | — | Workflow run completed |
-
 ### Onboarding

 | Event | Properties | Description |
--- a/packages/browseros-agent/package.json
+++ b/packages/browseros-agent/package.json
@@ -19,7 +19,7 @@
    "start:agent": "bun run --filter @browseros/agent dev",
    "build": "bun run build:server && bun run build:agent",
    "build:server": "FORCE_COLOR=1 bun scripts/build/server.ts --target=all",
-    "build:server:ci": "FORCE_COLOR=1 bun scripts/build/server.ts --target=all --compile-only",
+    "build:server:ci": "FORCE_COLOR=1 bun scripts/build/server.ts --target=all --ci",
    "build:server:test": "FORCE_COLOR=1 bun scripts/build/server.ts --target=darwin-arm64 --no-upload",
    "upload:cli-installers": "bun scripts/build/cli.ts",
    "start:server:test": "bun run build:server:test && set -a && . apps/server/.env.development && set +a && dist/prod/server/.tmp/binaries/browseros-server-darwin-arm64",
--- a/packages/browseros-agent/scripts/build/server/archive.ts
+++ b/packages/browseros-agent/scripts/build/server/archive.ts
@@ -37,29 +37,39 @@ export async function archiveAndUploadArtifacts(
  r2: R2Config,
  upload: boolean,
 ): Promise<UploadResult[]> {
-  const results: UploadResult[] = []
+  const results = await archiveArtifacts(artifacts)
+  if (!upload) {
+    return results
+  }

-  for (const artifact of artifacts) {
-    const zipPath = zipPathForArtifact(artifact)
-    await zipArtifactRoot(artifact.rootDir, zipPath)
-
-    if (!upload) {
-      results.push({ targetId: artifact.target.id, zipPath })
-      continue
-    }
-
-    const fileName = basename(zipPath)
+  const uploadedResults: UploadResult[] = []
+  for (const result of results) {
+    const fileName = basename(result.zipPath)
    const latestR2Key = joinObjectKey(r2.uploadPrefix, 'latest', fileName)
    const versionR2Key = joinObjectKey(r2.uploadPrefix, version, fileName)
-    await uploadFileToObject(client, r2, latestR2Key, zipPath)
-    await uploadFileToObject(client, r2, versionR2Key, zipPath)
-    results.push({
-      targetId: artifact.target.id,
-      zipPath,
+    await uploadFileToObject(client, r2, latestR2Key, result.zipPath)
+    await uploadFileToObject(client, r2, versionR2Key, result.zipPath)
+    uploadedResults.push({
+      targetId: result.targetId,
+      zipPath: result.zipPath,
      latestR2Key,
      versionR2Key,
    })
  }

+  return uploadedResults
+}
+
+export async function archiveArtifacts( // zip each staged artifact; this function performs no upload
+  artifacts: StagedArtifact[],
+): Promise<UploadResult[]> {
+  const results: UploadResult[] = []
+
+  for (const artifact of artifacts) {
+    const zipPath = zipPathForArtifact(artifact)
+    await zipArtifactRoot(artifact.rootDir, zipPath) // archives are produced one at a time, in input order
+    results.push({ targetId: artifact.target.id, zipPath }) // latestR2Key / versionR2Key are not set here
+  }
+
   return results
 }
--- a/packages/browseros-agent/scripts/build/server/cli.ts
+++ b/packages/browseros-agent/scripts/build/server/cli.ts
@@ -22,23 +22,26 @@ export function parseBuildArgs(argv: string[]): BuildArgs {
    .option('--upload', 'Upload artifact zips to R2')
    .option('--no-upload', 'Skip zip upload to R2')
    .option(
-      '--compile-only',
-      'Compile binaries only (skip R2 staging and upload)',
+      '--ci',
+      'Build local release zip artifacts for CI without R2 and without requiring production env secrets',
    )
  program.parse(argv, { from: 'user' })
  const options = program.opts<{
    target: string
    manifest: string
    upload: boolean
-    compileOnly: boolean
+    ci: boolean
  }>()

-  const compileOnly = options.compileOnly ?? false
+  const ci = options.ci ?? false
+  if (ci && options.upload) {
+    throw new Error('--ci cannot be combined with --upload')
+  }

  return {
    targets: resolveTargets(options.target),
    manifestPath: options.manifest,
-    upload: compileOnly ? false : (options.upload ?? true),
-    compileOnly,
+    upload: ci ? false : (options.upload ?? true),
+    ci,
  }
 }
--- a/packages/browseros-agent/scripts/build/server/compile.ts
+++ b/packages/browseros-agent/scripts/build/server/compile.ts
@@ -1,6 +1,7 @@
 import { mkdirSync, rmSync } from 'node:fs'
 import { join } from 'node:path'

+import { log } from '../log'
 import { wasmBinaryPlugin } from '../plugins/wasm-binary'
 import { runCommand } from './command'
 import type { BuildTarget, CompiledServerBinary } from './types'
@@ -52,6 +53,7 @@ async function bundleServer(
 async function compileTarget(
  target: BuildTarget,
  env: NodeJS.ProcessEnv,
+  ci: boolean,
 ): Promise<string> {
  const binaryPath = compiledBinaryPath(target)
  const args = [
@@ -66,11 +68,15 @@ async function compileTarget(
  await runCommand('bun', args, env)

  if (target.os === 'windows') {
-    await runCommand(
-      'bun',
-      ['scripts/patch-windows-exe.ts', binaryPath],
-      process.env,
-    )
+    if (ci) {
+      log.warn('Skipping Windows exe metadata patching in CI mode')
+    } else {
+      await runCommand(
+        'bun',
+        ['scripts/patch-windows-exe.ts', binaryPath],
+        process.env,
+      )
+    }
  }

  return binaryPath
@@ -81,14 +87,16 @@ export async function compileServerBinaries(
  envVars: Record<string, string>,
  processEnv: NodeJS.ProcessEnv,
  version: string,
+  options?: { ci?: boolean },
 ): Promise<CompiledServerBinary[]> {
+  const ci = options?.ci ?? false
  rmSync(TMP_ROOT, { recursive: true, force: true })
  mkdirSync(BINARIES_DIR, { recursive: true })
  await bundleServer(envVars, version)

  const compiled: CompiledServerBinary[] = []
  for (const target of targets) {
-    const binaryPath = await compileTarget(target, processEnv)
+    const binaryPath = await compileTarget(target, processEnv, ci)
    compiled.push({ target, binaryPath })
  }

--- a/packages/browseros-agent/scripts/build/server/config.ts
+++ b/packages/browseros-agent/scripts/build/server/config.ts
@@ -75,7 +75,7 @@ function validateProductionEnv(envVars: Record<string, string>): void {
 }

 export interface LoadBuildConfigOptions {
-  compileOnly?: boolean
+  ci?: boolean
 }

 export function loadBuildConfig(
@@ -84,7 +84,9 @@ export function loadBuildConfig(
 ): BuildConfig {
  const fileEnv = loadProdEnv(rootDir)
  const envVars = buildInlineEnv(fileEnv)
-  validateProductionEnv(envVars)
+  if (!options.ci) {
+    validateProductionEnv(envVars)
+  }

  const processEnv: NodeJS.ProcessEnv = {
    PATH: process.env.PATH ?? '',
@@ -92,7 +94,7 @@ export function loadBuildConfig(
    ...process.env,
  }

-  if (options.compileOnly) {
+  if (options.ci) {
    return { version: readServerVersion(rootDir), envVars, processEnv }
  }

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
shivammittal274	e310a5d6cd	ci: add timeout and continue-on-error for trend report step	2026-04-09 23:16:48 +05:30
shivammittal274	ef7a022711	chore: restore kimi-k2p5 as default eval config	2026-04-09 20:22:27 +05:30
shivammittal274	16ce8853c6	chore: temp switch to opus 4.6 for eval run	2026-04-09 20:21:45 +05:30
shivammittal274	053c7733b4	fix: register deterministic graders in pass rate calculation Add agisdk_state_diff and infinity_state to PASS_FAIL_GRADER_ORDER in both runner types and weekly report script, so scores show correctly in the dashboard.	2026-04-09 13:23:51 +05:30
shivammittal274	3e3cff5404	chore: switch eval configs back to kimi-k2p5	2026-04-09 12:22:05 +05:30
shivammittal274	6305c87aa4	feat: add deterministic eval graders (AGI SDK + WebArena-Infinity) Two new benchmark integrations with programmatic grading — no LLM judge. AGI SDK / REAL Bench (52 tasks): - 11 React/Next.js clones of consumer apps (DoorDash, Amazon, Gmail, etc.) - Grader navigates browser to /finish, extracts state diff from <pre> tag - Python verifier checks exact values via jmespath queries WebArena-Infinity (50 hard tasks): - 13 LLM-generated SaaS clones (Gmail, GitLab, Linear, Figma, etc.) - InfinityAppManager starts fresh app server per task per worker - Python verifier calls /api/state and asserts on JSON state Infrastructure: - GraderInput extended with mcpUrl + infinityAppUrl for parallel workers - Each worker gets isolated ports (no cross-worker state contamination) - CI workflow: pip install agisdk, clone webarena-infinity repo	2026-04-09 12:19:55 +05:30
Felarof	df7873562d	Revert Kimi partnership UI, restore daily limit survey (#663 ) * docs: add uBlock Origin install info to getting started and ad-blocking pages Chrome dropped support for the full uBlock Origin extension — highlight that BrowserOS brings it back and make it easy to install from both the getting started guide and the dedicated ad-blocking page. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * feat: revert Kimi partnership UI, restore daily limit survey Remove Kimi/Moonshot AI partnership branding from the rate limit banner, provider card, provider templates, and LLM hub. Restore the original survey CTA on daily limit errors. Moonshot AI remains as a regular provider template without the "Recommended" badge. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: address Greptile review comments - Guard survey CTA with !isCreditsExhausted to avoid showing it for credits-exhausted users who already see "View Usage & Billing" - Remove dead kimi-launch feature flag files (kimi-launch.ts, useKimiLaunch.ts) - Remove unused KIMI_RATE_LIMIT analytics events - Remove VITE_PUBLIC_KIMI_LAUNCH from env schema and .env.example Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>	2026-04-08 16:39:00 -07:00
shivammittal274	412386b489	fix: ensure custom model entry is always visible in model selector (#662 ) The merged PR (#661) injected custom entries into filteredModels, but cmdk auto-scrolls to its first selected CommandItem, pushing the custom entry out of view. Fix by using forceMount on a separate CommandGroup and resetting scroll to top on every keystroke via requestAnimationFrame.	2026-04-09 02:40:38 +05:30
shivammittal274	33617ba9e7	feat: show custom model ID as first option in model selector (#661 ) * feat: show custom model ID as first option in model selector When typing in the model dropdown, the user's exact input now appears as the first selectable row, followed by fuzzy search suggestions. This makes entering custom model IDs intuitive — previously the option was hidden behind a zero-results-only Enter shortcut that fuzzy search almost always prevented. * fix: correct is_custom_model flag and prevent duplicate analytics events - Use modelInfoList check instead of hardcoding is_custom_model: true in the Enter key handler - Add stopPropagation to prevent cmdk's root keydown handler from also firing onSelect, which caused duplicate MODEL_SELECTED_EVENT emissions	2026-04-09 01:44:17 +05:30
Nikhil	6712e1d321	chore: bump server and extension version (#659)	2026-04-08 10:18:24 -07:00
Dani Akash	94540d9e87	chore(agent): remove workflows feature (#656)	2026-04-08 08:42:22 +05:30
Nikhil	bb62213e84	fix: install linux sysroot in configure, not via gclient hook (#653 ) * fix: install linux sysroot in configure, not via gclient hook `gn gen` was failing on the arm64 leg with `Missing sysroot (//build/linux/debian_bullseye_arm64-sysroot)`. The previous design relied on `git_setup` writing `target_cpus` to `.gclient` so that `gclient sync`'s DEPS hook would download the cross-arch sysroot. That chain breaks for any chromium_src that was synced before cross-arch support landed (the hook is gated on .gclient state at sync time) and for partial pipeline runs that skip git_setup entirely. Nothing in configure declared or verified its sysroot precondition. Make configure self-healing: on Linux, invoke `build/linux/sysroot_scripts/install-sysroot.py --arch=<target>` directly before `gn gen`. install-sysroot.py is idempotent (stamp file + SHA check), fast when already installed, and decoupled from .gclient — it's exactly what the failing assertion's error message recommends. The script accepts our arch names directly: `x64` translates to `amd64` internally via ARCH_TRANSLATIONS, and `arm64` is a valid pass-through. Also temporarily pin release.linux.yaml to x64 only while we validate the sysroot bootstrap end-to-end. Flip back to `[x64, arm64]` once arm64 is green. * chore: pin release.linux.yaml to arm64-only for sysroot bootstrap test x64 already builds cleanly — the failing leg is arm64 cross-compile from an x64 host. Pin the config to arm64 to exercise the new install-sysroot.py path in configure without burning time on x64. Flip back to [x64, arm64] once arm64 is green.	2026-04-07 11:12:21 -07:00
Nikhil	dee3086a48	feat(server): cache klavis createStrata to unblock /chat hot path (#654 ) * feat(server): cache klavis createStrata to unblock /chat hot path Conversation creation in /chat was blocking on a Worker-proxied klavisClient.createStrata round-trip every time the user had any managed Klavis app connected. The 5s KLAVIS_TIMEOUT_MS in the ai-worker proxy existed specifically to bound this latency, but the same cap also caused user-visible 504s on /klavis/servers/remove since Strata DELETE operations routinely take >5s. Without caching we couldn't raise the timeout without regressing chat creation. This adds an in-process cache for Strata createStrata responses, keyed by (browserosId, hashed sorted-server-set) and gated by a 1h TTL. The cache stores only immutable JSON metadata (strataServerUrl, strataId, addedServers); per-session MCP clients continue to be opened and disposed by AiSdkAgent exactly as before, which keeps the cache concurrency-safe by construction. Cache invalidation has two layers: (a) the cache key embeds the server set, so adding/removing apps naturally produces a different key; (b) POST /klavis/servers/add and DELETE /klavis/servers/remove explicitly call invalidate(browserosId) after their underlying Klavis API call succeeds, as defense-in-depth. Other changes: - Consolidates klavis-related services into a new apps/server/src/api/services/klavis/ directory; moves register-klavis-mcp.ts -> strata-proxy.ts and adds strata-cache.ts there. lib/clients/klavis/ stays unchanged. - Refactors KlavisClient.removeServer into a low-level deleteServersFromStrata(strataId, servers) primitive. The cache-lookup + delete + invalidate orchestration moves up into routes/klavis.ts where it belongs, eliminating the lib->api layering inversion the original removeServer would have introduced. - Uses Bun.hash (xxhash64) for fixed-width 16-hex-char keys, with serverKey verified on read to make collision risk strictly zero. 
- Dedupes concurrent fetches via in-flight Promise sharing, with identity-checks before delete to avoid races between invalidate() and a racing replacement insert. Follow-up (separate PR): bump KLAVIS_TIMEOUT_MS to 30000 in ai-worker/wrangler.toml so /klavis/servers/remove stops 504-ing. * fix: address greptile review comments for klavis strata cache - Drop dead `invalidated` field on InflightEntry. It was added to support a "discard post-resolution if invalidated" check that I later replaced with identity-checked deletes during self-review, but I forgot to remove the field and the misleading comment referencing it. Simplify Map<string, InflightEntry> to plain Map<string, Promise<CacheEntry>>. - Lower cache miss log from info to debug. Misses fire on every new conversation; matching the existing debug-level for hits. - Stop routing the /klavis/servers/remove handler through klavisStrataCache.getOrFetch. The chat hot path keys its cache by the user's full enabled-server set (e.g. hash('Gmail,Linear')), so a single-server lookup here (hash('Gmail')) is guaranteed to miss, write a spurious entry, and then have it immediately cleared by invalidate() on the next line. Call createStrata directly to recover the strataId, mirroring the original removeServer flow.	2026-04-07 11:11:41 -07:00
Nikhil	8de2bf984f	feat: build linux x64 + arm64 in a single invocation (#652 ) `release.linux.yaml` now declares `architecture: [x64, arm64]` and the runner loops the entire pipeline once per architecture. depot_tools fetches both Linux sysroots automatically — `git_setup` idempotently ensures `target_cpus = ['x64', 'arm64']` is in `.gclient` before `gclient sync`, so cross-compiling arm64 from an x64 host just works. The resolver returns `List[Context]` (single-element for the common single-arch case), and `build/cli/build.py` loops `execute_pipeline` over the per-arch contexts. Modules stay 100% arch-agnostic — no new orchestration module, no new YAML schema beyond the list form. Also fix a cross-compile bug in `build/modules/package/linux.py`: the appimagetool binary must match the BUILD machine's arch (it executes locally), not the target arch. Split into a host-keyed `LINUX_HOST_APPIMAGETOOL` lookup vs the existing target-keyed `LINUX_ARCHITECTURE_CONFIG`. Target arch is still passed to appimagetool via the `ARCH` env var. - build/common/resolver.py: scalar OR list `architecture` -> List[Context] - build/cli/build.py: loop pipeline per arch, log multi-arch headers - build/config/release.linux.yaml: `architecture: [x64, arm64]` - build/modules/setup/git.py: idempotent `target_cpus` edit on Linux - build/modules/package/linux.py: host vs target appimagetool split - build/modules/package/linux_test.py: cover the host/target split	2026-04-06 13:08:06 -07:00
Nikhil	1b8720740c	feat: add linux arm64 release support (#651) * feat: support linux arm64 release artifacts * fix: address PR review comments for 0406-linux_arm64_support	2026-04-06 10:20:38 -07:00
Nikhil	91be726381	refactor: remove --compile-only flag, consolidate into --ci (#646 ) The --compile-only and --ci flags served overlapping purposes for CI builds. Remove --compile-only entirely since --ci already handles the CI use case (skip R2, skip prod env validation, local zip packaging) and --no-upload covers the upload-skipping use case for full builds.	2026-04-03 14:58:52 -07:00
Nikhil	ff5386a24a	fix: agent storage issue on update (#643) * fix: agent storage erase issue fix * fix: remove the guard against remote	2026-04-03 14:50:14 -07:00
Nikhil	a5f3c4da65	fix: skip windows exe patching in ci mode to avoid wine dependency (#645 ) The server release CI workflow fails on ubuntu-latest because patch-windows-exe.ts requires Wine to run rcedit. Thread the existing --ci flag through compileServerBinaries so Windows PE metadata patching is skipped in CI mode with a warning log.	2026-04-03 14:46:33 -07:00
Nikhil	e5a852dd3d	chore: update server version (#644)	2026-04-03 14:29:07 -07:00
Felarof	aee30ce8e1	Update README.md (#638)	2026-04-02 13:00:11 -07:00
Nikhil	0833c8d42d	fix: windows app-data location fix (#637)	2026-04-02 08:53:04 -07:00
Nikhil	036c7f280b	fix: tab-grouping cdp crash (#635 ) * fix: tab group crash + history fix * fix: tab group crash + history fix	2026-04-01 15:06:41 -07:00
Nikhil	000429277d	fix: isolate server release packaging to ci mode (#629 ) * fix: relax compile-only release env requirements * refactor: add ci mode for server release builds	2026-03-31 20:57:44 -07:00
Nikhil	f8535fd96d	fix: exclude eval framework from language stats via gitattributes (#630)	2026-03-31 20:44:06 -07:00
Nikhil	f0cbf77924	feat: add server release workflow (#627 ) * feat: add server release workflow * fix: address PR review comments for 0331-add_server_release_workflow * refactor: rework 0331-add_server_release_workflow based on feedback * refactor: rework 0331-add_server_release_workflow based on feedback	2026-03-31 17:37:06 -07:00
Nikhil	17be06eb2f	fix: report release cli version correctly (#626)	2026-03-31 16:17:57 -07:00
Nikhil	0e90785500	fix: accept port-only input in CLI init command (#625 ) Users can now run `browseros-cli init 9000` in addition to the full URL. Updated default example port from 9004 to 9000.	2026-03-31 16:16:30 -07:00
Nikhil	2bb432b0f2	feat: use hidden pages for scheduled tasks (#624 ) * feat: use hidden pages for scheduled tasks * refactor: rework 0331-use_hidden_pages_for_scheduled_tasks based on feedback	2026-03-31 16:02:47 -07:00