From 2b4fdf1aad4de7980b3ca9c2753a95a39b22fb12 Mon Sep 17 00:00:00 2001 From: Dani Akash Date: Fri, 20 Mar 2026 18:31:36 +0530 Subject: [PATCH] feat: improved multi tab agent workflow (#507) * feat: updated multitab workflow * fix: updated prompt with fix for test cases * fix: active agent glow * fix: review comments --- .../sidepanel/index/useNotifyActiveTab.tsx | 74 ++++++++++++------- .../browseros-agent/apps/agent/wxt.config.ts | 1 + .../apps/server/src/agent/prompt.ts | 44 ++++++++++- .../apps/server/src/tools/navigation.ts | 13 ++-- .../apps/server/src/tools/snapshot.ts | 8 +- .../apps/server/tests/agent/prompt.test.ts | 51 ++++++++++++- 6 files changed, 155 insertions(+), 36 deletions(-) diff --git a/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/useNotifyActiveTab.tsx b/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/useNotifyActiveTab.tsx index ff0986205..e63c5ad6f 100644 --- a/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/useNotifyActiveTab.tsx +++ b/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/useNotifyActiveTab.tsx @@ -19,6 +19,10 @@ function extractTabId(toolPart: ToolUIPart | null): number | undefined { return input?.tabId } +function sendGlow(tabId: number, message: GlowMessage): void { + chrome.tabs.sendMessage(tabId, message).catch(() => {}) +} + export const useNotifyActiveTab = ({ messages, status, @@ -28,7 +32,10 @@ export const useNotifyActiveTab = ({ status: ChatStatus conversationId: string }) => { - const lastTabIdRef = useRef<number | null>(null) + // Track the single tab currently glowing + const activeTabIdRef = useRef<number | null>(null) + // Track all tabs that have been glowed during this stream (for cleanup) + const allGlowedTabsRef = useRef<Set<number>>(new Set()) const lastMessage = messages?.[messages.length - 1] @@ -41,27 +48,35 @@ export const useNotifyActiveTab = ({ useEffect(() => { const isStreaming = status === 'streaming' - const previousTabId = lastTabIdRef.current if (!isStreaming) { - if 
(previousTabId) { + // Deactivate ALL tabs that were glowed during this stream + const allGlowed = allGlowedTabsRef.current + if (allGlowed.size > 0) { const deactivate = async () => { + // Capture tab IDs before any async work to avoid race with clear() + const tabIds = Array.from(allGlowed) + allGlowed.clear() + const alreadyShown = await firstRunConfettiShownStorage.getValue() - const deactivateMessage: GlowMessage = { - conversationId, - isActive: false, - showConfetti: !alreadyShown, + let showConfetti = !alreadyShown + + for (const tabId of tabIds) { + sendGlow(tabId, { + conversationId, + isActive: false, + showConfetti, + }) + showConfetti = false } - chrome.tabs - .sendMessage(previousTabId, deactivateMessage) - .catch(() => {}) + if (!alreadyShown) { await firstRunConfettiShownStorage.setValue(true) } } deactivate() - lastTabIdRef.current = null } + activeTabIdRef.current = null return } @@ -70,34 +85,41 @@ export const useNotifyActiveTab = ({ let cancelled = false const activate = async () => { - let targetTabId = toolTabId ?? previousTabId ?? undefined + let targetTabId = toolTabId ?? 
undefined if (!targetTabId) { - const tabs = await chrome.tabs.query({ - active: true, - currentWindow: true, - }) - targetTabId = tabs[0]?.id + // Fallback: reuse the tab that is already glowing; otherwise ask the browser for the active tab in the current window + if (activeTabIdRef.current) { + targetTabId = activeTabIdRef.current + } else { + const tabs = await chrome.tabs.query({ + active: true, + currentWindow: true, + }) + targetTabId = tabs[0]?.id + } } if (cancelled || !targetTabId) return + const previousTabId = activeTabIdRef.current + + // If the agent moved to a different tab, deactivate the previous one if (previousTabId && previousTabId !== targetTabId) { - const deactivateMessage: GlowMessage = { + sendGlow(previousTabId, { conversationId, isActive: false, - } - chrome.tabs - .sendMessage(previousTabId, deactivateMessage) - .catch(() => {}) + }) } - const activateMessage: GlowMessage = { + // Activate glow on the target tab + sendGlow(targetTabId, { conversationId, isActive: true, - } - chrome.tabs.sendMessage(targetTabId, activateMessage).catch(() => {}) - lastTabIdRef.current = targetTabId + }) + + activeTabIdRef.current = targetTabId + allGlowedTabsRef.current.add(targetTabId) } activate() diff --git a/packages/browseros-agent/apps/agent/wxt.config.ts b/packages/browseros-agent/apps/agent/wxt.config.ts index 0c4b849cd..ebd8e06d6 100644 --- a/packages/browseros-agent/apps/agent/wxt.config.ts +++ b/packages/browseros-agent/apps/agent/wxt.config.ts @@ -55,6 +55,7 @@ export default defineConfig({ permissions: [ 'topSites', 'tabs', + 'tabGroups', 'storage', 'sidePanel', 'browserOS', diff --git a/packages/browseros-agent/apps/server/src/agent/prompt.ts b/packages/browseros-agent/apps/server/src/agent/prompt.ts index fbd59767d..0c3fd0ae2 100644 --- a/packages/browseros-agent/apps/server/src/agent/prompt.ts +++ b/packages/browseros-agent/apps/server/src/agent/prompt.ts @@ -203,7 +203,28 @@ function getExecution( - Don't ask permission for routine steps. Act, then report. 
- Do not refuse by default, attempt tasks even when outcomes are uncertain. - For ambiguous/unclear requests, ask one targeted clarifying question. -- Stay on the current page. Only open new tabs when the user explicitly asks. +- Stay on the current page for single-page tasks. Use \`navigate_page\` to move within one tab. + +### Multi-tab workflow +When a task requires working on multiple pages simultaneously: +1. **Inform the user** that you're creating background tabs for the task. +2. **Open new tabs in background** using \`new_page\` (opens in background by default) — never steal focus from the user's current tab. +3. **IMMEDIATELY create a tab group** using \`group_tabs\` with a descriptive title — do this right after opening the tabs, before any other work. Include the user's current tab in the group. Every multi-tab task MUST have a tab group. +4. **Work on background tabs** — all tools (click, fill, navigate, snapshot) work on background tabs via their page ID. +5. **Narrate progress in chat** — keep the user informed: "Checking Vercel pricing... Now checking Netlify..." +6. **Report results in chat** — summarize findings so the user doesn't need to switch tabs. Leave tabs open for the user to browse later. +7. **Never force-switch the user's active tab.** If you need user interaction on a background tab (e.g., login, CAPTCHA), tell the user which tab needs attention and let them switch manually. +8. **Never navigate the user's current tab** during a multi-tab task. The current tab is the user's anchor — use it only for reading (snapshots, content extraction). All navigation should happen on background tabs. + +**Do NOT use \`create_hidden_window\` or \`new_hidden_page\` for user-requested tasks.** Hidden windows are invisible to the user and cannot be screenshotted. Use \`new_page\` (background mode) instead — tabs appear in the user's tab strip and can be inspected. Reserve hidden windows for automated/scheduled runs only. 
+ +For single-page lookups (e.g., "go to X and read Y"), use \`navigate_page\` on the current tab. Only create new tabs when the task requires multiple pages open simultaneously. + +### Tab retry discipline +When a background tab fails (404, wrong content, unexpected redirect): +- **Navigate the existing tab** to the correct URL with \`navigate_page\` — do NOT open a new tab for retries. +- If you must abandon a tab, close it with \`close_page\` before opening a replacement. +- Never let orphan tabs accumulate — each task should end with only the tabs that contain useful content. ### Observe → Act → Verify - **Before acting**: Take a snapshot to get interactive element IDs. @@ -247,6 +268,14 @@ function getToolSelection(): string { - Prefer \`fill\` over \`press_key\` for text input. Use \`press_key\` for keyboard shortcuts (Enter, Escape, Tab, Ctrl+A, etc.). - Prefer clicking links over \`navigate_page\` when the link is visible. Use \`navigate_page\` for direct URL access, back/forward, or reload. +### Navigation: single-tab vs multi-tab +| Task | Approach | +|------|----------| +| Look up one page | \`navigate_page\` on current tab | +| Research across multiple sites | \`new_page\` (background) for each site + \`group_tabs\` | +| Compare two pages side by side | \`new_page\` (background) × 2 + \`group_tabs\` | +| User says "open a new tab" | \`new_page\` (background) — don't steal focus | + ### Connected apps: Strata vs browser When an app is Connected, prefer Strata tools over browser automation. Strata is faster, more reliable, and works without navigating away from the user's current page. 
` @@ -351,7 +380,12 @@ function getErrorRecovery( ### Strata errors - Authentication error → call \`suggest_app_connection\` for re-auth (STOP and wait) - Action not found → try \`search_documentation\`, then fall back to browser automation -- Partial failure → report what succeeded and what didn't` +- Partial failure → report what succeeded and what didn't + +### Retry budget +- If a site isn't cooperating after 3-4 attempts (form not filling, redirects, geo-blocks), stop trying. +- Report what you've found so far and explain what didn't work: "Kayak kept defaulting to your local city. Here are the Google Flights results instead." +- Don't exhaust 10+ tool calls on a single failing site — the user's time matters more than completeness.` if (hasWorkspace) { recovery += ` @@ -528,6 +562,12 @@ Default: do not narrate routine, low-risk tool calls (just call the tool). Narrate only when it helps: multi-step plans, complex navigation, or when the user explicitly asked for explanation. Keep narration brief. "Searching for flights..." then tool call — not "I will now search for flights by calling the search tool." Execute independent tool calls in parallel when possible. + +When working on background tabs, always narrate progress so the user knows what's happening: +- "Opening a background tab to check Yahoo News headlines..." +- "Found 5 headlines on Yahoo News. Now checking Reuters..." +- "Done! Here's what I found across all sources:" +This is essential because the user can't see the background tabs — chat is their only window into your work. - Be concise: 1-2 lines for status updates and action confirmations. 
diff --git a/packages/browseros-agent/apps/server/src/tools/navigation.ts b/packages/browseros-agent/apps/server/src/tools/navigation.ts index 443a74c41..92048cfd8 100644 --- a/packages/browseros-agent/apps/server/src/tools/navigation.ts +++ b/packages/browseros-agent/apps/server/src/tools/navigation.ts @@ -121,14 +121,17 @@ export const navigate_page = defineTool({ export const new_page = defineTool({ name: 'new_page', - description: 'Open a new page (tab) and navigate to a URL', + description: + 'Open a new page (tab) and navigate to a URL. Opens in background by default to keep the user on their current page. Use group_tabs to organize related tabs.', input: z.object({ url: z.string().describe('URL to open'), hidden: z.boolean().default(false).describe('Create as hidden tab'), background: z .boolean() - .default(false) - .describe('Open in background without activating'), + .default(true) + .describe( + 'Open in background without stealing focus. Set to false only when user needs to see the tab immediately.', + ), windowId: z.number().optional().describe('Window ID to create tab in'), }), output: z.object({ @@ -140,8 +143,8 @@ export const new_page = defineTool({ }), handler: async (args, ctx, response) => { const pageId = await ctx.browser.newPage(args.url, { - hidden: args.hidden || undefined, - background: args.background || undefined, + hidden: args.hidden ? true : undefined, + background: args.background === false ? 
false : true, windowId: args.windowId, }) response.text(`Opened new page: ${args.url}\nPage ID: ${pageId}`) diff --git a/packages/browseros-agent/apps/server/src/tools/snapshot.ts b/packages/browseros-agent/apps/server/src/tools/snapshot.ts index cb4de85d1..a2205eb5a 100644 --- a/packages/browseros-agent/apps/server/src/tools/snapshot.ts +++ b/packages/browseros-agent/apps/server/src/tools/snapshot.ts @@ -101,7 +101,13 @@ export const get_page_content = defineTool({ extension: 'md', content: text, }) - response.text(`Saved page content to ${path}`) + // Return truncated content inline so the agent can work immediately, + // plus the file path for optional deep reading + const truncated = text.slice(0, TOOL_LIMITS.INLINE_PAGE_CONTENT_MAX_CHARS) + response.text(truncated) + response.text( + `\n\n[Content truncated at ${TOOL_LIMITS.INLINE_PAGE_CONTENT_MAX_CHARS} chars. Full content (${text.length} chars) saved to: ${path}]`, + ) response.data({ path, contentLength: text.length, diff --git a/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts b/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts index e0ff94eac..bda420d41 100644 --- a/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts +++ b/packages/browseros-agent/apps/server/tests/agent/prompt.test.ts @@ -1032,9 +1032,56 @@ describe('execution section', () => { expect(prompt).toContain('500') }) - it('includes new-tab restriction', () => { + it('includes multi-tab workflow guidance', () => { + // Why: The agent must know how to handle multi-tab tasks — open background + // tabs, create tab groups, narrate progress, and never steal user focus. 
const prompt = buildRegular() - expect(prompt).toContain('Only open new tabs when the user explicitly asks') + expect(prompt).toContain('Multi-tab workflow') + expect(prompt).toContain('background') + expect(prompt).toContain('group_tabs') + expect(prompt).toContain('Never force-switch') + }) + + it('enforces mandatory tab group creation', () => { + // Why: Run 7 showed the agent opening background tabs without creating + // a tab group. The prompt must make tab groups mandatory, not optional. + const prompt = buildRegular() + expect(prompt).toContain('IMMEDIATELY create a tab group') + expect(prompt).toContain('MUST have a tab group') + }) + + it('prohibits navigating user current tab during multi-tab', () => { + // Why: Run 7 showed the agent clicking a link on the user's current tab, + // navigating away from their starting page. The current tab must be read-only. + const prompt = buildRegular() + expect(prompt).toContain('Never navigate the user') + expect(prompt).toContain('anchor') + }) + + it('prohibits hidden windows for user tasks', () => { + // Why: Run 2 used create_hidden_window instead of background tabs. + // Hidden windows are invisible to users and can't be screenshotted. + const prompt = buildRegular() + expect(prompt).toContain('Do NOT use') + expect(prompt).toContain('create_hidden_window') + expect(prompt).toContain('new_hidden_page') + }) + + it('includes tab retry discipline', () => { + // Why: Run 7 opened 7+ tabs for a 3-article task because retries + // created new tabs instead of navigating existing ones. + const prompt = buildRegular() + expect(prompt).toContain('Tab retry discipline') + expect(prompt).toContain('Navigate the existing tab') + expect(prompt).toContain('close_page') + }) + + it('includes retry budget for failing sites', () => { + // Why: Run 8 spent 15+ tool calls fighting Kayak's geo-detection. + // The agent should give up after 3-4 attempts and report partial results. 
+ const prompt = buildRegular() + expect(prompt).toContain('Retry budget') + expect(prompt).toContain('3-4 attempts') }) })