mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 23:53:25 +00:00
feat: improved multi tab agent workflow (#507)
* feat: updated multitab workflow
* fix: updated prompt with fix for test cases
* fix: active agent glow
* fix: review comments
This commit is contained in:
@@ -19,6 +19,10 @@ function extractTabId(toolPart: ToolUIPart | null): number | undefined {
|
||||
return input?.tabId
|
||||
}
|
||||
|
||||
/**
 * Sends a glow message to the given tab on a best-effort basis.
 *
 * A rejected send is swallowed on purpose, so a failed delivery never
 * surfaces as an unhandled promise rejection.
 * NOTE(review): presumably the target tab may have no listener — confirm.
 *
 * @param tabId - ID of the tab that should receive the message.
 * @param message - Glow payload forwarded unchanged to the tab.
 */
function sendGlow(tabId: number, message: GlowMessage): void {
  const delivery = chrome.tabs.sendMessage(tabId, message)
  // Intentionally ignore failures; this call is fire-and-forget.
  delivery.catch(() => {})
}
|
||||
|
||||
export const useNotifyActiveTab = ({
|
||||
messages,
|
||||
status,
|
||||
@@ -28,7 +32,10 @@ export const useNotifyActiveTab = ({
|
||||
status: ChatStatus
|
||||
conversationId: string
|
||||
}) => {
|
||||
const lastTabIdRef = useRef<number | null>(null)
|
||||
// Track the single tab currently glowing
|
||||
const activeTabIdRef = useRef<number | null>(null)
|
||||
// Track all tabs that have been glowed during this stream (for cleanup)
|
||||
const allGlowedTabsRef = useRef<Set<number>>(new Set())
|
||||
|
||||
const lastMessage = messages?.[messages.length - 1]
|
||||
|
||||
@@ -41,27 +48,35 @@ export const useNotifyActiveTab = ({
|
||||
|
||||
useEffect(() => {
|
||||
const isStreaming = status === 'streaming'
|
||||
const previousTabId = lastTabIdRef.current
|
||||
|
||||
if (!isStreaming) {
|
||||
if (previousTabId) {
|
||||
// Deactivate ALL tabs that were glowed during this stream
|
||||
const allGlowed = allGlowedTabsRef.current
|
||||
if (allGlowed.size > 0) {
|
||||
const deactivate = async () => {
|
||||
// Capture tab IDs before any async work to avoid race with clear()
|
||||
const tabIds = Array.from(allGlowed)
|
||||
allGlowed.clear()
|
||||
|
||||
const alreadyShown = await firstRunConfettiShownStorage.getValue()
|
||||
const deactivateMessage: GlowMessage = {
|
||||
conversationId,
|
||||
isActive: false,
|
||||
showConfetti: !alreadyShown,
|
||||
let showConfetti = !alreadyShown
|
||||
|
||||
for (const tabId of tabIds) {
|
||||
sendGlow(tabId, {
|
||||
conversationId,
|
||||
isActive: false,
|
||||
showConfetti,
|
||||
})
|
||||
showConfetti = false
|
||||
}
|
||||
chrome.tabs
|
||||
.sendMessage(previousTabId, deactivateMessage)
|
||||
.catch(() => {})
|
||||
|
||||
if (!alreadyShown) {
|
||||
await firstRunConfettiShownStorage.setValue(true)
|
||||
}
|
||||
}
|
||||
deactivate()
|
||||
lastTabIdRef.current = null
|
||||
}
|
||||
activeTabIdRef.current = null
|
||||
return
|
||||
}
|
||||
|
||||
@@ -70,34 +85,41 @@ export const useNotifyActiveTab = ({
|
||||
let cancelled = false
|
||||
|
||||
const activate = async () => {
|
||||
let targetTabId = toolTabId ?? previousTabId ?? undefined
|
||||
let targetTabId = toolTabId ?? undefined
|
||||
|
||||
if (!targetTabId) {
|
||||
const tabs = await chrome.tabs.query({
|
||||
active: true,
|
||||
currentWindow: true,
|
||||
})
|
||||
targetTabId = tabs[0]?.id
|
||||
// Fallback: use the currently active tab, or query browser
|
||||
if (activeTabIdRef.current) {
|
||||
targetTabId = activeTabIdRef.current
|
||||
} else {
|
||||
const tabs = await chrome.tabs.query({
|
||||
active: true,
|
||||
currentWindow: true,
|
||||
})
|
||||
targetTabId = tabs[0]?.id
|
||||
}
|
||||
}
|
||||
|
||||
if (cancelled || !targetTabId) return
|
||||
|
||||
const previousTabId = activeTabIdRef.current
|
||||
|
||||
// If the agent moved to a different tab, deactivate the previous one
|
||||
if (previousTabId && previousTabId !== targetTabId) {
|
||||
const deactivateMessage: GlowMessage = {
|
||||
sendGlow(previousTabId, {
|
||||
conversationId,
|
||||
isActive: false,
|
||||
}
|
||||
chrome.tabs
|
||||
.sendMessage(previousTabId, deactivateMessage)
|
||||
.catch(() => {})
|
||||
})
|
||||
}
|
||||
|
||||
const activateMessage: GlowMessage = {
|
||||
// Activate glow on the target tab
|
||||
sendGlow(targetTabId, {
|
||||
conversationId,
|
||||
isActive: true,
|
||||
}
|
||||
chrome.tabs.sendMessage(targetTabId, activateMessage).catch(() => {})
|
||||
lastTabIdRef.current = targetTabId
|
||||
})
|
||||
|
||||
activeTabIdRef.current = targetTabId
|
||||
allGlowedTabsRef.current.add(targetTabId)
|
||||
}
|
||||
|
||||
activate()
|
||||
|
||||
@@ -55,6 +55,7 @@ export default defineConfig({
|
||||
permissions: [
|
||||
'topSites',
|
||||
'tabs',
|
||||
'tabGroups',
|
||||
'storage',
|
||||
'sidePanel',
|
||||
'browserOS',
|
||||
|
||||
@@ -203,7 +203,28 @@ function getExecution(
|
||||
- Don't ask permission for routine steps. Act, then report.
|
||||
- Do not refuse by default, attempt tasks even when outcomes are uncertain.
|
||||
- For ambiguous/unclear requests, ask one targeted clarifying question.
|
||||
- Stay on the current page. Only open new tabs when the user explicitly asks.
|
||||
- Stay on the current page for single-page tasks. Use \`navigate_page\` to move within one tab.
|
||||
|
||||
### Multi-tab workflow
|
||||
When a task requires working on multiple pages simultaneously:
|
||||
1. **Inform the user** that you're creating background tabs for the task.
|
||||
2. **Open new tabs in background** using \`new_page\` (opens in background by default) — never steal focus from the user's current tab.
|
||||
3. **IMMEDIATELY create a tab group** using \`group_tabs\` with a descriptive title — do this right after opening the tabs, before any other work. Include the user's current tab in the group. Every multi-tab task MUST have a tab group.
|
||||
4. **Work on background tabs** — all tools (click, fill, navigate, snapshot) work on background tabs via their page ID.
|
||||
5. **Narrate progress in chat** — keep the user informed: "Checking Vercel pricing... Now checking Netlify..."
|
||||
6. **Report results in chat** — summarize findings so the user doesn't need to switch tabs. Leave tabs open for the user to browse later.
|
||||
7. **Never force-switch the user's active tab.** If you need user interaction on a background tab (e.g., login, CAPTCHA), tell the user which tab needs attention and let them switch manually.
|
||||
8. **Never navigate the user's current tab** during a multi-tab task. The current tab is the user's anchor — use it only for reading (snapshots, content extraction). All navigation should happen on background tabs.
|
||||
|
||||
**Do NOT use \`create_hidden_window\` or \`new_hidden_page\` for user-requested tasks.** Hidden windows are invisible to the user and cannot be screenshotted. Use \`new_page\` (background mode) instead — tabs appear in the user's tab strip and can be inspected. Reserve hidden windows for automated/scheduled runs only.
|
||||
|
||||
For single-page lookups (e.g., "go to X and read Y"), use \`navigate_page\` on the current tab. Only create new tabs when the task requires multiple pages open simultaneously.
|
||||
|
||||
### Tab retry discipline
|
||||
When a background tab fails (404, wrong content, unexpected redirect):
|
||||
- **Navigate the existing tab** to the correct URL with \`navigate_page\` — do NOT open a new tab for retries.
|
||||
- If you must abandon a tab, close it with \`close_page\` before opening a replacement.
|
||||
- Never let orphan tabs accumulate — each task should end with only the tabs that contain useful content.
|
||||
|
||||
### Observe → Act → Verify
|
||||
- **Before acting**: Take a snapshot to get interactive element IDs.
|
||||
@@ -247,6 +268,14 @@ function getToolSelection(): string {
|
||||
- Prefer \`fill\` over \`press_key\` for text input. Use \`press_key\` for keyboard shortcuts (Enter, Escape, Tab, Ctrl+A, etc.).
|
||||
- Prefer clicking links over \`navigate_page\` when the link is visible. Use \`navigate_page\` for direct URL access, back/forward, or reload.
|
||||
|
||||
### Navigation: single-tab vs multi-tab
|
||||
| Task | Approach |
|
||||
|------|----------|
|
||||
| Look up one page | \`navigate_page\` on current tab |
|
||||
| Research across multiple sites | \`new_page\` (background) for each site + \`group_tabs\` |
|
||||
| Compare two pages side by side | \`new_page\` (background) × 2 + \`group_tabs\` |
|
||||
| User says "open a new tab" | \`new_page\` (background) — don't steal focus |
|
||||
|
||||
### Connected apps: Strata vs browser
|
||||
When an app is Connected, prefer Strata tools over browser automation. Strata is faster, more reliable, and works without navigating away from the user's current page.
|
||||
</tool_selection>`
|
||||
@@ -351,7 +380,12 @@ function getErrorRecovery(
|
||||
### Strata errors
|
||||
- Authentication error → call \`suggest_app_connection\` for re-auth (STOP and wait)
|
||||
- Action not found → try \`search_documentation\`, then fall back to browser automation
|
||||
- Partial failure → report what succeeded and what didn't`
|
||||
- Partial failure → report what succeeded and what didn't
|
||||
|
||||
### Retry budget
|
||||
- If a site isn't cooperating after 3-4 attempts (form not filling, redirects, geo-blocks), stop trying.
|
||||
- Report what you've found so far and explain what didn't work: "Kayak kept defaulting to your local city. Here are the Google Flights results instead."
|
||||
- Don't exhaust 10+ tool calls on a single failing site — the user's time matters more than completeness.`
|
||||
|
||||
if (hasWorkspace) {
|
||||
recovery += `
|
||||
@@ -528,6 +562,12 @@ Default: do not narrate routine, low-risk tool calls (just call the tool).
|
||||
Narrate only when it helps: multi-step plans, complex navigation, or when the user explicitly asked for explanation.
|
||||
Keep narration brief. "Searching for flights..." then tool call — not "I will now search for flights by calling the search tool."
|
||||
Execute independent tool calls in parallel when possible.
|
||||
|
||||
When working on background tabs, always narrate progress so the user knows what's happening:
|
||||
- "Opening a background tab to check Yahoo News headlines..."
|
||||
- "Found 5 headlines on Yahoo News. Now checking Reuters..."
|
||||
- "Done! Here's what I found across all sources:"
|
||||
This is essential because the user can't see the background tabs — chat is their only window into your work.
|
||||
</tool_call_style>
|
||||
|
||||
- Be concise: 1-2 lines for status updates and action confirmations.
|
||||
|
||||
@@ -121,14 +121,17 @@ export const navigate_page = defineTool({
|
||||
|
||||
export const new_page = defineTool({
|
||||
name: 'new_page',
|
||||
description: 'Open a new page (tab) and navigate to a URL',
|
||||
description:
|
||||
'Open a new page (tab) and navigate to a URL. Opens in background by default to keep the user on their current page. Use group_tabs to organize related tabs.',
|
||||
input: z.object({
|
||||
url: z.string().describe('URL to open'),
|
||||
hidden: z.boolean().default(false).describe('Create as hidden tab'),
|
||||
background: z
|
||||
.boolean()
|
||||
.default(false)
|
||||
.describe('Open in background without activating'),
|
||||
.default(true)
|
||||
.describe(
|
||||
'Open in background without stealing focus. Set to false only when user needs to see the tab immediately.',
|
||||
),
|
||||
windowId: z.number().optional().describe('Window ID to create tab in'),
|
||||
}),
|
||||
output: z.object({
|
||||
@@ -140,8 +143,8 @@ export const new_page = defineTool({
|
||||
}),
|
||||
handler: async (args, ctx, response) => {
|
||||
const pageId = await ctx.browser.newPage(args.url, {
|
||||
hidden: args.hidden || undefined,
|
||||
background: args.background || undefined,
|
||||
hidden: args.hidden ? true : undefined,
|
||||
background: args.background === false ? false : true,
|
||||
windowId: args.windowId,
|
||||
})
|
||||
response.text(`Opened new page: ${args.url}\nPage ID: ${pageId}`)
|
||||
|
||||
@@ -101,7 +101,13 @@ export const get_page_content = defineTool({
|
||||
extension: 'md',
|
||||
content: text,
|
||||
})
|
||||
response.text(`Saved page content to ${path}`)
|
||||
// Return truncated content inline so the agent can work immediately,
|
||||
// plus the file path for optional deep reading
|
||||
const truncated = text.slice(0, TOOL_LIMITS.INLINE_PAGE_CONTENT_MAX_CHARS)
|
||||
response.text(truncated)
|
||||
response.text(
|
||||
`\n\n[Content truncated at ${TOOL_LIMITS.INLINE_PAGE_CONTENT_MAX_CHARS} chars. Full content (${text.length} chars) saved to: ${path}]`,
|
||||
)
|
||||
response.data({
|
||||
path,
|
||||
contentLength: text.length,
|
||||
|
||||
@@ -1032,9 +1032,56 @@ describe('execution section', () => {
|
||||
expect(prompt).toContain('500')
|
||||
})
|
||||
|
||||
it('includes new-tab restriction', () => {
|
||||
it('includes multi-tab workflow guidance', () => {
|
||||
// Why: The agent must know how to handle multi-tab tasks — open background
|
||||
// tabs, create tab groups, narrate progress, and never steal user focus.
|
||||
const prompt = buildRegular()
|
||||
expect(prompt).toContain('Only open new tabs when the user explicitly asks')
|
||||
expect(prompt).toContain('Multi-tab workflow')
|
||||
expect(prompt).toContain('background')
|
||||
expect(prompt).toContain('group_tabs')
|
||||
expect(prompt).toContain('Never force-switch')
|
||||
})
|
||||
|
||||
it('enforces mandatory tab group creation', () => {
  // Why: in Run 7 the agent opened background tabs but never grouped them,
  // so the prompt has to state tab groups as a requirement, not a suggestion.
  const requiredPhrases = [
    'IMMEDIATELY create a tab group',
    'MUST have a tab group',
  ]
  const prompt = buildRegular()
  for (const phrase of requiredPhrases) {
    expect(prompt).toContain(phrase)
  }
})
|
||||
|
||||
it('prohibits navigating user current tab during multi-tab', () => {
  // Why: in Run 7 the agent clicked a link on the user's current tab and
  // navigated away from the starting page. That tab must stay read-only.
  const requiredPhrases = ['Never navigate the user', 'anchor']
  const prompt = buildRegular()
  for (const phrase of requiredPhrases) {
    expect(prompt).toContain(phrase)
  }
})
|
||||
|
||||
it('prohibits hidden windows for user tasks', () => {
  // Why: Run 2 reached for create_hidden_window rather than background tabs.
  // Hidden windows are invisible to users and can't be screenshotted.
  const requiredPhrases = [
    'Do NOT use',
    'create_hidden_window',
    'new_hidden_page',
  ]
  const prompt = buildRegular()
  for (const phrase of requiredPhrases) {
    expect(prompt).toContain(phrase)
  }
})
|
||||
|
||||
it('includes tab retry discipline', () => {
  // Why: Run 7 ended up with 7+ tabs for a 3-article task because each retry
  // opened a new tab instead of navigating an existing one.
  const requiredPhrases = [
    'Tab retry discipline',
    'Navigate the existing tab',
    'close_page',
  ]
  const prompt = buildRegular()
  for (const phrase of requiredPhrases) {
    expect(prompt).toContain(phrase)
  }
})
|
||||
|
||||
it('includes retry budget for failing sites', () => {
  // Why: Run 8 burned 15+ tool calls fighting Kayak's geo-detection.
  // The agent should stop after 3-4 attempts and report partial results.
  const requiredPhrases = ['Retry budget', '3-4 attempts']
  const prompt = buildRegular()
  for (const phrase of requiredPhrases) {
    expect(prompt).toContain(phrase)
  }
})
|
||||
})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user