fix: controller context tool fixes (#305)

* fix: incorrect tool call for getting page snapshot
* feat: let the LLM know the page is loaded after enrichment is complete
* feat: improve prompt to prevent calling getActiveTab
* feat: add enrichment to the get_load_status tool
2026-05-14 08:03:58 +00:00 · 2026-02-06 18:53:57 +05:30
parent 9f3562eb85
commit b5a4c3804b
7 changed files with 56 additions and 35 deletions
--- a/apps/server/src/agent/prompt.ts
+++ b/apps/server/src/agent/prompt.ts
@@ -48,15 +48,16 @@ function getTabGrouping(): string {
  return `## Tab Grouping First (MANDATORY)
 **Your FIRST action for ANY task must be creating a tab group.** No exceptions.

-1. **Get Active Tab**: Call \`browser_get_active_tab\` to get the current tab ID
-2. **Create Group Immediately**: Call \`browser_group_tabs([tabId], title, color)\` with a short title (3-4 words max) based on user intent (e.g., "Hotel Research", "Gift Shopping", "Flight Booking")
-3. **Store the Group ID**: The response returns a \`groupId\` - remember it for the entire task
-4. **Add Every New Tab**: When calling \`browser_open_tab\`, immediately follow with \`browser_group_tabs([newTabId], groupId=storedGroupId)\` to add it to the existing group
+The active tab ID is already provided in the Browser Context above. Use it directly — do NOT call \`browser_get_active_tab\` to discover it.

-Example flow:
+1. **Create Group Immediately**: Call \`browser_group_tabs([tabId], title, color)\` using the active tab ID from Browser Context, with a short title (3-4 words max) based on user intent (e.g., "Hotel Research", "Gift Shopping", "Flight Booking")
+2. **Store the Group ID**: The response returns a \`groupId\` - remember it for the entire task
+3. **Add Every New Tab**: When calling \`browser_open_tab\`, immediately follow with \`browser_group_tabs([newTabId], groupId=storedGroupId)\` to add it to the existing group
+
+Example flow (given Browser Context shows Tab 42):
 \`\`\`
-1. browser_get_active_tab → tabId: 42
-2. browser_group_tabs([42], "Hotel Research", "blue") → groupId: 7
+1. browser_group_tabs([42], "Hotel Research", "blue") → groupId: 7
+2. browser_navigate("https://booking.com", tabId=42)
 3. browser_open_tab("booking.com") → tabId: 43
 4. browser_group_tabs([43], groupId=7) → adds to existing group
 \`\`\`
@@ -84,8 +85,8 @@ function getCompleteTasks(): string {

 function getObserveActVerify(): string {
  return `## Observe → Act → Verify
- **Before acting**: Retrieve current tab, verify page loaded, fetch interactive elements
- **After navigation**: Re-fetch elements (nodeIds become invalid after page changes)
+- **Before acting**: Use the active tab from Browser Context, fetch interactive elements
+- **After navigation/clicks**: If the tool response includes "Page Content After Action", the page is loaded — proceed directly without calling \`browser_get_load_status\`. Re-fetch elements only if you need to interact with new elements (nodeIds become invalid after page changes).
 - **After actions**: Confirm successful execution before continuing`
 }

--- a/apps/server/src/tools/controller-based/response/controller-response.ts
+++ b/apps/server/src/tools/controller-based/response/controller-response.ts
@@ -51,40 +51,54 @@ export class ControllerResponse implements Response {
      : undefined
  }

-  #includeSnapshot = false
-  #includeScreenshot = false
+  #snapshotTabId: number | null = null
+  #screenshotTabId: number | null = null

-  setIncludeSnapshot(value: boolean): void {
-    this.#includeSnapshot = value
+  setIncludeSnapshot(tabId: number): void {
+    this.#snapshotTabId = tabId
  }

-  setIncludeScreenshot(value: boolean): void {
-    this.#includeScreenshot = value
+  setIncludeScreenshot(tabId: number): void {
+    this.#screenshotTabId = tabId
  }

  async handle(context: Context): Promise<Array<TextContent | ImageContent>> {
    const content = this.toContent()

-    if (this.#includeSnapshot) {
-      const result = await context.executeAction('getPageContent', {})
-      const text = (result as { content?: string })?.content
-      if (text) {
-        content.push({
+    if (this.#snapshotTabId != null) {
+      try {
+        const result = await context.executeAction('getSnapshot', {
+          tabId: this.#snapshotTabId,
          type: 'text',
-          text: `\n## Page Content After Action\n${text}`,
        })
+        const snapshot = result as { items?: Array<{ text: string }> }
+        if (snapshot?.items?.length) {
+          const text = snapshot.items.map((item) => item.text).join('\n')
+          content.push({
+            type: 'text',
+            text: `\n## Page Content After Action (page loaded, no need to check load status)\n${text}`,
+          })
+        }
+      } catch {
+        // Enrichment is best-effort; don't fail the tool response
      }
    }

-    if (this.#includeScreenshot) {
-      const result = await context.executeAction('captureScreenshot', {})
-      const data = result as { data?: string; mimeType?: string }
-      if (data?.data) {
-        content.push({
-          type: 'image',
-          data: data.data,
-          mimeType: data.mimeType ?? 'image/png',
+    if (this.#screenshotTabId != null) {
+      try {
+        const result = await context.executeAction('captureScreenshot', {
+          tabId: this.#screenshotTabId,
        })
+        const data = result as { data?: string; mimeType?: string }
+        if (data?.data) {
+          content.push({
+            type: 'image',
+            data: data.data,
+            mimeType: data.mimeType ?? 'image/png',
+          })
+        }
+      } catch {
+        // Enrichment is best-effort; don't fail the tool response
      }
    }

--- a/apps/server/src/tools/controller-based/tools/advanced.ts
+++ b/apps/server/src/tools/controller-based/tools/advanced.ts
@@ -81,7 +81,7 @@ export const sendKeys = defineTool<z.ZodRawShape, Context, Response>({
    const data = result as { success: boolean; message: string }

    response.appendResponseLine(data.message)
-    response.setIncludeSnapshot?.(true)
+    response.setIncludeSnapshot?.(tabId)
  },
 })

--- a/apps/server/src/tools/controller-based/tools/interaction.ts
+++ b/apps/server/src/tools/controller-based/tools/interaction.ts
@@ -261,7 +261,7 @@ export const clickElement = defineTool<z.ZodRawShape, Context, Response>({
    await context.executeAction('click', { tabId, nodeId })

    response.appendResponseLine(`Clicked element ${nodeId} in tab ${tabId}`)
-    response.setIncludeSnapshot?.(true)
+    response.setIncludeSnapshot?.(tabId)
  },
 })

@@ -290,7 +290,7 @@ export const typeText = defineTool<z.ZodRawShape, Context, Response>({
    response.appendResponseLine(
      `Typed text into element ${nodeId} in tab ${tabId}`,
    )
-    response.setIncludeSnapshot?.(true)
+    response.setIncludeSnapshot?.(tabId)
  },
 })

--- a/apps/server/src/tools/controller-based/tools/navigation.ts
+++ b/apps/server/src/tools/controller-based/tools/navigation.ts
@@ -44,6 +44,6 @@ export const navigate = defineTool<z.ZodRawShape, Context, Response>({
    response.addStructuredContent('tabId', data.tabId)
    response.addStructuredContent('windowId', data.windowId)
    response.addStructuredContent('url', data.url)
-    response.setIncludeSnapshot?.(true)
+    response.setIncludeSnapshot?.(data.tabId)
  },
 })
--- a/apps/server/src/tools/controller-based/tools/tab-management.ts
+++ b/apps/server/src/tools/controller-based/tools/tab-management.ts
@@ -187,6 +187,10 @@ export const getLoadStatus = defineTool<z.ZodRawShape, Context, Response>({
    response.addStructuredContent('isDOMContentLoaded', data.isDOMContentLoaded)
    response.addStructuredContent('isResourcesLoading', data.isResourcesLoading)
    response.addStructuredContent('isPageComplete', data.isPageComplete)
+
+    if (data.isPageComplete) {
+      response.setIncludeSnapshot?.(tabId)
+    }
  },
 })

--- a/apps/server/src/tools/controller-based/types/response.ts
+++ b/apps/server/src/tools/controller-based/types/response.ts
@@ -43,13 +43,15 @@ export interface Response {

  /**
   * Request page content snapshot to be appended after tool execution.
+   * Pass the tabId of the tab to snapshot.
   * Only supported by ControllerResponse (no-op on other implementations).
   */
-  setIncludeSnapshot?(value: boolean): void
+  setIncludeSnapshot?(tabId: number): void

  /**
   * Request screenshot to be appended after tool execution.
+   * Pass the tabId of the tab to capture.
   * Only supported by ControllerResponse (no-op on other implementations).
   */
-  setIncludeScreenshot?(value: boolean): void
+  setIncludeScreenshot?(tabId: number): void
 }