fix: controller context tool fixes (#305)

* fix: incorrect tool call for getting page snapshot

* feat: let the LLM know the page is loaded after enrichment is complete

* feat: improve prompt to prevent calling getActiveTab

* feat: added enrichment to the get_load_status tool
This commit is contained in:
Dani Akash
2026-02-06 18:53:57 +05:30
committed by GitHub
parent 9f3562eb85
commit b5a4c3804b
7 changed files with 56 additions and 35 deletions

View File

@@ -48,15 +48,16 @@ function getTabGrouping(): string {
return `## Tab Grouping First (MANDATORY)
**Your FIRST action for ANY task must be creating a tab group.** No exceptions.
1. **Get Active Tab**: Call \`browser_get_active_tab\` to get the current tab ID
2. **Create Group Immediately**: Call \`browser_group_tabs([tabId], title, color)\` with a short title (3-4 words max) based on user intent (e.g., "Hotel Research", "Gift Shopping", "Flight Booking")
3. **Store the Group ID**: The response returns a \`groupId\` - remember it for the entire task
4. **Add Every New Tab**: When calling \`browser_open_tab\`, immediately follow with \`browser_group_tabs([newTabId], groupId=storedGroupId)\` to add it to the existing group
The active tab ID is already provided in the Browser Context above. Use it directly — do NOT call \`browser_get_active_tab\` to discover it.
Example flow:
1. **Create Group Immediately**: Call \`browser_group_tabs([tabId], title, color)\` using the active tab ID from Browser Context, with a short title (3-4 words max) based on user intent (e.g., "Hotel Research", "Gift Shopping", "Flight Booking")
2. **Store the Group ID**: The response returns a \`groupId\` - remember it for the entire task
3. **Add Every New Tab**: When calling \`browser_open_tab\`, immediately follow with \`browser_group_tabs([newTabId], groupId=storedGroupId)\` to add it to the existing group
Example flow (given Browser Context shows Tab 42):
\`\`\`
1. browser_get_active_tab → tabId: 42
2. browser_group_tabs([42], "Hotel Research", "blue") → groupId: 7
1. browser_group_tabs([42], "Hotel Research", "blue") → groupId: 7
2. browser_navigate("https://booking.com", tabId=42)
3. browser_open_tab("booking.com") → tabId: 43
4. browser_group_tabs([43], groupId=7) → adds to existing group
\`\`\`
@@ -84,8 +85,8 @@ function getCompleteTasks(): string {
/**
 * Builds the "Observe → Act → Verify" instruction section.
 *
 * The section tells the reader what to do before acting, after
 * navigation/clicks, and after actions. It also states that a tool response
 * containing "Page Content After Action" means the page is loaded, so
 * `browser_get_load_status` does not need to be called.
 *
 * NOTE(review): as shown here the text has two "Before acting" bullets and two
 * after-navigation bullets with conflicting guidance ("Retrieve current tab"
 * vs. "Use the active tab from Browser Context"). This looks like removed and
 * added diff lines rendered together — confirm against the actual file which
 * bullets are live.
 *
 * @returns The section as a Markdown-formatted string starting with a
 *   level-2 heading.
 */
function getObserveActVerify(): string {
return `## Observe → Act → Verify
- **Before acting**: Retrieve current tab, verify page loaded, fetch interactive elements
- **After navigation**: Re-fetch elements (nodeIds become invalid after page changes)
- **Before acting**: Use the active tab from Browser Context, fetch interactive elements
- **After navigation/clicks**: If the tool response includes "Page Content After Action", the page is loaded — proceed directly without calling \`browser_get_load_status\`. Re-fetch elements only if you need to interact with new elements (nodeIds become invalid after page changes).
- **After actions**: Confirm successful execution before continuing`
}

View File

@@ -51,40 +51,54 @@ export class ControllerResponse implements Response {
: undefined
}
#includeSnapshot = false
#includeScreenshot = false
#snapshotTabId: number | null = null
#screenshotTabId: number | null = null
setIncludeSnapshot(value: boolean): void {
this.#includeSnapshot = value
setIncludeSnapshot(tabId: number): void {
this.#snapshotTabId = tabId
}
setIncludeScreenshot(value: boolean): void {
this.#includeScreenshot = value
setIncludeScreenshot(tabId: number): void {
this.#screenshotTabId = tabId
}
async handle(context: Context): Promise<Array<TextContent | ImageContent>> {
const content = this.toContent()
if (this.#includeSnapshot) {
const result = await context.executeAction('getPageContent', {})
const text = (result as { content?: string })?.content
if (text) {
content.push({
if (this.#snapshotTabId != null) {
try {
const result = await context.executeAction('getSnapshot', {
tabId: this.#snapshotTabId,
type: 'text',
text: `\n## Page Content After Action\n${text}`,
})
const snapshot = result as { items?: Array<{ text: string }> }
if (snapshot?.items?.length) {
const text = snapshot.items.map((item) => item.text).join('\n')
content.push({
type: 'text',
text: `\n## Page Content After Action (page loaded, no need to check load status)\n${text}`,
})
}
} catch {
// Enrichment is best-effort; don't fail the tool response
}
}
if (this.#includeScreenshot) {
const result = await context.executeAction('captureScreenshot', {})
const data = result as { data?: string; mimeType?: string }
if (data?.data) {
content.push({
type: 'image',
data: data.data,
mimeType: data.mimeType ?? 'image/png',
if (this.#screenshotTabId != null) {
try {
const result = await context.executeAction('captureScreenshot', {
tabId: this.#screenshotTabId,
})
const data = result as { data?: string; mimeType?: string }
if (data?.data) {
content.push({
type: 'image',
data: data.data,
mimeType: data.mimeType ?? 'image/png',
})
}
} catch {
// Enrichment is best-effort; don't fail the tool response
}
}

View File

@@ -81,7 +81,7 @@ export const sendKeys = defineTool<z.ZodRawShape, Context, Response>({
const data = result as { success: boolean; message: string }
response.appendResponseLine(data.message)
response.setIncludeSnapshot?.(true)
response.setIncludeSnapshot?.(tabId)
},
})

View File

@@ -261,7 +261,7 @@ export const clickElement = defineTool<z.ZodRawShape, Context, Response>({
await context.executeAction('click', { tabId, nodeId })
response.appendResponseLine(`Clicked element ${nodeId} in tab ${tabId}`)
response.setIncludeSnapshot?.(true)
response.setIncludeSnapshot?.(tabId)
},
})
@@ -290,7 +290,7 @@ export const typeText = defineTool<z.ZodRawShape, Context, Response>({
response.appendResponseLine(
`Typed text into element ${nodeId} in tab ${tabId}`,
)
response.setIncludeSnapshot?.(true)
response.setIncludeSnapshot?.(tabId)
},
})

View File

@@ -44,6 +44,6 @@ export const navigate = defineTool<z.ZodRawShape, Context, Response>({
response.addStructuredContent('tabId', data.tabId)
response.addStructuredContent('windowId', data.windowId)
response.addStructuredContent('url', data.url)
response.setIncludeSnapshot?.(true)
response.setIncludeSnapshot?.(data.tabId)
},
})

View File

@@ -187,6 +187,10 @@ export const getLoadStatus = defineTool<z.ZodRawShape, Context, Response>({
response.addStructuredContent('isDOMContentLoaded', data.isDOMContentLoaded)
response.addStructuredContent('isResourcesLoading', data.isResourcesLoading)
response.addStructuredContent('isPageComplete', data.isPageComplete)
if (data.isPageComplete) {
response.setIncludeSnapshot?.(tabId)
}
},
})

View File

@@ -43,13 +43,15 @@ export interface Response {
/**
* Request page content snapshot to be appended after tool execution.
* Pass the tabId of the tab to snapshot.
* Only supported by ControllerResponse (no-op on other implementations).
*/
setIncludeSnapshot?(value: boolean): void
setIncludeSnapshot?(tabId: number): void
/**
* Request screenshot to be appended after tool execution.
* Pass the tabId of the tab to capture.
* Only supported by ControllerResponse (no-op on other implementations).
*/
setIncludeScreenshot?(value: boolean): void
setIncludeScreenshot?(tabId: number): void
}