mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-14 08:03:58 +00:00
fix: controller context tool fixes (#305)
* fix: incorrect tool call for getting page snapshot
* feat: let llm know the page is loaded after enrichment is complete
* feat: improve prompt to prevent calling getActiveTab
* feat: added enrichment to the get_load_status tool
This commit is contained in:
@@ -48,15 +48,16 @@ function getTabGrouping(): string {
|
||||
return `## Tab Grouping First (MANDATORY)
|
||||
**Your FIRST action for ANY task must be creating a tab group.** No exceptions.
|
||||
|
||||
1. **Get Active Tab**: Call \`browser_get_active_tab\` to get the current tab ID
|
||||
2. **Create Group Immediately**: Call \`browser_group_tabs([tabId], title, color)\` with a short title (3-4 words max) based on user intent (e.g., "Hotel Research", "Gift Shopping", "Flight Booking")
|
||||
3. **Store the Group ID**: The response returns a \`groupId\` - remember it for the entire task
|
||||
4. **Add Every New Tab**: When calling \`browser_open_tab\`, immediately follow with \`browser_group_tabs([newTabId], groupId=storedGroupId)\` to add it to the existing group
|
||||
The active tab ID is already provided in the Browser Context above. Use it directly — do NOT call \`browser_get_active_tab\` to discover it.
|
||||
|
||||
Example flow:
|
||||
1. **Create Group Immediately**: Call \`browser_group_tabs([tabId], title, color)\` using the active tab ID from Browser Context, with a short title (3-4 words max) based on user intent (e.g., "Hotel Research", "Gift Shopping", "Flight Booking")
|
||||
2. **Store the Group ID**: The response returns a \`groupId\` - remember it for the entire task
|
||||
3. **Add Every New Tab**: When calling \`browser_open_tab\`, immediately follow with \`browser_group_tabs([newTabId], groupId=storedGroupId)\` to add it to the existing group
|
||||
|
||||
Example flow (given Browser Context shows Tab 42):
|
||||
\`\`\`
|
||||
1. browser_get_active_tab → tabId: 42
|
||||
2. browser_group_tabs([42], "Hotel Research", "blue") → groupId: 7
|
||||
1. browser_group_tabs([42], "Hotel Research", "blue") → groupId: 7
|
||||
2. browser_navigate("https://booking.com", tabId=42)
|
||||
3. browser_open_tab("booking.com") → tabId: 43
|
||||
4. browser_group_tabs([43], groupId=7) → adds to existing group
|
||||
\`\`\`
|
||||
@@ -84,8 +85,8 @@ function getCompleteTasks(): string {
|
||||
|
||||
function getObserveActVerify(): string {
|
||||
return `## Observe → Act → Verify
|
||||
- **Before acting**: Retrieve current tab, verify page loaded, fetch interactive elements
|
||||
- **After navigation**: Re-fetch elements (nodeIds become invalid after page changes)
|
||||
- **Before acting**: Use the active tab from Browser Context, fetch interactive elements
|
||||
- **After navigation/clicks**: If the tool response includes "Page Content After Action", the page is loaded — proceed directly without calling \`browser_get_load_status\`. Re-fetch elements only if you need to interact with new elements (nodeIds become invalid after page changes).
|
||||
- **After actions**: Confirm successful execution before continuing`
|
||||
}
|
||||
|
||||
|
||||
@@ -51,40 +51,54 @@ export class ControllerResponse implements Response {
|
||||
: undefined
|
||||
}
|
||||
|
||||
#includeSnapshot = false
|
||||
#includeScreenshot = false
|
||||
#snapshotTabId: number | null = null
|
||||
#screenshotTabId: number | null = null
|
||||
|
||||
setIncludeSnapshot(value: boolean): void {
|
||||
this.#includeSnapshot = value
|
||||
setIncludeSnapshot(tabId: number): void {
|
||||
this.#snapshotTabId = tabId
|
||||
}
|
||||
|
||||
setIncludeScreenshot(value: boolean): void {
|
||||
this.#includeScreenshot = value
|
||||
setIncludeScreenshot(tabId: number): void {
|
||||
this.#screenshotTabId = tabId
|
||||
}
|
||||
|
||||
async handle(context: Context): Promise<Array<TextContent | ImageContent>> {
|
||||
const content = this.toContent()
|
||||
|
||||
if (this.#includeSnapshot) {
|
||||
const result = await context.executeAction('getPageContent', {})
|
||||
const text = (result as { content?: string })?.content
|
||||
if (text) {
|
||||
content.push({
|
||||
if (this.#snapshotTabId != null) {
|
||||
try {
|
||||
const result = await context.executeAction('getSnapshot', {
|
||||
tabId: this.#snapshotTabId,
|
||||
type: 'text',
|
||||
text: `\n## Page Content After Action\n${text}`,
|
||||
})
|
||||
const snapshot = result as { items?: Array<{ text: string }> }
|
||||
if (snapshot?.items?.length) {
|
||||
const text = snapshot.items.map((item) => item.text).join('\n')
|
||||
content.push({
|
||||
type: 'text',
|
||||
text: `\n## Page Content After Action (page loaded, no need to check load status)\n${text}`,
|
||||
})
|
||||
}
|
||||
} catch {
|
||||
// Enrichment is best-effort; don't fail the tool response
|
||||
}
|
||||
}
|
||||
|
||||
if (this.#includeScreenshot) {
|
||||
const result = await context.executeAction('captureScreenshot', {})
|
||||
const data = result as { data?: string; mimeType?: string }
|
||||
if (data?.data) {
|
||||
content.push({
|
||||
type: 'image',
|
||||
data: data.data,
|
||||
mimeType: data.mimeType ?? 'image/png',
|
||||
if (this.#screenshotTabId != null) {
|
||||
try {
|
||||
const result = await context.executeAction('captureScreenshot', {
|
||||
tabId: this.#screenshotTabId,
|
||||
})
|
||||
const data = result as { data?: string; mimeType?: string }
|
||||
if (data?.data) {
|
||||
content.push({
|
||||
type: 'image',
|
||||
data: data.data,
|
||||
mimeType: data.mimeType ?? 'image/png',
|
||||
})
|
||||
}
|
||||
} catch {
|
||||
// Enrichment is best-effort; don't fail the tool response
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -81,7 +81,7 @@ export const sendKeys = defineTool<z.ZodRawShape, Context, Response>({
|
||||
const data = result as { success: boolean; message: string }
|
||||
|
||||
response.appendResponseLine(data.message)
|
||||
response.setIncludeSnapshot?.(true)
|
||||
response.setIncludeSnapshot?.(tabId)
|
||||
},
|
||||
})
|
||||
|
||||
|
||||
@@ -261,7 +261,7 @@ export const clickElement = defineTool<z.ZodRawShape, Context, Response>({
|
||||
await context.executeAction('click', { tabId, nodeId })
|
||||
|
||||
response.appendResponseLine(`Clicked element ${nodeId} in tab ${tabId}`)
|
||||
response.setIncludeSnapshot?.(true)
|
||||
response.setIncludeSnapshot?.(tabId)
|
||||
},
|
||||
})
|
||||
|
||||
@@ -290,7 +290,7 @@ export const typeText = defineTool<z.ZodRawShape, Context, Response>({
|
||||
response.appendResponseLine(
|
||||
`Typed text into element ${nodeId} in tab ${tabId}`,
|
||||
)
|
||||
response.setIncludeSnapshot?.(true)
|
||||
response.setIncludeSnapshot?.(tabId)
|
||||
},
|
||||
})
|
||||
|
||||
|
||||
@@ -44,6 +44,6 @@ export const navigate = defineTool<z.ZodRawShape, Context, Response>({
|
||||
response.addStructuredContent('tabId', data.tabId)
|
||||
response.addStructuredContent('windowId', data.windowId)
|
||||
response.addStructuredContent('url', data.url)
|
||||
response.setIncludeSnapshot?.(true)
|
||||
response.setIncludeSnapshot?.(data.tabId)
|
||||
},
|
||||
})
|
||||
|
||||
@@ -187,6 +187,10 @@ export const getLoadStatus = defineTool<z.ZodRawShape, Context, Response>({
|
||||
response.addStructuredContent('isDOMContentLoaded', data.isDOMContentLoaded)
|
||||
response.addStructuredContent('isResourcesLoading', data.isResourcesLoading)
|
||||
response.addStructuredContent('isPageComplete', data.isPageComplete)
|
||||
|
||||
if (data.isPageComplete) {
|
||||
response.setIncludeSnapshot?.(tabId)
|
||||
}
|
||||
},
|
||||
})
|
||||
|
||||
|
||||
@@ -43,13 +43,15 @@ export interface Response {
|
||||
|
||||
/**
|
||||
* Request page content snapshot to be appended after tool execution.
|
||||
* Pass the tabId of the tab to snapshot.
|
||||
* Only supported by ControllerResponse (no-op on other implementations).
|
||||
*/
|
||||
setIncludeSnapshot?(value: boolean): void
|
||||
setIncludeSnapshot?(tabId: number): void
|
||||
|
||||
/**
|
||||
* Request screenshot to be appended after tool execution.
|
||||
* Pass the tabId of the tab to capture.
|
||||
* Only supported by ControllerResponse (no-op on other implementations).
|
||||
*/
|
||||
setIncludeScreenshot?(value: boolean): void
|
||||
setIncludeScreenshot?(tabId: number): void
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user