From d4e0a30e7c0ba88d020a752b0fcd8a0bbf4428b8 Mon Sep 17 00:00:00 2001 From: shivammittal274 Date: Mon, 16 Mar 2026 13:31:46 +0530 Subject: [PATCH] fix: detect custom clickable elements in take_snapshot take_snapshot only used the AX tree, which misses custom components (cursor:pointer divs, onclick handlers, etc.) that lack ARIA roles. These elements appeared as role="generic" and were invisible to the agent. Changes: - Merge findCursorInteractiveElements into snapshot() so take_snapshot catches cursor:pointer, onclick, and tabindex elements - Add DisclosureTriangle to INTERACTIVE_ROLES for <summary> elements - Use aria-label as text fallback in cursor detection for icon-only buttons - Fix dedup bug in enhancedSnapshot that was silently dropping all cursor-detected elements by checking against all AX node IDs instead of only already-included output IDs --- .../apps/server/src/browser/browser.ts | 35 +++++++++++++++---- .../apps/server/src/browser/snapshot.ts | 2 ++ 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/packages/browseros-agent/apps/server/src/browser/browser.ts b/packages/browseros-agent/apps/server/src/browser/browser.ts index ba1ec593c..7a825bc19 100644 --- a/packages/browseros-agent/apps/server/src/browser/browser.ts +++ b/packages/browseros-agent/apps/server/src/browser/browser.ts @@ -389,7 +389,30 @@ export class Browser { const session = await this.resolveSession(page) const nodes = await this.fetchAXTree(session) if (nodes.length === 0) return '' - return snapshot.buildInteractiveTree(nodes).join('\n') + + const lines = snapshot.buildInteractiveTree(nodes) + + try { + const cursorElements = + await snapshot.findCursorInteractiveElements(session) + + if (cursorElements.length > 0) { + const includedIds = new Set() + for (const line of lines) { + const match = line.match(/^\[(\d+)\]/) + if (match) includedIds.add(Number(match[1])) + } + + for (const el of cursorElements) { + if (includedIds.has(el.backendNodeId)) continue + 
lines.push(`[${el.backendNodeId}] clickable "${el.text}"`) + } + } + } catch { + // cursor detection is best-effort; AX tree results are still returned + } + + return lines.join('\n') } async getPageLinks( @@ -441,15 +464,15 @@ export class Browser { await snapshot.findCursorInteractiveElements(session) if (cursorElements.length > 0) { - const existingIds = new Set() - for (const node of nodes) { - if (node.backendDOMNodeId !== undefined) - existingIds.add(node.backendDOMNodeId) + const includedIds = new Set() + for (const line of treeLines) { + const match = line.match(/\[(\d+)\]/) + if (match) includedIds.add(Number(match[1])) } const extras: string[] = [] for (const el of cursorElements) { - if (existingIds.has(el.backendNodeId)) continue + if (includedIds.has(el.backendNodeId)) continue extras.push( `[${el.backendNodeId}] clickable "${el.text}" (${el.reasons.join(', ')})`, ) diff --git a/packages/browseros-agent/apps/server/src/browser/snapshot.ts b/packages/browseros-agent/apps/server/src/browser/snapshot.ts index 3e7fc345d..4b6893af5 100644 --- a/packages/browseros-agent/apps/server/src/browser/snapshot.ts +++ b/packages/browseros-agent/apps/server/src/browser/snapshot.ts @@ -41,6 +41,7 @@ const INTERACTIVE_ROLES = new Set([ 'option', 'treeitem', 'listbox', + 'DisclosureTriangle', ]) const NAMED_CONTENT_ROLES = new Set([ @@ -196,6 +197,7 @@ const CURSOR_INTERACTIVE_JS = `(function() { if (parent && getComputedStyle(parent).cursor === 'pointer') continue; } var text = (el.textContent || '').trim().slice(0, 100); + if (!text) text = (el.getAttribute('aria-label') || '').trim(); if (!text) continue; var rect = el.getBoundingClientRect(); if (rect.width === 0 || rect.height === 0) continue;