fix: enable agent interaction with elements inside iframes (#667)

* fix: enable agent interaction with elements inside iframes

Fetch accessibility trees from all frames via Page.getFrameTree() +
per-frame Accessibility.getFullAXTree(frameId), so iframe elements
appear in snapshots with valid backendNodeIds. Pages without iframes
take the original single-call path, at the cost of one extra
Page.getFrameTree() call.

Update snapshot tree builders to walk multiple RootWebArea roots from
merged multi-frame trees. Extract same-origin iframe content in the
markdown walker; show [iframe: url] placeholder for cross-origin.

* fix: namespace AX nodeIds by frameId to prevent cross-frame collisions

CDP AXNodeId values are frame-scoped — each frame's accessibility tree
starts its own counter from 1. Prefix nodeId and childIds with frameId
before merging so the nodeMap in snapshot builders never overwrites
nodes from a different frame.
This commit is contained in:
shivammittal274
2026-04-09 23:14:53 +05:30
committed by GitHub
parent df7873562d
commit 1f2e783ab9
3 changed files with 81 additions and 18 deletions

View File

@@ -392,9 +392,48 @@ export class Browser {
// --- Observation ---
/**
 * Lists the id of every frame reported by `Page.getFrameTree()`.
 *
 * Ids are emitted in pre-order: a frame first, then each of its child
 * frames (with their descendants) in the order they appear in `childFrames`.
 *
 * @param session - Protocol session used to query the frame tree.
 * @returns The collected frame ids, or an empty array if the protocol call
 *   or the traversal throws.
 */
private async getFrameIds(session: ProtocolApi): Promise<string[]> {
  type FrameTreeNode = { frame: { id: string }; childFrames?: FrameTreeNode[] }
  try {
    const { frameTree } = await session.Page.getFrameTree()
    const frameIds: string[] = []
    // Explicit stack instead of recursion; children are pushed in reverse so
    // they are popped (and therefore visited) in their original order.
    const pending: FrameTreeNode[] = [frameTree as FrameTreeNode]
    while (pending.length > 0) {
      const current = pending.pop() as FrameTreeNode
      frameIds.push(current.frame.id)
      const children = current.childFrames
      if (children) {
        for (const child of [...children].reverse()) pending.push(child)
      }
    }
    return frameIds
  } catch {
    return []
  }
}
/**
 * Fetches the accessibility nodes for the page, merging the trees of all
 * of its frames into one flat list.
 *
 * When frame discovery yields at most one frame id (including none), a
 * single unscoped `Accessibility.getFullAXTree()` call is made and its nodes
 * are returned unchanged. Otherwise the tree is requested once per frame and
 * every node's `nodeId` and `childIds` are prefixed with `<frameId>:` so
 * that ids coming from different frames stay distinct in the merged list.
 *
 * @param session - Protocol session used for the frame and accessibility
 *   queries.
 * @returns The accessibility nodes; in the multi-frame case, frames whose
 *   tree request throws are skipped.
 */
private async fetchAXTree(session: ProtocolApi): Promise<AXNode[]> {
  const frameIds = await this.getFrameIds(session)
  if (frameIds.length <= 1) {
    // Single-frame (or unknown) page: one unscoped call, ids left untouched.
    const result = await session.Accessibility.getFullAXTree()
    return (result.nodes as AXNode[]) ?? []
  }

  const allNodes: AXNode[] = []
  for (const frameId of frameIds) {
    try {
      const result = await session.Accessibility.getFullAXTree({ frameId })
      const nodes = (result.nodes as AXNode[]) ?? []
      for (const node of nodes) {
        // Namespace ids by frame so merged nodes never share a nodeId.
        allNodes.push({
          ...node,
          nodeId: `${frameId}:${node.nodeId}`,
          childIds: node.childIds?.map((id) => `${frameId}:${id}`),
        })
      }
    } catch {
      // Cross-origin or detached frames may fail — skip
    }
  }
  return allNodes
}
async snapshot(page: number): Promise<string> {

View File

@@ -20,7 +20,7 @@ export function buildContentMarkdownExpression(
// Uses var + ES5 style for consistency with other injected scripts.
// Context object: { pre: bool, ld: listDepth, lt: listType, td: tableDepth }
const DOM_WALKER_SCRIPT = `(function(o) {
var SKIP = {SCRIPT:1,STYLE:1,NOSCRIPT:1,SVG:1,TEMPLATE:1,IFRAME:1,CANVAS:1,VIDEO:1,AUDIO:1,OBJECT:1,EMBED:1};
var SKIP = {SCRIPT:1,STYLE:1,NOSCRIPT:1,SVG:1,TEMPLATE:1,CANVAS:1,VIDEO:1,AUDIO:1,OBJECT:1,EMBED:1};
var FORM = {INPUT:1,SELECT:1,TEXTAREA:1,BUTTON:1};
var vh = window.innerHeight, vw = window.innerWidth;
var root = o.selector ? document.querySelector(o.selector) : document.body;
@@ -219,6 +219,15 @@ function walk(node, ctx) {
t = kids(el, ctx).trim();
return t ? '\\n*' + t + '*\\n' : '';
case 'IFRAME':
try {
var idoc = el.contentDocument;
if (idoc && idoc.body) return walk(idoc.body, ctx);
} catch(e) {}
var isrc = el.src || el.getAttribute('src');
if (isrc) return '\\n\\n[iframe: ' + isrc + ']\\n\\n';
return '';
default:
return kids(el, ctx);
}

View File

@@ -100,11 +100,16 @@ export function buildInteractiveTree(nodes: AXNode[]): string[] {
if (node.childIds) for (const childId of node.childIds) walk(childId)
}
const root =
nodes.find(
(n) => n.role?.value === 'RootWebArea' || n.role?.value === 'WebArea',
) ?? nodes[0]
if (root?.childIds) for (const childId of root.childIds) walk(childId)
const roots = nodes.filter(
(n) => n.role?.value === 'RootWebArea' || n.role?.value === 'WebArea',
)
if (roots.length === 0 && nodes[0]?.childIds) {
for (const childId of nodes[0].childIds) walk(childId)
} else {
for (const root of roots) {
if (root.childIds) for (const childId of root.childIds) walk(childId)
}
}
return lines
}
@@ -160,11 +165,16 @@ export function buildEnhancedTree(nodes: AXNode[]): string[] {
for (const childId of node.childIds) walk(childId, depth + 1)
}
const root =
nodes.find(
(n) => n.role?.value === 'RootWebArea' || n.role?.value === 'WebArea',
) ?? nodes[0]
if (root?.childIds) for (const childId of root.childIds) walk(childId, 0)
const roots = nodes.filter(
(n) => n.role?.value === 'RootWebArea' || n.role?.value === 'WebArea',
)
if (roots.length === 0 && nodes[0]?.childIds) {
for (const childId of nodes[0].childIds) walk(childId, 0)
} else {
for (const root of roots) {
if (root.childIds) for (const childId of root.childIds) walk(childId, 0)
}
}
return lines
}
@@ -292,11 +302,16 @@ export function extractLinkNodes(nodes: AXNode[]): LinkNode[] {
if (node.childIds) for (const childId of node.childIds) walk(childId)
}
const root =
nodes.find(
(n) => n.role?.value === 'RootWebArea' || n.role?.value === 'WebArea',
) ?? nodes[0]
if (root?.childIds) for (const childId of root.childIds) walk(childId)
const roots = nodes.filter(
(n) => n.role?.value === 'RootWebArea' || n.role?.value === 'WebArea',
)
if (roots.length === 0 && nodes[0]?.childIds) {
for (const childId of nodes[0].childIds) walk(childId)
} else {
for (const root of roots) {
if (root.childIds) for (const childId of root.childIds) walk(childId)
}
}
return links
}