feat: return element coordinates in tool responses and DPR in screenshots (#437)

* feat: return element coordinates in tool responses and DPR in screenshots

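- click, hover, fill, and drag now return the resolved element coordinates in their response text (click and fill omit them when no coordinates could be resolved, e.g. after the JS-click fallback)
- take_screenshot returns devicePixelRatio (DPR) for mapping coordinates to screenshot pixels
- Coordinates are in CSS pixels; multiply by DPR to get screenshot pixels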

* fix: use Promise.allSettled in screenshot to prevent DPR eval from aborting capture

Runtime.evaluate for devicePixelRatio can fail on PDF pages or
chrome-extension pages. Using Promise.allSettled ensures the screenshot
still succeeds, falling back to DPR=1.
This commit is contained in:
shivammittal274
2026-03-07 16:29:13 +05:30
committed by GitHub
parent 2e79933cae
commit c8a674fe93
3 changed files with 82 additions and 23 deletions

View File

@@ -506,7 +506,7 @@ export class Browser {
async screenshot( async screenshot(
page: number, page: number,
opts: { format: string; quality?: number; fullPage: boolean }, opts: { format: string; quality?: number; fullPage: boolean },
): Promise<{ data: string; mimeType: string }> { ): Promise<{ data: string; mimeType: string; devicePixelRatio: number }> {
const session = await this.resolveSession(page) const session = await this.resolveSession(page)
const params: Record<string, unknown> = { const params: Record<string, unknown> = {
@@ -515,11 +515,30 @@ export class Browser {
} }
if (opts.quality !== undefined) params.quality = opts.quality if (opts.quality !== undefined) params.quality = opts.quality
const result = await session.Page.captureScreenshot( const [screenshotResult, dprResult] = await Promise.allSettled([
params as Parameters<ProtocolApi['Page']['captureScreenshot']>[0], session.Page.captureScreenshot(
) params as Parameters<ProtocolApi['Page']['captureScreenshot']>[0],
),
session.Runtime.evaluate({
expression: 'window.devicePixelRatio',
returnByValue: true,
}),
])
return { data: result.data, mimeType: `image/${opts.format}` } if (screenshotResult.status === 'rejected') throw screenshotResult.reason
const result = screenshotResult.value
const devicePixelRatio =
dprResult.status === 'fulfilled' &&
typeof dprResult.value.result?.value === 'number'
? dprResult.value.result.value
: 1
return {
data: result.data,
mimeType: `image/${opts.format}`,
devicePixelRatio,
}
} }
async evaluate( async evaluate(
@@ -650,7 +669,7 @@ export class Browser {
page: number, page: number,
element: number, element: number,
opts?: { button?: string; clickCount?: number }, opts?: { button?: string; clickCount?: number },
): Promise<void> { ): Promise<{ x: number; y: number } | undefined> {
const session = await this.resolveSession(page) const session = await this.resolveSession(page)
await elements.scrollIntoView(session, element) await elements.scrollIntoView(session, element)
@@ -665,11 +684,13 @@ export class Browser {
opts?.clickCount ?? 1, opts?.clickCount ?? 1,
0, 0,
) )
return { x, y }
} catch { } catch {
logger.debug( logger.debug(
`CDP click failed for element=${element}, falling back to JS click`, `CDP click failed for element=${element}, falling back to JS click`,
) )
await elements.jsClick(session, element) await elements.jsClick(session, element)
return undefined
} }
} }
@@ -690,12 +711,16 @@ export class Browser {
) )
} }
async hover(page: number, element: number): Promise<void> { async hover(
page: number,
element: number,
): Promise<{ x: number; y: number }> {
const session = await this.resolveSession(page) const session = await this.resolveSession(page)
await elements.scrollIntoView(session, element) await elements.scrollIntoView(session, element)
const { x, y } = await elements.getElementCenter(session, element) const { x, y } = await elements.getElementCenter(session, element)
await mouse.dispatchHover(session, x, y) await mouse.dispatchHover(session, x, y)
return { x, y }
} }
async fill( async fill(
@@ -703,17 +728,24 @@ export class Browser {
element: number, element: number,
text: string, text: string,
clear = true, clear = true,
): Promise<void> { ): Promise<{ x: number; y: number } | undefined> {
const session = await this.resolveSession(page) const session = await this.resolveSession(page)
await elements.scrollIntoView(session, element) await elements.scrollIntoView(session, element)
let coords: { x: number; y: number } | undefined
try { try {
await elements.focusElement(session, element) await elements.focusElement(session, element)
try {
coords = await elements.getElementCenter(session, element)
} catch {
// coordinates are best-effort
}
} catch { } catch {
try { try {
const { x, y } = await elements.getElementCenter(session, element) const { x, y } = await elements.getElementCenter(session, element)
await mouse.dispatchClick(session, x, y, 'left', 1, 0) await mouse.dispatchClick(session, x, y, 'left', 1, 0)
coords = { x, y }
} catch { } catch {
logger.warn('Could not focus element via click either') logger.warn('Could not focus element via click either')
} }
@@ -721,6 +753,7 @@ export class Browser {
if (clear) await keyboard.clearField(session) if (clear) await keyboard.clearField(session)
await keyboard.typeText(session, text) await keyboard.typeText(session, text)
return coords
} }
async pressKey(page: number, key: string): Promise<void> { async pressKey(page: number, key: string): Promise<void> {
@@ -732,7 +765,10 @@ export class Browser {
page: number, page: number,
sourceElement: number, sourceElement: number,
target: { element?: number; x?: number; y?: number }, target: { element?: number; x?: number; y?: number },
): Promise<void> { ): Promise<{
from: { x: number; y: number }
to: { x: number; y: number }
}> {
const session = await this.resolveSession(page) const session = await this.resolveSession(page)
await elements.scrollIntoView(session, sourceElement) await elements.scrollIntoView(session, sourceElement)
@@ -750,6 +786,7 @@ export class Browser {
} }
await mouse.dispatchDrag(session, from, to) await mouse.dispatchDrag(session, from, to)
return { from, to }
} }
async scroll( async scroll(

View File

@@ -29,11 +29,14 @@ export const click = defineTool({
clickCount: z.number(), clickCount: z.number(),
}), }),
handler: async (args, ctx, response) => { handler: async (args, ctx, response) => {
await ctx.browser.click(args.page, args.element, { const coords = await ctx.browser.click(args.page, args.element, {
button: args.button, button: args.button,
clickCount: args.clickCount, clickCount: args.clickCount,
}) })
response.text(`Clicked [${args.element}]`) const coordText = coords
? ` at (${Math.round(coords.x)}, ${Math.round(coords.y)})`
: ''
response.text(`Clicked [${args.element}]${coordText}`)
response.data({ response.data({
action: 'click', action: 'click',
page: args.page, page: args.page,
@@ -97,8 +100,10 @@ export const hover = defineTool({
element: z.number(), element: z.number(),
}), }),
handler: async (args, ctx, response) => { handler: async (args, ctx, response) => {
await ctx.browser.hover(args.page, args.element) const coords = await ctx.browser.hover(args.page, args.element)
response.text(`Hovered over [${args.element}]`) response.text(
`Hovered over [${args.element}] at (${Math.round(coords.x)}, ${Math.round(coords.y)})`,
)
response.data({ action: 'hover', page: args.page, element: args.element }) response.data({ action: 'hover', page: args.page, element: args.element })
}, },
}) })
@@ -144,8 +149,18 @@ export const fill = defineTool({
clear: z.boolean(), clear: z.boolean(),
}), }),
handler: async (args, ctx, response) => { handler: async (args, ctx, response) => {
await ctx.browser.fill(args.page, args.element, args.text, args.clear) const coords = await ctx.browser.fill(
response.text(`Typed ${args.text.length} characters into [${args.element}]`) args.page,
args.element,
args.text,
args.clear,
)
const coordText = coords
? ` at (${Math.round(coords.x)}, ${Math.round(coords.y)})`
: ''
response.text(
`Typed ${args.text.length} characters into [${args.element}]${coordText}`,
)
response.data({ response.data({
action: 'fill', action: 'fill',
page: args.page, page: args.page,
@@ -205,7 +220,7 @@ export const drag = defineTool({
targetY: z.number().optional(), targetY: z.number().optional(),
}), }),
handler: async (args, ctx, response) => { handler: async (args, ctx, response) => {
await ctx.browser.drag(args.page, args.sourceElement, { const coords = await ctx.browser.drag(args.page, args.sourceElement, {
element: args.targetElement, element: args.targetElement,
x: args.targetX, x: args.targetX,
y: args.targetY, y: args.targetY,
@@ -214,7 +229,9 @@ export const drag = defineTool({
args.targetElement !== undefined args.targetElement !== undefined
? `[${args.targetElement}]` ? `[${args.targetElement}]`
: `(${args.targetX}, ${args.targetY})` : `(${args.targetX}, ${args.targetY})`
response.text(`Dragged [${args.sourceElement}] \u2192 ${target}`) response.text(
`Dragged [${args.sourceElement}] (${Math.round(coords.from.x)}, ${Math.round(coords.from.y)}) \u2192 ${target} (${Math.round(coords.to.x)}, ${Math.round(coords.to.y)})`,
)
response.data({ response.data({
action: 'drag', action: 'drag',
page: args.page, page: args.page,

View File

@@ -109,15 +109,20 @@ export const take_screenshot = defineTool({
}), }),
output: z.object({ output: z.object({
mimeType: z.string(), mimeType: z.string(),
devicePixelRatio: z.number(),
}), }),
handler: async (args, ctx, response) => { handler: async (args, ctx, response) => {
const { data, mimeType } = await ctx.browser.screenshot(args.page, { const { data, mimeType, devicePixelRatio } = await ctx.browser.screenshot(
format: args.format, args.page,
quality: args.quality, {
fullPage: args.fullPage, format: args.format,
}) quality: args.quality,
fullPage: args.fullPage,
},
)
response.image(data, mimeType) response.image(data, mimeType)
response.data({ mimeType }) response.text(`devicePixelRatio: ${devicePixelRatio}`)
response.data({ mimeType, devicePixelRatio })
}, },
}) })