feat: return element coordinates in tool responses and DPR in screenshots (#437)

* feat: return element coordinates in tool responses and DPR in screenshots

- click, hover, fill, drag now return resolved coordinates in response text
- take_screenshot returns devicePixelRatio for mapping coordinates to pixels
- Coordinates are in CSS pixels; multiply by DPR to get screenshot pixels

* fix: use Promise.allSettled in screenshot to prevent DPR eval from aborting capture

Runtime.evaluate for devicePixelRatio can fail on PDF pages or
chrome-extension pages. Using Promise.allSettled ensures the screenshot
still succeeds, falling back to DPR=1.
This commit is contained in:
shivammittal274
2026-03-07 16:29:13 +05:30
committed by GitHub
parent 2e79933cae
commit c8a674fe93
3 changed files with 82 additions and 23 deletions

View File

@@ -506,7 +506,7 @@ export class Browser {
async screenshot(
page: number,
opts: { format: string; quality?: number; fullPage: boolean },
): Promise<{ data: string; mimeType: string }> {
): Promise<{ data: string; mimeType: string; devicePixelRatio: number }> {
const session = await this.resolveSession(page)
const params: Record<string, unknown> = {
@@ -515,11 +515,30 @@ export class Browser {
}
if (opts.quality !== undefined) params.quality = opts.quality
const result = await session.Page.captureScreenshot(
params as Parameters<ProtocolApi['Page']['captureScreenshot']>[0],
)
const [screenshotResult, dprResult] = await Promise.allSettled([
session.Page.captureScreenshot(
params as Parameters<ProtocolApi['Page']['captureScreenshot']>[0],
),
session.Runtime.evaluate({
expression: 'window.devicePixelRatio',
returnByValue: true,
}),
])
return { data: result.data, mimeType: `image/${opts.format}` }
if (screenshotResult.status === 'rejected') throw screenshotResult.reason
const result = screenshotResult.value
const devicePixelRatio =
dprResult.status === 'fulfilled' &&
typeof dprResult.value.result?.value === 'number'
? dprResult.value.result.value
: 1
return {
data: result.data,
mimeType: `image/${opts.format}`,
devicePixelRatio,
}
}
async evaluate(
@@ -650,7 +669,7 @@ export class Browser {
page: number,
element: number,
opts?: { button?: string; clickCount?: number },
): Promise<void> {
): Promise<{ x: number; y: number } | undefined> {
const session = await this.resolveSession(page)
await elements.scrollIntoView(session, element)
@@ -665,11 +684,13 @@ export class Browser {
opts?.clickCount ?? 1,
0,
)
return { x, y }
} catch {
logger.debug(
`CDP click failed for element=${element}, falling back to JS click`,
)
await elements.jsClick(session, element)
return undefined
}
}
@@ -690,12 +711,16 @@ export class Browser {
)
}
async hover(page: number, element: number): Promise<void> {
async hover(
page: number,
element: number,
): Promise<{ x: number; y: number }> {
const session = await this.resolveSession(page)
await elements.scrollIntoView(session, element)
const { x, y } = await elements.getElementCenter(session, element)
await mouse.dispatchHover(session, x, y)
return { x, y }
}
async fill(
@@ -703,17 +728,24 @@ export class Browser {
element: number,
text: string,
clear = true,
): Promise<void> {
): Promise<{ x: number; y: number } | undefined> {
const session = await this.resolveSession(page)
await elements.scrollIntoView(session, element)
let coords: { x: number; y: number } | undefined
try {
await elements.focusElement(session, element)
try {
coords = await elements.getElementCenter(session, element)
} catch {
// coordinates are best-effort
}
} catch {
try {
const { x, y } = await elements.getElementCenter(session, element)
await mouse.dispatchClick(session, x, y, 'left', 1, 0)
coords = { x, y }
} catch {
logger.warn('Could not focus element via click either')
}
@@ -721,6 +753,7 @@ export class Browser {
if (clear) await keyboard.clearField(session)
await keyboard.typeText(session, text)
return coords
}
async pressKey(page: number, key: string): Promise<void> {
@@ -732,7 +765,10 @@ export class Browser {
page: number,
sourceElement: number,
target: { element?: number; x?: number; y?: number },
): Promise<void> {
): Promise<{
from: { x: number; y: number }
to: { x: number; y: number }
}> {
const session = await this.resolveSession(page)
await elements.scrollIntoView(session, sourceElement)
@@ -750,6 +786,7 @@ export class Browser {
}
await mouse.dispatchDrag(session, from, to)
return { from, to }
}
async scroll(

View File

@@ -29,11 +29,14 @@ export const click = defineTool({
clickCount: z.number(),
}),
handler: async (args, ctx, response) => {
await ctx.browser.click(args.page, args.element, {
const coords = await ctx.browser.click(args.page, args.element, {
button: args.button,
clickCount: args.clickCount,
})
response.text(`Clicked [${args.element}]`)
const coordText = coords
? ` at (${Math.round(coords.x)}, ${Math.round(coords.y)})`
: ''
response.text(`Clicked [${args.element}]${coordText}`)
response.data({
action: 'click',
page: args.page,
@@ -97,8 +100,10 @@ export const hover = defineTool({
element: z.number(),
}),
handler: async (args, ctx, response) => {
await ctx.browser.hover(args.page, args.element)
response.text(`Hovered over [${args.element}]`)
const coords = await ctx.browser.hover(args.page, args.element)
response.text(
`Hovered over [${args.element}] at (${Math.round(coords.x)}, ${Math.round(coords.y)})`,
)
response.data({ action: 'hover', page: args.page, element: args.element })
},
})
@@ -144,8 +149,18 @@ export const fill = defineTool({
clear: z.boolean(),
}),
handler: async (args, ctx, response) => {
await ctx.browser.fill(args.page, args.element, args.text, args.clear)
response.text(`Typed ${args.text.length} characters into [${args.element}]`)
const coords = await ctx.browser.fill(
args.page,
args.element,
args.text,
args.clear,
)
const coordText = coords
? ` at (${Math.round(coords.x)}, ${Math.round(coords.y)})`
: ''
response.text(
`Typed ${args.text.length} characters into [${args.element}]${coordText}`,
)
response.data({
action: 'fill',
page: args.page,
@@ -205,7 +220,7 @@ export const drag = defineTool({
targetY: z.number().optional(),
}),
handler: async (args, ctx, response) => {
await ctx.browser.drag(args.page, args.sourceElement, {
const coords = await ctx.browser.drag(args.page, args.sourceElement, {
element: args.targetElement,
x: args.targetX,
y: args.targetY,
@@ -214,7 +229,9 @@ export const drag = defineTool({
args.targetElement !== undefined
? `[${args.targetElement}]`
: `(${args.targetX}, ${args.targetY})`
response.text(`Dragged [${args.sourceElement}] \u2192 ${target}`)
response.text(
`Dragged [${args.sourceElement}] (${Math.round(coords.from.x)}, ${Math.round(coords.from.y)}) \u2192 ${target} (${Math.round(coords.to.x)}, ${Math.round(coords.to.y)})`,
)
response.data({
action: 'drag',
page: args.page,

View File

@@ -109,15 +109,20 @@ export const take_screenshot = defineTool({
}),
output: z.object({
mimeType: z.string(),
devicePixelRatio: z.number(),
}),
handler: async (args, ctx, response) => {
const { data, mimeType } = await ctx.browser.screenshot(args.page, {
format: args.format,
quality: args.quality,
fullPage: args.fullPage,
})
const { data, mimeType, devicePixelRatio } = await ctx.browser.screenshot(
args.page,
{
format: args.format,
quality: args.quality,
fullPage: args.fullPage,
},
)
response.image(data, mimeType)
response.data({ mimeType })
response.text(`devicePixelRatio: ${devicePixelRatio}`)
response.data({ mimeType, devicePixelRatio })
},
})