mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-18 02:57:47 +00:00
feat: return element coordinates in tool responses and DPR in screenshots (#437)
* feat: return element coordinates in tool responses and DPR in screenshots
  - click, hover, fill, and drag now return the resolved coordinates in their response text
  - take_screenshot returns devicePixelRatio for mapping coordinates to pixels
  - Coordinates are in CSS pixels; multiply by DPR to get screenshot pixels
* fix: use Promise.allSettled in screenshot to prevent DPR eval from aborting capture
  Runtime.evaluate for devicePixelRatio can fail on PDF pages or chrome-extension pages. Using Promise.allSettled ensures the screenshot still succeeds, falling back to DPR=1.
This commit is contained in:
@@ -506,7 +506,7 @@ export class Browser {
|
||||
async screenshot(
|
||||
page: number,
|
||||
opts: { format: string; quality?: number; fullPage: boolean },
|
||||
): Promise<{ data: string; mimeType: string }> {
|
||||
): Promise<{ data: string; mimeType: string; devicePixelRatio: number }> {
|
||||
const session = await this.resolveSession(page)
|
||||
|
||||
const params: Record<string, unknown> = {
|
||||
@@ -515,11 +515,30 @@ export class Browser {
|
||||
}
|
||||
if (opts.quality !== undefined) params.quality = opts.quality
|
||||
|
||||
const result = await session.Page.captureScreenshot(
|
||||
params as Parameters<ProtocolApi['Page']['captureScreenshot']>[0],
|
||||
)
|
||||
const [screenshotResult, dprResult] = await Promise.allSettled([
|
||||
session.Page.captureScreenshot(
|
||||
params as Parameters<ProtocolApi['Page']['captureScreenshot']>[0],
|
||||
),
|
||||
session.Runtime.evaluate({
|
||||
expression: 'window.devicePixelRatio',
|
||||
returnByValue: true,
|
||||
}),
|
||||
])
|
||||
|
||||
return { data: result.data, mimeType: `image/${opts.format}` }
|
||||
if (screenshotResult.status === 'rejected') throw screenshotResult.reason
|
||||
|
||||
const result = screenshotResult.value
|
||||
const devicePixelRatio =
|
||||
dprResult.status === 'fulfilled' &&
|
||||
typeof dprResult.value.result?.value === 'number'
|
||||
? dprResult.value.result.value
|
||||
: 1
|
||||
|
||||
return {
|
||||
data: result.data,
|
||||
mimeType: `image/${opts.format}`,
|
||||
devicePixelRatio,
|
||||
}
|
||||
}
|
||||
|
||||
async evaluate(
|
||||
@@ -650,7 +669,7 @@ export class Browser {
|
||||
page: number,
|
||||
element: number,
|
||||
opts?: { button?: string; clickCount?: number },
|
||||
): Promise<void> {
|
||||
): Promise<{ x: number; y: number } | undefined> {
|
||||
const session = await this.resolveSession(page)
|
||||
|
||||
await elements.scrollIntoView(session, element)
|
||||
@@ -665,11 +684,13 @@ export class Browser {
|
||||
opts?.clickCount ?? 1,
|
||||
0,
|
||||
)
|
||||
return { x, y }
|
||||
} catch {
|
||||
logger.debug(
|
||||
`CDP click failed for element=${element}, falling back to JS click`,
|
||||
)
|
||||
await elements.jsClick(session, element)
|
||||
return undefined
|
||||
}
|
||||
}
|
||||
|
||||
@@ -690,12 +711,16 @@ export class Browser {
|
||||
)
|
||||
}
|
||||
|
||||
async hover(page: number, element: number): Promise<void> {
|
||||
async hover(
|
||||
page: number,
|
||||
element: number,
|
||||
): Promise<{ x: number; y: number }> {
|
||||
const session = await this.resolveSession(page)
|
||||
|
||||
await elements.scrollIntoView(session, element)
|
||||
const { x, y } = await elements.getElementCenter(session, element)
|
||||
await mouse.dispatchHover(session, x, y)
|
||||
return { x, y }
|
||||
}
|
||||
|
||||
async fill(
|
||||
@@ -703,17 +728,24 @@ export class Browser {
|
||||
element: number,
|
||||
text: string,
|
||||
clear = true,
|
||||
): Promise<void> {
|
||||
): Promise<{ x: number; y: number } | undefined> {
|
||||
const session = await this.resolveSession(page)
|
||||
|
||||
await elements.scrollIntoView(session, element)
|
||||
|
||||
let coords: { x: number; y: number } | undefined
|
||||
try {
|
||||
await elements.focusElement(session, element)
|
||||
try {
|
||||
coords = await elements.getElementCenter(session, element)
|
||||
} catch {
|
||||
// coordinates are best-effort
|
||||
}
|
||||
} catch {
|
||||
try {
|
||||
const { x, y } = await elements.getElementCenter(session, element)
|
||||
await mouse.dispatchClick(session, x, y, 'left', 1, 0)
|
||||
coords = { x, y }
|
||||
} catch {
|
||||
logger.warn('Could not focus element via click either')
|
||||
}
|
||||
@@ -721,6 +753,7 @@ export class Browser {
|
||||
|
||||
if (clear) await keyboard.clearField(session)
|
||||
await keyboard.typeText(session, text)
|
||||
return coords
|
||||
}
|
||||
|
||||
async pressKey(page: number, key: string): Promise<void> {
|
||||
@@ -732,7 +765,10 @@ export class Browser {
|
||||
page: number,
|
||||
sourceElement: number,
|
||||
target: { element?: number; x?: number; y?: number },
|
||||
): Promise<void> {
|
||||
): Promise<{
|
||||
from: { x: number; y: number }
|
||||
to: { x: number; y: number }
|
||||
}> {
|
||||
const session = await this.resolveSession(page)
|
||||
|
||||
await elements.scrollIntoView(session, sourceElement)
|
||||
@@ -750,6 +786,7 @@ export class Browser {
|
||||
}
|
||||
|
||||
await mouse.dispatchDrag(session, from, to)
|
||||
return { from, to }
|
||||
}
|
||||
|
||||
async scroll(
|
||||
|
||||
@@ -29,11 +29,14 @@ export const click = defineTool({
|
||||
clickCount: z.number(),
|
||||
}),
|
||||
handler: async (args, ctx, response) => {
|
||||
await ctx.browser.click(args.page, args.element, {
|
||||
const coords = await ctx.browser.click(args.page, args.element, {
|
||||
button: args.button,
|
||||
clickCount: args.clickCount,
|
||||
})
|
||||
response.text(`Clicked [${args.element}]`)
|
||||
const coordText = coords
|
||||
? ` at (${Math.round(coords.x)}, ${Math.round(coords.y)})`
|
||||
: ''
|
||||
response.text(`Clicked [${args.element}]${coordText}`)
|
||||
response.data({
|
||||
action: 'click',
|
||||
page: args.page,
|
||||
@@ -97,8 +100,10 @@ export const hover = defineTool({
|
||||
element: z.number(),
|
||||
}),
|
||||
handler: async (args, ctx, response) => {
|
||||
await ctx.browser.hover(args.page, args.element)
|
||||
response.text(`Hovered over [${args.element}]`)
|
||||
const coords = await ctx.browser.hover(args.page, args.element)
|
||||
response.text(
|
||||
`Hovered over [${args.element}] at (${Math.round(coords.x)}, ${Math.round(coords.y)})`,
|
||||
)
|
||||
response.data({ action: 'hover', page: args.page, element: args.element })
|
||||
},
|
||||
})
|
||||
@@ -144,8 +149,18 @@ export const fill = defineTool({
|
||||
clear: z.boolean(),
|
||||
}),
|
||||
handler: async (args, ctx, response) => {
|
||||
await ctx.browser.fill(args.page, args.element, args.text, args.clear)
|
||||
response.text(`Typed ${args.text.length} characters into [${args.element}]`)
|
||||
const coords = await ctx.browser.fill(
|
||||
args.page,
|
||||
args.element,
|
||||
args.text,
|
||||
args.clear,
|
||||
)
|
||||
const coordText = coords
|
||||
? ` at (${Math.round(coords.x)}, ${Math.round(coords.y)})`
|
||||
: ''
|
||||
response.text(
|
||||
`Typed ${args.text.length} characters into [${args.element}]${coordText}`,
|
||||
)
|
||||
response.data({
|
||||
action: 'fill',
|
||||
page: args.page,
|
||||
@@ -205,7 +220,7 @@ export const drag = defineTool({
|
||||
targetY: z.number().optional(),
|
||||
}),
|
||||
handler: async (args, ctx, response) => {
|
||||
await ctx.browser.drag(args.page, args.sourceElement, {
|
||||
const coords = await ctx.browser.drag(args.page, args.sourceElement, {
|
||||
element: args.targetElement,
|
||||
x: args.targetX,
|
||||
y: args.targetY,
|
||||
@@ -214,7 +229,9 @@ export const drag = defineTool({
|
||||
args.targetElement !== undefined
|
||||
? `[${args.targetElement}]`
|
||||
: `(${args.targetX}, ${args.targetY})`
|
||||
response.text(`Dragged [${args.sourceElement}] \u2192 ${target}`)
|
||||
response.text(
|
||||
`Dragged [${args.sourceElement}] (${Math.round(coords.from.x)}, ${Math.round(coords.from.y)}) \u2192 ${target} (${Math.round(coords.to.x)}, ${Math.round(coords.to.y)})`,
|
||||
)
|
||||
response.data({
|
||||
action: 'drag',
|
||||
page: args.page,
|
||||
|
||||
@@ -109,15 +109,20 @@ export const take_screenshot = defineTool({
|
||||
}),
|
||||
output: z.object({
|
||||
mimeType: z.string(),
|
||||
devicePixelRatio: z.number(),
|
||||
}),
|
||||
handler: async (args, ctx, response) => {
|
||||
const { data, mimeType } = await ctx.browser.screenshot(args.page, {
|
||||
format: args.format,
|
||||
quality: args.quality,
|
||||
fullPage: args.fullPage,
|
||||
})
|
||||
const { data, mimeType, devicePixelRatio } = await ctx.browser.screenshot(
|
||||
args.page,
|
||||
{
|
||||
format: args.format,
|
||||
quality: args.quality,
|
||||
fullPage: args.fullPage,
|
||||
},
|
||||
)
|
||||
response.image(data, mimeType)
|
||||
response.data({ mimeType })
|
||||
response.text(`devicePixelRatio: ${devicePixelRatio}`)
|
||||
response.data({ mimeType, devicePixelRatio })
|
||||
},
|
||||
})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user