mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-21 12:55:09 +00:00
feat: return element coordinates in tool responses and DPR in screenshots (#437)
* feat: return element coordinates in tool responses and DPR in screenshots
  - click, hover, fill, drag now return resolved coordinates in response text
  - take_screenshot returns devicePixelRatio for mapping coordinates to pixels
  - Coordinates are in CSS pixels; multiply by DPR to get screenshot pixels
* fix: use Promise.allSettled in screenshot to prevent DPR eval from aborting capture
  Runtime.evaluate for devicePixelRatio can fail on PDF pages or chrome-extension pages. Using Promise.allSettled ensures the screenshot still succeeds, falling back to DPR=1.
This commit is contained in:
@@ -506,7 +506,7 @@ export class Browser {
|
|||||||
async screenshot(
|
async screenshot(
|
||||||
page: number,
|
page: number,
|
||||||
opts: { format: string; quality?: number; fullPage: boolean },
|
opts: { format: string; quality?: number; fullPage: boolean },
|
||||||
): Promise<{ data: string; mimeType: string }> {
|
): Promise<{ data: string; mimeType: string; devicePixelRatio: number }> {
|
||||||
const session = await this.resolveSession(page)
|
const session = await this.resolveSession(page)
|
||||||
|
|
||||||
const params: Record<string, unknown> = {
|
const params: Record<string, unknown> = {
|
||||||
@@ -515,11 +515,30 @@ export class Browser {
|
|||||||
}
|
}
|
||||||
if (opts.quality !== undefined) params.quality = opts.quality
|
if (opts.quality !== undefined) params.quality = opts.quality
|
||||||
|
|
||||||
const result = await session.Page.captureScreenshot(
|
const [screenshotResult, dprResult] = await Promise.allSettled([
|
||||||
params as Parameters<ProtocolApi['Page']['captureScreenshot']>[0],
|
session.Page.captureScreenshot(
|
||||||
)
|
params as Parameters<ProtocolApi['Page']['captureScreenshot']>[0],
|
||||||
|
),
|
||||||
|
session.Runtime.evaluate({
|
||||||
|
expression: 'window.devicePixelRatio',
|
||||||
|
returnByValue: true,
|
||||||
|
}),
|
||||||
|
])
|
||||||
|
|
||||||
return { data: result.data, mimeType: `image/${opts.format}` }
|
if (screenshotResult.status === 'rejected') throw screenshotResult.reason
|
||||||
|
|
||||||
|
const result = screenshotResult.value
|
||||||
|
const devicePixelRatio =
|
||||||
|
dprResult.status === 'fulfilled' &&
|
||||||
|
typeof dprResult.value.result?.value === 'number'
|
||||||
|
? dprResult.value.result.value
|
||||||
|
: 1
|
||||||
|
|
||||||
|
return {
|
||||||
|
data: result.data,
|
||||||
|
mimeType: `image/${opts.format}`,
|
||||||
|
devicePixelRatio,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async evaluate(
|
async evaluate(
|
||||||
@@ -650,7 +669,7 @@ export class Browser {
|
|||||||
page: number,
|
page: number,
|
||||||
element: number,
|
element: number,
|
||||||
opts?: { button?: string; clickCount?: number },
|
opts?: { button?: string; clickCount?: number },
|
||||||
): Promise<void> {
|
): Promise<{ x: number; y: number } | undefined> {
|
||||||
const session = await this.resolveSession(page)
|
const session = await this.resolveSession(page)
|
||||||
|
|
||||||
await elements.scrollIntoView(session, element)
|
await elements.scrollIntoView(session, element)
|
||||||
@@ -665,11 +684,13 @@ export class Browser {
|
|||||||
opts?.clickCount ?? 1,
|
opts?.clickCount ?? 1,
|
||||||
0,
|
0,
|
||||||
)
|
)
|
||||||
|
return { x, y }
|
||||||
} catch {
|
} catch {
|
||||||
logger.debug(
|
logger.debug(
|
||||||
`CDP click failed for element=${element}, falling back to JS click`,
|
`CDP click failed for element=${element}, falling back to JS click`,
|
||||||
)
|
)
|
||||||
await elements.jsClick(session, element)
|
await elements.jsClick(session, element)
|
||||||
|
return undefined
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -690,12 +711,16 @@ export class Browser {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
async hover(page: number, element: number): Promise<void> {
|
async hover(
|
||||||
|
page: number,
|
||||||
|
element: number,
|
||||||
|
): Promise<{ x: number; y: number }> {
|
||||||
const session = await this.resolveSession(page)
|
const session = await this.resolveSession(page)
|
||||||
|
|
||||||
await elements.scrollIntoView(session, element)
|
await elements.scrollIntoView(session, element)
|
||||||
const { x, y } = await elements.getElementCenter(session, element)
|
const { x, y } = await elements.getElementCenter(session, element)
|
||||||
await mouse.dispatchHover(session, x, y)
|
await mouse.dispatchHover(session, x, y)
|
||||||
|
return { x, y }
|
||||||
}
|
}
|
||||||
|
|
||||||
async fill(
|
async fill(
|
||||||
@@ -703,17 +728,24 @@ export class Browser {
|
|||||||
element: number,
|
element: number,
|
||||||
text: string,
|
text: string,
|
||||||
clear = true,
|
clear = true,
|
||||||
): Promise<void> {
|
): Promise<{ x: number; y: number } | undefined> {
|
||||||
const session = await this.resolveSession(page)
|
const session = await this.resolveSession(page)
|
||||||
|
|
||||||
await elements.scrollIntoView(session, element)
|
await elements.scrollIntoView(session, element)
|
||||||
|
|
||||||
|
let coords: { x: number; y: number } | undefined
|
||||||
try {
|
try {
|
||||||
await elements.focusElement(session, element)
|
await elements.focusElement(session, element)
|
||||||
|
try {
|
||||||
|
coords = await elements.getElementCenter(session, element)
|
||||||
|
} catch {
|
||||||
|
// coordinates are best-effort
|
||||||
|
}
|
||||||
} catch {
|
} catch {
|
||||||
try {
|
try {
|
||||||
const { x, y } = await elements.getElementCenter(session, element)
|
const { x, y } = await elements.getElementCenter(session, element)
|
||||||
await mouse.dispatchClick(session, x, y, 'left', 1, 0)
|
await mouse.dispatchClick(session, x, y, 'left', 1, 0)
|
||||||
|
coords = { x, y }
|
||||||
} catch {
|
} catch {
|
||||||
logger.warn('Could not focus element via click either')
|
logger.warn('Could not focus element via click either')
|
||||||
}
|
}
|
||||||
@@ -721,6 +753,7 @@ export class Browser {
|
|||||||
|
|
||||||
if (clear) await keyboard.clearField(session)
|
if (clear) await keyboard.clearField(session)
|
||||||
await keyboard.typeText(session, text)
|
await keyboard.typeText(session, text)
|
||||||
|
return coords
|
||||||
}
|
}
|
||||||
|
|
||||||
async pressKey(page: number, key: string): Promise<void> {
|
async pressKey(page: number, key: string): Promise<void> {
|
||||||
@@ -732,7 +765,10 @@ export class Browser {
|
|||||||
page: number,
|
page: number,
|
||||||
sourceElement: number,
|
sourceElement: number,
|
||||||
target: { element?: number; x?: number; y?: number },
|
target: { element?: number; x?: number; y?: number },
|
||||||
): Promise<void> {
|
): Promise<{
|
||||||
|
from: { x: number; y: number }
|
||||||
|
to: { x: number; y: number }
|
||||||
|
}> {
|
||||||
const session = await this.resolveSession(page)
|
const session = await this.resolveSession(page)
|
||||||
|
|
||||||
await elements.scrollIntoView(session, sourceElement)
|
await elements.scrollIntoView(session, sourceElement)
|
||||||
@@ -750,6 +786,7 @@ export class Browser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
await mouse.dispatchDrag(session, from, to)
|
await mouse.dispatchDrag(session, from, to)
|
||||||
|
return { from, to }
|
||||||
}
|
}
|
||||||
|
|
||||||
async scroll(
|
async scroll(
|
||||||
|
|||||||
@@ -29,11 +29,14 @@ export const click = defineTool({
|
|||||||
clickCount: z.number(),
|
clickCount: z.number(),
|
||||||
}),
|
}),
|
||||||
handler: async (args, ctx, response) => {
|
handler: async (args, ctx, response) => {
|
||||||
await ctx.browser.click(args.page, args.element, {
|
const coords = await ctx.browser.click(args.page, args.element, {
|
||||||
button: args.button,
|
button: args.button,
|
||||||
clickCount: args.clickCount,
|
clickCount: args.clickCount,
|
||||||
})
|
})
|
||||||
response.text(`Clicked [${args.element}]`)
|
const coordText = coords
|
||||||
|
? ` at (${Math.round(coords.x)}, ${Math.round(coords.y)})`
|
||||||
|
: ''
|
||||||
|
response.text(`Clicked [${args.element}]${coordText}`)
|
||||||
response.data({
|
response.data({
|
||||||
action: 'click',
|
action: 'click',
|
||||||
page: args.page,
|
page: args.page,
|
||||||
@@ -97,8 +100,10 @@ export const hover = defineTool({
|
|||||||
element: z.number(),
|
element: z.number(),
|
||||||
}),
|
}),
|
||||||
handler: async (args, ctx, response) => {
|
handler: async (args, ctx, response) => {
|
||||||
await ctx.browser.hover(args.page, args.element)
|
const coords = await ctx.browser.hover(args.page, args.element)
|
||||||
response.text(`Hovered over [${args.element}]`)
|
response.text(
|
||||||
|
`Hovered over [${args.element}] at (${Math.round(coords.x)}, ${Math.round(coords.y)})`,
|
||||||
|
)
|
||||||
response.data({ action: 'hover', page: args.page, element: args.element })
|
response.data({ action: 'hover', page: args.page, element: args.element })
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
@@ -144,8 +149,18 @@ export const fill = defineTool({
|
|||||||
clear: z.boolean(),
|
clear: z.boolean(),
|
||||||
}),
|
}),
|
||||||
handler: async (args, ctx, response) => {
|
handler: async (args, ctx, response) => {
|
||||||
await ctx.browser.fill(args.page, args.element, args.text, args.clear)
|
const coords = await ctx.browser.fill(
|
||||||
response.text(`Typed ${args.text.length} characters into [${args.element}]`)
|
args.page,
|
||||||
|
args.element,
|
||||||
|
args.text,
|
||||||
|
args.clear,
|
||||||
|
)
|
||||||
|
const coordText = coords
|
||||||
|
? ` at (${Math.round(coords.x)}, ${Math.round(coords.y)})`
|
||||||
|
: ''
|
||||||
|
response.text(
|
||||||
|
`Typed ${args.text.length} characters into [${args.element}]${coordText}`,
|
||||||
|
)
|
||||||
response.data({
|
response.data({
|
||||||
action: 'fill',
|
action: 'fill',
|
||||||
page: args.page,
|
page: args.page,
|
||||||
@@ -205,7 +220,7 @@ export const drag = defineTool({
|
|||||||
targetY: z.number().optional(),
|
targetY: z.number().optional(),
|
||||||
}),
|
}),
|
||||||
handler: async (args, ctx, response) => {
|
handler: async (args, ctx, response) => {
|
||||||
await ctx.browser.drag(args.page, args.sourceElement, {
|
const coords = await ctx.browser.drag(args.page, args.sourceElement, {
|
||||||
element: args.targetElement,
|
element: args.targetElement,
|
||||||
x: args.targetX,
|
x: args.targetX,
|
||||||
y: args.targetY,
|
y: args.targetY,
|
||||||
@@ -214,7 +229,9 @@ export const drag = defineTool({
|
|||||||
args.targetElement !== undefined
|
args.targetElement !== undefined
|
||||||
? `[${args.targetElement}]`
|
? `[${args.targetElement}]`
|
||||||
: `(${args.targetX}, ${args.targetY})`
|
: `(${args.targetX}, ${args.targetY})`
|
||||||
response.text(`Dragged [${args.sourceElement}] \u2192 ${target}`)
|
response.text(
|
||||||
|
`Dragged [${args.sourceElement}] (${Math.round(coords.from.x)}, ${Math.round(coords.from.y)}) \u2192 ${target} (${Math.round(coords.to.x)}, ${Math.round(coords.to.y)})`,
|
||||||
|
)
|
||||||
response.data({
|
response.data({
|
||||||
action: 'drag',
|
action: 'drag',
|
||||||
page: args.page,
|
page: args.page,
|
||||||
|
|||||||
@@ -109,15 +109,20 @@ export const take_screenshot = defineTool({
|
|||||||
}),
|
}),
|
||||||
output: z.object({
|
output: z.object({
|
||||||
mimeType: z.string(),
|
mimeType: z.string(),
|
||||||
|
devicePixelRatio: z.number(),
|
||||||
}),
|
}),
|
||||||
handler: async (args, ctx, response) => {
|
handler: async (args, ctx, response) => {
|
||||||
const { data, mimeType } = await ctx.browser.screenshot(args.page, {
|
const { data, mimeType, devicePixelRatio } = await ctx.browser.screenshot(
|
||||||
format: args.format,
|
args.page,
|
||||||
quality: args.quality,
|
{
|
||||||
fullPage: args.fullPage,
|
format: args.format,
|
||||||
})
|
quality: args.quality,
|
||||||
|
fullPage: args.fullPage,
|
||||||
|
},
|
||||||
|
)
|
||||||
response.image(data, mimeType)
|
response.image(data, mimeType)
|
||||||
response.data({ mimeType })
|
response.text(`devicePixelRatio: ${devicePixelRatio}`)
|
||||||
|
response.data({ mimeType, devicePixelRatio })
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user