diff --git a/apps/server/src/browser/browser.ts b/apps/server/src/browser/browser.ts index 17b9f3a78..ba1ec593c 100644 --- a/apps/server/src/browser/browser.ts +++ b/apps/server/src/browser/browser.ts @@ -506,7 +506,7 @@ export class Browser { async screenshot( page: number, opts: { format: string; quality?: number; fullPage: boolean }, - ): Promise<{ data: string; mimeType: string }> { + ): Promise<{ data: string; mimeType: string; devicePixelRatio: number }> { const session = await this.resolveSession(page) const params: Record = { @@ -515,11 +515,30 @@ export class Browser { } if (opts.quality !== undefined) params.quality = opts.quality - const result = await session.Page.captureScreenshot( - params as Parameters[0], - ) + const [screenshotResult, dprResult] = await Promise.allSettled([ + session.Page.captureScreenshot( + params as Parameters[0], + ), + session.Runtime.evaluate({ + expression: 'window.devicePixelRatio', + returnByValue: true, + }), + ]) - return { data: result.data, mimeType: `image/${opts.format}` } + if (screenshotResult.status === 'rejected') throw screenshotResult.reason + + const result = screenshotResult.value + const devicePixelRatio = + dprResult.status === 'fulfilled' && + typeof dprResult.value.result?.value === 'number' + ? dprResult.value.result.value + : 1 + + return { + data: result.data, + mimeType: `image/${opts.format}`, + devicePixelRatio, + } } async evaluate( @@ -650,7 +669,7 @@ export class Browser { page: number, element: number, opts?: { button?: string; clickCount?: number }, - ): Promise { + ): Promise<{ x: number; y: number } | undefined> { const session = await this.resolveSession(page) await elements.scrollIntoView(session, element) @@ -665,11 +684,13 @@ export class Browser { opts?.clickCount ?? 1, 0, ) + return { x, y } } catch { logger.debug( `CDP click failed for element=${element}, falling back to JS click`, ) await elements.jsClick(session, element) + return undefined } } @@ -690,12 +711,16 @@ export class Browser { ) } - async hover(page: number, element: number): Promise { + async hover( + page: number, + element: number, + ): Promise<{ x: number; y: number }> { const session = await this.resolveSession(page) await elements.scrollIntoView(session, element) const { x, y } = await elements.getElementCenter(session, element) await mouse.dispatchHover(session, x, y) + return { x, y } } async fill( @@ -703,17 +728,24 @@ export class Browser { element: number, text: string, clear = true, - ): Promise { + ): Promise<{ x: number; y: number } | undefined> { const session = await this.resolveSession(page) await elements.scrollIntoView(session, element) + let coords: { x: number; y: number } | undefined try { await elements.focusElement(session, element) + try { + coords = await elements.getElementCenter(session, element) + } catch { + // coordinates are best-effort + } } catch { try { const { x, y } = await elements.getElementCenter(session, element) await mouse.dispatchClick(session, x, y, 'left', 1, 0) + coords = { x, y } } catch { logger.warn('Could not focus element via click either') } @@ -721,6 +753,7 @@ export class Browser { if (clear) await keyboard.clearField(session) await keyboard.typeText(session, text) + return coords } async pressKey(page: number, key: string): Promise { @@ -732,7 +765,10 @@ export class Browser { page: number, sourceElement: number, target: { element?: number; x?: number; y?: number }, - ): Promise { + ): Promise<{ + from: { x: number; y: number } + to: { x: number; y: number } + }> { const session = await this.resolveSession(page) await elements.scrollIntoView(session, sourceElement) @@ -750,6 +786,7 @@ export class Browser { } await mouse.dispatchDrag(session, from, to) + return { from, to } } async scroll( diff --git a/apps/server/src/tools/input.ts b/apps/server/src/tools/input.ts index be425b7ed..886c2550e 100644 --- a/apps/server/src/tools/input.ts +++ b/apps/server/src/tools/input.ts @@ -29,11 +29,14 @@ export const click = defineTool({ clickCount: z.number(), }), handler: async (args, ctx, response) => { - await ctx.browser.click(args.page, args.element, { + const coords = await ctx.browser.click(args.page, args.element, { button: args.button, clickCount: args.clickCount, }) - response.text(`Clicked [${args.element}]`) + const coordText = coords + ? ` at (${Math.round(coords.x)}, ${Math.round(coords.y)})` + : '' + response.text(`Clicked [${args.element}]${coordText}`) response.data({ action: 'click', page: args.page, @@ -97,8 +100,10 @@ export const hover = defineTool({ element: z.number(), }), handler: async (args, ctx, response) => { - await ctx.browser.hover(args.page, args.element) - response.text(`Hovered over [${args.element}]`) + const coords = await ctx.browser.hover(args.page, args.element) + response.text( + `Hovered over [${args.element}] at (${Math.round(coords.x)}, ${Math.round(coords.y)})`, + ) response.data({ action: 'hover', page: args.page, element: args.element }) }, }) @@ -144,8 +149,18 @@ export const fill = defineTool({ clear: z.boolean(), }), handler: async (args, ctx, response) => { - await ctx.browser.fill(args.page, args.element, args.text, args.clear) - response.text(`Typed ${args.text.length} characters into [${args.element}]`) + const coords = await ctx.browser.fill( + args.page, + args.element, + args.text, + args.clear, + ) + const coordText = coords + ? ` at (${Math.round(coords.x)}, ${Math.round(coords.y)})` + : '' + response.text( + `Typed ${args.text.length} characters into [${args.element}]${coordText}`, + ) response.data({ action: 'fill', page: args.page, @@ -205,7 +220,7 @@ export const drag = defineTool({ targetY: z.number().optional(), }), handler: async (args, ctx, response) => { - await ctx.browser.drag(args.page, args.sourceElement, { + const coords = await ctx.browser.drag(args.page, args.sourceElement, { element: args.targetElement, x: args.targetX, y: args.targetY, @@ -214,7 +229,9 @@ export const drag = defineTool({ args.targetElement !== undefined ? `[${args.targetElement}]` : `(${args.targetX}, ${args.targetY})` - response.text(`Dragged [${args.sourceElement}] \u2192 ${target}`) + response.text( + `Dragged [${args.sourceElement}] (${Math.round(coords.from.x)}, ${Math.round(coords.from.y)}) \u2192 ${target} (${Math.round(coords.to.x)}, ${Math.round(coords.to.y)})`, + ) response.data({ action: 'drag', page: args.page, diff --git a/apps/server/src/tools/snapshot.ts b/apps/server/src/tools/snapshot.ts index 88708865f..77baf78ce 100644 --- a/apps/server/src/tools/snapshot.ts +++ b/apps/server/src/tools/snapshot.ts @@ -109,15 +109,20 @@ export const take_screenshot = defineTool({ }), output: z.object({ mimeType: z.string(), + devicePixelRatio: z.number(), }), handler: async (args, ctx, response) => { - const { data, mimeType } = await ctx.browser.screenshot(args.page, { - format: args.format, - quality: args.quality, - fullPage: args.fullPage, - }) + const { data, mimeType, devicePixelRatio } = await ctx.browser.screenshot( + args.page, + { + format: args.format, + quality: args.quality, + fullPage: args.fullPage, + }, + ) response.image(data, mimeType) - response.data({ mimeType }) + response.text(`devicePixelRatio: ${devicePixelRatio}`) + response.data({ mimeType, devicePixelRatio }) }, })