fix: make SDK navigation tolerate unfocused startup tabs (#607)

This commit is contained in:
Nikhil
2026-03-27 14:34:36 -07:00
committed by GitHub
parent 4b191a759c
commit 83a25ad301
2 changed files with 103 additions and 49 deletions

View File

@@ -6,7 +6,7 @@
* Browser Service - Direct browser operations for SDK
*/
import type { Browser } from '../../../browser/browser'
import type { Browser, PageInfo } from '../../../browser/browser'
import type {
ActiveTab,
InteractiveElements,
@@ -19,6 +19,65 @@ import { SdkError } from './types'
export class BrowserService {
constructor(private browser: Browser) {}
/**
 * Chooses one page from `pages`, optionally limited to a single window.
 *
 * Preference order among the candidates: the first page flagged `isActive`,
 * then the first page that is not `isHidden`, then the first candidate.
 *
 * @param pages - Pages to choose from.
 * @param windowId - When given, only pages with this `windowId` are considered.
 * @returns The chosen page, or `null` when there are no candidates.
 */
private selectPage(pages: PageInfo[], windowId?: number): PageInfo | null {
  let candidates = pages
  if (windowId !== undefined) {
    candidates = pages.filter((candidate) => candidate.windowId === windowId)
  }

  if (candidates.length === 0) {
    return null
  }

  const activeCandidate = candidates.find((candidate) => candidate.isActive)
  if (activeCandidate !== undefined) {
    return activeCandidate
  }

  const visibleCandidate = candidates.find((candidate) => !candidate.isHidden)
  if (visibleCandidate !== undefined) {
    return visibleCandidate
  }

  // Nothing is active or visible: fall back to the first candidate.
  return candidates[0]
}
/**
 * Looks for an already-open page to operate on without creating one.
 *
 * With no `windowId`, the page returned by `browser.getActivePage()` is used
 * when there is one. In every other case the result comes from running
 * `selectPage` over `browser.listPages()`.
 *
 * @param windowId - Optional window to restrict the search to.
 * @returns A matching page, or `null` when none is found.
 */
private async findExistingPage(windowId?: number): Promise<PageInfo | null> {
  const isUnscoped = windowId === undefined
  // The active-page shortcut is only consulted when no window was requested.
  const focusedPage = isUnscoped ? await this.browser.getActivePage() : null
  if (focusedPage) {
    return focusedPage
  }

  const allPages = await this.browser.listPages()
  return this.selectPage(allPages, windowId)
}
/**
 * Same lookup as `findExistingPage`, but a missing page is an error.
 *
 * @param windowId - Optional window to restrict the search to.
 * @returns The page that was found.
 * @throws SdkError when no page is found; the message depends on whether a
 *   `windowId` was supplied.
 */
private async resolveExistingPage(windowId?: number): Promise<PageInfo> {
  const found = await this.findExistingPage(windowId)
  if (found) {
    return found
  }

  const message =
    windowId === undefined
      ? 'No active tab found'
      : 'No tab found in specified window'
  throw new SdkError(message)
}
/**
 * Resolves the page that a navigation should target.
 *
 * An existing page (per `findExistingPage`) is reused when available. If none
 * exists and a `windowId` was requested, this fails without creating a tab.
 * Otherwise an `about:blank` page is opened through `browser.newPage` with
 * `background: false` and then looked up in `browser.listPages()`.
 *
 * @param windowId - Optional window the navigation is scoped to.
 * @returns The page to navigate.
 * @throws SdkError when the requested window has no tab, or when the newly
 *   created page cannot be found in the page list.
 */
private async resolveNavigationPage(windowId?: number): Promise<PageInfo> {
  const reusablePage = await this.findExistingPage(windowId)
  if (reusablePage) {
    return reusablePage
  }

  // A tab is never created on behalf of an explicitly requested window.
  if (windowId !== undefined) {
    throw new SdkError('No tab found in specified window')
  }

  const newPageId = await this.browser.newPage('about:blank', {
    background: false,
  })

  // newPage yields only an id, so the full PageInfo is read from the list.
  const pagesAfterCreate = await this.browser.listPages()
  const openedPage = pagesAfterCreate.find(
    (candidate) => candidate.pageId === newPageId,
  )
  if (openedPage) {
    return openedPage
  }

  throw new SdkError('Failed to create a tab for navigation')
}
private async getPageIdForTab(tabId: number): Promise<number> {
const resolved = await this.browser.resolveTabIds([tabId])
const pageId = resolved.get(tabId)
@@ -29,26 +88,7 @@ export class BrowserService {
}
async getActiveTab(windowId?: number): Promise<ActiveTab> {
if (windowId !== undefined) {
// Find the active tab in the specified window
const pages = await this.browser.listPages()
const page = pages.find((p) => p.windowId === windowId && p.isActive)
if (!page) {
throw new SdkError('No active tab found in specified window')
}
return {
tabId: page.tabId,
url: page.url,
title: page.title,
windowId: page.windowId ?? 0,
}
}
const page = await this.browser.getActivePage()
if (!page) {
throw new SdkError('No active tab found')
}
const page = await this.resolveExistingPage(windowId)
return {
tabId: page.tabId,
url: page.url,
@@ -89,20 +129,7 @@ export class BrowserService {
return { tabId, windowId: page.windowId ?? 0 }
}
if (windowId !== undefined) {
const pages = await this.browser.listPages()
const page = pages.find((p) => p.windowId === windowId && p.isActive)
if (!page) {
throw new SdkError('No active tab in specified window')
}
await this.browser.goto(page.pageId, url)
return { tabId: page.tabId, windowId }
}
const activePage = await this.browser.getActivePage()
if (!activePage) {
throw new SdkError('No active tab to navigate')
}
const activePage = await this.resolveNavigationPage(windowId)
await this.browser.goto(activePage.pageId, url)
return {
tabId: activePage.tabId,

View File

@@ -6,21 +6,57 @@
* Tests the SDK against a real BrowserOS server.
*/
import { beforeAll, describe, it } from 'bun:test'
import { afterAll, beforeAll, describe, it } from 'bun:test'
import assert from 'node:assert'
import { Agent } from '@browseros-ai/agent-sdk'
import { CdpBackend } from '../../src/browser/backends/cdp'
import type { ControllerBackend } from '../../src/browser/backends/types'
import { Browser } from '../../src/browser/browser'
import {
ensureBrowserOS,
type TestEnvironmentConfig,
} from '../__helpers__/setup'
// Test environment configuration; assigned in beforeAll from ensureBrowserOS().
let config: TestEnvironmentConfig
// CDP backend connected by getRuntimeWindow(); stays null until then and is
// disconnected in afterAll.
let cdp: CdpBackend | null = null
// Window ID taken from a page of the running browser; assigned in beforeAll.
let runtimeWindowId: number
// Stand-in ControllerBackend for these tests: start/stop do nothing, it always
// reports itself as disconnected, and any attempt to send rejects.
const stubController: ControllerBackend = {
  async start() {},
  async stop() {},
  isConnected() {
    return false
  },
  async send() {
    throw new Error('Controller not available in SDK tests')
  },
}
/**
 * Connects to the browser over CDP and returns the window ID of one of its
 * pages.
 *
 * Among pages that carry a `windowId`, the first non-hidden one is preferred;
 * otherwise the first page with a `windowId` is used. The connected backend is
 * stored in the module-level `cdp` variable.
 *
 * @param testConfig - Test environment; its `cdpPort` is used for the connection.
 * @returns The selected page's window ID.
 * @throws AssertionError when no listed page has a window ID.
 */
async function getRuntimeWindow(
  testConfig: TestEnvironmentConfig,
): Promise<number> {
  const backend = new CdpBackend({ port: testConfig.cdpPort })
  await backend.connect()
  // Kept at module scope so the afterAll hook can disconnect it.
  cdp = backend

  const pages = await new Browser(backend, stubController).listPages()
  const pagesWithWindow = pages.filter((entry) => entry.windowId !== undefined)
  const chosen =
    pagesWithWindow.find((entry) => !entry.isHidden) ?? pagesWithWindow[0]

  assert.ok(chosen?.windowId !== undefined, 'Expected a runtime window ID')
  return chosen.windowId
}
// Suite setup: obtain the test environment config, then record a window ID
// from the running browser for the windowId-scoped tests.
// NOTE(review): 60000 is presumably the hook timeout in milliseconds — confirm.
beforeAll(async () => {
  const environment = await ensureBrowserOS()
  config = environment
  runtimeWindowId = await getRuntimeWindow(environment)
}, 60000)
// Suite teardown: disconnect the CDP backend if one was ever connected.
afterAll(async () => {
  const backend = cdp
  if (backend == null) {
    return
  }
  await backend.disconnect()
})
function createAgent(browserContext?: {
windowId?: number
activeTab?: { id: number; url: string }
@@ -177,20 +213,17 @@ describe('Agent SDK Integration', () => {
describe('browserContext', () => {
it('passes windowId through nav()', async () => {
const testWindowId = 12345
const testWindowId = runtimeWindowId
const agent = createAgent({ windowId: testWindowId })
const events: unknown[] = []
agent.onProgress((event) => events.push(event))
// This will use the windowId from browserContext
// Server logs should show the windowId being passed
const result = await agent.nav('data:text/html,<h1>Window Test</h1>')
console.log('\n=== nav() with windowId ===')
console.log('windowId:', testWindowId)
console.log('result:', JSON.stringify(result, null, 2))
// Navigation may fail if window doesn't exist, but we're testing the flow
assert.ok(
typeof result.success === 'boolean',
'Should return a result with success boolean',
@@ -198,14 +231,12 @@ describe('Agent SDK Integration', () => {
}, 30000)
it('passes windowId through act()', async () => {
const testWindowId = 12345
const testWindowId = runtimeWindowId
const agent = createAgent({ windowId: testWindowId })
// First navigate without windowId constraint to set up the page
const plainAgent = createAgent()
await plainAgent.nav('data:text/html,<button id="btn">Click</button>')
// Now act with windowId - server logs should show windowId being passed
const result = await agent.act('describe what you see')
console.log('\n=== act() with windowId ===')
@@ -220,14 +251,12 @@ describe('Agent SDK Integration', () => {
it('passes windowId through extract()', async () => {
const { z } = await import('zod')
const testWindowId = 12345
const testWindowId = runtimeWindowId
const agent = createAgent({ windowId: testWindowId })
// Set up a page first
const plainAgent = createAgent()
await plainAgent.nav('data:text/html,<h1>Extract Test</h1>')
// Extract with windowId - server logs should show windowId
const result = await agent.extract('get the page heading', {
schema: z.object({ heading: z.string() }),
})
@@ -240,14 +269,12 @@ describe('Agent SDK Integration', () => {
}, 60000)
it('passes windowId through verify()', async () => {
const testWindowId = 12345
const testWindowId = runtimeWindowId
const agent = createAgent({ windowId: testWindowId })
// Set up a page first
const plainAgent = createAgent()
await plainAgent.nav('data:text/html,<h1>Verify Test</h1>')
// Verify with windowId - server logs should show windowId
const result = await agent.verify('the page has some content')
console.log('\n=== verify() with windowId ===')