mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
* feat: add GitHub Copilot as OAuth-based LLM provider Add GitHub Copilot as a second OAuth provider using the Device Code flow (RFC 8628). Users authenticate via github.com/login/device, and the server polls for token completion. Supports 25+ models through a single Copilot subscription. Key changes: - Device Code OAuth flow in token manager (poll with safety margin) - Custom fetch wrapper injecting Copilot headers + vision detection - Provider factory using createOpenAICompatible for Chat Completions API - Extension UI with template card, auto-create on auth, and disconnect * fix: address PR review comments for GitHub Copilot OAuth - Validate device code response for error fields (GitHub can return 200 with error payload) - Store empty refreshToken instead of access token for GitHub tokens - Add closeButton to Toaster for dismissing device code toast * fix: add github-copilot to agent provider factory The chat route uses a separate provider-factory.ts (agent layer) from the test-provider route (llm/provider.ts). Added createGitHubCopilotFactory to the agent factory so chat works with GitHub Copilot. * fix: add github-copilot to provider icons, models, and dialog - Add Github icon from lucide-react to providerIcons map - Add 8 Copilot models (GPT-4o, Claude, Gemini, Grok) to models.ts - Add github-copilot to NewProviderDialog zod enum, validation skip, canTest check, and OAuth credential message * fix: reorder copilot models with free-tier models first Put models available on Copilot Free at the top (gpt-4o, gpt-4.1, gpt-5-mini, claude-haiku-4.5, grok-code-fast-1), followed by premium models that require paid Copilot subscription. * fix: set correct 64K context window for Copilot models Copilot API enforces a 64K input token limit regardless of the underlying model's native context window. Updated all model entries and the default template to 64000 so compaction triggers correctly. 
* fix: use actual per-model prompt limits from Copilot /models API Queried api.githubcopilot.com/models for real max_prompt_tokens values. GPT-4o/4.1 have 64K, Claude/gpt-5-mini have 128K, GPT-5.x have 272K. Also updated model list to match what's actually available on the API (e.g. claude-sonnet-4.6 instead of 4.5, added gpt-5.4/5.2-codex). * feat: resize images for Copilot using VS Code's algorithm Large screenshots cause 413 errors on Copilot's API. Resize images following VS Code's approach: max 2048px longest side, 768px shortest side, re-encode as JPEG at 75% quality. Uses sharp for server-side image processing. * fix: address all Greptile P1 review comments - Add .catch() on fire-and-forget pollDeviceCode to prevent unhandled rejection crashes (Node 15+) - Add deduplication guard (activeDeviceFlows Set) to prevent concurrent device code flows for the same provider - Add runtime validation of server response in frontend before calling window.open() and showing toast - Remove dead GITHUB_DEVICE_VERIFICATION constant from urls.ts * fix: upgrade biome to 2.4.8, fix all lint errors, and address review bugs - Upgrade biome from 2.4.5 to 2.4.8 (matches CI) and migrate configs - Fix image resize: only re-encode when dimensions actually change - Fix device code polling: retry on transient network errors instead of aborting - Allow restarting device code flow (clear old flow instead of throwing 500) - Fix pre-existing noNonNullAssertion and noExplicitAny lint errors globally * fix: address Greptile P2 review — image resize and config guard - Fix early-return guard: check max/min sides against their respective limits (MAX_LONG_SIDE/MAX_SHORT_SIDE) instead of both against SHORT - Preserve PNG alpha: detect hasAlpha and keep PNG format instead of unconditionally converting to lossy JPEG - Keep browserosId guard in resolveGitHubCopilotConfig consistent with ChatGPT Pro pattern (safety check that caller context is valid) * feat: update Copilot models to full list from 
pricing page, default to gpt-5-mini Added all 23 models from GitHub Copilot pricing page. Ordered with free-tier models first (gpt-5-mini, claude-haiku-4.5), then premium. Changed default from gpt-4o to gpt-5-mini since it's unlimited on Pro plan and has 128K context (vs gpt-4o's 64K limit).
250 lines
7.2 KiB
TypeScript
250 lines
7.2 KiB
TypeScript
/**
|
||
* Long-running stress test to simulate eval behavior
|
||
* Run with: bun apps/eval/scripts/debug-long-run.ts
|
||
*/
|
||
|
||
import { Client } from '@modelcontextprotocol/sdk/client/index.js'
|
||
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js'
|
||
|
||
/** Base URL of the server under test (loopback address, port 9110). */
const SERVER_URL = 'http://127.0.0.1:9110'
/** MCP endpoint of that server; every MCP client transport connects here. */
const MCP_URL = `${SERVER_URL}/mcp`

// Simulate 60 turns like the failing task had
const NUM_TURNS = 60
/** A screenshot is requested on every turn whose number is a multiple of this. */
const SCREENSHOT_EVERY_N_TURNS = 1
|
||
|
||
/**
 * Asks the server whether the browser extension is currently connected.
 *
 * Performs a GET on `${SERVER_URL}/extension-status`, aborted after 5 seconds.
 *
 * @returns `true` only when the response is OK and its JSON body has
 *   `extensionConnected === true`; `false` for a non-OK status, a missing or
 *   different flag, a timeout, a network error, or an unparseable body.
 */
async function checkExtension(): Promise<boolean> {
  try {
    const res = await fetch(`${SERVER_URL}/extension-status`, {
      signal: AbortSignal.timeout(5000),
    })
    if (!res.ok) return false
    const data = (await res.json()) as { extensionConnected?: boolean }
    return data.extensionConnected === true
  } catch {
    // Best-effort probe: any failure is reported as "not connected".
    return false
  }
}
|
||
|
||
/**
 * Calls a single MCP tool over a fresh client connection and reports the outcome.
 *
 * A new `Client` and `StreamableHTTPClientTransport` are created for every call
 * and the transport is closed afterwards, whatever the outcome. This function
 * never throws: every failure is folded into the returned object.
 *
 * @param name - Name of the MCP tool to invoke.
 * @param args - Arguments forwarded to the tool.
 * @param timeoutMs - Maximum time to wait for the tool call itself (the
 *   preceding connect is not covered by this limit).
 * @returns `success` plus the elapsed wall-clock `duration` in milliseconds
 *   (measured from before the connect); `error` carries the tool's first text
 *   content item, the thrown error's message, or the timeout message.
 */
async function callMcpTool(
  name: string,
  args: Record<string, unknown> = {},
  timeoutMs: number = 65000,
): Promise<{ success: boolean; error?: string; duration: number }> {
  const start = Date.now()
  const client = new Client({ name: 'long-run-test', version: '1.0.0' })
  const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))
  // Handle of the timeout timer so it can be cancelled once the race settles.
  let timer: ReturnType<typeof setTimeout> | undefined

  try {
    await client.connect(transport)

    const toolPromise = client.callTool({ name, arguments: args })
    const timeoutPromise = new Promise<never>((_, reject) => {
      timer = setTimeout(
        () => reject(new Error(`Timeout after ${timeoutMs}ms`)),
        timeoutMs,
      )
    })

    const result = await Promise.race([toolPromise, timeoutPromise])
    const duration = Date.now() - start

    const res = result as Record<string, unknown>
    if (res.isError) {
      const content = res.content as
        | Array<{ type: string; text?: string }>
        | undefined
      // `||` (not `??`) on purpose: an empty error text also falls back.
      const errorText =
        content?.find((c) => c.type === 'text')?.text || 'Unknown error'
      return { success: false, error: errorText, duration }
    }

    return { success: true, duration }
  } catch (error) {
    return {
      success: false,
      error: error instanceof Error ? error.message : String(error),
      duration: Date.now() - start,
    }
  } finally {
    // Without this, every call leaves a pending timer behind that keeps the
    // process alive for up to `timeoutMs` after the work is done.
    clearTimeout(timer)
    try {
      await transport.close()
    } catch {
      // Closing is best-effort; a failure here must not mask the result.
    }
  }
}
|
||
|
||
/** Returns the text of the first `text` item in a tool result's `content`, or `''`. */
function firstTextContent(result: Record<string, unknown>): string {
  const content = result.content as
    | Array<{ type: string; text?: string }>
    | undefined
  return content?.find((c) => c.type === 'text')?.text || ''
}

/** Formats `part / total` as a percentage with one decimal; `'0.0'` when `total` is 0. */
function formatPercent(part: number, total: number): string {
  return total > 0 ? ((part / total) * 100).toFixed(1) : '0.0'
}

/**
 * Opens an unfocused window on https://example.com via `browser_create_window`
 * and resolves its window and tab ids (0 for any id that could not be found).
 */
async function createTestWindow(): Promise<{
  windowId: number
  tabId: number
}> {
  const client = new Client({ name: 'long-run-test', version: '1.0.0' })
  const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))

  try {
    await client.connect(transport)
    const result = (await client.callTool({
      name: 'browser_create_window',
      arguments: { url: 'https://example.com', focused: false },
    })) as Record<string, unknown>

    // Try structured content first
    const structured = result.structuredContent as
      | Record<string, number>
      | undefined
    let windowId = structured?.windowId ?? 0
    let tabId = structured?.tabId ?? 0

    // Fall back to parsing text
    if (!windowId || !tabId) {
      const text = firstTextContent(result)
      const windowMatch = text.match(/window\s+(\d+)/i)
      const tabMatch =
        text.match(/Tab ID:\s*(\d+)/i) || text.match(/tab\s+(\d+)/i)
      if (windowMatch) windowId = parseInt(windowMatch[1], 10)
      if (tabMatch) tabId = parseInt(tabMatch[1], 10)
    }

    return { windowId, tabId }
  } finally {
    try {
      await transport.close()
    } catch {
      // Closing is best-effort.
    }
  }
}

/** Prints the raw `browser_list_tabs` response as a debugging aid. */
async function dumpTabList(): Promise<void> {
  const client = new Client({ name: 'long-run-test', version: '1.0.0' })
  const transport = new StreamableHTTPClientTransport(new URL(MCP_URL))

  try {
    await client.connect(transport)
    const tabs = await client.callTool({
      name: 'browser_list_tabs',
      arguments: {},
    })
    console.log('Tabs response:', JSON.stringify(tabs, null, 2))
  } finally {
    try {
      await transport.close()
    } catch {
      // Closing is best-effort.
    }
  }
}

/**
 * Runs the stress test: creates a window, then for `NUM_TURNS` turns calls one
 * randomly chosen tool, takes a screenshot every `SCREENSHOT_EVERY_N_TURNS`
 * turns and polls the extension status, and finally closes the window and
 * prints a summary.
 *
 * Failures inside the turn loop are counted and logged rather than thrown;
 * only a failure while creating the window or listing tabs rejects.
 */
async function main() {
  console.log('='.repeat(60))
  console.log('Long-Running Stress Test (simulating eval)')
  console.log('='.repeat(60))
  console.log(
    `Simulating ${NUM_TURNS} turns with screenshots every ${SCREENSHOT_EVERY_N_TURNS} turn(s)`,
  )
  console.log()

  // Create window
  console.log('Creating window...')
  const { windowId, tabId } = await createTestWindow()

  if (!windowId || !tabId) {
    console.log('❌ Could not determine window/tab IDs')
    console.log('Trying to get from list tabs...')
    await dumpTabList()
    // The window may exist even though the tab id is unknown; don't leak it.
    if (windowId) {
      await callMcpTool('browser_close_window', { windowId })
    }
    return
  }

  console.log(`Window: ${windowId}, Tab: ${tabId}`)
  console.log()

  await new Promise((r) => setTimeout(r, 2000))

  // Stats
  let screenshotSuccess = 0
  let screenshotFail = 0
  let toolSuccess = 0
  let toolFail = 0
  let extensionDisconnects = 0

  const startTime = Date.now()

  // Candidate tool calls used to simulate agent behavior; one is picked at
  // random each turn. The list does not change between turns.
  const tools = [
    {
      name: 'browser_get_interactive_elements',
      args: { tabId, windowId, simplified: true },
    },
    { name: 'browser_list_tabs', args: { windowId } },
    { name: 'browser_get_active_tab', args: { windowId } },
  ]

  // Simulate turns
  for (let turn = 1; turn <= NUM_TURNS; turn++) {
    // Pick a random tool
    const tool = tools[Math.floor(Math.random() * tools.length)]
    const toolRes = await callMcpTool(tool.name, tool.args, 30000)

    if (toolRes.success) {
      toolSuccess++
    } else {
      toolFail++
      console.log(`  Turn ${turn}: ❌ ${tool.name} failed: ${toolRes.error}`)
    }

    // Screenshot every N turns
    if (turn % SCREENSHOT_EVERY_N_TURNS === 0) {
      const ssRes = await callMcpTool(
        'browser_get_screenshot',
        { tabId, windowId, size: 'small' },
        65000,
      )

      if (ssRes.success) {
        screenshotSuccess++
      } else {
        screenshotFail++
        console.log(`  Turn ${turn}: ❌ Screenshot failed: ${ssRes.error}`)
      }
    }

    // Check extension status
    const extConnected = await checkExtension()
    if (!extConnected) {
      extensionDisconnects++
      console.log(`  Turn ${turn}: ⚠️ Extension disconnected!`)
    }

    // Progress
    if (turn % 10 === 0) {
      const elapsed = ((Date.now() - startTime) / 1000).toFixed(1)
      // Screenshots are only attempted on some turns, so their ratio is
      // against attempts made so far, not against the turn number.
      const screenshotAttempts = screenshotSuccess + screenshotFail
      console.log(
        `Turn ${turn}/${NUM_TURNS} - Screenshots: ${screenshotSuccess}/${screenshotAttempts}, Tools: ${toolSuccess}/${turn}, Disconnects: ${extensionDisconnects}, Elapsed: ${elapsed}s`,
      )
    }

    // Small delay between turns
    await new Promise((r) => setTimeout(r, 200))
  }

  // Cleanup
  console.log('\nClosing window...')
  await callMcpTool('browser_close_window', { windowId })

  // Summary
  const totalTime = ((Date.now() - startTime) / 1000).toFixed(1)
  const screenshotAttempts = screenshotSuccess + screenshotFail

  console.log(`\n${'='.repeat(60)}`)
  console.log('SUMMARY')
  console.log('='.repeat(60))
  console.log(`Total time: ${totalTime}s`)
  console.log(
    `Screenshots: ${screenshotSuccess}/${screenshotAttempts} (${formatPercent(screenshotSuccess, screenshotAttempts)}%)`,
  )
  console.log(
    `Tool calls: ${toolSuccess}/${NUM_TURNS} (${formatPercent(toolSuccess, NUM_TURNS)}%)`,
  )
  console.log(`Extension disconnects: ${extensionDisconnects}`)

  if (screenshotFail > 0 || toolFail > 0 || extensionDisconnects > 0) {
    console.log('\n⚠️ Issues detected during long run!')
  } else {
    console.log('\n✅ All operations completed successfully!')
  }
}
|
||
|
||
// Entry point. Log any uncaught failure and mark the process as failed so a
// crashed run is not reported with exit code 0.
main().catch((error: unknown) => {
  console.error(error)
  process.exitCode = 1
})
|