mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-21 12:55:09 +00:00
* fix: compaction config for small context windows (≤32K) Raise COMPACTION_SMALL_CONTEXT_WINDOW from 16K to 32K so models like Haiku 4.5 (30K context) use proportional 50% reserve instead of the fixed 20K reserve. Also scale fixedOverhead for small contexts (capped at 40% of context window) to prevent the doom loop where overhead alone triggers compaction on every step. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * docs: add compaction tuning guidance to limits constants Explain the relationship between SMALL_CONTEXT_WINDOW and FIXED_OVERHEAD so devs know the 24K minimum constraint when tweaking these values. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1473 lines
48 KiB
TypeScript
1473 lines
48 KiB
TypeScript
import { describe, expect, it } from 'bun:test'
|
|
import { AGENT_LIMITS } from '@browseros/shared/constants/limits'
|
|
import { LLM_PROVIDERS } from '@browseros/shared/schemas/llm'
|
|
import type { ModelMessage, ToolResultPart } from 'ai'
|
|
import {
|
|
computeConfig,
|
|
estimateTokens,
|
|
findSafeSplitPoint,
|
|
getCurrentTokenCount,
|
|
reduceToolOutputs,
|
|
type StepWithUsage,
|
|
slidingWindow,
|
|
} from '../../src/agent/compaction'
|
|
import {
|
|
countBinaryParts,
|
|
stripBinaryContent,
|
|
} from '../../src/agent/compaction/content'
|
|
import {
|
|
buildSummarizationPrompt,
|
|
buildTurnPrefixPrompt,
|
|
messagesToTranscript,
|
|
} from '../../src/agent/compaction/prompt'
|
|
import {
|
|
createContextOverflowMiddleware,
|
|
isContextOverflowError,
|
|
} from '../../src/agent/context-overflow-middleware'
|
|
import {
|
|
getMessageNormalizationOptions,
|
|
normalizeMessagesForModel,
|
|
} from '../../src/agent/message-normalization'
|
|
|
|
// Compaction tuning constants pulled from the shared agent limits. The
// expected* helpers in this file recompute the expected config values from
// these constants.
const {
  // Reserve sizing
  COMPACTION_SMALL_CONTEXT_WINDOW,
  COMPACTION_RESERVE_TOKENS,
  // Keep-recent budget
  COMPACTION_KEEP_RECENT_FRACTION,
  COMPACTION_MAX_KEEP_RECENT,
  // Summarization input bounds
  COMPACTION_MIN_SUMMARIZABLE_INPUT,
  COMPACTION_MIN_SUMMARIZABLE_INPUT_SMALL,
  COMPACTION_MAX_SUMMARIZATION_INPUT,
  // Shared floor and summarizer output sizing
  COMPACTION_MIN_TOKEN_FLOOR,
  COMPACTION_SUMMARIZER_OUTPUT_RATIO,
} = AGENT_LIMITS
|
|
|
|
/**
 * Expected reserve for a given context window.
 *
 * Windows at or below COMPACTION_SMALL_CONTEXT_WINDOW reserve half of the
 * window (rounded down); anything larger uses the fixed
 * COMPACTION_RESERVE_TOKENS.
 */
function expectedReserve(contextWindow: number): number {
  const isSmallWindow = contextWindow <= COMPACTION_SMALL_CONTEXT_WINDOW
  if (isSmallWindow) {
    return Math.floor(contextWindow * 0.5)
  }
  return COMPACTION_RESERVE_TOKENS
}
|
|
|
|
/**
 * Expected compaction trigger threshold: the context window minus the
 * expected reserve, never below zero.
 */
function expectedTrigger(contextWindow: number): number {
  const afterReserve = contextWindow - expectedReserve(contextWindow)
  return Math.max(0, afterReserve)
}
|
|
|
|
/**
 * Expected keep-recent budget: COMPACTION_KEEP_RECENT_FRACTION of the trigger
 * threshold (rounded down), capped at COMPACTION_MAX_KEEP_RECENT and never
 * below zero.
 */
function expectedKeepRecent(contextWindow: number): number {
  const fractionOfTrigger = Math.floor(
    expectedTrigger(contextWindow) * COMPACTION_KEEP_RECENT_FRACTION,
  )
  const capped = Math.min(COMPACTION_MAX_KEEP_RECENT, fractionOfTrigger)
  return Math.max(0, capped)
}
|
|
|
|
/**
 * Expected token budget left for summarization: the trigger threshold minus
 * the keep-recent budget, never below zero.
 */
function expectedAvailableToSummarize(contextWindow: number): number {
  const budget =
    expectedTrigger(contextWindow) - expectedKeepRecent(contextWindow)
  return Math.max(0, budget)
}
|
|
|
|
/**
 * Expected minimum summarizable input.
 *
 * Starts from the small- or regular-window base constant, clamps it to the
 * budget that is actually available to summarize, and finally applies
 * COMPACTION_MIN_TOKEN_FLOOR as a lower bound.
 */
function expectedMinSummarizable(contextWindow: number): number {
  let base = COMPACTION_MIN_SUMMARIZABLE_INPUT
  if (contextWindow <= COMPACTION_SMALL_CONTEXT_WINDOW) {
    base = COMPACTION_MIN_SUMMARIZABLE_INPUT_SMALL
  }
  const clampedToBudget = Math.min(
    base,
    expectedAvailableToSummarize(contextWindow),
  )
  return Math.max(COMPACTION_MIN_TOKEN_FLOOR, clampedToBudget)
}
|
|
|
|
/**
 * Expected maximum summarization input: the larger of the minimum summarizable
 * input and the available summarize budget, capped at
 * COMPACTION_MAX_SUMMARIZATION_INPUT.
 */
function expectedMaxSummarizationInput(contextWindow: number): number {
  const minSummarizable = expectedMinSummarizable(contextWindow)
  const available = expectedAvailableToSummarize(contextWindow)
  const uncapped = Math.max(minSummarizable, available)
  return Math.min(COMPACTION_MAX_SUMMARIZATION_INPUT, uncapped)
}
|
|
|
|
/**
 * Expected summarizer output cap: COMPACTION_SUMMARIZER_OUTPUT_RATIO of the
 * expected reserve (rounded down), with COMPACTION_MIN_TOKEN_FLOOR as the
 * lower bound.
 */
function expectedSummarizerMaxOutput(contextWindow: number): number {
  const shareOfReserve = Math.floor(
    expectedReserve(contextWindow) * COMPACTION_SUMMARIZER_OUTPUT_RATIO,
  )
  return Math.max(COMPACTION_MIN_TOKEN_FLOOR, shareOfReserve)
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/** Builds a plain-text message with the `user` role. */
function userMsg(text: string): ModelMessage {
  const message: ModelMessage = { role: 'user', content: text }
  return message
}
|
|
|
|
/** Builds a plain-text message with the `assistant` role. */
function assistantMsg(text: string): ModelMessage {
  const message: ModelMessage = { role: 'assistant', content: text }
  return message
}
|
|
|
|
/**
 * Builds an assistant message containing a single `tool-call` part.
 *
 * The generated toolCallId embeds `Date.now()`, so it is not stable across
 * calls and differs from the `call_<toolName>` default id that the tool-result
 * helpers in this file produce.
 */
function assistantToolCall(
  toolName: string,
  input: Record<string, unknown>,
): ModelMessage {
  const toolCallId = `call_${toolName}_${Date.now()}`
  return {
    role: 'assistant',
    content: [{ type: 'tool-call', toolCallId, toolName, input }],
  }
}
|
|
|
|
/**
 * Builds a `tool` message holding one text `tool-result` part.
 *
 * When no toolCallId is supplied the id defaults to `call_<toolName>`.
 */
function toolResult(
  toolName: string,
  text: string,
  toolCallId?: string,
): ModelMessage {
  const resultPart = {
    type: 'tool-result' as const,
    toolCallId: toolCallId ?? `call_${toolName}`,
    toolName,
    output: { type: 'text' as const, value: text },
  }
  return { role: 'tool', content: [resultPart] }
}
|
|
|
|
/**
 * Builds a `tool` message holding one `tool-result` part whose output is the
 * given value tagged as `json`. The tool call id is `call_<toolName>`.
 */
function toolResultJson(toolName: string, value: unknown): ModelMessage {
  const resultPart = {
    type: 'tool-result' as const,
    toolCallId: `call_${toolName}`,
    toolName,
    output: { type: 'json' as const, value },
  }
  return { role: 'tool', content: [resultPart] }
}
|
|
|
|
/**
 * Builds a `tool` message holding one `tool-result` part whose output is a
 * multi-part `content` payload (text, image data, file data, ...). The tool
 * call id is `call_<toolName>`.
 */
function toolResultContent(
  toolName: string,
  value: Extract<ToolResultPart['output'], { type: 'content' }>['value'],
): ModelMessage {
  const resultPart = {
    type: 'tool-result' as const,
    toolCallId: `call_${toolName}`,
    toolName,
    output: { type: 'content' as const, value },
  }
  return { role: 'tool', content: [resultPart] }
}
|
|
|
|
/**
 * Builds a user message with a text part followed by one image part backed by
 * a three-byte placeholder buffer.
 */
function userMsgWithImage(text: string): ModelMessage {
  const placeholderImage = new Uint8Array([1, 2, 3])
  return {
    role: 'user',
    content: [
      { type: 'text', text },
      { type: 'image', image: placeholderImage },
    ],
  }
}
|
|
|
|
/** Returns `char` concatenated with itself `count` times. */
function repeat(char: string, count: number): string {
  const repeated = char.repeat(count)
  return repeated
}
|
|
|
|
/**
 * Builds an agent config for tests: OpenRouter defaults with any supplied
 * overrides layered on top.
 */
function agentConfig(
  overrides: Partial<{
    provider: string
    model: string
    upstreamProvider: string
    supportsImages: boolean
  }> = {},
) {
  const defaults = {
    conversationId: 'test-conversation',
    provider: LLM_PROVIDERS.OPENROUTER,
    model: 'moonshotai/kimi-k2.5',
    sessionExecutionDir: '/tmp/browseros-tests',
  }
  // Overrides are spread last so they win over the defaults.
  return { ...defaults, ...overrides }
}
|
|
|
|
/**
 * Builds a browser-automation style conversation: one opening user request
 * followed by `exchanges` rounds of (assistant tool call, tool result of
 * `toolOutputSize` characters, assistant text confirmation).
 */
function buildBrowserConversation(
  toolOutputSize: number,
  exchanges: number,
): ModelMessage[] {
  const conversation: ModelMessage[] = [
    userMsg('Book me a flight from NYC to LAX on Kayak'),
  ]

  for (let step = 0; step < exchanges; step++) {
    const toolName = `action_${step}`
    conversation.push(
      assistantToolCall(toolName, { step }),
      toolResult(toolName, repeat('x', toolOutputSize)),
      assistantMsg(`Completed step ${step}`),
    )
  }

  return conversation
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// computeConfig — Pi-style reserve trigger
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('computeConfig — reserve trigger', () => {
  // These windows are expected to reserve half of the context, which pins the
  // trigger ratio to exactly 0.5.
  const halfReserveWindows = [8_000, 16_000, 30_000, 32_000]
  for (const contextWindow of halfReserveWindows) {
    const label = `${(contextWindow / 1000).toFixed(0)}K`
    it(`${label} model → reserve is clamped to 50% of context`, () => {
      const { reserveTokens, triggerThreshold, triggerRatio } =
        computeConfig(contextWindow)
      expect(reserveTokens).toBe(expectedReserve(contextWindow))
      expect(triggerThreshold).toBe(expectedTrigger(contextWindow))
      expect(triggerRatio).toBe(0.5)
    })
  }

  // Larger windows are expected to use the fixed reserve constant.
  const fixedReserveWindows = [64_000, 200_000, 1_000_000]
  for (const contextWindow of fixedReserveWindows) {
    const label = `${(contextWindow / 1000).toFixed(0)}K`
    it(`${label} model → reserve is fixed at COMPACTION_RESERVE_TOKENS`, () => {
      const { reserveTokens, triggerThreshold, triggerRatio } =
        computeConfig(contextWindow)
      const trigger = expectedTrigger(contextWindow)
      expect(reserveTokens).toBe(COMPACTION_RESERVE_TOKENS)
      expect(triggerThreshold).toBe(trigger)
      expect(triggerRatio).toBeCloseTo(trigger / contextWindow, 3)
    })
  }
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// computeConfig — keep-recent fraction with max cap
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('computeConfig — keep-recent', () => {
  // Windows whose keep-recent budget is checked against the fraction-based
  // expectation.
  const fractionWindows = [8_000, 16_000, 32_000, 64_000]
  for (const contextWindow of fractionWindows) {
    it(`${(contextWindow / 1000).toFixed(0)}K model → keeps ${COMPACTION_KEEP_RECENT_FRACTION * 100}% of trigger budget`, () => {
      const { keepRecentTokens, minSummarizableTokens } =
        computeConfig(contextWindow)
      expect(keepRecentTokens).toBe(expectedKeepRecent(contextWindow))
      expect(minSummarizableTokens).toBe(
        expectedMinSummarizable(contextWindow),
      )
    })
  }

  // Windows large enough that the keep-recent budget should hit the cap.
  const cappedWindows = [200_000, 1_000_000]
  for (const contextWindow of cappedWindows) {
    it(`${(contextWindow / 1000).toFixed(0)}K model → capped at COMPACTION_MAX_KEEP_RECENT`, () => {
      const { keepRecentTokens } = computeConfig(contextWindow)
      expect(keepRecentTokens).toBe(COMPACTION_MAX_KEEP_RECENT)
    })
  }
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// computeConfig — Pi-style summarization budgets
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('computeConfig — summarization budgets', () => {
  for (const contextWindow of [16_000, 32_000]) {
    it(`${(contextWindow / 1000).toFixed(0)}K model → summarize budget is trigger minus keep-recent`, () => {
      const { maxSummarizationInput, summarizerMaxOutputTokens } =
        computeConfig(contextWindow)
      expect(maxSummarizationInput).toBe(
        expectedMaxSummarizationInput(contextWindow),
      )
      expect(summarizerMaxOutputTokens).toBe(
        expectedSummarizerMaxOutput(contextWindow),
      )
    })
  }

  it('20K model → min summarizable is clamped to available summarize budget', () => {
    const contextWindow = 20_000
    const { minSummarizableTokens, maxSummarizationInput } =
      computeConfig(contextWindow)
    expect(minSummarizableTokens).toBe(expectedMinSummarizable(contextWindow))
    expect(maxSummarizationInput).toBe(
      expectedMaxSummarizationInput(contextWindow),
    )
  })

  // For very large windows the summarization input should hit its hard cap.
  for (const contextWindow of [200_000, 1_000_000]) {
    it(`${(contextWindow / 1000).toFixed(0)}K model → max summarization input capped at COMPACTION_MAX_SUMMARIZATION_INPUT`, () => {
      const { maxSummarizationInput, summarizerMaxOutputTokens } =
        computeConfig(contextWindow)
      expect(maxSummarizationInput).toBe(COMPACTION_MAX_SUMMARIZATION_INPUT)
      expect(summarizerMaxOutputTokens).toBe(
        expectedSummarizerMaxOutput(contextWindow),
      )
    })
  }
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// computeConfig — fixedOverhead scaling
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('computeConfig — fixedOverhead scaling', () => {
  const { COMPACTION_FIXED_OVERHEAD } = AGENT_LIMITS

  // Small windows: overhead is expected to be 40% of the window, which is
  // below the constant.
  for (const contextWindow of [8_000, 20_000]) {
    it(`${(contextWindow / 1000).toFixed(0)}K model → fixedOverhead capped at 40% of context`, () => {
      const { fixedOverhead } = computeConfig(contextWindow)
      expect(fixedOverhead).toBe(Math.floor(contextWindow * 0.4))
      expect(fixedOverhead).toBeLessThan(COMPACTION_FIXED_OVERHEAD)
    })
  }

  it('30K model → fixedOverhead equals constant (40% of 30K = 12K = constant)', () => {
    const { fixedOverhead } = computeConfig(30_000)
    expect(fixedOverhead).toBe(COMPACTION_FIXED_OVERHEAD)
  })

  for (const contextWindow of [64_000, 200_000, 1_000_000]) {
    it(`${(contextWindow / 1000).toFixed(0)}K model → fixedOverhead equals constant`, () => {
      const { fixedOverhead } = computeConfig(contextWindow)
      expect(fixedOverhead).toBe(COMPACTION_FIXED_OVERHEAD)
    })
  }

  // The overhead must never exceed the trigger threshold for these windows.
  for (const contextWindow of [30_000, 20_000]) {
    it(`${(contextWindow / 1000).toFixed(0)}K model → fixedOverhead does not exceed trigger threshold`, () => {
      const { fixedOverhead, triggerThreshold } = computeConfig(contextWindow)
      expect(fixedOverhead).toBeLessThanOrEqual(triggerThreshold)
    })
  }
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// estimateTokens
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('estimateTokens', () => {
  // Each image part is expected to be counted as this many tokens.
  const tokensPerImage = 1000

  it('estimates text messages as chars/3', () => {
    const conversation = [userMsg('a'.repeat(300))]
    expect(estimateTokens(conversation)).toBe(100)
  })

  it('estimates tool result text', () => {
    const conversation = [toolResult('test', 'a'.repeat(600))]
    expect(estimateTokens(conversation)).toBe(200)
  })

  it('estimates tool result JSON', () => {
    const payload = { key: 'a'.repeat(100) }
    const conversation = [toolResultJson('test', payload)]
    const serializedLength = JSON.stringify(payload).length
    expect(estimateTokens(conversation)).toBe(Math.ceil(serializedLength / 3))
  })

  it('estimates tool result content without counting base64 payload size', () => {
    const caption = 'Screenshot taken'
    const conversation = [
      toolResultContent('snapshot', [
        { type: 'text', text: caption },
        {
          type: 'image-data',
          data: 'x'.repeat(120_000),
          mediaType: 'image/png',
        },
      ]),
    ]

    // Only the caption text plus one image allowance should be counted.
    const captionTokens = Math.ceil(caption.length / 3)
    expect(estimateTokens(conversation)).toBe(captionTokens + tokensPerImage)
  })

  it('counts images as 1000 tokens each', () => {
    const conversation = [userMsgWithImage('hello')]
    const textTokens = Math.ceil('hello'.length / 3)
    expect(estimateTokens(conversation)).toBe(textTokens + tokensPerImage)
  })

  it('counts multiple images', () => {
    const caption = 'compare these'
    const twoImageMessage: ModelMessage = {
      role: 'user',
      content: [
        { type: 'text', text: caption },
        { type: 'image', image: new Uint8Array([1]) },
        { type: 'image', image: new Uint8Array([2]) },
      ],
    }
    const textTokens = Math.ceil(caption.length / 3)
    expect(estimateTokens([twoImageMessage])).toBe(
      textTokens + 2 * tokensPerImage,
    )
  })

  it('handles tool call input', () => {
    const input = { url: 'https://example.com' }
    const conversation = [assistantToolCall('navigate', input)]
    const inputTokens = Math.ceil(JSON.stringify(input).length / 3)
    expect(estimateTokens(conversation)).toBe(inputTokens)
  })

  it('handles empty messages', () => {
    expect(estimateTokens([])).toBe(0)
  })
})
|
|
|
|
describe('stripBinaryContent', () => {
  it('replaces content outputs with placeholder text and counts media parts', () => {
    // One text part plus two binary parts (an image and a PDF file).
    const conversation = [
      toolResultContent('snapshot', [
        { type: 'text', text: 'Before image' },
        { type: 'image-data', data: 'abcd', mediaType: 'image/png' },
        {
          type: 'file-data',
          data: 'efgh',
          mediaType: 'application/pdf',
          filename: 'report.pdf',
        },
      ]),
    ]

    const stripped = stripBinaryContent(conversation)
    const strippedParts = stripped[0].content as Array<{
      output: { type: string; value: string }
    }>
    const { output } = strippedParts[0]

    expect(countBinaryParts(conversation)).toBe(2)
    expect(output.type).toBe('text')

    // Text and placeholders survive...
    for (const kept of ['Before image', '[Image]', '[File: report.pdf]']) {
      expect(output.value).toContain(kept)
    }
    // ...while the raw payloads do not.
    for (const removed of ['abcd', 'efgh']) {
      expect(output.value).not.toContain(removed)
    }
  })
})
|
|
|
|
describe('getMessageNormalizationOptions', () => {
  it('marks openrouter-compatible transports as requiring normalization', () => {
    const openRouterOptions = getMessageNormalizationOptions(
      agentConfig({ provider: LLM_PROVIDERS.OPENROUTER }),
    )
    expect(openRouterOptions.supportsMediaInToolResults).toBe(false)

    // BrowserOS provider with an OpenAI upstream is also expected to need
    // normalization.
    const browserosOptions = getMessageNormalizationOptions(
      agentConfig({
        provider: LLM_PROVIDERS.BROWSEROS,
        upstreamProvider: LLM_PROVIDERS.OPENAI,
      }),
    )
    expect(browserosOptions.supportsMediaInToolResults).toBe(false)
  })

  it('keeps native anthropic and openai transports unchanged', () => {
    const anthropicOptions = getMessageNormalizationOptions(
      agentConfig({ provider: LLM_PROVIDERS.ANTHROPIC }),
    )
    expect(anthropicOptions.supportsMediaInToolResults).toBe(true)

    const openaiOptions = getMessageNormalizationOptions(
      agentConfig({ provider: LLM_PROVIDERS.OPENAI }),
    )
    expect(openaiOptions.supportsMediaInToolResults).toBe(true)
  })
})
|
|
|
|
describe('normalizeMessagesForModel', () => {
  // A snapshot tool call followed by a tool result that carries a caption and
  // an inline PNG payload.
  const screenshotExchange = () => [
    assistantToolCall('snapshot', { page: 2 }),
    toolResultContent('snapshot', [
      { type: 'text', text: 'Captured screenshot' },
      { type: 'image-data', data: 'abcd', mediaType: 'image/png' },
    ]),
  ]

  it('moves screenshot media into a follow-up user message for incompatible providers', () => {
    const normalized = normalizeMessagesForModel(screenshotExchange(), {
      supportsImages: true,
      supportsMediaInToolResults: false,
    })

    // Original two messages plus one follow-up carrying the media.
    expect(normalized).toHaveLength(3)

    const toolMessage = normalized[1]
    expect(toolMessage.role).toBe('tool')
    const toolOutput = (toolMessage.content as ToolResultPart[])[0].output
    expect(toolOutput.type).toBe('text')
    if (toolOutput.type === 'text') {
      expect(toolOutput.value).toContain('Captured screenshot')
      expect(toolOutput.value).toContain('[Image]')
      expect(toolOutput.value).not.toContain('abcd')
    }

    const followUp = normalized[2]
    expect(followUp.role).toBe('user')
    expect(Array.isArray(followUp.content)).toBe(true)
    if (Array.isArray(followUp.content)) {
      const [intro, image] = followUp.content
      expect(intro).toEqual({
        type: 'text',
        text: 'Attached image(s) from tool result:',
      })
      expect(image).toEqual({
        type: 'image',
        image: 'abcd',
        mediaType: 'image/png',
      })
    }
  })

  it('keeps media out of the prompt when the model does not support image input', () => {
    const normalized = normalizeMessagesForModel(screenshotExchange(), {
      supportsImages: false,
      supportsMediaInToolResults: false,
    })

    // No follow-up message is added; the tool output is reduced to text.
    expect(normalized).toHaveLength(2)
    const toolOutput = (normalized[1].content as ToolResultPart[])[0].output
    expect(toolOutput.type).toBe('text')
  })

  it('converts generic file attachments into follow-up user file parts', () => {
    const reportExchange = [
      assistantToolCall('fetch_report', { id: 'report-1' }),
      toolResultContent('fetch_report', [
        { type: 'text', text: 'Downloaded report' },
        {
          type: 'file-data',
          data: 'cGRm',
          mediaType: 'application/pdf',
          filename: 'report.pdf',
        },
      ]),
    ]

    const normalized = normalizeMessagesForModel(reportExchange, {
      supportsImages: true,
      supportsMediaInToolResults: false,
    })

    expect(normalized).toHaveLength(3)
    const followUp = normalized[2]
    expect(followUp.role).toBe('user')
    if (Array.isArray(followUp.content)) {
      const [intro, file] = followUp.content
      expect(intro).toEqual({
        type: 'text',
        text: 'Attached file(s) from tool result:',
      })
      expect(file).toEqual({
        type: 'file',
        data: 'cGRm',
        mediaType: 'application/pdf',
        filename: 'report.pdf',
      })
    }
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// findSafeSplitPoint
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('findSafeSplitPoint', () => {
  it('returns splitIndex -1 for too few messages', () => {
    const conversation = [userMsg('hello'), assistantMsg('hi')]
    const { splitIndex, isSplitTurn } = findSafeSplitPoint(conversation, 1)
    expect(splitIndex).toBe(-1)
    expect(isSplitTurn).toBe(false)
  })

  it('returns splitIndex -1 when conversation is smaller than keepRecent', () => {
    // Three tiny messages against a keepRecent budget of 1000 tokens.
    const conversation = [
      userMsg('hello'),
      assistantMsg('hi'),
      userMsg('what'),
    ]
    const { splitIndex, isSplitTurn } = findSafeSplitPoint(conversation, 1000)
    expect(splitIndex).toBe(-1)
    expect(isSplitTurn).toBe(false)
  })

  it('never cuts before a tool message', () => {
    // Layout: user, assistant(tool_call), tool, assistant(text), user, assistant
    const conversation: ModelMessage[] = [
      userMsg('do something'),
      assistantToolCall('navigate', { url: 'https://example.com' }),
      toolResult('navigate', repeat('x', 2000)),
      assistantMsg('done navigating'),
      userMsg(repeat('y', 8000)),
      assistantMsg(repeat('z', 8000)),
    ]

    const { splitIndex } = findSafeSplitPoint(conversation, 2100)
    expect(splitIndex).toBeGreaterThan(0)
    expect(conversation[splitIndex].role).not.toBe('tool')
  })

  it('walks backward past tool messages to find safe cut', () => {
    // Two consecutive tool call/result pairs sit before a large final
    // assistant message; a cut must not land on either tool result.
    const conversation: ModelMessage[] = [
      userMsg('start'),
      assistantMsg('ok'),
      assistantToolCall('click', { selector: '#btn' }),
      toolResult('click', repeat('x', 4000)),
      assistantToolCall('snapshot', {}),
      toolResult('snapshot', repeat('y', 4000)),
      assistantMsg(repeat('z', 8000)),
    ]

    const { splitIndex } = findSafeSplitPoint(conversation, 2500)
    // Only checked when a split point is actually reported.
    if (splitIndex !== -1) {
      expect(conversation[splitIndex].role).not.toBe('tool')
    }
  })

  it('splits correctly in a realistic browser automation flow', () => {
    // 10 exchanges, each with a 4000-character tool output.
    const conversation = buildBrowserConversation(4000, 10)
    const keepRecent = 3000
    const { splitIndex } = findSafeSplitPoint(conversation, keepRecent)

    expect(splitIndex).toBeGreaterThan(0)
    expect(splitIndex).toBeLessThan(conversation.length)
    expect(conversation[splitIndex].role).not.toBe('tool')

    // The kept tail must cover at least the requested keep-recent budget.
    const keptTokens = estimateTokens(conversation.slice(splitIndex))
    expect(keptTokens).toBeGreaterThanOrEqual(keepRecent)
  })

  it('handles assistant tool_call followed by tool result pairs', () => {
    const conversation: ModelMessage[] = [
      userMsg('start'),
      assistantToolCall('a', {}),
      toolResult('a', 'result a'),
      assistantToolCall('b', {}),
      toolResult('b', 'result b'),
      assistantToolCall('c', {}),
      toolResult('c', repeat('z', 4000)),
      assistantMsg('final answer'),
    ]

    const { splitIndex } = findSafeSplitPoint(conversation, 500)
    if (splitIndex !== -1) {
      // Every kept tool result must directly follow an assistant message.
      const kept = conversation.slice(splitIndex)
      kept.forEach((message, position) => {
        if (message.role === 'tool') {
          expect(position).toBeGreaterThan(0)
          expect(kept[position - 1].role).toBe('assistant')
        }
      })
    }
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// findSafeSplitPoint — split turn detection
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('findSafeSplitPoint — split turn detection', () => {
  it('detects split turn when cut lands mid-turn (user+assistant+tool+assistant+tool)', () => {
    const conversation: ModelMessage[] = [
      userMsg('first request'), // index 0
      assistantMsg('done with first'), // index 1
      userMsg('order MacBook on Amazon'), // index 2 — start of the second turn
      assistantToolCall('navigate', { url: 'https://amazon.com' }), // index 3
      toolResult('navigate', repeat('x', 4000)), // index 4
      assistantToolCall('click', { selector: '#buy' }), // index 5 — intended mid-turn cut
      toolResult('click', repeat('y', 4000)), // index 6
      assistantMsg(repeat('z', 8000)), // index 7
    ]

    const { splitIndex, isSplitTurn, turnStartIndex } = findSafeSplitPoint(
      conversation,
      2500,
    )
    // Only asserted when the cut actually falls after the turn's user message.
    if (splitIndex !== -1 && splitIndex > 2) {
      expect(isSplitTurn).toBe(true)
      expect(turnStartIndex).toBe(2)
    }
  })

  it('does not flag split turn when cut is at user message', () => {
    const conversation: ModelMessage[] = [
      userMsg('first request'),
      assistantMsg('done'),
      userMsg(repeat('x', 8000)), // index 2 — large user message
      assistantMsg(repeat('y', 8000)),
    ]

    const { splitIndex, isSplitTurn, turnStartIndex } = findSafeSplitPoint(
      conversation,
      2100,
    )
    // Only asserted when the reported cut sits on a user message.
    if (splitIndex !== -1 && conversation[splitIndex].role === 'user') {
      expect(isSplitTurn).toBe(false)
      expect(turnStartIndex).toBe(-1)
    }
  })

  it('does not flag split turn when user message is at index 0 (single turn)', () => {
    // A single user message at index 0 followed by ten tool exchanges and a
    // long closing assistant message.
    const conversation: ModelMessage[] = [userMsg('do everything')]
    for (let step = 0; step < 10; step++) {
      const toolName = `action_${step}`
      conversation.push(assistantToolCall(toolName, { step }))
      conversation.push(toolResult(toolName, repeat('x', 4000)))
    }
    conversation.push(assistantMsg(repeat('z', 8000)))

    const { splitIndex, isSplitTurn, turnStartIndex } = findSafeSplitPoint(
      conversation,
      3000,
    )
    if (splitIndex !== -1) {
      // With the only user message at index 0 this must not be reported as a
      // split turn; regular summarization is expected instead.
      expect(isSplitTurn).toBe(false)
      expect(turnStartIndex).toBe(-1)
    }
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Splitting mechanics at different model sizes
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('splitting at different context windows', () => {
  it('32K model — splits with realistic browser automation', () => {
    const { keepRecentTokens } = computeConfig(32_000)
    const conversation = buildBrowserConversation(5000, 12)

    // Sanity check: the fixture is big enough to need a split.
    expect(estimateTokens(conversation)).toBeGreaterThan(12_800)

    const { splitIndex } = findSafeSplitPoint(conversation, keepRecentTokens)
    expect(splitIndex).toBeGreaterThan(0)
    expect(conversation[splitIndex].role).not.toBe('tool')

    const keptTokens = estimateTokens(conversation.slice(splitIndex))
    expect(keptTokens).toBeGreaterThanOrEqual(keepRecentTokens)

    const summarized = conversation.slice(0, splitIndex)
    expect(summarized.length).toBeGreaterThan(0)
  })

  it('200K model — splits with long conversation', () => {
    const { keepRecentTokens } = computeConfig(200_000)
    const conversation = buildBrowserConversation(10000, 50)

    expect(estimateTokens(conversation)).toBeGreaterThan(100_000)

    const { splitIndex } = findSafeSplitPoint(conversation, keepRecentTokens)
    expect(splitIndex).toBeGreaterThan(0)

    const keptTokens = estimateTokens(conversation.slice(splitIndex))
    expect(keptTokens).toBeGreaterThanOrEqual(keepRecentTokens)
  })

  it('16K model — handles tight context', () => {
    const config = computeConfig(16_000)
    const conversation = buildBrowserConversation(2000, 5)
    const totalTokens = estimateTokens(conversation)

    // Nothing to verify unless the fixture exceeds the trigger budget.
    if (!(totalTokens > 16_000 * config.triggerRatio)) return

    const { splitIndex } = findSafeSplitPoint(
      conversation,
      config.keepRecentTokens,
    )
    if (splitIndex === -1) return

    expect(conversation[splitIndex].role).not.toBe('tool')
    const summarized = conversation.slice(0, splitIndex)
    expect(estimateTokens(summarized)).toBeGreaterThan(0)
  })

  it('keeps tool call + result pairs together after split', () => {
    const contextWindows = [16_000, 32_000, 64_000, 200_000, 1_000_000]
    for (const contextWindow of contextWindows) {
      const { keepRecentTokens } = computeConfig(contextWindow)
      const conversation = buildBrowserConversation(4000, 8)
      const { splitIndex } = findSafeSplitPoint(conversation, keepRecentTokens)

      if (splitIndex === -1) continue

      // A kept tail that opens with a tool result has lost its tool call.
      const kept = conversation.slice(splitIndex)
      if (kept.length > 0 && kept[0].role === 'tool') {
        throw new Error(
          `Orphaned tool result at start of kept messages for ${contextWindow} context window`,
        )
      }
    }
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// reduceToolOutputs
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('reduceToolOutputs', () => {
  // Reads the output of the first tool-result part of a message.
  const firstOutput = (message: ModelMessage) =>
    (message.content as Array<{ output: { type: string; value: string } }>)[0]
      .output

  it('truncates protected recent outputs exceeding maxChars', () => {
    const conversation = [toolResult('test', 'a'.repeat(20_000))]
    const reduced = reduceToolOutputs(conversation, {
      maxChars: 15_000,
      keepRecentCount: 1,
    })

    const text = firstOutput(reduced[0]).value
    expect(text.length).toBeLessThan(20_000)
    expect(text).toContain('[... truncated')
  })

  it('clears older verbose outputs but protects the last two', () => {
    const conversation = [
      toolResult('old', 'x'.repeat(500)),
      toolResult('recent_0', 'y'.repeat(500)),
      toolResult('recent_1', 'z'.repeat(500)),
    ]
    const reduced = reduceToolOutputs(conversation, {
      maxChars: 200,
      keepRecentCount: 2,
      clearThreshold: 100,
    })

    const clearedText = firstOutput(reduced[0]).value
    const protectedFirst = firstOutput(reduced[1]).value
    const protectedSecond = firstOutput(reduced[2]).value

    // The oldest output is cleared; the two most recent are only truncated.
    expect(clearedText).toBe('[Cleared — 500 chars]')
    expect(protectedFirst).toContain('[... truncated')
    expect(protectedSecond).toContain('[... truncated')
  })

  it('does not modify non-tool messages', () => {
    const conversation = [userMsg('hello'), assistantMsg('world')]
    const reduced = reduceToolOutputs(conversation, {
      maxChars: 100,
      keepRecentCount: 2,
    })
    expect(reduced).toEqual(conversation)
  })

  it('normalizes content output before reduction', () => {
    const conversation = [
      toolResultContent('snapshot', [
        { type: 'text', text: 'Captured screenshot' },
        {
          type: 'image-data',
          data: 'x'.repeat(20_000),
          mediaType: 'image/png',
        },
      ]),
    ]
    const reduced = reduceToolOutputs(conversation, {
      maxChars: 100,
      keepRecentCount: 1,
      clearThreshold: 0,
    })

    const output = firstOutput(reduced[0])

    expect(output.type).toBe('text')
    expect(output.value).toContain('Captured screenshot')
    expect(output.value).toContain('[Image]')
    // The raw image payload must not leak into the text output.
    expect(output.value).not.toContain('x'.repeat(100))
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// slidingWindow
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('slidingWindow', () => {
  it('keeps tool+assistant pairs together', () => {
    const conversation: ModelMessage[] = [
      assistantToolCall('a', {}),
      toolResult('a', repeat('x', 4000)),
      assistantToolCall('b', {}),
      toolResult('b', repeat('y', 4000)),
      userMsg('continue'),
    ]

    // Token budget chosen small enough that some messages must be dropped.
    const windowed = slidingWindow(conversation, 1500)

    // NOTE(review): this only asserts that a window opening with a tool
    // result contains more than one message; it does not reject a leading
    // tool result outright.
    const startsWithToolResult =
      windowed.length > 0 && windowed[0].role === 'tool'
    if (startsWithToolResult) {
      expect(windowed.length).toBeGreaterThan(1)
    }
  })

  it('preserves at least 2 messages', () => {
    const conversation = [
      userMsg(repeat('x', 10000)),
      assistantMsg(repeat('y', 10000)),
    ]
    const windowed = slidingWindow(conversation, 100)
    expect(windowed.length).toBeGreaterThanOrEqual(2)
  })

  it('returns original when under threshold', () => {
    const conversation = [userMsg('hello'), assistantMsg('hi')]
    const windowed = slidingWindow(conversation, 100_000)
    expect(windowed).toEqual(conversation)
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// compaction-prompt: buildSummarizationPrompt
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Tests for buildSummarizationPrompt(previousSummary): the prompt text is
 * checked for the fragments expected in the "initial" and "update" variants.
 */
describe('buildSummarizationPrompt', () => {
  it('returns initial prompt when no existing summary', () => {
    const initialPrompt = buildSummarizationPrompt(null)

    const requiredFragments = [
      'Summarize the following',
      '## Goal',
      '## Active State',
    ]
    for (const fragment of requiredFragments) {
      expect(initialPrompt).toContain(fragment)
    }
    // Without a prior summary there must be no previous-summary wrapper.
    expect(initialPrompt).not.toContain('<previous_summary>')
  })

  it('returns update prompt with previous summary', () => {
    const previousSummary = '## Goal\nold stuff'
    const updatePrompt = buildSummarizationPrompt(previousSummary)

    const requiredFragments = [
      'Update the existing summary',
      'PRESERVE all existing information',
      '<previous_summary>',
      'old stuff',
    ]
    for (const fragment of requiredFragments) {
      expect(updatePrompt).toContain(fragment)
    }
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// compaction-prompt: buildTurnPrefixPrompt
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Tests for buildTurnPrefixPrompt(): the generated prompt must mention that
 * it covers a turn prefix and contain each of the expected section headings.
 */
describe('buildTurnPrefixPrompt', () => {
  it('returns turn prefix prompt with expected sections', () => {
    const turnPrefixPrompt = buildTurnPrefixPrompt()

    const expectedFragments = [
      'PREFIX of a turn',
      '## Original Request',
      '## Early Progress',
      '## Context for Suffix',
    ]
    for (const fragment of expectedFragments) {
      expect(turnPrefixPrompt).toContain(fragment)
    }
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// compaction-prompt: messagesToTranscript
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Tests for messagesToTranscript(messages): each message kind is serialized
 * into a labelled transcript line, large tool output is truncated, and image
 * payloads are replaced by an "[Image]" placeholder.
 */
describe('messagesToTranscript', () => {
  it('serializes user messages', () => {
    const userOnly = [userMsg('hello world')]

    expect(messagesToTranscript(userOnly)).toBe('[User]: hello world')
  })

  it('serializes assistant text', () => {
    const assistantOnly = [assistantMsg('I will help')]

    expect(messagesToTranscript(assistantOnly)).toBe('[Assistant]: I will help')
  })

  it('serializes tool calls', () => {
    const navigateCall = assistantToolCall('navigate_to', {
      url: 'https://example.com',
    })

    const transcript = messagesToTranscript([navigateCall])

    expect(transcript).toContain('[Tool Call]: navigate_to(')
    expect(transcript).toContain('https://example.com')
  })

  it('serializes tool results', () => {
    const navigateResult = toolResult('navigate_to', 'Navigated to Example')

    const transcript = messagesToTranscript([navigateResult])

    expect(transcript).toContain(
      '[Tool Result] navigate_to: Navigated to Example',
    )
  })

  it('truncates large tool results to 2K', () => {
    const hugeOutputLength = 5000
    const hugeResult = toolResult('snapshot', repeat('x', hugeOutputLength))

    const transcript = messagesToTranscript([hugeResult])

    expect(transcript).toContain('[... truncated')
    // The whole transcript must end up shorter than the raw tool output.
    expect(transcript.length).toBeLessThan(5000)
  })

  it('serializes content tool results without leaking base64', () => {
    const base64Filler = 'x'.repeat(10_000)
    const snapshotWithImage = toolResultContent('snapshot', [
      { type: 'text', text: 'Captured screenshot' },
      { type: 'image-data', data: base64Filler, mediaType: 'image/png' },
    ])

    const transcript = messagesToTranscript([snapshotWithImage])

    expect(transcript).toContain('[Tool Result] snapshot: Captured screenshot')
    expect(transcript).toContain('[Image]')
    // No long run of the image payload may appear in the transcript.
    expect(transcript).not.toContain('x'.repeat(100))
  })

  it('replaces images with [Image]', () => {
    const messageWithImage = userMsgWithImage('look at this')

    const transcript = messagesToTranscript([messageWithImage])

    for (const fragment of ['[Image]', 'look at this']) {
      expect(transcript).toContain(fragment)
    }
  })

  it('handles a full conversation', () => {
    const conversation: ModelMessage[] = [
      userMsg('Open google.com'),
      assistantMsg("I'll navigate to Google."),
      assistantToolCall('navigate_to', { url: 'https://google.com' }),
      toolResult('navigate_to', 'Navigated to Google'),
      assistantMsg('I opened Google. What next?'),
      userMsg('Search for flights'),
    ]

    const transcript = messagesToTranscript(conversation)

    const expectedLines = [
      '[User]: Open google.com',
      "[Assistant]: I'll navigate to Google.",
      '[Tool Call]: navigate_to(',
      '[Tool Result] navigate_to: Navigated to Google',
      '[User]: Search for flights',
    ]
    for (const line of expectedLines) {
      expect(transcript).toContain(line)
    }
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// End-to-end: config + split coherence at all model sizes
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Cross-checks the values produced by computeConfig(contextWindow) against
 * each other for a spread of context-window sizes (8K through 1M tokens).
 */
describe('end-to-end config coherence', () => {
  // Context-window sizes (in tokens) exercised by every test in this suite.
  const modelSizes = [
    8_000, 16_000, 32_000, 64_000, 128_000, 200_000, 1_000_000,
  ]

  modelSizes.forEach((size) => {
    const sizeLabel = (size / 1000).toFixed(0)

    it(`${sizeLabel}K model — trigger budget is partitioned into keep + summarize`, () => {
      const {
        triggerThreshold,
        keepRecentTokens,
        maxSummarizationInput,
        minSummarizableTokens,
      } = computeConfig(size)

      // The trigger budget splits into a kept part and a summarizable part.
      // Large windows hit a cap and leave budget unused, hence >= not ===.
      const partitionedBudget = keepRecentTokens + maxSummarizationInput
      expect(triggerThreshold).toBeGreaterThanOrEqual(partitionedBudget)
      expect(maxSummarizationInput).toBeGreaterThanOrEqual(
        minSummarizableTokens,
      )

      // The kept portion must stay strictly below the context window.
      expect(keepRecentTokens).toBeLessThan(size)

      // The summarization input must not exceed the context window.
      expect(maxSummarizationInput).toBeLessThanOrEqual(size)
    })
  })

  it('reserve is either half-context (tiny models) or COMPACTION_RESERVE_TOKENS (larger models)', () => {
    // expectedReserve is the file-level reference helper for the reserve rule.
    for (const size of modelSizes) {
      const { reserveTokens } = computeConfig(size)
      expect(reserveTokens).toBe(expectedReserve(size))
    }
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// getCurrentTokenCount — Pi-style additive counting
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Tests for getCurrentTokenCount(steps, messages, config).
 *
 * The expectations encode an additive rule: when the last step reports usage,
 * the count is inputTokens + outputTokens + an estimate of the messages that
 * follow the most recent assistant message; otherwise the count falls back to
 * a padded estimate of the whole message list.
 */
describe('getCurrentTokenCount — Pi-style additive', () => {
  const config = computeConfig(200_000)

  /** Builds a one-element step list whose usage carries both token counts. */
  const singleStep = (
    inputTokens: number,
    outputTokens: number,
  ): StepWithUsage[] => [{ usage: { inputTokens, outputTokens } }]

  /** Estimate-only fallback: raw estimate × safety multiplier + fixed overhead. */
  const paddedEstimate = (messages: ModelMessage[]): number => {
    const rawEstimate = estimateTokens(messages, config.imageTokenEstimate)
    return (
      Math.ceil(rawEstimate * config.safetyMultiplier) + config.fixedOverhead
    )
  }

  /** Token estimate for a list of messages using the suite's image estimate. */
  const estimateOf = (messages: ModelMessage[]): number =>
    estimateTokens(messages, config.imageTokenEstimate)

  it('returns estimated with safety margin when no steps exist', () => {
    const msgs = [userMsg('a'.repeat(400))]

    const actual = getCurrentTokenCount([], msgs, config)

    expect(actual).toBe(paddedEstimate(msgs))
  })

  it('returns estimated when last step has no usage', () => {
    const stepsWithoutUsage: StepWithUsage[] = [{ usage: undefined }]
    const msgs = [userMsg('hello')]

    const actual = getCurrentTokenCount(stepsWithoutUsage, msgs, config)

    expect(actual).toBe(paddedEstimate(msgs))
  })

  it('adds outputTokens to base when no trailing post-step messages remain', () => {
    const msgs = [userMsg('hello'), assistantMsg('response')]

    const actual = getCurrentTokenCount(singleStep(50_000, 2_000), msgs, config)

    expect(actual).toBe(52_000)
  })

  it('adds trailing tool result tokens to base + output', () => {
    const toolOutput = 'x'.repeat(40_000) // ~10K tokens
    const msgs = [
      userMsg('hello'),
      assistantToolCall('snapshot', {}),
      toolResult('snapshot', toolOutput),
    ]

    const actual = getCurrentTokenCount(
      singleStep(100_000, 1_000),
      msgs,
      config,
    )

    // Only the tool result sits after the last assistant message.
    const trailingTokens = estimateOf([toolResult('snapshot', toolOutput)])
    expect(actual).toBe(100_000 + 1_000 + trailingTokens)
  })

  it('catches large DOM snapshot that would bypass threshold', () => {
    // Scenario from the original bug: the last step reported 150K tokens and
    // a 100K-char tool result (~25K tokens) was appended afterwards.
    const largeSnapshot = 'x'.repeat(100_000)
    const msgs = [
      userMsg('navigate to site'),
      assistantToolCall('snapshot', {}),
      toolResult('snapshot', largeSnapshot),
    ]

    const actual = getCurrentTokenCount(singleStep(150_000, 500), msgs, config)

    // Must land well above 150K — the old code returned the stale 150K.
    expect(actual).toBeGreaterThan(170_000)
  })

  it('counts multiple trailing tool results', () => {
    const clickOutput = 'x'.repeat(4_000)
    const snapshotOutput = 'y'.repeat(8_000)
    const msgs = [
      userMsg('do things'),
      assistantToolCall('click', { selector: '#btn' }),
      toolResult('click', clickOutput),
      toolResult('snapshot', snapshotOutput),
    ]

    const actual = getCurrentTokenCount(singleStep(80_000, 1_000), msgs, config)

    // Each trailing result is estimated on its own and then summed.
    const clickTokens = estimateOf([toolResult('click', clickOutput)])
    const snapshotTokens = estimateOf([toolResult('snapshot', snapshotOutput)])
    expect(actual).toBe(80_000 + 1_000 + clickTokens + snapshotTokens)
  })

  it('counts the synthetic follow-up user media message too', () => {
    const rawMessages = [
      userMsg('hello'),
      assistantToolCall('snapshot', {}),
      toolResultContent('snapshot', [
        { type: 'text', text: 'Captured screenshot' },
        { type: 'image-data', data: 'abcd', mediaType: 'image/png' },
      ]),
    ]
    // Images are supported, but not inside tool results.
    const msgs = normalizeMessagesForModel(rawMessages, {
      supportsImages: true,
      supportsMediaInToolResults: false,
    })

    const actual = getCurrentTokenCount(singleStep(50_000, 500), msgs, config)

    // The last two normalized messages are the ones expected to be counted.
    const trailingTokens = estimateOf(msgs.slice(-2))
    expect(actual).toBe(50_000 + 500 + trailingTokens)
  })

  it('stops counting trailing at the most recent assistant message', () => {
    const msgs = [
      userMsg('hello'),
      assistantToolCall('click', {}),
      toolResult('click', 'x'.repeat(4_000)),
      assistantMsg('done'),
    ]

    const actual = getCurrentTokenCount(singleStep(50_000, 500), msgs, config)

    // Nothing follows the final assistant message, so only usage is counted.
    expect(actual).toBe(50_500)
  })

  it('handles zero outputTokens gracefully', () => {
    // outputTokens is deliberately left off the usage object.
    const stepsWithoutOutput: StepWithUsage[] = [
      { usage: { inputTokens: 50_000 } },
    ]
    const msgs = [userMsg('hello')]

    const actual = getCurrentTokenCount(stepsWithoutOutput, msgs, config)

    expect(actual).toBe(50_000)
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Context overflow middleware
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Tests for createContextOverflowMiddleware(contextWindow) and
 * isContextOverflowError(error).
 *
 * The middleware hooks (wrapGenerate / wrapStream) are driven with stubbed
 * doGenerate / doStream callbacks. The stubs read `params.prompt` from the
 * enclosing scope, so the retry tests observe whatever prompt is present on
 * the params object at the time of each attempt.
 */
describe('createContextOverflowMiddleware', () => {
  it('passes through when model succeeds', async () => {
    const middleware = createContextOverflowMiddleware(200_000)
    const modelResult = { text: 'hello' }
    const params = {
      prompt: [
        { role: 'system', content: 'You are helpful' },
        { role: 'user', content: 'hi' },
      ],
    }

    const returned = await middleware.wrapGenerate?.({
      doGenerate: async () => modelResult,
      params,
    } as any)

    // Identity check: the very same result object must come back.
    expect(returned).toBe(modelResult)
  })

  it('rethrows non-context errors', async () => {
    const middleware = createContextOverflowMiddleware(200_000)
    const params = {
      prompt: [{ role: 'user', content: 'hi' }],
    }
    const failingGenerate = async () => {
      throw new Error('network timeout')
    }

    const pending = middleware.wrapGenerate?.({
      doGenerate: failingGenerate,
      params,
    } as any)

    await expect(pending).rejects.toThrow('network timeout')
  })

  it('truncates and retries on context_length error', async () => {
    const middleware = createContextOverflowMiddleware(200_000)
    let attempts = 0
    const params = {
      prompt: [
        { role: 'system', content: 'system prompt' },
        { role: 'user', content: 'old message 1' },
        { role: 'assistant', content: 'old response 1' },
        { role: 'user', content: 'old message 2' },
        { role: 'assistant', content: 'old response 2' },
        { role: 'user', content: 'recent message' },
      ],
    }

    const returned = await middleware.wrapGenerate?.({
      doGenerate: async () => {
        attempts += 1
        // Only the first attempt overflows; the retry succeeds.
        if (attempts === 1) {
          throw new Error('context_length_exceeded')
        }
        return { text: 'success after truncation' }
      },
      params,
    } as any)

    expect(attempts).toBe(2)
    expect(returned).toEqual({ text: 'success after truncation' })
    // The system message has to survive truncation.
    const hasSystemMessage = params.prompt.some(
      (message: any) => message.role === 'system',
    )
    expect(hasSystemMessage).toBe(true)
    // The prompt must not have grown beyond its original six entries.
    expect(params.prompt.length).toBeLessThanOrEqual(6)
  })

  it('preserves system messages during truncation', async () => {
    const middleware = createContextOverflowMiddleware(10_000)
    let observedPrompt: any[] = []
    const params = {
      prompt: [
        { role: 'system', content: 'important system prompt' },
        { role: 'user', content: 'a'.repeat(50_000) },
        { role: 'assistant', content: 'b'.repeat(50_000) },
        { role: 'user', content: 'recent' },
      ],
    }

    await middleware.wrapGenerate?.({
      doGenerate: async () => {
        // An empty snapshot means this is the first attempt, which overflows.
        const isFirstAttempt = observedPrompt.length === 0
        observedPrompt = [...params.prompt]
        if (isFirstAttempt) {
          throw new Error('maximum context length exceeded')
        }
        return { text: 'ok' }
      },
      params,
    } as any)

    const systemMessages = observedPrompt.filter(
      (message: any) => message.role === 'system',
    )
    expect(systemMessages.length).toBe(1)
    expect(systemMessages[0].content).toBe('important system prompt')
  })

  it('handles wrapStream the same way', async () => {
    const middleware = createContextOverflowMiddleware(200_000)
    let attempts = 0
    const params = {
      prompt: [
        { role: 'system', content: 'system' },
        { role: 'user', content: 'message' },
      ],
    }

    const returned = await middleware.wrapStream?.({
      doStream: async () => {
        attempts += 1
        if (attempts === 1) {
          throw new Error('token limit exceeded')
        }
        return { stream: 'mock-stream' }
      },
      params,
    } as any)

    expect(attempts).toBe(2)
    expect(returned).toEqual({ stream: 'mock-stream' })
  })

  it('detects provider-specific context overflow errors', async () => {
    const middleware = createContextOverflowMiddleware(200_000)
    const overflowMessages = [
      'context_length_exceeded', // Generic
      'prompt is too long: 213462 tokens > 200000 maximum', // Anthropic
      'Your input exceeds the context window of this model', // OpenAI
      'The input token count (1196265) exceeds the maximum number of tokens allowed', // Google
      "This model's maximum prompt length is 131072 but the request contains 537812 tokens", // xAI
      'Please reduce the length of the messages or completion', // Groq
      'maximum context length is 128000 tokens', // OpenRouter
      'token limit exceeded', // Generic
      'too many tokens', // Generic
      'exceeded model token limit', // Kimi
      'input is too long for requested model', // Amazon Bedrock
    ]

    for (const overflowMessage of overflowMessages) {
      let attempts = 0
      const params = {
        prompt: [{ role: 'user', content: 'hi' }],
      }

      await middleware.wrapGenerate?.({
        doGenerate: async () => {
          attempts += 1
          if (attempts === 1) throw new Error(overflowMessage)
          return { text: 'ok' }
        },
        params,
      } as any)

      // A second call proves the error was classified as overflow and retried.
      expect(attempts).toBe(2)
    }
  })

  it('does not false-positive on unrelated errors', () => {
    // Messages that mention length or limits but are not context overflow.
    const unrelatedMessages = [
      'URL is too long',
      'Invalid max_tokens: must be between 1 and 4096',
      'session token is too long',
      'file name is too long',
      'network timeout',
      'rate limit exceeded',
    ]

    unrelatedMessages.forEach((message) => {
      expect(isContextOverflowError(new Error(message))).toBe(false)
    })
  })

  it('keeps at least the last non-system message when it exceeds target', async () => {
    const middleware = createContextOverflowMiddleware(1_000)
    let observedPrompt: any[] = []
    const params = {
      prompt: [
        { role: 'system', content: 'system' },
        { role: 'user', content: 'x'.repeat(100_000) },
      ],
    }

    await middleware.wrapGenerate?.({
      doGenerate: async () => {
        // An empty snapshot means this is the first attempt, which overflows.
        const isFirstAttempt = observedPrompt.length === 0
        observedPrompt = [...params.prompt]
        if (isFirstAttempt) {
          throw new Error('context_length_exceeded')
        }
        return { text: 'ok' }
      },
      params,
    } as any)

    // System message plus the last user message must both remain (not empty).
    expect(observedPrompt.length).toBe(2)
    const [first, second] = observedPrompt
    expect(first.role).toBe('system')
    expect(second.role).toBe('user')
  })
})
|