mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-21 04:45:12 +00:00
* fix: run full browseros-agent test suite * fix: stabilize server test reporting in CI * fix: address PR review feedback * refactor: extract server core test runner * refactor: group server tests by filesystem * fix: align CI suites with server test groups * fix: provision server env for all CI suites * fix: stabilize ci checks * fix: report real test counts in ci
1453 lines
43 KiB
TypeScript
1453 lines
43 KiB
TypeScript
import { describe, expect, it } from 'bun:test'
|
|
import type {
|
|
LanguageModelV3CallOptions,
|
|
LanguageModelV3GenerateResult,
|
|
LanguageModelV3StreamPart,
|
|
LanguageModelV3Usage,
|
|
} from '@ai-sdk/provider'
|
|
import {
|
|
generateText,
|
|
type ModelMessage,
|
|
stepCountIs,
|
|
type ToolResultPart,
|
|
tool,
|
|
} from 'ai'
|
|
import { MockLanguageModelV3 } from 'ai/test'
|
|
import { z } from 'zod'
|
|
import {
|
|
type CompactionState,
|
|
computeConfig,
|
|
createCompactionPrepareStep,
|
|
estimateTokensForThreshold,
|
|
reduceToolOutputs,
|
|
} from '../../src/agent/compaction'
|
|
import { normalizeMessagesForModel } from '../../src/agent/message-normalization'
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Test infrastructure
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// biome-ignore lint/suspicious/noExplicitAny: test stubs for AI SDK internal types
|
|
type StepsStub = any // loose stand-in used to cast hand-written `steps` arrays passed to prepareStep in these tests
|
|
|
|
/**
 * Builds a usage record for mock model responses.
 *
 * @param inputTotal - reported both as the input total and as the non-cached input count.
 * @param outputTotal - reported as the output total (defaults to 50).
 * @returns a usage object whose cache and reasoning counters are left undefined.
 */
function usage(inputTotal: number, outputTotal = 50): LanguageModelV3Usage {
  const inputTokens = {
    total: inputTotal,
    noCache: inputTotal,
    cacheRead: undefined,
    cacheWrite: undefined,
  }
  const outputTokens = { total: outputTotal, reasoning: undefined }
  return { inputTokens, outputTokens }
}
|
|
|
|
/**
 * Replays a non-streaming generate result as a stream of language-model stream parts.
 *
 * Each text content part is emitted as a `text-start` / `text-delta` / `text-end`
 * triple sharing one id, so consumers that track text blocks by id can reassemble
 * the text. Each tool-call part is forwarded with its input serialized to a JSON
 * string (string inputs pass through unchanged). Other content part types are
 * skipped. A single `finish` part carrying the result's finish reason and usage
 * closes the stream.
 *
 * @param result - the generate result to replay.
 * @returns a stream that is fully populated and closed as soon as it starts.
 */
function resultToStream(
  result: LanguageModelV3GenerateResult,
): ReadableStream<LanguageModelV3StreamPart> {
  return new ReadableStream<LanguageModelV3StreamPart>({
    start(ctrl) {
      // Counter used to give every text block its own stable id.
      let textBlockIndex = 0

      for (const part of result.content) {
        if (part.type === 'text') {
          const id = `text-${textBlockIndex}`
          textBlockIndex += 1
          ctrl.enqueue({ type: 'text-start', id })
          ctrl.enqueue({ type: 'text-delta', id, delta: part.text })
          ctrl.enqueue({ type: 'text-end', id })
        } else if (part.type === 'tool-call') {
          const serializedInput =
            typeof part.input === 'string'
              ? part.input
              : JSON.stringify(part.input)
          ctrl.enqueue({
            type: 'tool-call',
            toolCallId: part.toolCallId,
            toolName: part.toolName,
            input: serializedInput,
          })
        }
      }

      ctrl.enqueue({
        type: 'finish',
        finishReason: result.finishReason,
        usage: result.usage,
      })
      ctrl.close()
    },
  })
}
|
|
|
|
/** Signature of a mock generate handler: receives the call options, resolves to a generate result. */
type DoGenerateFn = (
  callOptions: LanguageModelV3CallOptions,
) => Promise<LanguageModelV3GenerateResult>
|
|
|
|
/**
 * Creates a mock language model whose generate and stream paths share one handler.
 *
 * @param doGenerate - either a fixed result returned for every call, or a handler
 *   invoked with the call options.
 * @returns a MockLanguageModelV3 whose `doStream` replays the handler's result via
 *   `resultToStream`; if producing that result throws, `doStream` still resolves,
 *   but with a stream that errors with the thrown value.
 */
function createMock(
  doGenerate: LanguageModelV3GenerateResult | DoGenerateFn,
): InstanceType<typeof MockLanguageModelV3> {
  let generate: DoGenerateFn
  if (typeof doGenerate === 'function') {
    generate = doGenerate
  } else {
    const fixedResult = doGenerate
    generate = async () => fixedResult
  }

  // Stream that fails immediately with the given cause.
  const failingStream = (cause: unknown) =>
    new ReadableStream<LanguageModelV3StreamPart>({
      start(controller) {
        controller.error(cause)
      },
    })

  const doStream = async (options: LanguageModelV3CallOptions) => {
    let stream: ReadableStream<LanguageModelV3StreamPart>
    try {
      stream = resultToStream(await generate(options))
    } catch (cause) {
      stream = failingStream(cause)
    }
    return { stream }
  }

  return new MockLanguageModelV3({ doGenerate: generate, doStream })
}
|
|
|
|
/**
 * Builds a generate result holding a single text part that finished with `stop`.
 *
 * @param text - text of the single content part.
 * @param inputTokens - input token total forwarded to `usage`.
 */
function textResponse(
  text: string,
  inputTokens: number,
): LanguageModelV3GenerateResult {
  const response: LanguageModelV3GenerateResult = {
    content: [{ type: 'text', text }],
    finishReason: { unified: 'stop', raw: 'stop' },
    usage: usage(inputTokens),
  }
  return response
}
|
|
|
|
/**
 * Builds a generate result holding a single tool call that finished with `tool-calls`.
 *
 * @param toolName - name of the tool being called; also embedded in the call id.
 * @param input - tool input, serialized to JSON on the content part.
 * @param inputTokens - input token total forwarded to `usage`.
 * @returns a result whose tool call id is `call_<toolName>_<random base-36 suffix>`.
 */
function toolCallResponse(
  toolName: string,
  input: Record<string, unknown>,
  inputTokens: number,
): LanguageModelV3GenerateResult {
  // Random suffix keeps ids from repeated calls to the same tool apart.
  const randomSuffix = Math.random().toString(36).slice(2, 8)
  const toolCallId = `call_${toolName}_${randomSuffix}`
  const serializedInput = JSON.stringify(input)

  return {
    content: [
      { type: 'tool-call', toolCallId, toolName, input: serializedInput },
    ],
    finishReason: { unified: 'tool-calls', raw: 'tool_calls' },
    usage: usage(inputTokens),
  }
}
|
|
|
|
/**
 * Builds a text response containing a canned, sectioned markdown summary
 * (Goal, Constraints & Preferences, Progress, Key Decisions, Active State,
 * Next Steps, Critical Context).
 *
 * @param inputTokens - input token total forwarded to `textResponse`.
 */
function summaryResponse(inputTokens: number): LanguageModelV3GenerateResult {
  const summaryLines = [
    '## Goal',
    'Test task',
    '',
    '## Constraints & Preferences',
    '- (none)',
    '',
    '## Progress',
    '### Done',
    '- [x] Performed test actions',
    '',
    '### In Progress',
    '- [ ] Continue task',
    '',
    '### Blocked',
    '- (none)',
    '',
    '## Key Decisions',
    '- (none)',
    '',
    '## Active State',
    '- Page 1 open',
    '',
    '## Next Steps',
    '1. Continue',
    '',
    '## Critical Context',
    '- Test context',
  ]
  return textResponse(summaryLines.join('\n'), inputTokens)
}
|
|
|
|
/**
 * Builds a text response containing a canned turn-prefix summary
 * (Original Request, Early Progress, Context for Suffix).
 *
 * @param inputTokens - input token total forwarded to `textResponse`.
 */
function turnPrefixSummaryResponse(
  inputTokens: number,
): LanguageModelV3GenerateResult {
  const summaryLines = [
    '## Original Request',
    'User asked to perform a long task',
    '',
    '## Early Progress',
    '- Completed initial actions in the turn prefix',
    '',
    '## Context for Suffix',
    '- Context needed for the retained suffix',
  ]
  return textResponse(summaryLines.join('\n'), inputTokens)
}
|
|
|
|
/**
 * Collects all text carried by user messages in a prompt.
 *
 * String content is taken as-is; for array content, every object part with a
 * string `text` property contributes that text. Non-user messages are ignored.
 *
 * @param options - call options whose `prompt` is scanned.
 * @returns the collected texts joined with newlines ('' when there are none).
 */
function extractUserText(options: LanguageModelV3CallOptions): string {
  const texts = options.prompt.flatMap((message): string[] => {
    if (message.role !== 'user') return []

    const { content } = message
    if (typeof content === 'string') return [content]
    if (!Array.isArray(content)) return []

    const found: string[] = []
    for (const piece of content) {
      if (typeof piece !== 'object' || piece === null) continue
      if ('text' in piece && typeof piece.text === 'string') {
        found.push(piece.text)
      }
    }
    return found
  })
  return texts.join('\n')
}
|
|
|
|
/**
 * Reports whether the user-authored text of a prompt contains `needle`.
 *
 * @param options - call options whose user messages are searched.
 * @param needle - substring to look for.
 */
function promptContainsText(
  options: LanguageModelV3CallOptions,
  needle: string,
): boolean {
  const userText = extractUserText(options)
  return userText.indexOf(needle) !== -1
}
|
|
|
|
/**
 * Reports whether a model call carries a system message mentioning
 * "context summarization assistant".
 *
 * System content may be a plain string or an array of parts; for arrays, any
 * part with a string `text` containing the marker counts.
 *
 * @param options - call options whose system messages are inspected.
 */
function isSummarizationCall(options: LanguageModelV3CallOptions): boolean {
  const marker = 'context summarization assistant'

  return options.prompt.some((message) => {
    if (message.role !== 'system') return false

    const body: unknown = message.content
    if (typeof body === 'string') return body.includes(marker)
    if (!Array.isArray(body)) return false

    return body.some(
      (piece: { type?: string; text?: string }) =>
        'text' in piece &&
        typeof piece.text === 'string' &&
        piece.text.includes(marker),
    )
  })
}
|
|
|
|
/** Reports whether the prompt's user text contains the phrase "PREFIX of a turn". */
function isTurnPrefixCall(options: LanguageModelV3CallOptions): boolean {
  const turnPrefixMarker = 'PREFIX of a turn'
  return promptContainsText(options, turnPrefixMarker)
}
|
|
|
|
/**
 * Builds a conversation made of tool call/result exchanges.
 *
 * The conversation opens with one user message; each exchange then appends an
 * assistant tool call (`action_<i>` / `call_<i>`), its text tool result
 * (`Result <i>: ` followed by `outputChars` x characters) and a short assistant
 * confirmation.
 *
 * @param exchangeCount - number of exchanges to append.
 * @param outputChars - filler length of each tool result (defaults to 1000).
 */
function buildModerateMessages(
  exchangeCount: number,
  outputChars = 1000,
): ModelMessage[] {
  const conversation: ModelMessage[] = []
  conversation.push({ role: 'user', content: 'Do a multi-step browser task' })

  let step = 0
  while (step < exchangeCount) {
    const toolCallId = `call_${step}`
    const toolName = `action_${step}`

    const call: ModelMessage = {
      role: 'assistant',
      content: [{ type: 'tool-call', toolCallId, toolName, input: { step } }],
    }
    const outcome: ModelMessage = {
      role: 'tool',
      content: [
        {
          type: 'tool-result',
          toolCallId,
          toolName,
          output: {
            type: 'text' as const,
            value: `Result ${step}: ${'x'.repeat(outputChars)}`,
          },
        },
      ],
    }
    const confirmation: ModelMessage = {
      role: 'assistant',
      content: `Step ${step} done.`,
    }

    conversation.push(call, outcome, confirmation)
    step += 1
  }

  return conversation
}
|
|
|
|
/**
 * Wraps a content-type tool output in a `tool` role message.
 *
 * @param toolName - tool name; the tool call id is derived as `call_<toolName>`.
 * @param value - content parts placed in the `content`-type output.
 */
function toolResultContent(
  toolName: string,
  value: Extract<ToolResultPart['output'], { type: 'content' }>['value'],
): ModelMessage {
  const resultPart = {
    type: 'tool-result' as const,
    toolCallId: `call_${toolName}`,
    toolName,
    output: { type: 'content' as const, value },
  }
  return { role: 'tool', content: [resultPart] }
}
|
|
|
|
/**
 * Builds a conversation of plain-text user/assistant exchanges with no tool calls.
 *
 * With no tool calls or tool outputs present, pruning and output reduction have
 * nothing to remove, so large conversations built here are meant to force the
 * LLM summarization path.
 *
 * @param exchangeCount - number of question/analysis pairs appended after the opening user message.
 * @param charsPerMessage - filler length of every question and every analysis.
 */
function buildTextHeavyMessages(
  exchangeCount: number,
  charsPerMessage: number,
): ModelMessage[] {
  const conversation: ModelMessage[] = []
  conversation.push({ role: 'user', content: 'Do a multi-step analysis task' })

  let round = 0
  while (round < exchangeCount) {
    const question: ModelMessage = {
      role: 'user',
      content: `Question ${round}: ${'q'.repeat(charsPerMessage)}`,
    }
    const analysis: ModelMessage = {
      role: 'assistant',
      content: `Analysis ${round}: ${'a'.repeat(charsPerMessage)}`,
    }
    conversation.push(question, analysis)
    round += 1
  }

  return conversation
}
|
|
|
|
/**
 * Tool set used by tests that run a full generateText loop.
 *
 * Each tool returns a deliberately long string so that a few calls are enough
 * to fill a small context window. The zod schemas are supplied under
 * `inputSchema`, the key `tool()` reads for tool input in the AI SDK generation
 * this file targets (the older `parameters` key left the tools without a schema).
 */
const testTools = {
  get_page_content: tool({
    description: 'Gets page content',
    inputSchema: z.object({ pageId: z.number() }),
    execute: async ({ pageId }) =>
      `Page ${pageId}: ${'Lorem ipsum dolor sit amet. '.repeat(1000)}`,
  }),
  click_element: tool({
    description: 'Clicks an element',
    inputSchema: z.object({ selector: z.string() }),
    execute: async ({ selector }) =>
      `Clicked ${selector}: ${'Result data. '.repeat(500)}`,
  }),
  navigate_to: tool({
    description: 'Navigate to URL',
    inputSchema: z.object({ url: z.string() }),
    execute: async ({ url }) =>
      `Navigated to ${url}: ${'Page content. '.repeat(500)}`,
  }),
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// E2E: prepareStep integration — trigger & no-trigger
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('compaction E2E — trigger logic', () => {
  /** Reads the compaction counter from the state returned by prepareStep. */
  const compactionCountOf = (outcome: {
    experimental_context?: unknown
  }): number =>
    (outcome.experimental_context as CompactionState).compactionCount

  /** A single previous step reporting the given real input token usage. */
  const stepsReporting = (inputTokens: number): StepsStub => [
    { usage: { inputTokens } },
  ]

  /** Token count at which compaction triggers for the given window. */
  const triggerThresholdFor = (contextWindow: number): number =>
    Math.floor(contextWindow * computeConfig(contextWindow).triggerRatio)

  /** User request plus the assistant's `snapshot` tool call. */
  const snapshotRequest = (): ModelMessage[] => [
    { role: 'user', content: 'Take a screenshot' },
    {
      role: 'assistant',
      content: [
        {
          type: 'tool-call',
          toolCallId: 'call_snapshot',
          toolName: 'snapshot',
          input: {},
        },
      ],
    },
  ]

  it('does NOT compact when real usage is below trigger', async () => {
    const prepareStep = createCompactionPrepareStep({ contextWindow: 200_000 })
    const model = createMock(textResponse('unused', 100))

    const result = await prepareStep({
      messages: [
        { role: 'user', content: 'hello' },
        { role: 'assistant', content: 'hi there' },
      ],
      steps: stepsReporting(500),
      model,
      experimental_context: null,
    })

    expect(result.messages.length).toBe(2)
    expect(compactionCountOf(result)).toBe(0)
  })

  it('compacts when real usage exceeds trigger (10K window, text-heavy exchanges)', async () => {
    const contextWindow = 10_000
    const prepareStep = createCompactionPrepareStep({ contextWindow })
    const overTrigger = triggerThresholdFor(contextWindow) + 1000
    const model = createMock(async () => summaryResponse(200))
    const history = buildTextHeavyMessages(8, 2000)

    const result = await prepareStep({
      messages: history,
      steps: stepsReporting(overTrigger),
      model,
      experimental_context: null,
    })

    const state = result.experimental_context as CompactionState
    expect(state.compactionCount).toBe(1)
    expect(state.existingSummary).toBeTruthy()
    expect(result.messages.length).toBeLessThan(history.length)
    // The first surviving message carries the generated summary.
    expect(result.messages[0].content as string).toContain('## Goal')
  })

  it('uses estimation with safety multiplier on step 0 (no real usage)', async () => {
    const prepareStep = createCompactionPrepareStep({ contextWindow: 10_000 })
    const model = createMock(async () => summaryResponse(200))
    const history = buildTextHeavyMessages(8, 2000)

    // No previous steps, so there is no real usage to read.
    const result = await prepareStep({
      messages: history,
      steps: [] as StepsStub,
      model,
      experimental_context: null,
    })

    expect(compactionCountOf(result)).toBe(1)
  })

  it('does NOT compact on step 0 when messages are small', async () => {
    const prepareStep = createCompactionPrepareStep({ contextWindow: 200_000 })
    const model = createMock(async () => summaryResponse(200))

    const result = await prepareStep({
      messages: [
        { role: 'user', content: 'hello' },
        { role: 'assistant', content: 'hi' },
      ],
      steps: [] as StepsStub,
      model,
      experimental_context: null,
    })

    expect(compactionCountOf(result)).toBe(0)
  })

  it('preserves agent-normalized media messages when compaction does not trigger', async () => {
    const prepareStep = createCompactionPrepareStep({ contextWindow: 200_000 })
    const model = createMock(textResponse('unused', 100))

    // Normalize for a model that accepts images, but not inside tool results.
    const normalizedMessages = normalizeMessagesForModel(
      [
        ...snapshotRequest(),
        toolResultContent('snapshot', [
          { type: 'text', text: 'Captured screenshot' },
          { type: 'image-data', data: 'abcd', mediaType: 'image/png' },
        ]),
      ],
      { supportsImages: true, supportsMediaInToolResults: false },
    )

    const result = await prepareStep({
      messages: normalizedMessages,
      steps: [] as StepsStub,
      model,
      experimental_context: null,
    })

    expect(compactionCountOf(result)).toBe(0)
    expect(result.messages).toHaveLength(4)

    const toolOutput = (
      result.messages[2].content as Array<{
        output: { type: string; value: string }
      }>
    )[0].output
    expect(toolOutput.type).toBe('text')

    // The image is expected in a trailing user message.
    const mediaMessage = result.messages[3]
    expect(mediaMessage.role).toBe('user')
    expect(Array.isArray(mediaMessage.content)).toBe(true)
    if (Array.isArray(mediaMessage.content)) {
      expect(mediaMessage.content[0]).toEqual({
        type: 'text',
        text: 'Attached image(s) from tool result:',
      })
      expect(mediaMessage.content[1]).toEqual({
        type: 'image',
        image: 'abcd',
        mediaType: 'image/png',
      })
    }
  })

  it('strips content tool-result media before pruning when that resolves the overflow', async () => {
    const contextWindow = 200_000
    const prepareStep = createCompactionPrepareStep({ contextWindow })
    const overTrigger = triggerThresholdFor(contextWindow) + 1_000
    const model = createMock(textResponse('unused', 100))

    const result = await prepareStep({
      messages: [
        ...snapshotRequest(),
        toolResultContent('snapshot', [
          { type: 'text', text: 'Captured screenshot' },
          {
            type: 'image-data',
            data: 'x'.repeat(200_000),
            mediaType: 'image/png',
          },
        ]),
      ],
      steps: [
        { usage: { inputTokens: overTrigger, outputTokens: 100 } },
      ] as StepsStub,
      model,
      experimental_context: null,
    })

    const output = (
      result.messages[2].content as Array<{
        output: { type: string; value: string }
      }>
    )[0].output

    expect(compactionCountOf(result)).toBe(0)
    expect(result.messages).toHaveLength(3)
    expect(output.type).toBe('text')
    expect(output.value).toContain('Captured screenshot')
    expect(output.value).toContain('[Image]')
    // The raw image payload must be gone from the output.
    expect(output.value).not.toContain('x'.repeat(100))
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// E2E: Token counting from steps
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('compaction E2E — token counting', () => {
  /** Reads the compaction counter from the state returned by prepareStep. */
  const compactionCountOf = (outcome: {
    experimental_context?: unknown
  }): number =>
    (outcome.experimental_context as CompactionState).compactionCount

  /** A single previous step reporting the given input token usage. */
  const stepsReporting = (inputTokens: number | undefined): StepsStub => [
    { usage: { inputTokens } },
  ]

  it('uses real inputTokens when available', async () => {
    const contextWindow = 10_000
    const prepareStep = createCompactionPrepareStep({ contextWindow })
    const { triggerRatio } = computeConfig(contextWindow)
    const threshold = Math.floor(contextWindow * triggerRatio)
    const model = createMock(async () => summaryResponse(200))
    const history = buildTextHeavyMessages(8, 2000)

    // One token under the trigger: no compaction expected.
    const below = await prepareStep({
      messages: history,
      steps: stepsReporting(threshold - 1),
      model,
      experimental_context: null,
    })
    expect(compactionCountOf(below)).toBe(0)

    // One token over the trigger: text-heavy history has nothing to prune,
    // so compaction is expected.
    const above = await prepareStep({
      messages: history,
      steps: stepsReporting(threshold + 1),
      model,
      experimental_context: null,
    })
    expect(compactionCountOf(above)).toBe(1)
  })

  it('falls back to estimation when usage has no inputTokens', async () => {
    const prepareStep = createCompactionPrepareStep({ contextWindow: 10_000 })
    const model = createMock(async () => summaryResponse(200))
    const history = buildTextHeavyMessages(8, 2000)

    const result = await prepareStep({
      messages: history,
      steps: stepsReporting(undefined),
      model,
      experimental_context: null,
    })

    expect(compactionCountOf(result)).toBe(1)
  })

  it('falls back to estimation when usage.inputTokens is 0', async () => {
    const prepareStep = createCompactionPrepareStep({ contextWindow: 10_000 })
    const model = createMock(async () => summaryResponse(200))
    const history = buildTextHeavyMessages(8, 2000)

    const result = await prepareStep({
      messages: history,
      steps: stepsReporting(0),
      model,
      experimental_context: null,
    })

    expect(compactionCountOf(result)).toBe(1)
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// E2E: Summarization quality and fallbacks
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('compaction E2E — summarization & fallbacks', () => {
  /**
   * Runs prepareStep over a text-heavy history in a 10K window, with reported
   * usage 1000 tokens above the trigger, using the supplied summarizer model.
   */
  const runOverTrigger = async (model: ReturnType<typeof createMock>) => {
    const contextWindow = 10_000
    const prepareStep = createCompactionPrepareStep({ contextWindow })
    const { triggerRatio } = computeConfig(contextWindow)
    const threshold = Math.floor(contextWindow * triggerRatio)
    const history = buildTextHeavyMessages(8, 2000)

    const result = await prepareStep({
      messages: history,
      steps: [{ usage: { inputTokens: threshold + 1000 } }] as StepsStub,
      model,
      experimental_context: null,
    })

    return { history, result }
  }

  it('falls back to sliding window when summarization throws', async () => {
    const brokenModel = createMock(async () => {
      throw new Error('Model unavailable')
    })

    const { history, result } = await runOverTrigger(brokenModel)

    const state = result.experimental_context as CompactionState
    expect(state.compactionCount).toBe(0)
    expect(state.existingSummary).toBeNull()
    expect(result.messages.length).toBeLessThanOrEqual(history.length)
  })

  it('falls back when summary is inflated (larger than original)', async () => {
    const verboseModel = createMock(async () =>
      textResponse('x'.repeat(100_000), 200),
    )

    const { result } = await runOverTrigger(verboseModel)

    const state = result.experimental_context as CompactionState
    expect(state.compactionCount).toBe(0)
  })

  it('falls back when summary is empty', async () => {
    const silentModel = createMock(async () => textResponse('', 200))

    const { result } = await runOverTrigger(silentModel)

    const state = result.experimental_context as CompactionState
    expect(state.compactionCount).toBe(0)
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// E2E: Iterative compaction
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('compaction E2E — iterative compaction', () => {
  /** Token count at which compaction triggers for the given window. */
  const triggerThresholdFor = (contextWindow: number): number =>
    Math.floor(contextWindow * computeConfig(contextWindow).triggerRatio)

  /** A single previous step reporting the given real input token usage. */
  const stepsReporting = (inputTokens: number): StepsStub => [
    { usage: { inputTokens } },
  ]

  it('sends UPDATE prompt with previous summary on second compaction', async () => {
    const contextWindow = 10_000
    const prepareStep = createCompactionPrepareStep({ contextWindow })
    const overTrigger = triggerThresholdFor(contextWindow) + 1000

    // Flipped whenever a model call's user text embeds a previous summary.
    let sawPreviousSummary = false
    const model = createMock(async (options) => {
      if (promptContainsText(options, '<previous_summary>')) {
        sawPreviousSummary = true
      }
      return summaryResponse(200)
    })

    // First compaction: there is no earlier summary to pass along.
    const firstRun = await prepareStep({
      messages: buildTextHeavyMessages(8, 2000),
      steps: stepsReporting(overTrigger),
      model,
      experimental_context: null,
    })
    const firstState = firstRun.experimental_context as CompactionState
    expect(firstState.compactionCount).toBe(1)
    expect(sawPreviousSummary).toBe(false)

    // Second compaction: compacted output plus fresh exchanges (minus their
    // opening user message), with the first run's state carried over.
    sawPreviousSummary = false
    const grownHistory: ModelMessage[] = [
      ...firstRun.messages,
      ...buildTextHeavyMessages(6, 2000).slice(1),
    ]
    const secondRun = await prepareStep({
      messages: grownHistory,
      steps: stepsReporting(overTrigger),
      model,
      experimental_context: firstState,
    })

    const secondState = secondRun.experimental_context as CompactionState
    expect(secondState.compactionCount).toBe(2)
    expect(sawPreviousSummary).toBe(true)
  })

  it('state persists across non-compaction steps', async () => {
    const contextWindow = 10_000
    const prepareStep = createCompactionPrepareStep({ contextWindow })
    const overTrigger = triggerThresholdFor(contextWindow) + 1000
    const model = createMock(async () => summaryResponse(200))

    const compacted = await prepareStep({
      messages: buildTextHeavyMessages(8, 2000),
      steps: stepsReporting(overTrigger),
      model,
      experimental_context: null,
    })
    const stateAfterCompaction =
      compacted.experimental_context as CompactionState
    expect(stateAfterCompaction.compactionCount).toBe(1)

    // Low reported usage: this step should leave the state as it was.
    const quietStep = await prepareStep({
      messages: compacted.messages,
      steps: stepsReporting(500),
      model,
      experimental_context: stateAfterCompaction,
    })
    const stateAfterQuietStep =
      quietStep.experimental_context as CompactionState
    expect(stateAfterQuietStep.compactionCount).toBe(1)
    expect(stateAfterQuietStep.existingSummary).toBeTruthy()
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// E2E: Tool output handling in the pipeline
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('compaction E2E — tool output truncation', () => {
  /** Conversation with exactly one tool call and its text result. */
  const singleToolExchange = (spec: {
    ask: string
    toolCallId: string
    toolName: string
    input: Record<string, unknown>
    output: string
    reply: string
  }): ModelMessage[] => [
    { role: 'user', content: spec.ask },
    {
      role: 'assistant',
      content: [
        {
          type: 'tool-call',
          toolCallId: spec.toolCallId,
          toolName: spec.toolName,
          input: spec.input,
        },
      ],
    },
    {
      role: 'tool',
      content: [
        {
          type: 'tool-result',
          toolCallId: spec.toolCallId,
          toolName: spec.toolName,
          output: { type: 'text' as const, value: spec.output },
        },
      ],
    },
    { role: 'assistant', content: spec.reply },
  ]

  it('preserves small tool outputs when compaction does not run', async () => {
    const prepareStep = createCompactionPrepareStep({ contextWindow: 50_000 })
    const model = createMock(async () => summaryResponse(200))

    // 10K characters stays under the 15K cap, so Stage 0 should not truncate.
    const messages = singleToolExchange({
      ask: 'Get the page',
      toolCallId: 'call_1',
      toolName: 'get_page_content',
      input: { pageId: 1 },
      output: 'x'.repeat(10_000),
      reply: 'Got the content',
    })

    const result = await prepareStep({
      messages,
      steps: [{ usage: { inputTokens: 5000 } }] as StepsStub,
      model,
      experimental_context: null,
    })

    const toolMsg = result.messages.find((m) => m.role === 'tool')
    expect(toolMsg).toBeDefined()
    const content = toolMsg?.content as Array<{ output: { value: string } }>
    expect(content[0].output.value.length).toBe(10_000)
    expect(content[0].output.value).not.toContain('[... truncated')
  })

  it('returns messages untouched when under threshold (no truncation)', async () => {
    const prepareStep = createCompactionPrepareStep({ contextWindow: 200_000 })
    const model = createMock(async () => summaryResponse(200))

    const messages = singleToolExchange({
      ask: 'Get pages',
      toolCallId: 'call_0',
      toolName: 'get_page',
      input: { id: 0 },
      output: 'x'.repeat(50_000),
      reply: 'Got it.',
    })

    const result = await prepareStep({
      messages,
      steps: [{ usage: { inputTokens: 5000 } }] as StepsStub,
      model,
      experimental_context: null,
    })

    const state = result.experimental_context as CompactionState
    expect(state.compactionCount).toBe(0)

    // Usage is below the trigger, so even the large output must come back whole.
    const toolMsg = result.messages.find((m) => m.role === 'tool')
    expect(toolMsg).toBeDefined()
    const content = toolMsg?.content as Array<{ output: { value: string } }>
    expect(content[0].output.value.length).toBe(50_000)
    expect(content[0].output.value).not.toContain('[... truncated')
  })

  it('Stages 2+3 clear tool outputs before LLM summarization sees them', async () => {
    // Tool-call-heavy history: pruning and clearing are expected to remove or
    // replace the tool outputs before Stage 4 would run.
    const contextWindow = 10_000
    const prepareStep = createCompactionPrepareStep({ contextWindow })
    const { triggerRatio } = computeConfig(contextWindow)
    const threshold = Math.floor(contextWindow * triggerRatio)

    let summarizationCalled = false
    const model = createMock(async (options) => {
      if (!isSummarizationCall(options)) {
        return textResponse('done', 100)
      }
      summarizationCalled = true
      return summaryResponse(200)
    })

    const messages = buildModerateMessages(8, 2000)

    const result = await prepareStep({
      messages,
      steps: [{ usage: { inputTokens: threshold + 1000 } }] as StepsStub,
      model,
      experimental_context: null,
    })

    // The earlier stages resolved the overflow, so no summarization call was made.
    const state = result.experimental_context as CompactionState
    expect(state.compactionCount).toBe(0)
    expect(summarizationCalled).toBe(false)
  })
})
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// E2E: Pruning and output reduction
|
|
// ---------------------------------------------------------------------------
|
|
|
|
describe('compaction E2E — pruning and output reduction', () => {
|
|
it('Stage 2 (pruneMessages) resolves overflow without LLM summarization', async () => {
  const contextWindow = 10_000
  const prepareStep = createCompactionPrepareStep({ contextWindow })
  const { triggerRatio } = computeConfig(contextWindow)
  const threshold = Math.floor(contextWindow * triggerRatio)

  // Records whether any model call was a summarization request.
  let summarizationCalled = false
  const model = createMock(async (options) => {
    summarizationCalled = isSummarizationCall(options) || summarizationCalled
    return summaryResponse(200)
  })

  // Tool call/result pairs are prunable; what remains afterwards is short
  // text that should sit well under the threshold.
  const history = buildModerateMessages(8, 2000)

  const result = await prepareStep({
    messages: history,
    steps: [{ usage: { inputTokens: threshold + 1000 } }] as StepsStub,
    model,
    experimental_context: null,
  })

  // Pruning alone resolved the overflow, so no LLM compaction took place.
  const state = result.experimental_context as CompactionState
  expect(state.compactionCount).toBe(0)
  expect(summarizationCalled).toBe(false)
  // Pruning must never grow the message list.
  expect(result.messages.length).toBeLessThanOrEqual(history.length)
})
|
|
|
|
it('output reduction clears older outputs and truncates protected recent ones', async () => {
  /** An assistant tool call followed by its 500-character text result. */
  const exchange = (
    toolCallId: string,
    toolName: string,
    step: number,
    fill: string,
  ): ModelMessage[] => [
    {
      role: 'assistant',
      content: [{ type: 'tool-call', toolCallId, toolName, input: { step } }],
    },
    {
      role: 'tool',
      content: [
        {
          type: 'tool-result',
          toolCallId,
          toolName,
          output: { type: 'text' as const, value: fill.repeat(500) },
        },
      ],
    },
  ]

  const conversation: ModelMessage[] = [
    { role: 'user', content: 'Do tasks' },
    ...exchange('call_old', 'action_old', 0, 'x'),
    ...exchange('call_recent_0', 'action_1', 1, 'y'),
    ...exchange('call_recent_1', 'action_2', 2, 'z'),
  ]

  const reduced = reduceToolOutputs(conversation, {
    maxChars: 200,
    keepRecentCount: 2,
    clearThreshold: 100,
  })
  const toolMsgs = reduced.filter((m) => m.role === 'tool') as Array<{
    content: Array<{ output: { value: string } }>
  }>

  // Oldest output is cleared; the two most recent are kept but truncated.
  expect(toolMsgs[0].content[0].output.value).toContain('[Cleared')
  expect(toolMsgs[1].content[0].output.value).toContain('[... truncated')
  expect(toolMsgs[2].content[0].output.value).toContain('[... truncated')
})
|
|
|
|
it('all 4 stages work together when only LLM summarization resolves overflow', async () => {
  const contextWindow = 10_000
  const prepareStep = createCompactionPrepareStep({ contextWindow })
  const { triggerRatio } = computeConfig(contextWindow)
  const threshold = Math.floor(contextWindow * triggerRatio)

  let summarizationCalled = false
  const model = createMock(async (options) => {
    if (!isSummarizationCall(options)) {
      return textResponse('done', 100)
    }
    summarizationCalled = true
    return summaryResponse(200)
  })

  // Plain text only: nothing to prune and no tool outputs to clear, which
  // leaves LLM summarization as the only way to shrink the history.
  const history = buildTextHeavyMessages(8, 2000)

  const result = await prepareStep({
    messages: history,
    steps: [{ usage: { inputTokens: threshold + 1000 } }] as StepsStub,
    model,
    experimental_context: null,
  })

  const state = result.experimental_context as CompactionState
  expect(state.compactionCount).toBe(1)
  expect(summarizationCalled).toBe(true)
  expect(state.existingSummary).toBeTruthy()
  expect(result.messages.length).toBeLessThan(history.length)
  expect(result.messages[0].content as string).toContain('## Goal')
})
|
|
|
|
it('reduceToolOutputs caps protected outputs at maxChars', () => {
  // A single tool result far larger than the cap. With keepRecentCount: 1 it
  // falls inside the "recent" window, yet is still expected to be truncated.
  const oversizedValue = 'x'.repeat(50_000)
  const conversation: ModelMessage[] = [
    {
      role: 'tool',
      content: [
        {
          type: 'tool-result',
          toolCallId: 'call_1',
          toolName: 'test',
          output: { type: 'text' as const, value: oversizedValue },
        },
      ],
    },
  ]

  const reduced = reduceToolOutputs(conversation, {
    maxChars: 15_000,
    keepRecentCount: 1,
  })

  const [firstPart] = reduced[0].content as Array<{
    output: { value: string }
  }>
  const cappedValue = firstPart.output.value

  // Truncation marker is present and the result stays under the loose 20k bound.
  expect(cappedValue).toContain('[... truncated')
  expect(cappedValue.length).toBeLessThan(20_000)
})
|
|
|
|
it('reduceToolOutputs clears older verbose outputs but protects last N', () => {
  // Builds a tool message holding one text tool-result with the given payload.
  const toolMessage = (toolCallId: string, value: string): ModelMessage => ({
    role: 'tool',
    content: [
      {
        type: 'tool-result',
        toolCallId,
        toolName: 'test',
        output: { type: 'text', value },
      },
    ],
  })

  // Oldest first: a 500-char output, then the two most recent outputs.
  const conversation: ModelMessage[] = [
    toolMessage('call_1', 'x'.repeat(500)),
    toolMessage('call_2', 'y'.repeat(200)),
    toolMessage('call_3', 'short'),
  ]

  const reduced = reduceToolOutputs(conversation, {
    maxChars: 300,
    keepRecentCount: 2,
    clearThreshold: 100,
  })

  // Reads the text value of the first content part of the message at `index`.
  const outputValueAt = (index: number): string => {
    const parts = reduced[index].content as Array<{
      output: { value: string }
    }>
    return parts[0].output.value
  }

  const oldestValue = outputValueAt(0)
  const middleValue = outputValueAt(1)
  const newestValue = outputValueAt(2)

  // The oldest output is replaced by a placeholder; the last two are untouched.
  expect(oldestValue).toBe('[Cleared — 500 chars]')
  expect(middleValue).toBe('y'.repeat(200))
  expect(newestValue).toBe('short')
})
|
|
|
|
it('estimateTokensForThreshold applies safety multiplier and overhead', () => {
  const thresholdConfig = computeConfig(10_000)
  const { safetyMultiplier, fixedOverhead } = thresholdConfig

  // One 3000-character user message; the expectation below treats it as
  // 1000 base tokens before the multiplier and fixed overhead are applied.
  const conversation: ModelMessage[] = [
    { role: 'user', content: 'x'.repeat(3000) },
  ]

  const expectedEstimate = Math.ceil(1000 * safetyMultiplier) + fixedOverhead
  const actualEstimate = estimateTokensForThreshold(
    conversation,
    thresholdConfig,
  )

  expect(actualEstimate).toBe(expectedEstimate)
})
|
|
})
|
|
|
|
// ---------------------------------------------------------------------------
// E2E: Full generateText with prepareStep at different context windows
// ---------------------------------------------------------------------------
|
|
|
|
describe('compaction E2E — generateText with tools and prepareStep', () => {
|
|
// Registers one end-to-end test per context window size. Each run drives
// generateText through a series of tool calls whose reported token usage ramps
// up to 1.2x the compaction trigger, then finishes with a plain text answer.
for (const contextWindow of [8_000, 16_000, 32_000, 200_000]) {
  // The largest window uses a longer conversation (8 tool calls instead of 4).
  const toolCallCount = contextWindow >= 200_000 ? 8 : 4

  it(`${(contextWindow / 1000).toFixed(0)}K context — multi-tool conversation with compaction`, async () => {
    const prepareStep = createCompactionPrepareStep({ contextWindow })
    const config = computeConfig(contextWindow)
    let stepCount = 0

    const model = createMock(async (options) => {
      // Summarization requests are answered directly and are not counted as
      // agent steps.
      if (isSummarizationCall(options)) {
        return summaryResponse(200)
      }

      stepCount++
      if (stepCount <= toolCallCount) {
        // Usage grows linearly with the step number and ends at 1.2x the
        // trigger threshold on the last tool call.
        const simulatedTokens = Math.floor(
          (stepCount / toolCallCount) *
            contextWindow *
            config.triggerRatio *
            1.2,
        )
        return toolCallResponse(
          'get_page_content',
          { pageId: stepCount },
          simulatedTokens,
        )
      }
      return textResponse('All pages processed successfully!', 5000)
    })

    const result = await generateText({
      model,
      system: 'You are a browser automation agent.',
      tools: testTools,
      stopWhen: stepCountIs(toolCallCount + 5),
      prepareStep,
      messages: [
        { role: 'user', content: `Get content from ${toolCallCount} pages` },
      ],
    })

    expect(result.text).toContain('All pages processed')
    expect(result.steps.length).toBeGreaterThanOrEqual(toolCallCount + 1)
    // Earlier stages (pruning/output reduction) may resolve overflow before
    // LLM summarization. For tool-call-heavy conversations this is expected,
    // so the test deliberately does not assert that summarization ran; it
    // only verifies that the conversation completed successfully.
  })
}
|
|
|
|
it('agent continues correctly after compaction (summary is injected as first message)', async () => {
  // NOTE: this test only verifies that the run reaches its final text answer
  // once usage has crossed the trigger threshold. It does not inspect the
  // prompt sent to the model, so the summary placement named in the title is
  // not asserted here.
  const contextWindow = 10_000
  const prepareStep = createCompactionPrepareStep({ contextWindow })
  const config = computeConfig(contextWindow)
  let stepCount = 0

  const model = createMock(async (options) => {
    // Summarization requests are answered directly and are not counted as
    // agent steps.
    if (isSummarizationCall(options)) {
      return summaryResponse(200)
    }

    stepCount++

    if (stepCount <= 3) {
      // Step 1 passes a small value (1000); steps 2 and 3 pass 1.5x the
      // trigger threshold as the third argument (presumably the reported
      // input token usage — confirm against toolCallResponse).
      const reportedTokens =
        stepCount >= 2
          ? Math.floor(contextWindow * config.triggerRatio * 1.5)
          : 1000
      return toolCallResponse(
        'navigate_to',
        { url: `https://page${stepCount}.com` },
        reportedTokens,
      )
    }
    return textResponse('Navigation complete!', 5000)
  })

  const result = await generateText({
    model,
    system: 'Navigate pages.',
    tools: testTools,
    stopWhen: stepCountIs(10),
    prepareStep,
    messages: [{ role: 'user', content: 'Navigate to 3 pages' }],
  })

  expect(result.text).toContain('Navigation complete')
})
|
|
|
|
it('tool call/result pairs are never orphaned after compaction', async () => {
  const windowSize = 8_000
  const clickTarget = 5
  const compactionStep = createCompactionPrepareStep({
    contextWindow: windowSize,
  })
  const thresholds = computeConfig(windowSize)

  let agentCalls = 0
  // Snapshot of every prompt the mock receives for a non-summarization call.
  const seenPrompts: LanguageModelV3CallOptions['prompt'][] = []

  const model = createMock(async (options) => {
    if (isSummarizationCall(options)) {
      return summaryResponse(200)
    }

    seenPrompts.push([...options.prompt])
    agentCalls++

    if (agentCalls > clickTarget) {
      return textResponse('Done!', 5000)
    }

    // Value passed as the third argument ramps up to 1.3x the trigger
    // threshold by the last click.
    const rampedTokens = Math.floor(
      (agentCalls / clickTarget) * windowSize * thresholds.triggerRatio * 1.3,
    )
    return toolCallResponse(
      'click_element',
      { selector: `#btn-${agentCalls}` },
      rampedTokens,
    )
  })

  const outcome = await generateText({
    model,
    system: 'Click buttons.',
    tools: testTools,
    stopWhen: stepCountIs(10),
    prepareStep: compactionStep,
    messages: [{ role: 'user', content: 'Click 5 buttons' }],
  })

  expect(outcome.text).toContain('Done!')

  // For every tool message in every recorded prompt, the closest preceding
  // non-system message (when one exists) must be an assistant or user message.
  for (const prompt of seenPrompts) {
    prompt.forEach((message, position) => {
      if (message.role !== 'tool') {
        return
      }
      const earlier = prompt
        .slice(0, position)
        .filter((m: { role: string }) => m.role !== 'system')
      if (earlier.length === 0) {
        return
      }
      const closest = earlier[earlier.length - 1]
      expect(['assistant', 'user']).toContain(closest.role)
    })
  }
})
|
|
})
|
|
|
|
// ---------------------------------------------------------------------------
// E2E: Split turn compaction
// ---------------------------------------------------------------------------
|
|
|
|
describe('compaction E2E — split turn handling', () => {
|
|
it('uses regular summarization for single massive turn (user at index 0)', async () => {
  const windowSize = 10_000
  const compactionStep = createCompactionPrepareStep({
    contextWindow: windowSize,
  })
  const { triggerRatio } = computeConfig(windowSize)
  const triggerThreshold = Math.floor(windowSize * triggerRatio)

  // Records which kind of summarization request the mock model received.
  const seen = { turnPrefix: false, history: false }

  const model = createMock(async (options) => {
    if (!isSummarizationCall(options)) {
      return textResponse('done', 100)
    }
    if (isTurnPrefixCall(options)) {
      seen.turnPrefix = true
      return turnPrefixSummaryResponse(200)
    }
    seen.history = true
    return summaryResponse(200)
  })

  // Text-heavy conversation (no tool calls to prune) that opens with a user
  // message at index 0, which means this is NOT treated as a split turn.
  const exchanges = Array.from({ length: 15 }, (_, i) => i).flatMap(
    (i): ModelMessage[] => [
      {
        role: 'assistant',
        content: `Analysis step ${i}: ${'a'.repeat(2000)}`,
      },
      {
        role: 'user',
        content: `Follow-up question ${i}: ${'q'.repeat(500)}`,
      },
    ],
  )
  const conversation: ModelMessage[] = [
    {
      role: 'user',
      content: 'Do a very long multi-step task with many actions',
    },
    ...exchanges,
  ]

  const outcome = await compactionStep({
    messages: conversation,
    steps: [{ usage: { inputTokens: triggerThreshold + 2000 } }] as StepsStub,
    model,
    experimental_context: null,
  })

  const compaction = outcome.experimental_context as CompactionState
  expect(compaction.compactionCount).toBe(1)
  expect(compaction.existingSummary).toBeTruthy()
  expect(outcome.messages.length).toBeLessThan(conversation.length)

  // Only the history summarizer may run; the turn-prefix path must stay unused.
  expect(seen.turnPrefix).toBe(false)
  expect(seen.history).toBe(true)
  expect(compaction.existingSummary).toContain('## Goal')
})
|
|
|
|
it('generates both history and turn prefix summaries for multi-turn split', async () => {
  const windowSize = 10_000
  const compactionStep = createCompactionPrepareStep({
    contextWindow: windowSize,
  })
  const { triggerRatio } = computeConfig(windowSize)
  const triggerThreshold = Math.floor(windowSize * triggerRatio)

  // Records which kind of summarization request the mock model received.
  const seen = { turnPrefix: false, history: false }

  const model = createMock(async (options) => {
    if (!isSummarizationCall(options)) {
      return textResponse('done', 100)
    }
    if (isTurnPrefixCall(options)) {
      seen.turnPrefix = true
      return turnPrefixSummaryResponse(200)
    }
    seen.history = true
    return summaryResponse(200)
  })

  // A short first turn (history) followed by a massive second turn.
  // Content is text-heavy so the pruning stages cannot resolve the overflow.
  const conversation: ModelMessage[] = [
    { role: 'user', content: `First analysis: ${'f'.repeat(3000)}` },
    {
      role: 'assistant',
      content: `First result: ${'r'.repeat(3000)}`,
    },
    // Start of the massive second turn
    { role: 'user', content: 'Now do a very long task with many steps' },
  ]

  // Twelve assistant steps, each followed by a user follow-up except the
  // last, so the conversation ends on an assistant message.
  const secondTurnSteps = 12
  for (let step = 0; step < secondTurnSteps; step++) {
    conversation.push({
      role: 'assistant',
      content: `Step ${step} analysis: ${'a'.repeat(2000)}`,
    })
    const isFinalStep = step === secondTurnSteps - 1
    if (!isFinalStep) {
      conversation.push({
        role: 'user',
        content: `Follow-up ${step}: ${'q'.repeat(500)}`,
      })
    }
  }

  const outcome = await compactionStep({
    messages: conversation,
    steps: [{ usage: { inputTokens: triggerThreshold + 2000 } }] as StepsStub,
    model,
    experimental_context: null,
  })

  const compaction = outcome.experimental_context as CompactionState
  expect(compaction.compactionCount).toBe(1)
  expect(compaction.existingSummary).toBeTruthy()

  // Both summarizers must run, and the stored summary must carry the
  // split-turn section heading.
  expect(seen.turnPrefix).toBe(true)
  expect(seen.history).toBe(true)
  expect(compaction.existingSummary).toContain('Turn Context (split turn)')
})
|
|
})
|