fix: improve rate limit handling and add prompt-too-long toast

- Add 2s deduplication window to prevent rate limit counter inflation from concurrent 429s
- Separate cooldown system from rate limits for non-429 errors (auth failures, 5xx)
- Add quota_fallback config option for automatic quota switching on rate limit
- Add toast notification for 400 'Prompt is too long' errors guiding users to /compact
- Add 5 new cooldown unit tests
- Enhance regression test suite with concurrent test infrastructure
- Add comprehensive rate limit analysis documentation
This commit is contained in:
tctinh
2025-12-28 16:09:52 +07:00
parent 93415be685
commit 457b3ac12b
9 changed files with 1614 additions and 78 deletions

View File

@@ -1,15 +1,23 @@
#!/usr/bin/env npx tsx
import { spawn } from "child_process";
type Category = "thinking-order" | "tool-pairing" | "multi-tool";
type Category = "thinking-order" | "tool-pairing" | "multi-tool" | "multi-provider" | "error-handling" | "stress" | "concurrency";
// Suite selector. Individual tests in this file are tagged "sanity" or "heavy";
// "all" is the default selection in parseArgs when neither --sanity nor --heavy is given.
type TestSuite = "sanity" | "heavy" | "all";
interface MultiTurnTest {
name: string;
model: string;
category: Category;
turns: string[];
suite: TestSuite;
turns: (string | TurnConfig)[];
errorPatterns: string[];
timeout: number;
expectError?: string;
}
/**
 * One turn of a multi-turn test when the turn needs more than a bare prompt string.
 */
interface TurnConfig {
// Text sent to the model for this turn.
prompt: string;
// Optional per-turn model override; runMultiTurnTest falls back to the test's `model` when omitted.
model?: string;
}
interface TestResult {
@@ -17,6 +25,18 @@ interface TestResult {
error?: string;
duration: number;
turnsCompleted: number;
sessionId?: string;
}
/**
 * Describes a test that fires several independent requests at the same time
 * (consumed by runConcurrentTest).
 */
interface ConcurrentTest {
// Test identifier; also matched against --test/--name and embedded in each request's label.
name: string;
// Always "concurrency"; lets these tests be told apart from MultiTurnTest entries.
category: "concurrency";
suite: TestSuite;
// Number of runTurn calls launched together via Promise.all.
concurrentRequests: number;
// Single model used for every request of the test.
model: string;
// Base prompt; runConcurrentTest appends " (request N)" to each copy.
prompt: string;
// Substrings searched for (case-insensitively) in each request's stderr; any hit fails the test.
errorPatterns: string[];
// Timeout value handed to every runTurn call (milliseconds, judging by the values used — confirm in runTurn).
timeout: number;
}
const ERROR_PATTERNS = [
@@ -32,60 +52,53 @@ const ERROR_PATTERNS = [
"must remain as they were",
];
const TESTS: MultiTurnTest[] = [
// Issue #50: Thinking block order bug - simple single-turn tool use
// Model identifiers used by the test tables below and passed as the `model` argument to runTurn.
const GEMINI_FLASH = "google/antigravity-gemini-3-flash";
// NOTE(review): not referenced anywhere in the visible part of this file; presumably the
// Gemini CLI quota variant of the flash model — confirm it is still needed.
const GEMINI_FLASH_CLI_QUOTA = "google/gemini-2.5-flash";
const CLAUDE_SONNET = "google/antigravity-claude-sonnet-4-5-thinking-low";
const CLAUDE_OPUS = "google/antigravity-claude-opus-4-5-thinking-low";
const SANITY_TESTS: MultiTurnTest[] = [
{
name: "thinking-tool-use",
model: "google/antigravity-claude-sonnet-4-5-thinking-low",
model: CLAUDE_SONNET,
category: "thinking-order",
turns: [
"Read package.json and tell me the package name",
],
suite: "sanity",
turns: ["Read package.json and tell me the package name"],
errorPatterns: ERROR_PATTERNS,
timeout: 90000,
},
{
name: "thinking-bash-tool",
model: "google/antigravity-claude-sonnet-4-5-thinking-low",
model: CLAUDE_SONNET,
category: "thinking-order",
turns: [
"Run: echo 'hello' and tell me the output",
],
suite: "sanity",
turns: ["Run: echo 'hello' and tell me the output"],
errorPatterns: ERROR_PATTERNS,
timeout: 90000,
},
// Tool pairing - simple two-turn
{
name: "tool-pairing-sequential",
model: "google/antigravity-claude-sonnet-4-5-thinking-low",
model: CLAUDE_SONNET,
category: "tool-pairing",
turns: [
"Run: echo 'first'",
"Run: echo 'second'",
],
suite: "sanity",
turns: ["Run: echo 'first'", "Run: echo 'second'"],
errorPatterns: ERROR_PATTERNS,
timeout: 120000,
},
// Opus model basic test
{
name: "opus-thinking-basic",
model: "google/antigravity-claude-opus-4-5-thinking-low",
model: CLAUDE_OPUS,
category: "thinking-order",
turns: [
"What is 7 * 8? Use bash to verify: echo $((7*8))",
],
suite: "sanity",
turns: ["What is 7 * 8? Use bash to verify: echo $((7*8))"],
errorPatterns: ERROR_PATTERNS,
timeout: 120000,
},
// Bug: "thinking blocks in latest assistant message cannot be modified"
// Tests multi-turn with thinking blocks to verify they're preserved unchanged
{
name: "thinking-modification-continue",
model: "google/antigravity-claude-sonnet-4-5-thinking-low",
model: CLAUDE_SONNET,
category: "thinking-order",
suite: "sanity",
turns: [
"Read package.json and tell me the version",
"Now read tsconfig.json and tell me the target",
@@ -94,8 +107,178 @@ const TESTS: MultiTurnTest[] = [
errorPatterns: ERROR_PATTERNS,
timeout: 120000,
},
{
name: "multi-provider-switch",
model: GEMINI_FLASH,
category: "multi-provider",
suite: "sanity",
turns: [
{ prompt: "What is 2+2? Answer briefly.", model: GEMINI_FLASH },
{ prompt: "What is 3+3? Answer briefly.", model: CLAUDE_SONNET },
{ prompt: "What is 4+4? Answer briefly.", model: GEMINI_FLASH },
],
errorPatterns: ERROR_PATTERNS,
timeout: 180000,
},
{
name: "prompt-too-long-recovery",
model: GEMINI_FLASH,
category: "error-handling",
suite: "sanity",
turns: ["Reply with exactly: OK", "Repeat the word 'test' 50000 times"],
errorPatterns: ["FATAL", "unhandled", "Cannot read properties"],
timeout: 60000,
},
];
// Long-running multi-turn tests selected with --heavy (and included in the default "all" run).
// Each entry's `timeout` is handed to runTurn for every turn by runMultiTurnTest, so it bounds a
// single turn rather than the whole test.
const HEAVY_TESTS: MultiTurnTest[] = [
// 8 turns in one session, alternating GEMINI_FLASH and CLAUDE_SONNET on every turn.
{
name: "stress-8-turn-multi-provider",
model: GEMINI_FLASH,
category: "stress",
suite: "heavy",
turns: [
{ prompt: "Read package.json and tell me the name", model: GEMINI_FLASH },
{ prompt: "Now read tsconfig.json and tell me the target", model: CLAUDE_SONNET },
{ prompt: "Run: ls -la src/plugin | head -5", model: GEMINI_FLASH },
{ prompt: "Read src/plugin/auth.ts and summarize in 1 sentence", model: CLAUDE_SONNET },
{ prompt: "Run: wc -l src/plugin/*.ts | tail -3", model: GEMINI_FLASH },
{ prompt: "Read README.md first 50 lines and tell me what this project does", model: CLAUDE_SONNET },
{ prompt: "Run: git log --oneline -3", model: GEMINI_FLASH },
{ prompt: "Summarize everything we discussed in 3 bullet points", model: CLAUDE_SONNET },
],
errorPatterns: ERROR_PATTERNS,
timeout: 600000,
},
// 6 plain-string turns (all on the test-level model), each asking for a different tool by name.
{
name: "opencode-tools-comprehensive",
model: CLAUDE_SONNET,
category: "multi-tool",
suite: "heavy",
turns: [
"Use glob to find all *.ts files in src/plugin directory",
"Use grep to search for 'async function' in src/plugin/auth.ts",
"Use bash to run: echo 'test123' && pwd",
"Use read to read the first 20 lines of package.json",
"Use lsp_diagnostics on src/plugin/auth.ts to check for errors",
"Use glob to find all test files matching *.test.ts",
],
errorPatterns: ERROR_PATTERNS,
timeout: 480000,
},
// 20 turns alternating between the two models, with "checkpoint" echo turns interspersed.
{
name: "stress-20-turn-recovery",
model: GEMINI_FLASH,
category: "stress",
suite: "heavy",
turns: [
{ prompt: "Read package.json and extract the version number only", model: GEMINI_FLASH },
{ prompt: "Run: ls src/plugin/*.ts | head -3", model: CLAUDE_SONNET },
{ prompt: "Read src/plugin/auth.ts first 30 lines", model: GEMINI_FLASH },
{ prompt: "Use grep to find 'export' in src/plugin/auth.ts", model: CLAUDE_SONNET },
{ prompt: "Run: echo 'checkpoint 1' && date", model: GEMINI_FLASH },
{ prompt: "Read tsconfig.json and tell me the module type", model: CLAUDE_SONNET },
{ prompt: "Use glob to find all *.test.ts files", model: GEMINI_FLASH },
{ prompt: "Read src/plugin/token.ts first 20 lines", model: CLAUDE_SONNET },
{ prompt: "Run: wc -l src/plugin/*.ts | sort -n | tail -5", model: GEMINI_FLASH },
{ prompt: "What files have we read so far? List them.", model: CLAUDE_SONNET },
{ prompt: "Read src/plugin/request.ts first 25 lines", model: GEMINI_FLASH },
{ prompt: "Use grep to find 'async' in src/plugin/request.ts", model: CLAUDE_SONNET },
{ prompt: "Run: echo 'checkpoint 2' && pwd", model: GEMINI_FLASH },
{ prompt: "Read src/plugin/storage.ts first 20 lines", model: CLAUDE_SONNET },
{ prompt: "Use lsp_diagnostics on src/plugin/token.ts", model: GEMINI_FLASH },
{ prompt: "Read vitest.config.ts completely", model: CLAUDE_SONNET },
{ prompt: "Run: git status --short | head -5", model: GEMINI_FLASH },
{ prompt: "Read src/constants.ts completely", model: CLAUDE_SONNET },
{ prompt: "Run: echo 'final checkpoint' && echo 'all done'", model: GEMINI_FLASH },
{ prompt: "Summarize this entire conversation in 5 bullet points", model: CLAUDE_SONNET },
],
errorPatterns: ERROR_PATTERNS,
timeout: 900000,
},
// Generated turn list: generateEnduranceTest(50) yields 50 templated turns plus one closing
// summary turn, i.e. 51 turns in total.
{
name: "stress-50-turn-endurance",
model: GEMINI_FLASH,
category: "stress",
suite: "heavy",
turns: generateEnduranceTest(50),
errorPatterns: ERROR_PATTERNS,
timeout: 1800000,
},
];
/**
 * Builds the turn script for an endurance run.
 *
 * Turns are drawn round-robin from a fixed ten-step cycle that alternates between
 * GEMINI_FLASH and CLAUDE_SONNET. The placeholders `{i}` and `{n}` in a step are both
 * replaced by the 1-based turn number. A final CLAUDE_SONNET summary turn is always
 * appended, so the result holds one more entry than the number of generated turns.
 *
 * @param turnCount Number of templated turns to generate before the closing summary.
 * @returns The generated turns followed by the summary turn.
 */
function generateEnduranceTest(turnCount: number): TurnConfig[] {
  // [model, prompt template] pairs, cycled in order.
  const cycle: ReadonlyArray<readonly [string, string]> = [
    [GEMINI_FLASH, "What is {n} + {n}? Answer with just the number."],
    [CLAUDE_SONNET, "Run: echo 'turn {i}'"],
    [GEMINI_FLASH, "Read package.json and tell me one field"],
    [CLAUDE_SONNET, "Run: pwd && echo 'ok'"],
    [GEMINI_FLASH, "What turn number are we on? Just say the number."],
    [CLAUDE_SONNET, "Run: date +%H:%M:%S"],
    [GEMINI_FLASH, "Use glob to find one .ts file in src/"],
    [CLAUDE_SONNET, "Run: echo 'checkpoint {i}'"],
    [GEMINI_FLASH, "Read tsconfig.json and tell me target"],
    [CLAUDE_SONNET, "What have we done in last 3 turns? Brief answer."],
  ];

  // Both placeholders expand to the same value, so one pattern covers them.
  const placeholder = /\{[in]\}/g;

  const script: TurnConfig[] = [];
  let generated = 0;
  while (generated < turnCount) {
    const [model, text] = cycle[generated % cycle.length]!;
    generated += 1; // from here on `generated` is the 1-based number of the turn being added
    script.push({ prompt: text.replace(placeholder, String(generated)), model });
  }

  script.push({
    prompt: `We completed ${turnCount} turns. Summarize this session in 3 sentences.`,
    model: CLAUDE_SONNET,
  });

  return script;
}
// Additional failure substrings used only by the concurrent tests, where they are appended to
// ERROR_PATTERNS. runConcurrentTest looks for them case-insensitively in each request's stderr.
const RATE_LIMIT_ERROR_PATTERNS = [
"false alarm",
"incorrectly marked as rate limited",
"wrong quota",
];
// Tests that launch several identical requests at once (see runConcurrentTest). They are kept
// out of ALL_TESTS and scheduled separately by main.
const CONCURRENT_TESTS: ConcurrentTest[] = [
{
name: "concurrent-5-same-model",
category: "concurrency",
suite: "heavy",
concurrentRequests: 5,
model: GEMINI_FLASH,
prompt: "What is 2+2? Answer with just the number.",
errorPatterns: [...ERROR_PATTERNS, ...RATE_LIMIT_ERROR_PATTERNS],
timeout: 120000,
},
// NOTE(review): despite the name, this entry sends all 3 requests to GEMINI_FLASH;
// ConcurrentTest only carries a single `model`, so no model mixing happens here.
{
name: "concurrent-3-mixed-models",
category: "concurrency",
suite: "heavy",
concurrentRequests: 3,
model: GEMINI_FLASH,
prompt: "Say hello in one word.",
errorPatterns: [...ERROR_PATTERNS, ...RATE_LIMIT_ERROR_PATTERNS],
timeout: 120000,
},
{
name: "concurrent-10-antigravity-heavy",
category: "concurrency",
suite: "heavy",
concurrentRequests: 10,
model: GEMINI_FLASH,
prompt: "What is 1+1? Answer with just the number.",
errorPatterns: [...ERROR_PATTERNS, ...RATE_LIMIT_ERROR_PATTERNS],
timeout: 180000,
},
];
// Every multi-turn test: sanity entries first, then heavy ones. CONCURRENT_TESTS is not part of this list.
const ALL_TESTS = [...SANITY_TESTS, ...HEAVY_TESTS];
async function runTurn(
prompt: string,
model: string,
@@ -161,31 +344,114 @@ async function runTurn(
});
}
/**
 * Best-effort removal of an opencode session created by a test.
 *
 * Runs `opencode session delete <sessionId> --force` in the current working directory and
 * resolves once the process closes or fails to start. The returned promise never rejects:
 * a failed cleanup must not turn a test result into a crash.
 *
 * @param sessionId Identifier of the session to delete.
 * @returns A promise that resolves when the delete command has finished or errored.
 */
async function deleteSession(sessionId: string): Promise<void> {
  return new Promise<void>((resolve) => {
    const proc = spawn("opencode", ["session", "delete", sessionId, "--force"], {
      // The command's output is never read. Discard it instead of opening pipes that
      // nobody drains, which could stall the child if it ever printed more than the
      // pipe buffer holds.
      stdio: "ignore",
      // Let Node kill the child if it has not exited after 10 seconds.
      timeout: 10000,
      cwd: process.cwd(),
    });

    // Either event ends the wait; resolving twice is harmless.
    proc.once("close", () => resolve());
    proc.once("error", () => resolve());
  });
}
/**
 * Runs one concurrent test: launches `test.concurrentRequests` copies of the prompt at the
 * same time, each in its own new session, then evaluates the combined outcome.
 *
 * The test fails when
 *  - any request's stderr contains one of `test.errorPatterns` (case-insensitive), or
 *  - more than half of the requests exit with a code other than 0.
 *
 * Every session reported by the requests is deleted before the result is returned,
 * whatever the verdict.
 *
 * @param test The concurrent test definition.
 * @returns The verdict. `turnsCompleted` is 0 on a pattern match; otherwise it is the number
 *          of requests that exited with code 0, so a passing run with a few tolerated
 *          failures no longer reports them as completed.
 */
async function runConcurrentTest(test: ConcurrentTest): Promise<TestResult> {
  const start = Date.now();
  const total = test.concurrentRequests;

  process.stdout.write(` Spawning ${total} concurrent requests...`);

  const results = await Promise.all(
    Array.from({ length: total }, (_, i) =>
      runTurn(
        `${test.prompt} (request ${i + 1})`,
        test.model,
        null, // no session id: every request starts its own session
        `concurrent-${test.name}-${i}`,
        test.timeout
      )
    )
  );

  // Wipe the progress line.
  process.stdout.write("\r" + " ".repeat(60) + "\r");

  // Lower-case each pattern once instead of once per result.
  const needles: { pattern: string; lowered: string }[] = test.errorPatterns.map(
    (pattern: string) => ({ pattern, lowered: pattern.toLowerCase() })
  );

  // First hit wins, scanning results in request order and patterns in declaration order.
  let matchedPattern: string | undefined;
  for (const result of results) {
    const stderr = result.stderr.toLowerCase();
    matchedPattern = needles.find((needle) => stderr.includes(needle.lowered))?.pattern;
    if (matchedPattern !== undefined) {
      break;
    }
  }

  const failedCount = results.filter((r) => r.code !== 0).length;
  const succeededCount = total - failedCount;

  // Single cleanup point: remove every session that was created, one at a time.
  for (const result of results) {
    if (result.sessionId) {
      await deleteSession(result.sessionId);
    }
  }

  if (matchedPattern !== undefined) {
    return {
      success: false,
      error: `Found error pattern "${matchedPattern}" in concurrent response`,
      duration: Date.now() - start,
      turnsCompleted: 0,
    };
  }

  if (failedCount > total / 2) {
    return {
      success: false,
      error: `${failedCount}/${total} requests failed`,
      duration: Date.now() - start,
      turnsCompleted: succeededCount,
    };
  }

  return {
    success: true,
    duration: Date.now() - start,
    turnsCompleted: succeededCount,
  };
}
async function runMultiTurnTest(test: MultiTurnTest): Promise<TestResult> {
const start = Date.now();
let sessionId: string | null = null;
let turnsCompleted = 0;
for (let index = 0; index < test.turns.length; index++) {
const prompt = test.turns[index]!;
const turn = test.turns[index]!;
const prompt = typeof turn === "string" ? turn : turn.prompt;
const model = typeof turn === "string" ? test.model : (turn.model ?? test.model);
const turnStart = Date.now();
process.stdout.write(`\r Progress: ${index + 1}/${test.turns.length} turns...`);
const result = await runTurn(
prompt,
test.model,
model,
sessionId ?? null,
`regression-${test.name}`,
test.timeout
);
const combined = result.output + result.stderr;
for (const pattern of test.errorPatterns) {
if (combined.toLowerCase().includes(pattern.toLowerCase())) {
if (result.stderr.toLowerCase().includes(pattern.toLowerCase())) {
process.stdout.write("\r" + " ".repeat(50) + "\r");
return {
success: false,
error: `Turn ${index + 1}: Found error pattern "${pattern}"`,
duration: Date.now() - start,
turnsCompleted,
sessionId: sessionId ?? undefined,
};
}
}
@@ -193,11 +459,13 @@ async function runMultiTurnTest(test: MultiTurnTest): Promise<TestResult> {
if (result.code !== 0 && result.code !== null) {
const isTimeout = Date.now() - turnStart >= test.timeout - 1000;
if (isTimeout) {
process.stdout.write("\r" + " ".repeat(50) + "\r");
return {
success: false,
error: `Turn ${index + 1}: Timeout after ${test.timeout}ms`,
duration: Date.now() - start,
turnsCompleted,
sessionId: sessionId ?? undefined,
};
}
}
@@ -206,16 +474,19 @@ async function runMultiTurnTest(test: MultiTurnTest): Promise<TestResult> {
turnsCompleted++;
}
process.stdout.write("\r" + " ".repeat(50) + "\r");
return {
success: true,
duration: Date.now() - start,
turnsCompleted,
sessionId: sessionId ?? undefined,
};
}
function parseArgs(): {
filterName: string | null;
filterCategory: Category | null;
suite: TestSuite;
dryRun: boolean;
help: boolean;
} {
@@ -224,9 +495,15 @@ function parseArgs(): {
const idx = args.indexOf(flag);
return idx !== -1 && args[idx + 1] !== undefined ? args[idx + 1]! : null;
};
let suite: TestSuite = "all";
if (args.includes("--sanity")) suite = "sanity";
if (args.includes("--heavy")) suite = "heavy";
return {
filterName: getArg("--test") ?? getArg("--name"),
filterCategory: getArg("--category") as Category | null,
suite,
dryRun: args.includes("--dry-run"),
help: args.includes("--help") || args.includes("-h"),
};
@@ -236,54 +513,86 @@ function showHelp(): void {
console.log(`
Multi-Turn Regression Test Suite for Antigravity Plugin
Tests for known bugs:
- Issue #50: Thinking block order errors
- Tool pairing: tool_use without tool_result
- Multi-tool: Complex tool chains
Test Suites:
--sanity Quick tests (7 tests, ~5 min) - run frequently
--heavy Stress tests (4 tests, ~30 min) - long conversations
(default) All tests
Tests:
Sanity (quick, repeatable):
- thinking-tool-use, thinking-bash-tool, tool-pairing-sequential
- opus-thinking-basic, thinking-modification-continue
- multi-provider-switch, prompt-too-long-recovery
Heavy (stress, endurance):
- stress-8-turn-multi-provider (8 turns)
- opencode-tools-comprehensive (6 turns, all tools)
- stress-20-turn-recovery (20 turns, multi-model, recovery)
- stress-50-turn-endurance (51 turns, endurance test)
Usage:
npx tsx script/test-regression.ts [options]
Options:
--sanity Run sanity tests only (quick)
--heavy Run heavy tests only (stress)
--test <name> Run specific test by name
--category <cat> Run tests by category (thinking-order|tool-pairing|multi-tool)
--category <cat> Run tests by category
--dry-run List tests without running
--help, -h Show this help
Examples:
npx tsx script/test-regression.ts --dry-run
npx tsx script/test-regression.ts --category thinking-order
npx tsx script/test-regression.ts --test thinking-tool-use-basic
npx tsx script/test-regression.ts --sanity
npx tsx script/test-regression.ts --heavy
npx tsx script/test-regression.ts --test stress-20-turn-recovery
`);
}
async function main(): Promise<void> {
const { filterName, filterCategory, dryRun, help } = parseArgs();
const { filterName, filterCategory, suite, dryRun, help } = parseArgs();
if (help) {
showHelp();
return;
}
let tests = TESTS;
let tests: MultiTurnTest[];
switch (suite) {
case "sanity":
tests = SANITY_TESTS;
break;
case "heavy":
tests = HEAVY_TESTS;
break;
default:
tests = ALL_TESTS;
}
if (filterName) {
tests = tests.filter((t) => t.name === filterName);
}
if (filterCategory) {
if (filterCategory && filterCategory !== "concurrency") {
tests = tests.filter((t) => t.category === filterCategory);
}
if (tests.length === 0) {
const runConcurrentOnly = filterCategory === "concurrency";
if (runConcurrentOnly) {
tests = [];
}
if (tests.length === 0 && !runConcurrentOnly) {
console.error("No tests match the specified filters");
process.exit(1);
}
console.log(`\n🧪 Multi-Turn Regression Tests (${tests.length} tests)\n${"=".repeat(55)}\n`);
const totalTurns = tests.reduce((sum, t) => sum + t.turns.length, 0);
const concurrentCount = CONCURRENT_TESTS.reduce((sum, t) => sum + t.concurrentRequests, 0);
console.log(`\n🧪 Regression Tests [${suite.toUpperCase()}] (${tests.length} tests, ${totalTurns} turns + ${concurrentCount} concurrent)\n${"=".repeat(60)}\n`);
if (dryRun) {
console.log("Tests to run:\n");
for (const test of tests) {
console.log(` ${test.name}`);
console.log(` ${test.name} [${test.suite}]`);
console.log(` Model: ${test.model}`);
console.log(` Category: ${test.category}`);
console.log(` Turns: ${test.turns.length}`);
@@ -295,29 +604,63 @@ async function main(): Promise<void> {
const results: { test: MultiTurnTest; result: TestResult }[] = [];
for (const test of tests) {
console.log(`Testing: ${test.name}`);
console.log(`Testing: ${test.name} [${test.suite}]`);
console.log(` Model: ${test.model}`);
console.log(` Turns: ${test.turns.length}`);
process.stdout.write(" Status: ");
const result = await runMultiTurnTest(test);
results.push({ test, result });
if (result.success) {
console.log(`✅ PASS (${result.turnsCompleted}/${test.turns.length} turns, ${(result.duration / 1000).toFixed(1)}s)`);
console.log(` Status: ✅ PASS (${result.turnsCompleted}/${test.turns.length} turns, ${(result.duration / 1000).toFixed(1)}s)`);
} else {
console.log(`❌ FAIL`);
console.log(` Status: ❌ FAIL`);
console.log(` Error: ${result.error}`);
console.log(` Completed: ${result.turnsCompleted}/${test.turns.length} turns`);
}
if (result.sessionId) {
await deleteSession(result.sessionId);
}
console.log();
}
if (suite === "heavy" || suite === "all" || runConcurrentOnly || filterName) {
let concurrentTests = CONCURRENT_TESTS;
if (filterName) {
concurrentTests = concurrentTests.filter((t) => t.name === filterName);
}
if (concurrentTests.length === 0 && !runConcurrentOnly && tests.length === 0) {
console.error("No tests match the specified filters");
process.exit(1);
}
if (concurrentTests.length > 0) {
console.log(`\n🔄 Concurrent Tests (${concurrentTests.length} tests)\n${"-".repeat(40)}\n`);
for (const test of concurrentTests) {
console.log(`Testing: ${test.name} [concurrent]`);
console.log(` Model: ${test.model}`);
console.log(` Concurrent: ${test.concurrentRequests} requests`);
const result = await runConcurrentTest(test);
results.push({ test: test as unknown as MultiTurnTest, result });
if (result.success) {
console.log(` Status: ✅ PASS (${result.turnsCompleted} requests, ${(result.duration / 1000).toFixed(1)}s)`);
} else {
console.log(` Status: ❌ FAIL`);
console.log(` Error: ${result.error}`);
}
console.log();
}
}
}
const passed = results.filter((r) => r.result.success).length;
const failed = results.filter((r) => !r.result.success).length;
const totalTime = results.reduce((sum, r) => sum + r.result.duration, 0);
console.log("=".repeat(55));
console.log(`\nSummary: ${passed} passed, ${failed} failed\n`);
console.log("=".repeat(60));
console.log(`\nSummary: ${passed} passed, ${failed} failed (${(totalTime / 1000).toFixed(1)}s total)\n`);
if (failed > 0) {
console.log("Failed tests:");