fix: improve rate limit handling and add prompt-too-long toast

- Add 2s deduplication window to prevent rate limit counter inflation from concurrent 429s
- Separate cooldown system from rate limits for non-429 errors (auth failures, 5xx)
- Add quota_fallback config option for automatic quota switching on rate limit
- Add toast notification for 400 'Prompt is too long' errors, guiding users to /compact
- Add 5 new cooldown unit tests
- Enhance regression test suite with concurrent test infrastructure
- Add comprehensive rate limit analysis documentation
2026-05-13 23:53:18 +00:00 · 2025-12-28 16:09:52 +07:00
parent 93415be685
commit 457b3ac12b
9 changed files with 1614 additions and 78 deletions
--- a/script/test-regression.ts
+++ b/script/test-regression.ts
@@ -1,15 +1,23 @@
 #!/usr/bin/env npx tsx
 import { spawn } from "child_process";

-type Category = "thinking-order" | "tool-pairing" | "multi-tool";
+type Category = "thinking-order" | "tool-pairing" | "multi-tool" | "multi-provider" | "error-handling" | "stress" | "concurrency";
+type TestSuite = "sanity" | "heavy" | "all";

 interface MultiTurnTest {
  name: string;
  model: string;
  category: Category;
-  turns: string[];
+  suite: TestSuite;
+  turns: (string | TurnConfig)[];
  errorPatterns: string[];
  timeout: number;
+  expectError?: string;
+}
+
+interface TurnConfig {
+  prompt: string;
+  model?: string;
 }

 interface TestResult {
@@ -17,6 +25,18 @@ interface TestResult {
  error?: string;
  duration: number;
  turnsCompleted: number;
+  sessionId?: string;
+}
+
+interface ConcurrentTest {
+  name: string;
+  category: "concurrency";
+  suite: TestSuite;
+  concurrentRequests: number;
+  model: string;
+  prompt: string;
+  errorPatterns: string[];
+  timeout: number;
 }

 const ERROR_PATTERNS = [
@@ -32,60 +52,53 @@ const ERROR_PATTERNS = [
  "must remain as they were",
 ];

-const TESTS: MultiTurnTest[] = [
-  // Issue #50: Thinking block order bug - simple single-turn tool use
+const GEMINI_FLASH = "google/antigravity-gemini-3-flash";
+const GEMINI_FLASH_CLI_QUOTA = "google/gemini-2.5-flash";
+const CLAUDE_SONNET = "google/antigravity-claude-sonnet-4-5-thinking-low";
+const CLAUDE_OPUS = "google/antigravity-claude-opus-4-5-thinking-low";
+
+const SANITY_TESTS: MultiTurnTest[] = [
  {
    name: "thinking-tool-use",
-    model: "google/antigravity-claude-sonnet-4-5-thinking-low",
+    model: CLAUDE_SONNET,
    category: "thinking-order",
-    turns: [
-      "Read package.json and tell me the package name",
-    ],
+    suite: "sanity",
+    turns: ["Read package.json and tell me the package name"],
    errorPatterns: ERROR_PATTERNS,
    timeout: 90000,
  },
  {
    name: "thinking-bash-tool",
-    model: "google/antigravity-claude-sonnet-4-5-thinking-low",
+    model: CLAUDE_SONNET,
    category: "thinking-order",
-    turns: [
-      "Run: echo 'hello' and tell me the output",
-    ],
+    suite: "sanity",
+    turns: ["Run: echo 'hello' and tell me the output"],
    errorPatterns: ERROR_PATTERNS,
    timeout: 90000,
  },
-
-  // Tool pairing - simple two-turn
  {
    name: "tool-pairing-sequential",
-    model: "google/antigravity-claude-sonnet-4-5-thinking-low",
+    model: CLAUDE_SONNET,
    category: "tool-pairing",
-    turns: [
-      "Run: echo 'first'",
-      "Run: echo 'second'",
-    ],
+    suite: "sanity",
+    turns: ["Run: echo 'first'", "Run: echo 'second'"],
    errorPatterns: ERROR_PATTERNS,
    timeout: 120000,
  },
-
-  // Opus model basic test
  {
    name: "opus-thinking-basic",
-    model: "google/antigravity-claude-opus-4-5-thinking-low",
+    model: CLAUDE_OPUS,
    category: "thinking-order",
-    turns: [
-      "What is 7 * 8? Use bash to verify: echo $((7*8))",
-    ],
+    suite: "sanity",
+    turns: ["What is 7 * 8? Use bash to verify: echo $((7*8))"],
    errorPatterns: ERROR_PATTERNS,
    timeout: 120000,
  },
-
-  // Bug: "thinking blocks in latest assistant message cannot be modified"
-  // Tests multi-turn with thinking blocks to verify they're preserved unchanged
  {
    name: "thinking-modification-continue",
-    model: "google/antigravity-claude-sonnet-4-5-thinking-low",
+    model: CLAUDE_SONNET,
    category: "thinking-order",
+    suite: "sanity",
    turns: [
      "Read package.json and tell me the version",
      "Now read tsconfig.json and tell me the target",
@@ -94,8 +107,178 @@ const TESTS: MultiTurnTest[] = [
    errorPatterns: ERROR_PATTERNS,
    timeout: 120000,
  },
+  {
+    name: "multi-provider-switch",
+    model: GEMINI_FLASH,
+    category: "multi-provider",
+    suite: "sanity",
+    turns: [
+      { prompt: "What is 2+2? Answer briefly.", model: GEMINI_FLASH },
+      { prompt: "What is 3+3? Answer briefly.", model: CLAUDE_SONNET },
+      { prompt: "What is 4+4? Answer briefly.", model: GEMINI_FLASH },
+    ],
+    errorPatterns: ERROR_PATTERNS,
+    timeout: 180000,
+  },
+  {
+    name: "prompt-too-long-recovery",
+    model: GEMINI_FLASH,
+    category: "error-handling",
+    suite: "sanity",
+    turns: ["Reply with exactly: OK", "Repeat the word 'test' 50000 times"],
+    errorPatterns: ["FATAL", "unhandled", "Cannot read properties"],
+    timeout: 60000,
+  },
 ];

+const HEAVY_TESTS: MultiTurnTest[] = [
+  {
+    name: "stress-8-turn-multi-provider",
+    model: GEMINI_FLASH,
+    category: "stress",
+    suite: "heavy",
+    turns: [
+      { prompt: "Read package.json and tell me the name", model: GEMINI_FLASH },
+      { prompt: "Now read tsconfig.json and tell me the target", model: CLAUDE_SONNET },
+      { prompt: "Run: ls -la src/plugin | head -5", model: GEMINI_FLASH },
+      { prompt: "Read src/plugin/auth.ts and summarize in 1 sentence", model: CLAUDE_SONNET },
+      { prompt: "Run: wc -l src/plugin/*.ts | tail -3", model: GEMINI_FLASH },
+      { prompt: "Read README.md first 50 lines and tell me what this project does", model: CLAUDE_SONNET },
+      { prompt: "Run: git log --oneline -3", model: GEMINI_FLASH },
+      { prompt: "Summarize everything we discussed in 3 bullet points", model: CLAUDE_SONNET },
+    ],
+    errorPatterns: ERROR_PATTERNS,
+    timeout: 600000,
+  },
+  {
+    name: "opencode-tools-comprehensive",
+    model: CLAUDE_SONNET,
+    category: "multi-tool",
+    suite: "heavy",
+    turns: [
+      "Use glob to find all *.ts files in src/plugin directory",
+      "Use grep to search for 'async function' in src/plugin/auth.ts",
+      "Use bash to run: echo 'test123' && pwd",
+      "Use read to read the first 20 lines of package.json",
+      "Use lsp_diagnostics on src/plugin/auth.ts to check for errors",
+      "Use glob to find all test files matching *.test.ts",
+    ],
+    errorPatterns: ERROR_PATTERNS,
+    timeout: 480000,
+  },
+  {
+    name: "stress-20-turn-recovery",
+    model: GEMINI_FLASH,
+    category: "stress",
+    suite: "heavy",
+    turns: [
+      { prompt: "Read package.json and extract the version number only", model: GEMINI_FLASH },
+      { prompt: "Run: ls src/plugin/*.ts | head -3", model: CLAUDE_SONNET },
+      { prompt: "Read src/plugin/auth.ts first 30 lines", model: GEMINI_FLASH },
+      { prompt: "Use grep to find 'export' in src/plugin/auth.ts", model: CLAUDE_SONNET },
+      { prompt: "Run: echo 'checkpoint 1' && date", model: GEMINI_FLASH },
+      { prompt: "Read tsconfig.json and tell me the module type", model: CLAUDE_SONNET },
+      { prompt: "Use glob to find all *.test.ts files", model: GEMINI_FLASH },
+      { prompt: "Read src/plugin/token.ts first 20 lines", model: CLAUDE_SONNET },
+      { prompt: "Run: wc -l src/plugin/*.ts | sort -n | tail -5", model: GEMINI_FLASH },
+      { prompt: "What files have we read so far? List them.", model: CLAUDE_SONNET },
+      { prompt: "Read src/plugin/request.ts first 25 lines", model: GEMINI_FLASH },
+      { prompt: "Use grep to find 'async' in src/plugin/request.ts", model: CLAUDE_SONNET },
+      { prompt: "Run: echo 'checkpoint 2' && pwd", model: GEMINI_FLASH },
+      { prompt: "Read src/plugin/storage.ts first 20 lines", model: CLAUDE_SONNET },
+      { prompt: "Use lsp_diagnostics on src/plugin/token.ts", model: GEMINI_FLASH },
+      { prompt: "Read vitest.config.ts completely", model: CLAUDE_SONNET },
+      { prompt: "Run: git status --short | head -5", model: GEMINI_FLASH },
+      { prompt: "Read src/constants.ts completely", model: CLAUDE_SONNET },
+      { prompt: "Run: echo 'final checkpoint' && echo 'all done'", model: GEMINI_FLASH },
+      { prompt: "Summarize this entire conversation in 5 bullet points", model: CLAUDE_SONNET },
+    ],
+    errorPatterns: ERROR_PATTERNS,
+    timeout: 900000,
+  },
+  {
+    name: "stress-50-turn-endurance",
+    model: GEMINI_FLASH,
+    category: "stress",
+    suite: "heavy",
+    turns: generateEnduranceTest(50),
+    errorPatterns: ERROR_PATTERNS,
+    timeout: 1800000,
+  },
+];
+
+function generateEnduranceTest(turnCount: number): TurnConfig[] {
+  const turns: TurnConfig[] = [];
+  const prompts = [
+    { prompt: "What is {n} + {n}? Answer with just the number.", model: GEMINI_FLASH },
+    { prompt: "Run: echo 'turn {i}'", model: CLAUDE_SONNET },
+    { prompt: "Read package.json and tell me one field", model: GEMINI_FLASH },
+    { prompt: "Run: pwd && echo 'ok'", model: CLAUDE_SONNET },
+    { prompt: "What turn number are we on? Just say the number.", model: GEMINI_FLASH },
+    { prompt: "Run: date +%H:%M:%S", model: CLAUDE_SONNET },
+    { prompt: "Use glob to find one .ts file in src/", model: GEMINI_FLASH },
+    { prompt: "Run: echo 'checkpoint {i}'", model: CLAUDE_SONNET },
+    { prompt: "Read tsconfig.json and tell me target", model: GEMINI_FLASH },
+    { prompt: "What have we done in last 3 turns? Brief answer.", model: CLAUDE_SONNET },
+  ];
+
+  for (let i = 0; i < turnCount; i++) {
+    const template = prompts[i % prompts.length]!;
+    const prompt = template.prompt
+      .replace(/\{i\}/g, String(i + 1))
+      .replace(/\{n\}/g, String(i + 1));
+    turns.push({ prompt, model: template.model });
+  }
+
+  turns.push({
+    prompt: `We completed ${turnCount} turns. Summarize this session in 3 sentences.`,
+    model: CLAUDE_SONNET,
+  });
+
+  return turns;
+}
+
+const RATE_LIMIT_ERROR_PATTERNS = [
+  "false alarm",
+  "incorrectly marked as rate limited",
+  "wrong quota",
+];
+
+const CONCURRENT_TESTS: ConcurrentTest[] = [
+  {
+    name: "concurrent-5-same-model",
+    category: "concurrency",
+    suite: "heavy",
+    concurrentRequests: 5,
+    model: GEMINI_FLASH,
+    prompt: "What is 2+2? Answer with just the number.",
+    errorPatterns: [...ERROR_PATTERNS, ...RATE_LIMIT_ERROR_PATTERNS],
+    timeout: 120000,
+  },
+  {
+    name: "concurrent-3-mixed-models",
+    category: "concurrency",
+    suite: "heavy",
+    concurrentRequests: 3,
+    model: GEMINI_FLASH,
+    prompt: "Say hello in one word.",
+    errorPatterns: [...ERROR_PATTERNS, ...RATE_LIMIT_ERROR_PATTERNS],
+    timeout: 120000,
+  },
+  {
+    name: "concurrent-10-antigravity-heavy",
+    category: "concurrency",
+    suite: "heavy",
+    concurrentRequests: 10,
+    model: GEMINI_FLASH,
+    prompt: "What is 1+1? Answer with just the number.",
+    errorPatterns: [...ERROR_PATTERNS, ...RATE_LIMIT_ERROR_PATTERNS],
+    timeout: 180000,
+  },
+];
+
+const ALL_TESTS = [...SANITY_TESTS, ...HEAVY_TESTS];
+
 async function runTurn(
  prompt: string,
  model: string,
@@ -161,31 +344,114 @@ async function runTurn(
  });
 }

+async function deleteSession(sessionId: string): Promise<void> {
+  return new Promise((resolve) => {
+    const proc = spawn("opencode", ["session", "delete", sessionId, "--force"], {
+      stdio: ["ignore", "pipe", "pipe"],
+      timeout: 10000,
+      cwd: process.cwd(),
+    });
+
+    proc.on("close", () => resolve());
+    proc.on("error", () => resolve());
+  });
+}
+
+async function runConcurrentTest(test: ConcurrentTest): Promise<TestResult> {
+  const start = Date.now();
+  const sessionIds: string[] = [];
+
+  process.stdout.write(`  Spawning ${test.concurrentRequests} concurrent requests...`);
+
+  const promises = Array.from({ length: test.concurrentRequests }, (_, i) =>
+    runTurn(
+      `${test.prompt} (request ${i + 1})`,
+      test.model,
+      null,
+      `concurrent-${test.name}-${i}`,
+      test.timeout
+    )
+  );
+
+  const results = await Promise.all(promises);
+  process.stdout.write("\r" + " ".repeat(60) + "\r");
+
+  for (const result of results) {
+    if (result.sessionId) {
+      sessionIds.push(result.sessionId);
+    }
+  }
+
+  for (const result of results) {
+    for (const pattern of test.errorPatterns) {
+      if (result.stderr.toLowerCase().includes(pattern.toLowerCase())) {
+        for (const sid of sessionIds) {
+          await deleteSession(sid);
+        }
+        return {
+          success: false,
+          error: `Found error pattern "${pattern}" in concurrent response`,
+          duration: Date.now() - start,
+          turnsCompleted: 0,
+        };
+      }
+    }
+  }
+
+  const failedCount = results.filter((r) => r.code !== 0).length;
+  if (failedCount > test.concurrentRequests / 2) {
+    for (const sid of sessionIds) {
+      await deleteSession(sid);
+    }
+    return {
+      success: false,
+      error: `${failedCount}/${test.concurrentRequests} requests failed`,
+      duration: Date.now() - start,
+      turnsCompleted: test.concurrentRequests - failedCount,
+    };
+  }
+
+  for (const sid of sessionIds) {
+    await deleteSession(sid);
+  }
+
+  return {
+    success: true,
+    duration: Date.now() - start,
+    turnsCompleted: test.concurrentRequests,
+  };
+}
+
 async function runMultiTurnTest(test: MultiTurnTest): Promise<TestResult> {
  const start = Date.now();
  let sessionId: string | null = null;
  let turnsCompleted = 0;

  for (let index = 0; index < test.turns.length; index++) {
-    const prompt = test.turns[index]!;
+    const turn = test.turns[index]!;
+    const prompt = typeof turn === "string" ? turn : turn.prompt;
+    const model = typeof turn === "string" ? test.model : (turn.model ?? test.model);
    const turnStart = Date.now();
+
+    process.stdout.write(`\r  Progress: ${index + 1}/${test.turns.length} turns...`);
+
    const result = await runTurn(
      prompt,
-      test.model,
+      model,
      sessionId ?? null,
      `regression-${test.name}`,
      test.timeout
    );

-    const combined = result.output + result.stderr;
-
    for (const pattern of test.errorPatterns) {
-      if (combined.toLowerCase().includes(pattern.toLowerCase())) {
+      if (result.stderr.toLowerCase().includes(pattern.toLowerCase())) {
+        process.stdout.write("\r" + " ".repeat(50) + "\r");
        return {
          success: false,
          error: `Turn ${index + 1}: Found error pattern "${pattern}"`,
          duration: Date.now() - start,
          turnsCompleted,
+          sessionId: sessionId ?? undefined,
        };
      }
    }
@@ -193,11 +459,13 @@ async function runMultiTurnTest(test: MultiTurnTest): Promise<TestResult> {
    if (result.code !== 0 && result.code !== null) {
      const isTimeout = Date.now() - turnStart >= test.timeout - 1000;
      if (isTimeout) {
+        process.stdout.write("\r" + " ".repeat(50) + "\r");
        return {
          success: false,
          error: `Turn ${index + 1}: Timeout after ${test.timeout}ms`,
          duration: Date.now() - start,
          turnsCompleted,
+          sessionId: sessionId ?? undefined,
        };
      }
    }
@@ -206,16 +474,19 @@ async function runMultiTurnTest(test: MultiTurnTest): Promise<TestResult> {
    turnsCompleted++;
  }

+  process.stdout.write("\r" + " ".repeat(50) + "\r");
  return {
    success: true,
    duration: Date.now() - start,
    turnsCompleted,
+    sessionId: sessionId ?? undefined,
  };
 }

 function parseArgs(): {
  filterName: string | null;
  filterCategory: Category | null;
+  suite: TestSuite;
  dryRun: boolean;
  help: boolean;
 } {
@@ -224,9 +495,15 @@ function parseArgs(): {
    const idx = args.indexOf(flag);
    return idx !== -1 && args[idx + 1] !== undefined ? args[idx + 1]! : null;
  };
+
+  let suite: TestSuite = "all";
+  if (args.includes("--sanity")) suite = "sanity";
+  if (args.includes("--heavy")) suite = "heavy";
+
  return {
    filterName: getArg("--test") ?? getArg("--name"),
    filterCategory: getArg("--category") as Category | null,
+    suite,
    dryRun: args.includes("--dry-run"),
    help: args.includes("--help") || args.includes("-h"),
  };
@@ -236,54 +513,86 @@ function showHelp(): void {
  console.log(`
 Multi-Turn Regression Test Suite for Antigravity Plugin

-Tests for known bugs:
-  - Issue #50: Thinking block order errors
-  - Tool pairing: tool_use without tool_result
-  - Multi-tool: Complex tool chains
+Test Suites:
+  --sanity    Quick tests (7 tests, ~5 min) - run frequently
+  --heavy     Stress tests (4 tests, ~30 min) - long conversations
+  (default)   All tests
+
+Tests:
+  Sanity (quick, repeatable):
+    - thinking-tool-use, thinking-bash-tool, tool-pairing-sequential
+    - opus-thinking-basic, thinking-modification-continue
+    - multi-provider-switch, prompt-too-long-recovery
+
+  Heavy (stress, endurance):
+    - stress-8-turn-multi-provider (8 turns)
+    - opencode-tools-comprehensive (6 turns, all tools)
+    - stress-20-turn-recovery (20 turns, multi-model, recovery)
+    - stress-50-turn-endurance (51 turns, endurance test)

 Usage:
  npx tsx script/test-regression.ts [options]

 Options:
+  --sanity              Run sanity tests only (quick)
+  --heavy               Run heavy tests only (stress)
  --test <name>         Run specific test by name
-  --category <cat>      Run tests by category (thinking-order|tool-pairing|multi-tool)
+  --category <cat>      Run tests by category
  --dry-run             List tests without running
  --help, -h            Show this help

 Examples:
-  npx tsx script/test-regression.ts --dry-run
-  npx tsx script/test-regression.ts --category thinking-order
-  npx tsx script/test-regression.ts --test thinking-tool-use-basic
+  npx tsx script/test-regression.ts --sanity
+  npx tsx script/test-regression.ts --heavy
+  npx tsx script/test-regression.ts --test stress-20-turn-recovery
 `);
 }

 async function main(): Promise<void> {
-  const { filterName, filterCategory, dryRun, help } = parseArgs();
+  const { filterName, filterCategory, suite, dryRun, help } = parseArgs();

  if (help) {
    showHelp();
    return;
  }

-  let tests = TESTS;
+  let tests: MultiTurnTest[];
+  switch (suite) {
+    case "sanity":
+      tests = SANITY_TESTS;
+      break;
+    case "heavy":
+      tests = HEAVY_TESTS;
+      break;
+    default:
+      tests = ALL_TESTS;
+  }
+
  if (filterName) {
    tests = tests.filter((t) => t.name === filterName);
  }
-  if (filterCategory) {
+  if (filterCategory && filterCategory !== "concurrency") {
    tests = tests.filter((t) => t.category === filterCategory);
  }

-  if (tests.length === 0) {
+  const runConcurrentOnly = filterCategory === "concurrency";
+  if (runConcurrentOnly) {
+    tests = [];
+  }
+
+  if (tests.length === 0 && !runConcurrentOnly) {
    console.error("No tests match the specified filters");
    process.exit(1);
  }

-  console.log(`\n🧪 Multi-Turn Regression Tests (${tests.length} tests)\n${"=".repeat(55)}\n`);
+  const totalTurns = tests.reduce((sum, t) => sum + t.turns.length, 0);
+  const concurrentCount = CONCURRENT_TESTS.reduce((sum, t) => sum + t.concurrentRequests, 0);
+  console.log(`\n🧪 Regression Tests [${suite.toUpperCase()}] (${tests.length} tests, ${totalTurns} turns + ${concurrentCount} concurrent)\n${"=".repeat(60)}\n`);

  if (dryRun) {
    console.log("Tests to run:\n");
    for (const test of tests) {
-      console.log(`  ${test.name}`);
+      console.log(`  ${test.name} [${test.suite}]`);
      console.log(`    Model: ${test.model}`);
      console.log(`    Category: ${test.category}`);
      console.log(`    Turns: ${test.turns.length}`);
@@ -295,29 +604,63 @@ async function main(): Promise<void> {
  const results: { test: MultiTurnTest; result: TestResult }[] = [];

  for (const test of tests) {
-    console.log(`Testing: ${test.name}`);
+    console.log(`Testing: ${test.name} [${test.suite}]`);
    console.log(`  Model: ${test.model}`);
    console.log(`  Turns: ${test.turns.length}`);
-    process.stdout.write("  Status: ");

    const result = await runMultiTurnTest(test);
    results.push({ test, result });

    if (result.success) {
-      console.log(`✅ PASS (${result.turnsCompleted}/${test.turns.length} turns, ${(result.duration / 1000).toFixed(1)}s)`);
+      console.log(`  Status: ✅ PASS (${result.turnsCompleted}/${test.turns.length} turns, ${(result.duration / 1000).toFixed(1)}s)`);
    } else {
-      console.log(`❌ FAIL`);
+      console.log(`  Status: ❌ FAIL`);
      console.log(`    Error: ${result.error}`);
      console.log(`    Completed: ${result.turnsCompleted}/${test.turns.length} turns`);
    }
+
+    if (result.sessionId) {
+      await deleteSession(result.sessionId);
+    }
    console.log();
  }

+  if (suite === "heavy" || suite === "all" || runConcurrentOnly || filterName) {
+    let concurrentTests = CONCURRENT_TESTS;
+    if (filterName) {
+      concurrentTests = concurrentTests.filter((t) => t.name === filterName);
+    }
+    if (concurrentTests.length === 0 && !runConcurrentOnly && tests.length === 0) {
+      console.error("No tests match the specified filters");
+      process.exit(1);
+    }
+    if (concurrentTests.length > 0) {
+      console.log(`\n🔄 Concurrent Tests (${concurrentTests.length} tests)\n${"-".repeat(40)}\n`);
+      for (const test of concurrentTests) {
+        console.log(`Testing: ${test.name} [concurrent]`);
+        console.log(`  Model: ${test.model}`);
+        console.log(`  Concurrent: ${test.concurrentRequests} requests`);
+
+        const result = await runConcurrentTest(test);
+        results.push({ test: test as unknown as MultiTurnTest, result });
+
+        if (result.success) {
+          console.log(`  Status: ✅ PASS (${result.turnsCompleted} requests, ${(result.duration / 1000).toFixed(1)}s)`);
+        } else {
+          console.log(`  Status: ❌ FAIL`);
+          console.log(`    Error: ${result.error}`);
+        }
+        console.log();
+      }
+    }
+  }
+
  const passed = results.filter((r) => r.result.success).length;
  const failed = results.filter((r) => !r.result.success).length;
+  const totalTime = results.reduce((sum, r) => sum + r.result.duration, 0);

-  console.log("=".repeat(55));
-  console.log(`\nSummary: ${passed} passed, ${failed} failed\n`);
+  console.log("=".repeat(60));
+  console.log(`\nSummary: ${passed} passed, ${failed} failed (${(totalTime / 1000).toFixed(1)}s total)\n`);

  if (failed > 0) {
    console.log("Failed tests:");