feat: add E2E testing scripts and simplify Gemini Flash model config

- Add test-models.ts for validating all supported model endpoints
- Add test-regression.ts for multi-turn regression testing (Issue #50)
- Consolidate Gemini 3 Flash variants (low/medium/high) into a single model
- Fix schema structure by flattening nested signature_cache properties
- Extract streaming transformer utilities to a dedicated module
2026-05-13 23:53:18 +00:00 · 2025-12-28 00:04:57 +07:00
parent ba2891bf57
commit 16f4bb07a1
16 changed files with 2905 additions and 356 deletions
--- a/script/test-models.ts
+++ b/script/test-models.ts
@@ -0,0 +1,175 @@
+#!/usr/bin/env npx tsx
+import { spawn } from "child_process";
+
+/** Group a model belongs to; this is the value matched by the `--category` filter. */
+type ModelCategory =
+  | "gemini-cli"
+  | "antigravity-gemini"
+  | "antigravity-claude"
+  | "antigravity-gpt";
+
+/** One model entry exercised by the E2E run. */
+interface ModelTest {
+  /** Model identifier handed to `opencode run --model`. */
+  model: string;
+  /** Group label used for filtering and for the dry-run listing. */
+  category: ModelCategory;
+}
+
+/** Builds one `ModelTest` entry per id, all tagged with the same category. */
+const modelsIn = (category: ModelTest["category"], ids: string[]): ModelTest[] =>
+  ids.map((model) => ({ model, category }));
+
+/** Every model the E2E run can exercise, listed in the order they are tested. */
+const MODELS: ModelTest[] = [
+  // Gemini CLI (direct Google API)
+  ...modelsIn("gemini-cli", [
+    "google/gemini-3-flash-preview",
+    "google/gemini-3-pro-preview",
+    "google/gemini-2.5-pro",
+    "google/gemini-2.5-flash",
+  ]),
+
+  // Antigravity Gemini
+  ...modelsIn("antigravity-gemini", [
+    "google/antigravity-gemini-3-pro-low",
+    "google/antigravity-gemini-3-pro-high",
+    "google/antigravity-gemini-3-flash",
+  ]),
+
+  // Antigravity Claude
+  ...modelsIn("antigravity-claude", [
+    "google/antigravity-claude-sonnet-4-5",
+    "google/antigravity-claude-sonnet-4-5-thinking-low",
+    "google/antigravity-claude-sonnet-4-5-thinking-medium",
+    "google/antigravity-claude-sonnet-4-5-thinking-high",
+    "google/antigravity-claude-opus-4-5-thinking-low",
+    "google/antigravity-claude-opus-4-5-thinking-medium",
+    "google/antigravity-claude-opus-4-5-thinking-high",
+  ]),
+
+  // Antigravity GPT
+  ...modelsIn("antigravity-gpt", ["google/antigravity-gpt-oss-120b-medium"]),
+];
+
+/** Prompt sent to every model under test. */
+const TEST_PROMPT = "Reply with exactly one word: WORKING";
+
+/** Per-model time limit in milliseconds (two minutes), used when `--timeout` is not given. */
+const DEFAULT_TIMEOUT_MS = 2 * 60 * 1000;
+
+/** Outcome of running a single model through `testModel`. */
+interface TestResult {
+  /** Whether the run counted as a pass. */
+  success: boolean;
+  /** Elapsed wall-clock time of the run, in milliseconds. */
+  duration: number;
+  /** Failure description; only set when `success` is false. */
+  error?: string;
+}
+
+/**
+ * Runs `opencode run` once with the shared test prompt against `model` and
+ * reports whether the model answered as instructed.
+ *
+ * A run passes only when the child exits with code 0 AND its stdout contains
+ * the word "working" (case-insensitive). Previously both branches of the
+ * output check reported success, so the check had no effect.
+ *
+ * @param model - Model identifier passed to `--model`.
+ * @param timeoutMs - Milliseconds to wait before the child is killed with SIGKILL.
+ * @returns A promise that always resolves (never rejects) with the outcome and
+ *   the elapsed time in milliseconds.
+ */
+async function testModel(model: string, timeoutMs: number): Promise<TestResult> {
+  const start = Date.now();
+
+  return new Promise((resolve) => {
+    const proc = spawn("opencode", ["run", TEST_PROMPT, "--model", model], {
+      stdio: ["ignore", "pipe", "pipe"],
+    });
+
+    let stdout = "";
+    let stderr = "";
+
+    // Single exit path: cancels the watchdog and stamps the duration. Later
+    // calls (e.g. "close" arriving after a timeout) are harmless no-ops because
+    // a promise can only be resolved once.
+    const finish = (outcome: Omit<TestResult, "duration">): void => {
+      clearTimeout(timer);
+      resolve({ ...outcome, duration: Date.now() - start });
+    };
+
+    const timer = setTimeout(() => {
+      proc.kill("SIGKILL");
+      finish({ success: false, error: `Timeout after ${timeoutMs}ms` });
+    }, timeoutMs);
+
+    proc.stdout?.on("data", (data) => { stdout += data.toString(); });
+    proc.stderr?.on("data", (data) => { stderr += data.toString(); });
+
+    proc.on("close", (code) => {
+      if (code !== 0) {
+        finish({ success: false, error: `Exit ${code}: ${stderr || stdout}`.slice(0, 200) });
+      } else if (stdout.toLowerCase().includes("working")) {
+        finish({ success: true });
+      } else {
+        // Exit code 0 without the requested word means the model did not
+        // actually answer the prompt; report what it printed instead.
+        finish({
+          success: false,
+          error: `Unexpected output: ${stdout.trim() || "(empty)"}`.slice(0, 200),
+        });
+      }
+    });
+
+    // Emitted when the process could not be spawned (e.g. `opencode` is not installed).
+    proc.on("error", (err) => {
+      finish({ success: false, error: err.message });
+    });
+  });
+}
+
+/**
+ * Parses the command-line flags of this script from `process.argv`.
+ *
+ * `--model`, `--category` and `--timeout` take the following argument as their
+ * value; `--dry-run`, `--help` and `-h` are boolean switches.
+ *
+ * A `--timeout` value that is missing, non-numeric, or not positive falls back
+ * to `DEFAULT_TIMEOUT_MS` (with a warning when a bad value was supplied)
+ * instead of producing `NaN`, which would make every model time out at once.
+ *
+ * @returns The parsed options; filters are `null` when their flag is absent.
+ */
+function parseArgs(): { filterModel: string | null; filterCategory: string | null; dryRun: boolean; help: boolean; timeout: number } {
+  const args = process.argv.slice(2);
+
+  // Value following `flag`, or null when the flag or its value is missing.
+  const valueOf = (flag: string): string | null => {
+    const idx = args.indexOf(flag);
+    return idx !== -1 ? args[idx + 1] ?? null : null;
+  };
+
+  const rawTimeout = valueOf("--timeout");
+  const parsedTimeout = rawTimeout ? Number.parseInt(rawTimeout, 10) : DEFAULT_TIMEOUT_MS;
+  const timeoutIsValid = Number.isFinite(parsedTimeout) && parsedTimeout > 0;
+  if (!timeoutIsValid) {
+    console.warn(`Ignoring invalid --timeout "${rawTimeout}"; using ${DEFAULT_TIMEOUT_MS}ms.`);
+  }
+
+  return {
+    filterModel: valueOf("--model"),
+    filterCategory: valueOf("--category"),
+    dryRun: args.includes("--dry-run"),
+    help: args.includes("--help") || args.includes("-h"),
+    timeout: timeoutIsValid ? parsedTimeout : DEFAULT_TIMEOUT_MS,
+  };
+}
+
+/** Prints the command-line usage text for this script to stdout. */
+function printHelp(): void {
+  // The leading and trailing empty entries reproduce the blank line printed
+  // before and after the usage text.
+  const usage = [
+    "",
+    "E2E Model Test Script",
+    "",
+    "Usage:",
+    "  npx tsx script/test-models.ts [options]",
+    "",
+    "Options:",
+    "  --model <model>      Test specific model",
+    "  --category <cat>     Test by category (gemini-cli, antigravity-gemini, antigravity-claude, antigravity-gpt)",
+    "  --timeout <ms>       Timeout per model (default: 120000)",
+    "  --dry-run            List models without testing",
+    "  --help, -h           Show this help",
+    "",
+    "Examples:",
+    "  npx tsx script/test-models.ts --dry-run",
+    "  npx tsx script/test-models.ts --model google/gemini-3-flash-preview",
+    "  npx tsx script/test-models.ts --category antigravity-claude",
+    "",
+  ];
+  console.log(usage.join("\n"));
+}
+
+/**
+ * CLI entry point: selects models according to the command-line filters,
+ * runs them one at a time, and prints a per-model result plus a summary.
+ *
+ * Exit behaviour:
+ *  - `--help` and `--dry-run` return normally without running any model.
+ *  - An empty selection exits with code 1, so a mistyped filter cannot pass
+ *    for a successful run (matching the regression script's behaviour).
+ *  - Any failed model exits with code 1 after the summary is printed.
+ */
+async function main(): Promise<void> {
+  const { filterModel, filterCategory, dryRun, help, timeout } = parseArgs();
+
+  if (help) {
+    printHelp();
+    return;
+  }
+
+  let tests = MODELS;
+  // `--model` accepts either the full identifier or any suffix of it.
+  if (filterModel) tests = tests.filter((t) => t.model === filterModel || t.model.endsWith(filterModel));
+  if (filterCategory) tests = tests.filter((t) => t.category === filterCategory);
+
+  if (tests.length === 0) {
+    console.error("No models match the filter.");
+    process.exit(1);
+  }
+
+  console.log(`\n🧪 E2E Model Tests (${tests.length} models)\n${"=".repeat(50)}\n`);
+
+  if (dryRun) {
+    for (const t of tests) {
+      console.log(`  ${t.model.padEnd(50)} [${t.category}]`);
+    }
+    console.log(`\n${tests.length} models would be tested.\n`);
+    return;
+  }
+
+  const failures: { model: string; error: string }[] = [];
+
+  // Models are tested sequentially so each result line follows its own
+  // "Testing ..." prefix on the same output line.
+  for (const t of tests) {
+    process.stdout.write(`Testing ${t.model.padEnd(50)} ... `);
+    const result = await testModel(t.model, timeout);
+
+    if (result.success) {
+      console.log(`✅ (${(result.duration / 1000).toFixed(1)}s)`);
+    } else {
+      console.log(`❌ FAIL`);
+      console.log(`   ${result.error}`);
+      failures.push({ model: t.model, error: result.error || "Unknown" });
+    }
+  }
+
+  const passed = tests.length - failures.length;
+
+  console.log(`\n${"=".repeat(50)}`);
+  console.log(`Summary: ${passed} passed, ${failures.length} failed\n`);
+
+  if (failures.length > 0) {
+    console.log("Failed models:");
+    for (const f of failures) {
+      // Repeat the reason here so the summary is usable without scrolling back.
+      console.log(`  - ${f.model}: ${f.error}`);
+    }
+    process.exit(1);
+  }
+}
+
+// A rejected main() must fail the process: logging alone would leave the exit
+// code at 0 and make a crashed run look like a successful one.
+main().catch((err: unknown) => {
+  console.error("Fatal error:", err);
+  process.exit(1);
+});
--- a/script/test-regression.ts
+++ b/script/test-regression.ts
@@ -0,0 +1,334 @@
+#!/usr/bin/env npx tsx
+import { spawn } from "child_process";
+
+/** Kind of regression a test targets; this is the value matched by `--category`. */
+type Category =
+  | "thinking-order"
+  | "tool-pairing"
+  | "multi-tool";
+
+/** Definition of one regression scenario made of one or more prompts. */
+interface MultiTurnTest {
+  /** Identifier matched by `--test`/`--name`; also used in the session title. */
+  name: string;
+  /** Group label matched by `--category`. */
+  category: Category;
+  /** Model identifier passed to `opencode run --model`. */
+  model: string;
+  /** Prompts sent one after another, each as a separate `opencode run`. */
+  turns: string[];
+  /** Substrings whose presence in a turn's output fails the test (case-insensitive). */
+  errorPatterns: string[];
+  /** Time limit for each individual turn, in milliseconds. */
+  timeout: number;
+}
+
+/** Outcome of running one multi-turn regression test. */
+interface TestResult {
+  /** Whether every turn finished without a detected problem. */
+  success: boolean;
+  /** Number of turns that finished before the test ended. */
+  turnsCompleted: number;
+  /** Elapsed wall-clock time of the whole test, in milliseconds. */
+  duration: number;
+  /** Failure description; only set when `success` is false. */
+  error?: string;
+}
+
+/**
+ * Substrings that mark a known regression when they appear in a turn's output.
+ * Order matters: the first pattern that matches is the one reported.
+ */
+const ERROR_PATTERNS: string[] = [
+  // Thinking block ordering
+  "thinking block order",
+  "Expected thinking or redacted_thinking",
+
+  // tool_use / tool_result pairing
+  "tool_use ids were found without tool_result",
+  "tool_result_missing",
+
+  "thinking_disabled_violation",
+  "orphaned tool_use",
+  "must start with thinking block",
+  "error: tool_use without matching tool_result",
+
+  // Thinking blocks altered between turns
+  "cannot be modified",
+  "must remain as they were",
+];
+
+/** Model ids shared by several scenarios below. */
+const SONNET_THINKING_LOW = "google/antigravity-claude-sonnet-4-5-thinking-low";
+const OPUS_THINKING_LOW = "google/antigravity-claude-opus-4-5-thinking-low";
+
+/** Regression scenarios, executed in the order listed. */
+const TESTS: MultiTurnTest[] = [
+  // Issue #50: thinking block order bug, reproduced with a single turn that uses a tool.
+  {
+    name: "thinking-tool-use",
+    category: "thinking-order",
+    model: SONNET_THINKING_LOW,
+    turns: ["Read package.json and tell me the package name"],
+    errorPatterns: ERROR_PATTERNS,
+    timeout: 90_000,
+  },
+  {
+    name: "thinking-bash-tool",
+    category: "thinking-order",
+    model: SONNET_THINKING_LOW,
+    turns: ["Run: echo 'hello' and tell me the output"],
+    errorPatterns: ERROR_PATTERNS,
+    timeout: 90_000,
+  },
+
+  // Tool pairing: two simple tool calls in consecutive turns.
+  {
+    name: "tool-pairing-sequential",
+    category: "tool-pairing",
+    model: SONNET_THINKING_LOW,
+    turns: ["Run: echo 'first'", "Run: echo 'second'"],
+    errorPatterns: ERROR_PATTERNS,
+    timeout: 120_000,
+  },
+
+  // Basic check of the Opus model.
+  {
+    name: "opus-thinking-basic",
+    category: "thinking-order",
+    model: OPUS_THINKING_LOW,
+    turns: ["What is 7 * 8? Use bash to verify: echo $((7*8))"],
+    errorPatterns: ERROR_PATTERNS,
+    timeout: 120_000,
+  },
+
+  // Bug: "thinking blocks in latest assistant message cannot be modified".
+  // Several turns with thinking blocks, to verify they are preserved unchanged.
+  {
+    name: "thinking-modification-continue",
+    category: "thinking-order",
+    model: SONNET_THINKING_LOW,
+    turns: [
+      "Read package.json and tell me the version",
+      "Now read tsconfig.json and tell me the target",
+      "Compare the two files briefly",
+    ],
+    errorPatterns: ERROR_PATTERNS,
+    timeout: 120_000,
+  },
+];
+
+/** Captures the token that follows "session:" or "session " in the CLI output. */
+const SESSION_ID_PATTERN = /session[:\s]+([a-zA-Z0-9_-]+)/i;
+
+/** How long a child gets to exit after SIGTERM before it is sent SIGKILL. */
+const KILL_GRACE_MS = 5_000;
+
+/**
+ * Runs a single `opencode run` turn and collects its output.
+ *
+ * With a `sessionId` the turn continues that session (`--session`); without
+ * one a new session titled `sessionTitle` is started and its id is extracted
+ * from the output on a best-effort basis.
+ *
+ * @param prompt - Prompt text for this turn.
+ * @param model - Model identifier passed to `--model`.
+ * @param sessionId - Session to continue, or `null` to start a new one.
+ * @param sessionTitle - Title given to a newly started session.
+ * @param timeout - Milliseconds before the child is sent SIGTERM.
+ * @returns A promise that always resolves (never rejects). `code` is 1 when
+ *   the child was killed by a signal or could not be spawned. `sessionId` is
+ *   the id passed in, otherwise the extracted id, otherwise `null`.
+ */
+async function runTurn(
+  prompt: string,
+  model: string,
+  sessionId: string | null,
+  sessionTitle: string,
+  timeout: number
+): Promise<{ output: string; stderr: string; code: number; sessionId: string | null }> {
+  return new Promise((resolve) => {
+    const args = sessionId
+      ? ["run", prompt, "--session", sessionId, "--model", model]
+      : ["run", prompt, "--model", model, "--title", sessionTitle];
+
+    // The time limit is enforced by the timer below only; also passing
+    // spawn's own `timeout` option would just signal the child twice.
+    const proc = spawn("opencode", args, {
+      stdio: ["ignore", "pipe", "pipe"],
+      cwd: process.cwd(),
+    });
+
+    let stdout = "";
+    let stderr = "";
+
+    proc.stdout?.on("data", (data) => {
+      stdout += data.toString();
+    });
+
+    proc.stderr?.on("data", (data) => {
+      stderr += data.toString();
+    });
+
+    // Ask politely first, then force-kill: a child that ignores SIGTERM would
+    // otherwise never emit "close" and the whole suite would hang.
+    let forceKillId: ReturnType<typeof setTimeout> | undefined;
+    const timeoutId = setTimeout(() => {
+      proc.kill("SIGTERM");
+      forceKillId = setTimeout(() => proc.kill("SIGKILL"), KILL_GRACE_MS);
+    }, timeout);
+
+    const clearTimers = (): void => {
+      clearTimeout(timeoutId);
+      if (forceKillId !== undefined) clearTimeout(forceKillId);
+    };
+
+    proc.on("close", (code) => {
+      clearTimers();
+
+      // Only look for an id when this turn started a fresh session.
+      let extractedSessionId = sessionId;
+      if (!extractedSessionId) {
+        const match = SESSION_ID_PATTERN.exec(stdout) ?? SESSION_ID_PATTERN.exec(stderr);
+        extractedSessionId = match?.[1] ?? extractedSessionId;
+      }
+
+      resolve({
+        output: stdout,
+        stderr,
+        // `code` is null when the child was terminated by a signal.
+        code: code ?? 1,
+        sessionId: extractedSessionId,
+      });
+    });
+
+    // Emitted when the process could not be spawned (e.g. `opencode` is not installed).
+    proc.on("error", (err) => {
+      clearTimers();
+      resolve({
+        output: "",
+        stderr: err.message,
+        code: 1,
+        // Keep the caller's session id; a failed spawn does not invalidate it.
+        sessionId,
+      });
+    });
+  });
+}
+
+/**
+ * Runs every turn of `test` in order, continuing the session reported by the
+ * previous turn, and stops at the first problem.
+ *
+ * A turn fails the test when:
+ *  - its combined stdout + stderr contains one of `test.errorPatterns`
+ *    (compared case-insensitively), or
+ *  - the process ended with a non-zero exit code. This is reported as a
+ *    timeout when the turn ran for roughly the whole time limit, otherwise as
+ *    an exit-code failure. Previously the latter case was silently counted as
+ *    a completed turn, so a crashed or missing CLI produced a passing test.
+ *
+ * @param test - Scenario to execute.
+ * @returns Outcome with the total duration and the number of completed turns.
+ */
+async function runMultiTurnTest(test: MultiTurnTest): Promise<TestResult> {
+  const start = Date.now();
+  let sessionId: string | null = null;
+  let turnsCompleted = 0;
+
+  // Builds the failure result for the given 1-based turn number.
+  const fail = (turn: number, reason: string): TestResult => ({
+    success: false,
+    error: `Turn ${turn}: ${reason}`,
+    duration: Date.now() - start,
+    turnsCompleted,
+  });
+
+  for (const [index, prompt] of test.turns.entries()) {
+    const turn = index + 1;
+    const turnStart = Date.now();
+    const result = await runTurn(
+      prompt,
+      test.model,
+      sessionId,
+      `regression-${test.name}`,
+      test.timeout
+    );
+
+    // Known error messages take priority over the exit code: they identify
+    // the specific regression being tested for.
+    const combined = (result.output + result.stderr).toLowerCase();
+    const matched = test.errorPatterns.find((pattern) => combined.includes(pattern.toLowerCase()));
+    if (matched !== undefined) {
+      return fail(turn, `Found error pattern "${matched}"`);
+    }
+
+    if (result.code !== 0) {
+      // The child is killed once the limit elapses, so a run that lasted
+      // (almost) the full limit is treated as a timeout; 1s of slack is allowed.
+      const timedOut = Date.now() - turnStart >= test.timeout - 1000;
+      if (timedOut) {
+        return fail(turn, `Timeout after ${test.timeout}ms`);
+      }
+
+      const detail = (result.stderr.trim() || result.output.trim()).slice(0, 200);
+      return fail(turn, detail ? `Exit ${result.code}: ${detail}` : `Exit ${result.code}`);
+    }
+
+    sessionId = result.sessionId;
+    turnsCompleted++;
+  }
+
+  return {
+    success: true,
+    duration: Date.now() - start,
+    turnsCompleted,
+  };
+}
+
+/** Values accepted by `--category`; must list every member of `Category`. */
+const VALID_CATEGORIES: readonly Category[] = ["thinking-order", "tool-pairing", "multi-tool"];
+
+/**
+ * Parses the command-line flags of this script from `process.argv`.
+ *
+ * `--test` (alias `--name`) and `--category` take the following argument as
+ * their value; `--dry-run`, `--help` and `-h` are boolean switches.
+ *
+ * @returns The parsed options; filters are `null` when their flag is absent.
+ * @throws Error when `--category` is given a value that is not a known
+ *   category, instead of casting it unchecked and matching no tests.
+ */
+function parseArgs(): {
+  filterName: string | null;
+  filterCategory: Category | null;
+  dryRun: boolean;
+  help: boolean;
+} {
+  const args = process.argv.slice(2);
+
+  // Value following `flag`, or null when the flag or its value is missing.
+  const getArg = (flag: string): string | null => {
+    const idx = args.indexOf(flag);
+    return idx !== -1 ? args[idx + 1] ?? null : null;
+  };
+
+  const rawCategory = getArg("--category");
+  // `find` returns the typed member, so no type assertion is needed.
+  const filterCategory = VALID_CATEGORIES.find((known) => known === rawCategory) ?? null;
+  if (rawCategory !== null && filterCategory === null) {
+    throw new Error(
+      `Unknown category "${rawCategory}". Valid categories: ${VALID_CATEGORIES.join(", ")}`
+    );
+  }
+
+  return {
+    filterName: getArg("--test") ?? getArg("--name"),
+    filterCategory,
+    dryRun: args.includes("--dry-run"),
+    help: args.includes("--help") || args.includes("-h"),
+  };
+}
+
+/**
+ * Prints the command-line usage text for this script to stdout.
+ *
+ * The `--test` example names a test that exists in `TESTS`, and the `--name`
+ * alias accepted by the argument parser is listed.
+ */
+function showHelp(): void {
+  console.log(`
+Multi-Turn Regression Test Suite for Antigravity Plugin
+
+Tests for known bugs:
+  - Issue #50: Thinking block order errors
+  - Tool pairing: tool_use without tool_result
+  - Multi-tool: Complex tool chains
+
+Usage:
+  npx tsx script/test-regression.ts [options]
+
+Options:
+  --test <name>         Run specific test by name (alias: --name)
+  --category <cat>      Run tests by category (thinking-order|tool-pairing|multi-tool)
+  --dry-run             List tests without running
+  --help, -h            Show this help
+
+Examples:
+  npx tsx script/test-regression.ts --dry-run
+  npx tsx script/test-regression.ts --category thinking-order
+  npx tsx script/test-regression.ts --test thinking-tool-use
+`);
+}
+
+/**
+ * CLI entry point: selects regression tests according to the command-line
+ * filters, runs them one at a time, and prints per-test results plus a summary.
+ *
+ * Exits with code 1 when no test matches the filters or when any test fails;
+ * `--help` and `--dry-run` return normally without running anything.
+ */
+async function main(): Promise<void> {
+  const options = parseArgs();
+
+  if (options.help) {
+    showHelp();
+    return;
+  }
+
+  // A filter that was not supplied accepts every test.
+  const selected = TESTS.filter(
+    (candidate) =>
+      (!options.filterName || candidate.name === options.filterName) &&
+      (!options.filterCategory || candidate.category === options.filterCategory)
+  );
+
+  if (selected.length === 0) {
+    console.error("No tests match the specified filters");
+    process.exit(1);
+  }
+
+  const rule = "=".repeat(55);
+  console.log(`\n🧪 Multi-Turn Regression Tests (${selected.length} tests)\n${rule}\n`);
+
+  if (options.dryRun) {
+    console.log("Tests to run:\n");
+    selected.forEach(({ name, model, category, turns }) => {
+      console.log(`  ${name}`);
+      console.log(`    Model: ${model}`);
+      console.log(`    Category: ${category}`);
+      console.log(`    Turns: ${turns.length}`);
+      console.log();
+    });
+    return;
+  }
+
+  const outcomes: { test: MultiTurnTest; result: TestResult }[] = [];
+
+  // Tests run sequentially so the verdict lands on the "Status:" line of its own test.
+  for (const test of selected) {
+    const totalTurns = test.turns.length;
+
+    console.log(`Testing: ${test.name}`);
+    console.log(`  Model: ${test.model}`);
+    console.log(`  Turns: ${totalTurns}`);
+    process.stdout.write("  Status: ");
+
+    const result = await runMultiTurnTest(test);
+    outcomes.push({ test, result });
+
+    if (!result.success) {
+      console.log(`❌ FAIL`);
+      console.log(`    Error: ${result.error}`);
+      console.log(`    Completed: ${result.turnsCompleted}/${totalTurns} turns`);
+    } else {
+      const seconds = (result.duration / 1000).toFixed(1);
+      console.log(`✅ PASS (${result.turnsCompleted}/${totalTurns} turns, ${seconds}s)`);
+    }
+    console.log();
+  }
+
+  const failures = outcomes.filter(({ result }) => !result.success);
+  const passedCount = outcomes.length - failures.length;
+
+  console.log(rule);
+  console.log(`\nSummary: ${passedCount} passed, ${failures.length} failed\n`);
+
+  if (failures.length === 0) {
+    return;
+  }
+
+  console.log("Failed tests:");
+  for (const { test, result } of failures) {
+    console.log(`  ❌ ${test.name}: ${result.error}`);
+  }
+  process.exit(1);
+}
+
+/** Reports an unexpected rejection from `main` and terminates with exit code 1. */
+function onFatal(err: unknown): never {
+  console.error("Fatal error:", err);
+  process.exit(1);
+}
+
+main().catch(onFatal);