opencode-antigravity-auth/script/test-regression.ts

#!/usr/bin/env npx tsx
import { spawn } from "child_process";

/** Behavioural area a regression test exercises; selectable on the command line via `--category`. */
type Category = "thinking-order" | "tool-pairing" | "multi-tool" | "multi-provider" | "error-handling" | "stress" | "concurrency";
/** Suite selector: the two concrete suites plus `"all"`, which is the default when neither `--sanity` nor `--heavy` is given. */
type TestSuite = "sanity" | "heavy" | "all";

/** A scripted conversation of one or more turns, each executed as a separate `opencode run` invocation. */
interface MultiTurnTest {
  /** Unique test name; matched by `--test`/`--name` and embedded in the session title. */
  name: string;
  /** Default model, used for plain-string turns and for `TurnConfig` turns without their own `model`. */
  model: string;
  category: Category;
  suite: TestSuite;
  /** Prompts in execution order; a `TurnConfig` entry may override the model for that turn. */
  turns: (string | TurnConfig)[];
  /** Substrings searched for (case-insensitively) in each turn's stderr; any hit fails the test. */
  errorPatterns: string[];
  /** Per-turn timeout in milliseconds (applied to every turn, not to the whole test). */
  timeout: number;
  /** NOTE(review): not read anywhere in the visible code — confirm whether it is still needed. */
  expectError?: string;
}

/** One turn of a multi-turn test, optionally pinned to a specific model. */
interface TurnConfig {
  /** Prompt text sent to `opencode run`. */
  prompt: string;
  /** Model for this turn; when omitted the owning test's `model` is used. */
  model?: string;
}

/** Outcome of a single multi-turn or concurrent test run. */
interface TestResult {
  success: boolean;
  /** Human-readable failure reason; only set when `success` is false. */
  error?: string;
  /** Wall-clock duration in milliseconds. */
  duration: number;
  /** Number of turns (or, for concurrent tests, requests) counted as completed. */
  turnsCompleted: number;
  /** Session id reported by the multi-turn runner so the caller can delete the session afterwards. */
  sessionId?: string;
}

/** A test that fires several independent single-turn requests at the same time. */
interface ConcurrentTest {
  /** Unique test name; also embedded in each request's session title. */
  name: string;
  category: "concurrency";
  suite: TestSuite;
  /** Number of `opencode run` processes spawned in parallel. */
  concurrentRequests: number;
  /** Model used for every request of this test. */
  model: string;
  /** Base prompt; the runner appends a "(request N)" suffix per request. */
  prompt: string;
  /** Substrings searched for (case-insensitively) in each request's stderr; any hit fails the test. */
  errorPatterns: string[];
  /** Per-request timeout in milliseconds. */
  timeout: number;
}

/**
 * Default set of stderr substrings that mark a regression.
 * The runners compare them case-insensitively against each turn's stderr.
 */
const ERROR_PATTERNS = [
  "thinking block order",
  "Expected thinking or redacted_thinking",
  "tool_use ids were found without tool_result",
  "tool_result_missing",
  "thinking_disabled_violation",
  "orphaned tool_use",
  "must start with thinking block",
  "error: tool_use without matching tool_result",
  "cannot be modified",
  "must remain as they were",
];

// Model identifiers passed to `opencode run --model`.
const GEMINI_FLASH = "google/antigravity-gemini-3-flash";
// NOTE(review): not referenced anywhere in the visible code — confirm whether a test should use it.
const GEMINI_FLASH_CLI_QUOTA = "google/gemini-2.5-flash";
const CLAUDE_SONNET = "google/antigravity-claude-sonnet-4-6";
const CLAUDE_OPUS = "google/antigravity-claude-opus-4-6-thinking-low";

/**
 * Sanity suite: short conversations (1-3 turns) selected with `--sanity`.
 * Each `timeout` is applied per turn by the multi-turn runner.
 */
const SANITY_TESTS: MultiTurnTest[] = [
  {
    name: "thinking-tool-use",
    model: CLAUDE_SONNET,
    category: "thinking-order",
    suite: "sanity",
    turns: ["Read package.json and tell me the package name"],
    errorPatterns: ERROR_PATTERNS,
    timeout: 90000,
  },
  {
    name: "thinking-bash-tool",
    model: CLAUDE_SONNET,
    category: "thinking-order",
    suite: "sanity",
    turns: ["Run: echo 'hello' and tell me the output"],
    errorPatterns: ERROR_PATTERNS,
    timeout: 90000,
  },
  {
    name: "tool-pairing-sequential",
    model: CLAUDE_SONNET,
    category: "tool-pairing",
    suite: "sanity",
    turns: ["Run: echo 'first'", "Run: echo 'second'"],
    errorPatterns: ERROR_PATTERNS,
    timeout: 120000,
  },
  {
    name: "opus-thinking-basic",
    model: CLAUDE_OPUS,
    category: "thinking-order",
    suite: "sanity",
    turns: ["What is 7 * 8? Use bash to verify: echo $((7*8))"],
    errorPatterns: ERROR_PATTERNS,
    timeout: 120000,
  },
  {
    name: "thinking-modification-continue",
    model: CLAUDE_SONNET,
    category: "thinking-order",
    suite: "sanity",
    turns: [
      "Read package.json and tell me the version",
      "Now read tsconfig.json and tell me the target",
      "Compare the two files briefly",
    ],
    errorPatterns: ERROR_PATTERNS,
    timeout: 120000,
  },
  {
    // Alternates models between turns of the same session.
    name: "multi-provider-switch",
    model: GEMINI_FLASH,
    category: "multi-provider",
    suite: "sanity",
    turns: [
      { prompt: "What is 2+2? Answer briefly.", model: GEMINI_FLASH },
      { prompt: "What is 3+3? Answer briefly.", model: CLAUDE_SONNET },
      { prompt: "What is 4+4? Answer briefly.", model: GEMINI_FLASH },
    ],
    errorPatterns: ERROR_PATTERNS,
    timeout: 180000,
  },
  {
    // Uses its own crash-style patterns instead of ERROR_PATTERNS.
    name: "prompt-too-long-recovery",
    model: GEMINI_FLASH,
    category: "error-handling",
    suite: "sanity",
    turns: ["Reply with exactly: OK", "Repeat the word 'test' 50000 times"],
    errorPatterns: ["FATAL", "unhandled", "Cannot read properties"],
    timeout: 60000,
  },
];

/**
 * Heavy suite: long conversations selected with `--heavy`.
 * Each `timeout` is applied per turn by the multi-turn runner.
 */
const HEAVY_TESTS: MultiTurnTest[] = [
  {
    name: "stress-8-turn-multi-provider",
    model: GEMINI_FLASH,
    category: "stress",
    suite: "heavy",
    turns: [
      { prompt: "Read package.json and tell me the name", model: GEMINI_FLASH },
      { prompt: "Now read tsconfig.json and tell me the target", model: CLAUDE_SONNET },
      { prompt: "Run: ls -la src/plugin | head -5", model: GEMINI_FLASH },
      { prompt: "Read src/plugin/auth.ts and summarize in 1 sentence", model: CLAUDE_SONNET },
      { prompt: "Run: wc -l src/plugin/*.ts | tail -3", model: GEMINI_FLASH },
      { prompt: "Read README.md first 50 lines and tell me what this project does", model: CLAUDE_SONNET },
      { prompt: "Run: git log --oneline -3", model: GEMINI_FLASH },
      { prompt: "Summarize everything we discussed in 3 bullet points", model: CLAUDE_SONNET },
    ],
    errorPatterns: ERROR_PATTERNS,
    timeout: 600000,
  },
  {
    name: "opencode-tools-comprehensive",
    model: CLAUDE_SONNET,
    category: "multi-tool",
    suite: "heavy",
    turns: [
      "Use glob to find all *.ts files in src/plugin directory",
      "Use grep to search for 'async function' in src/plugin/auth.ts",
      "Use bash to run: echo 'test123' && pwd",
      "Use read to read the first 20 lines of package.json",
      "Use lsp_diagnostics on src/plugin/auth.ts to check for errors",
      "Use glob to find all test files matching *.test.ts",
    ],
    errorPatterns: ERROR_PATTERNS,
    timeout: 480000,
  },
  {
    name: "stress-20-turn-recovery",
    model: GEMINI_FLASH,
    category: "stress",
    suite: "heavy",
    turns: [
      { prompt: "Read package.json and extract the version number only", model: GEMINI_FLASH },
      { prompt: "Run: ls src/plugin/*.ts | head -3", model: CLAUDE_SONNET },
      { prompt: "Read src/plugin/auth.ts first 30 lines", model: GEMINI_FLASH },
      { prompt: "Use grep to find 'export' in src/plugin/auth.ts", model: CLAUDE_SONNET },
      { prompt: "Run: echo 'checkpoint 1' && date", model: GEMINI_FLASH },
      { prompt: "Read tsconfig.json and tell me the module type", model: CLAUDE_SONNET },
      { prompt: "Use glob to find all *.test.ts files", model: GEMINI_FLASH },
      { prompt: "Read src/plugin/token.ts first 20 lines", model: CLAUDE_SONNET },
      { prompt: "Run: wc -l src/plugin/*.ts | sort -n | tail -5", model: GEMINI_FLASH },
      { prompt: "What files have we read so far? List them.", model: CLAUDE_SONNET },
      { prompt: "Read src/plugin/request.ts first 25 lines", model: GEMINI_FLASH },
      { prompt: "Use grep to find 'async' in src/plugin/request.ts", model: CLAUDE_SONNET },
      { prompt: "Run: echo 'checkpoint 2' && pwd", model: GEMINI_FLASH },
      { prompt: "Read src/plugin/storage.ts first 20 lines", model: CLAUDE_SONNET },
      { prompt: "Use lsp_diagnostics on src/plugin/token.ts", model: GEMINI_FLASH },
      { prompt: "Read vitest.config.ts completely", model: CLAUDE_SONNET },
      { prompt: "Run: git status --short | head -5", model: GEMINI_FLASH },
      { prompt: "Read src/constants.ts completely", model: CLAUDE_SONNET },
      { prompt: "Run: echo 'final checkpoint' && echo 'all done'", model: GEMINI_FLASH },
      { prompt: "Summarize this entire conversation in 5 bullet points", model: CLAUDE_SONNET },
    ],
    errorPatterns: ERROR_PATTERNS,
    timeout: 900000,
  },
  {
    name: "stress-50-turn-endurance",
    model: GEMINI_FLASH,
    category: "stress",
    suite: "heavy",
    // Generated script: 50 templated turns plus one closing summary turn (51 total).
    // The call works here because the function declaration below is hoisted.
    turns: generateEnduranceTest(50),
    errorPatterns: ERROR_PATTERNS,
    timeout: 1800000,
  },
];

/**
 * Builds the turn list for an endurance test.
 *
 * Cycles through ten fixed prompt templates (alternating Gemini Flash and
 * Claude Sonnet), substituting the 1-based turn number for both the `{i}` and
 * `{n}` placeholders, then appends one closing summary turn.
 *
 * @param turnCount Number of templated turns to generate.
 * @returns `turnCount` templated turns followed by the summary turn.
 */
function generateEnduranceTest(turnCount: number): TurnConfig[] {
  const templates = [
    { prompt: "What is {n} + {n}? Answer with just the number.", model: GEMINI_FLASH },
    { prompt: "Run: echo 'turn {i}'", model: CLAUDE_SONNET },
    { prompt: "Read package.json and tell me one field", model: GEMINI_FLASH },
    { prompt: "Run: pwd && echo 'ok'", model: CLAUDE_SONNET },
    { prompt: "What turn number are we on? Just say the number.", model: GEMINI_FLASH },
    { prompt: "Run: date +%H:%M:%S", model: CLAUDE_SONNET },
    { prompt: "Use glob to find one .ts file in src/", model: GEMINI_FLASH },
    { prompt: "Run: echo 'checkpoint {i}'", model: CLAUDE_SONNET },
    { prompt: "Read tsconfig.json and tell me target", model: GEMINI_FLASH },
    { prompt: "What have we done in last 3 turns? Brief answer.", model: CLAUDE_SONNET },
  ];

  const script: TurnConfig[] = [];
  let position = 0;
  while (position < turnCount) {
    const { prompt: pattern, model } = templates[position % templates.length]!;
    // Both placeholders resolve to the same 1-based turn number.
    const ordinal = String(position + 1);
    script.push({ prompt: pattern.replace(/\{(?:i|n)\}/g, ordinal), model });
    position += 1;
  }

  const closingTurn: TurnConfig = {
    prompt: `We completed ${turnCount} turns. Summarize this session in 3 sentences.`,
    model: CLAUDE_SONNET,
  };
  script.push(closingTurn);

  return script;
}

/** Extra stderr substrings checked by the concurrent tests in addition to ERROR_PATTERNS. */
const RATE_LIMIT_ERROR_PATTERNS = [
  "false alarm",
  "incorrectly marked as rate limited",
  "wrong quota",
];

/**
 * Concurrency tests: each entry spawns `concurrentRequests` parallel
 * single-turn requests against one model.
 */
const CONCURRENT_TESTS: ConcurrentTest[] = [
  {
    name: "concurrent-5-same-model",
    category: "concurrency",
    suite: "heavy",
    concurrentRequests: 5,
    model: GEMINI_FLASH,
    prompt: "What is 2+2? Answer with just the number.",
    errorPatterns: [...ERROR_PATTERNS, ...RATE_LIMIT_ERROR_PATTERNS],
    timeout: 120000,
  },
  {
    // NOTE(review): despite the name, every request uses the single `model` below — confirm intent.
    name: "concurrent-3-mixed-models",
    category: "concurrency",
    suite: "heavy",
    concurrentRequests: 3,
    model: GEMINI_FLASH,
    prompt: "Say hello in one word.",
    errorPatterns: [...ERROR_PATTERNS, ...RATE_LIMIT_ERROR_PATTERNS],
    timeout: 120000,
  },
  {
    name: "concurrent-10-antigravity-heavy",
    category: "concurrency",
    suite: "heavy",
    concurrentRequests: 10,
    model: GEMINI_FLASH,
    prompt: "What is 1+1? Answer with just the number.",
    errorPatterns: [...ERROR_PATTERNS, ...RATE_LIMIT_ERROR_PATTERNS],
    timeout: 180000,
  },
];

// Every multi-turn test (sanity first, then heavy); concurrent tests are kept separately in CONCURRENT_TESTS.
const ALL_TESTS = [...SANITY_TESTS, ...HEAVY_TESTS];

/**
 * Runs one `opencode run` invocation as a child process and collects its output.
 *
 * With a `sessionId` the turn continues that session (`--session`); without one
 * a new session is started with `--title sessionTitle`. The promise never
 * rejects: spawn failures are reported as `code: 1` with the error message in
 * `stderr`.
 *
 * @param prompt Prompt text for this turn.
 * @param model Model identifier passed to `--model`.
 * @param sessionId Existing session to continue, or `null` to start a new one.
 * @param sessionTitle Title for a newly created session (ignored when continuing).
 * @param timeout Milliseconds before the child is sent SIGTERM.
 * @returns Captured stdout/stderr, the exit code (`1` when the child was killed
 *   by a signal or failed to spawn) and the session id: the caller's id when
 *   one was given, otherwise whatever could be extracted from the output.
 */
async function runTurn(
  prompt: string,
  model: string,
  sessionId: string | null,
  sessionTitle: string,
  timeout: number
): Promise<{ output: string; stderr: string; code: number; sessionId: string | null }> {
  // Grace period between SIGTERM and SIGKILL so a child that ignores SIGTERM
  // cannot hang the whole suite.
  const killGraceMs = 5000;
  // Takes the first token following the word "session".
  // NOTE(review): depends on how opencode prints the session id — confirm against its output.
  const sessionPattern = /session[:\s]+([a-zA-Z0-9_-]+)/i;

  return new Promise((resolve) => {
    const args = sessionId
      ? ["run", prompt, "--session", sessionId, "--model", model]
      : ["run", prompt, "--model", model, "--title", sessionTitle];

    const proc = spawn("opencode", args, {
      stdio: ["ignore", "pipe", "pipe"],
      cwd: process.cwd(),
    });

    let stdout = "";
    let stderr = "";

    proc.stdout?.on("data", (data) => {
      stdout += data.toString();
    });

    proc.stderr?.on("data", (data) => {
      stderr += data.toString();
    });

    let forceKillId: ReturnType<typeof setTimeout> | undefined;
    const timeoutId = setTimeout(() => {
      proc.kill("SIGTERM");
      forceKillId = setTimeout(() => {
        proc.kill("SIGKILL");
      }, killGraceMs);
    }, timeout);

    const clearTimers = (): void => {
      clearTimeout(timeoutId);
      if (forceKillId !== undefined) {
        clearTimeout(forceKillId);
      }
    };

    proc.on("close", (code) => {
      clearTimers();

      let extractedSessionId = sessionId;
      if (!extractedSessionId) {
        const match = stdout.match(sessionPattern) || stderr.match(sessionPattern);
        if (match) {
          extractedSessionId = match[1] ?? null;
        }
      }

      resolve({
        output: stdout,
        stderr: stderr,
        // `code` is null when the child was terminated by a signal (e.g. our timeout).
        code: code ?? 1,
        sessionId: extractedSessionId,
      });
    });

    proc.on("error", (err) => {
      clearTimers();
      resolve({
        output: "",
        stderr: err.message,
        code: 1,
        // Keep the caller's session id so an existing session is not lost
        // (and can still be cleaned up) when the process fails to start.
        sessionId,
      });
    });
  });
}

/**
 * Best-effort deletion of an opencode session via `opencode session delete --force`.
 *
 * Resolves when the child process closes or fails to start; failures are
 * deliberately ignored because this is only cleanup. The spawn `timeout`
 * option bounds the child to 10 seconds.
 *
 * @param sessionId Id of the session to delete.
 */
async function deleteSession(sessionId: string): Promise<void> {
  return new Promise((resolve) => {
    const proc = spawn("opencode", ["session", "delete", sessionId, "--force"], {
      // The output is never read, so discard it instead of piping it: an
      // unread pipe can stall the child once the pipe buffer fills up.
      stdio: "ignore",
      timeout: 10000,
      cwd: process.cwd(),
    });

    proc.on("close", () => resolve());
    proc.on("error", () => resolve());
  });
}

/**
 * Runs one concurrency test: spawns all requests at once, waits for every one
 * to finish, then evaluates the results.
 *
 * The test fails when any request's stderr contains one of the test's error
 * patterns (case-insensitive), or when more than half of the requests exit
 * with a non-zero code. Sessions discovered in the responses are deleted
 * before the result is returned, whatever the outcome.
 *
 * @param test Concurrency test definition.
 * @returns The test outcome; `turnsCompleted` counts successful requests
 *   (0 when an error pattern was found).
 */
async function runConcurrentTest(test: ConcurrentTest): Promise<TestResult> {
  const startedAt = Date.now();

  process.stdout.write(`  Spawning ${test.concurrentRequests} concurrent requests...`);

  const inFlight = Array.from({ length: test.concurrentRequests }, (_, slot) =>
    runTurn(
      `${test.prompt} (request ${slot + 1})`,
      test.model,
      null,
      `concurrent-${test.name}-${slot}`,
      test.timeout
    )
  );
  const responses = await Promise.all(inFlight);
  process.stdout.write("\r" + " ".repeat(60) + "\r");

  const createdSessions: string[] = responses.flatMap((response) =>
    response.sessionId ? [response.sessionId] : []
  );

  // Sessions are removed one after another, before the duration is measured.
  const removeSessions = async (): Promise<void> => {
    for (const id of createdSessions) {
      await deleteSession(id);
    }
  };

  // Scan response by response, pattern by pattern; the first hit wins.
  let offendingPattern: string | undefined;
  for (const response of responses) {
    const haystack = response.stderr.toLowerCase();
    offendingPattern = test.errorPatterns.find((candidate) =>
      haystack.includes(candidate.toLowerCase())
    );
    if (offendingPattern !== undefined) {
      break;
    }
  }

  if (offendingPattern !== undefined) {
    await removeSessions();
    return {
      success: false,
      error: `Found error pattern "${offendingPattern}" in concurrent response`,
      duration: Date.now() - startedAt,
      turnsCompleted: 0,
    };
  }

  const unsuccessful = responses.filter((response) => response.code !== 0);
  const tooManyFailures = unsuccessful.length > test.concurrentRequests / 2;

  if (tooManyFailures) {
    await removeSessions();
    const [first] = unsuccessful;
    let details = "";
    if (first) {
      details = `\n    First failure stderr: ${first.stderr.slice(0, 500)}`;
    }
    return {
      success: false,
      error: `${unsuccessful.length}/${test.concurrentRequests} requests failed${details}`,
      duration: Date.now() - startedAt,
      turnsCompleted: test.concurrentRequests - unsuccessful.length,
    };
  }

  await removeSessions();

  return {
    success: true,
    duration: Date.now() - startedAt,
    turnsCompleted: test.concurrentRequests,
  };
}

/**
 * Runs a multi-turn test: executes the turns sequentially in one session and
 * stops at the first failure.
 *
 * A turn fails when its stderr contains one of the test's error patterns
 * (case-insensitive), or when it exits non-zero after running for roughly the
 * whole per-turn timeout.
 *
 * @param test Multi-turn test definition.
 * @returns The outcome, including the session id (when one is known) so the
 *   caller can delete the session — also on failure.
 */
async function runMultiTurnTest(test: MultiTurnTest): Promise<TestResult> {
  const start = Date.now();
  let sessionId: string | null = null;
  let turnsCompleted = 0;

  const clearProgressLine = (): void => {
    process.stdout.write("\r" + " ".repeat(50) + "\r");
  };

  const fail = (error: string): TestResult => {
    clearProgressLine();
    return {
      success: false,
      error,
      duration: Date.now() - start,
      turnsCompleted,
      sessionId: sessionId ?? undefined,
    };
  };

  for (let index = 0; index < test.turns.length; index++) {
    const turn = test.turns[index]!;
    const prompt = typeof turn === "string" ? turn : turn.prompt;
    const model = typeof turn === "string" ? test.model : (turn.model ?? test.model);
    const turnStart = Date.now();

    process.stdout.write(`\r  Progress: ${index + 1}/${test.turns.length} turns...`);

    const result = await runTurn(
      prompt,
      model,
      sessionId,
      `regression-${test.name}`,
      test.timeout
    );

    // Record the session before evaluating the turn, so a session created by
    // a failing turn is still reported for cleanup. A missing id never
    // overwrites one we already know.
    sessionId = result.sessionId ?? sessionId;

    const stderrLower = result.stderr.toLowerCase();
    const matchedPattern = test.errorPatterns.find((pattern) =>
      stderrLower.includes(pattern.toLowerCase())
    );
    if (matchedPattern !== undefined) {
      return fail(`Turn ${index + 1}: Found error pattern "${matchedPattern}"`);
    }

    if (result.code !== 0) {
      // Heuristic: a non-zero exit that took (almost) the full timeout is
      // treated as our own timeout kill.
      const isTimeout = Date.now() - turnStart >= test.timeout - 1000;
      if (isTimeout) {
        return fail(`Turn ${index + 1}: Timeout after ${test.timeout}ms`);
      }
      // NOTE(review): other non-zero exits are not treated as failures; only
      // error patterns and timeouts are — confirm this is intended.
    }

    turnsCompleted++;
  }

  clearProgressLine();
  return {
    success: true,
    duration: Date.now() - start,
    turnsCompleted,
    sessionId: sessionId ?? undefined,
  };
}

/**
 * Parses the command-line arguments (`process.argv` after the script path).
 *
 * Recognised options: `--sanity`, `--heavy` (wins when both are given),
 * `--test <name>` / `--name <name>`, `--category <cat>`, `--dry-run`,
 * `--help` / `-h`. An option whose value is missing — or is itself another
 * flag — is treated as not set.
 *
 * @returns The parsed options; `suite` defaults to `"all"`.
 * @throws Error when `--category` names an unknown category.
 */
function parseArgs(): {
  filterName: string | null;
  filterCategory: Category | null;
  suite: TestSuite;
  dryRun: boolean;
  help: boolean;
} {
  const args = process.argv.slice(2);

  const knownCategories: readonly Category[] = [
    "thinking-order",
    "tool-pairing",
    "multi-tool",
    "multi-provider",
    "error-handling",
    "stress",
    "concurrency",
  ];

  const isCategory = (value: string): value is Category =>
    (knownCategories as readonly string[]).includes(value);

  const getArg = (flag: string): string | null => {
    const idx = args.indexOf(flag);
    if (idx === -1) {
      return null;
    }
    const value = args[idx + 1];
    // Do not swallow the next flag as this option's value (e.g. `--test --dry-run`).
    return value !== undefined && !value.startsWith("-") ? value : null;
  };

  let suite: TestSuite = "all";
  if (args.includes("--sanity")) suite = "sanity";
  if (args.includes("--heavy")) suite = "heavy";

  const rawCategory = getArg("--category");
  if (rawCategory !== null && !isCategory(rawCategory)) {
    throw new Error(
      `Unknown category "${rawCategory}". Valid categories: ${knownCategories.join(", ")}`
    );
  }

  return {
    filterName: getArg("--test") ?? getArg("--name"),
    filterCategory: rawCategory,
    suite,
    dryRun: args.includes("--dry-run"),
    help: args.includes("--help") || args.includes("-h"),
  };
}

/**
 * Prints the command-line usage text to stdout: the suites, the individual
 * tests (including the concurrent ones), the category values and all options.
 */
function showHelp(): void {
  console.log(`
Multi-Turn Regression Test Suite for Antigravity Plugin

Test Suites:
  --sanity    Quick tests (7 tests, ~5 min) - run frequently
  --heavy     Stress tests (4 tests, ~30 min) - long conversations,
              followed by the 3 concurrent tests
  (default)   All tests (sanity + heavy + concurrent)

Tests:
  Sanity (quick, repeatable):
    - thinking-tool-use, thinking-bash-tool, tool-pairing-sequential
    - opus-thinking-basic, thinking-modification-continue
    - multi-provider-switch, prompt-too-long-recovery

  Heavy (stress, endurance):
    - stress-8-turn-multi-provider (8 turns)
    - opencode-tools-comprehensive (6 turns, all tools)
    - stress-20-turn-recovery (20 turns, multi-model, recovery)
    - stress-50-turn-endurance (51 turns, endurance test)

  Concurrent (run with --heavy or the default suite, or alone via --category concurrency):
    - concurrent-5-same-model (5 parallel requests)
    - concurrent-3-mixed-models (3 parallel requests)
    - concurrent-10-antigravity-heavy (10 parallel requests)

Categories:
  thinking-order, tool-pairing, multi-tool, multi-provider,
  error-handling, stress, concurrency

Usage:
  npx tsx script/test-regression.ts [options]

Options:
  --sanity              Run sanity tests only (quick)
  --heavy               Run heavy tests only (stress)
  --test <name>         Run specific test by name (alias: --name)
  --category <cat>      Run tests by category
  --dry-run             List tests without running
  --help, -h            Show this help

Examples:
  npx tsx script/test-regression.ts --sanity
  npx tsx script/test-regression.ts --heavy
  npx tsx script/test-regression.ts --test stress-20-turn-recovery
  npx tsx script/test-regression.ts --category concurrency
`);
}

/**
 * CLI entry point: selects the tests requested on the command line, runs the
 * multi-turn tests and then the concurrent tests, and prints a summary.
 *
 * Selection rules:
 * - Multi-turn tests come from the chosen suite, narrowed by `--test` and
 *   `--category`.
 * - Concurrent tests are in scope for the heavy/default suites, for
 *   `--category concurrency`, and whenever a test name is given; they are
 *   narrowed by `--test` and dropped when another category is requested.
 *
 * Exits the process with code 1 when nothing matches the filters or when any
 * test fails.
 */
async function main(): Promise<void> {
  const { filterName, filterCategory, suite, dryRun, help } = parseArgs();

  if (help) {
    showHelp();
    return;
  }

  const runConcurrentOnly = filterCategory === "concurrency";

  let tests: MultiTurnTest[];
  switch (suite) {
    case "sanity":
      tests = SANITY_TESTS;
      break;
    case "heavy":
      tests = HEAVY_TESTS;
      break;
    default:
      tests = ALL_TESTS;
  }

  if (filterName) {
    tests = tests.filter((t) => t.name === filterName);
  }
  if (runConcurrentOnly) {
    tests = [];
  } else if (filterCategory) {
    tests = tests.filter((t) => t.category === filterCategory);
  }

  // Resolve the concurrent selection up front so that the "nothing matched"
  // check, the header and the dry run all see what will actually run.
  const concurrentInScope =
    suite === "heavy" || suite === "all" || runConcurrentOnly || Boolean(filterName);
  let concurrentTests: ConcurrentTest[] = concurrentInScope ? CONCURRENT_TESTS : [];
  if (filterName) {
    concurrentTests = concurrentTests.filter((t) => t.name === filterName);
  }
  if (filterCategory && !runConcurrentOnly) {
    // Every concurrent test has category "concurrency", so any other category excludes them.
    concurrentTests = [];
  }

  if (tests.length === 0 && concurrentTests.length === 0) {
    console.error("No tests match the specified filters");
    process.exit(1);
  }

  const totalTurns = tests.reduce((sum, t) => sum + t.turns.length, 0);
  const concurrentCount = concurrentTests.reduce((sum, t) => sum + t.concurrentRequests, 0);
  console.log(`\n🧪 Regression Tests [${suite.toUpperCase()}] (${tests.length} tests, ${totalTurns} turns + ${concurrentCount} concurrent)\n${"=".repeat(60)}\n`);

  if (dryRun) {
    console.log("Tests to run:\n");
    for (const test of tests) {
      console.log(`  ${test.name} [${test.suite}]`);
      console.log(`    Model: ${test.model}`);
      console.log(`    Category: ${test.category}`);
      console.log(`    Turns: ${test.turns.length}`);
      console.log();
    }
    for (const test of concurrentTests) {
      console.log(`  ${test.name} [concurrent]`);
      console.log(`    Model: ${test.model}`);
      console.log(`    Category: ${test.category}`);
      console.log(`    Concurrent: ${test.concurrentRequests} requests`);
      console.log();
    }
    return;
  }

  // Only the name is needed for the summary, so both test kinds share one list.
  const results: { name: string; result: TestResult }[] = [];

  for (const test of tests) {
    console.log(`Testing: ${test.name} [${test.suite}]`);
    console.log(`  Model: ${test.model}`);
    console.log(`  Turns: ${test.turns.length}`);

    const result = await runMultiTurnTest(test);
    results.push({ name: test.name, result });

    if (result.success) {
      console.log(`  Status: ✅ PASS (${result.turnsCompleted}/${test.turns.length} turns, ${(result.duration / 1000).toFixed(1)}s)`);
    } else {
      console.log(`  Status: ❌ FAIL`);
      console.log(`    Error: ${result.error}`);
      console.log(`    Completed: ${result.turnsCompleted}/${test.turns.length} turns`);
    }

    // Clean up the session the test used, whether it passed or failed.
    if (result.sessionId) {
      await deleteSession(result.sessionId);
    }
    console.log();
  }

  if (concurrentTests.length > 0) {
    console.log(`\n🔄 Concurrent Tests (${concurrentTests.length} tests)\n${"-".repeat(40)}\n`);
    for (const test of concurrentTests) {
      console.log(`Testing: ${test.name} [concurrent]`);
      console.log(`  Model: ${test.model}`);
      console.log(`  Concurrent: ${test.concurrentRequests} requests`);

      const result = await runConcurrentTest(test);
      results.push({ name: test.name, result });

      if (result.success) {
        console.log(`  Status: ✅ PASS (${result.turnsCompleted} requests, ${(result.duration / 1000).toFixed(1)}s)`);
      } else {
        console.log(`  Status: ❌ FAIL`);
        console.log(`    Error: ${result.error}`);
      }
      console.log();
    }
  }

  const failures = results.filter((r) => !r.result.success);
  const passed = results.length - failures.length;
  const totalTime = results.reduce((sum, r) => sum + r.result.duration, 0);

  console.log("=".repeat(60));
  console.log(`\nSummary: ${passed} passed, ${failures.length} failed (${(totalTime / 1000).toFixed(1)}s total)\n`);

  if (failures.length > 0) {
    console.log("Failed tests:");
    for (const r of failures) {
      console.log(`  ❌ ${r.name}: ${r.result.error}`);
    }
    process.exit(1);
  }
}

// Script entry point: run the CLI and turn any unexpected rejection into a
// logged error plus a non-zero exit code.
main().then(undefined, (fatal: unknown) => {
  console.error("Fatal error:", fatal);
  process.exit(1);
});