feat: add E2E testing scripts and simplify Gemini Flash model config

- Add test-models.ts for validating all supported model endpoints
- Add test-regression.ts for multi-turn regression testing (Issue #50)
- Consolidate Gemini 3 Flash variants (low/medium/high) into single model
- Fix schema structure by flattening nested signature_cache properties
- Extract streaming transformer utilities to dedicated module
This commit is contained in:
tctinh
2025-12-28 00:04:57 +07:00
parent ba2891bf57
commit 16f4bb07a1
16 changed files with 2905 additions and 356 deletions

175
script/test-models.ts Normal file
View File

@@ -0,0 +1,175 @@
#!/usr/bin/env npx tsx
import { spawn } from "child_process";
/** One entry of the model matrix: a model identifier plus the group it is listed under. */
interface ModelTest {
/** Model identifier passed to `opencode run --model`. */
model: string;
/** Group label; the `--category` CLI option filters on this value. */
category: "gemini-cli" | "antigravity-gemini" | "antigravity-claude" | "antigravity-gpt";
}
/**
 * Every model this script knows how to exercise, grouped by category.
 * main() filters this list with --model / --category and tests the remaining
 * entries one at a time, in the order they are listed here.
 */
const MODELS: ModelTest[] = [
// Gemini CLI (direct Google API)
{ model: "google/gemini-3-flash-preview", category: "gemini-cli" },
{ model: "google/gemini-3-pro-preview", category: "gemini-cli" },
{ model: "google/gemini-2.5-pro", category: "gemini-cli" },
{ model: "google/gemini-2.5-flash", category: "gemini-cli" },
// Antigravity Gemini
{ model: "google/antigravity-gemini-3-pro-low", category: "antigravity-gemini" },
{ model: "google/antigravity-gemini-3-pro-high", category: "antigravity-gemini" },
{ model: "google/antigravity-gemini-3-flash", category: "antigravity-gemini" },
// Antigravity Claude
{ model: "google/antigravity-claude-sonnet-4-5", category: "antigravity-claude" },
{ model: "google/antigravity-claude-sonnet-4-5-thinking-low", category: "antigravity-claude" },
{ model: "google/antigravity-claude-sonnet-4-5-thinking-medium", category: "antigravity-claude" },
{ model: "google/antigravity-claude-sonnet-4-5-thinking-high", category: "antigravity-claude" },
{ model: "google/antigravity-claude-opus-4-5-thinking-low", category: "antigravity-claude" },
{ model: "google/antigravity-claude-opus-4-5-thinking-medium", category: "antigravity-claude" },
{ model: "google/antigravity-claude-opus-4-5-thinking-high", category: "antigravity-claude" },
// Antigravity GPT
{ model: "google/antigravity-gpt-oss-120b-medium", category: "antigravity-gpt" },
];
// Prompt sent to every model under test.
const TEST_PROMPT = "Reply with exactly one word: WORKING";
// Per-model timeout in milliseconds, used when --timeout is not supplied.
const DEFAULT_TIMEOUT_MS = 120_000;
/** Outcome of testing a single model. */
interface TestResult {
/** True when the model run was judged healthy. */
success: boolean;
/** Human-readable failure reason; only set when `success` is false. */
error?: string;
/** Wall-clock time of the run in milliseconds. */
duration: number;
}
/**
 * Runs `opencode run` once with TEST_PROMPT against `model` and reports whether
 * the model answered as instructed.
 *
 * A run passes only when the process exits with code 0 AND its stdout contains
 * the word "working" (case-insensitive). Previously a clean exit with any
 * output at all was reported as a pass, so a model that returned an unrelated
 * or empty reply went unnoticed.
 *
 * @param model     Model identifier passed to `--model`.
 * @param timeoutMs Milliseconds to wait before the child is killed with SIGKILL.
 * @returns A promise that always resolves (never rejects) with the outcome.
 */
async function testModel(model: string, timeoutMs: number): Promise<TestResult> {
  const start = Date.now();
  return new Promise((resolve) => {
    const proc = spawn("opencode", ["run", TEST_PROMPT, "--model", model], {
      stdio: ["ignore", "pipe", "pipe"],
    });
    let stdout = "";
    let stderr = "";

    // Hard kill on timeout. The later "close" event resolves again, which is a
    // harmless no-op because a promise settles only once.
    const timer = setTimeout(() => {
      proc.kill("SIGKILL");
      resolve({ success: false, error: `Timeout after ${timeoutMs}ms`, duration: Date.now() - start });
    }, timeoutMs);

    proc.stdout?.on("data", (data) => { stdout += data.toString(); });
    proc.stderr?.on("data", (data) => { stderr += data.toString(); });

    proc.on("close", (code) => {
      clearTimeout(timer);
      const duration = Date.now() - start;
      if (code !== 0) {
        // Keep the message short so one failing model does not flood the report.
        resolve({ success: false, error: `Exit ${code}: ${stderr || stdout}`.slice(0, 200), duration });
      } else if (stdout.toLowerCase().includes("working")) {
        resolve({ success: true, duration });
      } else {
        // Clean exit but the expected word is missing: the model did not answer as asked.
        const reply = stdout.trim() || stderr.trim() || "(empty)";
        resolve({ success: false, error: `Unexpected response: ${reply}`.slice(0, 200), duration });
      }
    });

    // Emitted when the process cannot be spawned at all (e.g. binary not found).
    proc.on("error", (err) => {
      clearTimeout(timer);
      resolve({ success: false, error: err.message, duration: Date.now() - start });
    });
  });
}
/**
 * Parses the command-line options of the model test script from `process.argv`.
 *
 * `--timeout` must be a positive integer number of milliseconds. A missing,
 * non-numeric, zero or negative value falls back to DEFAULT_TIMEOUT_MS instead
 * of letting NaN (or 0) reach `setTimeout`, which would make every model time
 * out almost immediately.
 *
 * @returns The model/category filters (null when absent), the dry-run and help
 *          switches, and the per-model timeout in milliseconds.
 */
function parseArgs(): { filterModel: string | null; filterCategory: string | null; dryRun: boolean; help: boolean; timeout: number } {
  const args = process.argv.slice(2);

  // Value that follows `flag`, or null when the flag or its value is missing.
  const valueOf = (flag: string): string | null => {
    const idx = args.indexOf(flag);
    return idx !== -1 ? args[idx + 1] ?? null : null;
  };

  const rawTimeout = valueOf("--timeout");
  const parsedTimeout = rawTimeout !== null ? Number.parseInt(rawTimeout, 10) : Number.NaN;
  const timeout = Number.isFinite(parsedTimeout) && parsedTimeout > 0 ? parsedTimeout : DEFAULT_TIMEOUT_MS;

  return {
    filterModel: valueOf("--model"),
    filterCategory: valueOf("--category"),
    dryRun: args.includes("--dry-run"),
    help: args.includes("--help") || args.includes("-h"),
    timeout,
  };
}
/** Writes the usage text of the model test script to stdout with a single console.log call. */
function printHelp(): void {
  // The leading and trailing empty entries reproduce the blank line before and after the text.
  const usageLines = [
    "",
    "E2E Model Test Script",
    "Usage:",
    "npx tsx script/test-models.ts [options]",
    "Options:",
    "--model <model> Test specific model",
    "--category <cat> Test by category (gemini-cli, antigravity-gemini, antigravity-claude, antigravity-gpt)",
    "--timeout <ms> Timeout per model (default: 120000)",
    "--dry-run List models without testing",
    "--help, -h Show this help",
    "Examples:",
    "npx tsx script/test-models.ts --dry-run",
    "npx tsx script/test-models.ts --model google/gemini-3-flash-preview",
    "npx tsx script/test-models.ts --category antigravity-claude",
    "",
  ];
  console.log(usageLines.join("\n"));
}
/**
 * Entry point of the model test script.
 *
 * Shows the help text when requested; otherwise narrows MODELS with the
 * --model / --category filters, then either lists the selection (--dry-run)
 * or tests each model sequentially and prints a pass/fail summary.
 * Exits the process with status 1 when at least one model failed.
 */
async function main(): Promise<void> {
  const options = parseArgs();
  if (options.help) {
    printHelp();
    return;
  }

  const { filterModel, filterCategory } = options;
  const selected = MODELS.filter((entry) => {
    // A model filter matches on the full identifier or on any suffix of it.
    if (filterModel && entry.model !== filterModel && !entry.model.endsWith(filterModel)) {
      return false;
    }
    return !filterCategory || entry.category === filterCategory;
  });

  if (selected.length === 0) {
    console.log("No models match the filter.");
    return;
  }

  const divider = "=".repeat(50);
  console.log(`\n🧪 E2E Model Tests (${selected.length} models)\n${divider}\n`);

  if (options.dryRun) {
    selected.forEach(({ model, category }) => {
      console.log(` ${model.padEnd(50)} [${category}]`);
    });
    console.log(`\n${selected.length} models would be tested.\n`);
    return;
  }

  const failures: { model: string; error: string }[] = [];
  let passCount = 0;

  // Models are tested one after another so the progress lines stay readable.
  for (const { model } of selected) {
    process.stdout.write(`Testing ${model.padEnd(50)} ... `);
    const outcome = await testModel(model, options.timeout);
    if (!outcome.success) {
      console.log(`❌ FAIL`);
      console.log(` ${outcome.error}`);
      failures.push({ model, error: outcome.error || "Unknown" });
      continue;
    }
    console.log(`✅ (${(outcome.duration / 1000).toFixed(1)}s)`);
    passCount += 1;
  }

  console.log(`\n${divider}`);
  console.log(`Summary: ${passCount} passed, ${failures.length} failed\n`);

  if (failures.length === 0) {
    return;
  }
  console.log("Failed models:");
  for (const { model } of failures) {
    console.log(` - ${model}`);
  }
  process.exit(1);
}
// Run the script. An unexpected rejection is logged AND turned into a non-zero
// exit status; logging alone would leave the exit code at 0, so a crashed run
// would look like a successful one to CI or shell callers.
main().catch((err: unknown) => {
  console.error("Fatal error:", err);
  process.exit(1);
});

334
script/test-regression.ts Normal file
View File

@@ -0,0 +1,334 @@
#!/usr/bin/env npx tsx
import { spawn } from "child_process";
/** Group a regression test belongs to; the `--category` CLI option filters on this value. */
type Category = "thinking-order" | "tool-pairing" | "multi-tool";
/** Definition of one regression scenario: a sequence of prompts sent to a single model. */
interface MultiTurnTest {
/** Unique test name; matched exactly by `--test` / `--name` and used in the session title. */
name: string;
/** Model identifier passed to `opencode run --model`. */
model: string;
/** Group used by the `--category` filter. */
category: Category;
/** Prompts to send, one `opencode run` invocation per entry, in order. */
turns: string[];
/** Substrings (matched case-insensitively) whose presence in a turn's output fails the test. */
errorPatterns: string[];
/** Timeout in milliseconds applied to each individual turn. */
timeout: number;
}
/** Outcome of running one MultiTurnTest. */
interface TestResult {
/** True when every turn finished without triggering a failure condition. */
success: boolean;
/** Human-readable failure reason; only set when `success` is false. */
error?: string;
/** Wall-clock time of the whole test in milliseconds. */
duration: number;
/** Number of turns that finished before the test ended (all of them on success). */
turnsCompleted: number;
}
/**
 * Error-message fragments that indicate one of the regressions this suite guards against.
 * Every entry in TESTS uses this list as its `errorPatterns`; runMultiTurnTest
 * lower-cases both the pattern and the combined stdout + stderr of a turn and
 * fails the test on the first substring match.
 */
const ERROR_PATTERNS = [
"thinking block order",
"Expected thinking or redacted_thinking",
"tool_use ids were found without tool_result",
"tool_result_missing",
"thinking_disabled_violation",
"orphaned tool_use",
"must start with thinking block",
"error: tool_use without matching tool_result",
"cannot be modified",
"must remain as they were",
];
/**
 * The regression scenarios run by this script.
 * Each entry lists the prompts to send (one per turn), the model to use and the
 * per-turn timeout in milliseconds. All entries share ERROR_PATTERNS.
 */
const TESTS: MultiTurnTest[] = [
// Issue #50: Thinking block order bug - simple single-turn tool use
{
name: "thinking-tool-use",
model: "google/antigravity-claude-sonnet-4-5-thinking-low",
category: "thinking-order",
turns: [
"Read package.json and tell me the package name",
],
errorPatterns: ERROR_PATTERNS,
timeout: 90000,
},
{
name: "thinking-bash-tool",
model: "google/antigravity-claude-sonnet-4-5-thinking-low",
category: "thinking-order",
turns: [
"Run: echo 'hello' and tell me the output",
],
errorPatterns: ERROR_PATTERNS,
timeout: 90000,
},
// Tool pairing - simple two-turn
{
name: "tool-pairing-sequential",
model: "google/antigravity-claude-sonnet-4-5-thinking-low",
category: "tool-pairing",
turns: [
"Run: echo 'first'",
"Run: echo 'second'",
],
errorPatterns: ERROR_PATTERNS,
timeout: 120000,
},
// Opus model basic test
{
name: "opus-thinking-basic",
model: "google/antigravity-claude-opus-4-5-thinking-low",
category: "thinking-order",
turns: [
"What is 7 * 8? Use bash to verify: echo $((7*8))",
],
errorPatterns: ERROR_PATTERNS,
timeout: 120000,
},
// Bug: "thinking blocks in latest assistant message cannot be modified"
// Tests multi-turn with thinking blocks to verify they're preserved unchanged
{
name: "thinking-modification-continue",
model: "google/antigravity-claude-sonnet-4-5-thinking-low",
category: "thinking-order",
turns: [
"Read package.json and tell me the version",
"Now read tsconfig.json and tell me the target",
"Compare the two files briefly",
],
errorPatterns: ERROR_PATTERNS,
timeout: 120000,
},
];
/**
 * Executes a single conversation turn by spawning `opencode run`.
 *
 * With a session id the turn continues that session (`--session`); without one
 * a new session is started and labelled with `sessionTitle` (`--title`).
 * The child is terminated with SIGTERM once `timeout` milliseconds have passed.
 *
 * @param prompt       Prompt text for this turn.
 * @param model        Model identifier passed to `--model`.
 * @param sessionId    Session to continue, or null to start a new one.
 * @param sessionTitle Title given to a newly created session.
 * @param timeout      Maximum run time of the child in milliseconds.
 * @returns Captured stdout (`output`) and stderr, the exit code (1 when the
 *          process reported none or could not be spawned) and the session id:
 *          the given one, or one scraped from the output when none was given.
 *          The promise never rejects.
 */
async function runTurn(
  prompt: string,
  model: string,
  sessionId: string | null,
  sessionTitle: string,
  timeout: number
): Promise<{ output: string; stderr: string; code: number; sessionId: string | null }> {
  const sessionIdPattern = /session[:\s]+([a-zA-Z0-9_-]+)/i;

  const cliArgs = ["run", prompt];
  if (sessionId) {
    cliArgs.push("--session", sessionId, "--model", model);
  } else {
    cliArgs.push("--model", model, "--title", sessionTitle);
  }

  return new Promise((resolve) => {
    const child = spawn("opencode", cliArgs, {
      stdio: ["ignore", "pipe", "pipe"],
      timeout,
      cwd: process.cwd(),
    });

    const outParts: string[] = [];
    const errParts: string[] = [];
    child.stdout?.on("data", (chunk) => {
      outParts.push(chunk.toString());
    });
    child.stderr?.on("data", (chunk) => {
      errParts.push(chunk.toString());
    });

    // Explicit kill timer in addition to the `timeout` spawn option above.
    const killTimer = setTimeout(() => {
      child.kill("SIGTERM");
    }, timeout);

    child.on("close", (exitCode) => {
      clearTimeout(killTimer);
      const capturedOut = outParts.join("");
      const capturedErr = errParts.join("");

      // Only look for a session id in the output when the caller did not supply one.
      const found = sessionId
        ? null
        : capturedOut.match(sessionIdPattern) || capturedErr.match(sessionIdPattern);
      const nextSessionId = found ? found[1] ?? null : sessionId;

      resolve({
        output: capturedOut,
        stderr: capturedErr,
        code: exitCode ?? 1,
        sessionId: nextSessionId,
      });
    });

    // Spawn failure (e.g. binary not found): report it as a failed turn.
    child.on("error", (spawnError) => {
      clearTimeout(killTimer);
      resolve({
        output: "",
        stderr: spawnError.message,
        code: 1,
        sessionId: null,
      });
    });
  });
}
/**
 * Runs every turn of `test` in order and reports the first failure.
 *
 * A turn fails when
 *  - its combined stdout + stderr contains one of `test.errorPatterns`
 *    (case-insensitive substring match), or
 *  - the process exits with a non-zero code. If the turn ran for roughly the
 *    whole timeout the failure is reported as a timeout, otherwise as a plain
 *    exit failure. Previously a non-zero exit that was not a timeout was
 *    silently counted as a completed turn, so a missing or crashing CLI
 *    produced a passing test.
 *
 * The session id returned by one turn is passed to the next turn; when no id
 * is returned, the next turn receives null.
 *
 * @param test Scenario to execute.
 * @returns Outcome with total duration (ms) and the number of completed turns.
 */
async function runMultiTurnTest(test: MultiTurnTest): Promise<TestResult> {
  const start = Date.now();
  let sessionId: string | null = null;
  let turnsCompleted = 0;

  // Builds a failure result for the given 1-based turn number.
  const failAt = (turn: number, reason: string): TestResult => ({
    success: false,
    error: `Turn ${turn}: ${reason}`,
    duration: Date.now() - start,
    turnsCompleted,
  });

  for (const [index, prompt] of test.turns.entries()) {
    const turnStart = Date.now();
    const result = await runTurn(
      prompt,
      test.model,
      sessionId,
      `regression-${test.name}`,
      test.timeout
    );

    const combined = (result.output + result.stderr).toLowerCase();
    const matched = test.errorPatterns.find((pattern) => combined.includes(pattern.toLowerCase()));
    if (matched !== undefined) {
      return failAt(index + 1, `Found error pattern "${matched}"`);
    }

    if (result.code !== 0) {
      // runTurn does not say why the process ended; treat a run that lasted
      // (almost) the full timeout as a timeout. The 1s slack is a heuristic.
      const isTimeout = Date.now() - turnStart >= test.timeout - 1000;
      if (isTimeout) {
        return failAt(index + 1, `Timeout after ${test.timeout}ms`);
      }
      const detail = (result.stderr || result.output).trim() || "(no output)";
      // Truncate so a long stack trace does not flood the report.
      return failAt(index + 1, `Exit ${result.code}: ${detail}`.slice(0, 200));
    }

    sessionId = result.sessionId;
    turnsCompleted++;
  }

  return {
    success: true,
    duration: Date.now() - start,
    turnsCompleted,
  };
}
/**
 * Reads the regression script's options from `process.argv`.
 *
 * `--test` and `--name` are interchangeable, with `--test` taking precedence.
 * The `--category` value is returned as given; it is not checked against the
 * known categories here.
 *
 * @returns The name and category filters (null when absent) plus the dry-run and help switches.
 */
function parseArgs(): {
  filterName: string | null;
  filterCategory: Category | null;
  dryRun: boolean;
  help: boolean;
} {
  const argv = process.argv.slice(2);

  // Returns the argument that follows `flag`, or null when the flag or its value is absent.
  function valueAfter(flag: string): string | null {
    const position = argv.indexOf(flag);
    if (position === -1) {
      return null;
    }
    const value = argv[position + 1];
    return value === undefined ? null : value;
  }

  const filterName = valueAfter("--test") ?? valueAfter("--name");
  const filterCategory = valueAfter("--category") as Category | null;
  const dryRun = argv.includes("--dry-run");
  const help = argv.includes("--help") || argv.includes("-h");

  return { filterName, filterCategory, dryRun, help };
}
/**
 * Prints the usage text of the regression test script to stdout.
 *
 * The `--test` example names a test that actually exists ("thinking-tool-use").
 * The previous example used "thinking-tool-use-basic", which matches nothing
 * because `--test` compares names exactly, so copying it ended in
 * "No tests match the specified filters".
 */
function showHelp(): void {
  console.log(`
Multi-Turn Regression Test Suite for Antigravity Plugin
Tests for known bugs:
- Issue #50: Thinking block order errors
- Tool pairing: tool_use without tool_result
- Multi-tool: Complex tool chains
Usage:
npx tsx script/test-regression.ts [options]
Options:
--test <name> Run specific test by name
--category <cat> Run tests by category (thinking-order|tool-pairing|multi-tool)
--dry-run List tests without running
--help, -h Show this help
Examples:
npx tsx script/test-regression.ts --dry-run
npx tsx script/test-regression.ts --category thinking-order
npx tsx script/test-regression.ts --test thinking-tool-use
`);
}
/**
 * Entry point of the regression script.
 *
 * Shows the help text when requested; otherwise narrows TESTS with the name /
 * category filters (exiting with status 1 when nothing matches), then either
 * lists the selection (--dry-run) or runs each test sequentially and prints a
 * summary. Exits with status 1 when at least one test failed.
 */
async function main(): Promise<void> {
  const options = parseArgs();
  if (options.help) {
    showHelp();
    return;
  }

  const selected = TESTS.filter((candidate) => {
    const nameMatches = !options.filterName || candidate.name === options.filterName;
    const categoryMatches = !options.filterCategory || candidate.category === options.filterCategory;
    return nameMatches && categoryMatches;
  });

  if (selected.length === 0) {
    console.error("No tests match the specified filters");
    process.exit(1);
  }

  const divider = "=".repeat(55);
  console.log(`\n🧪 Multi-Turn Regression Tests (${selected.length} tests)\n${divider}\n`);

  if (options.dryRun) {
    console.log("Tests to run:\n");
    selected.forEach((entry) => {
      console.log(` ${entry.name}`);
      console.log(` Model: ${entry.model}`);
      console.log(` Category: ${entry.category}`);
      console.log(` Turns: ${entry.turns.length}`);
      console.log();
    });
    return;
  }

  const outcomes: { test: MultiTurnTest; result: TestResult }[] = [];

  // Tests run one at a time so each status line follows its own header.
  for (const entry of selected) {
    const totalTurns = entry.turns.length;
    console.log(`Testing: ${entry.name}`);
    console.log(` Model: ${entry.model}`);
    console.log(` Turns: ${totalTurns}`);
    process.stdout.write(" Status: ");

    const result = await runMultiTurnTest(entry);
    outcomes.push({ test: entry, result });

    if (!result.success) {
      console.log(`❌ FAIL`);
      console.log(` Error: ${result.error}`);
      console.log(` Completed: ${result.turnsCompleted}/${totalTurns} turns`);
    } else {
      const seconds = (result.duration / 1000).toFixed(1);
      console.log(`✅ PASS (${result.turnsCompleted}/${totalTurns} turns, ${seconds}s)`);
    }
    console.log();
  }

  const failedOutcomes = outcomes.filter((entry) => !entry.result.success);
  const passedCount = outcomes.length - failedOutcomes.length;

  console.log(divider);
  console.log(`\nSummary: ${passedCount} passed, ${failedOutcomes.length} failed\n`);

  if (failedOutcomes.length === 0) {
    return;
  }
  console.log("Failed tests:");
  for (const { test, result } of failedOutcomes) {
    console.log(`${test.name}: ${result.error}`);
  }
  process.exit(1);
}
// Script entry point: run main() and, if it rejects, log the error and exit with status 1.
main().catch((err) => {
console.error("Fatal error:", err);
process.exit(1);
});