Files
moltbot/extensions/tts-local-cli/speech-provider.test.ts
Jason Zhou bfd540bcdf [codex] refresh plugin regression fixtures
Summary:
- Refresh plugin regression fixtures and test-support mocks for guarded network resolution, progress streaming windows, staged TTS output, QQBot STT, and CLI runner assertions.
- Resolve current-main conflicts in Discord, Google video, QQBot STT, and CLI runner tests without changing runtime code.

Verification:
- pnpm check:test-types
- pnpm vitest run $(git diff --name-only origin/main...HEAD)
- git diff --check
- GitHub CI passed, including Real behavior proof, auto-response, ClawSweeper dispatch, CodeQL, and full CI checks.

Co-authored-by: Jason Zhou <22532527+JayZeeDesign@users.noreply.github.com>
2026-05-10 23:44:50 -05:00

308 lines
9.4 KiB
TypeScript

import { mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs";
import os from "node:os";
import path from "node:path";
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-contracts";
import type { SpeechProviderConfig, SpeechSynthesisRequest } from "openclaw/plugin-sdk/speech-core";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
type SpeechSynthesisTarget = SpeechSynthesisRequest["target"];
const runFfmpegMock = vi.hoisted(() => vi.fn<(args: string[]) => Promise<string | void>>());
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
runFfmpeg: runFfmpegMock,
}));
import { buildCliSpeechProvider } from "./speech-provider.js";
const TEST_CFG = {} as OpenClawConfig;
function createCliFixture(): { dir: string; script: string } {
const dir = mkdtempSync(path.join(os.tmpdir(), "openclaw-cli-tts-test-"));
const script = path.join(dir, "write-audio.mjs");
writeFileSync(
script,
`
import { writeFileSync } from "node:fs";
const outIndex = process.argv.indexOf("--out");
const outputPath = outIndex >= 0 ? process.argv[outIndex + 1] : "";
const textIndex = process.argv.indexOf("--text");
const textArg = textIndex >= 0 ? process.argv[textIndex + 1] : "";
const stdin = await new Promise((resolve) => {
let data = "";
process.stdin.setEncoding("utf8");
process.stdin.on("data", (chunk) => { data += chunk; });
process.stdin.on("end", () => resolve(data));
});
const payload = Buffer.from(JSON.stringify({ args: process.argv.slice(2), stdin, textArg }));
if (outputPath) {
writeFileSync(outputPath, payload);
} else {
process.stdout.write(payload);
}
`,
);
return { dir, script };
}
function baseProviderConfig(
script: string,
overrides: SpeechProviderConfig = {},
): SpeechProviderConfig {
return {
command: process.execPath,
args: [script],
timeoutMs: 1000,
...overrides,
};
}
async function synthesize(params: {
providerConfig: SpeechProviderConfig;
text?: string;
target?: SpeechSynthesisTarget;
}) {
return await buildCliSpeechProvider().synthesize({
text: params.text ?? "hello world",
cfg: TEST_CFG,
providerConfig: params.providerConfig,
providerOverrides: {},
timeoutMs: 1000,
target: params.target ?? "audio-file",
});
}
function parseAudioPayload(result: { audioBuffer: Buffer }) {
return JSON.parse(result.audioBuffer.toString("utf8")) as {
stdin?: string;
textArg?: string;
};
}
function requireFfmpegArgs(index = 0) {
const args = runFfmpegMock.mock.calls[index]?.[0];
if (!args) {
throw new Error(`runFfmpeg call ${index} missing`);
}
return args;
}
function expectArgsContainSequence(args: string[], sequence: string[]) {
const startIndex = args.findIndex((arg, index) =>
sequence.every((expected, offset) => args[index + offset] === expected),
);
expect(startIndex).toBeGreaterThanOrEqual(0);
}
describe("buildCliSpeechProvider", () => {
beforeEach(() => {
runFfmpegMock.mockImplementation(async (args) => {
const outputPath = args.at(-1);
if (typeof outputPath !== "string") {
throw new Error("missing ffmpeg output path");
}
const stagedTarget = outputPath.endsWith(".part")
? outputPath.slice(0, -".part".length)
: outputPath;
const forcedFormatIndex = args.lastIndexOf("-f");
const forcedFormat =
forcedFormatIndex >= 0 && typeof args[forcedFormatIndex + 1] === "string"
? args[forcedFormatIndex + 1]
: undefined;
const extension =
forcedFormat === "s16le"
? ".pcm"
: forcedFormat
? `.${forcedFormat}`
: path.extname(stagedTarget);
writeFileSync(outputPath, Buffer.from(`converted:${extension}`));
});
});
afterEach(() => {
vi.clearAllMocks();
});
it("prefers canonical provider config over the cli alias", () => {
const provider = buildCliSpeechProvider();
expect(
provider.resolveConfig?.({
cfg: TEST_CFG,
rawConfig: {
providers: {
cli: { command: "alias-command" },
"tts-local-cli": { command: "canonical-command" },
},
},
timeoutMs: 1000,
}),
).toEqual({ command: "canonical-command" });
});
it("passes text through stdin when args omit the text template", async () => {
const fixture = createCliFixture();
try {
const result = await synthesize({
providerConfig: baseProviderConfig(fixture.script, {
args: [fixture.script, "--out", "{{OutputPath}}"],
outputFormat: "mp3",
}),
text: "hello 😀 world",
});
expect(result.outputFormat).toBe("mp3");
expect(result.fileExtension).toBe(".mp3");
expect(result.voiceCompatible).toBe(false);
const audioPayload = parseAudioPayload(result);
expect(audioPayload.stdin).toBe("hello world");
expect(audioPayload.textArg).toBe("");
expect(runFfmpegMock).not.toHaveBeenCalled();
} finally {
rmSync(fixture.dir, { recursive: true, force: true });
}
});
it("uses template args and stdout output when no output file is produced", async () => {
const fixture = createCliFixture();
try {
const result = await synthesize({
providerConfig: baseProviderConfig(fixture.script, {
args: [fixture.script, "--text", "{{Text}}"],
outputFormat: "wav",
}),
text: "spoken words",
});
expect(result.outputFormat).toBe("wav");
expect(result.fileExtension).toBe(".wav");
expect(result.voiceCompatible).toBe(false);
const audioPayload = parseAudioPayload(result);
expect(audioPayload.stdin).toBe("");
expect(audioPayload.textArg).toBe("spoken words");
} finally {
rmSync(fixture.dir, { recursive: true, force: true });
}
});
it("converts non-opus output for voice-note targets", async () => {
const fixture = createCliFixture();
try {
const result = await synthesize({
providerConfig: baseProviderConfig(fixture.script, {
args: [fixture.script, "--out", "{{OutputPath}}"],
outputFormat: "mp3",
}),
target: "voice-note",
});
expect(result).toEqual({
audioBuffer: Buffer.from("converted:.opus"),
outputFormat: "opus",
fileExtension: ".ogg",
voiceCompatible: true,
});
expectArgsContainSequence(requireFfmpegArgs(), ["-c:a", "libopus", "-b:a", "64k"]);
} finally {
rmSync(fixture.dir, { recursive: true, force: true });
}
});
it("converts stdout WAV to the requested audio-file format", async () => {
const fixture = createCliFixture();
try {
const result = await synthesize({
providerConfig: baseProviderConfig(fixture.script, {
args: [fixture.script, "--text", "{{Text}}"],
outputFormat: "mp3",
}),
});
expect(result).toEqual({
audioBuffer: Buffer.from("converted:.mp3"),
outputFormat: "mp3",
fileExtension: ".mp3",
voiceCompatible: false,
});
expectArgsContainSequence(requireFfmpegArgs(), ["-c:a", "libmp3lame", "-b:a", "128k"]);
} finally {
rmSync(fixture.dir, { recursive: true, force: true });
}
});
it("converts CLI output to raw telephony PCM", async () => {
const fixture = createCliFixture();
try {
const result = await buildCliSpeechProvider().synthesizeTelephony?.({
text: "phone reply",
cfg: TEST_CFG,
providerConfig: baseProviderConfig(fixture.script, {
args: [fixture.script, "--out", "{{OutputPath}}"],
outputFormat: "wav",
}),
timeoutMs: 1000,
});
expect(result).toEqual({
audioBuffer: Buffer.from("converted:.pcm"),
outputFormat: "pcm",
sampleRate: 16000,
});
expectArgsContainSequence(requireFfmpegArgs(), ["-ar", "16000", "-ac", "1", "-f", "s16le"]);
} finally {
rmSync(fixture.dir, { recursive: true, force: true });
}
});
it("can synthesize through a real local CLI fixture and ffmpeg", async () => {
if (process.env.OPENCLAW_LIVE_TEST !== "1") {
return;
}
const fixture = createCliFixture();
const rawFfmpeg = await vi.importActual<typeof import("openclaw/plugin-sdk/media-runtime")>(
"openclaw/plugin-sdk/media-runtime",
);
runFfmpegMock.mockImplementation(async (args) => {
await rawFfmpeg.runFfmpeg(args);
});
try {
const wavPath = path.join(fixture.dir, "source.wav");
await rawFfmpeg.runFfmpeg([
"-y",
"-f",
"lavfi",
"-i",
"sine=frequency=660:duration=0.1",
"-c:a",
"pcm_s16le",
wavPath,
]);
writeFileSync(
fixture.script,
`
import { copyFileSync } from "node:fs";
const outIndex = process.argv.indexOf("--out");
copyFileSync(${JSON.stringify(wavPath)}, process.argv[outIndex + 1]);
`,
);
const result = await synthesize({
providerConfig: baseProviderConfig(fixture.script, {
args: [fixture.script, "--out", "{{OutputPath}}"],
outputFormat: "wav",
}),
target: "voice-note",
});
expect(result.outputFormat).toBe("opus");
expect(result.fileExtension).toBe(".ogg");
expect(result.voiceCompatible).toBe(true);
expect(result.audioBuffer.byteLength).toBeGreaterThan(0);
expect(readFileSync(wavPath).byteLength).toBeGreaterThan(0);
} finally {
rmSync(fixture.dir, { recursive: true, force: true });
}
});
});