mirror of
https://github.com/moltbot/moltbot.git
synced 2026-05-13 15:47:28 +00:00
feat(media): add voice conversion and speech plugins
This commit is contained in:
11
extensions/tts-local-cli/index.ts
Normal file
11
extensions/tts-local-cli/index.ts
Normal file
@@ -0,0 +1,11 @@
|
||||
import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
import { buildCliSpeechProvider } from "./speech-provider.js";

// Plugin entry point: registers the bundled local-CLI speech (TTS) provider
// with the host when the plugin is loaded.
export default definePluginEntry({
  id: "tts-local-cli",
  name: "Local CLI TTS",
  description: "Bundled CLI speech provider for local TTS",
  register(api) {
    // Expose the CLI-backed provider to the host's speech subsystem.
    api.registerSpeechProvider(buildCliSpeechProvider());
  },
});
|
||||
12
extensions/tts-local-cli/openclaw.plugin.json
Normal file
12
extensions/tts-local-cli/openclaw.plugin.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"id": "tts-local-cli",
|
||||
"enabledByDefault": true,
|
||||
"contracts": {
|
||||
"speechProviders": ["tts-local-cli", "cli"]
|
||||
},
|
||||
"configSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
15
extensions/tts-local-cli/package.json
Normal file
15
extensions/tts-local-cli/package.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"name": "@openclaw/tts-local-cli",
|
||||
"version": "2026.4.25",
|
||||
"private": true,
|
||||
"description": "OpenClaw local CLI TTS plugin",
|
||||
"type": "module",
|
||||
"devDependencies": {
|
||||
"@openclaw/plugin-sdk": "workspace:*"
|
||||
},
|
||||
"openclaw": {
|
||||
"extensions": [
|
||||
"./index.ts"
|
||||
]
|
||||
}
|
||||
}
|
||||
283
extensions/tts-local-cli/speech-provider.test.ts
Normal file
283
extensions/tts-local-cli/speech-provider.test.ts
Normal file
@@ -0,0 +1,283 @@
|
||||
import { mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs";
import os from "node:os";
import path from "node:path";
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
import type { SpeechProviderConfig, SpeechSynthesisRequest } from "openclaw/plugin-sdk/speech-core";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";

type SpeechSynthesisTarget = SpeechSynthesisRequest["target"];

// Hoisted so the vi.mock factory below may reference it before module evaluation.
const runFfmpegMock = vi.hoisted(() => vi.fn<(args: string[]) => Promise<string | void>>());

// Replace the real ffmpeg runner so tests control (and observe) conversions.
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
  runFfmpeg: runFfmpegMock,
}));

// Imported after vi.mock so the provider module picks up the mocked runtime.
import { buildCliSpeechProvider } from "./speech-provider.js";

// Minimal stand-in config; the CLI provider does not read fields off req.cfg.
const TEST_CFG = {} as OpenClawConfig;
|
||||
|
||||
/**
 * Create a temp directory containing a small Node script that acts as a fake
 * TTS CLI. The script records its argv, any stdin it received, and the value
 * of a `--text` argument as a JSON payload, written either to the `--out`
 * path (when given) or to stdout — letting tests inspect exactly what the
 * provider passed to the CLI.
 */
function createCliFixture(): { dir: string; script: string } {
  const dir = mkdtempSync(path.join(os.tmpdir(), "openclaw-cli-tts-test-"));
  const script = path.join(dir, "write-audio.mjs");
  writeFileSync(
    script,
    `
import { writeFileSync } from "node:fs";

const outIndex = process.argv.indexOf("--out");
const outputPath = outIndex >= 0 ? process.argv[outIndex + 1] : "";
const textIndex = process.argv.indexOf("--text");
const textArg = textIndex >= 0 ? process.argv[textIndex + 1] : "";
const stdin = await new Promise((resolve) => {
  let data = "";
  process.stdin.setEncoding("utf8");
  process.stdin.on("data", (chunk) => { data += chunk; });
  process.stdin.on("end", () => resolve(data));
});
const payload = Buffer.from(JSON.stringify({ args: process.argv.slice(2), stdin, textArg }));
if (outputPath) {
  writeFileSync(outputPath, payload);
} else {
  process.stdout.write(payload);
}
`,
  );
  return { dir, script };
}
|
||||
|
||||
function baseProviderConfig(
|
||||
script: string,
|
||||
overrides: SpeechProviderConfig = {},
|
||||
): SpeechProviderConfig {
|
||||
return {
|
||||
command: process.execPath,
|
||||
args: [script],
|
||||
timeoutMs: 1000,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
async function synthesize(params: {
|
||||
providerConfig: SpeechProviderConfig;
|
||||
text?: string;
|
||||
target?: SpeechSynthesisTarget;
|
||||
}) {
|
||||
return await buildCliSpeechProvider().synthesize({
|
||||
text: params.text ?? "hello world",
|
||||
cfg: TEST_CFG,
|
||||
providerConfig: params.providerConfig,
|
||||
providerOverrides: {},
|
||||
timeoutMs: 1000,
|
||||
target: params.target ?? "audio-file",
|
||||
});
|
||||
}
|
||||
|
||||
describe("buildCliSpeechProvider", () => {
  beforeEach(() => {
    // Default ffmpeg mock: write a marker payload ("converted:<ext>") to the
    // requested output path so tests can tell which conversion was asked for.
    runFfmpegMock.mockImplementation(async (args) => {
      const outputPath = args.at(-1);
      if (typeof outputPath !== "string") {
        throw new Error("missing ffmpeg output path");
      }
      writeFileSync(outputPath, Buffer.from(`converted:${path.extname(outputPath)}`));
    });
  });

  afterEach(() => {
    vi.clearAllMocks();
  });

  it("prefers canonical provider config over the cli alias", () => {
    const provider = buildCliSpeechProvider();

    // Both the canonical id and the "cli" alias are configured; the canonical
    // entry must win.
    expect(
      provider.resolveConfig?.({
        cfg: TEST_CFG,
        rawConfig: {
          providers: {
            cli: { command: "alias-command" },
            "tts-local-cli": { command: "canonical-command" },
          },
        },
        timeoutMs: 1000,
      }),
    ).toEqual({ command: "canonical-command" });
  });

  it("passes text through stdin when args omit the text template", async () => {
    const fixture = createCliFixture();
    try {
      const result = await synthesize({
        providerConfig: baseProviderConfig(fixture.script, {
          args: [fixture.script, "--out", "{{OutputPath}}"],
          outputFormat: "mp3",
        }),
        // Emoji should be stripped before the text reaches the CLI.
        text: "hello 😀 world",
      });

      expect(result).toMatchObject({
        outputFormat: "mp3",
        fileExtension: ".mp3",
        voiceCompatible: false,
      });
      // The fixture echoes what it saw: text arrived via stdin, not argv.
      expect(JSON.parse(result.audioBuffer.toString("utf8"))).toMatchObject({
        stdin: "hello world",
        textArg: "",
      });
      // Output already matched the desired format, so no conversion ran.
      expect(runFfmpegMock).not.toHaveBeenCalled();
    } finally {
      rmSync(fixture.dir, { recursive: true, force: true });
    }
  });

  it("uses template args and stdout output when no output file is produced", async () => {
    const fixture = createCliFixture();
    try {
      const result = await synthesize({
        providerConfig: baseProviderConfig(fixture.script, {
          args: [fixture.script, "--text", "{{Text}}"],
          outputFormat: "wav",
        }),
        text: "spoken words",
      });

      // stdout output is assumed to be WAV, which matches the desired format.
      expect(result).toMatchObject({
        outputFormat: "wav",
        fileExtension: ".wav",
        voiceCompatible: false,
      });
      // Text traveled via the {{Text}} argv template, not stdin.
      expect(JSON.parse(result.audioBuffer.toString("utf8"))).toMatchObject({
        stdin: "",
        textArg: "spoken words",
      });
    } finally {
      rmSync(fixture.dir, { recursive: true, force: true });
    }
  });

  it("converts non-opus output for voice-note targets", async () => {
    const fixture = createCliFixture();
    try {
      const result = await synthesize({
        providerConfig: baseProviderConfig(fixture.script, {
          args: [fixture.script, "--out", "{{OutputPath}}"],
          outputFormat: "mp3",
        }),
        target: "voice-note",
      });

      // Voice notes always come back as opus in a .ogg container name.
      expect(result).toEqual({
        audioBuffer: Buffer.from("converted:.opus"),
        outputFormat: "opus",
        fileExtension: ".ogg",
        voiceCompatible: true,
      });
      expect(runFfmpegMock).toHaveBeenCalledWith(
        expect.arrayContaining(["-c:a", "libopus", "-b:a", "64k"]),
      );
    } finally {
      rmSync(fixture.dir, { recursive: true, force: true });
    }
  });

  it("converts stdout WAV to the requested audio-file format", async () => {
    const fixture = createCliFixture();
    try {
      const result = await synthesize({
        providerConfig: baseProviderConfig(fixture.script, {
          args: [fixture.script, "--text", "{{Text}}"],
          outputFormat: "mp3",
        }),
      });

      // stdout is assumed WAV; desired format is mp3, so ffmpeg must run.
      expect(result).toEqual({
        audioBuffer: Buffer.from("converted:.mp3"),
        outputFormat: "mp3",
        fileExtension: ".mp3",
        voiceCompatible: false,
      });
      expect(runFfmpegMock).toHaveBeenCalledWith(
        expect.arrayContaining(["-c:a", "libmp3lame", "-b:a", "128k"]),
      );
    } finally {
      rmSync(fixture.dir, { recursive: true, force: true });
    }
  });

  it("converts CLI output to raw telephony PCM", async () => {
    const fixture = createCliFixture();
    try {
      const result = await buildCliSpeechProvider().synthesizeTelephony?.({
        text: "phone reply",
        cfg: TEST_CFG,
        providerConfig: baseProviderConfig(fixture.script, {
          args: [fixture.script, "--out", "{{OutputPath}}"],
          outputFormat: "wav",
        }),
        timeoutMs: 1000,
      });

      // Telephony output is headerless 16 kHz mono PCM.
      expect(result).toEqual({
        audioBuffer: Buffer.from("converted:.pcm"),
        outputFormat: "pcm",
        sampleRate: 16000,
      });
      expect(runFfmpegMock).toHaveBeenCalledWith(
        expect.arrayContaining(["-ar", "16000", "-ac", "1", "-f", "s16le"]),
      );
    } finally {
      rmSync(fixture.dir, { recursive: true, force: true });
    }
  });

  it("can synthesize through a real local CLI fixture and ffmpeg", async () => {
    // Opt-in live test: exercises the real ffmpeg binary end-to-end.
    if (process.env.OPENCLAW_LIVE_TEST !== "1") {
      return;
    }
    const fixture = createCliFixture();
    const rawFfmpeg = await vi.importActual<typeof import("openclaw/plugin-sdk/media-runtime")>(
      "openclaw/plugin-sdk/media-runtime",
    );
    // Route the mock through the real ffmpeg runner for this test only.
    runFfmpegMock.mockImplementation(async (args) => {
      await rawFfmpeg.runFfmpeg(args);
    });
    try {
      // Generate a tiny real WAV the fake CLI will "produce".
      const wavPath = path.join(fixture.dir, "source.wav");
      await rawFfmpeg.runFfmpeg([
        "-y",
        "-f",
        "lavfi",
        "-i",
        "sine=frequency=660:duration=0.1",
        "-c:a",
        "pcm_s16le",
        wavPath,
      ]);
      writeFileSync(
        fixture.script,
        `
import { copyFileSync } from "node:fs";
const outIndex = process.argv.indexOf("--out");
copyFileSync(${JSON.stringify(wavPath)}, process.argv[outIndex + 1]);
`,
      );

      const result = await synthesize({
        providerConfig: baseProviderConfig(fixture.script, {
          args: [fixture.script, "--out", "{{OutputPath}}"],
          outputFormat: "wav",
        }),
        target: "voice-note",
      });

      expect(result.outputFormat).toBe("opus");
      expect(result.fileExtension).toBe(".ogg");
      expect(result.voiceCompatible).toBe(true);
      expect(result.audioBuffer.byteLength).toBeGreaterThan(0);
      expect(readFileSync(wavPath).byteLength).toBeGreaterThan(0);
    } finally {
      rmSync(fixture.dir, { recursive: true, force: true });
    }
  });
});
|
||||
436
extensions/tts-local-cli/speech-provider.ts
Normal file
436
extensions/tts-local-cli/speech-provider.ts
Normal file
@@ -0,0 +1,436 @@
|
||||
import { spawn } from "node:child_process";
import { existsSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
import path from "node:path";
import { runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
import { createSubsystemLogger } from "openclaw/plugin-sdk/runtime-env";
import type {
  SpeechProviderConfig,
  SpeechProviderPlugin,
  SpeechSynthesisRequest,
  SpeechTelephonySynthesisRequest,
} from "openclaw/plugin-sdk/speech-core";
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";

// Subsystem-scoped logger for this plugin.
const log = createSubsystemLogger("tts-local-cli");

// Formats the provider can return; anything else is converted via ffmpeg.
const VALID_OUTPUT_FORMATS = ["mp3", "opus", "wav"] as const;
// Extensions treated as CLI audio output when scanning the temp directory.
const AUDIO_EXTENSIONS = new Set([".wav", ".mp3", ".opus", ".ogg", ".m4a"]);
type OutputFormat = (typeof VALID_OUTPUT_FORMATS)[number];

// Validated shape of the user-supplied CLI provider configuration.
type CliConfig = {
  command: string; // executable; may embed quoted leading args
  args?: string[]; // extra argv; supports {{Text}}/{{OutputPath}}/... templates
  outputFormat?: OutputFormat;
  timeoutMs?: number; // per-invocation kill timeout
  cwd?: string;
  env?: Record<string, string>; // merged over process.env
};

// Timeout used when the config does not provide one.
const DEFAULT_TIMEOUT_MS = 120_000;
|
||||
|
||||
function asObject(value: unknown): Record<string, unknown> | undefined {
|
||||
return typeof value === "object" && value !== null && !Array.isArray(value)
|
||||
? (value as Record<string, unknown>)
|
||||
: undefined;
|
||||
}
|
||||
|
||||
function asStringArray(value: unknown): string[] | undefined {
|
||||
return Array.isArray(value) && value.every((v) => typeof v === "string") ? value : undefined;
|
||||
}
|
||||
|
||||
function asRecord(value: unknown): Record<string, string> | undefined {
|
||||
const obj = asObject(value);
|
||||
if (!obj) {
|
||||
return undefined;
|
||||
}
|
||||
const result: Record<string, string> = {};
|
||||
for (const [k, v] of Object.entries(obj)) {
|
||||
if (typeof v === "string") {
|
||||
result[k] = v;
|
||||
}
|
||||
}
|
||||
return Object.keys(result).length > 0 ? result : undefined;
|
||||
}
|
||||
|
||||
function normalizeOutputFormat(value: unknown): OutputFormat {
|
||||
if (typeof value !== "string") {
|
||||
return "mp3";
|
||||
}
|
||||
const lower = value.toLowerCase().trim();
|
||||
if (VALID_OUTPUT_FORMATS.includes(lower as OutputFormat)) {
|
||||
return lower as OutputFormat;
|
||||
}
|
||||
return "mp3";
|
||||
}
|
||||
|
||||
function resolveCliProviderConfig(rawConfig: Record<string, unknown>): SpeechProviderConfig {
|
||||
const providers = asObject(rawConfig.providers);
|
||||
return asObject(providers?.["tts-local-cli"]) ?? asObject(providers?.cli) ?? {};
|
||||
}
|
||||
|
||||
function getConfig(cfg: SpeechProviderConfig): CliConfig | null {
|
||||
const command = typeof cfg.command === "string" ? cfg.command.trim() : "";
|
||||
if (!command) {
|
||||
return null;
|
||||
}
|
||||
return {
|
||||
command,
|
||||
args: asStringArray(cfg.args),
|
||||
outputFormat: normalizeOutputFormat(cfg.outputFormat),
|
||||
timeoutMs: typeof cfg.timeoutMs === "number" ? cfg.timeoutMs : DEFAULT_TIMEOUT_MS,
|
||||
cwd: typeof cfg.cwd === "string" ? cfg.cwd : undefined,
|
||||
env: asRecord(cfg.env),
|
||||
};
|
||||
}
|
||||
|
||||
function stripEmojis(text: string): string {
|
||||
return text
|
||||
.replace(/[\p{Emoji_Presentation}\p{Extended_Pictographic}]/gu, " ")
|
||||
.replace(/\s+/g, " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
function applyTemplate(str: string, ctx: Record<string, string | undefined>): string {
|
||||
return str.replace(/{{\s*(\w+)\s*}}/gi, (_, key) => {
|
||||
const normalizedKey = key.charAt(0).toUpperCase() + key.slice(1).toLowerCase();
|
||||
return ctx[normalizedKey] ?? ctx[key] ?? "";
|
||||
});
|
||||
}
|
||||
|
||||
function parseCommand(cmdStr: string): { cmd: string; initialArgs: string[] } {
|
||||
const parts: string[] = [];
|
||||
let current = "";
|
||||
let inQuote = false;
|
||||
let quoteChar = "";
|
||||
|
||||
for (const char of cmdStr.trim()) {
|
||||
if (inQuote) {
|
||||
if (char === quoteChar) {
|
||||
inQuote = false;
|
||||
} else {
|
||||
current += char;
|
||||
}
|
||||
} else if (char === '"' || char === "'") {
|
||||
inQuote = true;
|
||||
quoteChar = char;
|
||||
} else if (char === " " || char === "\t") {
|
||||
if (current) {
|
||||
parts.push(current);
|
||||
current = "";
|
||||
}
|
||||
} else {
|
||||
current += char;
|
||||
}
|
||||
}
|
||||
if (current) {
|
||||
parts.push(current);
|
||||
}
|
||||
return { cmd: parts[0] || "", initialArgs: parts.slice(1) };
|
||||
}
|
||||
|
||||
function findAudioFile(dir: string, baseName: string): string | null {
|
||||
const files = readdirSync(dir);
|
||||
for (const file of files) {
|
||||
const ext = path.extname(file).toLowerCase();
|
||||
if (AUDIO_EXTENSIONS.has(ext) && (file.startsWith(baseName) || file.includes(baseName))) {
|
||||
return path.join(dir, file);
|
||||
}
|
||||
}
|
||||
for (const file of files) {
|
||||
const ext = path.extname(file).toLowerCase();
|
||||
if (AUDIO_EXTENSIONS.has(ext)) {
|
||||
return path.join(dir, file);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
function detectFormat(filePath: string): "mp3" | "opus" | "wav" | null {
|
||||
const ext = path.extname(filePath).toLowerCase();
|
||||
if (ext === ".opus" || ext === ".ogg") {
|
||||
return "opus";
|
||||
}
|
||||
if (ext === ".wav") {
|
||||
return "wav";
|
||||
}
|
||||
if (ext === ".mp3" || ext === ".m4a") {
|
||||
return "mp3";
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
function getFileExt(format: string): string {
|
||||
if (format === "opus") {
|
||||
return ".opus";
|
||||
}
|
||||
if (format === "wav") {
|
||||
return ".wav";
|
||||
}
|
||||
return ".mp3";
|
||||
}
|
||||
|
||||
/**
 * Spawn the configured TTS CLI once and collect its audio output.
 *
 * Text is emoji-stripped first. Template placeholders ({{Text}},
 * {{OutputPath}}, {{OutputDir}}, {{OutputBase}}) are expanded into the
 * argument list; when no argument references {{Text}}, the text is piped to
 * the child's stdin instead.
 *
 * Output discovery order:
 *   1. an audio file the CLI wrote into `outputDir` (format from extension);
 *   2. otherwise the child's stdout bytes (assumed WAV);
 *   3. otherwise the promise rejects.
 *
 * Rejects on spawn failure, non-zero exit, timeout (SIGTERM, escalating to
 * SIGKILL after 5s), unknown output format, or empty output.
 */
async function runCli(params: {
  command: string;
  args: string[];
  cwd?: string;
  env?: Record<string, string>;
  timeoutMs: number;
  text: string;
  outputDir: string;
  filePrefix: string;
  outputFormat?: OutputFormat;
}): Promise<{ buffer: Buffer; actualFormat: "mp3" | "opus" | "wav"; audioPath?: string }> {
  const cleanText = stripEmojis(params.text);
  if (!cleanText) {
    throw new Error("CLI TTS: text is empty after removing emojis");
  }

  // Template context; OutputPath points into our temp dir so the output scan
  // below can find what the CLI wrote.
  const outputExt = getFileExt(params.outputFormat ?? "wav");
  const ctx: Record<string, string | undefined> = {
    Text: cleanText,
    OutputPath: path.join(params.outputDir, `${params.filePrefix}${outputExt}`),
    OutputDir: params.outputDir,
    OutputBase: params.filePrefix,
  };

  const { cmd, initialArgs } = parseCommand(params.command);
  if (!cmd) {
    throw new Error("CLI TTS: invalid command");
  }

  // Args embedded in `command` come first, then configured args; all templated.
  const baseArgs = [...initialArgs, ...params.args];
  const args = baseArgs.map((a) => applyTemplate(a, ctx));

  return new Promise((resolve, reject) => {
    let timedOut = false;
    // NOTE: the callback closes over `proc`, declared below — safe because
    // the timer can only fire after this synchronous block has completed.
    const timer = setTimeout(() => {
      timedOut = true;
      proc.kill();
      // Escalate to SIGKILL if child ignores SIGTERM
      setTimeout(() => proc.kill("SIGKILL"), 5000).unref();
    }, params.timeoutMs);

    // Custom env entries are merged over (and may override) the parent env.
    const env = params.env ? { ...process.env, ...params.env } : process.env;
    const proc = spawn(cmd, args, { cwd: params.cwd, env, stdio: ["pipe", "pipe", "pipe"] });

    const stdoutChunks: Buffer[] = [];
    const stderrChunks: Buffer[] = [];
    proc.stdout.on("data", (c) => stdoutChunks.push(c));
    proc.stderr.on("data", (c) => stderrChunks.push(c));

    proc.on("error", (e) => {
      clearTimeout(timer);
      reject(new Error(`CLI TTS failed: ${e.message}`));
    });

    proc.on("close", (code) => {
      clearTimeout(timer);
      if (timedOut) {
        return reject(new Error(`CLI TTS timed out after ${params.timeoutMs}ms`));
      }
      if (code !== 0) {
        const stderr = Buffer.concat(stderrChunks).toString("utf8");
        return reject(new Error(`CLI TTS exit ${code}: ${stderr}`));
      }

      // Prefer a file on disk; the CLI may have renamed it, so scan the dir.
      const audioFile = findAudioFile(params.outputDir, params.filePrefix);
      if (audioFile) {
        if (!existsSync(audioFile)) {
          return reject(new Error(`CLI TTS: output file not found at ${audioFile}`));
        }
        const format = detectFormat(audioFile);
        if (!format) {
          return reject(new Error(`CLI TTS: unknown format for ${audioFile}`));
        }
        return resolve({
          buffer: readFileSync(audioFile),
          actualFormat: format,
          audioPath: audioFile,
        });
      }

      const stdout = Buffer.concat(stdoutChunks);
      if (stdout.length > 0) {
        // Assume WAV for stdout output; could be MP3 but caller should convert if needed
        return resolve({ buffer: stdout, actualFormat: "wav" });
      }
      reject(new Error("CLI TTS produced no output"));
    });

    proc.stdin?.on("error", () => {}); // suppress EPIPE if child ignores stdin
    // Pipe the text via stdin only when no argv template consumed it.
    if (!baseArgs.some((a) => /{{\s*text\s*}}/i.test(a))) {
      proc.stdin?.write(cleanText);
    }
    proc.stdin?.end();
  });
}
|
||||
|
||||
async function convertAudio(
|
||||
inputPath: string,
|
||||
outputDir: string,
|
||||
target: OutputFormat,
|
||||
): Promise<Buffer> {
|
||||
const outputPath = path.join(outputDir, `converted${getFileExt(target)}`);
|
||||
const args = ["-y", "-i", inputPath];
|
||||
if (target === "opus") {
|
||||
args.push("-c:a", "libopus", "-b:a", "64k", outputPath);
|
||||
} else if (target === "wav") {
|
||||
args.push("-c:a", "pcm_s16le", outputPath);
|
||||
} else {
|
||||
args.push("-c:a", "libmp3lame", "-b:a", "128k", outputPath);
|
||||
}
|
||||
await runFfmpeg(args);
|
||||
return readFileSync(outputPath);
|
||||
}
|
||||
|
||||
async function convertToRawPcm(inputPath: string, outputDir: string): Promise<Buffer> {
|
||||
// Output raw 16kHz mono 16-bit little-endian PCM (no WAV headers)
|
||||
const outputPath = path.join(outputDir, "telephony.pcm");
|
||||
await runFfmpeg([
|
||||
"-y",
|
||||
"-i",
|
||||
inputPath,
|
||||
"-c:a",
|
||||
"pcm_s16le",
|
||||
"-ar",
|
||||
"16000",
|
||||
"-ac",
|
||||
"1",
|
||||
"-f",
|
||||
"s16le",
|
||||
outputPath,
|
||||
]);
|
||||
return readFileSync(outputPath);
|
||||
}
|
||||
|
||||
export function buildCliSpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
id: "tts-local-cli",
|
||||
aliases: ["cli"],
|
||||
label: "Local CLI",
|
||||
autoSelectOrder: 1000,
|
||||
|
||||
resolveConfig(ctx): SpeechProviderConfig {
|
||||
return resolveCliProviderConfig(ctx.rawConfig);
|
||||
},
|
||||
|
||||
isConfigured(ctx): boolean {
|
||||
return getConfig(ctx.providerConfig) !== null;
|
||||
},
|
||||
|
||||
async synthesize(req: SpeechSynthesisRequest) {
|
||||
const config = getConfig(req.providerConfig);
|
||||
if (!config) {
|
||||
throw new Error("CLI TTS not configured");
|
||||
}
|
||||
|
||||
log.debug(`synthesize: text=${req.text.slice(0, 50)}...`);
|
||||
|
||||
const tempDir = mkdtempSync(path.join(resolvePreferredOpenClawTmpDir(), "openclaw-cli-tts-"));
|
||||
|
||||
try {
|
||||
const result = await runCli({
|
||||
command: config.command,
|
||||
args: config.args ?? [],
|
||||
cwd: config.cwd,
|
||||
env: config.env,
|
||||
timeoutMs: config.timeoutMs ?? DEFAULT_TIMEOUT_MS,
|
||||
text: req.text,
|
||||
outputDir: tempDir,
|
||||
filePrefix: "speech",
|
||||
outputFormat: config.outputFormat,
|
||||
});
|
||||
|
||||
log.debug(`synthesize: format=${result.actualFormat}, size=${result.buffer.length}`);
|
||||
|
||||
let buffer: Buffer;
|
||||
let format: OutputFormat;
|
||||
|
||||
if (req.target === "voice-note") {
|
||||
if (result.actualFormat !== "opus") {
|
||||
const inputFile =
|
||||
result.audioPath ?? path.join(tempDir, `input${getFileExt(result.actualFormat)}`);
|
||||
if (!result.audioPath) {
|
||||
writeFileSync(inputFile, result.buffer);
|
||||
}
|
||||
buffer = await convertAudio(inputFile, tempDir, "opus");
|
||||
format = "opus";
|
||||
} else {
|
||||
buffer = result.buffer;
|
||||
format = "opus";
|
||||
}
|
||||
} else {
|
||||
const desired = config.outputFormat ?? "mp3";
|
||||
if (result.actualFormat !== desired) {
|
||||
const inputFile =
|
||||
result.audioPath ?? path.join(tempDir, `input${getFileExt(result.actualFormat)}`);
|
||||
if (!result.audioPath) {
|
||||
writeFileSync(inputFile, result.buffer);
|
||||
}
|
||||
buffer = await convertAudio(inputFile, tempDir, desired);
|
||||
format = desired;
|
||||
} else {
|
||||
buffer = result.buffer;
|
||||
format = result.actualFormat;
|
||||
}
|
||||
}
|
||||
|
||||
const fileExtension = format === "opus" ? ".ogg" : `.${format}`;
|
||||
return {
|
||||
audioBuffer: buffer,
|
||||
outputFormat: format,
|
||||
fileExtension,
|
||||
voiceCompatible: req.target === "voice-note" && format === "opus",
|
||||
};
|
||||
} finally {
|
||||
try {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
} catch {}
|
||||
}
|
||||
},
|
||||
|
||||
async synthesizeTelephony(req: SpeechTelephonySynthesisRequest) {
|
||||
const config = getConfig(req.providerConfig);
|
||||
if (!config) {
|
||||
throw new Error("CLI TTS not configured");
|
||||
}
|
||||
|
||||
log.debug(`synthesizeTelephony: text=${req.text.slice(0, 50)}...`);
|
||||
|
||||
const tempDir = mkdtempSync(path.join(resolvePreferredOpenClawTmpDir(), "openclaw-cli-tts-"));
|
||||
|
||||
try {
|
||||
const result = await runCli({
|
||||
command: config.command,
|
||||
args: config.args ?? [],
|
||||
cwd: config.cwd,
|
||||
env: config.env,
|
||||
timeoutMs: config.timeoutMs ?? DEFAULT_TIMEOUT_MS,
|
||||
text: req.text,
|
||||
outputDir: tempDir,
|
||||
filePrefix: "telephony",
|
||||
outputFormat: config.outputFormat,
|
||||
});
|
||||
|
||||
const inputFile =
|
||||
result.audioPath ?? path.join(tempDir, `input${getFileExt(result.actualFormat)}`);
|
||||
if (!result.audioPath) {
|
||||
writeFileSync(inputFile, result.buffer);
|
||||
}
|
||||
|
||||
// Convert to raw 16kHz mono PCM for telephony (no WAV headers)
|
||||
const pcmBuffer = await convertToRawPcm(inputFile, tempDir);
|
||||
|
||||
return {
|
||||
audioBuffer: pcmBuffer,
|
||||
outputFormat: "pcm",
|
||||
sampleRate: 16000,
|
||||
};
|
||||
} finally {
|
||||
try {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
} catch {}
|
||||
}
|
||||
},
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user