// Mirror of https://github.com/moltbot/moltbot.git
// Synced 2026-05-13 15:47:28 +00:00
// 194 lines, 5.5 KiB, TypeScript
import { assertOkOrThrowProviderError } from "openclaw/plugin-sdk/provider-http";
|
|
import {
|
|
normalizeApplyTextNormalization,
|
|
normalizeLanguageCode,
|
|
normalizeSeed,
|
|
requireInRange,
|
|
} from "openclaw/plugin-sdk/speech";
|
|
import {
|
|
fetchWithSsrFGuard,
|
|
ssrfPolicyFromHttpBaseUrlAllowedHostname,
|
|
} from "openclaw/plugin-sdk/ssrf-runtime";
|
|
import { isValidElevenLabsVoiceId, normalizeElevenLabsBaseUrl } from "./shared.js";
|
|
|
|
/**
 * Checks the numeric ElevenLabs voice settings against their accepted ranges.
 *
 * Each numeric field is handed to `requireInRange` with its bounds and a label:
 * `stability`, `similarityBoost`, and `style` use 0..1; `speed` uses 0.5..2.
 * `useSpeakerBoost` is a boolean and is not checked here.
 *
 * NOTE(review): `requireInRange` is defined in the speech plugin SDK; it
 * presumably throws when a value falls outside the bounds — confirm there.
 *
 * @param settings - Voice settings to validate.
 */
function assertElevenLabsVoiceSettings(settings: {
  stability: number;
  similarityBoost: number;
  style: number;
  useSpeakerBoost: boolean;
  speed: number;
}): void {
  // Checks run in a fixed order, so the first offending field is the one reported.
  requireInRange(settings.stability, 0, 1, "stability");
  requireInRange(settings.similarityBoost, 0, 1, "similarityBoost");
  requireInRange(settings.style, 0, 1, "style");
  requireInRange(settings.speed, 0.5, 2, "speed");
}
|
|
|
|
/**
 * Chooses the `Accept` header value for a requested output format.
 *
 * The format is compared after trimming surrounding whitespace and lower-casing.
 *
 * @param outputFormat - Output format identifier as supplied by the caller.
 * @returns `"audio/mpeg"` when the format is empty (or whitespace only) or
 *   starts with `mp3_`; otherwise `undefined`, in which case callers send no
 *   `Accept` header.
 */
function resolveElevenLabsAcceptHeader(outputFormat: string): string | undefined {
  const normalized = outputFormat.trim().toLowerCase();
  // An empty format is treated the same as an MP3 format.
  if (!normalized || normalized.startsWith("mp3_")) {
    return "audio/mpeg";
  }
  return undefined;
}
|
|
|
|
/** Parameters accepted by the ElevenLabs text-to-speech request helpers in this module. */
type ElevenLabsTtsRequestParams = {
  /** Text to synthesize; sent as `text` in the JSON body. */
  text: string;
  /** API key; sent in the `xi-api-key` request header. */
  apiKey: string;
  /** Base URL of the API; normalized before the request URL is built. */
  baseUrl: string;
  /** Voice identifier; validated, then used as a URL path segment. */
  voiceId: string;
  /** Model identifier; sent as `model_id` in the JSON body. */
  modelId: string;
  /** Output format; when non-empty it is sent as the `output_format` query parameter. */
  outputFormat: string;
  /** Optional seed; normalized before being sent as `seed`. */
  seed?: number;
  /** Optional text-normalization mode; normalized before being sent as `apply_text_normalization`. */
  applyTextNormalization?: "auto" | "on" | "off";
  /** Optional language code; normalized before being sent as `language_code`. */
  languageCode?: string;
  /**
   * Optional latency tier. Finite numbers are truncated to an integer and
   * range-checked against 0..4; the value is sent as the
   * `optimize_streaming_latency` query parameter unless the model is `eleven_v3`.
   */
  latencyTier?: number;
  /** Voice settings; range-checked and sent as `voice_settings` with snake_case keys. */
  voiceSettings: {
    stability: number;
    similarityBoost: number;
    style: number;
    useSpeakerBoost: boolean;
    speed: number;
  };
  /** Timeout forwarded to the guarded fetch call (`timeoutMs`). */
  timeoutMs: number;
};
|
|
|
|
/**
 * Validates the request parameters and builds the pieces shared by the
 * buffered and streaming text-to-speech calls: URL, `Accept` header, JSON body.
 *
 * @param params - Request parameters plus `stream`; when `stream` is true the
 *   URL path gets a `/stream` suffix.
 * @returns The request URL, the normalized base URL (callers in this file use
 *   it to derive their SSRF policy), the optional `Accept` header value, and
 *   the serialized JSON body.
 * @throws Error with message `"Invalid voiceId format"` when
 *   `isValidElevenLabsVoiceId` rejects `voiceId`.
 *
 * NOTE(review): the range and normalization helpers come from the plugin SDK
 * and `./shared.js`; they presumably throw on invalid input — confirm there.
 */
function prepareElevenLabsTtsRequest(params: ElevenLabsTtsRequestParams & { stream: boolean }): {
  url: URL;
  normalizedBaseUrl: string;
  acceptHeader?: string;
  body: string;
} {
  const {
    text,
    baseUrl,
    voiceId,
    modelId,
    outputFormat,
    seed,
    applyTextNormalization,
    languageCode,
    latencyTier,
    voiceSettings,
  } = params;

  // Validate the voice id before it is interpolated into the URL path.
  if (!isValidElevenLabsVoiceId(voiceId)) {
    throw new Error("Invalid voiceId format");
  }
  assertElevenLabsVoiceSettings(voiceSettings);

  const normalizedLanguage = normalizeLanguageCode(languageCode);
  const normalizedNormalization = normalizeApplyTextNormalization(applyTextNormalization);
  const normalizedSeed = normalizeSeed(seed);
  const normalizedBaseUrl = normalizeElevenLabsBaseUrl(baseUrl);

  // Non-numeric or non-finite tiers are ignored; fractional tiers are
  // truncated toward zero before the range check.
  const normalizedLatencyTier =
    typeof latencyTier === "number" && Number.isFinite(latencyTier)
      ? Math.trunc(latencyTier)
      : undefined;
  if (normalizedLatencyTier !== undefined) {
    requireInRange(normalizedLatencyTier, 0, 4, "latencyTier");
  }

  const url = new URL(
    `${normalizedBaseUrl}/v1/text-to-speech/${voiceId}${params.stream ? "/stream" : ""}`,
  );
  if (outputFormat) {
    url.searchParams.set("output_format", outputFormat);
  }

  // The latency hint is skipped for the `eleven_v3` model (compared after
  // trimming and lower-casing) — presumably that model does not accept it.
  const supportsStreamingLatency = modelId.trim().toLowerCase() !== "eleven_v3";
  if (normalizedLatencyTier !== undefined && supportsStreamingLatency) {
    url.searchParams.set("optimize_streaming_latency", normalizedLatencyTier.toString());
  }

  const acceptHeader = resolveElevenLabsAcceptHeader(outputFormat);

  return {
    url,
    normalizedBaseUrl,
    acceptHeader,
    // JSON.stringify omits properties whose value is `undefined`, so optional
    // fields that normalized to `undefined` are left out of the payload.
    body: JSON.stringify({
      text,
      model_id: modelId,
      seed: normalizedSeed,
      apply_text_normalization: normalizedNormalization,
      language_code: normalizedLanguage,
      voice_settings: {
        stability: voiceSettings.stability,
        similarity_boost: voiceSettings.similarityBoost,
        style: voiceSettings.style,
        use_speaker_boost: voiceSettings.useSpeakerBoost,
        speed: voiceSettings.speed,
      },
    }),
  };
}
|
|
|
|
/**
 * Synthesizes speech through the non-streaming ElevenLabs endpoint and returns
 * the whole response body as a `Buffer`.
 *
 * The request is sent as a POST through `fetchWithSsrFGuard`, with an SSRF
 * policy derived from the normalized base URL.
 *
 * @param params - Request parameters; `apiKey` is sent as the `xi-api-key`
 *   header and `timeoutMs` is forwarded to the guarded fetch.
 * @returns The response body bytes.
 * @throws Whatever request preparation, the guarded fetch, or
 *   `assertOkOrThrowProviderError` throws. NOTE(review): the latter presumably
 *   rejects on non-OK responses — confirm in the plugin SDK.
 */
export async function elevenLabsTTS(params: ElevenLabsTtsRequestParams): Promise<Buffer> {
  const { apiKey, timeoutMs } = params;
  const { url, normalizedBaseUrl, acceptHeader, body } = prepareElevenLabsTtsRequest({
    ...params,
    stream: false,
  });

  const { response, release } = await fetchWithSsrFGuard({
    url: url.toString(),
    init: {
      method: "POST",
      headers: {
        "xi-api-key": apiKey,
        "Content-Type": "application/json",
        // Only send `Accept` when a value was resolved for the output format.
        ...(acceptHeader ? { Accept: acceptHeader } : {}),
      },
      body,
    },
    timeoutMs,
    policy: ssrfPolicyFromHttpBaseUrlAllowedHostname(normalizedBaseUrl),
    auditContext: "elevenlabs.tts",
  });

  try {
    await assertOkOrThrowProviderError(response, "ElevenLabs API error");

    return Buffer.from(await response.arrayBuffer());
  } finally {
    // Always release the guarded fetch, on success and on failure alike.
    await release();
  }
}
|
|
|
|
/**
 * Synthesizes speech through the streaming ElevenLabs endpoint and returns the
 * response body stream together with the `release` callback.
 *
 * Ownership of `release` passes to the caller on a successful return: this
 * function does not call it in that case, so the caller must invoke it when
 * done with the stream. If anything fails before the hand-off, `release` is
 * called here before the error propagates.
 *
 * @param params - Request parameters; `apiKey` is sent as the `xi-api-key`
 *   header and `timeoutMs` is forwarded to the guarded fetch.
 * @returns The audio stream (`response.body`) and the `release` callback.
 * @throws Error with message `"ElevenLabs API response missing audio stream"`
 *   when the response has no body, plus whatever request preparation, the
 *   guarded fetch, or `assertOkOrThrowProviderError` throws.
 */
export async function elevenLabsTTSStream(params: ElevenLabsTtsRequestParams): Promise<{
  audioStream: ReadableStream<Uint8Array>;
  release: () => Promise<void>;
}> {
  const { apiKey, timeoutMs } = params;
  const { url, normalizedBaseUrl, acceptHeader, body } = prepareElevenLabsTtsRequest({
    ...params,
    stream: true,
  });

  const { response, release } = await fetchWithSsrFGuard({
    url: url.toString(),
    init: {
      method: "POST",
      headers: {
        "xi-api-key": apiKey,
        "Content-Type": "application/json",
        // Only send `Accept` when a value was resolved for the output format.
        ...(acceptHeader ? { Accept: acceptHeader } : {}),
      },
      body,
    },
    timeoutMs,
    policy: ssrfPolicyFromHttpBaseUrlAllowedHostname(normalizedBaseUrl),
    auditContext: "elevenlabs.tts.stream",
  });

  // Tracks whether `release` has been handed to the caller; until then this
  // function is responsible for calling it.
  let handedOff = false;
  try {
    await assertOkOrThrowProviderError(response, "ElevenLabs API error");
    if (!response.body) {
      throw new Error("ElevenLabs API response missing audio stream");
    }
    handedOff = true;
    return {
      audioStream: response.body,
      release,
    };
  } finally {
    if (!handedOff) {
      await release();
    }
  }
}
|