mirror of
https://github.com/moltbot/moltbot.git
synced 2026-05-13 15:47:28 +00:00
Enable realtime Talk instructions config
This commit is contained in:
committed by
Peter Steinberger
parent
1463974c46
commit
c5cc1bb184
@@ -1,4 +1,4 @@
|
||||
6a0d49baf0a07563146d9c3fa7650175f4b3e5bf8ed67a8f8a1930fd8dc18f3b config-baseline.json
|
||||
b58f16663413d37906072f3ed7fa7c0ed0bac6347ab76627180ea4f21331b9c5 config-baseline.core.json
|
||||
7317136882cafd0cfd23146218b7bec4a65a2c08328f7a1f8e22ac04789ec35d config-baseline.json
|
||||
8e0f397eaeaa858f016a080e2585c69fa6744cdbb0e7f68d71cfc693971dbc8c config-baseline.core.json
|
||||
25c6e70d5b4925e07549072159ce4fcad45813fed12fa36a2f43d3568ca8dd96 config-baseline.channel.json
|
||||
af8a8e8616a0146ad989ff1bc0e8cf62c61a4d434dd67bbe7fe082c5c204fada config-baseline.plugin.json
|
||||
|
||||
@@ -1396,6 +1396,7 @@ Defaults for Talk mode (macOS/iOS/Android).
|
||||
voice: "cedar",
|
||||
},
|
||||
},
|
||||
instructions: "Speak warmly and keep answers brief.",
|
||||
mode: "realtime",
|
||||
transport: "webrtc",
|
||||
brain: "agent-consult",
|
||||
@@ -1416,6 +1417,7 @@ Defaults for Talk mode (macOS/iOS/Android).
|
||||
- `consultFastMode` sets a one-shot fast-mode override for Control UI Talk realtime consults without changing the session's normal fast-mode setting.
|
||||
- `speechLocale` sets the BCP 47 locale id used by iOS/macOS Talk speech recognition. Leave unset to use the device default.
|
||||
- `silenceTimeoutMs` controls how long Talk mode waits after user silence before it sends the transcript. Leave it unset to keep the platform default pause window (700 ms on macOS and Android, 900 ms on iOS).
|
||||
- `realtime.instructions` appends provider-facing system instructions to OpenClaw's built-in realtime prompt, so voice style can be configured without losing the default `openclaw_agent_consult` guidance.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -85,6 +85,7 @@ Supported keys:
|
||||
voice: "cedar",
|
||||
},
|
||||
},
|
||||
instructions: "Speak warmly and keep answers brief.",
|
||||
mode: "realtime",
|
||||
transport: "webrtc",
|
||||
brain: "agent-consult",
|
||||
@@ -108,6 +109,7 @@ Defaults:
|
||||
- `realtime.providers.<provider>` stores provider-owned realtime config. The browser receives only ephemeral or constrained session credentials, never a standard API key.
|
||||
- `realtime.providers.openai.voice`: built-in OpenAI Realtime voice id. Current `gpt-realtime-2` voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`; `marin` and `cedar` are recommended for best quality.
|
||||
- `realtime.brain`: `agent-consult` routes realtime tool calls through Gateway policy; `direct-tools` is owner-only compatibility behavior; `none` is for transcription or external orchestration.
|
||||
- `realtime.instructions`: appends provider-facing system instructions to OpenClaw's built-in realtime prompt. Use it for voice style and tone; OpenClaw keeps the default `openclaw_agent_consult` guidance.
|
||||
- `talk.catalog` exposes each provider's valid modes, transports, brain strategies, realtime audio formats, and capability flags so first-party Talk clients can avoid unsupported combinations.
|
||||
- Streaming transcription providers are discovered through `talk.catalog.transcription`. The current Gateway relay uses the Voice Call streaming provider config until the dedicated Talk transcription config surface is added.
|
||||
- `speechLocale`: optional BCP 47 locale id for on-device Talk speech recognition on iOS/macOS. Leave unset to use the device default.
|
||||
|
||||
@@ -162,6 +162,8 @@ export const FIELD_HELP: Record<string, string> = {
|
||||
"Realtime provider model id override for browser or Gateway-owned Talk sessions.",
|
||||
"talk.realtime.voice":
|
||||
"Realtime provider voice id override for browser or Gateway-owned Talk sessions.",
|
||||
"talk.realtime.instructions":
|
||||
"Additional system instructions appended to OpenClaw's built-in realtime Talk prompt. Use this for voice style, tone, and other provider-facing realtime behavior while keeping agent-consult guidance intact.",
|
||||
"talk.realtime.mode": "Talk execution mode: realtime, stt-tts, or transcription.",
|
||||
"talk.realtime.transport":
|
||||
"Talk byte/session transport: webrtc, provider-websocket, gateway-relay, or managed-room.",
|
||||
|
||||
@@ -893,6 +893,7 @@ export const FIELD_LABELS: Record<string, string> = {
|
||||
"talk.realtime.providers.*.apiKey": "Talk Realtime Provider API Key", // pragma: allowlist secret
|
||||
"talk.realtime.model": "Talk Realtime Model",
|
||||
"talk.realtime.voice": "Talk Realtime Voice",
|
||||
"talk.realtime.instructions": "Talk Realtime Instructions",
|
||||
"talk.realtime.mode": "Talk Realtime Mode",
|
||||
"talk.realtime.transport": "Talk Realtime Transport",
|
||||
"talk.realtime.brain": "Talk Realtime Brain",
|
||||
|
||||
@@ -133,6 +133,26 @@ describe("talk normalization", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("preserves normalized realtime instructions in talk.config payloads", () => {
|
||||
const payload = buildTalkConfigResponse({
|
||||
realtime: {
|
||||
provider: "openai",
|
||||
providers: {
|
||||
openai: {
|
||||
model: "gpt-realtime",
|
||||
voice: "alloy",
|
||||
},
|
||||
},
|
||||
instructions: " Speak with crisp diction. ",
|
||||
},
|
||||
});
|
||||
|
||||
expect(payload?.realtime).toMatchObject({
|
||||
provider: "openai",
|
||||
instructions: "Speak with crisp diction.",
|
||||
});
|
||||
});
|
||||
|
||||
it("does not report an active provider when the configured speech provider cannot resolve", () => {
|
||||
const mismatchPayload = buildTalkConfigResponse({
|
||||
provider: "acme",
|
||||
|
||||
@@ -110,6 +110,10 @@ function normalizeTalkRealtimeConfig(value: unknown): TalkRealtimeConfig | undef
|
||||
if (voice) {
|
||||
normalized.voice = voice;
|
||||
}
|
||||
const instructions = normalizeOptionalString(source.instructions);
|
||||
if (instructions) {
|
||||
normalized.instructions = instructions;
|
||||
}
|
||||
if (source.mode === "realtime" || source.mode === "stt-tts" || source.mode === "transcription") {
|
||||
normalized.mode = source.mode;
|
||||
}
|
||||
|
||||
@@ -54,6 +54,8 @@ export type TalkRealtimeConfig = {
|
||||
model?: string;
|
||||
/** Provider voice override for realtime sessions. */
|
||||
voice?: string;
|
||||
/** Additional system instructions appended to realtime Talk sessions. */
|
||||
instructions?: string;
|
||||
/** Realtime execution mode. */
|
||||
mode?: "realtime" | "stt-tts" | "transcription";
|
||||
/** Byte/session transport. */
|
||||
|
||||
@@ -24,6 +24,25 @@ describe("OpenClawSchema talk validation", () => {
|
||||
).toThrow(/consultThinkingLevel/i);
|
||||
});
|
||||
|
||||
it("accepts additional realtime Talk instructions", () => {
|
||||
expect(() =>
|
||||
OpenClawSchema.parse({
|
||||
talk: {
|
||||
realtime: {
|
||||
provider: "openai",
|
||||
providers: {
|
||||
openai: {
|
||||
model: "gpt-realtime",
|
||||
voice: "alloy",
|
||||
},
|
||||
},
|
||||
instructions: "Speak with crisp diction.",
|
||||
},
|
||||
},
|
||||
}),
|
||||
).not.toThrow();
|
||||
});
|
||||
|
||||
it.each([
|
||||
["boolean", true],
|
||||
["string", "1500"],
|
||||
|
||||
@@ -236,6 +236,7 @@ const TalkRealtimeSchema = z
|
||||
providers: z.record(z.string(), TalkProviderEntrySchema).optional(),
|
||||
model: z.string().optional(),
|
||||
voice: z.string().optional(),
|
||||
instructions: z.string().optional(),
|
||||
mode: z.enum(["realtime", "stt-tts", "transcription"]).optional(),
|
||||
transport: z.enum(["webrtc", "provider-websocket", "gateway-relay", "managed-room"]).optional(),
|
||||
brain: z.enum(["agent-consult", "direct-tools", "none"]).optional(),
|
||||
|
||||
@@ -153,6 +153,7 @@ describe("validateTalkConfigResult", () => {
|
||||
},
|
||||
model: "gpt-realtime",
|
||||
voice: "alloy",
|
||||
instructions: "Speak with crisp diction.",
|
||||
mode: "realtime",
|
||||
transport: "gateway-relay",
|
||||
brain: "agent-consult",
|
||||
|
||||
@@ -493,6 +493,7 @@ const TalkRealtimeConfigSchema = Type.Object(
|
||||
providers: Type.Optional(Type.Record(Type.String(), TalkProviderConfigSchema)),
|
||||
model: Type.Optional(Type.String()),
|
||||
voice: Type.Optional(Type.String()),
|
||||
instructions: Type.Optional(Type.String()),
|
||||
mode: Type.Optional(TalkModeSchema),
|
||||
transport: Type.Optional(TalkTransportSchema),
|
||||
brain: Type.Optional(TalkBrainSchema),
|
||||
|
||||
@@ -183,7 +183,7 @@ export const talkClientHandlers: GatewayRequestHandlers = {
|
||||
if (resolution.provider.createBrowserSession && transport !== "gateway-relay") {
|
||||
const session = await resolution.provider.createBrowserSession({
|
||||
providerConfig: resolution.providerConfig,
|
||||
instructions: buildRealtimeInstructions(),
|
||||
instructions: buildRealtimeInstructions(realtimeConfig.instructions),
|
||||
tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
|
||||
model: normalizeOptionalString(typedParams.model) ?? realtimeConfig.model,
|
||||
voice: normalizeOptionalString(typedParams.voice) ?? realtimeConfig.voice,
|
||||
|
||||
@@ -242,7 +242,7 @@ export const talkSessionHandlers: GatewayRequestHandlers = {
|
||||
connId,
|
||||
provider: resolution.provider,
|
||||
providerConfig: withRealtimeBrowserOverrides(resolution.providerConfig, { model, voice }),
|
||||
instructions: buildRealtimeInstructions(),
|
||||
instructions: buildRealtimeInstructions(realtimeConfig.instructions),
|
||||
tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
|
||||
model,
|
||||
voice,
|
||||
|
||||
@@ -127,6 +127,7 @@ export function buildTalkRealtimeConfig(config: OpenClawConfig, requestedProvide
|
||||
},
|
||||
model: normalizeOptionalString(talkRealtime?.model),
|
||||
voice: normalizeOptionalString(talkRealtime?.voice),
|
||||
instructions: normalizeOptionalString(talkRealtime?.instructions),
|
||||
mode: normalizeOptionalLowercaseString(talkRealtime?.mode),
|
||||
transport: normalizeOptionalLowercaseString(talkRealtime?.transport),
|
||||
brain: normalizeOptionalLowercaseString(talkRealtime?.brain),
|
||||
@@ -210,8 +211,14 @@ export function resolveConfiguredRealtimeTranscriptionProvider(params: {
|
||||
throw new Error("No realtime transcription provider registered");
|
||||
}
|
||||
|
||||
export function buildRealtimeInstructions(): string {
|
||||
return `You are OpenClaw's realtime voice interface. Keep spoken replies concise. If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`;
|
||||
const DEFAULT_REALTIME_INSTRUCTIONS = `You are OpenClaw's realtime voice interface. Keep spoken replies concise. If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`;
|
||||
|
||||
/**
 * Builds the system instructions handed to a realtime Talk provider session.
 *
 * The result always starts with DEFAULT_REALTIME_INSTRUCTIONS. When
 * `configuredInstructions` survives `normalizeOptionalString` as a truthy
 * value, it is appended after a blank line under an
 * "Additional realtime instructions:" heading, so the built-in guidance is
 * kept rather than replaced.
 *
 * @param configuredInstructions Optional extra instructions supplied by the caller.
 * @returns The default prompt, optionally followed by the extra instructions.
 */
export function buildRealtimeInstructions(configuredInstructions?: string): string {
  const additional = normalizeOptionalString(configuredInstructions);
  return additional
    ? `${DEFAULT_REALTIME_INSTRUCTIONS}\n\nAdditional realtime instructions:\n${additional}`
    : DEFAULT_REALTIME_INSTRUCTIONS;
}
|
||||
|
||||
export function withRealtimeBrowserOverrides(
|
||||
|
||||
@@ -498,6 +498,7 @@ describe("talk.session unified handlers", () => {
|
||||
realtime: {
|
||||
provider: "openai",
|
||||
providers: { openai: { apiKey: "openai-key" } },
|
||||
instructions: "Speak warmly.",
|
||||
},
|
||||
},
|
||||
}) as OpenClawConfig,
|
||||
@@ -513,6 +514,7 @@ describe("talk.session unified handlers", () => {
|
||||
model: "gpt-realtime",
|
||||
voice: "alloy",
|
||||
}),
|
||||
instructions: expect.stringContaining("Additional realtime instructions:\nSpeak warmly."),
|
||||
}),
|
||||
);
|
||||
expect(createRespond).toHaveBeenCalledWith(
|
||||
@@ -1196,7 +1198,7 @@ describe("talk.client.create handler", () => {
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
it("uses talk.realtime provider, model, and voice without reading speech provider config", async () => {
|
||||
it("uses talk.realtime provider, model, voice, and instructions without reading speech provider config", async () => {
|
||||
const createBrowserSession = vi.fn(async () => ({
|
||||
provider: "openai",
|
||||
transport: "webrtc" as const,
|
||||
@@ -1232,6 +1234,7 @@ describe("talk.client.create handler", () => {
|
||||
providers: { openai: { apiKey: "openai-key" } },
|
||||
model: "gpt-realtime",
|
||||
voice: "alloy",
|
||||
instructions: "Speak warmly.",
|
||||
},
|
||||
},
|
||||
}) as OpenClawConfig,
|
||||
@@ -1248,6 +1251,7 @@ describe("talk.client.create handler", () => {
|
||||
expect.objectContaining({
|
||||
model: "gpt-realtime",
|
||||
voice: "alloy",
|
||||
instructions: expect.stringContaining("Additional realtime instructions:\nSpeak warmly."),
|
||||
}),
|
||||
);
|
||||
expect(respond).toHaveBeenCalledWith(
|
||||
|
||||
Reference in New Issue
Block a user