Enable realtime Talk instructions config

This commit is contained in:
VACInc
2026-05-09 19:45:40 -04:00
committed by Peter Steinberger
parent 1463974c46
commit c5cc1bb184
16 changed files with 73 additions and 7 deletions

View File

@@ -1,4 +1,4 @@
6a0d49baf0a07563146d9c3fa7650175f4b3e5bf8ed67a8f8a1930fd8dc18f3b config-baseline.json
b58f16663413d37906072f3ed7fa7c0ed0bac6347ab76627180ea4f21331b9c5 config-baseline.core.json
7317136882cafd0cfd23146218b7bec4a65a2c08328f7a1f8e22ac04789ec35d config-baseline.json
8e0f397eaeaa858f016a080e2585c69fa6744cdbb0e7f68d71cfc693971dbc8c config-baseline.core.json
25c6e70d5b4925e07549072159ce4fcad45813fed12fa36a2f43d3568ca8dd96 config-baseline.channel.json
af8a8e8616a0146ad989ff1bc0e8cf62c61a4d434dd67bbe7fe082c5c204fada config-baseline.plugin.json

View File

@@ -1396,6 +1396,7 @@ Defaults for Talk mode (macOS/iOS/Android).
voice: "cedar",
},
},
instructions: "Speak warmly and keep answers brief.",
mode: "realtime",
transport: "webrtc",
brain: "agent-consult",
@@ -1416,6 +1417,7 @@ Defaults for Talk mode (macOS/iOS/Android).
- `consultFastMode` sets a one-shot fast-mode override for Control UI Talk realtime consults without changing the session's normal fast-mode setting.
- `speechLocale` sets the BCP 47 locale id used by iOS/macOS Talk speech recognition. Leave unset to use the device default.
- `silenceTimeoutMs` controls how long Talk mode waits after user silence before it sends the transcript. Unset keeps the platform default pause window (`700 ms on macOS and Android, 900 ms on iOS`).
- `realtime.instructions` appends provider-facing system instructions to OpenClaw's built-in realtime prompt, so voice style can be configured without losing default `openclaw_agent_consult` guidance.
---

View File

@@ -85,6 +85,7 @@ Supported keys:
voice: "cedar",
},
},
instructions: "Speak warmly and keep answers brief.",
mode: "realtime",
transport: "webrtc",
brain: "agent-consult",
@@ -108,6 +109,7 @@ Defaults:
- `realtime.providers.<provider>` stores provider-owned realtime config. The browser receives only ephemeral or constrained session credentials, never a standard API key.
- `realtime.providers.openai.voice`: built-in OpenAI Realtime voice id. Current `gpt-realtime-2` voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`; `marin` and `cedar` are recommended for best quality.
- `realtime.brain`: `agent-consult` routes realtime tool calls through Gateway policy; `direct-tools` is owner-only compatibility behavior; `none` is for transcription or external orchestration.
- `realtime.instructions`: appends provider-facing system instructions to OpenClaw's built-in realtime prompt. Use it for voice style and tone; OpenClaw keeps the default `openclaw_agent_consult` guidance.
- `talk.catalog` exposes each provider's valid modes, transports, brain strategies, realtime audio formats, and capability flags so first-party Talk clients can avoid unsupported combinations.
- Streaming transcription providers are discovered through `talk.catalog.transcription`. The current Gateway relay uses the Voice Call streaming provider config until the dedicated Talk transcription config surface is added.
- `speechLocale`: optional BCP 47 locale id for on-device Talk speech recognition on iOS/macOS. Leave unset to use the device default.

View File

@@ -162,6 +162,8 @@ export const FIELD_HELP: Record<string, string> = {
"Realtime provider model id override for browser or Gateway-owned Talk sessions.",
"talk.realtime.voice":
"Realtime provider voice id override for browser or Gateway-owned Talk sessions.",
"talk.realtime.instructions":
"Additional system instructions appended to OpenClaw's built-in realtime Talk prompt. Use this for voice style, tone, and other provider-facing realtime behavior while keeping agent-consult guidance intact.",
"talk.realtime.mode": "Talk execution mode: realtime, stt-tts, or transcription.",
"talk.realtime.transport":
"Talk byte/session transport: webrtc, provider-websocket, gateway-relay, or managed-room.",

View File

@@ -893,6 +893,7 @@ export const FIELD_LABELS: Record<string, string> = {
"talk.realtime.providers.*.apiKey": "Talk Realtime Provider API Key", // pragma: allowlist secret
"talk.realtime.model": "Talk Realtime Model",
"talk.realtime.voice": "Talk Realtime Voice",
"talk.realtime.instructions": "Talk Realtime Instructions",
"talk.realtime.mode": "Talk Realtime Mode",
"talk.realtime.transport": "Talk Realtime Transport",
"talk.realtime.brain": "Talk Realtime Brain",

View File

@@ -133,6 +133,26 @@ describe("talk normalization", () => {
});
});
it("preserves normalized realtime instructions in talk.config payloads", () => {
  // Realtime instructions carry surrounding whitespace on purpose: the
  // normalizer is expected to trim it before the payload is returned.
  const talkConfig = {
    realtime: {
      provider: "openai",
      providers: {
        openai: {
          model: "gpt-realtime",
          voice: "alloy",
        },
      },
      instructions: " Speak with crisp diction. ",
    },
  };
  const response = buildTalkConfigResponse(talkConfig);
  expect(response?.realtime).toMatchObject({
    provider: "openai",
    instructions: "Speak with crisp diction.",
  });
});
it("does not report an active provider when the configured speech provider cannot resolve", () => {
const mismatchPayload = buildTalkConfigResponse({
provider: "acme",

View File

@@ -110,6 +110,10 @@ function normalizeTalkRealtimeConfig(value: unknown): TalkRealtimeConfig | undef
if (voice) {
normalized.voice = voice;
}
const instructions = normalizeOptionalString(source.instructions);
if (instructions) {
normalized.instructions = instructions;
}
if (source.mode === "realtime" || source.mode === "stt-tts" || source.mode === "transcription") {
normalized.mode = source.mode;
}

View File

@@ -54,6 +54,8 @@ export type TalkRealtimeConfig = {
model?: string;
/** Provider voice override for realtime sessions. */
voice?: string;
/** Additional system instructions appended to realtime Talk sessions. */
instructions?: string;
/** Realtime execution mode. */
mode?: "realtime" | "stt-tts" | "transcription";
/** Byte/session transport. */

View File

@@ -24,6 +24,25 @@ describe("OpenClawSchema talk validation", () => {
).toThrow(/consultThinkingLevel/i);
});
it("accepts additional realtime Talk instructions", () => {
  // A talk.realtime block that includes the optional instructions field
  // must pass schema validation unchanged.
  const candidate = {
    talk: {
      realtime: {
        provider: "openai",
        providers: {
          openai: {
            model: "gpt-realtime",
            voice: "alloy",
          },
        },
        instructions: "Speak with crisp diction.",
      },
    },
  };
  expect(() => OpenClawSchema.parse(candidate)).not.toThrow();
});
it.each([
["boolean", true],
["string", "1500"],

View File

@@ -236,6 +236,7 @@ const TalkRealtimeSchema = z
providers: z.record(z.string(), TalkProviderEntrySchema).optional(),
model: z.string().optional(),
voice: z.string().optional(),
instructions: z.string().optional(),
mode: z.enum(["realtime", "stt-tts", "transcription"]).optional(),
transport: z.enum(["webrtc", "provider-websocket", "gateway-relay", "managed-room"]).optional(),
brain: z.enum(["agent-consult", "direct-tools", "none"]).optional(),

View File

@@ -153,6 +153,7 @@ describe("validateTalkConfigResult", () => {
},
model: "gpt-realtime",
voice: "alloy",
instructions: "Speak with crisp diction.",
mode: "realtime",
transport: "gateway-relay",
brain: "agent-consult",

View File

@@ -493,6 +493,7 @@ const TalkRealtimeConfigSchema = Type.Object(
providers: Type.Optional(Type.Record(Type.String(), TalkProviderConfigSchema)),
model: Type.Optional(Type.String()),
voice: Type.Optional(Type.String()),
instructions: Type.Optional(Type.String()),
mode: Type.Optional(TalkModeSchema),
transport: Type.Optional(TalkTransportSchema),
brain: Type.Optional(TalkBrainSchema),

View File

@@ -183,7 +183,7 @@ export const talkClientHandlers: GatewayRequestHandlers = {
if (resolution.provider.createBrowserSession && transport !== "gateway-relay") {
const session = await resolution.provider.createBrowserSession({
providerConfig: resolution.providerConfig,
instructions: buildRealtimeInstructions(),
instructions: buildRealtimeInstructions(realtimeConfig.instructions),
tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
model: normalizeOptionalString(typedParams.model) ?? realtimeConfig.model,
voice: normalizeOptionalString(typedParams.voice) ?? realtimeConfig.voice,

View File

@@ -242,7 +242,7 @@ export const talkSessionHandlers: GatewayRequestHandlers = {
connId,
provider: resolution.provider,
providerConfig: withRealtimeBrowserOverrides(resolution.providerConfig, { model, voice }),
instructions: buildRealtimeInstructions(),
instructions: buildRealtimeInstructions(realtimeConfig.instructions),
tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
model,
voice,

View File

@@ -127,6 +127,7 @@ export function buildTalkRealtimeConfig(config: OpenClawConfig, requestedProvide
},
model: normalizeOptionalString(talkRealtime?.model),
voice: normalizeOptionalString(talkRealtime?.voice),
instructions: normalizeOptionalString(talkRealtime?.instructions),
mode: normalizeOptionalLowercaseString(talkRealtime?.mode),
transport: normalizeOptionalLowercaseString(talkRealtime?.transport),
brain: normalizeOptionalLowercaseString(talkRealtime?.brain),
@@ -210,8 +211,14 @@ export function resolveConfiguredRealtimeTranscriptionProvider(params: {
throw new Error("No realtime transcription provider registered");
}
export function buildRealtimeInstructions(): string {
return `You are OpenClaw's realtime voice interface. Keep spoken replies concise. If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`;
const DEFAULT_REALTIME_INSTRUCTIONS = `You are OpenClaw's realtime voice interface. Keep spoken replies concise. If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`;
export function buildRealtimeInstructions(configuredInstructions?: string): string {
const extra = normalizeOptionalString(configuredInstructions);
if (!extra) {
return DEFAULT_REALTIME_INSTRUCTIONS;
}
return `${DEFAULT_REALTIME_INSTRUCTIONS}\n\nAdditional realtime instructions:\n${extra}`;
}
export function withRealtimeBrowserOverrides(

View File

@@ -498,6 +498,7 @@ describe("talk.session unified handlers", () => {
realtime: {
provider: "openai",
providers: { openai: { apiKey: "openai-key" } },
instructions: "Speak warmly.",
},
},
}) as OpenClawConfig,
@@ -513,6 +514,7 @@ describe("talk.session unified handlers", () => {
model: "gpt-realtime",
voice: "alloy",
}),
instructions: expect.stringContaining("Additional realtime instructions:\nSpeak warmly."),
}),
);
expect(createRespond).toHaveBeenCalledWith(
@@ -1196,7 +1198,7 @@ describe("talk.client.create handler", () => {
vi.clearAllMocks();
});
it("uses talk.realtime provider, model, and voice without reading speech provider config", async () => {
it("uses talk.realtime provider, model, voice, and instructions without reading speech provider config", async () => {
const createBrowserSession = vi.fn(async () => ({
provider: "openai",
transport: "webrtc" as const,
@@ -1232,6 +1234,7 @@ describe("talk.client.create handler", () => {
providers: { openai: { apiKey: "openai-key" } },
model: "gpt-realtime",
voice: "alloy",
instructions: "Speak warmly.",
},
},
}) as OpenClawConfig,
@@ -1248,6 +1251,7 @@ describe("talk.client.create handler", () => {
expect.objectContaining({
model: "gpt-realtime",
voice: "alloy",
instructions: expect.stringContaining("Additional realtime instructions:\nSpeak warmly."),
}),
);
expect(respond).toHaveBeenCalledWith(