Enable realtime Talk instructions config

This commit is contained in:
VACInc
2026-05-09 19:45:40 -04:00
committed by Peter Steinberger
parent 1463974c46
commit c5cc1bb184
16 changed files with 73 additions and 7 deletions

View File

@@ -1,4 +1,4 @@
6a0d49baf0a07563146d9c3fa7650175f4b3e5bf8ed67a8f8a1930fd8dc18f3b config-baseline.json
b58f16663413d37906072f3ed7fa7c0ed0bac6347ab76627180ea4f21331b9c5 config-baseline.core.json
7317136882cafd0cfd23146218b7bec4a65a2c08328f7a1f8e22ac04789ec35d config-baseline.json
8e0f397eaeaa858f016a080e2585c69fa6744cdbb0e7f68d71cfc693971dbc8c config-baseline.core.json
25c6e70d5b4925e07549072159ce4fcad45813fed12fa36a2f43d3568ca8dd96 config-baseline.channel.json
af8a8e8616a0146ad989ff1bc0e8cf62c61a4d434dd67bbe7fe082c5c204fada config-baseline.plugin.json

View File

@@ -1396,6 +1396,7 @@ Defaults for Talk mode (macOS/iOS/Android).
voice: "cedar",
},
},
instructions: "Speak warmly and keep answers brief.",
mode: "realtime",
transport: "webrtc",
brain: "agent-consult",
@@ -1416,6 +1417,7 @@ Defaults for Talk mode (macOS/iOS/Android).
- `consultFastMode` sets a one-shot fast-mode override for Control UI Talk realtime consults without changing the session's normal fast-mode setting.
- `speechLocale` sets the BCP 47 locale id used by iOS/macOS Talk speech recognition. Leave unset to use the device default.
- `silenceTimeoutMs` controls how long Talk mode waits after user silence before it sends the transcript. Unset keeps the platform default pause window (`700 ms on macOS and Android, 900 ms on iOS`).
- `realtime.instructions` appends provider-facing system instructions to OpenClaw's built-in realtime prompt, so voice style can be configured without losing default `openclaw_agent_consult` guidance.
---

View File

@@ -85,6 +85,7 @@ Supported keys:
voice: "cedar",
},
},
instructions: "Speak warmly and keep answers brief.",
mode: "realtime",
transport: "webrtc",
brain: "agent-consult",
@@ -108,6 +109,7 @@ Defaults:
- `realtime.providers.<provider>` stores provider-owned realtime config. The browser receives only ephemeral or constrained session credentials, never a standard API key.
- `realtime.providers.openai.voice`: built-in OpenAI Realtime voice id. Current `gpt-realtime-2` voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`; `marin` and `cedar` are recommended for best quality.
- `realtime.brain`: `agent-consult` routes realtime tool calls through Gateway policy; `direct-tools` is owner-only compatibility behavior; `none` is for transcription or external orchestration.
- `realtime.instructions`: appends provider-facing system instructions to OpenClaw's built-in realtime prompt. Use it for voice style and tone; OpenClaw keeps the default `openclaw_agent_consult` guidance.
- `talk.catalog` exposes each provider's valid modes, transports, brain strategies, realtime audio formats, and capability flags so first-party Talk clients can avoid unsupported combinations.
- Streaming transcription providers are discovered through `talk.catalog.transcription`. The current Gateway relay uses the Voice Call streaming provider config until the dedicated Talk transcription config surface is added.
- `speechLocale`: optional BCP 47 locale id for on-device Talk speech recognition on iOS/macOS. Leave unset to use the device default.

View File

@@ -162,6 +162,8 @@ export const FIELD_HELP: Record<string, string> = {
"Realtime provider model id override for browser or Gateway-owned Talk sessions.",
"talk.realtime.voice":
"Realtime provider voice id override for browser or Gateway-owned Talk sessions.",
"talk.realtime.instructions":
"Additional system instructions appended to OpenClaw's built-in realtime Talk prompt. Use this for voice style, tone, and other provider-facing realtime behavior while keeping agent-consult guidance intact.",
"talk.realtime.mode": "Talk execution mode: realtime, stt-tts, or transcription.",
"talk.realtime.transport":
"Talk byte/session transport: webrtc, provider-websocket, gateway-relay, or managed-room.",

View File

@@ -893,6 +893,7 @@ export const FIELD_LABELS: Record<string, string> = {
"talk.realtime.providers.*.apiKey": "Talk Realtime Provider API Key", // pragma: allowlist secret
"talk.realtime.model": "Talk Realtime Model",
"talk.realtime.voice": "Talk Realtime Voice",
"talk.realtime.instructions": "Talk Realtime Instructions",
"talk.realtime.mode": "Talk Realtime Mode",
"talk.realtime.transport": "Talk Realtime Transport",
"talk.realtime.brain": "Talk Realtime Brain",

View File

@@ -133,6 +133,26 @@ describe("talk normalization", () => {
});
});
it("preserves normalized realtime instructions in talk.config payloads", () => {
  // Realtime instructions carry surrounding whitespace on purpose: the
  // normalizer is expected to trim it before the payload is returned.
  const talkConfig = {
    realtime: {
      provider: "openai",
      providers: {
        openai: {
          model: "gpt-realtime",
          voice: "alloy",
        },
      },
      instructions: " Speak with crisp diction. ",
    },
  };
  const response = buildTalkConfigResponse(talkConfig);
  expect(response?.realtime).toMatchObject({
    provider: "openai",
    instructions: "Speak with crisp diction.",
  });
});
it("does not report an active provider when the configured speech provider cannot resolve", () => {
const mismatchPayload = buildTalkConfigResponse({
provider: "acme",

View File

@@ -110,6 +110,10 @@ function normalizeTalkRealtimeConfig(value: unknown): TalkRealtimeConfig | undef
if (voice) {
normalized.voice = voice;
}
const instructions = normalizeOptionalString(source.instructions);
if (instructions) {
normalized.instructions = instructions;
}
if (source.mode === "realtime" || source.mode === "stt-tts" || source.mode === "transcription") {
normalized.mode = source.mode;
}

View File

@@ -54,6 +54,8 @@ export type TalkRealtimeConfig = {
model?: string;
/** Provider voice override for realtime sessions. */
voice?: string;
/** Additional system instructions appended to realtime Talk sessions. */
instructions?: string;
/** Realtime execution mode. */
mode?: "realtime" | "stt-tts" | "transcription";
/** Byte/session transport. */

View File

@@ -24,6 +24,25 @@ describe("OpenClawSchema talk validation", () => {
).toThrow(/consultThinkingLevel/i);
});
it("accepts additional realtime Talk instructions", () => {
  // A talk.realtime block that includes the optional instructions field
  // must pass schema validation unchanged.
  const candidate = {
    talk: {
      realtime: {
        provider: "openai",
        providers: {
          openai: {
            model: "gpt-realtime",
            voice: "alloy",
          },
        },
        instructions: "Speak with crisp diction.",
      },
    },
  };
  expect(() => OpenClawSchema.parse(candidate)).not.toThrow();
});
it.each([
["boolean", true],
["string", "1500"],

View File

@@ -236,6 +236,7 @@ const TalkRealtimeSchema = z
providers: z.record(z.string(), TalkProviderEntrySchema).optional(),
model: z.string().optional(),
voice: z.string().optional(),
instructions: z.string().optional(),
mode: z.enum(["realtime", "stt-tts", "transcription"]).optional(),
transport: z.enum(["webrtc", "provider-websocket", "gateway-relay", "managed-room"]).optional(),
brain: z.enum(["agent-consult", "direct-tools", "none"]).optional(),

View File

@@ -153,6 +153,7 @@ describe("validateTalkConfigResult", () => {
},
model: "gpt-realtime",
voice: "alloy",
instructions: "Speak with crisp diction.",
mode: "realtime",
transport: "gateway-relay",
brain: "agent-consult",

View File

@@ -493,6 +493,7 @@ const TalkRealtimeConfigSchema = Type.Object(
providers: Type.Optional(Type.Record(Type.String(), TalkProviderConfigSchema)),
model: Type.Optional(Type.String()),
voice: Type.Optional(Type.String()),
instructions: Type.Optional(Type.String()),
mode: Type.Optional(TalkModeSchema),
transport: Type.Optional(TalkTransportSchema),
brain: Type.Optional(TalkBrainSchema),

View File

@@ -183,7 +183,7 @@ export const talkClientHandlers: GatewayRequestHandlers = {
if (resolution.provider.createBrowserSession && transport !== "gateway-relay") {
const session = await resolution.provider.createBrowserSession({
providerConfig: resolution.providerConfig,
instructions: buildRealtimeInstructions(),
instructions: buildRealtimeInstructions(realtimeConfig.instructions),
tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
model: normalizeOptionalString(typedParams.model) ?? realtimeConfig.model,
voice: normalizeOptionalString(typedParams.voice) ?? realtimeConfig.voice,

View File

@@ -242,7 +242,7 @@ export const talkSessionHandlers: GatewayRequestHandlers = {
connId,
provider: resolution.provider,
providerConfig: withRealtimeBrowserOverrides(resolution.providerConfig, { model, voice }),
instructions: buildRealtimeInstructions(),
instructions: buildRealtimeInstructions(realtimeConfig.instructions),
tools: [REALTIME_VOICE_AGENT_CONSULT_TOOL],
model,
voice,

View File

@@ -127,6 +127,7 @@ export function buildTalkRealtimeConfig(config: OpenClawConfig, requestedProvide
},
model: normalizeOptionalString(talkRealtime?.model),
voice: normalizeOptionalString(talkRealtime?.voice),
instructions: normalizeOptionalString(talkRealtime?.instructions),
mode: normalizeOptionalLowercaseString(talkRealtime?.mode),
transport: normalizeOptionalLowercaseString(talkRealtime?.transport),
brain: normalizeOptionalLowercaseString(talkRealtime?.brain),
@@ -210,8 +211,14 @@ export function resolveConfiguredRealtimeTranscriptionProvider(params: {
throw new Error("No realtime transcription provider registered");
}
export function buildRealtimeInstructions(): string {
return `You are OpenClaw's realtime voice interface. Keep spoken replies concise. If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`;
const DEFAULT_REALTIME_INSTRUCTIONS = `You are OpenClaw's realtime voice interface. Keep spoken replies concise. If the user asks for code, repository state, tools, files, current OpenClaw context, or deeper reasoning, call ${REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME} and then summarize the result naturally.`;
export function buildRealtimeInstructions(configuredInstructions?: string): string {
const extra = normalizeOptionalString(configuredInstructions);
if (!extra) {
return DEFAULT_REALTIME_INSTRUCTIONS;
}
return `${DEFAULT_REALTIME_INSTRUCTIONS}\n\nAdditional realtime instructions:\n${extra}`;
}
export function withRealtimeBrowserOverrides(

View File

@@ -498,6 +498,7 @@ describe("talk.session unified handlers", () => {
realtime: {
provider: "openai",
providers: { openai: { apiKey: "openai-key" } },
instructions: "Speak warmly.",
},
},
}) as OpenClawConfig,
@@ -513,6 +514,7 @@ describe("talk.session unified handlers", () => {
model: "gpt-realtime",
voice: "alloy",
}),
instructions: expect.stringContaining("Additional realtime instructions:\nSpeak warmly."),
}),
);
expect(createRespond).toHaveBeenCalledWith(
@@ -1196,7 +1198,7 @@ describe("talk.client.create handler", () => {
vi.clearAllMocks();
});
it("uses talk.realtime provider, model, and voice without reading speech provider config", async () => {
it("uses talk.realtime provider, model, voice, and instructions without reading speech provider config", async () => {
const createBrowserSession = vi.fn(async () => ({
provider: "openai",
transport: "webrtc" as const,
@@ -1232,6 +1234,7 @@ describe("talk.client.create handler", () => {
providers: { openai: { apiKey: "openai-key" } },
model: "gpt-realtime",
voice: "alloy",
instructions: "Speak warmly.",
},
},
}) as OpenClawConfig,
@@ -1248,6 +1251,7 @@ describe("talk.client.create handler", () => {
expect.objectContaining({
model: "gpt-realtime",
voice: "alloy",
instructions: expect.stringContaining("Additional realtime instructions:\nSpeak warmly."),
}),
);
expect(respond).toHaveBeenCalledWith(