mirror of https://github.com/moltbot/moltbot.git, synced 2026-05-13 23:56:07 +00:00

fix(providers): read nested llama cpp props context

committed by Peter Steinberger
parent 7c7d19ec84
commit f4be39c4f4
@@ -597,6 +597,7 @@ Docs: https://docs.openclaw.ai
 - Sessions cleanup: add `openclaw sessions cleanup --fix-dm-scope` so operators who return `session.dmScope` to `main` can dry-run and retire stale direct-DM session rows while preserving transcripts as deleted archives. Fixes #47561 and #45554. Thanks @BunsDev.
 - Doctor/Codex: repair legacy `openai-codex/*` routes and cron payload model refs to canonical `openai/*`, keep OpenAI agent turns on Codex by default, ignore stale whole-agent/session runtime pins, preserve explicit provider/model runtime policy, and migrate legacy runtime model refs to model-scoped runtime entries. Thanks @vincentkoc.
+- Providers/self-hosted: read model-scoped llama.cpp runtime context from `/props.default_generation_settings.n_ctx` while keeping top-level `n_ctx` as a fallback, so session budgeting reflects the loaded context window. Fixes #73664. Thanks @brokemac79.
 - Video generation: wait up to 20 minutes for slow fal/MiniMax queue-backed jobs, stop forwarding unsupported Google Veo generated-audio options, and normalize MiniMax `720P` requests to its supported `768P` resolution with the usual override warning/details instead of failing fallback.
 - Channels/durable delivery: preserve channel-specific final reply semantics when using durable sends, including Telegram selected quotes and silent error replies plus WhatsApp message-sending cancellations.
 - Channels/message lifecycle: build legacy channel delivery results from message receipts and add receipts to BlueBubbles, Feishu, Google Chat, iMessage, IRC, LINE, Nextcloud Talk, QQ Bot, Signal, Synology Chat, Tlon, Twitch, WhatsApp, Zalo, and Zalo Personal send results and owner-path reply delivery plus Discord, Matrix, Mattermost, Slack, and Teams send results while preserving existing message id compatibility.
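Background for the Providers/self-hosted entry above: recent llama.cpp server builds report the loaded context window nested under `default_generation_settings` in the `/props` payload, while older builds expose a top-level `n_ctx`. A minimal TypeScript sketch of the preferred-then-fallback read (illustrative type and function names; the shipped helper appears in the source diff below):

type PropsPayload = {
  default_generation_settings?: { n_ctx?: number };
  n_ctx?: number;
};

// Prefer the nested, model-scoped runtime value; fall back to the legacy top-level field.
function readRuntimeContextTokens(props: PropsPayload): number | undefined {
  return props.default_generation_settings?.n_ctx ?? props.n_ctx;
}

// readRuntimeContextTokens({ default_generation_settings: { n_ctx: 65_536 } }) -> 65_536
// readRuntimeContextTokens({ n_ctx: 32_768 }) -> 32_768 (older servers)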
@@ -146,7 +146,129 @@ describe("discoverOpenAICompatibleLocalModels", () => {
     expect(propsRelease).toHaveBeenCalledOnce();
   });

-  it("uses llama.cpp /props n_ctx as the runtime context cap", async () => {
+  it("uses llama.cpp nested /props n_ctx as the runtime context cap", async () => {
+    const modelsRelease = vi.fn(async () => undefined);
+    const propsRelease = vi.fn(async () => undefined);
+    fetchWithSsrFGuardMock.mockResolvedValueOnce({
+      response: new Response(
+        JSON.stringify({
+          data: [
+            {
+              id: "qwen3.6-mxfp4-moe",
+              meta: { n_ctx_train: 262_144 },
+            },
+          ],
+        }),
+        { status: 200 },
+      ),
+      finalUrl: "http://127.0.0.1:8080/v1/models",
+      release: modelsRelease,
+    });
+    fetchWithSsrFGuardMock.mockResolvedValueOnce({
+      response: new Response(JSON.stringify({ default_generation_settings: { n_ctx: 65_536 } }), {
+        status: 200,
+      }),
+      finalUrl: "http://127.0.0.1:8080/props",
+      release: propsRelease,
+    });
+
+    const models = await discoverOpenAICompatibleLocalModels({
+      baseUrl: "http://127.0.0.1:8080/v1",
+      label: "llama.cpp",
+      env: {},
+    });
+
+    expect(models).toEqual([
+      expect.objectContaining({
+        id: "qwen3.6-mxfp4-moe",
+        contextWindow: 262_144,
+        contextTokens: 65_536,
+      }),
+    ]);
+    expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
+      2,
+      expect.objectContaining({
+        url: "http://127.0.0.1:8080/props",
+      }),
+    );
+    expect(modelsRelease).toHaveBeenCalledOnce();
+    expect(propsRelease).toHaveBeenCalledOnce();
+  });
+
+  it("scopes llama.cpp /props runtime caps to each discovered model", async () => {
+    const modelsRelease = vi.fn(async () => undefined);
+    const firstPropsRelease = vi.fn(async () => undefined);
+    const secondPropsRelease = vi.fn(async () => undefined);
+    fetchWithSsrFGuardMock.mockResolvedValueOnce({
+      response: new Response(
+        JSON.stringify({
+          data: [
+            {
+              id: "qwen/router-a",
+              meta: { n_ctx_train: 262_144 },
+            },
+            {
+              id: "qwen/router-b",
+              meta: { n_ctx_train: 131_072 },
+            },
+          ],
+        }),
+        { status: 200 },
+      ),
+      finalUrl: "http://127.0.0.1:8080/v1/models",
+      release: modelsRelease,
+    });
+    fetchWithSsrFGuardMock.mockResolvedValueOnce({
+      response: new Response(JSON.stringify({ default_generation_settings: { n_ctx: 65_536 } }), {
+        status: 200,
+      }),
+      finalUrl: "http://127.0.0.1:8080/props?model=qwen%2Frouter-a",
+      release: firstPropsRelease,
+    });
+    fetchWithSsrFGuardMock.mockResolvedValueOnce({
+      response: new Response(JSON.stringify({ default_generation_settings: { n_ctx: 32_768 } }), {
+        status: 200,
+      }),
+      finalUrl: "http://127.0.0.1:8080/props?model=qwen%2Frouter-b",
+      release: secondPropsRelease,
+    });
+
+    const models = await discoverOpenAICompatibleLocalModels({
+      baseUrl: "http://127.0.0.1:8080/v1",
+      label: "llama.cpp",
+      env: {},
+    });
+
+    expect(models).toEqual([
+      expect.objectContaining({
+        id: "qwen/router-a",
+        contextWindow: 262_144,
+        contextTokens: 65_536,
+      }),
+      expect.objectContaining({
+        id: "qwen/router-b",
+        contextWindow: 131_072,
+        contextTokens: 32_768,
+      }),
+    ]);
+    expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
+      2,
+      expect.objectContaining({
+        url: "http://127.0.0.1:8080/props?model=qwen%2Frouter-a",
+      }),
+    );
+    expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
+      3,
+      expect.objectContaining({
+        url: "http://127.0.0.1:8080/props?model=qwen%2Frouter-b",
+      }),
+    );
+    expect(modelsRelease).toHaveBeenCalledOnce();
+    expect(firstPropsRelease).toHaveBeenCalledOnce();
+    expect(secondPropsRelease).toHaveBeenCalledOnce();
+  });
+
+  it("keeps top-level llama.cpp /props n_ctx as a compatibility fallback", async () => {
     const modelsRelease = vi.fn(async () => undefined);
     const propsRelease = vi.fn(async () => undefined);
     fetchWithSsrFGuardMock.mockResolvedValueOnce({
@@ -183,12 +305,6 @@ describe("discoverOpenAICompatibleLocalModels", () => {
         contextTokens: 65_536,
       }),
     ]);
-    expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
-      2,
-      expect.objectContaining({
-        url: "http://127.0.0.1:8080/props",
-      }),
-    );
     expect(modelsRelease).toHaveBeenCalledOnce();
     expect(propsRelease).toHaveBeenCalledOnce();
   });
@@ -42,6 +42,9 @@ type OpenAICompatModelsResponse = {
 };

 type LlamaCppPropsResponse = {
+  default_generation_settings?: {
+    n_ctx?: unknown;
+  };
   n_ctx?: unknown;
 };
@@ -76,23 +79,28 @@ function readPositiveInteger(value: unknown): number | undefined {
   return Math.trunc(value);
 }

-function resolveLlamaCppPropsUrl(baseUrl: string): string {
+function resolveLlamaCppPropsUrl(baseUrl: string, modelId?: string): string {
   const parsed = new URL(baseUrl);
   const pathname = parsed.pathname.replace(/\/+$/, "");
-  parsed.pathname = pathname.endsWith("/v1") ? pathname.slice(0, -3) || "/" : pathname;
+  const rootPathname = pathname.endsWith("/v1") ? pathname.slice(0, -3) || "/" : pathname;
+  parsed.pathname = `${rootPathname.replace(/\/+$/, "")}/props`;
   parsed.search = "";
   parsed.hash = "";
-  const root = parsed.toString().replace(/\/+$/, "");
-  return `${root}/props`;
+  const normalizedModelId = normalizeOptionalString(modelId);
+  if (normalizedModelId) {
+    parsed.searchParams.set("model", normalizedModelId);
+  }
+  return parsed.toString();
 }

 async function discoverLlamaCppRuntimeContextTokens(params: {
   baseUrl: string;
   apiKey?: string;
+  modelId?: string;
 }): Promise<number | undefined> {
   let url: string;
   try {
-    url = resolveLlamaCppPropsUrl(params.baseUrl);
+    url = resolveLlamaCppPropsUrl(params.baseUrl, params.modelId);
   } catch {
     return undefined;
   }
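For orientation, a self-contained sketch of the URL mapping the reworked `resolveLlamaCppPropsUrl` performs (the name `propsUrlSketch` is hypothetical, and the shipped code normalizes `modelId` via `normalizeOptionalString` rather than the plain trim shown here):

function propsUrlSketch(baseUrl: string, modelId?: string): string {
  const parsed = new URL(baseUrl);
  const pathname = parsed.pathname.replace(/\/+$/, "");
  // Strip a trailing /v1 segment so ".../v1" probes the server root's /props.
  const rootPathname = pathname.endsWith("/v1") ? pathname.slice(0, -3) || "/" : pathname;
  parsed.pathname = `${rootPathname.replace(/\/+$/, "")}/props`;
  parsed.search = "";
  parsed.hash = "";
  if (modelId?.trim()) {
    // Scope the probe to one model; URLSearchParams encodes "/" as %2F.
    parsed.searchParams.set("model", modelId.trim());
  }
  return parsed.toString();
}

// propsUrlSketch("http://127.0.0.1:8080/v1")
//   -> "http://127.0.0.1:8080/props"
// propsUrlSketch("http://127.0.0.1:8080/v1", "qwen/router-a")
//   -> "http://127.0.0.1:8080/props?model=qwen%2Frouter-a"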
@@ -111,7 +119,10 @@ async function discoverLlamaCppRuntimeContextTokens(params: {
       return undefined;
     }
     const data = (await response.json()) as LlamaCppPropsResponse;
-    return readPositiveInteger(data.n_ctx);
+    return (
+      readPositiveInteger(data.default_generation_settings?.n_ctx) ??
+      readPositiveInteger(data.n_ctx)
+    );
   } finally {
     await release();
   }
@@ -158,23 +169,41 @@ export async function discoverOpenAICompatibleLocalModels(params: {
       return [];
     }

-    const runtimeContextTokens =
-      params.contextWindow === undefined
-        ? await discoverLlamaCppRuntimeContextTokens({
-            baseUrl: trimmedBaseUrl,
-            apiKey: params.apiKey,
-          })
-        : undefined;
-
-    return models.flatMap((model) => {
+    const discoveredModels = models.flatMap((model) => {
       const modelId = normalizeOptionalString(model.id);
       if (!modelId) {
         return [];
       }
+      return [{ id: modelId, meta: model.meta }];
+    });
+    const runtimeContextTokensByModelId = new Map<string, number>();
+    if (params.contextWindow === undefined) {
+      const uniqueModelIds = [...new Set(discoveredModels.map((model) => model.id))];
+      const runtimeContextTokenResults = await Promise.all(
+        uniqueModelIds.map(
+          async (modelId) =>
+            [
+              modelId,
+              await discoverLlamaCppRuntimeContextTokens({
+                baseUrl: trimmedBaseUrl,
+                apiKey: params.apiKey,
+                modelId: uniqueModelIds.length > 1 ? modelId : undefined,
+              }),
+            ] as const,
+        ),
+      );
+      for (const [modelId, runtimeContextTokens] of runtimeContextTokenResults) {
+        if (runtimeContextTokens) {
+          runtimeContextTokensByModelId.set(modelId, runtimeContextTokens);
+        }
+      }
+    }
+
+    return discoveredModels.map((model) => {
       const modelConfig: ModelDefinitionConfig = {
-        id: modelId,
-        name: modelId,
-        reasoning: isReasoningModelHeuristic(modelId),
+        id: model.id,
+        name: model.id,
+        reasoning: isReasoningModelHeuristic(model.id),
         input: ["text"],
         cost: SELF_HOSTED_DEFAULT_COST,
         contextWindow:
@@ -183,10 +212,11 @@ export async function discoverOpenAICompatibleLocalModels(params: {
           SELF_HOSTED_DEFAULT_CONTEXT_WINDOW,
         maxTokens: params.maxTokens ?? SELF_HOSTED_DEFAULT_MAX_TOKENS,
       };
+      const runtimeContextTokens = runtimeContextTokensByModelId.get(model.id);
       if (runtimeContextTokens) {
        modelConfig.contextTokens = runtimeContextTokens;
       }
-      return [modelConfig];
+      return modelConfig;
     });
   } finally {
     await release();
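Net effect of the changes above: each discovered model carries two caps, `contextWindow` from `meta.n_ctx_train` in `/v1/models` and `contextTokens` from the model-scoped `/props` probe. A hedged sketch of how a session budgeter might combine them (values taken from the tests above; the `Math.min` rule is an illustration, not code from this commit):

// Values as asserted in the "scopes ... to each discovered model" test.
const discovered = {
  id: "qwen/router-b",
  contextWindow: 131_072, // training window, from /v1/models meta.n_ctx_train
  contextTokens: 32_768, // loaded runtime context, from /props
};

// Assumed budgeting rule: never plan a prompt past the smaller of the two caps.
const effectiveContext = Math.min(discovered.contextTokens, discovered.contextWindow); // 32_768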