fix(providers): use llama.cpp runtime context cap
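
When no explicit contextWindow is configured, model discovery now probes
llama.cpp's /props endpoint (same SSRF guard and auth header, 2500ms timeout)
and records a positive n_ctx as contextTokens, the runtime context cap.
contextWindow itself falls back to the model's n_ctx_train metadata from
/v1/models before the static default. An explicitly configured contextWindow
skips the probe and leaves contextTokens unset.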

Authored by brokemac79 on 2026-04-29 05:31:34 +01:00
committed by Peter Steinberger
parent ccdaf1875a
commit 7c7d19ec84
2 changed files with 194 additions and 23 deletions


@@ -88,6 +88,7 @@ async function configureSelfHostedTestProvider(params: {
describe("discoverOpenAICompatibleLocalModels", () => {
it("uses guarded fetch pinned to the configured self-hosted provider", async () => {
const release = vi.fn(async () => undefined);
const propsRelease = vi.fn(async () => undefined);
fetchWithSsrFGuardMock.mockResolvedValueOnce({
response: new Response(JSON.stringify({ data: [{ id: "Qwen/Qwen3-32B" }] }), {
status: 200,
@@ -95,6 +96,11 @@ describe("discoverOpenAICompatibleLocalModels", () => {
finalUrl: "http://127.0.0.1:8000/v1/models",
release,
});
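// Discovery now follows up with a /props probe; replying 404 here exercises
// the graceful fallback (no contextTokens is attached to the discovered model).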
fetchWithSsrFGuardMock.mockResolvedValueOnce({
response: new Response("{}", { status: 404 }),
finalUrl: "http://127.0.0.1:8000/props",
release: propsRelease,
});
const models = await discoverOpenAICompatibleLocalModels({
baseUrl: "http://127.0.0.1:8000/v1/",
@@ -114,15 +120,107 @@ describe("discoverOpenAICompatibleLocalModels", () => {
maxTokens: 8192,
},
]);
- expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith({
- url: "http://127.0.0.1:8000/v1/models",
- init: { headers: { Authorization: "Bearer self-hosted-test-key" } },
- policy: {
- hostnameAllowlist: ["127.0.0.1"],
- allowPrivateNetwork: true,
- },
- timeoutMs: 5000,
- });
+ expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith(
+ expect.objectContaining({
+ url: "http://127.0.0.1:8000/v1/models",
+ init: { headers: { Authorization: "Bearer self-hosted-test-key" } },
+ policy: {
+ hostnameAllowlist: ["127.0.0.1"],
+ allowPrivateNetwork: true,
+ },
+ timeoutMs: 5000,
+ }),
+ );
+ expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith(
+ expect.objectContaining({
+ url: "http://127.0.0.1:8000/props",
+ init: { headers: { Authorization: "Bearer self-hosted-test-key" } },
+ policy: {
+ hostnameAllowlist: ["127.0.0.1"],
+ allowPrivateNetwork: true,
+ },
+ timeoutMs: 2500,
+ }),
+ );
expect(release).toHaveBeenCalledOnce();
expect(propsRelease).toHaveBeenCalledOnce();
});
it("uses llama.cpp /props n_ctx as the runtime context cap", async () => {
const modelsRelease = vi.fn(async () => undefined);
const propsRelease = vi.fn(async () => undefined);
fetchWithSsrFGuardMock.mockResolvedValueOnce({
response: new Response(
JSON.stringify({
data: [
{
id: "qwen3.6-mxfp4-moe",
meta: { n_ctx_train: 262_144 },
},
],
}),
{ status: 200 },
),
finalUrl: "http://127.0.0.1:8080/v1/models",
release: modelsRelease,
});
fetchWithSsrFGuardMock.mockResolvedValueOnce({
response: new Response(JSON.stringify({ n_ctx: 65_536 }), { status: 200 }),
finalUrl: "http://127.0.0.1:8080/props",
release: propsRelease,
});
const models = await discoverOpenAICompatibleLocalModels({
baseUrl: "http://127.0.0.1:8080/v1",
label: "llama.cpp",
env: {},
});
expect(models).toEqual([
expect.objectContaining({
id: "qwen3.6-mxfp4-moe",
contextWindow: 262_144,
contextTokens: 65_536,
}),
]);
expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
2,
expect.objectContaining({
url: "http://127.0.0.1:8080/props",
}),
);
expect(modelsRelease).toHaveBeenCalledOnce();
expect(propsRelease).toHaveBeenCalledOnce();
});
it("preserves explicit configured context windows ahead of llama.cpp /props", async () => {
const release = vi.fn(async () => undefined);
fetchWithSsrFGuardMock.mockResolvedValueOnce({
response: new Response(
JSON.stringify({
data: [{ id: "qwen3.6-mxfp4-moe", meta: { n_ctx_train: 262_144 } }],
}),
{ status: 200 },
),
finalUrl: "http://127.0.0.1:8080/v1/models",
release,
});
const models = await discoverOpenAICompatibleLocalModels({
baseUrl: "http://127.0.0.1:8080/v1",
label: "llama.cpp",
contextWindow: 65_536,
env: {},
});
expect(models).toEqual([
expect.objectContaining({
id: "qwen3.6-mxfp4-moe",
contextWindow: 65_536,
}),
]);
expect(models[0]).not.toHaveProperty("contextTokens");
expect(fetchWithSsrFGuardMock).toHaveBeenCalledTimes(1);
expect(release).toHaveBeenCalledOnce();
});


@@ -35,9 +35,16 @@ const log = createSubsystemLogger("plugins/self-hosted-provider-setup");
type OpenAICompatModelsResponse = {
data?: Array<{
id?: string;
meta?: {
n_ctx_train?: unknown;
};
}>;
};
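// Minimal shape of llama.cpp's GET /props payload as consumed here: only the
// runtime context size (n_ctx) is read, and it is validated before use.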
type LlamaCppPropsResponse = {
n_ctx?: unknown;
};
function isReasoningModelHeuristic(modelId: string): boolean {
return /r1|reasoning|think|reason/i.test(modelId);
}
@@ -62,6 +69,57 @@ function buildSelfHostedBaseUrlSsrFPolicy(baseUrl: string): SsrFPolicy | undefin
}
}
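// Accepts only finite, positive numbers and truncates them to an integer;
// anything else (NaN, Infinity, non-numbers, non-positive values) yields undefined.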
function readPositiveInteger(value: unknown): number | undefined {
if (typeof value !== "number" || !Number.isFinite(value) || value <= 0) {
return undefined;
}
return Math.trunc(value);
}
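// Derives the llama.cpp /props URL from an OpenAI-compatible base URL by
// dropping a trailing /v1 segment plus any query or hash. For example
// (illustrative inputs, not from the tests):
//   http://127.0.0.1:8080/v1   -> http://127.0.0.1:8080/props
//   http://host:8080/llama/v1/ -> http://host:8080/llama/props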
function resolveLlamaCppPropsUrl(baseUrl: string): string {
const parsed = new URL(baseUrl);
const pathname = parsed.pathname.replace(/\/+$/, "");
parsed.pathname = pathname.endsWith("/v1") ? pathname.slice(0, -3) || "/" : pathname;
parsed.search = "";
parsed.hash = "";
const root = parsed.toString().replace(/\/+$/, "");
return `${root}/props`;
}
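// Best-effort probe of llama.cpp's /props endpoint for the server's runtime
// context size. Any failure (unparsable base URL, SSRF-guard rejection,
// network error, non-2xx response, or a missing/invalid n_ctx) resolves to
// undefined so discovery falls back to model metadata or the static default.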
async function discoverLlamaCppRuntimeContextTokens(params: {
baseUrl: string;
apiKey?: string;
}): Promise<number | undefined> {
let url: string;
try {
url = resolveLlamaCppPropsUrl(params.baseUrl);
} catch {
return undefined;
}
try {
const trimmedApiKey = normalizeOptionalString(params.apiKey);
const { response, release } = await fetchWithSsrFGuard({
url,
init: {
headers: trimmedApiKey ? { Authorization: `Bearer ${trimmedApiKey}` } : undefined,
},
policy: buildSelfHostedBaseUrlSsrFPolicy(params.baseUrl),
timeoutMs: 2500,
});
try {
if (!response.ok) {
return undefined;
}
const data = (await response.json()) as LlamaCppPropsResponse;
return readPositiveInteger(data.n_ctx);
} finally {
await release();
}
} catch {
return undefined;
}
}
export async function discoverOpenAICompatibleLocalModels(params: {
baseUrl: string;
apiKey?: string;
@@ -100,21 +158,36 @@ export async function discoverOpenAICompatibleLocalModels(params: {
return [];
}
- return models
- .map((model) => ({ id: normalizeOptionalString(model.id) ?? "" }))
- .filter((model) => Boolean(model.id))
- .map((model) => {
- const modelId = model.id;
- return {
- id: modelId,
- name: modelId,
- reasoning: isReasoningModelHeuristic(modelId),
- input: ["text"],
- cost: SELF_HOSTED_DEFAULT_COST,
- contextWindow: params.contextWindow ?? SELF_HOSTED_DEFAULT_CONTEXT_WINDOW,
- maxTokens: params.maxTokens ?? SELF_HOSTED_DEFAULT_MAX_TOKENS,
- } satisfies ModelDefinitionConfig;
- });
+ const runtimeContextTokens =
+ params.contextWindow === undefined
+ ? await discoverLlamaCppRuntimeContextTokens({
+ baseUrl: trimmedBaseUrl,
+ apiKey: params.apiKey,
+ })
+ : undefined;
+ return models.flatMap((model) => {
+ const modelId = normalizeOptionalString(model.id);
+ if (!modelId) {
+ return [];
+ }
+ const modelConfig: ModelDefinitionConfig = {
+ id: modelId,
+ name: modelId,
+ reasoning: isReasoningModelHeuristic(modelId),
+ input: ["text"],
+ cost: SELF_HOSTED_DEFAULT_COST,
+ contextWindow:
+ params.contextWindow ??
+ readPositiveInteger(model.meta?.n_ctx_train) ??
+ SELF_HOSTED_DEFAULT_CONTEXT_WINDOW,
+ maxTokens: params.maxTokens ?? SELF_HOSTED_DEFAULT_MAX_TOKENS,
+ };
+ if (runtimeContextTokens) {
+ modelConfig.contextTokens = runtimeContextTokens;
+ }
+ return [modelConfig];
+ });
} finally {
await release();
}
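
For reference, a minimal usage sketch of the behavior this commit adds, mirroring the "runtime context cap" test above (assumes a llama.cpp server on 127.0.0.1:8080 whose /v1/models reports n_ctx_train: 262_144 and whose /props reports n_ctx: 65_536):

// Illustrative only; values taken from the test fixtures above.
const models = await discoverOpenAICompatibleLocalModels({
  baseUrl: "http://127.0.0.1:8080/v1",
  label: "llama.cpp",
  env: {},
});
// models[0].contextWindow === 262_144  (trained context, from /v1/models meta)
// models[0].contextTokens === 65_536   (runtime cap, from /props n_ctx)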