fix(providers): use llama.cpp runtime context cap
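
When no explicit contextWindow is configured, model discovery now probes
llama.cpp's /props endpoint (same SSRF guard and auth header, 2500ms timeout)
and records a positive n_ctx as contextTokens, the runtime context cap.
contextWindow itself falls back to the model's n_ctx_train metadata from
/v1/models before the static default. An explicitly configured contextWindow
skips the probe and leaves contextTokens unset.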

Authored by brokemac79 on 2026-04-29 05:31:34 +01:00
committed by Peter Steinberger
parent ccdaf1875a
commit 7c7d19ec84
2 changed files with 194 additions and 23 deletions


@@ -88,6 +88,7 @@ async function configureSelfHostedTestProvider(params: {
describe("discoverOpenAICompatibleLocalModels", () => {
it("uses guarded fetch pinned to the configured self-hosted provider", async () => {
const release = vi.fn(async () => undefined);
const propsRelease = vi.fn(async () => undefined);
fetchWithSsrFGuardMock.mockResolvedValueOnce({
response: new Response(JSON.stringify({ data: [{ id: "Qwen/Qwen3-32B" }] }), {
status: 200,
@@ -95,6 +96,11 @@ describe("discoverOpenAICompatibleLocalModels", () => {
finalUrl: "http://127.0.0.1:8000/v1/models",
release,
});
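// Discovery now follows up with a /props probe; replying 404 here exercises
// the graceful fallback (no contextTokens is attached to the discovered model).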
fetchWithSsrFGuardMock.mockResolvedValueOnce({
response: new Response("{}", { status: 404 }),
finalUrl: "http://127.0.0.1:8000/props",
release: propsRelease,
});
const models = await discoverOpenAICompatibleLocalModels({
baseUrl: "http://127.0.0.1:8000/v1/",
@@ -114,15 +120,107 @@ describe("discoverOpenAICompatibleLocalModels", () => {
maxTokens: 8192,
},
]);
- expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith({
- url: "http://127.0.0.1:8000/v1/models",
- init: { headers: { Authorization: "Bearer self-hosted-test-key" } },
- policy: {
- hostnameAllowlist: ["127.0.0.1"],
- allowPrivateNetwork: true,
- },
- timeoutMs: 5000,
- });
+ expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith(
+ expect.objectContaining({
+ url: "http://127.0.0.1:8000/v1/models",
+ init: { headers: { Authorization: "Bearer self-hosted-test-key" } },
+ policy: {
+ hostnameAllowlist: ["127.0.0.1"],
+ allowPrivateNetwork: true,
+ },
+ timeoutMs: 5000,
+ }),
+ );
+ expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith(
+ expect.objectContaining({
+ url: "http://127.0.0.1:8000/props",
+ init: { headers: { Authorization: "Bearer self-hosted-test-key" } },
+ policy: {
+ hostnameAllowlist: ["127.0.0.1"],
+ allowPrivateNetwork: true,
+ },
+ timeoutMs: 2500,
+ }),
+ );
expect(release).toHaveBeenCalledOnce();
expect(propsRelease).toHaveBeenCalledOnce();
});
it("uses llama.cpp /props n_ctx as the runtime context cap", async () => {
const modelsRelease = vi.fn(async () => undefined);
const propsRelease = vi.fn(async () => undefined);
fetchWithSsrFGuardMock.mockResolvedValueOnce({
response: new Response(
JSON.stringify({
data: [
{
id: "qwen3.6-mxfp4-moe",
meta: { n_ctx_train: 262_144 },
},
],
}),
{ status: 200 },
),
finalUrl: "http://127.0.0.1:8080/v1/models",
release: modelsRelease,
});
fetchWithSsrFGuardMock.mockResolvedValueOnce({
response: new Response(JSON.stringify({ n_ctx: 65_536 }), { status: 200 }),
finalUrl: "http://127.0.0.1:8080/props",
release: propsRelease,
});
const models = await discoverOpenAICompatibleLocalModels({
baseUrl: "http://127.0.0.1:8080/v1",
label: "llama.cpp",
env: {},
});
expect(models).toEqual([
expect.objectContaining({
id: "qwen3.6-mxfp4-moe",
contextWindow: 262_144,
contextTokens: 65_536,
}),
]);
expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
2,
expect.objectContaining({
url: "http://127.0.0.1:8080/props",
}),
);
expect(modelsRelease).toHaveBeenCalledOnce();
expect(propsRelease).toHaveBeenCalledOnce();
});
it("preserves explicit configured context windows ahead of llama.cpp /props", async () => {
const release = vi.fn(async () => undefined);
fetchWithSsrFGuardMock.mockResolvedValueOnce({
response: new Response(
JSON.stringify({
data: [{ id: "qwen3.6-mxfp4-moe", meta: { n_ctx_train: 262_144 } }],
}),
{ status: 200 },
),
finalUrl: "http://127.0.0.1:8080/v1/models",
release,
});
const models = await discoverOpenAICompatibleLocalModels({
baseUrl: "http://127.0.0.1:8080/v1",
label: "llama.cpp",
contextWindow: 65_536,
env: {},
});
expect(models).toEqual([
expect.objectContaining({
id: "qwen3.6-mxfp4-moe",
contextWindow: 65_536,
}),
]);
expect(models[0]).not.toHaveProperty("contextTokens");
expect(fetchWithSsrFGuardMock).toHaveBeenCalledTimes(1);
expect(release).toHaveBeenCalledOnce();
});


@@ -35,9 +35,16 @@ const log = createSubsystemLogger("plugins/self-hosted-provider-setup");
type OpenAICompatModelsResponse = {
data?: Array<{
id?: string;
meta?: {
n_ctx_train?: unknown;
};
}>;
};
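// Minimal shape of llama.cpp's GET /props payload as consumed here: only the
// runtime context size (n_ctx) is read, and it is validated before use.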
type LlamaCppPropsResponse = {
n_ctx?: unknown;
};
function isReasoningModelHeuristic(modelId: string): boolean {
return /r1|reasoning|think|reason/i.test(modelId);
}
@@ -62,6 +69,57 @@ function buildSelfHostedBaseUrlSsrFPolicy(baseUrl: string): SsrFPolicy | undefin
}
}
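// Accepts only finite, positive numbers and truncates them to an integer;
// anything else (NaN, Infinity, non-numbers, non-positive values) yields undefined.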
function readPositiveInteger(value: unknown): number | undefined {
if (typeof value !== "number" || !Number.isFinite(value) || value <= 0) {
return undefined;
}
return Math.trunc(value);
}
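// Derives the llama.cpp /props URL from an OpenAI-compatible base URL by
// dropping a trailing /v1 segment plus any query or hash. For example
// (illustrative inputs, not from the tests):
//   http://127.0.0.1:8080/v1   -> http://127.0.0.1:8080/props
//   http://host:8080/llama/v1/ -> http://host:8080/llama/props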
function resolveLlamaCppPropsUrl(baseUrl: string): string {
const parsed = new URL(baseUrl);
const pathname = parsed.pathname.replace(/\/+$/, "");
parsed.pathname = pathname.endsWith("/v1") ? pathname.slice(0, -3) || "/" : pathname;
parsed.search = "";
parsed.hash = "";
const root = parsed.toString().replace(/\/+$/, "");
return `${root}/props`;
}
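// Best-effort probe of llama.cpp's /props endpoint for the server's runtime
// context size. Any failure (unparsable base URL, SSRF-guard rejection,
// network error, non-2xx response, or a missing/invalid n_ctx) resolves to
// undefined so discovery falls back to model metadata or the static default.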
async function discoverLlamaCppRuntimeContextTokens(params: {
baseUrl: string;
apiKey?: string;
}): Promise<number | undefined> {
let url: string;
try {
url = resolveLlamaCppPropsUrl(params.baseUrl);
} catch {
return undefined;
}
try {
const trimmedApiKey = normalizeOptionalString(params.apiKey);
const { response, release } = await fetchWithSsrFGuard({
url,
init: {
headers: trimmedApiKey ? { Authorization: `Bearer ${trimmedApiKey}` } : undefined,
},
policy: buildSelfHostedBaseUrlSsrFPolicy(params.baseUrl),
timeoutMs: 2500,
});
try {
if (!response.ok) {
return undefined;
}
const data = (await response.json()) as LlamaCppPropsResponse;
return readPositiveInteger(data.n_ctx);
} finally {
await release();
}
} catch {
return undefined;
}
}
export async function discoverOpenAICompatibleLocalModels(params: {
baseUrl: string;
apiKey?: string;
@@ -100,21 +158,36 @@ export async function discoverOpenAICompatibleLocalModels(params: {
return [];
}
- return models
- .map((model) => ({ id: normalizeOptionalString(model.id) ?? "" }))
- .filter((model) => Boolean(model.id))
- .map((model) => {
- const modelId = model.id;
- return {
- id: modelId,
- name: modelId,
- reasoning: isReasoningModelHeuristic(modelId),
- input: ["text"],
- cost: SELF_HOSTED_DEFAULT_COST,
- contextWindow: params.contextWindow ?? SELF_HOSTED_DEFAULT_CONTEXT_WINDOW,
- maxTokens: params.maxTokens ?? SELF_HOSTED_DEFAULT_MAX_TOKENS,
- } satisfies ModelDefinitionConfig;
- });
+ const runtimeContextTokens =
+ params.contextWindow === undefined
+ ? await discoverLlamaCppRuntimeContextTokens({
+ baseUrl: trimmedBaseUrl,
+ apiKey: params.apiKey,
+ })
+ : undefined;
+ return models.flatMap((model) => {
+ const modelId = normalizeOptionalString(model.id);
+ if (!modelId) {
+ return [];
+ }
+ const modelConfig: ModelDefinitionConfig = {
+ id: modelId,
+ name: modelId,
+ reasoning: isReasoningModelHeuristic(modelId),
+ input: ["text"],
+ cost: SELF_HOSTED_DEFAULT_COST,
+ contextWindow:
+ params.contextWindow ??
+ readPositiveInteger(model.meta?.n_ctx_train) ??
+ SELF_HOSTED_DEFAULT_CONTEXT_WINDOW,
+ maxTokens: params.maxTokens ?? SELF_HOSTED_DEFAULT_MAX_TOKENS,
+ };
+ if (runtimeContextTokens) {
+ modelConfig.contextTokens = runtimeContextTokens;
+ }
+ return [modelConfig];
+ });
} finally {
await release();
}
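
For reference, a minimal usage sketch of the behavior this commit adds, mirroring the "runtime context cap" test above (assumes a llama.cpp server on 127.0.0.1:8080 whose /v1/models reports n_ctx_train: 262_144 and whose /props reports n_ctx: 65_536):

// Illustrative only; values taken from the test fixtures above.
const models = await discoverOpenAICompatibleLocalModels({
  baseUrl: "http://127.0.0.1:8080/v1",
  label: "llama.cpp",
  env: {},
});
// models[0].contextWindow === 262_144  (trained context, from /v1/models meta)
// models[0].contextTokens === 65_536   (runtime cap, from /props n_ctx)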