Mirror of https://github.com/moltbot/moltbot.git (synced 2026-05-13 23:56:07 +00:00)

Commit 7c7d19ec84 — fix(providers): use llama.cpp runtime context cap
Committed by Peter Steinberger; parent commit ccdaf1875a.
@@ -88,6 +88,7 @@ async function configureSelfHostedTestProvider(params: {
|
||||
describe("discoverOpenAICompatibleLocalModels", () => {
|
||||
it("uses guarded fetch pinned to the configured self-hosted provider", async () => {
|
||||
const release = vi.fn(async () => undefined);
|
||||
const propsRelease = vi.fn(async () => undefined);
|
||||
fetchWithSsrFGuardMock.mockResolvedValueOnce({
|
||||
response: new Response(JSON.stringify({ data: [{ id: "Qwen/Qwen3-32B" }] }), {
|
||||
status: 200,
|
||||
@@ -95,6 +96,11 @@ describe("discoverOpenAICompatibleLocalModels", () => {
|
||||
finalUrl: "http://127.0.0.1:8000/v1/models",
|
||||
release,
|
||||
});
|
||||
fetchWithSsrFGuardMock.mockResolvedValueOnce({
|
||||
response: new Response("{}", { status: 404 }),
|
||||
finalUrl: "http://127.0.0.1:8000/props",
|
||||
release: propsRelease,
|
||||
});
|
||||
|
||||
const models = await discoverOpenAICompatibleLocalModels({
|
||||
baseUrl: "http://127.0.0.1:8000/v1/",
|
||||
@@ -114,15 +120,107 @@ describe("discoverOpenAICompatibleLocalModels", () => {
|
||||
maxTokens: 8192,
|
||||
},
|
||||
]);
|
||||
expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith({
|
||||
url: "http://127.0.0.1:8000/v1/models",
|
||||
init: { headers: { Authorization: "Bearer self-hosted-test-key" } },
|
||||
policy: {
|
||||
hostnameAllowlist: ["127.0.0.1"],
|
||||
allowPrivateNetwork: true,
|
||||
},
|
||||
timeoutMs: 5000,
|
||||
expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
url: "http://127.0.0.1:8000/v1/models",
|
||||
init: { headers: { Authorization: "Bearer self-hosted-test-key" } },
|
||||
policy: {
|
||||
hostnameAllowlist: ["127.0.0.1"],
|
||||
allowPrivateNetwork: true,
|
||||
},
|
||||
timeoutMs: 5000,
|
||||
}),
|
||||
);
|
||||
expect(fetchWithSsrFGuardMock).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
url: "http://127.0.0.1:8000/props",
|
||||
init: { headers: { Authorization: "Bearer self-hosted-test-key" } },
|
||||
policy: {
|
||||
hostnameAllowlist: ["127.0.0.1"],
|
||||
allowPrivateNetwork: true,
|
||||
},
|
||||
timeoutMs: 2500,
|
||||
}),
|
||||
);
|
||||
expect(release).toHaveBeenCalledOnce();
|
||||
expect(propsRelease).toHaveBeenCalledOnce();
|
||||
});
|
||||
|
||||
it("uses llama.cpp /props n_ctx as the runtime context cap", async () => {
|
||||
const modelsRelease = vi.fn(async () => undefined);
|
||||
const propsRelease = vi.fn(async () => undefined);
|
||||
fetchWithSsrFGuardMock.mockResolvedValueOnce({
|
||||
response: new Response(
|
||||
JSON.stringify({
|
||||
data: [
|
||||
{
|
||||
id: "qwen3.6-mxfp4-moe",
|
||||
meta: { n_ctx_train: 262_144 },
|
||||
},
|
||||
],
|
||||
}),
|
||||
{ status: 200 },
|
||||
),
|
||||
finalUrl: "http://127.0.0.1:8080/v1/models",
|
||||
release: modelsRelease,
|
||||
});
|
||||
fetchWithSsrFGuardMock.mockResolvedValueOnce({
|
||||
response: new Response(JSON.stringify({ n_ctx: 65_536 }), { status: 200 }),
|
||||
finalUrl: "http://127.0.0.1:8080/props",
|
||||
release: propsRelease,
|
||||
});
|
||||
|
||||
const models = await discoverOpenAICompatibleLocalModels({
|
||||
baseUrl: "http://127.0.0.1:8080/v1",
|
||||
label: "llama.cpp",
|
||||
env: {},
|
||||
});
|
||||
|
||||
expect(models).toEqual([
|
||||
expect.objectContaining({
|
||||
id: "qwen3.6-mxfp4-moe",
|
||||
contextWindow: 262_144,
|
||||
contextTokens: 65_536,
|
||||
}),
|
||||
]);
|
||||
expect(fetchWithSsrFGuardMock).toHaveBeenNthCalledWith(
|
||||
2,
|
||||
expect.objectContaining({
|
||||
url: "http://127.0.0.1:8080/props",
|
||||
}),
|
||||
);
|
||||
expect(modelsRelease).toHaveBeenCalledOnce();
|
||||
expect(propsRelease).toHaveBeenCalledOnce();
|
||||
});
|
||||
|
||||
it("preserves explicit configured context windows ahead of llama.cpp /props", async () => {
|
||||
const release = vi.fn(async () => undefined);
|
||||
fetchWithSsrFGuardMock.mockResolvedValueOnce({
|
||||
response: new Response(
|
||||
JSON.stringify({
|
||||
data: [{ id: "qwen3.6-mxfp4-moe", meta: { n_ctx_train: 262_144 } }],
|
||||
}),
|
||||
{ status: 200 },
|
||||
),
|
||||
finalUrl: "http://127.0.0.1:8080/v1/models",
|
||||
release,
|
||||
});
|
||||
|
||||
const models = await discoverOpenAICompatibleLocalModels({
|
||||
baseUrl: "http://127.0.0.1:8080/v1",
|
||||
label: "llama.cpp",
|
||||
contextWindow: 65_536,
|
||||
env: {},
|
||||
});
|
||||
|
||||
expect(models).toEqual([
|
||||
expect.objectContaining({
|
||||
id: "qwen3.6-mxfp4-moe",
|
||||
contextWindow: 65_536,
|
||||
}),
|
||||
]);
|
||||
expect(models[0]).not.toHaveProperty("contextTokens");
|
||||
expect(fetchWithSsrFGuardMock).toHaveBeenCalledTimes(1);
|
||||
expect(release).toHaveBeenCalledOnce();
|
||||
});
|
||||
|
||||
|
||||
@@ -35,9 +35,16 @@ const log = createSubsystemLogger("plugins/self-hosted-provider-setup");
|
||||
type OpenAICompatModelsResponse = {
|
||||
data?: Array<{
|
||||
id?: string;
|
||||
meta?: {
|
||||
n_ctx_train?: unknown;
|
||||
};
|
||||
}>;
|
||||
};
|
||||
|
||||
type LlamaCppPropsResponse = {
|
||||
n_ctx?: unknown;
|
||||
};
|
||||
|
||||
function isReasoningModelHeuristic(modelId: string): boolean {
|
||||
return /r1|reasoning|think|reason/i.test(modelId);
|
||||
}
|
||||
@@ -62,6 +69,57 @@ function buildSelfHostedBaseUrlSsrFPolicy(baseUrl: string): SsrFPolicy | undefin
|
||||
}
|
||||
}
|
||||
|
||||
function readPositiveInteger(value: unknown): number | undefined {
|
||||
if (typeof value !== "number" || !Number.isFinite(value) || value <= 0) {
|
||||
return undefined;
|
||||
}
|
||||
return Math.trunc(value);
|
||||
}
|
||||
|
||||
function resolveLlamaCppPropsUrl(baseUrl: string): string {
|
||||
const parsed = new URL(baseUrl);
|
||||
const pathname = parsed.pathname.replace(/\/+$/, "");
|
||||
parsed.pathname = pathname.endsWith("/v1") ? pathname.slice(0, -3) || "/" : pathname;
|
||||
parsed.search = "";
|
||||
parsed.hash = "";
|
||||
const root = parsed.toString().replace(/\/+$/, "");
|
||||
return `${root}/props`;
|
||||
}
|
||||
|
||||
async function discoverLlamaCppRuntimeContextTokens(params: {
|
||||
baseUrl: string;
|
||||
apiKey?: string;
|
||||
}): Promise<number | undefined> {
|
||||
let url: string;
|
||||
try {
|
||||
url = resolveLlamaCppPropsUrl(params.baseUrl);
|
||||
} catch {
|
||||
return undefined;
|
||||
}
|
||||
try {
|
||||
const trimmedApiKey = normalizeOptionalString(params.apiKey);
|
||||
const { response, release } = await fetchWithSsrFGuard({
|
||||
url,
|
||||
init: {
|
||||
headers: trimmedApiKey ? { Authorization: `Bearer ${trimmedApiKey}` } : undefined,
|
||||
},
|
||||
policy: buildSelfHostedBaseUrlSsrFPolicy(params.baseUrl),
|
||||
timeoutMs: 2500,
|
||||
});
|
||||
try {
|
||||
if (!response.ok) {
|
||||
return undefined;
|
||||
}
|
||||
const data = (await response.json()) as LlamaCppPropsResponse;
|
||||
return readPositiveInteger(data.n_ctx);
|
||||
} finally {
|
||||
await release();
|
||||
}
|
||||
} catch {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
export async function discoverOpenAICompatibleLocalModels(params: {
|
||||
baseUrl: string;
|
||||
apiKey?: string;
|
||||
@@ -100,21 +158,36 @@ export async function discoverOpenAICompatibleLocalModels(params: {
|
||||
return [];
|
||||
}
|
||||
|
||||
return models
|
||||
.map((model) => ({ id: normalizeOptionalString(model.id) ?? "" }))
|
||||
.filter((model) => Boolean(model.id))
|
||||
.map((model) => {
|
||||
const modelId = model.id;
|
||||
return {
|
||||
id: modelId,
|
||||
name: modelId,
|
||||
reasoning: isReasoningModelHeuristic(modelId),
|
||||
input: ["text"],
|
||||
cost: SELF_HOSTED_DEFAULT_COST,
|
||||
contextWindow: params.contextWindow ?? SELF_HOSTED_DEFAULT_CONTEXT_WINDOW,
|
||||
maxTokens: params.maxTokens ?? SELF_HOSTED_DEFAULT_MAX_TOKENS,
|
||||
} satisfies ModelDefinitionConfig;
|
||||
});
|
||||
const runtimeContextTokens =
|
||||
params.contextWindow === undefined
|
||||
? await discoverLlamaCppRuntimeContextTokens({
|
||||
baseUrl: trimmedBaseUrl,
|
||||
apiKey: params.apiKey,
|
||||
})
|
||||
: undefined;
|
||||
|
||||
return models.flatMap((model) => {
|
||||
const modelId = normalizeOptionalString(model.id);
|
||||
if (!modelId) {
|
||||
return [];
|
||||
}
|
||||
const modelConfig: ModelDefinitionConfig = {
|
||||
id: modelId,
|
||||
name: modelId,
|
||||
reasoning: isReasoningModelHeuristic(modelId),
|
||||
input: ["text"],
|
||||
cost: SELF_HOSTED_DEFAULT_COST,
|
||||
contextWindow:
|
||||
params.contextWindow ??
|
||||
readPositiveInteger(model.meta?.n_ctx_train) ??
|
||||
SELF_HOSTED_DEFAULT_CONTEXT_WINDOW,
|
||||
maxTokens: params.maxTokens ?? SELF_HOSTED_DEFAULT_MAX_TOKENS,
|
||||
};
|
||||
if (runtimeContextTokens) {
|
||||
modelConfig.contextTokens = runtimeContextTokens;
|
||||
}
|
||||
return [modelConfig];
|
||||
});
|
||||
} finally {
|
||||
await release();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user