Mirror of https://github.com/moltbot/moltbot.git (synced 2026-05-13 15:47:28 +00:00)

fix(onboard): infer custom model image input

@@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai

### Fixes

- Cron/Telegram: preserve explicit `:topic:` delivery targets over stale session-derived thread IDs when isolated cron announces to Telegram forum topics. Carries forward #59069; refs #49704 and #43808. Thanks @roytong9.
- CLI/onboarding: infer image input for common custom-provider vision model IDs, ask only for unknown models, and keep `--custom-image-input`/`--custom-text-input` overrides so vision-capable proxies do not get saved as text-only configs. Fixes #51869. Thanks @Antsoldier1974.
- Memory/Dreaming: retry Dream Diary once with the session default when a configured dreaming model is unavailable, while leaving subagent trust and allowlist errors visible instead of silently masking configuration problems. Refs #67409 and #69209. Thanks @Ghiggins18 and @everySympathy.
- Feishu/inbound files: recover CJK filenames from plain `Content-Disposition: filename=` download headers when Feishu exposes UTF-8 bytes through Latin-1 header decoding, while leaving valid Latin-1 and JSON-derived names unchanged. (#48578, #50435, #59431) Thanks @alex-xuweilong, @lishuaigit, and @DoChaoing.

@@ -61,10 +61,12 @@ openclaw onboard --non-interactive \
  --custom-model-id "foo-large" \
  --custom-api-key "$CUSTOM_API_KEY" \
  --secret-input-mode plaintext \
-  --custom-compatibility openai
+  --custom-compatibility openai \
+  --custom-image-input
```

`--custom-api-key` is optional in non-interactive mode. If omitted, onboarding checks `CUSTOM_API_KEY`.
OpenClaw marks common vision model IDs as image-capable automatically. Pass `--custom-image-input` for unknown custom vision IDs, or `--custom-text-input` to force text-only metadata.
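
For intuition, here is a minimal sketch of the precedence these flags imply, using the `resolveCustomModelImageInputInference` helper this commit adds (the wrapper function and import path are illustrative):

```ts
import { resolveCustomModelImageInputInference } from "./onboard-custom-config.js";

// Hypothetical helper mirroring the CLI behavior: an explicit flag always wins;
// otherwise a confidently classified model ID decides; otherwise default to text-only.
function effectiveImageInput(modelId: string, explicitFlag?: boolean): boolean {
  if (explicitFlag !== undefined) return explicitFlag; // --custom-image-input / --custom-text-input
  const inference = resolveCustomModelImageInputInference(modelId);
  return inference.confidence === "known" ? inference.supportsImageInput : false;
}

effectiveImageInput("gpt-4o");           // true: known vision family
effectiveImageInput("gpt-4o", false);    // false: --custom-text-input override
effectiveImageInput("my-private-model"); // false: unknown ID, non-interactive default
```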

LM Studio also supports a provider-specific key flag in non-interactive mode:

@@ -456,6 +456,7 @@ OpenClaw uses the built-in model catalog. Add custom providers via `models.provi

</Accordion>
<Accordion title="Model catalog entries">

- `models.providers.*.models`: explicit provider model catalog entries (a combined sketch follows this list).
- `models.providers.*.models.*.input`: model input modalities. Use `["text"]` for text-only models and `["text", "image"]` for native image/vision models. Image attachments are only injected into agent turns when the selected model is marked image-capable.
- `models.providers.*.models.*.contextWindow`: native model context window metadata. This overrides provider-level `contextWindow` for that model.
- `models.providers.*.models.*.contextTokens`: optional runtime context cap. This overrides provider-level `contextTokens`; use it when you want a smaller effective context budget than the model's native `contextWindow`. `openclaw models list` shows both values when they differ.
- `models.providers.*.models.*.compat.supportsDeveloperRole`: optional compatibility hint. For `api: "openai-completions"` with a non-empty, non-native `baseUrl` (host not `api.openai.com`), OpenClaw forces this to `false` at runtime. An empty or omitted `baseUrl` keeps default OpenAI behavior.
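
Tying these fields together, a hypothetical catalog entry might look like this (provider and model IDs are illustrative; shown as a TypeScript object for brevity):

```ts
// Hypothetical provider catalog entry; keys follow the fields listed above.
const providers = {
  "my-proxy": {
    models: [
      {
        id: "qwen2.5-vl-72b-instruct",            // illustrative model ID
        input: ["text", "image"],                 // native vision model
        contextWindow: 131072,                    // native context metadata
        contextTokens: 32768,                     // smaller runtime budget than contextWindow
        compat: { supportsDeveloperRole: false }, // compatibility hint
      },
    ],
  },
};
```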

@@ -472,6 +473,8 @@ OpenClaw uses the built-in model catalog. Add custom providers via `models.provi

</Accordion>
</AccordionGroup>

Interactive custom-provider onboarding infers image input for common vision model IDs such as GPT-4o, Claude, Gemini, Qwen-VL, LLaVA, Pixtral, InternVL, Mllama, MiniCPM-V, and GLM-4V, and skips the extra question for known text-only families. Unknown model IDs still prompt for image support. Non-interactive onboarding uses the same inference; pass `--custom-image-input` to force image-capable metadata or `--custom-text-input` to force text-only metadata.
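
The behavior pinned down by the tests in this commit, as a quick sketch:

```ts
import { resolveCustomModelImageInputInference } from "./onboard-custom-config.js";

resolveCustomModelImageInputInference("qwen2.5-vl");
// => { supportsImageInput: true, confidence: "known" }    vision family, no prompt
resolveCustomModelImageInputInference("llama3");
// => { supportsImageInput: false, confidence: "known" }   text family, no prompt
resolveCustomModelImageInputInference("my-private-model");
// => { supportsImageInput: false, confidence: "unknown" } interactive mode asks
```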

### Provider examples

<AccordionGroup>

@@ -168,6 +168,13 @@ catalog id and model ref:

- `models.providers.mlx.models[].id: "mlx-community/Qwen3-30B-A3B-6bit"`
- `agents.defaults.model.primary: "mlx/mlx-community/Qwen3-30B-A3B-6bit"`

Set `input: ["text", "image"]` on local or proxied vision models so image attachments are injected into agent turns. Interactive custom-provider onboarding infers common vision model IDs and asks only for unknown names. Non-interactive onboarding uses the same inference; use `--custom-image-input` for unknown vision IDs or `--custom-text-input` when a known-looking model is text-only behind your endpoint.
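
For example, a local vision model entry might look like the following sketch (the model ID is illustrative; shown as a TypeScript object):

```ts
// Hypothetical MLX vision model entry; merge mode keeps hosted fallbacks available.
const mlx = {
  models: [
    {
      id: "mlx-community/Qwen2.5-VL-7B-Instruct-4bit", // illustrative vision model ID
      input: ["text", "image"],                        // enables image attachment injection
    },
  ],
};
```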

Keep `models.mode: "merge"` so hosted models stay available as fallbacks. Use `models.providers.<id>.timeoutSeconds` for slow local or remote model servers before raising `agents.defaults.timeoutSeconds`. The provider timeout

@@ -166,11 +166,13 @@ openclaw onboard --non-interactive \
  --custom-api-key "$CUSTOM_API_KEY" \
  --custom-provider-id "my-custom" \
  --custom-compatibility anthropic \
  --custom-image-input \
  --gateway-port 18789 \
  --gateway-bind loopback
```

`--custom-api-key` is optional. If omitted, onboarding checks `CUSTOM_API_KEY`.
OpenClaw marks common vision model IDs as image-capable automatically. Add `--custom-image-input` for unknown custom vision IDs, or `--custom-text-input` to force text-only metadata.

Ref-mode variant:

@@ -184,6 +186,7 @@ openclaw onboard --non-interactive \
  --secret-input-mode ref \
  --custom-provider-id "my-custom" \
  --custom-compatibility anthropic \
  --custom-image-input \
  --gateway-port 18789 \
  --gateway-bind loopback
```

@@ -202,6 +202,7 @@ What you set:

- `--custom-api-key` (optional; falls back to `CUSTOM_API_KEY`)
- `--custom-provider-id` (optional)
- `--custom-compatibility <openai|anthropic>` (optional; default `openai`)
- `--custom-image-input` / `--custom-text-input` (optional; override inferred model input capability)

</Accordion>
<Accordion title="Skip">

@@ -212,6 +213,7 @@ What you set:

Model behavior:

- Pick the default model from detected options, or enter provider and model manually.
- Custom-provider onboarding infers image support for common model IDs and asks only when the model name is unknown.
- When onboarding starts from a provider auth choice, the model picker prefers that provider automatically. For Volcengine and BytePlus, the same preference also matches their coding-plan variants (`volcengine-plan/*`,

@@ -144,6 +144,8 @@ export function registerOnboardCommand(program: Command) {
      "--custom-compatibility <mode>",
      "Custom provider API compatibility: openai|anthropic (default: openai)",
    )
    .option("--custom-image-input", "Mark the custom provider model as image-capable")
    .option("--custom-text-input", "Mark the custom provider model as text-only")
    .option("--gateway-port <port>", "Gateway port")
    .option("--gateway-bind <mode>", "Gateway bind: loopback|tailnet|lan|auto|custom")
    .option("--gateway-auth <mode>", "Gateway auth: token|password")

@@ -214,6 +216,12 @@ export function registerOnboardCommand(program: Command) {
      customModelId: opts.customModelId as string | undefined,
      customProviderId: opts.customProviderId as string | undefined,
      customCompatibility: opts.customCompatibility as "openai" | "anthropic" | undefined,
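      // --custom-text-input wins over --custom-image-input; with neither flag,
      // undefined lets model-ID inference decide downstream.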
      customImageInput:
        opts.customTextInput === true
          ? false
          : opts.customImageInput === true
            ? true
            : undefined,
      gatewayPort:
        typeof gatewayPort === "number" && Number.isFinite(gatewayPort)
          ? gatewayPort

@@ -5,7 +5,9 @@ import {
  applyCustomApiConfig,
  buildAnthropicVerificationProbeRequest,
  buildOpenAiVerificationProbeRequest,
  inferCustomModelSupportsImageInput,
  parseNonInteractiveCustomApiFlags,
  resolveCustomModelImageInputInference,
} from "./onboard-custom-config.js";

function buildCustomProviderConfig(contextWindow?: number) {

@@ -311,6 +313,60 @@ describe("applyCustomApiConfig", () => {
    ).toBeUndefined();
  });

  it("adds image input for new non-azure custom models when requested", () => {
    const result = applyCustomApiConfig({
      config: {},
      baseUrl: "https://llm.example.com/v1",
      modelId: "gpt-4o",
      compatibility: "openai",
      providerId: "custom",
      supportsImageInput: true,
    });

    expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text", "image"]);
  });

  it("infers image input for known non-azure custom vision models", () => {
    const result = applyCustomApiConfig({
      config: {},
      baseUrl: "https://llm.example.com/v1",
      modelId: "gpt-4o",
      compatibility: "openai",
      providerId: "custom",
    });

    expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text", "image"]);
  });

  it("lets explicit text input override known non-azure custom vision inference", () => {
    const result = applyCustomApiConfig({
      config: {},
      baseUrl: "https://llm.example.com/v1",
      modelId: "gpt-4o",
      compatibility: "openai",
      providerId: "custom",
      supportsImageInput: false,
    });

    expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text"]);
  });

  it("updates existing non-azure custom model input when image support is explicitly requested", () => {
    const result = applyCustomApiConfig({
      config: buildCustomProviderConfig(CONTEXT_WINDOW_HARD_MIN_TOKENS),
      baseUrl: "https://llm.example.com/v1",
      modelId: "foo-large",
      compatibility: "openai",
      providerId: "custom",
      supportsImageInput: true,
    });
    const model = result.config.models?.providers?.custom?.models?.find(
      (entry) => entry.id === "foo-large",
    );

    expect(model?.input).toEqual(["text", "image"]);
  });

  it("re-onboard preserves user-customized fields for non-azure models", () => {
    const result = applyCustomApiConfig({
      config: {

@@ -391,6 +447,16 @@ describe("parseNonInteractiveCustomApiFlags", () => {
    });
  });

  it("parses custom image input opt-in", () => {
    const result = parseNonInteractiveCustomApiFlags({
      baseUrl: "https://llm.example.com/v1",
      modelId: "foo-large",
      supportsImageInput: true,
    });

    expect(result.supportsImageInput).toBe(true);
  });

  it.each([
    {
      name: "missing required flags",

@@ -419,3 +485,30 @@ describe("parseNonInteractiveCustomApiFlags", () => {
    expect(() => parseNonInteractiveCustomApiFlags(flags)).toThrow(expectedMessage);
  });
});

describe("inferCustomModelSupportsImageInput", () => {
  it.each(["gpt-4o", "claude-sonnet-4-6", "gemini-3-flash", "qwen2.5-vl", "llava"])(
    "detects likely vision model %s",
    (modelId) => {
      expect(inferCustomModelSupportsImageInput(modelId)).toBe(true);
    },
  );

  it.each(["llama3", "deepseek-v3", "evolvable-text-model"])(
    "does not over-match text model %s",
    (modelId) => {
      expect(inferCustomModelSupportsImageInput(modelId)).toBe(false);
    },
  );

  it("reports confidence for known text and unknown custom models", () => {
    expect(resolveCustomModelImageInputInference("llama3")).toEqual({
      supportsImageInput: false,
      confidence: "known",
    });
    expect(resolveCustomModelImageInputInference("my-private-model")).toEqual({
      supportsImageInput: false,
      confidence: "unknown",
    });
  });
});

@@ -18,12 +18,72 @@ const DEFAULT_MAX_TOKENS = 4096;
// Azure OpenAI uses the Responses API which supports larger defaults
const AZURE_DEFAULT_CONTEXT_WINDOW = 400_000;
const AZURE_DEFAULT_MAX_TOKENS = 16_384;
type CustomModelInput = "text" | "image";
export type CustomModelImageInputInference = {
  supportsImageInput: boolean;
  confidence: "known" | "unknown";
};

function normalizeContextWindowForCustomModel(value: unknown): number {
  const parsed = typeof value === "number" && Number.isFinite(value) ? Math.floor(value) : 0;
  return parsed >= CONTEXT_WINDOW_HARD_MIN_TOKENS ? parsed : CONTEXT_WINDOW_HARD_MIN_TOKENS;
}

function customModelInputs(supportsImageInput: boolean): CustomModelInput[] {
  return supportsImageInput ? ["text", "image"] : ["text"];
}

export function resolveCustomModelImageInputInference(
  modelId: string,
): CustomModelImageInputInference {
  const normalized = normalizeLowercaseStringOrEmpty(modelId);
  if (!normalized) {
    return { supportsImageInput: false, confidence: "unknown" };
  }
  const matchesKnownVision =
    /\b(?:gpt-4o|gpt-4\.1|gpt-[5-9]|o[134])\b/.test(normalized) ||
    /\bclaude-(?:3|4|sonnet|opus|haiku)\b/.test(normalized) ||
    /\bgemini\b/.test(normalized) ||
    /\b(?:qwen[\w.-]*-?vl|qwen-vl)\b/.test(normalized) ||
    /\b(?:vision|llava|pixtral|internvl|mllama|minicpm-v|glm-4v)\b/.test(normalized) ||
    /(?:^|[-_/])vl(?:[-_/]|$)/.test(normalized);
  if (matchesKnownVision) {
    return { supportsImageInput: true, confidence: "known" };
  }

  const matchesKnownText =
    /\b(?:llama\d*|deepseek|mistral|mixtral|kimi|moonshot|codestral|devstral|phi|qwq|codellama)\b/.test(
      normalized,
    ) || /\bqwen(?!.*(?:vl|vision))/.test(normalized);
  if (matchesKnownText) {
    return { supportsImageInput: false, confidence: "known" };
  }

  return { supportsImageInput: false, confidence: "unknown" };
}

export function inferCustomModelSupportsImageInput(modelId: string): boolean {
  return resolveCustomModelImageInputInference(modelId).supportsImageInput;
}
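
// Precedence: explicit flag > known model-ID inference > caller-provided fallback.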
function resolveCustomModelSupportsImageInput(params: {
  modelId: string;
  explicit?: boolean;
  fallback: boolean;
  inferKnownModels: boolean;
}): boolean {
  return (
    params.explicit ??
    ((): boolean => {
      if (!params.inferKnownModels) {
        return params.fallback;
      }
      const inference = resolveCustomModelImageInputInference(params.modelId);
      return inference.confidence === "known" ? inference.supportsImageInput : params.fallback;
    })()
  );
}

function isAzureFoundryUrl(baseUrl: string): boolean {
  try {
    const url = new URL(baseUrl);

@@ -112,6 +172,7 @@ export type ApplyCustomApiConfigParams = {
  apiKey?: SecretInput;
  providerId?: string;
  alias?: string;
  supportsImageInput?: boolean;
};

export type ParseNonInteractiveCustomApiFlagsParams = {

@@ -120,6 +181,7 @@ export type ParseNonInteractiveCustomApiFlagsParams = {
  compatibility?: string;
  apiKey?: string;
  providerId?: string;
  supportsImageInput?: boolean;
};

export type ParsedNonInteractiveCustomApiFlags = {

@@ -128,6 +190,7 @@ export type ParsedNonInteractiveCustomApiFlags = {
  compatibility: CustomApiCompatibility;
  apiKey?: string;
  providerId?: string;
  supportsImageInput?: boolean;
};

export type CustomApiErrorCode =

@@ -439,6 +502,9 @@ export function parseNonInteractiveCustomApiFlags(
    compatibility: parseCustomApiCompatibility(params.compatibility),
    ...(apiKey ? { apiKey } : {}),
    ...(providerId ? { providerId } : {}),
    ...(params.supportsImageInput === undefined
      ? {}
      : { supportsImageInput: params.supportsImageInput }),
  };
}

@@ -487,15 +553,25 @@ export function applyCustomApiConfig(params: ApplyCustomApiConfigParams): Custom
  const existingModels = Array.isArray(existingProvider?.models) ? existingProvider.models : [];
  const hasModel = existingModels.some((model) => model.id === modelId);
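  // Azure-only heuristic: o1/o3/o4 or gpt-5-and-later IDs default to reasoning models.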
  const isLikelyReasoningModel = isAzure && /\b(o[134]|gpt-([5-9]|\d{2,}))\b/i.test(modelId);
  const explicitInput =
    params.supportsImageInput === undefined
      ? undefined
      : customModelInputs(params.supportsImageInput);
  const generatedInput = customModelInputs(
    resolveCustomModelSupportsImageInput({
      modelId,
      explicit: params.supportsImageInput,
      fallback: isAzure && isLikelyReasoningModel,
      inferKnownModels: !isAzure,
    }),
  );
  const nextModel = isAzure
    ? {
        id: modelId,
        name: `${modelId} (Custom Provider)`,
        contextWindow: AZURE_DEFAULT_CONTEXT_WINDOW,
        maxTokens: AZURE_DEFAULT_MAX_TOKENS,
-        input: isLikelyReasoningModel
-          ? (["text", "image"] as Array<"text" | "image">)
-          : (["text"] as ["text"]),
+        input: generatedInput,
        cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
        reasoning: isLikelyReasoningModel,
        compat: { supportsStore: false },

@@ -505,7 +581,7 @@ export function applyCustomApiConfig(params: ApplyCustomApiConfigParams): Custom
        name: `${modelId} (Custom Provider)`,
        contextWindow: DEFAULT_CONTEXT_WINDOW,
        maxTokens: DEFAULT_MAX_TOKENS,
-        input: ["text"] as ["text"],
+        input: generatedInput,
        cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
        reasoning: false,
      };

@@ -515,6 +591,7 @@ export function applyCustomApiConfig(params: ApplyCustomApiConfigParams): Custom
      ? {
          ...model,
          ...(isAzure ? nextModel : {}),
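          // Explicit --custom-image-input/--custom-text-input overrides stored input on re-onboard.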
          ...(explicitInput ? { input: explicitInput } : {}),
          name: model.name ?? nextModel.name,
          cost: model.cost ?? nextModel.cost,
          contextWindow: normalizeContextWindowForCustomModel(model.contextWindow),

@@ -19,7 +19,7 @@ vi.mock("../plugins/provider-auth-input.js", () => ({
  ),
}));

-function createTestPrompter(params: { text: string[]; select?: string[] }): {
+function createTestPrompter(params: { text: string[]; select?: string[]; confirm?: boolean[] }): {
  text: ReturnType<typeof vi.fn>;
  select: ReturnType<typeof vi.fn>;
  confirm: ReturnType<typeof vi.fn>;

@@ -34,6 +34,10 @@ function createTestPrompter(params: { text: string[]; select?: string[] }): {
  for (const answer of params.select ?? []) {
    select.mockResolvedValueOnce(answer);
  }
  const confirm = vi.fn(async () => false);
  for (const answer of params.confirm ?? []) {
    confirm.mockResolvedValueOnce(answer);
  }
  return {
    text,
    progress: vi.fn(() => ({

@@ -41,7 +45,7 @@ function createTestPrompter(params: { text: string[]; select?: string[] }): {
      stop: vi.fn(),
    })),
    select,
-    confirm: vi.fn(),
+    confirm,
    note: vi.fn(),
  };
}

@@ -100,6 +104,38 @@ describe("promptCustomApiConfig", () => {

    expectOpenAiCompatResult({ prompter, textCalls: 5, selectCalls: 2, result });
    expect(result.config.agents?.defaults?.models?.["custom/llama3"]?.alias).toBe("local");
    expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text"]);
    expect(prompter.confirm).not.toHaveBeenCalled();
  });

  it("skips the image-input prompt for known custom vision models", async () => {
    const prompter = createTestPrompter({
      text: ["https://proxy.example.com/v1", "test-key", "gpt-4o", "custom", ""],
      select: ["plaintext", "openai"],
    });
    stubFetchSequence([{ ok: true }]);

    const result = await runPromptCustomApi(prompter);

    expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text", "image"]);
    expect(prompter.confirm).not.toHaveBeenCalled();
  });

  it("prompts for custom model image support when the model is unknown", async () => {
    const prompter = createTestPrompter({
      text: ["https://proxy.example.com/v1", "test-key", "private-model", "custom", ""],
      select: ["plaintext", "openai"],
      confirm: [true],
    });
    stubFetchSequence([{ ok: true }]);

    const result = await runPromptCustomApi(prompter);

    expect(result.config.models?.providers?.custom?.models?.[0]?.input).toEqual(["text", "image"]);
    expect(prompter.confirm).toHaveBeenCalledWith({
      message: "Does this model support image input?",
      initialValue: false,
    });
  });

  it("defaults custom setup to the native Ollama base URL", async () => {

@@ -15,6 +15,7 @@ import {
  normalizeEndpointId,
  normalizeOptionalProviderApiKey,
  resolveCustomModelAliasError,
  resolveCustomModelImageInputInference,
  resolveCustomProviderId,
  type CustomApiCompatibility,
  type CustomApiResult,

@@ -24,11 +25,14 @@ export {
  buildAnthropicVerificationProbeRequest,
  buildOpenAiVerificationProbeRequest,
  CustomApiError,
  inferCustomModelSupportsImageInput,
  parseNonInteractiveCustomApiFlags,
  resolveCustomModelImageInputInference,
  resolveCustomProviderId,
  type ApplyCustomApiConfigParams,
  type CustomApiCompatibility,
  type CustomApiErrorCode,
  type CustomModelImageInputInference,
  type CustomApiResult,
  type ParseNonInteractiveCustomApiFlagsParams,
  type ParsedNonInteractiveCustomApiFlags,

@@ -341,6 +345,14 @@ export async function promptCustomApiConfig(params: {
      return resolveCustomModelAliasError({ raw: value, cfg: config, modelRef });
    },
  });
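  // Ask only when inference is unsure; known model IDs skip the prompt, and the
  // confirm is seeded with the inferred guess.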
  const imageInputInference = resolveCustomModelImageInputInference(modelId);
  const supportsImageInput =
    imageInputInference.confidence === "known"
      ? imageInputInference.supportsImageInput
      : await prompter.confirm({
          message: "Does this model support image input?",
          initialValue: imageInputInference.supportsImageInput,
        });
  const resolvedCompatibility = compatibility ?? "openai";
  const result = applyCustomApiConfig({

@@ -350,6 +362,7 @@ export async function promptCustomApiConfig(params: {
    apiKey,
    providerId: providerIdInput,
    alias: aliasInput,
    supportsImageInput,
  });

  if (result.providerIdRenamedFrom && result.providerId) {

@@ -136,4 +136,71 @@ describe("applyNonInteractiveAuthChoice", () => {
      }),
    );
  });

  it("marks non-interactive custom provider models as image-capable when requested", async () => {
    const runtime = createRuntime();
    const nextConfig = { agents: { defaults: {} } } as OpenClawConfig;
    resolveNonInteractiveApiKey.mockResolvedValueOnce(undefined);

    const result = await applyNonInteractiveAuthChoice({
      nextConfig,
      authChoice: "custom-api-key",
      opts: {
        customBaseUrl: "https://models.custom.local/v1",
        customModelId: "gpt-4o",
        customImageInput: true,
      } as never,
      runtime: runtime as never,
      baseConfig: nextConfig,
    });

    expect(result?.models?.providers?.["custom-models-custom-local"]?.models?.[0]?.input).toEqual([
      "text",
      "image",
    ]);
  });

  it("infers image-capable non-interactive custom provider models by known model id", async () => {
    const runtime = createRuntime();
    const nextConfig = { agents: { defaults: {} } } as OpenClawConfig;
    resolveNonInteractiveApiKey.mockResolvedValueOnce(undefined);

    const result = await applyNonInteractiveAuthChoice({
      nextConfig,
      authChoice: "custom-api-key",
      opts: {
        customBaseUrl: "https://models.custom.local/v1",
        customModelId: "gpt-4o",
      } as never,
      runtime: runtime as never,
      baseConfig: nextConfig,
    });

    expect(result?.models?.providers?.["custom-models-custom-local"]?.models?.[0]?.input).toEqual([
      "text",
      "image",
    ]);
  });

  it("honors explicit text-only override for known custom vision models", async () => {
    const runtime = createRuntime();
    const nextConfig = { agents: { defaults: {} } } as OpenClawConfig;
    resolveNonInteractiveApiKey.mockResolvedValueOnce(undefined);

    const result = await applyNonInteractiveAuthChoice({
      nextConfig,
      authChoice: "custom-api-key",
      opts: {
        customBaseUrl: "https://models.custom.local/v1",
        customModelId: "gpt-4o",
        customImageInput: false,
      } as never,
      runtime: runtime as never,
      baseConfig: nextConfig,
    });

    expect(result?.models?.providers?.["custom-models-custom-local"]?.models?.[0]?.input).toEqual([
      "text",
    ]);
  });
});

@@ -177,6 +177,7 @@ export async function applyNonInteractiveAuthChoice(params: {
    compatibility: opts.customCompatibility,
    apiKey: opts.customApiKey,
    providerId: opts.customProviderId,
    supportsImageInput: opts.customImageInput,
  });
  const resolvedProviderId = resolveCustomProviderId({
    config: nextConfig,

@@ -213,6 +214,7 @@ export async function applyNonInteractiveAuthChoice(params: {
    compatibility: customAuth.compatibility,
    apiKey: customApiKeyInput,
    providerId: customAuth.providerId,
    supportsImageInput: customAuth.supportsImageInput,
  });
  if (result.providerIdRenamedFrom && result.providerId) {
    runtime.log(

@@ -62,6 +62,7 @@ export type OnboardOptions = OnboardDynamicProviderOptions & {
  customModelId?: string;
  customProviderId?: string;
  customCompatibility?: "openai" | "anthropic";
  customImageInput?: boolean;
  gatewayPort?: number;
  gatewayBind?: GatewayBind;
  gatewayAuth?: GatewayAuthChoice;