feat: add support for image inputs in chat functionality (#179)

Ken Qian authored 2026-03-11 18:44:31 +08:00, committed by GitHub
parent a60b864a40
commit d73e034c9d
4 changed files with 171 additions and 7 deletions
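
In practice, the new images option on chat() is used like this (an illustrative sketch based on the tests below; the model and image data are placeholders):

const agent = AIPex.create({
  instructions: "Describe images",
  model: visionModel, // placeholder: any configured model with vision support
});

// The text input and images are merged into one multimodal user message.
for await (const event of agent.chat("What is in this image?", {
  images: [{ image: "data:image/png;base64,...", detail: "high" }],
})) {
  // handle streamed AgentEvents
}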

View File

@@ -327,6 +327,134 @@ describe("AIPex", () => {
expect(events[0]?.type).toBe("session_created");
expect(agent.getConversationManager()).toBe(customManager);
});
it("should pass images as multimodal AgentInputItem[] to run()", async () => {
vi.mocked(run).mockResolvedValue(
createMockRunResult({
finalOutput: "I see a cat",
streamEvents: [
{
type: "raw_model_stream_event",
data: { type: "output_text_delta", delta: "I see a cat" },
},
],
}),
);
const agent = AIPex.create({
instructions: "Describe images",
model: mockModel,
});
const events: AgentEvent[] = [];
for await (const event of agent.chat("What is in this image?", {
images: [{ image: "data:image/png;base64,abc123", detail: "high" }],
})) {
events.push(event);
}
expect(run).toHaveBeenCalledTimes(1);
const runCallArgs = vi.mocked(run).mock.calls[0]!;
const input = runCallArgs[1] as Array<{
type: string;
role: string;
content: Array<{
type: string;
text?: string;
image?: string;
detail?: string;
}>;
}>;
expect(Array.isArray(input)).toBe(true);
expect(input).toHaveLength(1);
expect(input[0]!.role).toBe("user");
expect(input[0]!.content).toHaveLength(2);
expect(input[0]!.content[0]).toEqual({
type: "input_text",
text: "What is in this image?",
});
expect(input[0]!.content[1]).toEqual({
type: "input_image",
image: "data:image/png;base64,abc123",
detail: "high",
});
});
it("should default image detail to 'auto' when not specified", async () => {
vi.mocked(run).mockResolvedValue(
createMockRunResult({ finalOutput: "OK" }),
);
const agent = AIPex.create({
instructions: "Test",
model: mockModel,
});
for await (const _ of agent.chat("Describe", {
images: [{ image: "https://example.com/img.png" }],
})) {
// consume
}
const runCallArgs = vi.mocked(run).mock.calls[0]!;
const input = runCallArgs[1] as Array<{
content: Array<{ type: string; detail?: string }>;
}>;
const imagePart = input[0]!.content[1]!;
expect(imagePart.detail).toBe("auto");
});
it("should support multiple images in a single message", async () => {
vi.mocked(run).mockResolvedValue(
createMockRunResult({ finalOutput: "Two images" }),
);
const agent = AIPex.create({
instructions: "Test",
model: mockModel,
});
for await (const _ of agent.chat("Compare these", {
images: [
{ image: "img1_base64" },
{ image: "img2_base64", detail: "low" },
],
})) {
// consume
}
const runCallArgs = vi.mocked(run).mock.calls[0]!;
const input = runCallArgs[1] as Array<{
content: Array<{ type: string; image?: string; detail?: string }>;
}>;
expect(input[0]!.content).toHaveLength(3);
expect(input[0]!.content[0]!.type).toBe("input_text");
expect(input[0]!.content[1]!.type).toBe("input_image");
expect(input[0]!.content[1]!.image).toBe("img1_base64");
expect(input[0]!.content[2]!.type).toBe("input_image");
expect(input[0]!.content[2]!.image).toBe("img2_base64");
expect(input[0]!.content[2]!.detail).toBe("low");
});
it("should pass plain string to run() when no images provided", async () => {
vi.mocked(run).mockResolvedValue(
createMockRunResult({ finalOutput: "Reply" }),
);
const agent = AIPex.create({
instructions: "Test",
model: mockModel,
});
for await (const _ of agent.chat("Hello")) {
// consume
}
const runCallArgs = vi.mocked(run).mock.calls[0]!;
expect(typeof runCallArgs[1]).toBe("string");
expect(runCallArgs[1]).toBe("Hello");
});
});
describe("chat - continue conversation", () => {

View File

@@ -1,4 +1,5 @@
import {
+ type AgentInputItem,
Agent as OpenAIAgent,
type RunItemStreamEvent,
run,
@@ -116,7 +117,7 @@ export class AIPex {
}
private async *runExecution(
- input: string,
+ input: string | AgentInputItem[],
session: Session | null,
): AsyncGenerator<AgentEvent> {
const startTime = Date.now();
@@ -344,7 +345,7 @@ export class AIPex {
input: string,
options?: ChatOptions,
): AsyncGenerator<AgentEvent> {
- let finalInput = input;
+ let finalTextInput = input;
let chatOptions = options;
let resolvedContexts: Context[] | undefined;
@@ -367,7 +368,7 @@ export class AIPex {
resolvedContexts = contextObjs;
// Format contexts and prepend to input
const contextText = formatContextsForPrompt(contextObjs);
- finalInput = `${contextText}\n\n${input}`;
+ finalTextInput = `${contextText}\n\n${input}`;
yield { type: "contexts_attached", contexts: contextObjs };
}
@@ -382,11 +383,11 @@ export class AIPex {
}
const beforeChat = await this.runBeforeChatHooks({
- input: finalInput,
+ input: finalTextInput,
options: chatOptions,
contexts: resolvedContexts,
});
- finalInput = beforeChat.input;
+ let finalInput: string | AgentInputItem[] = beforeChat.input;
if (beforeChat.options) {
chatOptions = { ...(chatOptions ?? {}), ...beforeChat.options };
}
@@ -395,6 +396,27 @@ export class AIPex {
chatOptions = { ...(chatOptions ?? {}), contexts: beforeChat.contexts };
}
+ // When images are provided, build a multimodal UserMessageItem
+ const images = chatOptions?.images;
+ if (images && images.length > 0 && typeof finalInput === "string") {
+ const contentParts: Array<
+ | { type: "input_text"; text: string }
+ | { type: "input_image"; image: string; detail?: string }
+ > = [{ type: "input_text", text: finalInput }];
+ for (const img of images) {
+ contentParts.push({
+ type: "input_image",
+ image: img.image,
+ detail: img.detail ?? "auto",
+ });
+ }
+ finalInput = [
+ { type: "message", role: "user", content: contentParts },
+ ] as AgentInputItem[];
+ }
// If sessionId is provided, continue existing conversation
if (chatOptions?.sessionId) {
if (!this.conversationManager) {
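
For reference, with a single image the finalInput handed to run() ends up shaped like this (values illustrative, mirroring the tests above):

const finalInput: AgentInputItem[] = [
  {
    type: "message",
    role: "user",
    content: [
      { type: "input_text", text: "What is in this image?" },
      { type: "input_image", image: "data:image/png;base64,...", detail: "high" },
    ],
  },
];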

View File

@@ -82,6 +82,7 @@ export type {
ConversationConfig,
ForkInfo,
FunctionTool,
+ ImageInput,
MetricsPayload,
OpenAIAgent,
SerializedSession,

View File

@@ -87,6 +87,13 @@ export interface CompressionOptions extends CompressionConfig {
model: AiSdkModel;
}
+ export interface ImageInput {
+ /** Base64-encoded image data, a URL, or a file ID */
+ image: string;
+ /** Vision detail level. Defaults to "auto". */
+ detail?: "auto" | "low" | "high";
+ }
export interface ChatOptions {
sessionId?: string;
/**
@@ -95,6 +102,12 @@ export interface ChatOptions {
* Context IDs will be resolved using the ContextManager.
*/
contexts?: Context[] | string[];
+ /**
+ * Images to include with this message.
+ * When provided, the text input and images are combined into a
+ * multimodal UserMessageItem sent to the model's vision path.
+ */
+ images?: ImageInput[];
}
export interface AgentMetrics {
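
Taken together, a caller can attach images alongside the existing chat options (a sketch; the session id is a hypothetical placeholder):

const options: ChatOptions = {
  sessionId: "existing-session-id", // hypothetical id for continuing a conversation
  images: [
    { image: "https://example.com/img.png" }, // detail defaults to "auto"
    { image: "data:image/jpeg;base64,...", detail: "low" },
  ],
};

for await (const event of agent.chat("Compare these", options)) {
  // consume events
}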
@@ -136,13 +149,13 @@ export type AgentEvent =
// ============================================================================
export interface BeforeChatPayload {
- input: string;
+ input: string | AgentInputItem[];
options?: ChatOptions;
contexts?: Context[];
}
export interface AfterResponsePayload {
- input: string;
+ input: string | AgentInputItem[];
finalOutput: string;
metrics: AgentMetrics;
sessionId?: string;