mirror of https://github.com/AIPexStudio/AIPex.git
synced 2026-05-13 18:51:35 +00:00
feat: add support for image inputs in chat functionality (#179)
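
In short, agent.chat() now accepts an optional images array of ImageInput entries ({ image, detail? }). A minimal usage sketch, pieced together from the tests in this diff; the import specifier and the model value below are placeholders and not part of this change, while the images option shape is exactly the added ImageInput type:

import { AIPex } from "aipex"; // placeholder specifier; use the package's real entry point

// Placeholder: any model value accepted by AIPex.create (it must support vision input).
declare const visionModel: Parameters<typeof AIPex.create>[0]["model"];

const agent = AIPex.create({
  instructions: "Describe images",
  model: visionModel,
});

// Text plus two images; detail defaults to "auto" when omitted.
for await (const event of agent.chat("What is in this image?", {
  images: [
    { image: "data:image/png;base64,abc123", detail: "high" },
    { image: "https://example.com/img.png" },
  ],
})) {
  console.log(event.type); // e.g. "session_created", then streamed output events
}

When images is omitted, chat() still passes the plain string through unchanged, as the last test below asserts.
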
@@ -327,6 +327,134 @@ describe("AIPex", () => {
       expect(events[0]?.type).toBe("session_created");
       expect(agent.getConversationManager()).toBe(customManager);
     });
+
+    it("should pass images as multimodal AgentInputItem[] to run()", async () => {
+      vi.mocked(run).mockResolvedValue(
+        createMockRunResult({
+          finalOutput: "I see a cat",
+          streamEvents: [
+            {
+              type: "raw_model_stream_event",
+              data: { type: "output_text_delta", delta: "I see a cat" },
+            },
+          ],
+        }),
+      );
+
+      const agent = AIPex.create({
+        instructions: "Describe images",
+        model: mockModel,
+      });
+
+      const events: AgentEvent[] = [];
+      for await (const event of agent.chat("What is in this image?", {
+        images: [{ image: "data:image/png;base64,abc123", detail: "high" }],
+      })) {
+        events.push(event);
+      }
+
+      expect(run).toHaveBeenCalledTimes(1);
+      const runCallArgs = vi.mocked(run).mock.calls[0]!;
+      const input = runCallArgs[1] as Array<{
+        type: string;
+        role: string;
+        content: Array<{
+          type: string;
+          text?: string;
+          image?: string;
+          detail?: string;
+        }>;
+      }>;
+
+      expect(Array.isArray(input)).toBe(true);
+      expect(input).toHaveLength(1);
+      expect(input[0]!.role).toBe("user");
+      expect(input[0]!.content).toHaveLength(2);
+      expect(input[0]!.content[0]).toEqual({
+        type: "input_text",
+        text: "What is in this image?",
+      });
+      expect(input[0]!.content[1]).toEqual({
+        type: "input_image",
+        image: "data:image/png;base64,abc123",
+        detail: "high",
+      });
+    });
+
+    it("should default image detail to 'auto' when not specified", async () => {
+      vi.mocked(run).mockResolvedValue(
+        createMockRunResult({ finalOutput: "OK" }),
+      );
+
+      const agent = AIPex.create({
+        instructions: "Test",
+        model: mockModel,
+      });
+
+      for await (const _ of agent.chat("Describe", {
+        images: [{ image: "https://example.com/img.png" }],
+      })) {
+        // consume
+      }
+
+      const runCallArgs = vi.mocked(run).mock.calls[0]!;
+      const input = runCallArgs[1] as Array<{
+        content: Array<{ type: string; detail?: string }>;
+      }>;
+      const imagePart = input[0]!.content[1]!;
+      expect(imagePart.detail).toBe("auto");
+    });
+
+    it("should support multiple images in a single message", async () => {
+      vi.mocked(run).mockResolvedValue(
+        createMockRunResult({ finalOutput: "Two images" }),
+      );
+
+      const agent = AIPex.create({
+        instructions: "Test",
+        model: mockModel,
+      });
+
+      for await (const _ of agent.chat("Compare these", {
+        images: [
+          { image: "img1_base64" },
+          { image: "img2_base64", detail: "low" },
+        ],
+      })) {
+        // consume
+      }
+
+      const runCallArgs = vi.mocked(run).mock.calls[0]!;
+      const input = runCallArgs[1] as Array<{
+        content: Array<{ type: string; image?: string; detail?: string }>;
+      }>;
+      expect(input[0]!.content).toHaveLength(3);
+      expect(input[0]!.content[0]!.type).toBe("input_text");
+      expect(input[0]!.content[1]!.type).toBe("input_image");
+      expect(input[0]!.content[1]!.image).toBe("img1_base64");
+      expect(input[0]!.content[2]!.type).toBe("input_image");
+      expect(input[0]!.content[2]!.image).toBe("img2_base64");
+      expect(input[0]!.content[2]!.detail).toBe("low");
+    });
+
+    it("should pass plain string to run() when no images provided", async () => {
+      vi.mocked(run).mockResolvedValue(
+        createMockRunResult({ finalOutput: "Reply" }),
+      );
+
+      const agent = AIPex.create({
+        instructions: "Test",
+        model: mockModel,
+      });
+
+      for await (const _ of agent.chat("Hello")) {
+        // consume
+      }
+
+      const runCallArgs = vi.mocked(run).mock.calls[0]!;
+      expect(typeof runCallArgs[1]).toBe("string");
+      expect(runCallArgs[1]).toBe("Hello");
+    });
   });

   describe("chat - continue conversation", () => {
@@ -1,4 +1,5 @@
 import {
+  type AgentInputItem,
   Agent as OpenAIAgent,
   type RunItemStreamEvent,
   run,
@@ -116,7 +117,7 @@ export class AIPex {
   }

   private async *runExecution(
-    input: string,
+    input: string | AgentInputItem[],
     session: Session | null,
   ): AsyncGenerator<AgentEvent> {
     const startTime = Date.now();
@@ -344,7 +345,7 @@ export class AIPex {
     input: string,
     options?: ChatOptions,
   ): AsyncGenerator<AgentEvent> {
-    let finalInput = input;
+    let finalTextInput = input;
     let chatOptions = options;
     let resolvedContexts: Context[] | undefined;

@@ -367,7 +368,7 @@ export class AIPex {
       resolvedContexts = contextObjs;
       // Format contexts and prepend to input
       const contextText = formatContextsForPrompt(contextObjs);
-      finalInput = `${contextText}\n\n${input}`;
+      finalTextInput = `${contextText}\n\n${input}`;

       yield { type: "contexts_attached", contexts: contextObjs };
     }
@@ -382,11 +383,11 @@ export class AIPex {
     }

     const beforeChat = await this.runBeforeChatHooks({
-      input: finalInput,
+      input: finalTextInput,
       options: chatOptions,
       contexts: resolvedContexts,
     });
-    finalInput = beforeChat.input;
+    let finalInput: string | AgentInputItem[] = beforeChat.input;
     if (beforeChat.options) {
       chatOptions = { ...(chatOptions ?? {}), ...beforeChat.options };
     }
@@ -395,6 +396,27 @@
       chatOptions = { ...(chatOptions ?? {}), contexts: beforeChat.contexts };
     }

+    // When images are provided, build a multimodal UserMessageItem
+    const images = chatOptions?.images;
+    if (images && images.length > 0 && typeof finalInput === "string") {
+      const contentParts: Array<
+        | { type: "input_text"; text: string }
+        | { type: "input_image"; image: string; detail?: string }
+      > = [{ type: "input_text", text: finalInput }];
+
+      for (const img of images) {
+        contentParts.push({
+          type: "input_image",
+          image: img.image,
+          detail: img.detail ?? "auto",
+        });
+      }
+
+      finalInput = [
+        { type: "message", role: "user", content: contentParts },
+      ] as AgentInputItem[];
+    }
+
     // If sessionId is provided, continue existing conversation
     if (chatOptions?.sessionId) {
       if (!this.conversationManager) {
@@ -82,6 +82,7 @@ export type {
   ConversationConfig,
   ForkInfo,
   FunctionTool,
+  ImageInput,
   MetricsPayload,
   OpenAIAgent,
   SerializedSession,
@@ -87,6 +87,13 @@ export interface CompressionOptions extends CompressionConfig {
   model: AiSdkModel;
 }

+export interface ImageInput {
+  /** base64-encoded image data, a URL, or a file ID */
+  image: string;
+  /** Vision detail level. Defaults to "auto". */
+  detail?: "auto" | "low" | "high";
+}
+
 export interface ChatOptions {
   sessionId?: string;
   /**
@@ -95,6 +102,12 @@ export interface ChatOptions {
    * Context IDs will be resolved using the ContextManager.
    */
   contexts?: Context[] | string[];
+  /**
+   * Images to include with this message.
+   * When provided, the text input and images are combined into a
+   * multimodal UserMessageItem sent to the model's vision path.
+   */
+  images?: ImageInput[];
 }

 export interface AgentMetrics {
@@ -136,13 +149,13 @@ export type AgentEvent =
 // ============================================================================

 export interface BeforeChatPayload {
-  input: string;
+  input: string | AgentInputItem[];
   options?: ChatOptions;
   contexts?: Context[];
 }

 export interface AfterResponsePayload {
-  input: string;
+  input: string | AgentInputItem[];
   finalOutput: string;
   metrics: AgentMetrics;
   sessionId?: string;
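
One downstream consequence worth noting: BeforeChatPayload.input and AfterResponsePayload.input are widened from string to string | AgentInputItem[], so hook code that treated input as plain text now needs a narrowing step. A hedged sketch of such a guard (the part shapes mirror the user message item built in the diff above; the helper itself is illustrative and not a library API):

// Illustrative helper, not a library API. Shapes mirror the message item this diff builds.
type TextPart = { type: "input_text"; text: string };
type ImagePart = { type: "input_image"; image: string; detail?: "auto" | "low" | "high" };
type UserMessageLike = {
  type: "message";
  role: "user";
  content: Array<TextPart | ImagePart>;
};

// Recover the user-visible text from a possibly multimodal hook input.
function extractText(input: string | UserMessageLike[]): string {
  if (typeof input === "string") return input;
  return input
    .flatMap((item) => item.content)
    .filter((part): part is TextPart => part.type === "input_text")
    .map((part) => part.text)
    .join("\n");
}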