feat: add support for image inputs in chat functionality (#179)

Ken Qian authored 2026-03-11 18:44:31 +08:00, committed by GitHub
parent a60b864a40
commit d73e034c9d
4 changed files with 171 additions and 7 deletions
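
In practice, the new images option on chat() is used like this (an illustrative sketch based on the tests below; the model and image data are placeholders):

const agent = AIPex.create({
  instructions: "Describe images",
  model: visionModel, // placeholder: any configured model with vision support
});

// The text input and images are merged into one multimodal user message.
for await (const event of agent.chat("What is in this image?", {
  images: [{ image: "data:image/png;base64,...", detail: "high" }],
})) {
  // handle streamed AgentEvents
}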

View File

@@ -327,6 +327,134 @@ describe("AIPex", () => {
expect(events[0]?.type).toBe("session_created");
expect(agent.getConversationManager()).toBe(customManager);
});
it("should pass images as multimodal AgentInputItem[] to run()", async () => {
vi.mocked(run).mockResolvedValue(
createMockRunResult({
finalOutput: "I see a cat",
streamEvents: [
{
type: "raw_model_stream_event",
data: { type: "output_text_delta", delta: "I see a cat" },
},
],
}),
);
const agent = AIPex.create({
instructions: "Describe images",
model: mockModel,
});
const events: AgentEvent[] = [];
for await (const event of agent.chat("What is in this image?", {
images: [{ image: "data:image/png;base64,abc123", detail: "high" }],
})) {
events.push(event);
}
expect(run).toHaveBeenCalledTimes(1);
const runCallArgs = vi.mocked(run).mock.calls[0]!;
const input = runCallArgs[1] as Array<{
type: string;
role: string;
content: Array<{
type: string;
text?: string;
image?: string;
detail?: string;
}>;
}>;
expect(Array.isArray(input)).toBe(true);
expect(input).toHaveLength(1);
expect(input[0]!.role).toBe("user");
expect(input[0]!.content).toHaveLength(2);
expect(input[0]!.content[0]).toEqual({
type: "input_text",
text: "What is in this image?",
});
expect(input[0]!.content[1]).toEqual({
type: "input_image",
image: "data:image/png;base64,abc123",
detail: "high",
});
});
it("should default image detail to 'auto' when not specified", async () => {
vi.mocked(run).mockResolvedValue(
createMockRunResult({ finalOutput: "OK" }),
);
const agent = AIPex.create({
instructions: "Test",
model: mockModel,
});
for await (const _ of agent.chat("Describe", {
images: [{ image: "https://example.com/img.png" }],
})) {
// consume
}
const runCallArgs = vi.mocked(run).mock.calls[0]!;
const input = runCallArgs[1] as Array<{
content: Array<{ type: string; detail?: string }>;
}>;
const imagePart = input[0]!.content[1]!;
expect(imagePart.detail).toBe("auto");
});
it("should support multiple images in a single message", async () => {
vi.mocked(run).mockResolvedValue(
createMockRunResult({ finalOutput: "Two images" }),
);
const agent = AIPex.create({
instructions: "Test",
model: mockModel,
});
for await (const _ of agent.chat("Compare these", {
images: [
{ image: "img1_base64" },
{ image: "img2_base64", detail: "low" },
],
})) {
// consume
}
const runCallArgs = vi.mocked(run).mock.calls[0]!;
const input = runCallArgs[1] as Array<{
content: Array<{ type: string; image?: string; detail?: string }>;
}>;
expect(input[0]!.content).toHaveLength(3);
expect(input[0]!.content[0]!.type).toBe("input_text");
expect(input[0]!.content[1]!.type).toBe("input_image");
expect(input[0]!.content[1]!.image).toBe("img1_base64");
expect(input[0]!.content[2]!.type).toBe("input_image");
expect(input[0]!.content[2]!.image).toBe("img2_base64");
expect(input[0]!.content[2]!.detail).toBe("low");
});
it("should pass plain string to run() when no images provided", async () => {
vi.mocked(run).mockResolvedValue(
createMockRunResult({ finalOutput: "Reply" }),
);
const agent = AIPex.create({
instructions: "Test",
model: mockModel,
});
for await (const _ of agent.chat("Hello")) {
// consume
}
const runCallArgs = vi.mocked(run).mock.calls[0]!;
expect(typeof runCallArgs[1]).toBe("string");
expect(runCallArgs[1]).toBe("Hello");
});
});
describe("chat - continue conversation", () => {

View File

@@ -1,4 +1,5 @@
import {
+ type AgentInputItem,
Agent as OpenAIAgent,
type RunItemStreamEvent,
run,
@@ -116,7 +117,7 @@ export class AIPex {
}
private async *runExecution(
- input: string,
+ input: string | AgentInputItem[],
session: Session | null,
): AsyncGenerator<AgentEvent> {
const startTime = Date.now();
@@ -344,7 +345,7 @@ export class AIPex {
input: string,
options?: ChatOptions,
): AsyncGenerator<AgentEvent> {
- let finalInput = input;
+ let finalTextInput = input;
let chatOptions = options;
let resolvedContexts: Context[] | undefined;
@@ -367,7 +368,7 @@ export class AIPex {
resolvedContexts = contextObjs;
// Format contexts and prepend to input
const contextText = formatContextsForPrompt(contextObjs);
- finalInput = `${contextText}\n\n${input}`;
+ finalTextInput = `${contextText}\n\n${input}`;
yield { type: "contexts_attached", contexts: contextObjs };
}
@@ -382,11 +383,11 @@ export class AIPex {
}
const beforeChat = await this.runBeforeChatHooks({
- input: finalInput,
+ input: finalTextInput,
options: chatOptions,
contexts: resolvedContexts,
});
- finalInput = beforeChat.input;
+ let finalInput: string | AgentInputItem[] = beforeChat.input;
if (beforeChat.options) {
chatOptions = { ...(chatOptions ?? {}), ...beforeChat.options };
}
@@ -395,6 +396,27 @@ export class AIPex {
chatOptions = { ...(chatOptions ?? {}), contexts: beforeChat.contexts };
}
+ // When images are provided, build a multimodal UserMessageItem
+ const images = chatOptions?.images;
+ if (images && images.length > 0 && typeof finalInput === "string") {
+ const contentParts: Array<
+ | { type: "input_text"; text: string }
+ | { type: "input_image"; image: string; detail?: string }
+ > = [{ type: "input_text", text: finalInput }];
+ for (const img of images) {
+ contentParts.push({
+ type: "input_image",
+ image: img.image,
+ detail: img.detail ?? "auto",
+ });
+ }
+ finalInput = [
+ { type: "message", role: "user", content: contentParts },
+ ] as AgentInputItem[];
+ }
// If sessionId is provided, continue existing conversation
if (chatOptions?.sessionId) {
if (!this.conversationManager) {
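
For reference, with a single image the finalInput handed to run() ends up shaped like this (values illustrative, mirroring the tests above):

const finalInput: AgentInputItem[] = [
  {
    type: "message",
    role: "user",
    content: [
      { type: "input_text", text: "What is in this image?" },
      { type: "input_image", image: "data:image/png;base64,...", detail: "high" },
    ],
  },
];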

View File

@@ -82,6 +82,7 @@ export type {
ConversationConfig,
ForkInfo,
FunctionTool,
+ ImageInput,
MetricsPayload,
OpenAIAgent,
SerializedSession,

View File

@@ -87,6 +87,13 @@ export interface CompressionOptions extends CompressionConfig {
model: AiSdkModel;
}
+ export interface ImageInput {
+ /** Base64-encoded image data, a URL, or a file ID */
+ image: string;
+ /** Vision detail level. Defaults to "auto". */
+ detail?: "auto" | "low" | "high";
+ }
export interface ChatOptions {
sessionId?: string;
/**
@@ -95,6 +102,12 @@ export interface ChatOptions {
* Context IDs will be resolved using the ContextManager.
*/
contexts?: Context[] | string[];
+ /**
+ * Images to include with this message.
+ * When provided, the text input and images are combined into a
+ * multimodal UserMessageItem sent to the model's vision path.
+ */
+ images?: ImageInput[];
}
export interface AgentMetrics {
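
Taken together, a caller can attach images alongside the existing chat options (a sketch; the session id is a hypothetical placeholder):

const options: ChatOptions = {
  sessionId: "existing-session-id", // hypothetical id for continuing a conversation
  images: [
    { image: "https://example.com/img.png" }, // detail defaults to "auto"
    { image: "data:image/jpeg;base64,...", detail: "low" },
  ],
};

for await (const event of agent.chat("Compare these", options)) {
  // consume events
}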
@@ -136,13 +149,13 @@ export type AgentEvent =
// ============================================================================
export interface BeforeChatPayload {
- input: string;
+ input: string | AgentInputItem[];
options?: ChatOptions;
contexts?: Context[];
}
export interface AfterResponsePayload {
- input: string;
+ input: string | AgentInputItem[];
finalOutput: string;
metrics: AgentMetrics;
sessionId?: string;