feat: enhance screenshot handling and display in chatbot components

- Added functionality to extract and apply screenshots from tool results in the ChatAdapter, improving user experience by allowing immediate rendering of screenshots.
- Implemented a new ToolScreenshot component to display screenshots inline, supporting both base64 data and IndexedDB references.
- Updated message item rendering to transform screenshot placeholders into actual images, enhancing the visual feedback for users.
- Introduced collapsible message items for intermediate assistant messages, improving the organization of conversation turns in the message list.
- Enhanced model fetching logic in the Chatbot component to ensure server models are prioritized, improving model selection reliability.
- Updated localization files to include new translation keys for improved user guidance.
2026-05-13 18:51:35 +00:00 · 2026-02-15 13:44:03 +08:00
parent c5b6371223
commit 343f6fa146
29 changed files with 2782 additions and 110 deletions
--- a/packages/aipex-react/src/adapters/chat-adapter.ts
+++ b/packages/aipex-react/src/adapters/chat-adapter.ts
@@ -1,5 +1,11 @@
 import type { AgentEvent } from "@aipexstudio/aipex-core";
 import { generateId } from "@aipexstudio/aipex-core";
+import { ScreenshotStorage } from "../lib/screenshot-storage";
+import {
+  extractScreenshotFromToolResult,
+  isCaptureScreenshotTool,
+  type ScreenshotExtraction,
+} from "../lib/screenshot-utils";
 import type {
  ChatAdapterOptions,
  ChatAdapterState,
@@ -412,6 +418,18 @@ export class ChatAdapter {
      return;
    }

+    // Extract screenshot data from screenshot tools
+    if (isCaptureScreenshotTool(toolName)) {
+      const screenshotInfo = extractScreenshotFromToolResult(
+        toolName,
+        result,
+      );
+      if (screenshotInfo) {
+        this.applyScreenshotToolResult(callId, result, screenshotInfo);
+        return;
+      }
+    }
+
    this.updateToolPart(callId, (toolPart) => ({
      ...toolPart,
      state: "completed",
@@ -419,6 +437,59 @@ export class ChatAdapter {
    }));
  }

+  /**
+   * Handle a completed screenshot tool result.
+   *
+   * Uses the tool-provided screenshotUid (the tool already saved to IndexedDB)
+   * rather than generating a new one. Falls back to UI-side storage only if
+   * screenshotUid is missing (e.g., IndexedDB save failed in the tool).
+   *
+   * Three cases, checked in order:
+   * 1. uid present        – complete the part with the uid (plus inline data if any)
+   * 2. only inline data   – complete with inline data, then persist it and
+   *                         attach the generated uid once the save resolves
+   * 3. neither            – complete the part without any screenshot fields
+   *
+   * @param callId - id of the tool call whose UI part is updated
+   * @param result - raw tool result, stored unchanged as the part's `output`
+   * @param info - screenshot fields previously extracted from `result`
+   */
+  private applyScreenshotToolResult(
+    callId: string,
+    result: unknown,
+    info: ScreenshotExtraction,
+  ): void {
+    // Destructure once so truthiness checks narrow the values inside the
+    // updater closures without non-null assertions.
+    const { screenshotUid, imageData } = info;
+
+    if (screenshotUid) {
+      // Tool already saved to IndexedDB — use its uid directly
+      this.updateToolPart(callId, (toolPart) => ({
+        ...toolPart,
+        state: "completed",
+        output: result,
+        screenshotUid,
+        // Keep inline screenshot for immediate rendering if base64 is present
+        ...(imageData ? { screenshot: imageData } : {}),
+      }));
+      return;
+    }
+
+    if (imageData) {
+      // Fallback: tool didn't provide a uid (storage failure) — save in UI
+      this.updateToolPart(callId, (toolPart) => ({
+        ...toolPart,
+        state: "completed",
+        output: result,
+        screenshot: imageData,
+      }));
+      ScreenshotStorage.saveScreenshot(imageData)
+        .then((uid) => {
+          // Attach the uid late; the part is already completed and visible
+          this.updateToolPart(callId, (toolPart) => ({
+            ...toolPart,
+            screenshotUid: uid,
+          }));
+        })
+        .catch(() => {
+          // Storage failed — screenshot still visible via inline data
+        });
+      return;
+    }
+
+    // Neither a uid nor image data was extracted — just complete the part.
+    // (A uid without image data is handled by the first branch above.)
+    this.updateToolPart(callId, (toolPart) => ({
+      ...toolPart,
+      state: "completed",
+      output: result,
+    }));
+  }
+
  /**
   * Check if a tool result indicates a business-level failure.
   * Many tools return { success: false, error: "..." } instead of throwing.
--- a/packages/aipex-react/src/components/ai-elements/tool.tsx
+++ b/packages/aipex-react/src/components/ai-elements/tool.tsx
@@ -9,7 +9,8 @@ import {
  WrenchIcon,
  XCircleIcon,
 } from "lucide-react";
-import type { ComponentProps, ReactNode } from "react";
+import { type ComponentProps, type ReactNode, useEffect, useState } from "react";
+import { ScreenshotStorage } from "../../lib/screenshot-storage";
 import { cn } from "../../lib/utils";
 import { Badge } from "../ui/badge";
 import {
@@ -29,7 +30,8 @@ export const Tool = ({ className, ...props }: ToolProps) => (
 );

 export type ToolHeaderProps = {
-  type: ToolUIPart["type"];
+  /** Display label for the tool – either a raw `tool-${name}` key or a translated name */
+  type: string;
  state: ToolUIPart["state"] | "executing";
  className?: string;
 };
@@ -154,3 +156,78 @@ export const ToolOutput = ({
    </div>
  );
 };
+
+// ============ Screenshot Display ============
+
+export type ToolScreenshotProps = ComponentProps<"div"> & {
+  /** Inline base64 screenshot data URL */
+  screenshot?: string;
+  /** UID referencing a screenshot stored in ScreenshotStorage (IndexedDB) */
+  screenshotUid?: string;
+};
+
+/**
+ * ToolScreenshot – renders a screenshot captured by a tool.
+ * Supports both inline base64 data and IndexedDB uid references.
+ *
+ * An inline `screenshot` always takes precedence; `screenshotUid` is only
+ * resolved through `ScreenshotStorage.getScreenshot` when no inline data is
+ * given. Renders nothing when neither prop is provided.
+ */
+export const ToolScreenshot = ({
+  className,
+  screenshot,
+  screenshotUid,
+  ...props
+}: ToolScreenshotProps) => {
+  const [imageData, setImageData] = useState<string | null>(
+    screenshot ?? null,
+  );
+  const [loading, setLoading] = useState(false);
+  const [error, setError] = useState<string | null>(null);
+
+  useEffect(() => {
+    // Prefer inline screenshot. Also clear any loading/error state left over
+    // from an earlier uid lookup, otherwise a stale error would be rendered
+    // instead of the image.
+    if (screenshot) {
+      setImageData(screenshot);
+      setError(null);
+      setLoading(false);
+      return;
+    }
+
+    // Nothing to show — reset so no stale image/error survives a prop change
+    if (!screenshotUid) {
+      setImageData(null);
+      setError(null);
+      setLoading(false);
+      return;
+    }
+
+    // Load from IndexedDB by uid. `cancelled` guards against the lookup
+    // resolving after unmount or after the props changed again.
+    let cancelled = false;
+    setLoading(true);
+    setError(null);
+    ScreenshotStorage.getScreenshot(screenshotUid)
+      .then((data) => {
+        if (cancelled) return;
+        setImageData(data);
+        if (!data) setError("Screenshot not found");
+      })
+      .catch(() => {
+        if (!cancelled) setError("Failed to load screenshot");
+      })
+      .finally(() => {
+        if (!cancelled) setLoading(false);
+      });
+
+    return () => {
+      cancelled = true;
+    };
+  }, [screenshot, screenshotUid]);
+
+  if (!screenshot && !screenshotUid) return null;
+
+  return (
+    <div className={cn("space-y-2 p-4", className)} {...props}>
+      <h4 className="font-medium text-muted-foreground text-xs uppercase tracking-wide">
+        Screenshot
+      </h4>
+      {loading ? (
+        <div className="flex items-center gap-2 text-muted-foreground text-sm">
+          <ClockIcon className="size-4 animate-spin" />
+          <span>Loading screenshot...</span>
+        </div>
+      ) : error ? (
+        <div className="text-destructive text-sm">{error}</div>
+      ) : imageData ? (
+        <img
+          src={imageData}
+          alt="Screenshot"
+          className="cursor-pointer rounded-md max-w-full"
+        />
+      ) : null}
+    </div>
+  );
+};
--- a/packages/aipex-react/src/components/chatbot/components/chatbot.tsx
+++ b/packages/aipex-react/src/components/chatbot/components/chatbot.tsx
@@ -1,6 +1,7 @@
-import { useCallback, useContext, useMemo, useState } from "react";
+import { useCallback, useContext, useEffect, useMemo, useState } from "react";
 import { useChat, useChatConfig } from "../../../hooks";
 import { useTranslation } from "../../../i18n/context";
+import { fetchModelsForSelector } from "../../../lib/models";
 import { cn } from "../../../lib/utils";
 import type { ChatbotThemeVariables, ContextItem } from "../../../types";
 import { DEFAULT_MODELS } from "../constants";
@@ -237,6 +238,27 @@ function ChatbotContent({
  const [inputResetCount, setInputResetCount] = useState(0);
  const [isUxAuditDialogOpen, setIsUxAuditDialogOpen] = useState(false);

+  // Fetch server model list on mount, fall back to prop-provided models
+  const [fetchedModels, setFetchedModels] = useState<
+    Array<{ name: string; value: string }> | null
+  >(null);
+  useEffect(() => {
+    let cancelled = false;
+    fetchModelsForSelector()
+      .then((serverModels) => {
+        if (!cancelled && serverModels.length > 0) {
+          setFetchedModels(serverModels);
+        }
+      })
+      .catch(() => {
+        // Fallback to prop-provided models — already used below
+      });
+    return () => {
+      cancelled = true;
+    };
+  }, []);
+  const effectiveModels = fetchedModels ?? models;
+
  const handleSubmit = useCallback(
    (text: string, files?: File[], contexts?: ContextItem[]) => {
      void sendMessage?.(text, files, contexts);
@@ -318,7 +340,7 @@ function ChatbotContent({
            onSubmit={handleSubmit}
            onStop={interrupt}
            status={status || "idle"}
-            models={models}
+            models={effectiveModels}
            placeholderTexts={placeholderTexts}
          />
        </>
--- a/packages/aipex-react/src/components/chatbot/components/message-item.tsx
+++ b/packages/aipex-react/src/components/chatbot/components/message-item.tsx
@@ -1,7 +1,15 @@
-import { CopyIcon, RefreshCcwIcon } from "lucide-react";
-import { Fragment } from "react";
+import { CopyIcon, RefreshCcwIcon, WrenchIcon } from "lucide-react";
+import { Fragment, useMemo } from "react";
+import { useTranslation } from "../../../i18n/context";
+import { translatedToolName } from "../../../i18n/tool-names";
+import { transformScreenshotPlaceholders } from "../../../lib/screenshot-utils";
 import { cn } from "../../../lib/utils";
-import type { MessageItemProps, UISourceUrlPart } from "../../../types";
+import type {
+  MessageItemProps,
+  UIMessage,
+  UISourceUrlPart,
+  UIToolPart,
+} from "../../../types";
 import { Action, Actions } from "../../ai-elements/actions";
 import { Message, MessageContent } from "../../ai-elements/message";
 import {
@@ -55,6 +63,21 @@ export function DefaultMessageItem({
    return null;
  }

+  // Collect screenshot data from tool parts for placeholder resolution
+  const { screenshotUidList, screenshotDataMap } = useMemo(() => {
+    const uids: string[] = [];
+    const dataMap = new Map<string, string>();
+    for (const p of message.parts) {
+      if (p.type === "tool" && p.screenshotUid) {
+        uids.push(p.screenshotUid);
+        if (p.screenshot) {
+          dataMap.set(p.screenshotUid, p.screenshot);
+        }
+      }
+    }
+    return { screenshotUidList: uids, screenshotDataMap: dataMap };
+  }, [message.parts]);
+
  // Render sources if present
  const sourceUrls = message.parts.filter(
    (part): part is UISourceUrlPart => part.type === "source-url",
@@ -79,12 +102,27 @@ export function DefaultMessageItem({
        const key = `${message.id}-${i}`;

        switch (part.type) {
-          case "text":
+          case "text": {
+            // Transform [[screenshot:...]] placeholders to markdown images.
+            // First resolve to special URLs, then replace with actual
+            // base64 data URLs when available for inline rendering.
+            let processedText = part.text;
+            if (screenshotUidList.length > 0) {
+              processedText = transformScreenshotPlaceholders(
+                processedText,
+                screenshotUidList,
+              );
+              // Replace aipex-screenshot.invalid URLs with actual data
+              for (const [uid, data] of screenshotDataMap) {
+                const placeholder = `https://aipex-screenshot.invalid/${uid}`;
+                processedText = processedText.split(placeholder).join(data);
+              }
+            }
            return (
              <Fragment key={key}>
                <Message from={message.role as "user" | "assistant" | "system"}>
                  <MessageContent>
-                    <Response>{part.text}</Response>
+                    <Response>{processedText}</Response>
                  </MessageContent>
                </Message>
                {/* Actions for last assistant message */}
@@ -112,6 +150,7 @@ export function DefaultMessageItem({
                  ))}
              </Fragment>
            );
+          }

          case "file":
            return (
@@ -241,6 +280,59 @@ export function DefaultMessageItem({
  );
 }

+// ============ Collapsed tool display for folded messages ============
+
+/**
+ * Compact one-line rendering of a tool part: a wrench icon followed by the
+ * tool's translated display name.
+ */
+function CollapsedToolDisplay({ tool }: { tool: UIToolPart }) {
+  const { t: translate } = useTranslation();
+
+  return (
+    <div className="text-xs text-muted-foreground py-1 px-2 flex items-center gap-1.5">
+      <WrenchIcon className="size-3" />
+      {translatedToolName(translate, tool.toolName)}
+    </div>
+  );
+}
+
+// ============ Collapsed message item for intermediate assistant messages ============
+
+/**
+ * CollapsedMessageItem – condensed view of one intermediate assistant message,
+ * meant for the folded "thinking details" section.
+ *
+ * - text parts are shown as "- "-prefixed lines
+ * - tool parts are shown through CollapsedToolDisplay
+ * - reasoning parts are shown in italics, cut to 120 characters plus "…"
+ * - every other part type renders nothing
+ */
+export function CollapsedMessageItem({ message }: { message: UIMessage }) {
+  const renderedParts = message.parts.map((part, index) => {
+    const key = `${message.id}-collapsed-${index}`;
+
+    if (part.type === "text") {
+      return (
+        <div key={key} className="text-sm text-muted-foreground py-1">
+          - {part.text}
+        </div>
+      );
+    }
+
+    if (part.type === "tool") {
+      return <CollapsedToolDisplay key={key} tool={part} />;
+    }
+
+    if (part.type === "reasoning") {
+      // Long reasoning is previewed only; the cut happens on UTF-16 code units
+      const preview =
+        part.text.length > 120 ? `${part.text.slice(0, 120)}…` : part.text;
+      return (
+        <div
+          key={key}
+          className="text-xs text-muted-foreground/70 py-0.5 italic"
+        >
+          {preview}
+        </div>
+      );
+    }
+
+    return null;
+  });
+
+  return <div>{renderedParts}</div>;
+}
+
 /**
 * MessageItem - Renders either custom or default message item
 */
--- a/packages/aipex-react/src/components/chatbot/components/message-list.tsx
+++ b/packages/aipex-react/src/components/chatbot/components/message-list.tsx
@@ -1,15 +1,55 @@
+import { BrainIcon, ChevronDownIcon } from "lucide-react";
+import { useMemo } from "react";
+import { useTranslation } from "../../../i18n/context";
 import { cn } from "../../../lib/utils";
-import type { MessageListProps } from "../../../types";
+import type { MessageListProps, UIMessage } from "../../../types";
 import {
  Conversation,
  ConversationContent,
  ConversationScrollButton,
 } from "../../ai-elements/conversation";
 import { Loader } from "../../ai-elements/loader";
+import {
+  Collapsible,
+  CollapsibleContent,
+  CollapsibleTrigger,
+} from "../../ui/collapsible";
 import { useComponentsContext } from "../context";
-import { MessageItem } from "./message-item";
+import { CollapsedMessageItem, MessageItem } from "./message-item";
 import { WelcomeScreen } from "./welcome-screen";

+/**
+ * A conversation turn: one optional user message followed by one or more
+ * assistant messages produced before the next user message.
+ */
+interface ConversationTurn {
+  userMessage?: UIMessage;
+  assistantMessages: UIMessage[];
+}
+
+/**
+ * Partition a flat, ordered message list into conversation turns.
+ *
+ * Every user message opens a new turn; each assistant message is appended to
+ * the most recently opened turn. Assistant messages that arrive before any
+ * user message form a leading turn without a `userMessage`.
+ *
+ * NOTE(review): messages whose role is neither "user" nor "assistant" are
+ * dropped here — confirm no such role needs to be displayed.
+ */
+function groupIntoTurns(messages: UIMessage[]): ConversationTurn[] {
+  const grouped: ConversationTurn[] = [];
+
+  for (const msg of messages) {
+    if (msg.role === "user") {
+      // A user message always starts a fresh turn
+      grouped.push({ userMessage: msg, assistantMessages: [] });
+      continue;
+    }
+
+    if (msg.role !== "assistant") continue;
+
+    // Attach to the latest turn, creating a user-less one if none exists yet
+    let latest = grouped[grouped.length - 1];
+    if (!latest) {
+      latest = { assistantMessages: [] };
+      grouped.push(latest);
+    }
+    latest.assistantMessages.push(msg);
+  }
+
+  return grouped;
+}
+
 /**
 * Default MessageList component
 */
@@ -27,10 +67,18 @@ export function DefaultMessageList({
  onUxAuditClick?: () => void;
 }) {
  const { slots } = useComponentsContext();
+  const { t } = useTranslation();

  // Filter out system messages for display
  const displayMessages = messages.filter((m) => m.role !== "system");

+  // Group into conversation turns for folding
+  const turns = useMemo(() => groupIntoTurns(displayMessages), [displayMessages]);
+
+  // Determine if a message is the very last display message
+  const lastMessage = displayMessages[displayMessages.length - 1];
+  const lastMessageId = lastMessage?.id ?? null;
+
  return (
    <div className={cn("flex-1 overflow-hidden", className)} {...props}>
      <Conversation className="h-full">
@@ -45,15 +93,78 @@ export function DefaultMessageList({
              onUxAuditClick={onUxAuditClick}
            />
          ) : (
-            displayMessages.map((message, index) => (
-              <MessageItem
-                key={message.id}
-                message={message}
-                isLast={index === displayMessages.length - 1}
-                isStreaming={status === "streaming"}
-                onRegenerate={onRegenerate}
-                onCopy={onCopy}
-              />
+            turns.map((turn, turnIndex) => (
+              <div key={`turn-${turnIndex}`}>
+                {/* Render user message */}
+                {turn.userMessage && (
+                  <MessageItem
+                    key={turn.userMessage.id}
+                    message={turn.userMessage}
+                    isLast={turn.userMessage.id === lastMessageId}
+                    isStreaming={status === "streaming"}
+                    onRegenerate={onRegenerate}
+                    onCopy={onCopy}
+                  />
+                )}
+
+                {/* Render assistant messages with folding */}
+                {turn.assistantMessages.length > 1 ? (
+                  (() => {
+                    const finalMsg =
+                      turn.assistantMessages[
+                        turn.assistantMessages.length - 1
+                      ]!;
+                    return (
+                      <>
+                        {/* Intermediate messages – collapsed by default */}
+                        <Collapsible defaultOpen={false} className="mb-2">
+                          <CollapsibleTrigger className="flex w-full cursor-pointer items-center gap-2 rounded-md border border-muted bg-muted/30 px-3 py-2 text-sm text-muted-foreground transition-colors hover:bg-muted/50 hover:text-foreground">
+                            <BrainIcon className="size-4" />
+                            <span className="flex-1 text-left">
+                              {t("common.showThinkingDetails")}
+                            </span>
+                            <ChevronDownIcon className="size-4 transition-transform [[data-state=open]>&]:rotate-180" />
+                          </CollapsibleTrigger>
+                          <CollapsibleContent className="mt-2">
+                            <div className="rounded-md border border-muted/50 bg-muted/10 p-3 space-y-2">
+                              {turn.assistantMessages
+                                .slice(0, -1)
+                                .map((msg) => (
+                                  <CollapsedMessageItem
+                                    key={msg.id}
+                                    message={msg}
+                                  />
+                                ))}
+                            </div>
+                          </CollapsibleContent>
+                        </Collapsible>
+
+                        {/* Final assistant message – always expanded */}
+                        <MessageItem
+                          key={finalMsg.id}
+                          message={finalMsg}
+                          isLast={finalMsg.id === lastMessageId}
+                          isStreaming={status === "streaming"}
+                          onRegenerate={onRegenerate}
+                          onCopy={onCopy}
+                        />
+                      </>
+                    );
+                  })()
+                ) : (
+                  // Single assistant message – render normally
+                  turn.assistantMessages.map((msg) => (
+                    <MessageItem
+                      key={msg.id}
+                      message={msg}
+                      isLast={msg.id === lastMessageId}
+                      isStreaming={status === "streaming"}
+                      onRegenerate={onRegenerate}
+                      onCopy={onCopy}
+                    />
+                  ))
+                )}
+              </div>
            ))
          )}
          {/* Loading indicator */}
--- a/packages/aipex-react/src/components/chatbot/components/model-change-prompt.tsx
+++ b/packages/aipex-react/src/components/chatbot/components/model-change-prompt.tsx
@@ -1,5 +1,6 @@
 import type React from "react";
-import { useEffect, useState } from "react";
+import { useCallback, useEffect, useState } from "react";
+import { fetchModelsForPrompt } from "../../../lib/models";

 export interface ModelInfo {
  id: string;
@@ -46,24 +47,36 @@ export const ModelChangePrompt: React.FC<ModelChangePromptProps> = ({
  const [allModels, setAllModels] = useState<ModelInfo[]>(availableModels);
  const [isLoadingModels, setIsLoadingModels] = useState(false);

-  // Fetch models from API
-  useEffect(() => {
-    const loadModels = async () => {
-      if (!onFetchModels) return;
+  // Resolve the fetch function: use the provided callback or fall back to
+  // the built-in fetchModelsForPrompt so models are always loaded.
+  const resolvedFetch = useCallback(
+    () => (onFetchModels ? onFetchModels() : fetchModelsForPrompt()),
+    [onFetchModels],
+  );

+  // Fetch models from API (always runs — no longer gated on onFetchModels)
+  useEffect(() => {
+    let cancelled = false;
+    const loadModels = async () => {
      setIsLoadingModels(true);
      try {
-        const fetchedModels = await onFetchModels();
-        setAllModels(fetchedModels);
-      } catch (error) {
-        console.error("Failed to load models:", error);
+        const fetched = await resolvedFetch();
+        if (!cancelled) {
+          setAllModels(fetched);
+        }
+      } catch (_error) {
        // Keep using availableModels as fallback
      } finally {
-        setIsLoadingModels(false);
+        if (!cancelled) {
+          setIsLoadingModels(false);
+        }
      }
    };
    loadModels();
-  }, [onFetchModels]);
+    return () => {
+      cancelled = true;
+    };
+  }, [resolvedFetch]);

  // Update models when availableModels prop changes
  useEffect(() => {
--- a/packages/aipex-react/src/components/chatbot/components/slots/tool-display.tsx
+++ b/packages/aipex-react/src/components/chatbot/components/slots/tool-display.tsx
@@ -4,6 +4,8 @@ import {
  WrenchIcon,
  XCircleIcon,
 } from "lucide-react";
+import { useTranslation } from "../../../../i18n/context";
+import { translatedToolName } from "../../../../i18n/tool-names";
 import { cn } from "../../../../lib/utils";
 import type { ToolDisplaySlotProps } from "../../../../types";
 import { Response } from "../../../ai-elements/response";
@@ -13,6 +15,7 @@ import {
  ToolHeader,
  ToolInput,
  ToolOutput,
+  ToolScreenshot,
 } from "../../../ai-elements/tool";
 import {
  Collapsible,
@@ -26,13 +29,15 @@ import { formatToolOutput, mapToolState } from "../../tools";
 * Opens by default when there's an error so users can see the failure reason
 */
 export function DefaultToolDisplay({ tool }: ToolDisplaySlotProps) {
+  const { t } = useTranslation();
+  const displayName = translatedToolName(t, tool.toolName);
  // Expand by default when in error state to make failure reasons visible
  const shouldExpandByDefault = tool.state === "error";

  return (
    <Tool defaultOpen={shouldExpandByDefault}>
      <ToolHeader
-        type={`tool-${tool.toolName}`}
+        type={displayName}
        state={mapToolState(tool.state)}
      />
      <ToolContent>
@@ -45,6 +50,10 @@ export function DefaultToolDisplay({ tool }: ToolDisplaySlotProps) {
          }
          errorText={tool.errorText}
        />
+        <ToolScreenshot
+          screenshot={tool.screenshot}
+          screenshotUid={tool.screenshotUid}
+        />
      </ToolContent>
    </Tool>
  );
@@ -55,6 +64,8 @@ export function DefaultToolDisplay({ tool }: ToolDisplaySlotProps) {
 * Opens by default when there's an error so users can see the failure reason
 */
 export function CompactToolDisplay({ tool }: ToolDisplaySlotProps) {
+  const { t } = useTranslation();
+  const displayName = translatedToolName(t, tool.toolName);
  const getStatusIcon = () => {
    switch (tool.state) {
      case "pending":
@@ -75,7 +86,7 @@ export function CompactToolDisplay({ tool }: ToolDisplaySlotProps) {
    <Collapsible defaultOpen={shouldExpandByDefault}>
      <CollapsibleTrigger className="flex items-center gap-2 w-full p-2 rounded-md hover:bg-muted/50 transition-colors">
        {getStatusIcon()}
-        <span className="text-sm font-medium">{tool.toolName}</span>
+        <span className="text-sm font-medium">{displayName}</span>
        {tool.duration && (
          <span className="text-xs text-muted-foreground ml-auto">
            {tool.duration}ms
@@ -118,6 +129,8 @@ export function CompactToolDisplay({ tool }: ToolDisplaySlotProps) {
 * Minimal tool display (just status indicator)
 */
 export function MinimalToolDisplay({ tool }: ToolDisplaySlotProps) {
+  const { t } = useTranslation();
+  const displayName = translatedToolName(t, tool.toolName);
  const getStatusColor = () => {
    switch (tool.state) {
      case "pending":
@@ -134,7 +147,7 @@ export function MinimalToolDisplay({ tool }: ToolDisplaySlotProps) {
  return (
    <div className="inline-flex items-center gap-1.5 px-2 py-1 text-xs rounded-full bg-muted">
      <div className={cn("w-2 h-2 rounded-full", getStatusColor())} />
-      <span>{tool.toolName}</span>
+      <span>{displayName}</span>
      {tool.state === "executing" && (
        <Loader2Icon className="size-3 animate-spin" />
      )}
--- a/packages/aipex-react/src/hooks/use-chat.ts
+++ b/packages/aipex-react/src/hooks/use-chat.ts
@@ -91,6 +91,9 @@ export function useChat(
  const [sessionId, setSessionId] = useState<string | null>(null);
  const [metrics, setMetrics] = useState<AgentMetrics | null>(null);

+  // Cumulative session-level metrics (sum across all runs)
+  const cumulativeMetricsRef = useRef<AgentMetrics | null>(null);
+
  // Refs for stable callbacks
  const handlersRef = useRef(handlers);
  handlersRef.current = handlers;
@@ -153,11 +156,28 @@ export function useChat(
            handlersRef.current?.onError?.(event.error);
          }

-          // Handle metrics update
+          // Handle metrics update – accumulate across the session
          if (event.type === "metrics_update") {
-            setMetrics(event.metrics);
+            const prev = cumulativeMetricsRef.current;
+            const cumulative: AgentMetrics = {
+              tokensUsed:
+                (prev?.tokensUsed ?? 0) + event.metrics.tokensUsed,
+              promptTokens:
+                (prev?.promptTokens ?? 0) + event.metrics.promptTokens,
+              completionTokens:
+                (prev?.completionTokens ?? 0) +
+                event.metrics.completionTokens,
+              // Non-cumulative fields: use latest values
+              itemCount: event.metrics.itemCount,
+              maxTurns: event.metrics.maxTurns,
+              duration:
+                (prev?.duration ?? 0) + event.metrics.duration,
+              startTime: prev?.startTime ?? event.metrics.startTime,
+            };
+            cumulativeMetricsRef.current = cumulative;
+            setMetrics(cumulative);
            handlersRef.current?.onMetricsUpdate?.(
-              event.metrics,
+              cumulative,
              event.sessionId,
            );
          }
@@ -263,6 +283,7 @@ export function useChat(
    activeGeneratorRef.current = null;
    setSessionId(null);
    setMetrics(null);
+    cumulativeMetricsRef.current = null;
    adapter.reset(configRef.current?.initialMessages ?? []);
  }, [adapter, agent, sessionId]);

--- a/packages/aipex-react/src/i18n/locales/en.json
+++ b/packages/aipex-react/src/i18n/locales/en.json
@@ -10,7 +10,9 @@
    "send": "Send",
    "stop": "Stop",
    "processing": "Processing...",
-    "noActions": "No actions"
+    "noActions": "No actions",
+    "showThinkingDetails": "Show thinking details",
+    "clickToExpand": "Click to expand"
  },
  "settings": {
    "title": "Settings",
--- a/packages/aipex-react/src/i18n/locales/zh.json
+++ b/packages/aipex-react/src/i18n/locales/zh.json
@@ -10,7 +10,9 @@
    "send": "发送",
    "stop": "停止",
    "processing": "处理中...",
-    "noActions": "无可用操作"
+    "noActions": "无可用操作",
+    "showThinkingDetails": "显示思考过程",
+    "clickToExpand": "点击展开"
  },
  "settings": {
    "title": "设置",
--- a/packages/aipex-react/src/i18n/types.ts
+++ b/packages/aipex-react/src/i18n/types.ts
@@ -13,6 +13,8 @@ export interface TranslationResources {
    stop: string;
    processing: string;
    noActions: string;
+    showThinkingDetails: string;
+    clickToExpand: string;
  };
  settings: {
    title: string;
@@ -229,6 +231,8 @@ export type BaseTranslationKey =
  | "common.stop"
  | "common.processing"
  | "common.noActions"
+  | "common.showThinkingDetails"
+  | "common.clickToExpand"
  | "settings.title"
  | "settings.subtitle"
  | "settings.language"
--- a/packages/aipex-react/src/lib/models.ts
+++ b/packages/aipex-react/src/lib/models.ts
@@ -0,0 +1,180 @@
+// API response types (must match server contract)
+interface ApiModelPricing {
+  input: number; // rendered by convertApiModel as "$<input.toFixed(2)>/1M tokens"
+  output: number; // rendered by convertApiModel as "$<output.toFixed(2)>/1M tokens"
+}
+
+interface ApiModel {
+  id: string; // copied to ModelInfo.id; fetchModelsForSelector uses it as the option value
+  name: string;
+  provider: string;
+  description: string;
+  pricing: ApiModelPricing; // source of both the price strings and the priceLevel tier
+}
+
+interface ApiResponse {
+  success: boolean; // fetchModels only uses the list when this is true
+  data: {
+    models: ApiModel[]; // fetchModels keeps at most MAX_MODELS entries
+    count: number; // not inspected by isValidApiResponse
+    cache: { // not inspected by isValidApiResponse
+      lastUpdate: number;
+      modelCount: number;
+    };
+  };
+}
+
+// Internal model info used by the chatbot UI
+export interface ModelInfo {
+  id: string;
+  name: string;
+  provider: string;
+  description: string;
+  supportsTools: boolean; // convertApiModel sets this to true for every server model
+  contextLength?: number; // set on the fallback entries; convertApiModel leaves it unset
+  pricing?: {
+    input: string; // display string such as "$0.30/1M tokens"
+    output: string; // display string such as "$1.50/1M tokens"
+  };
+  priceLevel: "cheap" | "normal" | "expensive"; // thresholds live in getPriceLevel
+}
+
+// Static model list that fetchModels falls back to when the API request fails or yields no usable models
+const FALLBACK_MODELS: ModelInfo[] = [
+  {
+    id: "anthropic/claude-3-haiku",
+    name: "Claude 3 Haiku",
+    provider: "Anthropic",
+    description: "Cost-effective choice for basic tasks",
+    supportsTools: true,
+    contextLength: 200_000,
+    pricing: {
+      input: "$0.30/1M tokens",
+      output: "$1.50/1M tokens",
+    },
+    priceLevel: "cheap",
+  },
+  {
+    id: "anthropic/claude-sonnet-4.5",
+    name: "Claude Sonnet 4.5",
+    provider: "Anthropic",
+    description: "AI model for various tasks",
+    supportsTools: true,
+    contextLength: 200_000,
+    pricing: {
+      input: "$3.60/1M tokens",
+      output: "$18.00/1M tokens",
+    },
+    priceLevel: "expensive",
+  },
+];
+
+const MODELS_API_URL = "https://www.claudechrome.com/api/models";
+
+// Classify a model by the sum of its input and output prices: <2 cheap, <10 normal.
+function getPriceLevel(
+  pricing: ApiModelPricing,
+): "cheap" | "normal" | "expensive" {
+  const combined = pricing.input + pricing.output;
+  if (combined < 2) return "cheap";
+  // A NaN sum fails both comparisons and therefore lands in "expensive".
+  return combined < 10 ? "normal" : "expensive";
+}
+
+// Map a server model record onto the UI-facing ModelInfo shape; tool support is always true.
+function convertApiModel(apiModel: ApiModel): ModelInfo {
+  const { id, name, provider, description, pricing } = apiModel;
+  const perMillion = (usd: number): string => `$${usd.toFixed(2)}/1M tokens`;
+  return {
+    id,
+    name,
+    provider,
+    description,
+    supportsTools: true,
+    pricing: { input: perMillion(pricing.input), output: perMillion(pricing.output) },
+    // The tier is computed from the raw numeric prices, not the formatted strings.
+    priceLevel: getPriceLevel(pricing),
+  };
+}
+
+// Type guard for the models endpoint payload: requires a boolean `success` and an array
+// at `data.models`, then spot-checks that the first model (if any) is an object with
+// string `id` and `name`. Later entries and `pricing` are not inspected.
+function isValidApiResponse(data: unknown): data is ApiResponse {
+  if (typeof data !== "object" || data === null) return false;
+  const obj = data as Record<string, unknown>;
+  if (typeof obj.success !== "boolean") return false;
+  if (typeof obj.data !== "object" || obj.data === null) return false;
+  const d = obj.data as Record<string, unknown>;
+  if (!Array.isArray(d.models)) return false;
+  if (d.models.length === 0) return true;
+  const first: unknown = d.models[0];
+  // Reject a null or primitive entry here instead of throwing on the property reads.
+  if (typeof first !== "object" || first === null) return false;
+  const model = first as Record<string, unknown>;
+  return typeof model.id === "string" && typeof model.name === "string";
+}
+
+// Cache for models
+let cachedModels: ModelInfo[] | null = null;
+let lastFetchTime = 0;
+const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
+const MAX_MODELS = 200; // Safety cap on number of models
+
+/**
+ * Fetch models from the server API with a 5-minute in-memory cache.
+ * On any failure (network error, non-2xx status, invalid or empty payload) returns the
+ * last successfully fetched list when one exists, otherwise FALLBACK_MODELS.
+ */
+export async function fetchModels(): Promise<ModelInfo[]> {
+  // Return cached models if still valid
+  if (cachedModels && Date.now() - lastFetchTime < CACHE_DURATION) {
+    return cachedModels;
+  }
+
+  try {
+    const response = await fetch(MODELS_API_URL);
+
+    if (!response.ok) {
+      throw new Error(`API request failed: ${response.status}`);
+    }
+
+    const data: unknown = await response.json();
+
+    if (!isValidApiResponse(data)) {
+      throw new Error("Invalid API response structure");
+    }
+
+    if (data.success && data.data.models.length > 0) {
+      // Apply safety cap
+      const models = data.data.models
+        .slice(0, MAX_MODELS)
+        .map(convertApiModel);
+      cachedModels = models;
+      lastFetchTime = Date.now();
+      return cachedModels;
+    }
+
+    throw new Error("Empty model list from API");
+  } catch (_error) {
+    // An expired server list still beats the static fallback; error details are not logged.
+    return cachedModels ?? FALLBACK_MODELS;
+  }
+}
+
+/**
+ * Resolve the model list and project each entry onto the model selector's
+ * `{ name, value }` option shape, where `value` is the model id.
+ */
+export async function fetchModelsForSelector(): Promise<
+  Array<{ name: string; value: string }>
+> {
+  return (await fetchModels()).map(({ id, name }) => ({ name, value: id }));
+}
+
+/**
+ * Resolve the same ModelInfo[] as fetchModels; kept for ModelChangePrompt compatibility.
+ */
+export async function fetchModelsForPrompt(): Promise<ModelInfo[]> {
+  return await fetchModels();
+}
--- a/packages/aipex-react/src/lib/screenshot-storage.ts
+++ b/packages/aipex-react/src/lib/screenshot-storage.ts
@@ -0,0 +1,177 @@
+/**
+ * Screenshot storage using IndexedDB.
+ * Stores screenshots with a uid for efficient reference and retrieval.
+ * Applies an LRU eviction policy (max 50 screenshots).
+ */
+
+export interface ScreenshotData {
+  uid: string; // primary key of the object store (keyPath "uid")
+  /** Complete data URL: data:image/png;base64,... */
+  base64Data: string;
+  timestamp: number; // Date.now() at save time; indexed and used to order eviction
+  tabId?: number;
+  metadata?: {
+    width: number; // each dimension is stored as 0 when saveScreenshot was not given it
+    height: number;
+    viewportWidth: number;
+    viewportHeight: number;
+  };
+}
+
+const DB_NAME = "aipex-screenshots-db";
+const DB_VERSION = 1;
+const STORE_NAME = "screenshots";
+const MAX_SCREENSHOTS = 50;
+
+let db: IDBDatabase | null = null;
+let initPromise: Promise<void> | null = null;
+
+function initialize(): Promise<void> {
+  if (initPromise) return initPromise; // an open request is in flight; share its promise
+  if (db) return Promise.resolve(); // connection already established
+  // Otherwise open the database, creating the store and its timestamp index if missing.
+  initPromise = new Promise<void>((resolve, reject) => {
+    const request = indexedDB.open(DB_NAME, DB_VERSION);
+
+    request.onerror = () => {
+      initPromise = null; // forget the failed attempt so a later call opens again
+      reject(request.error);
+    };
+
+    request.onsuccess = () => {
+      db = request.result; // kept at module level and reused by every storage operation
+      initPromise = null;
+      resolve();
+    };
+
+    request.onupgradeneeded = (event) => {
+      const database = (event.target as IDBOpenDBRequest).result;
+      if (!database.objectStoreNames.contains(STORE_NAME)) {
+        const store = database.createObjectStore(STORE_NAME, {
+          keyPath: "uid",
+        });
+        store.createIndex("timestamp", "timestamp", { unique: false });
+      }
+    };
+  });
+
+  return initPromise;
+}
+
+function generateUid(): string {
+  return ["screenshot", Date.now(), Math.random().toString(36).slice(2, 11)].join("_");
+}
+
+// Evict the oldest entries (by save timestamp) so that at most MAX_SCREENSHOTS remain.
+// Only primary keys are read, via the "timestamp" index (ascending order), so image
+// payloads are never loaded into memory. The returned promise settles once the
+// transaction has completed, so failed deletions reject instead of going unobserved.
+async function applyLRU(): Promise<void> {
+  if (!db) return;
+  const tx = db.transaction([STORE_NAME], "readwrite");
+  const store = tx.objectStore(STORE_NAME);
+  const finished = new Promise<void>((res, rej) => {
+    tx.oncomplete = () => res();
+    // An unhandled request error aborts the transaction, so onabort covers both cases.
+    tx.onabort = () => rej(tx.error);
+  });
+  const keysReq = store.index("timestamp").getAllKeys();
+  keysReq.onsuccess = () => {
+    const excess = Math.max(0, keysReq.result.length - MAX_SCREENSHOTS);
+    // Keys arrive oldest-first, so the first `excess` keys are the ones to drop.
+    for (const key of keysReq.result.slice(0, excess)) store.delete(key);
+  };
+  await finished;
+}
+
+export const ScreenshotStorage = {
+  /**
+   * Store a screenshot under a freshly generated uid and resolve with that uid.
+   * Throws if base64Data is not a string starting with "data:image/".
+   */
+  async saveScreenshot(
+    base64Data: string,
+    metadata?: {
+      tabId?: number;
+      width?: number;
+      height?: number;
+      viewportWidth?: number;
+      viewportHeight?: number;
+    },
+  ): Promise<string> {
+    // Only the "data:image/" prefix is checked; the payload itself is not decoded
+    if (
+      typeof base64Data !== "string" ||
+      !base64Data.startsWith("data:image/")
+    ) {
+      throw new Error("Invalid screenshot data: expected data:image/ URL");
+    }
+
+    await initialize();
+    if (!db) throw new Error("Database not initialized");
+
+    const uid = generateUid();
+    const entry: ScreenshotData = {
+      uid,
+      base64Data,
+      timestamp: Date.now(),
+      tabId: metadata?.tabId,
+      metadata: metadata // when metadata is given, any missing dimension is stored as 0
+        ? {
+            width: metadata.width ?? 0,
+            height: metadata.height ?? 0,
+            viewportWidth: metadata.viewportWidth ?? 0,
+            viewportHeight: metadata.viewportHeight ?? 0,
+          }
+        : undefined,
+    };
+
+    await new Promise<void>((resolve, reject) => {
+      const tx = db!.transaction([STORE_NAME], "readwrite");
+      const store = tx.objectStore(STORE_NAME);
+      const req = store.put(entry);
+      req.onsuccess = () => resolve(); // settles on the put request, not on transaction completion
+      req.onerror = () => reject(req.error);
+    });
+
+    // Trim old entries in the background; eviction failures are deliberately ignored
+    applyLRU().catch(() => {});
+
+    return uid;
+  },
+
+  /**
+   * Resolve the stored data URL for `uid`, or null when no entry has that key.
+   */
+  async getScreenshot(uid: string): Promise<string | null> {
+    await initialize();
+    if (!db) throw new Error("Database not initialized");
+
+    return new Promise((resolve, reject) => {
+      const tx = db!.transaction([STORE_NAME], "readonly");
+      const store = tx.objectStore(STORE_NAME);
+      const req = store.get(uid);
+      req.onsuccess = () => {
+        const data = req.result as ScreenshotData | undefined;
+        resolve(data?.base64Data ?? null);
+      };
+      req.onerror = () => reject(req.error);
+    });
+  },
+
+  /**
+   * Remove every entry from the screenshot object store.
+   */
+  async clearAll(): Promise<void> {
+    await initialize();
+    if (!db) throw new Error("Database not initialized");
+
+    await new Promise<void>((resolve, reject) => {
+      const tx = db!.transaction([STORE_NAME], "readwrite");
+      const store = tx.objectStore(STORE_NAME);
+      const req = store.clear();
+      req.onsuccess = () => resolve();
+      req.onerror = () => reject(req.error);
+    });
+  },
+};
--- a/packages/aipex-react/src/lib/screenshot-utils.ts
+++ b/packages/aipex-react/src/lib/screenshot-utils.ts
@@ -0,0 +1,185 @@
+/**
+ * Utilities for detecting screenshot tools and extracting image data
+ * from tool results.
+ */
+
+/** Tool names that produce screenshot image data */
+const SCREENSHOT_TOOL_NAMES = new Set([
+  "capture_screenshot",
+  "capture_screenshot_with_highlight",
+  "capture_tab_screenshot",
+]);
+
+/** URL prefix used in markdown for screenshot references */
+export const AIPEX_SCREENSHOT_URL_PREFIX = "https://aipex-screenshot.invalid/";
+
+/** Regex matching [[screenshot:...]] placeholders */
+const SCREENSHOT_PLACEHOLDER_REGEX = /\[\[screenshot:([^\]]+)\]\]/g;
+
+/** True when `uid` has the form screenshot_<digits>_<1-20 alphanumerics>, case-insensitive */
+export function isValidScreenshotUid(uid: string): boolean {
+  return /^screenshot_\d+_[a-z0-9]{1,20}$/i.exec(uid) !== null;
+}
+
+/**
+ * Report whether `toolName` exactly matches one of the names in SCREENSHOT_TOOL_NAMES.
+ */
+export function isCaptureScreenshotTool(toolName: string): boolean {
+  return SCREENSHOT_TOOL_NAMES.has(toolName);
+}
+
+export interface ScreenshotExtraction {
+  /** `data:image/` URL found in the result; null when absent or already replaced by a placeholder */
+  imageData: string | null;
+  /** Whether the screenshot was intended for LLM vision (always true for array-format results) */
+  sendToLLM: boolean;
+  /** Uid for loading the image from IndexedDB storage; null when the result carried none */
+  screenshotUid: string | null;
+}
+
+/**
+ * Extract screenshot info from a screenshot tool's result; any other tool yields null.
+ * A string result is JSON-parsed first, and a parse failure yields null.
+ *
+ * Supports multiple result formats:
+ * - Object: { success, imageData, sendToLLM, screenshotUid }
+ * - Nested object: { success, data: { ... } } or { success, data: { data: { ... } } }
+ * - SDK structured array: [{ type: "text", text: JSON }, { type: "image", image: dataUrl }]
+ *
+ * Object formats need a truthy top-level `success` and at least a uid or a data URL.
+ */
+export function extractScreenshotFromToolResult(
+  toolName: string,
+  result: unknown,
+): ScreenshotExtraction | null {
+  if (!isCaptureScreenshotTool(toolName)) return null;
+
+  try {
+    const content =
+      typeof result === "string" ? JSON.parse(result) : result;
+    if (content === null || content === undefined) return null;
+
+    // SDK structured array format:
+    // [{ type: "text", text: '{"success":true,...}' }, { type: "image", image: "data:..." }]
+    if (Array.isArray(content)) {
+      return extractFromStructuredArray(content);
+    }
+
+    if (typeof content !== "object") return null;
+
+    const obj = content as Record<string, unknown>;
+
+    // Pick the innermost payload that is set: obj.data.data, else obj.data,
+    // else obj itself (the flat format)
+    const middleLayer = obj.data as Record<string, unknown> | undefined;
+    const actualData =
+      (middleLayer?.data as Record<string, unknown>) ?? middleLayer ?? obj;
+    // `success` is read from the top-level object, not from the nested payload
+    if (!obj.success) return null;
+
+    // Extract screenshotUid (always present if tool saved to IndexedDB)
+    const screenshotUid =
+      typeof actualData.screenshotUid === "string"
+        ? actualData.screenshotUid
+        : null;
+
+    // Extract imageData; anything that is not a data:image/ URL (e.g. a placeholder) becomes null
+    const rawImageData = actualData.imageData;
+    const imageData =
+      typeof rawImageData === "string" &&
+      rawImageData.startsWith("data:image/")
+        ? rawImageData
+        : null;
+
+    const sendToLLM = actualData.sendToLLM === true;
+
+    // Return if we have at least a uid or image data
+    if (screenshotUid || imageData) {
+      return { imageData, sendToLLM, screenshotUid };
+    }
+  } catch {
+    // JSON.parse (or a property read) threw – fall through to the null return
+  }
+
+  return null;
+}
+
+/**
+ * Pull the image and uid out of an SDK structured-array result.
+ * The last `image` part holding a `data:image/` URL wins, as does the last `text` part
+ * whose JSON carries a string `screenshotUid`. Yields null when no image part qualifies.
+ * On this path `sendToLLM` is always reported as true, whatever the text part says.
+ */
+function extractFromStructuredArray(
+  arr: unknown[],
+): ScreenshotExtraction | null {
+  let dataUrl: string | null = null;
+  let uid: string | null = null;
+
+  for (const entry of arr) {
+    if (entry === null || typeof entry !== "object") continue;
+    const { type, image, text } = entry as Record<string, unknown>;
+
+    if (type === "image") {
+      if (typeof image === "string" && image.startsWith("data:image/")) {
+        dataUrl = image;
+      }
+      continue;
+    }
+
+    if (type !== "text" || typeof text !== "string") continue;
+
+    try {
+      // A payload of `null` makes the property read throw; that is swallowed as well.
+      const meta = JSON.parse(text) as Record<string, unknown>;
+      if (typeof meta.screenshotUid === "string") uid = meta.screenshotUid;
+    } catch {
+      // ignore
+    }
+  }
+
+  // Without a usable image part there is nothing to report, even if a uid was found.
+  if (dataUrl === null) return null;
+  return { imageData: dataUrl, sendToLLM: true, screenshotUid: uid };
+}
+
+/**
+ * Transform [[screenshot:...]] placeholders in text into markdown images
+ * with the special aipex-screenshot.invalid URL prefix.
+ *
+ * Supported formats:
+ * - [[screenshot:screenshot_123_abc]]  → ![](https://aipex-screenshot.invalid/screenshot_123_abc)
+ * - [[screenshot:1]]                   → 1-based index into screenshotUidList
+ *
+ * Any other content (e.g. "1abc", "2.5", an out-of-range index) is left unchanged.
+ */
+export function transformScreenshotPlaceholders(
+  text: string,
+  screenshotUidList: string[],
+): string {
+  return text.replace(
+    SCREENSHOT_PLACEHOLDER_REGEX,
+    (match: string, content: string) => {
+      const trimmed = content.trim();
+
+      // Case 1: Direct uid
+      if (isValidScreenshotUid(trimmed)) {
+        return `![](${AIPEX_SCREENSHOT_URL_PREFIX}${trimmed})`;
+      }
+
+      // Case 2: Numeric 1-based index. Require digits only: parseInt on its own would
+      // accept "1abc" or "2.5" by silently dropping everything after the leading digits.
+      const isIndex = /^\d+$/.test(trimmed);
+      const index = isIndex ? parseInt(trimmed, 10) : NaN;
+      if (index >= 1 && index <= screenshotUidList.length) {
+        const uid = screenshotUidList[index - 1];
+        if (uid && isValidScreenshotUid(uid)) {
+          return `![](${AIPEX_SCREENSHOT_URL_PREFIX}${uid})`;
+        }
+      }
+
+      // Invalid – leave as-is
+      return match;
+    },
+  );
+}
--- a/packages/aipex-react/src/types/ui.ts
+++ b/packages/aipex-react/src/types/ui.ts
@@ -47,6 +47,10 @@ export interface UIToolPart {
  state: UIToolState;
  errorText?: string;
  duration?: number;
+  /** Base64 data URL of the screenshot (inline) */
+  screenshot?: string;
+  /** UID referencing a screenshot in ScreenshotStorage (IndexedDB) */
+  screenshotUid?: string;
 }

 export interface UIContextPart {
--- a/packages/browser-ext/src/lib/chat-images-listener.tsx
+++ b/packages/browser-ext/src/lib/chat-images-listener.tsx
@@ -41,32 +41,58 @@ export function ChatImagesListener() {

        for (const msg of messages) {
          for (const part of msg.parts) {
-            // Tool parts may carry screenshot data in their output
+            // Tool parts may carry screenshot data inline (screenshot field)
+            // or in their output (imageData field)
            if (part.type === "tool") {
-              const output = (part as { output?: unknown }).output;
+              const toolPart = part as {
+                output?: unknown;
+                screenshot?: string;
+                toolName?: string;
+              };
+
+              // Prefer the inline screenshot field (set by ChatAdapter)
+              const screenshotData = toolPart.screenshot;
              if (
-                output &&
-                typeof output === "object" &&
-                "imageData" in output
+                screenshotData &&
+                typeof screenshotData === "string" &&
+                screenshotData.startsWith("data:image/")
              ) {
-                const imageData = (output as { imageData?: string }).imageData;
+                images.push({
+                  id: msg.id,
+                  parts: [
+                    {
+                      type: "image",
+                      imageData: screenshotData,
+                      imageTitle: toolPart.toolName || "Screenshot",
+                    },
+                  ],
+                });
+              } else {
+                // Fall back to extracting from output
+                const output = toolPart.output;
                if (
-                  imageData &&
-                  typeof imageData === "string" &&
-                  imageData.startsWith("data:image/")
+                  output &&
+                  typeof output === "object" &&
+                  "imageData" in output
                ) {
-                  images.push({
-                    id: msg.id,
-                    parts: [
-                      {
-                        type: "image",
-                        imageData,
-                        imageTitle:
-                          (part as { toolName?: string }).toolName ||
-                          "Screenshot",
-                      },
-                    ],
-                  });
+                  const imageData = (output as { imageData?: string })
+                    .imageData;
+                  if (
+                    imageData &&
+                    typeof imageData === "string" &&
+                    imageData.startsWith("data:image/")
+                  ) {
+                    images.push({
+                      id: msg.id,
+                      parts: [
+                        {
+                          type: "image",
+                          imageData,
+                          imageTitle: toolPart.toolName || "Screenshot",
+                        },
+                      ],
+                    });
+                  }
                }
              }
            }
--- a/packages/browser-ext/src/lib/message-adapter.test.ts
+++ b/packages/browser-ext/src/lib/message-adapter.test.ts
@@ -0,0 +1,257 @@
+import { describe, expect, it } from "vitest";
+import { fromStorageFormat, toStorageFormat } from "./message-adapter";
+
+const TEST_IMAGE_DATA = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQ==";
+const TEST_SCREENSHOT_UID = "screenshot_1234567890_abcdefghi";
+const PLACEHOLDER = "[Image data removed - see following user message]";
+
+describe("message-adapter", () => {
+  describe("toStorageFormat – screenshot stripping", () => {
+    it("should strip base64 imageData from screenshot tool results", () => {
+      const output = {
+        success: true,
+        imageData: TEST_IMAGE_DATA,
+        sendToLLM: true,
+        screenshotUid: TEST_SCREENSHOT_UID,
+        tabId: 1,
+        url: "https://example.com",
+        title: "Example",
+      };
+
+      const messages = [
+        {
+          id: "msg-1",
+          role: "assistant" as const,
+          parts: [
+            {
+              type: "tool" as const,
+              toolCallId: "call-1",
+              toolName: "capture_screenshot",
+              input: { sendToLLM: true },
+              output,
+              state: "completed" as const,
+              screenshot: TEST_IMAGE_DATA,
+              screenshotUid: TEST_SCREENSHOT_UID,
+            },
+          ],
+          timestamp: Date.now(),
+        },
+      ];
+
+      const stored = toStorageFormat(messages as any);
+      expect(stored.length).toBe(1);
+
+      // Find the tool_result part
+      const toolResultPart = stored[0]!.parts.find(
+        (p: any) => p.type === "tool_result",
+      ) as any;
+      expect(toolResultPart).toBeTruthy();
+
+      // Parse the content and verify imageData is stripped
+      const parsedContent = JSON.parse(toolResultPart.content);
+      expect(parsedContent.imageData).toBe(PLACEHOLDER);
+      expect(parsedContent.screenshotUid).toBe(TEST_SCREENSHOT_UID);
+      expect(parsedContent.success).toBe(true);
+    });
+
+    it("should not strip non-screenshot tool results", () => {
+      const output = {
+        tabs: [{ id: 1, title: "Tab" }],
+        imageData: TEST_IMAGE_DATA, // Even if it has imageData
+      };
+
+      const messages = [
+        {
+          id: "msg-1",
+          role: "assistant" as const,
+          parts: [
+            {
+              type: "tool" as const,
+              toolCallId: "call-1",
+              toolName: "get_tabs",
+              input: {},
+              output,
+              state: "completed" as const,
+            },
+          ],
+          timestamp: Date.now(),
+        },
+      ];
+
+      const stored = toStorageFormat(messages as any);
+      const toolResultPart = stored[0]!.parts.find(
+        (p: any) => p.type === "tool_result",
+      ) as any;
+      const parsedContent = JSON.parse(toolResultPart.content);
+      expect(parsedContent.imageData).toBe(TEST_IMAGE_DATA);
+    });
+  });
+
+  describe("fromStorageFormat – screenshotUid restoration", () => {
+    it("should restore screenshotUid from stored tool result", () => {
+      const storedOutput = {
+        success: true,
+        imageData: PLACEHOLDER,
+        sendToLLM: true,
+        screenshotUid: TEST_SCREENSHOT_UID,
+        tabId: 1,
+      };
+
+      const storedMessages = [
+        {
+          id: "msg-1",
+          role: "assistant" as const,
+          parts: [
+            {
+              type: "tool_use" as const,
+              id: "call-1",
+              name: "capture_screenshot",
+              input: { sendToLLM: true },
+            },
+            {
+              type: "tool_result" as const,
+              tool_use_id: "call-1",
+              content: JSON.stringify(storedOutput),
+              is_error: false,
+            },
+          ],
+          timestamp: Date.now(),
+        },
+      ];
+
+      const restored = fromStorageFormat(storedMessages as any);
+      expect(restored.length).toBe(1);
+
+      // Find the tool part (merged from tool_use + tool_result)
+      const toolPart = restored[0]!.parts.find(
+        (p: any) => p.type === "tool",
+      ) as any;
+      expect(toolPart).toBeTruthy();
+      expect(toolPart.screenshotUid).toBe(TEST_SCREENSHOT_UID);
+      // imageData is the placeholder, not a real data URL, so screenshot should NOT be set
+      expect(toolPart.screenshot).toBeUndefined();
+      expect(toolPart.state).toBe("completed");
+    });
+
+    it("should restore both screenshotUid and screenshot when real imageData is present", () => {
+      const storedOutput = {
+        success: true,
+        imageData: TEST_IMAGE_DATA,
+        sendToLLM: true,
+        screenshotUid: TEST_SCREENSHOT_UID,
+        tabId: 1,
+      };
+
+      const storedMessages = [
+        {
+          id: "msg-1",
+          role: "assistant" as const,
+          parts: [
+            {
+              type: "tool_use" as const,
+              id: "call-1",
+              name: "capture_screenshot",
+              input: { sendToLLM: true },
+            },
+            {
+              type: "tool_result" as const,
+              tool_use_id: "call-1",
+              content: JSON.stringify(storedOutput),
+              is_error: false,
+            },
+          ],
+          timestamp: Date.now(),
+        },
+      ];
+
+      const restored = fromStorageFormat(storedMessages as any);
+      const toolPart = restored[0]!.parts.find(
+        (p: any) => p.type === "tool",
+      ) as any;
+      expect(toolPart.screenshotUid).toBe(TEST_SCREENSHOT_UID);
+      expect(toolPart.screenshot).toBe(TEST_IMAGE_DATA);
+    });
+  });
+
+  describe("round-trip: toStorageFormat -> fromStorageFormat", () => {
+    it("should preserve screenshotUid through round-trip", () => {
+      const original = [
+        {
+          id: "msg-1",
+          role: "assistant" as const,
+          parts: [
+            {
+              type: "tool" as const,
+              toolCallId: "call-1",
+              toolName: "capture_screenshot",
+              input: { sendToLLM: true },
+              output: {
+                success: true,
+                imageData: TEST_IMAGE_DATA,
+                sendToLLM: true,
+                screenshotUid: TEST_SCREENSHOT_UID,
+                tabId: 1,
+              },
+              state: "completed" as const,
+              screenshot: TEST_IMAGE_DATA,
+              screenshotUid: TEST_SCREENSHOT_UID,
+            },
+          ],
+          timestamp: Date.now(),
+        },
+      ];
+
+      // Store -> Restore
+      const stored = toStorageFormat(original as any);
+      const restored = fromStorageFormat(stored);
+
+      const toolPart = restored[0]!.parts.find(
+        (p: any) => p.type === "tool",
+      ) as any;
+
+      // screenshotUid should survive the round-trip
+      expect(toolPart.screenshotUid).toBe(TEST_SCREENSHOT_UID);
+      // imageData was stripped during storage, so inline screenshot is gone
+      expect(toolPart.screenshot).toBeUndefined();
+      expect(toolPart.state).toBe("completed");
+      expect(toolPart.toolName).toBe("capture_screenshot");
+    });
+
+    it("should handle capture_tab_screenshot round-trip", () => {
+      const original = [
+        {
+          id: "msg-1",
+          role: "assistant" as const,
+          parts: [
+            {
+              type: "tool" as const,
+              toolCallId: "call-1",
+              toolName: "capture_tab_screenshot",
+              input: { tabId: 42, sendToLLM: true },
+              output: {
+                success: true,
+                imageData: TEST_IMAGE_DATA,
+                sendToLLM: true,
+                screenshotUid: TEST_SCREENSHOT_UID,
+                tabId: 42,
+              },
+              state: "completed" as const,
+              screenshot: TEST_IMAGE_DATA,
+              screenshotUid: TEST_SCREENSHOT_UID,
+            },
+          ],
+          timestamp: Date.now(),
+        },
+      ];
+
+      const stored = toStorageFormat(original as any);
+      const restored = fromStorageFormat(stored);
+
+      const toolPart = restored[0]!.parts.find(
+        (p: any) => p.type === "tool",
+      ) as any;
+      expect(toolPart.screenshotUid).toBe(TEST_SCREENSHOT_UID);
+      expect(toolPart.toolName).toBe("capture_tab_screenshot");
+    });
+  });
+});
--- a/packages/browser-ext/src/lib/message-adapter.ts
+++ b/packages/browser-ext/src/lib/message-adapter.ts
@@ -6,6 +6,90 @@
 import type { UIMessage as ReactUIMessage } from "@aipexstudio/aipex-react/types";
 import type { UIMessage as RuntimeUIMessage } from "@aipexstudio/browser-runtime";

+/** Tool names whose results may include screenshot image data */
+const SCREENSHOT_TOOL_NAMES = new Set([
+  "capture_screenshot",
+  "capture_screenshot_with_highlight",
+  "capture_tab_screenshot",
+]);
+
+/** Placeholder that replaces base64 imageData in stored tool results */
+const IMAGE_DATA_PLACEHOLDER =
+  "[Image data removed - see following user message]";
+
+interface ScreenshotToolInfo {
+  /** The base64 data URL if present (may be null if already stripped) */
+  imageData: string | null;
+  /** The screenshot uid if present */
+  screenshotUid: string | null;
+}
+
+/**
+ * Return the innermost payload of a parsed tool result: `data.data` when set,
+ * else `data`, else the result itself. Non-object input yields null.
+ */
+function getScreenshotActualData(
+  parsedOutput: unknown,
+): Record<string, unknown> | null {
+  if (parsedOutput === null || typeof parsedOutput !== "object") return null;
+  const root = parsedOutput as Record<string, unknown>;
+  const outer = root.data as Record<string, unknown> | null | undefined;
+  if (outer == null) return root;
+  const inner = outer.data as Record<string, unknown> | null | undefined;
+  return inner ?? outer;
+}
+
+/**
+ * Read the inline image and uid from a screenshot tool's parsed result.
+ * Yields null for non-screenshot tools or when neither value is present.
+ */
+function extractScreenshotInfo(
+  toolName: string,
+  parsedOutput: unknown,
+): ScreenshotToolInfo | null {
+  const payload = SCREENSHOT_TOOL_NAMES.has(toolName)
+    ? getScreenshotActualData(parsedOutput)
+    : null;
+  if (!payload) return null;
+
+  const { imageData: rawImage, screenshotUid: rawUid } = payload;
+  // Only a real data URL counts as image data; placeholder text does not.
+  const imageData =
+    typeof rawImage === "string" && rawImage.startsWith("data:image/") ? rawImage : null;
+  const screenshotUid = typeof rawUid === "string" ? rawUid : null;
+
+  if (!imageData && !screenshotUid) return null;
+  return { imageData, screenshotUid };
+}
+
+/**
+ * For screenshot tools, replace an inline `data:image/` URL in the JSON tool result
+ * with IMAGE_DATA_PLACEHOLDER and return the re-serialized JSON.
+ * Every other input (other tools, no data URL found) is returned unchanged.
+ */
+function stripImageDataFromToolOutput(
+  toolName: string,
+  content: string,
+): string {
+  if (!SCREENSHOT_TOOL_NAMES.has(toolName)) return content;
+
+  // A falsy parse result or a missing payload layer means there is
+  // nothing to strip.
+  const root = safeJsonParse<Record<string, unknown>>(content);
+  const payload = root ? getScreenshotActualData(root) : null;
+  if (!payload) return content;
+
+  const inlineImage = payload.imageData;
+  const isDataUrl =
+    typeof inlineImage === "string" && inlineImage.startsWith("data:image/");
+  if (!isDataUrl) return content;
+
+  // `payload` is a node inside `root`, so writing to it here changes what
+  // gets serialized on the next line.
+  payload.imageData = IMAGE_DATA_PLACEHOLDER;
+  return JSON.stringify(root);
+}
+
 /**
 * Convert aipex-react UIMessage to runtime UIMessage for storage
 */
@@ -15,7 +99,7 @@ export function toStorageFormat(
  return messages.map((msg) => ({
    id: msg.id,
    role: msg.role === "tool" ? "assistant" : msg.role, // Map "tool" to "assistant"
-    parts: msg.parts.map((part) => {
+    parts: msg.parts.flatMap((part) => {
      switch (part.type) {
        case "text":
          return { type: "text", text: part.text };
@@ -27,19 +111,37 @@ export function toStorageFormat(
            imageTitle: part.filename,
          };
        case "tool":
-          // Map tool to tool_use or tool_result based on state
+          // Map tool to tool_use + tool_result pair (when completed)
+          // or just tool_use (when pending/executing).
+          // Emitting both ensures fromStorageFormat can correlate them
+          // to restore the proper toolName and input.
          if (part.output !== undefined) {
-            // Avoid double-stringifying if output is already a string
-            const content =
+            // Avoid double-stringifying if output is already a string.
+            let content =
              typeof part.output === "string"
                ? part.output
                : JSON.stringify(part.output);
-            return {
-              type: "tool_result",
-              tool_use_id: part.toolCallId,
-              content,
-              is_error: part.state === "error",
-            };
+
+            // Strip base64 imageData from screenshot tool results before
+            // persisting to keep stored conversations small and avoid
+            // storing large blobs. The screenshotUid is preserved in the
+            // output so images can be loaded from IndexedDB on restore.
+            content = stripImageDataFromToolOutput(part.toolName, content);
+
+            return [
+              {
+                type: "tool_use",
+                id: part.toolCallId,
+                name: part.toolName,
+                input: part.input as Record<string, unknown>,
+              },
+              {
+                type: "tool_result",
+                tool_use_id: part.toolCallId,
+                content,
+                is_error: part.state === "error",
+              },
+            ];
          }
          return {
            type: "tool_use",
@@ -210,7 +312,11 @@ export function fromStorageFormat(
            };
          }

-          // Normal successful completion
+          // Normal successful completion – restore screenshot data
+          const screenshotInfo = extractScreenshotInfo(
+            toolName,
+            parsedOutput,
+          );
          return {
            type: "tool",
            toolName,
@@ -218,6 +324,15 @@ export function fromStorageFormat(
            input,
            output: parsedOutput,
            state: "completed" as const,
+            // Restore screenshotUid so UI can load from IndexedDB
+            ...(screenshotInfo?.screenshotUid
+              ? { screenshotUid: screenshotInfo.screenshotUid }
+              : {}),
+            // Restore inline screenshot only if actual base64 is present
+            // (not when it's been replaced with a placeholder)
+            ...(screenshotInfo?.imageData
+              ? { screenshot: screenshotInfo.imageData }
+              : {}),
          };
        }
        default:
--- a/packages/browser-runtime/src/index.ts
+++ b/packages/browser-runtime/src/index.ts
@@ -19,6 +19,8 @@ export type {
 } from "./lib/vm/zenfs-manager.js";
 // Virtual File System
 export { zenfs } from "./lib/vm/zenfs-manager.js";
+// Screenshot Storage (IndexedDB)
+export { RuntimeScreenshotStorage } from "./lib/screenshot-storage.js";
 export * from "./runtime/automation-mode.js";
 export * from "./runtime/browser-automation-host.js";
 export * from "./runtime/context-providers.js";
--- a/packages/browser-runtime/src/intervention/element-capture.ts
+++ b/packages/browser-runtime/src/intervention/element-capture.ts
@@ -10,6 +10,7 @@
 */

 import type { ElementCaptureEvent, ElementCaptureOptions } from "./types.js";
+import { captureVisibleTabWithElementCrop } from "../tools/screenshot-helpers.js";

 type CaptureCallback = (event: ElementCaptureEvent) => void;

@@ -232,34 +233,45 @@ export class ElementCaptureService {
  }

  /**
-   * Capture screenshot functionality (with highlight)
+   * Capture screenshot functionality (with highlight / element crop).
+   *
+   * Delegates to the shared `captureVisibleTabWithElementCrop` helper so that
+   * the element-rect resolution, DPR scaling, crop, and restricted-page
+   * checks are consistent with `captureScreenshotWithHighlightTool`.
+   *
+   * Falls back to a full-page screenshot if the selector cannot be resolved.
   */
  async captureScreenshot(
-    _selector: string,
-    _options?: {
+    selector: string,
+    options?: {
      cropToElement?: boolean;
      padding?: number;
    },
  ): Promise<string | null> {
    try {
-      // Use Chrome's captureVisibleTab API directly
      if (!this.currentTabId) {
-        console.warn("⚠️ [ElementCaptureService] No current tab for screenshot");
+        console.warn(
+          "⚠️ [ElementCaptureService] No current tab for screenshot",
+        );
        return null;
      }

-      // Get the tab to find its window ID
      const tab = await chrome.tabs.get(this.currentTabId);
      if (!tab.windowId) {
        console.warn("⚠️ [ElementCaptureService] No window ID for tab");
        return null;
      }

-      const screenshot = await chrome.tabs.captureVisibleTab(tab.windowId, {
-        format: "png",
+      const result = await captureVisibleTabWithElementCrop({
+        tabId: this.currentTabId,
+        windowId: tab.windowId,
+        tabUrl: tab.url,
+        selector,
+        cropToElement: options?.cropToElement ?? true,
+        padding: options?.padding ?? 50,
      });

-      return screenshot;
+      return result.dataUrl;
    } catch (error) {
      console.error("❌ [ElementCaptureService] Screenshot error:", error);
      return null;
--- a/packages/browser-runtime/src/lib/screenshot-storage.ts
+++ b/packages/browser-runtime/src/lib/screenshot-storage.ts
@@ -0,0 +1,186 @@
+/**
+ * Screenshot storage using IndexedDB.
+ * Stores screenshots with a uid for efficient reference and retrieval.
+ * Applies an LRU eviction policy (max 50 screenshots).
+ *
+ * Uses the same DB/store as the aipex ScreenshotStorage so both
+ * can share screenshots during the migration period.
+ */
+
+export interface ScreenshotData {
+  uid: string;
+  /** Complete data URL: data:image/png;base64,... */
+  base64Data: string;
+  timestamp: number;
+  tabId?: number;
+  metadata?: {
+    width: number;
+    height: number;
+    viewportWidth: number;
+    viewportHeight: number;
+  };
+}
+
+const DB_NAME = "aipex-screenshots-db";
+const DB_VERSION = 1;
+const STORE_NAME = "screenshots";
+const MAX_SCREENSHOTS = 50;
+
+let db: IDBDatabase | null = null;
+let initPromise: Promise<void> | null = null;
+
+/** Open the shared screenshot DB once; concurrent callers share one request. */
+function initialize(): Promise<void> {
+  if (initPromise) return initPromise;
+  if (db) return Promise.resolve();
+
+  initPromise = new Promise<void>((resolve, reject) => {
+    const request = indexedDB.open(DB_NAME, DB_VERSION);
+    request.onerror = () => {
+      initPromise = null;
+      reject(request.error);
+    };
+    request.onsuccess = () => {
+      const opened = request.result;
+      // If the browser closes the connection, drop the handle so we reopen.
+      opened.onclose = () => {
+        if (db === opened) db = null;
+      };
+      db = opened;
+      initPromise = null;
+      resolve();
+    };
+    request.onupgradeneeded = () => {
+      if (request.result.objectStoreNames.contains(STORE_NAME)) return;
+      const store = request.result.createObjectStore(STORE_NAME, {
+        keyPath: "uid",
+      });
+      store.createIndex("timestamp", "timestamp", { unique: false });
+      store.createIndex("tabId", "tabId", { unique: false });
+    };
+  });
+  return initPromise;
+}
+
+/** Build a uid shaped like screenshot_<epoch ms>_<random base-36 suffix>. */
+function generateUid(): string {
+  return `screenshot_${Date.now()}_${Math.random().toString(36).substring(2, 11)}`;
+}
+/** Keep only the newest MAX_SCREENSHOTS entries; awaits the delete commit. */
+async function applyLRU(): Promise<void> {
+  const database = db;
+  if (!database) return;
+  const readTx = database.transaction([STORE_NAME], "readonly");
+  const all: ScreenshotData[] = await new Promise((res, rej) => {
+    const req = readTx.objectStore(STORE_NAME).getAll();
+    req.onsuccess = () => res(req.result as ScreenshotData[]);
+    req.onerror = () => rej(req.error);
+  });
+  if (all.length <= MAX_SCREENSHOTS) return;
+  all.sort((a, b) => b.timestamp - a.timestamp); // newest first
+  await new Promise<void>((resolve, reject) => {
+    const delTx = database.transaction([STORE_NAME], "readwrite");
+    delTx.oncomplete = () => resolve();
+    delTx.onerror = () => reject(delTx.error);
+    delTx.onabort = () => reject(delTx.error);
+    const delStore = delTx.objectStore(STORE_NAME);
+    for (const item of all.slice(MAX_SCREENSHOTS)) delStore.delete(item.uid);
+  });
+}
+
+/**
+ * Runtime-level screenshot storage (for use inside browser-runtime tools).
+ * Shares the same IndexedDB database as the UI-level ScreenshotStorage
+ * in aipex-react so screenshots are accessible across packages.
+ */
+export const RuntimeScreenshotStorage = {
+  /**
+   * Save a screenshot and return its uid. Throws unless base64Data is a
+   * `data:image/` URL; resolves only once the write transaction has committed.
+   */
+  async saveScreenshot(
+    base64Data: string,
+    metadata?: {
+      tabId?: number;
+      width?: number;
+      height?: number;
+      viewportWidth?: number;
+      viewportHeight?: number;
+    },
+  ): Promise<string> {
+    if (
+      typeof base64Data !== "string" ||
+      !base64Data.startsWith("data:image/")
+    ) {
+      throw new Error("Invalid screenshot data: expected data:image/ URL");
+    }
+
+    await initialize();
+    if (!db) throw new Error("Database not initialized");
+
+    const uid = generateUid();
+    const entry: ScreenshotData = {
+      uid,
+      base64Data,
+      timestamp: Date.now(),
+      tabId: metadata?.tabId,
+      metadata: metadata
+        ? {
+            width: metadata.width ?? 0,
+            height: metadata.height ?? 0,
+            viewportWidth: metadata.viewportWidth ?? 0,
+            viewportHeight: metadata.viewportHeight ?? 0,
+          }
+        : undefined,
+    };
+
+    await new Promise<void>((resolve, reject) => {
+      const tx = db!.transaction([STORE_NAME], "readwrite");
+      // Settle on the transaction rather than the put request: the request can
+      // succeed and the commit still abort, in which case nothing was stored.
+      tx.oncomplete = () => resolve();
+      tx.onabort = () => reject(tx.error);
+      tx.objectStore(STORE_NAME).put(entry);
+    });
+
+    // Eviction is best-effort and not awaited; failures are logged, not thrown.
+    applyLRU().catch((err) =>
+      console.warn("[Screenshot] LRU eviction failed:", err),
+    );
+
+    return uid;
+  },
+
+  /**
+   * Get screenshot base64 data by uid; resolves to null when the uid is unknown.
+   */
+  async getScreenshot(uid: string): Promise<string | null> {
+    await initialize();
+    if (!db) throw new Error("Database not initialized");
+
+    return new Promise((resolve, reject) => {
+      const tx = db!.transaction([STORE_NAME], "readonly");
+      const req = tx.objectStore(STORE_NAME).get(uid);
+      req.onsuccess = () => {
+        const data = req.result as ScreenshotData | undefined;
+        resolve(data?.base64Data ?? null);
+      };
+      req.onerror = () => reject(req.error);
+    });
+  },
+
+  /**
+   * Clear all screenshots; resolves once the clearing transaction has committed.
+   */
+  async clearAll(): Promise<void> {
+    await initialize();
+    if (!db) throw new Error("Database not initialized");
+
+    await new Promise<void>((resolve, reject) => {
+      const tx = db!.transaction([STORE_NAME], "readwrite");
+      tx.oncomplete = () => resolve();
+      tx.onabort = () => reject(tx.error);
+      tx.objectStore(STORE_NAME).clear();
+    });
+  },
+};
--- a/packages/browser-runtime/src/tools/index.ts
+++ b/packages/browser-runtime/src/tools/index.ts
@@ -15,7 +15,18 @@ import {
  highlightTextInlineTool,
  scrollToElementTool,
 } from "./page";
-import { captureScreenshotTool, captureTabScreenshotTool } from "./screenshot";
+import {
+  captureScreenshotTool,
+  captureScreenshotWithHighlightTool,
+  captureTabScreenshotTool,
+} from "./screenshot";
+// Clipboard image tools – available but not registered in the default bundle.
+// Enable explicitly if the product decides to ship clipboard access.
+// import {
+//   captureScreenshotToClipboardTool,
+//   readClipboardImageTool,
+//   getClipboardImageInfoTool,
+// } from "./screenshot";
 import { skillTools } from "./skill";
 import { searchElementsTool } from "./snapshot";
 import {
@@ -30,13 +41,15 @@ import { downloadChatImagesTool, downloadImageTool } from "./tools/downloads";

 /**
 * All browser tools registered for AI use
- * Total: 31 tools (27 core + 4 intervention tools)
+ * Total: 32 tools (28 core + 4 intervention tools)
 *
 * Disabled tools (per aipex):
 * - switch_to_tab (causes context switching issues)
 * - duplicate_tab (not in aipex)
 * - wait (replaced by computer tool's wait action)
- * - capture_screenshot_to_clipboard (not enabled in aipex)
+ * - capture_screenshot_to_clipboard (not enabled in aipex default bundle)
+ * - read_clipboard_image (P1 clipboard tool – not enabled by default; requires security review)
+ * - get_clipboard_image_info (P1 clipboard tool – not enabled by default; requires security review)
 * - download_text_as_markdown (not enabled in aipex)
 * - download_current_chat_images (architecture issue, not enabled in aipex)
 * - organize_tabs (stub implementation, temporarily disabled until AI grouping is complete)
@@ -72,8 +85,9 @@ const browserFunctionTools: BrowserFunctionTool[] = [
  highlightElementTool,
  highlightTextInlineTool,

-  // Screenshot (2 tools)
+  // Screenshot (3 tools)
  captureScreenshotTool,
+  captureScreenshotWithHighlightTool,
  captureTabScreenshotTool,

  // Download (2 tools)
--- a/packages/browser-runtime/src/tools/screenshot-helpers.ts
+++ b/packages/browser-runtime/src/tools/screenshot-helpers.ts
@@ -0,0 +1,210 @@
+/**
+ * Shared screenshot helpers.
+ *
+ * This module is intentionally kept free of imports from `./index` or any
+ * module that participates in the tools ↔ screenshot circular-import chain.
+ * Both `captureScreenshotWithHighlightTool` (in screenshot.ts) and
+ * `ElementCaptureService` (in intervention/element-capture.ts) import from
+ * here without triggering a cycle.
+ */
+
+/** Maximum padding in pixels */
+export const MAX_PADDING = 200;
+
+// ===================== Image utilities =====================
+
+/**
+ * Cut a rectangular region out of an image data URL and return the result
+ * as a new PNG data URL, using a detached canvas element.
+ *
+ * @param dataUrl - Image to crop, as a data URL.
+ * @param region - Source rectangle, in image pixels.
+ * @returns PNG data URL of the cropped region.
+ */
+export async function cropImage(
+  dataUrl: string,
+  region: { x: number; y: number; width: number; height: number },
+): Promise<string> {
+  return new Promise((resolve, reject) => {
+    const source = new Image();
+
+    source.onerror = () => reject(new Error("Failed to load image"));
+    source.onload = () => {
+      const canvas = document.createElement("canvas");
+      const context = canvas.getContext("2d");
+      if (!context) {
+        reject(new Error("Failed to get canvas context"));
+        return;
+      }
+
+      // Size the canvas to the region so the copied pixels fill it exactly.
+      const { x, y, width, height } = region;
+      canvas.width = width;
+      canvas.height = height;
+
+      // Copy region (x, y, width, height) of the source to the canvas origin.
+      context.drawImage(source, x, y, width, height, 0, 0, width, height);
+
+      resolve(canvas.toDataURL("image/png", 0.9));
+    };
+
+    // Start loading only after both handlers are attached.
+    source.src = dataUrl;
+  });
+}
+
+// ===================== Shared capture helper =====================
+
+/**
+ * Options for the shared capture + element-crop helper.
+ */
+export interface CaptureWithElementCropOptions {
+  tabId: number;
+  windowId: number;
+  tabUrl?: string;
+  /** CSS selector of the element to focus on. Max length enforced by callers. */
+  selector?: string;
+  /** Whether to crop the screenshot to the element bounding box (plus padding). */
+  cropToElement?: boolean;
+  /** Padding around the element in CSS pixels when cropping (default 50, max 200). */
+  padding?: number;
+}
+
+/**
+ * Result returned by the shared capture helper.
+ */
+export interface CaptureWithElementCropResult {
+  /** The captured (and optionally cropped) image as a data URL. */
+  dataUrl: string;
+  /** True if the image was actually cropped to the element. */
+  cropped: boolean;
+  /** True if the selector matched an element on the page. */
+  elementFound: boolean;
+}
+
+/**
+ * Core logic for capturing the visible tab and optionally cropping to an
+ * element identified by CSS selector.
+ *
+ * This is shared by `captureScreenshotWithHighlightTool` (the agent-facing
+ * tool) and `ElementCaptureService.captureScreenshot` so that both use the
+ * same element-rect resolution, DPR scaling, and crop logic.
+ *
+ * Security notes:
+ * - Rejects browser-internal pages (chrome://, chrome-extension://, edge://,
+ *   about:) when `tabUrl` is provided.
+ * - Selector length must be bounded by the caller (tool uses zod `.max()`).
+ * - Padding is clamped to [0, MAX_PADDING].
+ *
+ * `cropped` in the result is true only when a crop was actually applied; an
+ * element whose padded box lies outside the captured image leaves it false.
+ */
+export async function captureVisibleTabWithElementCrop(
+  options: CaptureWithElementCropOptions,
+): Promise<CaptureWithElementCropResult> {
+  const {
+    tabId,
+    windowId,
+    tabUrl,
+    selector,
+    cropToElement = false,
+    padding = 50,
+  } = options;
+
+  // Reject restricted pages
+  if (
+    tabUrl &&
+    (tabUrl.startsWith("chrome://") ||
+      tabUrl.startsWith("chrome-extension://") ||
+      tabUrl.startsWith("edge://") ||
+      tabUrl.startsWith("about:"))
+  ) {
+    throw new Error("Cannot capture browser internal pages");
+  }
+
+  // Clamp padding to safe range
+  const safePadding = Math.max(0, Math.min(padding, MAX_PADDING));
+
+  // If a selector is provided, resolve the element rect via content script
+  let elementRect: {
+    x: number;
+    y: number;
+    width: number;
+    height: number;
+    devicePixelRatio: number;
+  } | null = null;
+
+  if (selector) {
+    try {
+      const result = await chrome.scripting.executeScript({
+        target: { tabId },
+        func: (sel: string) => {
+          const element = document.querySelector(sel);
+          if (!element) return null;
+
+          const rect = element.getBoundingClientRect();
+          const dpr = window.devicePixelRatio || 1;
+
+          return {
+            x: rect.x * dpr,
+            y: rect.y * dpr,
+            width: rect.width * dpr,
+            height: rect.height * dpr,
+            devicePixelRatio: dpr,
+          };
+        },
+        args: [selector],
+      });
+
+      if (result[0]?.result) {
+        elementRect = result[0].result;
+      }
+    } catch (err) {
+      console.warn("[Screenshot] Failed to get element rect:", err);
+      // Continue with full-page screenshot if selector fails
+    }
+  }
+
+  // Focus window and capture
+  await chrome.windows.update(windowId, { focused: true });
+  await new Promise((resolve) => setTimeout(resolve, 100));
+
+  let dataUrl = await chrome.tabs.captureVisibleTab(windowId, {
+    format: "png",
+    quality: 90,
+  });
+
+  if (!dataUrl || !dataUrl.startsWith("data:image/")) {
+    throw new Error("Invalid image data captured");
+  }
+
+  let cropped = false; // flipped only once a crop has really been applied
+
+  if (cropToElement && elementRect) {
+    const scaledPadding = safePadding * (elementRect.devicePixelRatio || 1);
+
+    // Load image to get actual dimensions for bounds checking
+    const img = new Image();
+    await new Promise<void>((resolve, reject) => {
+      img.onload = () => resolve();
+      img.onerror = () => reject(new Error("Failed to load image for crop"));
+      img.src = dataUrl;
+    });
+
+    // Intersect the padded element box with the image, clamping all four
+    // edges so a partly off-screen element does not widen the crop elsewhere.
+    const { x: ex, y: ey, width: ew, height: eh } = elementRect;
+    const x0 = Math.max(0, Math.round(ex - scaledPadding));
+    const y0 = Math.max(0, Math.round(ey - scaledPadding));
+    const x1 = Math.min(img.width, Math.round(ex + ew + scaledPadding));
+    const y1 = Math.min(img.height, Math.round(ey + eh + scaledPadding));
+    const width = x1 - x0;
+    const height = y1 - y0;
+
+    if (width > 0 && height > 0) {
+      dataUrl = await cropImage(dataUrl, { x: x0, y: y0, width, height });
+      cropped = true;
+    }
+  }
+
+  return { dataUrl, cropped, elementFound: !!elementRect };
+}
--- a/packages/browser-runtime/src/tools/screenshot.ts
+++ b/packages/browser-runtime/src/tools/screenshot.ts
@@ -1,8 +1,20 @@
 import { tool } from "@aipexstudio/aipex-core";
 import { z } from "zod";
 import { cacheScreenshotMetadata } from "../automation/computer";
+import { RuntimeScreenshotStorage } from "../lib/screenshot-storage";
 import { getAutomationMode } from "../runtime/automation-mode";
 import { getActiveTab } from "./index";
+import {
+  captureVisibleTabWithElementCrop,
+  MAX_PADDING,
+} from "./screenshot-helpers.js";
+
+// Re-export the shared helper types/function so existing consumers aren't broken
+export type {
+  CaptureWithElementCropOptions,
+  CaptureWithElementCropResult,
+} from "./screenshot-helpers.js";
+export { captureVisibleTabWithElementCrop } from "./screenshot-helpers.js";

 async function compressImage(
  dataUrl: string,
@@ -93,15 +105,25 @@ export const captureScreenshotTool = tool({
      throw new Error("Invalid image data captured");
    }

-    // Get viewport dimensions for metadata caching
-    const viewportDimensions = await chrome.scripting.executeScript({
-      target: { tabId: tab.id },
-      func: () => ({
-        width: window.innerWidth,
-        height: window.innerHeight,
-      }),
-    });
-    const viewport = viewportDimensions[0]?.result;
+    // Get viewport dimensions for metadata caching (graceful degradation)
+    let viewport: { width: number; height: number } | undefined;
+    try {
+      const viewportDimensions = await chrome.scripting.executeScript({
+        target: { tabId: tab.id },
+        func: () => ({
+          width: window.innerWidth,
+          height: window.innerHeight,
+        }),
+      });
+      viewport = viewportDimensions[0]?.result ?? undefined;
+    } catch (e) {
+      console.warn("[Screenshot] Failed to get viewport dimensions:", e);
+      // Continue without viewport metadata – screenshot still works
+    }
+
+    // Get image dimensions for metadata
+    let imageWidth = 0;
+    let imageHeight = 0;

    if (sendToLLM) {
      // Compress for LLM
@@ -114,6 +136,8 @@ export const captureScreenshotTool = tool({
        img.onerror = reject;
        img.src = dataUrl;
      });
+      imageWidth = img.width;
+      imageHeight = img.height;

      // Cache screenshot metadata for computer tool
      if (viewport) {
@@ -125,12 +149,50 @@ export const captureScreenshotTool = tool({
          viewport.height,
        );
      }
+    } else {
+      // Get original image dimensions for non-LLM screenshots
+      const img = new Image();
+      await new Promise((resolve, reject) => {
+        img.onload = resolve;
+        img.onerror = reject;
+        img.src = dataUrl;
+      });
+      imageWidth = img.width;
+      imageHeight = img.height;
+    }
+
+    // Save screenshot to IndexedDB and get uid
+    let screenshotUid: string | undefined;
+    try {
+      screenshotUid = await RuntimeScreenshotStorage.saveScreenshot(dataUrl, {
+        tabId: tab.id,
+        width: imageWidth,
+        height: imageHeight,
+        viewportWidth: viewport?.width ?? 0,
+        viewportHeight: viewport?.height ?? 0,
+      });
+    } catch (err) {
+      console.error("[Screenshot] Failed to save to IndexedDB:", err);
+      // Continue even if storage fails
+    }
+
+    if (sendToLLM) {
+      return {
+        success: true,
+        imageData: dataUrl,
+        sendToLLM: true,
+        screenshotUid,
+        tabId: tab.id,
+        url: tab.url,
+        title: tab.title,
+      };
    }

    return {
      success: true,
-      imageData: sendToLLM ? dataUrl : undefined,
-      captured: !sendToLLM,
+      captured: true,
+      sendToLLM: false,
+      screenshotUid,
      tabId: tab.id,
      url: tab.url,
      title: tab.title,
@@ -177,15 +239,25 @@ export const captureTabScreenshotTool = tool({
      quality: 90,
    });

-    // Get viewport dimensions for metadata caching
-    const viewportDimensions = await chrome.scripting.executeScript({
-      target: { tabId },
-      func: () => ({
-        width: window.innerWidth,
-        height: window.innerHeight,
-      }),
-    });
-    const viewport = viewportDimensions[0]?.result;
+    // Get viewport dimensions for metadata caching (graceful degradation)
+    let viewport: { width: number; height: number } | undefined;
+    try {
+      const viewportDimensions = await chrome.scripting.executeScript({
+        target: { tabId },
+        func: () => ({
+          width: window.innerWidth,
+          height: window.innerHeight,
+        }),
+      });
+      viewport = viewportDimensions[0]?.result ?? undefined;
+    } catch (e) {
+      console.warn("[Screenshot] Failed to get viewport dimensions:", e);
+      // Continue without viewport metadata – screenshot still works
+    }
+
+    // Get image dimensions for metadata
+    let imageWidth = 0;
+    let imageHeight = 0;

    if (sendToLLM) {
      // Compress for LLM
@@ -198,6 +270,8 @@ export const captureTabScreenshotTool = tool({
        img.onerror = reject;
        img.src = dataUrl;
      });
+      imageWidth = img.width;
+      imageHeight = img.height;

      // Cache screenshot metadata for computer tool
      if (viewport) {
@@ -209,12 +283,50 @@ export const captureTabScreenshotTool = tool({
          viewport.height,
        );
      }
+    } else {
+      // Get original image dimensions for non-LLM screenshots
+      const img = new Image();
+      await new Promise((resolve, reject) => {
+        img.onload = resolve;
+        img.onerror = reject;
+        img.src = dataUrl;
+      });
+      imageWidth = img.width;
+      imageHeight = img.height;
+    }
+
+    // Save screenshot to IndexedDB and get uid
+    let screenshotUid: string | undefined;
+    try {
+      screenshotUid = await RuntimeScreenshotStorage.saveScreenshot(dataUrl, {
+        tabId,
+        width: imageWidth,
+        height: imageHeight,
+        viewportWidth: viewport?.width ?? 0,
+        viewportHeight: viewport?.height ?? 0,
+      });
+    } catch (err) {
+      console.error("[Screenshot] Failed to save to IndexedDB:", err);
+      // Continue even if storage fails
+    }
+
+    if (sendToLLM) {
+      return {
+        success: true,
+        imageData: dataUrl,
+        sendToLLM: true,
+        screenshotUid,
+        tabId,
+        url: tab.url,
+        title: tab.title,
+      };
    }

    return {
      success: true,
-      imageData: sendToLLM ? dataUrl : undefined,
-      captured: !sendToLLM,
+      captured: true,
+      sendToLLM: false,
+      screenshotUid,
      tabId,
      url: tab.url,
      title: tab.title,
@@ -222,6 +334,169 @@ export const captureTabScreenshotTool = tool({
  },
 });

+/** Maximum allowed CSS selector length to prevent injection of excessively long strings */
+const MAX_SELECTOR_LENGTH = 500;
+
+// ===================== Tool definition =====================
+/** Agent tool: screenshot the visible tab, optionally cropped to one element. */
+export const captureScreenshotWithHighlightTool = tool({
+  name: "capture_screenshot_with_highlight",
+  description:
+    "Capture screenshot of the current visible tab, optionally highlighting and cropping to a specific element identified by CSS selector. The screenshot is sent to the LLM for visual analysis unless sendToLLM is false. NOTE: This tool requires focus mode.",
+  parameters: z.object({
+    selector: z
+      .string()
+      .max(MAX_SELECTOR_LENGTH)
+      .optional()
+      .describe("CSS selector of element to highlight/focus on"),
+    cropToElement: z
+      .boolean()
+      .optional()
+      .default(false)
+      .describe(
+        "Whether to crop the screenshot to the element region (plus padding)",
+      ),
+    padding: z
+      .number()
+      .min(0)
+      .max(MAX_PADDING)
+      .optional()
+      .default(50)
+      .describe("Padding around element in pixels when cropping (default: 50)"),
+    sendToLLM: z
+      .boolean()
+      .nullable()
+      .optional()
+      .default(true)
+      .describe(
+        "Whether to send the screenshot to LLM for visual analysis. Defaults to true.",
+      ),
+  }),
+  execute: async ({
+    selector,
+    cropToElement = false,
+    padding = 50,
+    sendToLLM = true,
+  }) => {
+    const mode = await getAutomationMode();
+    console.log(
+      "🔧 [captureScreenshotWithHighlight] Automation mode:",
+      mode,
+    );
+
+    if (mode === "background") {
+      throw new Error(
+        "Screenshot capture is disabled in background mode. Please switch to focus mode to use visual tools.",
+      );
+    }
+
+    const tab = await getActiveTab();
+    if (!tab.id || !tab.windowId) {
+      throw new Error("No active tab found");
+    }
+
+    // Delegate to shared helper for capture + element crop
+    const capture = await captureVisibleTabWithElementCrop({
+      tabId: tab.id,
+      windowId: tab.windowId,
+      tabUrl: tab.url,
+      selector,
+      cropToElement,
+      padding,
+    });
+    let { dataUrl } = capture;
+
+    // Get viewport dimensions (graceful degradation)
+    let viewport: { width: number; height: number } | undefined;
+    try {
+      const viewportDimensions = await chrome.scripting.executeScript({
+        target: { tabId: tab.id },
+        func: () => ({
+          width: window.innerWidth,
+          height: window.innerHeight,
+        }),
+      });
+      viewport = viewportDimensions[0]?.result ?? undefined;
+    } catch (e) {
+      console.warn(
+        "[ScreenshotHighlight] Failed to get viewport dimensions:",
+        e,
+      );
+    }
+
+    if (sendToLLM) {
+      // Compress for LLM
+      dataUrl = await compressImage(dataUrl, 0.6, 1024);
+    }
+
+    // Extract image dimensions
+    const finalImg = new Image();
+    await new Promise<void>((resolve, reject) => {
+      finalImg.onload = () => resolve();
+      finalImg.onerror = () => reject(new Error("Failed to load image"));
+      finalImg.src = dataUrl;
+    });
+    const imageWidth = finalImg.width;
+    const imageHeight = finalImg.height;
+
+    // Cache image/viewport dimensions for the computer tool. Skipped for crops:
+    // NOTE(review): a cropped image no longer spans the viewport, so pairing its
+    // size with the full viewport size would presumably skew coordinate mapping.
+    if (sendToLLM && viewport && !capture.cropped) {
+      cacheScreenshotMetadata(
+        tab.id,
+        imageWidth,
+        imageHeight,
+        viewport.width,
+        viewport.height,
+      );
+    }
+
+    // Save screenshot to IndexedDB
+    let screenshotUid: string | undefined;
+    try {
+      screenshotUid = await RuntimeScreenshotStorage.saveScreenshot(dataUrl, {
+        tabId: tab.id,
+        width: imageWidth,
+        height: imageHeight,
+        viewportWidth: viewport?.width ?? 0,
+        viewportHeight: viewport?.height ?? 0,
+      });
+    } catch (err) {
+      console.error(
+        "[ScreenshotHighlight] Failed to save to IndexedDB:",
+        err,
+      );
+    }
+
+    if (sendToLLM) {
+      return {
+        success: true,
+        imageData: dataUrl,
+        sendToLLM: true,
+        screenshotUid,
+        tabId: tab.id,
+        url: tab.url,
+        title: tab.title,
+        selector: selector ?? undefined,
+        cropped: capture.cropped,
+      };
+    }
+
+    return {
+      success: true,
+      captured: true,
+      sendToLLM: false,
+      screenshotUid,
+      tabId: tab.id,
+      url: tab.url,
+      title: tab.title,
+      selector: selector ?? undefined,
+      cropped: capture.cropped,
+    };
+  },
+});
+
 export const captureScreenshotToClipboardTool = tool({
  name: "capture_screenshot_to_clipboard",
  description:
@@ -267,3 +542,83 @@ export const captureScreenshotToClipboardTool = tool({
    };
  },
 });
+
+// ===================== Clipboard image tools (P1) =====================
+
+/**
+ * Tool: returns the first image found on the system clipboard as a data URL.
+ * Yields `{ success: false, error }` when no image exists or the read fails.
+ */
+export const readClipboardImageTool = tool({
+  name: "read_clipboard_image",
+  description:
+    "Read an image from the system clipboard and return it as a base64 data URL. " +
+    "Useful for inspecting images the user has copied. Returns an error if no image is present.",
+  parameters: z.object({}),
+  execute: async () => {
+    try {
+      for (const entry of await navigator.clipboard.read()) {
+        const mimeType = entry.types.find((t) => t.startsWith("image/"));
+        if (mimeType === undefined) continue;
+        const imageBlob = await entry.getType(mimeType);
+        // FileReader reports completion via event handlers; wrap in a promise.
+        const imageData = await new Promise<string>((resolve, reject) => {
+          const fileReader = new FileReader();
+          fileReader.onerror = () => {
+            reject(new Error("Failed to read image data"));
+          };
+          fileReader.onload = () => {
+            resolve(fileReader.result as string);
+          };
+          fileReader.readAsDataURL(imageBlob);
+        });
+        return {
+          success: true,
+          imageData,
+        };
+      }
+
+      return { success: false, error: "No image found in clipboard" };
+    } catch (error: unknown) {
+      const detail = error instanceof Error ? error.message : String(error);
+      return {
+        success: false,
+        error: `Failed to read clipboard: ${detail}`,
+      };
+    }
+  },
+});
+
+/**
+ * Tool: reports whether the system clipboard holds an image and, if so, the
+ * first `image/*` MIME type listed. Only the advertised types are inspected.
+ */
+export const getClipboardImageInfoTool = tool({
+  name: "get_clipboard_image_info",
+  description:
+    "Check whether the system clipboard contains an image, and if so return " +
+    "its MIME type. Does NOT read the full image data.",
+  parameters: z.object({}),
+  execute: async () => {
+    try {
+      // Stop at the first clipboard entry that advertises an image type.
+      for (const entry of await navigator.clipboard.read()) {
+        const imageType = entry.types.find((t) => t.startsWith("image/"));
+        if (imageType !== undefined) {
+          return {
+            success: true,
+            hasImage: true,
+            imageType,
+          };
+        }
+      }
+      return { success: true, hasImage: false };
+    } catch (error: unknown) {
+      const detail = error instanceof Error ? error.message : String(error);
+      return {
+        success: false,
+        error: `Failed to read clipboard: ${detail}`,
+      };
+    }
+  },
+});
--- a/packages/core/src/conversation/manager.ts
+++ b/packages/core/src/conversation/manager.ts
@@ -7,6 +7,7 @@ import type {
  SessionTree,
 } from "../types.js";
 import { generateId } from "../utils/id-generator.js";
+import { pruneTransientScreenshotItems } from "../utils/screenshot-shaping.js";
 import type { ConversationCompressor } from "./compressor.js";
 import { Session } from "./session.js";

@@ -87,7 +88,10 @@ export class ConversationManager {
  }

  private async doCompress(session: Session): Promise<{ summary: string }> {
-    const items = await session.getItems();
+    // Prune transient screenshot user-image messages before compression
+    // to avoid sending large base64 blobs to the compressor/LLM.
+    const rawItems = await session.getItems();
+    const items = pruneTransientScreenshotItems(rawItems);
    const { summary, compressedItems } =
      await this.compressor!.compressItems(items);

--- a/packages/core/src/conversation/session.ts
+++ b/packages/core/src/conversation/session.ts
@@ -8,6 +8,11 @@ import type {
  SessionSummary,
 } from "../types.js";
 import { generateId } from "../utils/id-generator.js";
+import {
+  isTransientScreenshotItem,
+  pruneTransientScreenshotItems,
+  shapeScreenshotItems,
+} from "../utils/screenshot-shaping.js";

 function createEmptySessionMetrics(): SessionMetrics {
  return {
@@ -53,7 +58,11 @@ export class Session implements OpenAISession {
  }

  async addItems(items: AgentInputItem[]): Promise<void> {
-    this.items.push(...items);
+    // Shape screenshot tool results: strip base64 imageData from the tool
+    // result and inject a transient user message with the real image so the
+    // model can consume it via the standard vision path.
+    const shaped = shapeScreenshotItems(items);
+    this.items.push(...shaped);
    this.metadata["lastActiveAt"] = Date.now();
    this.updatePreview();
  }
@@ -156,7 +165,12 @@ export class Session implements OpenAISession {
  private updatePreview(): void {
    const latestUserMessage = [...this.items]
      .reverse()
-      .find((item) => item.type === "message" && item.role === "user");
+      .find(
+        (item) =>
+          item.type === "message" &&
+          item.role === "user" &&
+          !isTransientScreenshotItem(item),
+      );

    const previewSource =
      this.extractContent(latestUserMessage) ??
@@ -207,7 +221,9 @@ export class Session implements OpenAISession {
  toJSON(): SerializedSession {
    return {
      id: this.id,
-      items: this.items,
+      // Prune transient screenshot user-image messages before persisting
+      // to avoid storing large base64 blobs in conversation history.
+      items: pruneTransientScreenshotItems(this.items),
      metadata: this.metadata,
      config: this.config,
      metrics: this.sessionMetrics,
--- a/packages/core/src/utils/index.ts
+++ b/packages/core/src/utils/index.ts
@@ -3,3 +3,9 @@
 */

 export { CancellationError, CancellationToken } from "./cancellation-token.js";
+export {
+  isTransientScreenshotItem,
+  pruneTransientScreenshotItems,
+  shapeScreenshotItems,
+  TRANSIENT_SCREENSHOT_MARKER,
+} from "./screenshot-shaping.js";
--- a/packages/core/src/utils/screenshot-shaping.test.ts
+++ b/packages/core/src/utils/screenshot-shaping.test.ts
@@ -0,0 +1,296 @@
+import type { AgentInputItem } from "@openai/agents";
+import { describe, expect, it } from "vitest";
+import {
+  isTransientScreenshotItem,
+  pruneTransientScreenshotItems,
+  shapeScreenshotItems,
+  TRANSIENT_SCREENSHOT_MARKER,
+} from "./screenshot-shaping.js";
+
+// --- Helpers ---
+
+const TEST_IMAGE_DATA = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQ=="; // short `data:image/jpeg` URL standing in for a screenshot
+const TEST_SCREENSHOT_UID = "screenshot_1234567890_abcdefghi"; // arbitrary uid expected to survive shaping
+const PLACEHOLDER = "[Image data removed - see following user message]"; // must equal the non-exported IMAGE_DATA_PLACEHOLDER in screenshot-shaping.ts
+
+/** Builds a `capture_screenshot` result; `overrides` replace default fields. */
+function createScreenshotToolResult(
+  overrides: Record<string, unknown> = {},
+): AgentInputItem {
+  const defaults = {
+    success: true,
+    imageData: TEST_IMAGE_DATA,
+    sendToLLM: true,
+    screenshotUid: TEST_SCREENSHOT_UID,
+    tabId: 1,
+    url: "https://example.com",
+    title: "Example",
+  };
+  return {
+    type: "function_call_result",
+    name: "capture_screenshot",
+    callId: "call_abc123",
+    output: JSON.stringify({ ...defaults, ...overrides }),
+  } as AgentInputItem;
+}
+
+/**
+ * Builds a `get_tabs` result, i.e. output from a non-screenshot tool.
+ */
+function createNonScreenshotToolResult(): AgentInputItem {
+  const output = JSON.stringify({ tabs: [{ id: 1, title: "Tab" }] });
+  const fields = { name: "get_tabs", callId: "call_other", output };
+  return { type: "function_call_result", ...fields } as AgentInputItem;
+}
+
+/**
+ * Builds a plain user message whose content is the given text.
+ */
+function createUserMessage(text: string): AgentInputItem {
+  const item: AgentInputItem = { type: "message", role: "user", content: text };
+  return item;
+}
+
+// --- Tests ---
+
+describe("shapeScreenshotItems", () => {
+  it("should strip imageData and inject transient user image message for sendToLLM=true", () => {
+    const items = [createScreenshotToolResult()];
+    const shaped = shapeScreenshotItems(items);
+
+    expect(shaped.length).toBe(2); // rewritten tool result + injected user message
+
+    // First item: the tool result, with imageData swapped for the placeholder
+    const toolResult = shaped[0] as { type: string; output: string };
+    expect(toolResult.type).toBe("function_call_result");
+    const parsed = JSON.parse(toolResult.output);
+    expect(parsed.success).toBe(true);
+    expect(parsed.imageData).toBe(PLACEHOLDER);
+    expect(parsed.screenshotUid).toBe(TEST_SCREENSHOT_UID);
+    expect(parsed.sendToLLM).toBe(true);
+
+    // Second item: injected user message tagged with the transient marker
+    const userMsg = shaped[1] as {
+      type: string;
+      role: string;
+      content: Array<{ type: string; text?: string; image?: string }>;
+      providerData?: Record<string, unknown>;
+    };
+    expect(userMsg.type).toBe("message");
+    expect(userMsg.role).toBe("user");
+    expect(userMsg.providerData?.[TRANSIENT_SCREENSHOT_MARKER]).toBe(true);
+
+    // The injected message must carry a text part and the original image data
+    const textPart = userMsg.content.find((c) => c.type === "input_text");
+    const imagePart = userMsg.content.find((c) => c.type === "input_image");
+    expect(textPart).toBeTruthy();
+    expect(imagePart).toBeTruthy();
+    expect((imagePart as { image: string }).image).toBe(TEST_IMAGE_DATA);
+  });
+
+  it("should pass through items when sendToLLM=false", () => {
+    const items = [
+      createScreenshotToolResult({
+        sendToLLM: false,
+        imageData: undefined, // JSON.stringify omits undefined values, so the output has no imageData key
+        captured: true,
+      }),
+    ];
+    const shaped = shapeScreenshotItems(items);
+
+    // No user image message is injected and the item itself is untouched
+    expect(shaped.length).toBe(1);
+    expect(shaped[0]).toEqual(items[0]);
+  });
+
+  it("should pass through non-screenshot tools unchanged", () => {
+    const items = [createNonScreenshotToolResult()];
+    const shaped = shapeScreenshotItems(items);
+
+    expect(shaped.length).toBe(1);
+    expect(shaped[0]).toEqual(items[0]);
+  });
+
+  it("should pass through non-tool items unchanged", () => {
+    const items = [createUserMessage("hello")];
+    const shaped = shapeScreenshotItems(items);
+
+    expect(shaped.length).toBe(1);
+    expect(shaped[0]).toEqual(items[0]);
+  });
+
+  it("should handle capture_tab_screenshot the same way", () => {
+    const toolResult = createScreenshotToolResult();
+    (toolResult as { name: string }).name = "capture_tab_screenshot"; // same payload, different tool name
+    const shaped = shapeScreenshotItems([toolResult]);
+
+    expect(shaped.length).toBe(2);
+    expect((shaped[0] as { type: string }).type).toBe("function_call_result");
+    expect((shaped[1] as { type: string; role: string }).role).toBe("user");
+  });
+
+  it("should handle capture_screenshot_with_highlight the same way", () => {
+    const output = {
+      success: true,
+      imageData: TEST_IMAGE_DATA,
+      sendToLLM: true,
+      screenshotUid: TEST_SCREENSHOT_UID,
+      tabId: 1,
+      url: "https://example.com",
+      title: "Example",
+      selector: ".my-element",
+      cropped: true,
+    };
+    const item: AgentInputItem = {
+      type: "function_call_result",
+      name: "capture_screenshot_with_highlight",
+      callId: "call_highlight",
+      output: JSON.stringify(output),
+    } as AgentInputItem;
+
+    const shaped = shapeScreenshotItems([item]);
+
+    expect(shaped.length).toBe(2);
+
+    // First item: the tool result, with imageData swapped for the placeholder
+    const toolResult = shaped[0] as { type: string; output: string };
+    expect(toolResult.type).toBe("function_call_result");
+    const parsed = JSON.parse(toolResult.output);
+    expect(parsed.success).toBe(true);
+    expect(parsed.imageData).toBe(PLACEHOLDER);
+    expect(parsed.screenshotUid).toBe(TEST_SCREENSHOT_UID);
+    expect(parsed.sendToLLM).toBe(true);
+
+    // Second item: injected user message tagged with the transient marker
+    const userMsg = shaped[1] as {
+      type: string;
+      role: string;
+      content: Array<{ type: string; text?: string; image?: string }>;
+      providerData?: Record<string, unknown>;
+    };
+    expect(userMsg.type).toBe("message");
+    expect(userMsg.role).toBe("user");
+    expect(userMsg.providerData?.[TRANSIENT_SCREENSHOT_MARKER]).toBe(true);
+    const imagePart = userMsg.content.find((c) => c.type === "input_image");
+    expect(imagePart).toBeTruthy();
+    expect((imagePart as { image: string }).image).toBe(TEST_IMAGE_DATA);
+  });
+
+  it("should pass through capture_screenshot_with_highlight when sendToLLM=false", () => {
+    const output = {
+      success: true,
+      captured: true,
+      sendToLLM: false,
+      screenshotUid: TEST_SCREENSHOT_UID,
+      tabId: 1,
+      selector: ".my-element",
+      cropped: true,
+    };
+    const item: AgentInputItem = {
+      type: "function_call_result",
+      name: "capture_screenshot_with_highlight",
+      callId: "call_highlight_no_llm",
+      output: JSON.stringify(output),
+    } as AgentInputItem;
+
+    const shaped = shapeScreenshotItems([item]);
+
+    // No imageData + sendToLLM=false → pass through unchanged
+    expect(shaped.length).toBe(1);
+    expect(shaped[0]).toEqual(item);
+  });
+
+  it("should handle mixed items correctly", () => {
+    const items = [
+      createUserMessage("Take a screenshot"),
+      createNonScreenshotToolResult(),
+      createScreenshotToolResult(),
+      createUserMessage("What do you see?"),
+    ];
+    const shaped = shapeScreenshotItems(items);
+
+    // Original 4 items + 1 injected user image = 5
+    expect(shaped.length).toBe(5);
+
+    // Relative order is kept; the image message sits right after its tool result
+    expect((shaped[0] as { role: string }).role).toBe("user");
+    expect((shaped[1] as { name: string }).name).toBe("get_tabs");
+    expect((shaped[2] as { type: string }).type).toBe("function_call_result");
+    expect(
+      (shaped[3] as { providerData?: Record<string, unknown> }).providerData?.[
+        TRANSIENT_SCREENSHOT_MARKER
+      ],
+    ).toBe(true);
+    expect((shaped[4] as { role: string }).role).toBe("user");
+  });
+
+  it("should handle nested data structure", () => {
+    const output = {
+      success: true,
+      data: {
+        success: true,
+        imageData: TEST_IMAGE_DATA,
+        sendToLLM: true,
+        screenshotUid: TEST_SCREENSHOT_UID,
+      },
+    };
+    const item: AgentInputItem = {
+      type: "function_call_result",
+      name: "capture_screenshot",
+      callId: "call_nested",
+      output: JSON.stringify(output),
+    } as AgentInputItem;
+
+    const shaped = shapeScreenshotItems([item]);
+    expect(shaped.length).toBe(2);
+
+    const parsedOutput = JSON.parse(
+      (shaped[0] as { output: string }).output,
+    );
+    expect(parsedOutput.success).toBe(true);
+    expect(parsedOutput.data.imageData).toBe(PLACEHOLDER); // placeholder stays inside the `data` wrapper
+    expect(parsedOutput.data.screenshotUid).toBe(TEST_SCREENSHOT_UID);
+  });
+});
+
+// Pruning drops marker-tagged items and leaves untagged ones alone.
+describe("pruneTransientScreenshotItems", () => {
+  it("should remove transient screenshot items", () => {
+    const plain = createUserMessage("hello");
+    const tagged: AgentInputItem = {
+      type: "message",
+      role: "user",
+      content: [
+        { type: "input_text", text: "screenshot" },
+        { type: "input_image", image: TEST_IMAGE_DATA, detail: "auto" },
+      ],
+      providerData: { [TRANSIENT_SCREENSHOT_MARKER]: true },
+    } as AgentInputItem;
+
+    // Only the untagged message should survive.
+    const kept = pruneTransientScreenshotItems([plain, tagged]);
+    expect(kept.length).toBe(1);
+    expect(kept[0]).toEqual(plain);
+  });
+
+  it("should keep all items when no transients exist", () => {
+    const untagged = ["a", "b"].map((text) => createUserMessage(text));
+    expect(pruneTransientScreenshotItems(untagged).length).toBe(2);
+  });
+});
+
+describe("isTransientScreenshotItem", () => {
+  it("should return true for transient items", () => {
+    // The marker key under providerData is what flags the item.
+    const providerData = { [TRANSIENT_SCREENSHOT_MARKER]: true };
+    const fields = { type: "message", role: "user", content: "test" };
+    const tagged = { ...fields, providerData } as unknown as AgentInputItem;
+    expect(isTransientScreenshotItem(tagged)).toBe(true);
+  });
+
+  it("should return false for normal items", () => {
+    // A message built by createUserMessage carries no providerData.
+    const untagged = createUserMessage("hello");
+    expect(isTransientScreenshotItem(untagged)).toBe(false);
+  });
+});
--- a/packages/core/src/utils/screenshot-shaping.ts
+++ b/packages/core/src/utils/screenshot-shaping.ts
@@ -0,0 +1,199 @@
+/**
+ * Screenshot message shaping utilities.
+ *
+ * When a screenshot tool returns `sendToLLM=true`, the large base64 imageData
+ * must NOT be sent inside the function_call_result output (models may not
+ * support images there, and it bloats token counts).
+ *
+ * Instead, the imageData is:
+ * 1. Stripped from the tool result (replaced with a placeholder string).
+ * 2. Injected as a follow-up user message with `input_image` content.
+ *
+ * This matches the message flow used in the original aipex codebase.
+ */
+
+import type { AgentInputItem } from "@openai/agents";
+import { safeJsonParse } from "./json.js";
+
+/** Allow-list of tool names whose results are inspected for screenshot image data. */
+const SCREENSHOT_TOOL_NAMES = new Set([
+  "capture_screenshot",
+  "capture_screenshot_with_highlight",
+  "capture_tab_screenshot",
+]);
+
+/** Text written in place of `imageData` in the rewritten tool result. */
+const IMAGE_DATA_PLACEHOLDER =
+  "[Image data removed - see following user message]";
+
+/** `providerData` key set to `true` on injected user-image messages so they can be pruned. */
+export const TRANSIENT_SCREENSHOT_MARKER = "__transient_screenshot__";
+
+/**
+ * Rewrite a batch of agent input items so screenshot images reach the model
+ * as a user image message instead of inside the tool output.
+ *
+ * For each `function_call_result` whose `name` is in `SCREENSHOT_TOOL_NAMES`
+ * and whose JSON `output` yields image data via `extractImageData`:
+ *   - the result is re-emitted with `imageData` replaced by a placeholder;
+ *   - a user message holding the real image is inserted right after it,
+ *     tagged with `TRANSIENT_SCREENSHOT_MARKER` under `providerData`.
+ * Every other item is forwarded as-is, in its original position.
+ *
+ * @param items - Items to shape; the array and its items are not mutated.
+ * @returns A new array with the shaped items in order.
+ */
+export function shapeScreenshotItems(
+  items: AgentInputItem[],
+): AgentInputItem[] {
+  const result: AgentInputItem[] = [];
+
+  for (const item of items) {
+    if (item.type !== "function_call_result") {
+      result.push(item);
+      continue;
+    }
+
+    const funcResult = item as {
+      type: "function_call_result";
+      name: string;
+      callId: string;
+      output: string;
+      [key: string]: unknown;
+    };
+
+    if (!SCREENSHOT_TOOL_NAMES.has(funcResult.name)) {
+      result.push(item);
+      continue;
+    }
+
+    // Try to parse the output and extract imageData
+    const parsed = safeJsonParse<Record<string, unknown>>(funcResult.output);
+    if (!parsed) {
+      result.push(item);
+      continue;
+    }
+
+    const extracted = extractImageData(parsed);
+    if (!extracted) {
+      // No sendToLLM image data – pass through
+      result.push(item);
+      continue;
+    }
+
+    // 1. Rewrite the tool result with imageData stripped
+    const strippedOutput = buildStrippedOutput(parsed, extracted.screenshotUid);
+    result.push({
+      ...item,
+      output: JSON.stringify(strippedOutput),
+    } as AgentInputItem);
+
+    // 2. Insert a transient user message carrying the real image. The text
+    //    is fixed: only names in SCREENSHOT_TOOL_NAMES get this far, and all
+    //    of them use this wording.
+    const userImageMessage: AgentInputItem = {
+      type: "message",
+      role: "user",
+      content: [
+        { type: "input_text", text: "Here is the screenshot you requested:" },
+        {
+          type: "input_image",
+          image: extracted.imageData,
+          detail: "auto",
+        },
+      ],
+      // Mark as transient so it can be pruned before persistence/compression
+      providerData: { [TRANSIENT_SCREENSHOT_MARKER]: true },
+    } as AgentInputItem;
+    result.push(userImageMessage);
+  }
+
+  return result;
+}
+
+/**
+ * Return a copy of `items` without transient screenshot user-image messages.
+ * Delegates to `isTransientScreenshotItem` so pruning and detection cannot
+ * disagree about what counts as transient.
+ *
+ * @param items - Items to filter; the input array is not modified.
+ * @returns A new array holding only the non-transient items, in order.
+ */
+export function pruneTransientScreenshotItems(
+  items: AgentInputItem[],
+): AgentInputItem[] {
+  return items.filter((item) => !isTransientScreenshotItem(item));
+}
+
+/** True when `item.providerData` has a truthy `TRANSIENT_SCREENSHOT_MARKER`. */
+export function isTransientScreenshotItem(item: AgentInputItem): boolean {
+  const { providerData } = item as {
+    providerData?: Record<string, unknown>;
+  };
+  return Boolean(providerData?.[TRANSIENT_SCREENSHOT_MARKER]);
+}
+
+// ===================== Internal helpers =====================
+
+/** Image payload pulled out of a screenshot tool result. */
+interface ExtractedImage {
+  imageData: string;
+  screenshotUid?: string;
+}
+
+/**
+ * Pull the image payload out of a parsed screenshot tool output.
+ *
+ * The payload is read from `parsed.data` unless that is null/undefined, in
+ * which case `parsed` itself is used. This covers both result shapes:
+ *   { success, data: { imageData, sendToLLM, screenshotUid } }
+ *   { success, imageData, sendToLLM, screenshotUid }
+ *
+ * @returns The image data (with `screenshotUid` when it is a string), or
+ *   `null` unless `success` is truthy, `sendToLLM` is exactly `true`, and
+ *   `imageData` is a string starting with `data:image/`.
+ */
+function extractImageData(
+  parsed: Record<string, unknown>,
+): ExtractedImage | null {
+  if (!parsed.success) return null;
+
+  const payload =
+    (parsed.data as Record<string, unknown> | undefined) ?? parsed;
+  const { imageData, screenshotUid, sendToLLM } = payload;
+
+  if (sendToLLM !== true || typeof imageData !== "string") return null;
+  if (!imageData.startsWith("data:image/")) return null;
+
+  return {
+    imageData,
+    screenshotUid:
+      typeof screenshotUid === "string" ? screenshotUid : undefined,
+  };
+}
+
+/**
+ * Build the tool output that replaces the original once the image is removed.
+ *
+ * `imageData` becomes a placeholder on the level that held it (`parsed.data`
+ * when present, otherwise `parsed`); every other field is carried over,
+ * including top-level siblings of a `data` wrapper.
+ *
+ * @param screenshotUid - When truthy, written next to the placeholder.
+ * @returns A new object with the same nesting as `parsed`; `parsed` is untouched.
+ */
+function buildStrippedOutput(
+  parsed: Record<string, unknown>,
+  screenshotUid?: string,
+): Record<string, unknown> {
+  const data = parsed.data as Record<string, unknown> | undefined;
+  const stripped: Record<string, unknown> = {
+    ...(data ?? parsed),
+    imageData: IMAGE_DATA_PLACEHOLDER,
+  };
+  if (screenshotUid) stripped.screenshotUid = screenshotUid;
+  // Spread `parsed` first so fields beside the `data` wrapper are not lost.
+  return data
+    ? { ...parsed, success: true, data: stripped }
+    : { success: true, ...stripped };
+}