feat: enhance screenshot handling and display in chatbot components

- Added functionality to extract and apply screenshots from tool results in the ChatAdapter, improving user experience by allowing immediate rendering of screenshots.
- Implemented a new ToolScreenshot component to display screenshots inline, supporting both base64 data and IndexedDB references.
- Updated message item rendering to transform screenshot placeholders into actual images, enhancing the visual feedback for users.
- Introduced collapsible message items for intermediate assistant messages, improving the organization of conversation turns in the message list.
- Enhanced model fetching logic in the Chatbot component to ensure server models are prioritized, improving model selection reliability.
- Updated localization files to include new translation keys for improved user guidance.
This commit is contained in:
ropzislaw
2026-02-15 13:44:03 +08:00
parent c5b6371223
commit 343f6fa146
29 changed files with 2782 additions and 110 deletions

View File

@@ -1,5 +1,11 @@
import type { AgentEvent } from "@aipexstudio/aipex-core";
import { generateId } from "@aipexstudio/aipex-core";
import { ScreenshotStorage } from "../lib/screenshot-storage";
import {
extractScreenshotFromToolResult,
isCaptureScreenshotTool,
type ScreenshotExtraction,
} from "../lib/screenshot-utils";
import type {
ChatAdapterOptions,
ChatAdapterState,
@@ -412,6 +418,18 @@ export class ChatAdapter {
return;
}
// Extract screenshot data from screenshot tools
if (isCaptureScreenshotTool(toolName)) {
const screenshotInfo = extractScreenshotFromToolResult(
toolName,
result,
);
if (screenshotInfo) {
this.applyScreenshotToolResult(callId, result, screenshotInfo);
return;
}
}
this.updateToolPart(callId, (toolPart) => ({
...toolPart,
state: "completed",
@@ -419,6 +437,59 @@ export class ChatAdapter {
}));
}
/**
 * Mark a screenshot tool call as completed and attach its image reference.
 *
 * Three cases, in priority order:
 *  1. The tool supplied a `screenshotUid` (it already saved the image to
 *     IndexedDB): reuse that uid, and keep the inline image too when present
 *     so it can render immediately.
 *  2. No uid but inline image data: show the inline image right away and
 *     save it to UI-side storage in the background; the uid is attached to
 *     the tool part once the save resolves.
 *  3. Neither uid nor image data: just complete the tool part.
 *
 * @param callId - Identifier of the tool call whose part is updated.
 * @param result - Raw tool result, stored unchanged as the part's `output`.
 * @param info - Screenshot details extracted from `result`.
 */
private applyScreenshotToolResult(
  callId: string,
  result: unknown,
  info: ScreenshotExtraction,
): void {
  // Local consts keep their narrowed (non-null) type inside the closures
  // below, so no non-null assertions are needed.
  const { screenshotUid, imageData } = info;

  if (screenshotUid) {
    // Tool already saved to IndexedDB — use its uid directly
    this.updateToolPart(callId, (toolPart) => ({
      ...toolPart,
      state: "completed",
      output: result,
      screenshotUid,
      // Keep inline screenshot for immediate rendering if base64 is present
      ...(imageData ? { screenshot: imageData } : {}),
    }));
    return;
  }

  if (!imageData) {
    // No uid and no image data — nothing to attach, just complete
    this.updateToolPart(callId, (toolPart) => ({
      ...toolPart,
      state: "completed",
      output: result,
    }));
    return;
  }

  // Fallback: tool didn't provide a uid (storage failure) — save in UI
  this.updateToolPart(callId, (toolPart) => ({
    ...toolPart,
    state: "completed",
    output: result,
    screenshot: imageData,
  }));
  void ScreenshotStorage.saveScreenshot(imageData)
    .then((uid) => {
      this.updateToolPart(callId, (toolPart) => ({
        ...toolPart,
        screenshotUid: uid,
      }));
    })
    .catch(() => {
      // Storage failed — screenshot still visible via inline data
    });
}
/**
* Check if a tool result indicates a business-level failure.
* Many tools return { success: false, error: "..." } instead of throwing.

View File

@@ -9,7 +9,8 @@ import {
WrenchIcon,
XCircleIcon,
} from "lucide-react";
import type { ComponentProps, ReactNode } from "react";
import { type ComponentProps, type ReactNode, useEffect, useState } from "react";
import { ScreenshotStorage } from "../../lib/screenshot-storage";
import { cn } from "../../lib/utils";
import { Badge } from "../ui/badge";
import {
@@ -29,7 +30,8 @@ export const Tool = ({ className, ...props }: ToolProps) => (
);
export type ToolHeaderProps = {
type: ToolUIPart["type"];
/** Display label for the tool — either a raw `tool-${name}` key or a translated name */
type: string;
state: ToolUIPart["state"] | "executing";
className?: string;
};
@@ -154,3 +156,78 @@ export const ToolOutput = ({
</div>
);
};
// ============ Screenshot Display ============
/**
 * Props for ToolScreenshot: the standard `div` props plus the two optional
 * ways of identifying the image. When neither is provided the component
 * renders nothing; when both are provided the inline `screenshot` is used.
 */
export type ToolScreenshotProps = ComponentProps<"div"> & {
/** Inline base64 screenshot data URL; used directly as the image source */
screenshot?: string;
/** UID referencing a screenshot stored in ScreenshotStorage (IndexedDB); loaded asynchronously when no inline data is given */
screenshotUid?: string;
};
/**
 * ToolScreenshot renders a screenshot captured by a tool.
 *
 * Supports both inline base64 data and IndexedDB uid references. Inline
 * data wins when both are supplied; otherwise the image is loaded from
 * ScreenshotStorage by uid, showing a loading row while pending and an
 * error line if the lookup fails or finds nothing. Renders nothing when
 * neither prop is set.
 */
export const ToolScreenshot = ({
  className,
  screenshot,
  screenshotUid,
  ...props
}: ToolScreenshotProps) => {
  const [imageData, setImageData] = useState<string | null>(
    screenshot ?? null,
  );
  const [loading, setLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);

  useEffect(() => {
    // Set by the cleanup below so that a lookup started for a previous
    // prop combination (or a now-unmounted component) cannot overwrite
    // newer state when it eventually settles.
    let cancelled = false;

    if (screenshot) {
      // Prefer inline screenshot; clear anything left from an earlier load
      setImageData(screenshot);
      setError(null);
      setLoading(false);
    } else if (screenshotUid) {
      // Load from IndexedDB by uid
      setLoading(true);
      setError(null);
      ScreenshotStorage.getScreenshot(screenshotUid)
        .then((data) => {
          if (cancelled) return;
          setImageData(data);
          if (!data) setError("Screenshot not found");
        })
        .catch(() => {
          if (!cancelled) setError("Failed to load screenshot");
        })
        .finally(() => {
          if (!cancelled) setLoading(false);
        });
    } else {
      // Both references were removed — drop stale image/error state
      setImageData(null);
      setError(null);
      setLoading(false);
    }

    return () => {
      cancelled = true;
    };
  }, [screenshot, screenshotUid]);

  if (!screenshot && !screenshotUid) return null;

  return (
    <div className={cn("space-y-2 p-4", className)} {...props}>
      <h4 className="font-medium text-muted-foreground text-xs uppercase tracking-wide">
        Screenshot
      </h4>
      {loading ? (
        <div className="flex items-center gap-2 text-muted-foreground text-sm">
          <ClockIcon className="size-4 animate-spin" />
          <span>Loading screenshot...</span>
        </div>
      ) : error ? (
        <div className="text-destructive text-sm">{error}</div>
      ) : imageData ? (
        <img
          src={imageData}
          alt="Screenshot"
          className="cursor-pointer rounded-md max-w-full"
        />
      ) : null}
    </div>
  );
};

View File

@@ -1,6 +1,7 @@
import { useCallback, useContext, useMemo, useState } from "react";
import { useCallback, useContext, useEffect, useMemo, useState } from "react";
import { useChat, useChatConfig } from "../../../hooks";
import { useTranslation } from "../../../i18n/context";
import { fetchModelsForSelector } from "../../../lib/models";
import { cn } from "../../../lib/utils";
import type { ChatbotThemeVariables, ContextItem } from "../../../types";
import { DEFAULT_MODELS } from "../constants";
@@ -237,6 +238,27 @@ function ChatbotContent({
const [inputResetCount, setInputResetCount] = useState(0);
const [isUxAuditDialogOpen, setIsUxAuditDialogOpen] = useState(false);
// Fetch server model list on mount, fall back to prop-provided models
const [fetchedModels, setFetchedModels] = useState<
Array<{ name: string; value: string }> | null
>(null);
useEffect(() => {
let cancelled = false;
fetchModelsForSelector()
.then((serverModels) => {
if (!cancelled && serverModels.length > 0) {
setFetchedModels(serverModels);
}
})
.catch(() => {
// Fallback to prop-provided models — already used below
});
return () => {
cancelled = true;
};
}, []);
const effectiveModels = fetchedModels ?? models;
const handleSubmit = useCallback(
(text: string, files?: File[], contexts?: ContextItem[]) => {
void sendMessage?.(text, files, contexts);
@@ -318,7 +340,7 @@ function ChatbotContent({
onSubmit={handleSubmit}
onStop={interrupt}
status={status || "idle"}
models={models}
models={effectiveModels}
placeholderTexts={placeholderTexts}
/>
</>

View File

@@ -1,7 +1,15 @@
import { CopyIcon, RefreshCcwIcon } from "lucide-react";
import { Fragment } from "react";
import { CopyIcon, RefreshCcwIcon, WrenchIcon } from "lucide-react";
import { Fragment, useMemo } from "react";
import { useTranslation } from "../../../i18n/context";
import { translatedToolName } from "../../../i18n/tool-names";
import { transformScreenshotPlaceholders } from "../../../lib/screenshot-utils";
import { cn } from "../../../lib/utils";
import type { MessageItemProps, UISourceUrlPart } from "../../../types";
import type {
MessageItemProps,
UIMessage,
UISourceUrlPart,
UIToolPart,
} from "../../../types";
import { Action, Actions } from "../../ai-elements/actions";
import { Message, MessageContent } from "../../ai-elements/message";
import {
@@ -55,6 +63,21 @@ export function DefaultMessageItem({
return null;
}
// Collect screenshot data from tool parts for placeholder resolution
const { screenshotUidList, screenshotDataMap } = useMemo(() => {
const uids: string[] = [];
const dataMap = new Map<string, string>();
for (const p of message.parts) {
if (p.type === "tool" && p.screenshotUid) {
uids.push(p.screenshotUid);
if (p.screenshot) {
dataMap.set(p.screenshotUid, p.screenshot);
}
}
}
return { screenshotUidList: uids, screenshotDataMap: dataMap };
}, [message.parts]);
// Render sources if present
const sourceUrls = message.parts.filter(
(part): part is UISourceUrlPart => part.type === "source-url",
@@ -79,12 +102,27 @@ export function DefaultMessageItem({
const key = `${message.id}-${i}`;
switch (part.type) {
case "text":
case "text": {
// Transform [[screenshot:...]] placeholders to markdown images.
// First resolve to special URLs, then replace with actual
// base64 data URLs when available for inline rendering.
let processedText = part.text;
if (screenshotUidList.length > 0) {
processedText = transformScreenshotPlaceholders(
processedText,
screenshotUidList,
);
// Replace aipex-screenshot.invalid URLs with actual data
for (const [uid, data] of screenshotDataMap) {
const placeholder = `https://aipex-screenshot.invalid/${uid}`;
processedText = processedText.split(placeholder).join(data);
}
}
return (
<Fragment key={key}>
<Message from={message.role as "user" | "assistant" | "system"}>
<MessageContent>
<Response>{part.text}</Response>
<Response>{processedText}</Response>
</MessageContent>
</Message>
{/* Actions for last assistant message */}
@@ -112,6 +150,7 @@ export function DefaultMessageItem({
))}
</Fragment>
);
}
case "file":
return (
@@ -241,6 +280,59 @@ export function DefaultMessageItem({
);
}
// ============ Collapsed tool display for folded messages ============
/**
 * Compact single-line representation of a tool call: a wrench icon followed
 * by the tool's translated display name.
 */
function CollapsedToolDisplay(props: { tool: UIToolPart }) {
  const translation = useTranslation();
  const label = translatedToolName(translation.t, props.tool.toolName);

  return (
    <div className="text-xs text-muted-foreground py-1 px-2 flex items-center gap-1.5">
      <WrenchIcon className="size-3" />
      {label}
    </div>
  );
}
// ============ Collapsed message item for intermediate assistant messages ============
/** Maximum number of reasoning characters shown before the preview is cut. */
const COLLAPSED_REASONING_MAX_CHARS = 120;

/**
 * Shorten reasoning text for the collapsed view, appending an ellipsis when
 * text was actually cut so a truncated preview is distinguishable from a
 * complete one.
 */
function previewReasoning(text: string): string {
  return text.length > COLLAPSED_REASONING_MAX_CHARS
    ? `${text.slice(0, COLLAPSED_REASONING_MAX_CHARS)}…`
    : text;
}

/**
 * CollapsedMessageItem — simplified rendering for intermediate assistant
 * messages inside a folded "thinking details" section.
 *
 * Text parts are rendered as "- " prefixed lines, tool parts as compact
 * single-line displays, and reasoning parts as a short italic preview.
 * Any other part type is not rendered.
 */
export function CollapsedMessageItem({ message }: { message: UIMessage }) {
  return (
    <div>
      {message.parts.map((part, i) => {
        const key = `${message.id}-collapsed-${i}`;
        switch (part.type) {
          case "text":
            return (
              <div key={key} className="text-sm text-muted-foreground py-1">
                - {part.text}
              </div>
            );
          case "tool":
            return <CollapsedToolDisplay key={key} tool={part} />;
          case "reasoning":
            return (
              <div
                key={key}
                className="text-xs text-muted-foreground/70 py-0.5 italic"
              >
                {previewReasoning(part.text)}
              </div>
            );
          default:
            return null;
        }
      })}
    </div>
  );
}
/**
* MessageItem - Renders either custom or default message item
*/

View File

@@ -1,15 +1,55 @@
import { BrainIcon, ChevronDownIcon } from "lucide-react";
import { useMemo } from "react";
import { useTranslation } from "../../../i18n/context";
import { cn } from "../../../lib/utils";
import type { MessageListProps } from "../../../types";
import type { MessageListProps, UIMessage } from "../../../types";
import {
Conversation,
ConversationContent,
ConversationScrollButton,
} from "../../ai-elements/conversation";
import { Loader } from "../../ai-elements/loader";
import {
Collapsible,
CollapsibleContent,
CollapsibleTrigger,
} from "../../ui/collapsible";
import { useComponentsContext } from "../context";
import { MessageItem } from "./message-item";
import { CollapsedMessageItem, MessageItem } from "./message-item";
import { WelcomeScreen } from "./welcome-screen";
/**
 * A conversation turn: one optional user message followed by the assistant
 * messages produced before the next user message.
 */
interface ConversationTurn {
/** The user message that opened the turn; absent when assistant messages appear before any user message */
userMessage?: UIMessage;
/** Assistant messages of this turn in their original order; may be empty */
assistantMessages: UIMessage[];
}
/**
 * Partition a flat message list into conversation turns so intermediate
 * assistant messages (thinking / tool-call steps) can be collapsed.
 *
 * Every user message opens a new turn. Assistant messages are appended to
 * the most recent turn; if none exists yet, a turn without a user message
 * is created for them. Messages with any other role are ignored.
 */
function groupIntoTurns(messages: UIMessage[]): ConversationTurn[] {
  const result: ConversationTurn[] = [];

  for (const msg of messages) {
    if (msg.role === "user") {
      result.push({ userMessage: msg, assistantMessages: [] });
      continue;
    }
    if (msg.role !== "assistant") continue;

    let latest = result[result.length - 1];
    if (!latest) {
      // Assistant output before any user message: open a user-less turn
      latest = { assistantMessages: [] };
      result.push(latest);
    }
    latest.assistantMessages.push(msg);
  }

  return result;
}
/**
* Default MessageList component
*/
@@ -27,10 +67,18 @@ export function DefaultMessageList({
onUxAuditClick?: () => void;
}) {
const { slots } = useComponentsContext();
const { t } = useTranslation();
// Filter out system messages for display
const displayMessages = messages.filter((m) => m.role !== "system");
// Group into conversation turns for folding
const turns = useMemo(() => groupIntoTurns(displayMessages), [displayMessages]);
// Determine if a message is the very last display message
const lastMessage = displayMessages[displayMessages.length - 1];
const lastMessageId = lastMessage?.id ?? null;
return (
<div className={cn("flex-1 overflow-hidden", className)} {...props}>
<Conversation className="h-full">
@@ -45,15 +93,78 @@ export function DefaultMessageList({
onUxAuditClick={onUxAuditClick}
/>
) : (
displayMessages.map((message, index) => (
<MessageItem
key={message.id}
message={message}
isLast={index === displayMessages.length - 1}
isStreaming={status === "streaming"}
onRegenerate={onRegenerate}
onCopy={onCopy}
/>
turns.map((turn, turnIndex) => (
<div key={`turn-${turnIndex}`}>
{/* Render user message */}
{turn.userMessage && (
<MessageItem
key={turn.userMessage.id}
message={turn.userMessage}
isLast={turn.userMessage.id === lastMessageId}
isStreaming={status === "streaming"}
onRegenerate={onRegenerate}
onCopy={onCopy}
/>
)}
{/* Render assistant messages with folding */}
{turn.assistantMessages.length > 1 ? (
(() => {
const finalMsg =
turn.assistantMessages[
turn.assistantMessages.length - 1
]!;
return (
<>
{/* Intermediate messages collapsed by default */}
<Collapsible defaultOpen={false} className="mb-2">
<CollapsibleTrigger className="flex w-full cursor-pointer items-center gap-2 rounded-md border border-muted bg-muted/30 px-3 py-2 text-sm text-muted-foreground transition-colors hover:bg-muted/50 hover:text-foreground">
<BrainIcon className="size-4" />
<span className="flex-1 text-left">
{t("common.showThinkingDetails")}
</span>
<ChevronDownIcon className="size-4 transition-transform [[data-state=open]>&]:rotate-180" />
</CollapsibleTrigger>
<CollapsibleContent className="mt-2">
<div className="rounded-md border border-muted/50 bg-muted/10 p-3 space-y-2">
{turn.assistantMessages
.slice(0, -1)
.map((msg) => (
<CollapsedMessageItem
key={msg.id}
message={msg}
/>
))}
</div>
</CollapsibleContent>
</Collapsible>
{/* Final assistant message always expanded */}
<MessageItem
key={finalMsg.id}
message={finalMsg}
isLast={finalMsg.id === lastMessageId}
isStreaming={status === "streaming"}
onRegenerate={onRegenerate}
onCopy={onCopy}
/>
</>
);
})()
) : (
// Single assistant message — render normally
turn.assistantMessages.map((msg) => (
<MessageItem
key={msg.id}
message={msg}
isLast={msg.id === lastMessageId}
isStreaming={status === "streaming"}
onRegenerate={onRegenerate}
onCopy={onCopy}
/>
))
)}
</div>
))
)}
{/* Loading indicator */}

View File

@@ -1,5 +1,6 @@
import type React from "react";
import { useEffect, useState } from "react";
import { useCallback, useEffect, useState } from "react";
import { fetchModelsForPrompt } from "../../../lib/models";
export interface ModelInfo {
id: string;
@@ -46,24 +47,36 @@ export const ModelChangePrompt: React.FC<ModelChangePromptProps> = ({
const [allModels, setAllModels] = useState<ModelInfo[]>(availableModels);
const [isLoadingModels, setIsLoadingModels] = useState(false);
// Fetch models from API
useEffect(() => {
const loadModels = async () => {
if (!onFetchModels) return;
// Resolve the fetch function: use the provided callback or fall back to
// the built-in fetchModelsForPrompt so models are always loaded.
const resolvedFetch = useCallback(
() => (onFetchModels ? onFetchModels() : fetchModelsForPrompt()),
[onFetchModels],
);
// Fetch models from API (always runs — no longer gated on onFetchModels)
useEffect(() => {
let cancelled = false;
const loadModels = async () => {
setIsLoadingModels(true);
try {
const fetchedModels = await onFetchModels();
setAllModels(fetchedModels);
} catch (error) {
console.error("Failed to load models:", error);
const fetched = await resolvedFetch();
if (!cancelled) {
setAllModels(fetched);
}
} catch (_error) {
// Keep using availableModels as fallback
} finally {
setIsLoadingModels(false);
if (!cancelled) {
setIsLoadingModels(false);
}
}
};
loadModels();
}, [onFetchModels]);
return () => {
cancelled = true;
};
}, [resolvedFetch]);
// Update models when availableModels prop changes
useEffect(() => {

View File

@@ -4,6 +4,8 @@ import {
WrenchIcon,
XCircleIcon,
} from "lucide-react";
import { useTranslation } from "../../../../i18n/context";
import { translatedToolName } from "../../../../i18n/tool-names";
import { cn } from "../../../../lib/utils";
import type { ToolDisplaySlotProps } from "../../../../types";
import { Response } from "../../../ai-elements/response";
@@ -13,6 +15,7 @@ import {
ToolHeader,
ToolInput,
ToolOutput,
ToolScreenshot,
} from "../../../ai-elements/tool";
import {
Collapsible,
@@ -26,13 +29,15 @@ import { formatToolOutput, mapToolState } from "../../tools";
* Opens by default when there's an error so users can see the failure reason
*/
export function DefaultToolDisplay({ tool }: ToolDisplaySlotProps) {
const { t } = useTranslation();
const displayName = translatedToolName(t, tool.toolName);
// Expand by default when in error state to make failure reasons visible
const shouldExpandByDefault = tool.state === "error";
return (
<Tool defaultOpen={shouldExpandByDefault}>
<ToolHeader
type={`tool-${tool.toolName}`}
type={displayName}
state={mapToolState(tool.state)}
/>
<ToolContent>
@@ -45,6 +50,10 @@ export function DefaultToolDisplay({ tool }: ToolDisplaySlotProps) {
}
errorText={tool.errorText}
/>
<ToolScreenshot
screenshot={tool.screenshot}
screenshotUid={tool.screenshotUid}
/>
</ToolContent>
</Tool>
);
@@ -55,6 +64,8 @@ export function DefaultToolDisplay({ tool }: ToolDisplaySlotProps) {
* Opens by default when there's an error so users can see the failure reason
*/
export function CompactToolDisplay({ tool }: ToolDisplaySlotProps) {
const { t } = useTranslation();
const displayName = translatedToolName(t, tool.toolName);
const getStatusIcon = () => {
switch (tool.state) {
case "pending":
@@ -75,7 +86,7 @@ export function CompactToolDisplay({ tool }: ToolDisplaySlotProps) {
<Collapsible defaultOpen={shouldExpandByDefault}>
<CollapsibleTrigger className="flex items-center gap-2 w-full p-2 rounded-md hover:bg-muted/50 transition-colors">
{getStatusIcon()}
<span className="text-sm font-medium">{tool.toolName}</span>
<span className="text-sm font-medium">{displayName}</span>
{tool.duration && (
<span className="text-xs text-muted-foreground ml-auto">
{tool.duration}ms
@@ -118,6 +129,8 @@ export function CompactToolDisplay({ tool }: ToolDisplaySlotProps) {
* Minimal tool display (just status indicator)
*/
export function MinimalToolDisplay({ tool }: ToolDisplaySlotProps) {
const { t } = useTranslation();
const displayName = translatedToolName(t, tool.toolName);
const getStatusColor = () => {
switch (tool.state) {
case "pending":
@@ -134,7 +147,7 @@ export function MinimalToolDisplay({ tool }: ToolDisplaySlotProps) {
return (
<div className="inline-flex items-center gap-1.5 px-2 py-1 text-xs rounded-full bg-muted">
<div className={cn("w-2 h-2 rounded-full", getStatusColor())} />
<span>{tool.toolName}</span>
<span>{displayName}</span>
{tool.state === "executing" && (
<Loader2Icon className="size-3 animate-spin" />
)}

View File

@@ -91,6 +91,9 @@ export function useChat(
const [sessionId, setSessionId] = useState<string | null>(null);
const [metrics, setMetrics] = useState<AgentMetrics | null>(null);
// Cumulative session-level metrics (sum across all runs)
const cumulativeMetricsRef = useRef<AgentMetrics | null>(null);
// Refs for stable callbacks
const handlersRef = useRef(handlers);
handlersRef.current = handlers;
@@ -153,11 +156,28 @@ export function useChat(
handlersRef.current?.onError?.(event.error);
}
// Handle metrics update
// Handle metrics update — accumulate across the session
if (event.type === "metrics_update") {
setMetrics(event.metrics);
const prev = cumulativeMetricsRef.current;
const cumulative: AgentMetrics = {
tokensUsed:
(prev?.tokensUsed ?? 0) + event.metrics.tokensUsed,
promptTokens:
(prev?.promptTokens ?? 0) + event.metrics.promptTokens,
completionTokens:
(prev?.completionTokens ?? 0) +
event.metrics.completionTokens,
// Non-cumulative fields: use latest values
itemCount: event.metrics.itemCount,
maxTurns: event.metrics.maxTurns,
duration:
(prev?.duration ?? 0) + event.metrics.duration,
startTime: prev?.startTime ?? event.metrics.startTime,
};
cumulativeMetricsRef.current = cumulative;
setMetrics(cumulative);
handlersRef.current?.onMetricsUpdate?.(
event.metrics,
cumulative,
event.sessionId,
);
}
@@ -263,6 +283,7 @@ export function useChat(
activeGeneratorRef.current = null;
setSessionId(null);
setMetrics(null);
cumulativeMetricsRef.current = null;
adapter.reset(configRef.current?.initialMessages ?? []);
}, [adapter, agent, sessionId]);

View File

@@ -10,7 +10,9 @@
"send": "Send",
"stop": "Stop",
"processing": "Processing...",
"noActions": "No actions"
"noActions": "No actions",
"showThinkingDetails": "Show thinking details",
"clickToExpand": "Click to expand"
},
"settings": {
"title": "Settings",

View File

@@ -10,7 +10,9 @@
"send": "发送",
"stop": "停止",
"processing": "处理中...",
"noActions": "无可用操作"
"noActions": "无可用操作",
"showThinkingDetails": "显示思考过程",
"clickToExpand": "点击展开"
},
"settings": {
"title": "设置",

View File

@@ -13,6 +13,8 @@ export interface TranslationResources {
stop: string;
processing: string;
noActions: string;
showThinkingDetails: string;
clickToExpand: string;
};
settings: {
title: string;
@@ -229,6 +231,8 @@ export type BaseTranslationKey =
| "common.stop"
| "common.processing"
| "common.noActions"
| "common.showThinkingDetails"
| "common.clickToExpand"
| "settings.title"
| "settings.subtitle"
| "settings.language"

View File

@@ -0,0 +1,180 @@
// API response types (must match server contract)

/** Per-model pricing as sent by the server; convertApiModel formats these numbers as "$x.xx/1M tokens". */
interface ApiModelPricing {
input: number;
output: number;
}
/** A single model entry in the server's model list. */
interface ApiModel {
id: string;
name: string;
provider: string;
description: string;
pricing: ApiModelPricing;
}
/** Top-level payload of the models endpoint. Only `success` and `data.models` are read by this module. */
interface ApiResponse {
success: boolean;
data: {
models: ApiModel[];
count: number;
cache: {
lastUpdate: number;
modelCount: number;
};
};
}
// Internal model info used by the chatbot UI
export interface ModelInfo {
/** Model identifier; used as the `value` in the model selector */
id: string;
/** Human-readable name; used as the `name` in the model selector */
name: string;
provider: string;
description: string;
/** Always `true` for models produced in this module (both API-converted and fallback entries) */
supportsTools: boolean;
/** Only populated on the fallback entries; not set for API-converted models */
contextLength?: number;
/** Display strings such as "$0.30/1M tokens" */
pricing?: {
input: string;
output: string;
};
/** Coarse cost bucket; for API models it is derived from input + output price */
priceLevel: "cheap" | "normal" | "expensive";
}
// Fallback models in case API fails.
// Returned by fetchModels() when the request fails, the response does not
// validate, or the server reports an empty model list.
const FALLBACK_MODELS: ModelInfo[] = [
{
id: "anthropic/claude-3-haiku",
name: "Claude 3 Haiku",
provider: "Anthropic",
description: "Cost-effective choice for basic tasks",
supportsTools: true,
contextLength: 200_000,
pricing: {
input: "$0.30/1M tokens",
output: "$1.50/1M tokens",
},
priceLevel: "cheap",
},
{
id: "anthropic/claude-sonnet-4.5",
name: "Claude Sonnet 4.5",
provider: "Anthropic",
description: "AI model for various tasks",
supportsTools: true,
contextLength: 200_000,
pricing: {
input: "$3.60/1M tokens",
output: "$18.00/1M tokens",
},
priceLevel: "expensive",
},
];
const MODELS_API_URL = "https://www.claudechrome.com/api/models";
/**
 * Bucket a model by the sum of its input and output price:
 * below 2 is "cheap", below 10 is "normal", anything else is "expensive".
 */
function getPriceLevel(
  pricing: ApiModelPricing,
): "cheap" | "normal" | "expensive" {
  const combined = pricing.input + pricing.output;
  return combined < 2 ? "cheap" : combined < 10 ? "normal" : "expensive";
}
/**
 * Map a server-side model entry onto the UI's ModelInfo shape.
 * Prices are rendered with two decimals as "$x.xx/1M tokens", tool support
 * is always reported as available, and the price level is derived from the
 * raw numeric pricing.
 */
function convertApiModel(apiModel: ApiModel): ModelInfo {
  const { id, name, provider, description, pricing } = apiModel;
  const inputLabel = `$${pricing.input.toFixed(2)}/1M tokens`;
  const outputLabel = `$${pricing.output.toFixed(2)}/1M tokens`;
  const priceLevel = getPriceLevel(pricing);

  return {
    id,
    name,
    provider,
    description,
    supportsTools: true,
    pricing: { input: inputLabel, output: outputLabel },
    priceLevel,
  };
}
/** Narrow an unknown value to a non-null object whose properties can be inspected. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}

/**
 * Validate that the API response matches the expected schema.
 *
 * Checks that `success` is a boolean, `data.models` is an array, and that
 * every model entry is an object with string `id` / `name` and numeric
 * `pricing.input` / `pricing.output` — the fields convertApiModel
 * dereferences. Never throws: malformed entries (including `null`) make
 * the function return `false`.
 */
function isValidApiResponse(data: unknown): data is ApiResponse {
  if (!isRecord(data)) return false;
  if (typeof data.success !== "boolean") return false;
  if (!isRecord(data.data)) return false;

  const models = data.data.models;
  if (!Array.isArray(models)) return false;

  return models.every(
    (model: unknown) =>
      isRecord(model) &&
      typeof model.id === "string" &&
      typeof model.name === "string" &&
      isRecord(model.pricing) &&
      typeof model.pricing.input === "number" &&
      typeof model.pricing.output === "number",
  );
}
// Cache for models
let cachedModels: ModelInfo[] | null = null;
let lastFetchTime = 0;
const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
const MAX_MODELS = 200; // Safety cap on number of models
/**
 * Fetch models from the server API with caching and fallback.
 *
 * - Returns the cached list while it is still fresh (5 min TTL).
 * - Otherwise requests the list, validates it, converts at most MAX_MODELS
 *   entries and refreshes the cache.
 * - On any failure (network error, non-OK status, invalid or empty
 *   response) returns the last successfully fetched list if there is one,
 *   even though it is stale, and FALLBACK_MODELS only when nothing has ever
 *   been fetched. Never rejects.
 */
export async function fetchModels(): Promise<ModelInfo[]> {
  // Return cached models if still valid
  if (cachedModels && Date.now() - lastFetchTime < CACHE_DURATION) {
    return cachedModels;
  }

  try {
    const response = await fetch(MODELS_API_URL);
    if (!response.ok) {
      throw new Error(`API request failed: ${response.status}`);
    }

    const data: unknown = await response.json();
    if (!isValidApiResponse(data)) {
      throw new Error("Invalid API response structure");
    }

    if (data.success && data.data.models.length > 0) {
      // Apply safety cap
      const models = data.data.models
        .slice(0, MAX_MODELS)
        .map(convertApiModel);
      cachedModels = models;
      lastFetchTime = Date.now();
      return cachedModels;
    }

    throw new Error("Empty model list from API");
  } catch (_error) {
    // Do not log sensitive details. A stale server list is still more
    // accurate than the hard-coded fallback, so prefer it when available.
    return cachedModels ?? FALLBACK_MODELS;
  }
}
/**
 * Load the model list and reshape each entry into the `{ name, value }`
 * pair the model selector consumes (`value` is the model id).
 */
export async function fetchModelsForSelector(): Promise<
  Array<{ name: string; value: string }>
> {
  const options: Array<{ name: string; value: string }> = [];
  for (const model of await fetchModels()) {
    options.push({ name: model.name, value: model.id });
  }
  return options;
}
/**
 * ModelChangePrompt-compatible loader: resolves to the same ModelInfo[]
 * that fetchModels() produces.
 */
export async function fetchModelsForPrompt(): Promise<ModelInfo[]> {
  const models = await fetchModels();
  return models;
}

View File

@@ -0,0 +1,177 @@
/**
 * Screenshot storage using IndexedDB.
 * Stores screenshots with a uid for efficient reference and retrieval.
 * Keeps at most 50 screenshots: after each save, the entries with the
 * oldest save timestamps are evicted. Reading a screenshot does not
 * refresh its timestamp, so eviction is by save time rather than by
 * last access.
 */
/** Record stored in the object store, keyed by `uid`. */
export interface ScreenshotData {
uid: string;
/** Complete data URL: data:image/png;base64,... */
base64Data: string;
/** `Date.now()` at save time; indexed and used to decide eviction order */
timestamp: number;
tabId?: number;
/** Dimensions supplied by the caller; fields the caller omitted are stored as 0 */
metadata?: {
width: number;
height: number;
viewportWidth: number;
viewportHeight: number;
};
}
const DB_NAME = "aipex-screenshots-db";
const DB_VERSION = 1;
const STORE_NAME = "screenshots";
const MAX_SCREENSHOTS = 50;
let db: IDBDatabase | null = null;
let initPromise: Promise<void> | null = null;
/**
 * Open the screenshots database once and remember the connection in `db`.
 *
 * Callers arriving while an open request is in flight share its promise;
 * callers arriving after success resolve immediately. The in-flight promise
 * is cleared on both success and failure, so a failed open can be retried
 * by a later call. The object store and its timestamp index are created on
 * first open / upgrade.
 */
function initialize(): Promise<void> {
  if (initPromise !== null) return initPromise;
  if (db !== null) return Promise.resolve();

  const opening = new Promise<void>((resolve, reject) => {
    const openRequest = indexedDB.open(DB_NAME, DB_VERSION);

    openRequest.onupgradeneeded = () => {
      const upgrading = openRequest.result;
      if (upgrading.objectStoreNames.contains(STORE_NAME)) return;
      upgrading
        .createObjectStore(STORE_NAME, { keyPath: "uid" })
        .createIndex("timestamp", "timestamp", { unique: false });
    };

    openRequest.onsuccess = () => {
      db = openRequest.result;
      initPromise = null;
      resolve();
    };

    openRequest.onerror = () => {
      initPromise = null;
      reject(openRequest.error);
    };
  });

  initPromise = opening;
  return opening;
}
/**
 * Build a screenshot uid of the form `screenshot_<ms timestamp>_<random>`,
 * where the random part is up to nine base-36 characters.
 */
function generateUid(): string {
  const stamp = Date.now();
  const suffix = Math.random().toString(36).slice(2, 11);
  return ["screenshot", stamp, suffix].join("_");
}
/**
 * Evict the oldest screenshots so that at most MAX_SCREENSHOTS remain.
 *
 * Counts the stored entries and, when over the limit, walks the
 * `timestamp` index in ascending order with a key-only cursor, deleting
 * the oldest entries until the excess is gone. Only keys are read, so the
 * (potentially large) base64 payloads are never loaded just to decide what
 * to evict. Counting and deleting share one readwrite transaction, and the
 * returned promise settles when that transaction completes or fails.
 *
 * No-op when the database has not been opened.
 */
async function applyLRU(): Promise<void> {
  const database = db;
  if (!database) return;

  await new Promise<void>((resolve, reject) => {
    const tx = database.transaction([STORE_NAME], "readwrite");
    const store = tx.objectStore(STORE_NAME);

    tx.oncomplete = () => resolve();
    tx.onerror = () => reject(tx.error);
    tx.onabort = () => reject(tx.error);

    const countRequest = store.count();
    countRequest.onsuccess = () => {
      let excess = countRequest.result - MAX_SCREENSHOTS;
      if (excess <= 0) return;

      // Ascending timestamp order: the first keys visited are the oldest.
      const cursorRequest = store.index("timestamp").openKeyCursor();
      cursorRequest.onsuccess = () => {
        const cursor = cursorRequest.result;
        if (!cursor || excess <= 0) return;
        store.delete(cursor.primaryKey);
        excess -= 1;
        cursor.continue();
      };
    };
  });
}
/**
 * Promise-based facade over the screenshots object store.
 * Every method opens the database on demand before touching the store.
 */
export const ScreenshotStorage = {
  /**
   * Persist a screenshot and resolve with the uid generated for it.
   *
   * Rejects if `base64Data` is not a string starting with `data:image/`,
   * if the database is unavailable, or if the write request fails.
   * Eviction of old entries is triggered afterwards without being awaited,
   * and its failures are ignored.
   */
  async saveScreenshot(
    base64Data: string,
    metadata?: {
      tabId?: number;
      width?: number;
      height?: number;
      viewportWidth?: number;
      viewportHeight?: number;
    },
  ): Promise<string> {
    // Only accept image data URLs, never arbitrary content
    const isImageDataUrl =
      typeof base64Data === "string" && base64Data.startsWith("data:image/");
    if (!isImageDataUrl) {
      throw new Error("Invalid screenshot data: expected data:image/ URL");
    }

    await initialize();
    const database = db;
    if (!database) throw new Error("Database not initialized");

    const uid = generateUid();
    const entry: ScreenshotData = {
      uid,
      base64Data,
      timestamp: Date.now(),
      tabId: metadata?.tabId,
      metadata: metadata
        ? {
            width: metadata.width ?? 0,
            height: metadata.height ?? 0,
            viewportWidth: metadata.viewportWidth ?? 0,
            viewportHeight: metadata.viewportHeight ?? 0,
          }
        : undefined,
    };

    await new Promise<void>((resolve, reject) => {
      const putRequest = database
        .transaction([STORE_NAME], "readwrite")
        .objectStore(STORE_NAME)
        .put(entry);
      putRequest.onsuccess = () => resolve();
      putRequest.onerror = () => reject(putRequest.error);
    });

    // Eviction runs in the background — fire-and-forget
    applyLRU().catch(() => {});

    return uid;
  },

  /**
   * Look up a screenshot by uid and resolve with its data URL, or `null`
   * when no entry with that uid exists.
   */
  async getScreenshot(uid: string): Promise<string | null> {
    await initialize();
    const database = db;
    if (!database) throw new Error("Database not initialized");

    return new Promise((resolve, reject) => {
      const getRequest = database
        .transaction([STORE_NAME], "readonly")
        .objectStore(STORE_NAME)
        .get(uid);
      getRequest.onsuccess = () => {
        const found = getRequest.result as ScreenshotData | undefined;
        resolve(found?.base64Data ?? null);
      };
      getRequest.onerror = () => reject(getRequest.error);
    });
  },

  /**
   * Remove every stored screenshot.
   */
  async clearAll(): Promise<void> {
    await initialize();
    const database = db;
    if (!database) throw new Error("Database not initialized");

    await new Promise<void>((resolve, reject) => {
      const clearRequest = database
        .transaction([STORE_NAME], "readwrite")
        .objectStore(STORE_NAME)
        .clear();
      clearRequest.onsuccess = () => resolve();
      clearRequest.onerror = () => reject(clearRequest.error);
    });
  },
};

View File

@@ -0,0 +1,185 @@
/**
* Utilities for detecting screenshot tools and extracting image data
* from tool results.
*/
/**
 * Names of the tools whose results carry screenshot image data.
 * Consulted by isCaptureScreenshotTool().
 */
const SCREENSHOT_TOOL_NAMES = new Set([
"capture_screenshot",
"capture_screenshot_with_highlight",
"capture_tab_screenshot",
]);
/**
 * URL prefix that marks a markdown image as a screenshot reference; the
 * screenshot uid is appended directly after it.
 */
export const AIPEX_SCREENSHOT_URL_PREFIX = "https://aipex-screenshot.invalid/";
/**
 * Matches every [[screenshot:...]] placeholder in a string (global flag).
 * Capture group 1 holds the text between the colon and the closing brackets.
 */
const SCREENSHOT_PLACEHOLDER_REGEX = /\[\[screenshot:([^\]]+)\]\]/g;
/**
 * Check that a string has the shape of a screenshot uid:
 * `screenshot_<digits>_<1-20 letters or digits>`, matched case-insensitively
 * against the whole string.
 *
 * @param uid - Candidate identifier.
 * @returns `true` when the whole string matches the uid shape.
 */
export function isValidScreenshotUid(uid: string): boolean {
  const uidShape = /^screenshot_\d+_[a-z0-9]{1,20}$/i;
  return uidShape.test(uid);
}
/**
 * Report whether `toolName` is one of the registered screenshot/capture tools.
 *
 * @param toolName - Tool name; compared exactly against `SCREENSHOT_TOOL_NAMES`.
 * @returns `true` when the name is in the set.
 */
export function isCaptureScreenshotTool(toolName: string): boolean {
  const isRegistered = SCREENSHOT_TOOL_NAMES.has(toolName);
  return isRegistered;
}
/**
 * Screenshot details pulled out of a screenshot tool result.
 * This is the value returned by extractScreenshotFromToolResult().
 */
export interface ScreenshotExtraction {
/** Base64 data URL if available (may be null if already stripped) */
imageData: string | null;
/** Whether the screenshot was intended for LLM vision */
sendToLLM: boolean;
/** Unique identifier for loading from IndexedDB storage */
screenshotUid: string | null;
}
/**
 * Pull screenshot details out of a screenshot tool's result.
 *
 * Accepted result shapes (a JSON string of any of them is parsed first):
 * - flat object: `{ success, imageData, sendToLLM, screenshotUid }`
 * - wrapped object: `{ success, data: { ... } }` or `{ success, data: { data: { ... } } }`
 * - SDK structured array: `[{ type: "text", text: JSON }, { type: "image", image: dataUrl }]`
 *
 * @param toolName - Name of the tool that produced `result`.
 * @param result - Raw tool result (object, array, or JSON string).
 * @returns The extracted details, or `null` when the tool is not a screenshot
 *   tool, the result cannot be parsed, an object result has a falsy `success`,
 *   or neither a uid nor a `data:image/` URL is present.
 */
export function extractScreenshotFromToolResult(
  toolName: string,
  result: unknown,
): ScreenshotExtraction | null {
  if (!isCaptureScreenshotTool(toolName)) return null;

  try {
    const payload: unknown =
      typeof result === "string" ? JSON.parse(result) : result;
    if (payload == null) return null;

    // Array results use the SDK structured format and have their own parser.
    if (Array.isArray(payload)) {
      return extractFromStructuredArray(payload);
    }
    if (typeof payload !== "object") return null;

    const root = payload as Record<string, unknown>;
    if (!root.success) return null;

    // The screenshot fields live in the deepest available `data` wrapper,
    // falling back to the root object for the flat layout.
    const wrapper = root.data as Record<string, unknown> | undefined;
    const innermost = wrapper?.data as Record<string, unknown> | undefined;
    const fields = innermost ?? wrapper ?? root;

    const uidField = fields.screenshotUid;
    const screenshotUid = typeof uidField === "string" ? uidField : null;

    // Only a real data URL counts as image data; placeholders are ignored.
    const imageField = fields.imageData;
    let imageData: string | null = null;
    if (typeof imageField === "string" && imageField.startsWith("data:image/")) {
      imageData = imageField;
    }

    if (!screenshotUid && !imageData) return null;
    return { imageData, sendToLLM: fields.sendToLLM === true, screenshotUid };
  } catch {
    // The result could not be parsed; treat it as "not a screenshot result".
    return null;
  }
}
/**
 * Extract screenshot details from the SDK structured array format:
 * `[{ type: "text", text: JSON }, { type: "image", image: dataUrl }]`.
 *
 * - An `image` part contributes `imageData` when it is a `data:image/` URL.
 * - A `text` part is parsed as JSON; its `screenshotUid` and `sendToLLM`
 *   fields are read when present. Unparseable or non-object text is skipped.
 *
 * A result is returned when either an image or a uid was found, matching the
 * "uid or image" rule used for object-shaped results. `sendToLLM` is always
 * `true` when an inline image part is present; otherwise it reflects the
 * parsed flag.
 *
 * @param arr - Parts of the structured tool result.
 * @returns The extracted details, or `null` when neither an image nor a uid
 *   was found.
 */
function extractFromStructuredArray(
  arr: unknown[],
): ScreenshotExtraction | null {
  let imageData: string | null = null;
  let screenshotUid: string | null = null;
  let flaggedForLLM = false;

  for (const item of arr) {
    if (typeof item !== "object" || item === null) continue;
    const part = item as Record<string, unknown>;

    if (part.type === "image" && typeof part.image === "string") {
      if (part.image.startsWith("data:image/")) {
        imageData = part.image;
      }
      continue;
    }

    if (part.type === "text" && typeof part.text === "string") {
      let parsed: unknown;
      try {
        parsed = JSON.parse(part.text);
      } catch {
        // Free-form text that is not JSON carries no screenshot metadata.
        continue;
      }
      if (typeof parsed !== "object" || parsed === null) continue;
      const meta = parsed as Record<string, unknown>;
      if (meta.sendToLLM === true) flaggedForLLM = true;
      if (typeof meta.screenshotUid === "string") {
        screenshotUid = meta.screenshotUid;
      }
    }
  }

  if (!imageData && !screenshotUid) return null;
  return {
    imageData,
    sendToLLM: imageData !== null || flaggedForLLM,
    screenshotUid,
  };
}
/**
 * Transform [[screenshot:...]] placeholders in text into markdown images
 * that use the special aipex-screenshot.invalid URL prefix.
 *
 * Supported placeholder contents (surrounding whitespace is ignored):
 * - a screenshot uid: `[[screenshot:screenshot_123_abc]]`
 *   → `![](https://aipex-screenshot.invalid/screenshot_123_abc)`
 * - a 1-based index into `screenshotUidList`: `[[screenshot:1]]`
 *
 * An index must consist of decimal digits only. `parseInt` alone would also
 * accept strings such as "1abc", "2nd" or "1.5" and silently resolve them to
 * an unrelated screenshot. Placeholders that resolve to nothing valid are
 * left untouched.
 *
 * @param text - Text that may contain placeholders.
 * @param screenshotUidList - Uids addressable by 1-based index.
 * @returns The text with every resolvable placeholder replaced.
 */
export function transformScreenshotPlaceholders(
  text: string,
  screenshotUidList: string[],
): string {
  const decimalIndex = /^\d+$/;
  const toMarkdownImage = (uid: string): string =>
    `![](${AIPEX_SCREENSHOT_URL_PREFIX}${uid})`;

  return text.replace(
    SCREENSHOT_PLACEHOLDER_REGEX,
    (match: string, content: string) => {
      const token = content.trim();

      // Case 1: the placeholder holds a uid directly.
      if (isValidScreenshotUid(token)) {
        return toMarkdownImage(token);
      }

      // Case 2: the placeholder holds a 1-based index. Index 0 or anything
      // past the end of the list yields undefined and falls through.
      if (decimalIndex.test(token)) {
        const uid = screenshotUidList[Number(token) - 1];
        if (uid && isValidScreenshotUid(uid)) {
          return toMarkdownImage(uid);
        }
      }

      // Unresolvable: leave the placeholder as-is.
      return match;
    },
  );
}

View File

@@ -47,6 +47,10 @@ export interface UIToolPart {
state: UIToolState;
errorText?: string;
duration?: number;
/** Base64 data URL of the screenshot (inline) */
screenshot?: string;
/** UID referencing a screenshot in ScreenshotStorage (IndexedDB) */
screenshotUid?: string;
}
export interface UIContextPart {

View File

@@ -41,32 +41,58 @@ export function ChatImagesListener() {
for (const msg of messages) {
for (const part of msg.parts) {
// Tool parts may carry screenshot data in their output
// Tool parts may carry screenshot data inline (screenshot field)
// or in their output (imageData field)
if (part.type === "tool") {
const output = (part as { output?: unknown }).output;
const toolPart = part as {
output?: unknown;
screenshot?: string;
toolName?: string;
};
// Prefer the inline screenshot field (set by ChatAdapter)
const screenshotData = toolPart.screenshot;
if (
output &&
typeof output === "object" &&
"imageData" in output
screenshotData &&
typeof screenshotData === "string" &&
screenshotData.startsWith("data:image/")
) {
const imageData = (output as { imageData?: string }).imageData;
images.push({
id: msg.id,
parts: [
{
type: "image",
imageData: screenshotData,
imageTitle: toolPart.toolName || "Screenshot",
},
],
});
} else {
// Fall back to extracting from output
const output = toolPart.output;
if (
imageData &&
typeof imageData === "string" &&
imageData.startsWith("data:image/")
output &&
typeof output === "object" &&
"imageData" in output
) {
images.push({
id: msg.id,
parts: [
{
type: "image",
imageData,
imageTitle:
(part as { toolName?: string }).toolName ||
"Screenshot",
},
],
});
const imageData = (output as { imageData?: string })
.imageData;
if (
imageData &&
typeof imageData === "string" &&
imageData.startsWith("data:image/")
) {
images.push({
id: msg.id,
parts: [
{
type: "image",
imageData,
imageTitle: toolPart.toolName || "Screenshot",
},
],
});
}
}
}
}

View File

@@ -0,0 +1,257 @@
import { describe, expect, it } from "vitest";
import { fromStorageFormat, toStorageFormat } from "./message-adapter";
// Short data URL used as inline screenshot content in the fixtures.
const TEST_IMAGE_DATA = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQ==";
// Uid used as the screenshot reference in the fixtures.
const TEST_SCREENSHOT_UID = "screenshot_1234567890_abcdefghi";
// Text the tests expect in place of stripped imageData in stored tool results.
const PLACEHOLDER = "[Image data removed - see following user message]";
describe("message-adapter", () => {
describe("toStorageFormat screenshot stripping", () => {
// A completed capture_screenshot tool part with inline image data must be
// stored with imageData replaced by the placeholder, while the other output
// fields (screenshotUid, success) are kept.
it("should strip base64 imageData from screenshot tool results", () => {
const output = {
success: true,
imageData: TEST_IMAGE_DATA,
sendToLLM: true,
screenshotUid: TEST_SCREENSHOT_UID,
tabId: 1,
url: "https://example.com",
title: "Example",
};
const messages = [
{
id: "msg-1",
role: "assistant" as const,
parts: [
{
type: "tool" as const,
toolCallId: "call-1",
toolName: "capture_screenshot",
input: { sendToLLM: true },
output,
state: "completed" as const,
screenshot: TEST_IMAGE_DATA,
screenshotUid: TEST_SCREENSHOT_UID,
},
],
timestamp: Date.now(),
},
];
const stored = toStorageFormat(messages as any);
expect(stored.length).toBe(1);
// Find the tool_result part
const toolResultPart = stored[0]!.parts.find(
(p: any) => p.type === "tool_result",
) as any;
expect(toolResultPart).toBeTruthy();
// Parse the content and verify imageData is stripped
const parsedContent = JSON.parse(toolResultPart.content);
expect(parsedContent.imageData).toBe(PLACEHOLDER);
expect(parsedContent.screenshotUid).toBe(TEST_SCREENSHOT_UID);
expect(parsedContent.success).toBe(true);
});
// Stripping is keyed on the tool name: a get_tabs result that happens to
// contain an imageData field must be stored unchanged.
it("should not strip non-screenshot tool results", () => {
const output = {
tabs: [{ id: 1, title: "Tab" }],
imageData: TEST_IMAGE_DATA, // Even if it has imageData
};
const messages = [
{
id: "msg-1",
role: "assistant" as const,
parts: [
{
type: "tool" as const,
toolCallId: "call-1",
toolName: "get_tabs",
input: {},
output,
state: "completed" as const,
},
],
timestamp: Date.now(),
},
];
const stored = toStorageFormat(messages as any);
const toolResultPart = stored[0]!.parts.find(
(p: any) => p.type === "tool_result",
) as any;
// The stored content still carries the original data URL
const parsedContent = JSON.parse(toolResultPart.content);
expect(parsedContent.imageData).toBe(TEST_IMAGE_DATA);
});
});
describe("fromStorageFormat screenshotUid restoration", () => {
// Restoring a stored tool_use + tool_result pair whose imageData is the
// placeholder must yield a tool part with screenshotUid set and no inline
// screenshot.
it("should restore screenshotUid from stored tool result", () => {
const storedOutput = {
success: true,
imageData: PLACEHOLDER,
sendToLLM: true,
screenshotUid: TEST_SCREENSHOT_UID,
tabId: 1,
};
const storedMessages = [
{
id: "msg-1",
role: "assistant" as const,
parts: [
{
type: "tool_use" as const,
id: "call-1",
name: "capture_screenshot",
input: { sendToLLM: true },
},
{
type: "tool_result" as const,
tool_use_id: "call-1",
content: JSON.stringify(storedOutput),
is_error: false,
},
],
timestamp: Date.now(),
},
];
const restored = fromStorageFormat(storedMessages as any);
expect(restored.length).toBe(1);
// Find the tool part (merged from tool_use + tool_result)
const toolPart = restored[0]!.parts.find(
(p: any) => p.type === "tool",
) as any;
expect(toolPart).toBeTruthy();
expect(toolPart.screenshotUid).toBe(TEST_SCREENSHOT_UID);
// imageData is the placeholder, not a real data URL, so screenshot should NOT be set
expect(toolPart.screenshot).toBeUndefined();
expect(toolPart.state).toBe("completed");
});
// When the stored tool result still contains a real data URL, the restored
// tool part must expose it as `screenshot` in addition to `screenshotUid`.
it("should restore both screenshotUid and screenshot when real imageData is present", () => {
const storedOutput = {
success: true,
imageData: TEST_IMAGE_DATA,
sendToLLM: true,
screenshotUid: TEST_SCREENSHOT_UID,
tabId: 1,
};
const storedMessages = [
{
id: "msg-1",
role: "assistant" as const,
parts: [
{
type: "tool_use" as const,
id: "call-1",
name: "capture_screenshot",
input: { sendToLLM: true },
},
{
type: "tool_result" as const,
tool_use_id: "call-1",
content: JSON.stringify(storedOutput),
is_error: false,
},
],
timestamp: Date.now(),
},
];
const restored = fromStorageFormat(storedMessages as any);
// Find the tool part (merged from tool_use + tool_result)
const toolPart = restored[0]!.parts.find(
(p: any) => p.type === "tool",
) as any;
expect(toolPart.screenshotUid).toBe(TEST_SCREENSHOT_UID);
expect(toolPart.screenshot).toBe(TEST_IMAGE_DATA);
});
});
describe("round-trip: toStorageFormat -> fromStorageFormat", () => {
// Full round-trip for capture_screenshot: after toStorageFormat followed by
// fromStorageFormat the uid, state and tool name survive, but the inline
// screenshot does not.
it("should preserve screenshotUid through round-trip", () => {
const original = [
{
id: "msg-1",
role: "assistant" as const,
parts: [
{
type: "tool" as const,
toolCallId: "call-1",
toolName: "capture_screenshot",
input: { sendToLLM: true },
output: {
success: true,
imageData: TEST_IMAGE_DATA,
sendToLLM: true,
screenshotUid: TEST_SCREENSHOT_UID,
tabId: 1,
},
state: "completed" as const,
screenshot: TEST_IMAGE_DATA,
screenshotUid: TEST_SCREENSHOT_UID,
},
],
timestamp: Date.now(),
},
];
// Store -> Restore
const stored = toStorageFormat(original as any);
const restored = fromStorageFormat(stored);
const toolPart = restored[0]!.parts.find(
(p: any) => p.type === "tool",
) as any;
// screenshotUid should survive the round-trip
expect(toolPart.screenshotUid).toBe(TEST_SCREENSHOT_UID);
// imageData was stripped during storage, so inline screenshot is gone
expect(toolPart.screenshot).toBeUndefined();
expect(toolPart.state).toBe("completed");
expect(toolPart.toolName).toBe("capture_screenshot");
});
// Same round-trip as above for the capture_tab_screenshot tool name: the uid
// and the tool name must survive store + restore.
it("should handle capture_tab_screenshot round-trip", () => {
const original = [
{
id: "msg-1",
role: "assistant" as const,
parts: [
{
type: "tool" as const,
toolCallId: "call-1",
toolName: "capture_tab_screenshot",
input: { tabId: 42, sendToLLM: true },
output: {
success: true,
imageData: TEST_IMAGE_DATA,
sendToLLM: true,
screenshotUid: TEST_SCREENSHOT_UID,
tabId: 42,
},
state: "completed" as const,
screenshot: TEST_IMAGE_DATA,
screenshotUid: TEST_SCREENSHOT_UID,
},
],
timestamp: Date.now(),
},
];
// Store -> Restore
const stored = toStorageFormat(original as any);
const restored = fromStorageFormat(stored);
const toolPart = restored[0]!.parts.find(
(p: any) => p.type === "tool",
) as any;
expect(toolPart.screenshotUid).toBe(TEST_SCREENSHOT_UID);
expect(toolPart.toolName).toBe("capture_tab_screenshot");
});
});
});

View File

@@ -6,6 +6,90 @@
import type { UIMessage as ReactUIMessage } from "@aipexstudio/aipex-react/types";
import type { UIMessage as RuntimeUIMessage } from "@aipexstudio/browser-runtime";
/**
 * Tool names whose results may include screenshot image data.
 * Results from any other tool are neither inspected nor stripped.
 */
const SCREENSHOT_TOOL_NAMES = new Set([
"capture_screenshot",
"capture_screenshot_with_highlight",
"capture_tab_screenshot",
]);
/**
 * Placeholder that replaces base64 imageData in stored tool results.
 * It does not start with "data:image/", so it is never treated as inline
 * image data when a stored result is read back.
 */
const IMAGE_DATA_PLACEHOLDER =
"[Image data removed - see following user message]";
/**
 * Screenshot fields recovered from a parsed screenshot tool result.
 * Produced by extractScreenshotInfo(); at least one of the two fields is
 * non-null whenever a value of this type is returned.
 */
interface ScreenshotToolInfo {
/** The base64 data URL if present (may be null if already stripped) */
imageData: string | null;
/** The screenshot uid if present */
screenshotUid: string | null;
}
/**
 * Navigate into a parsed tool result to find the layer that holds the
 * screenshot fields.
 *
 * Handles the layouts `{ data: { data: { ... } } }`, `{ data: { ... } }` and
 * flat `{ ... }`. A `data` value is only descended into when it is a plain
 * object. If `data` is a string, number, array or null, the enclosing object
 * is returned instead, so fields stored beside such a `data` value are still
 * found and the return type is honoured at runtime.
 *
 * @param parsedOutput - Already-parsed tool result of unknown shape.
 * @returns The innermost object layer, or `null` when `parsedOutput` is not
 *   an object.
 */
function getScreenshotActualData(
  parsedOutput: unknown,
): Record<string, unknown> | null {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  if (typeof parsedOutput !== "object" || parsedOutput === null) return null;

  const root = parsedOutput as Record<string, unknown>;
  const wrapper = root.data;
  if (!isRecord(wrapper)) return root;

  const innermost = wrapper.data;
  return isRecord(innermost) ? innermost : wrapper;
}
/**
 * Read the screenshot fields (inline image and uid) from a parsed tool result.
 *
 * @param toolName - Name of the tool that produced the result.
 * @param parsedOutput - Already-parsed tool result.
 * @returns The fields found, or `null` when the tool is not a screenshot
 *   tool, the result has no usable data layer, or neither a `data:image/`
 *   URL nor a uid is present.
 */
function extractScreenshotInfo(
  toolName: string,
  parsedOutput: unknown,
): ScreenshotToolInfo | null {
  if (!SCREENSHOT_TOOL_NAMES.has(toolName)) return null;

  const fields = getScreenshotActualData(parsedOutput);
  if (!fields) return null;

  // Placeholder text left behind by stripping does not count as image data.
  let imageData: string | null = null;
  if (
    typeof fields.imageData === "string" &&
    fields.imageData.startsWith("data:image/")
  ) {
    imageData = fields.imageData;
  }

  let screenshotUid: string | null = null;
  if (typeof fields.screenshotUid === "string") {
    screenshotUid = fields.screenshotUid;
  }

  return imageData || screenshotUid ? { imageData, screenshotUid } : null;
}
/**
 * Replace the inline base64 image in a serialized screenshot tool result
 * with a short placeholder.
 *
 * @param toolName - Name of the tool that produced the result.
 * @param content - Serialized (JSON) tool result.
 * @returns The re-serialized result with `imageData` replaced, or `content`
 *   unchanged when the tool is not a screenshot tool, the content cannot be
 *   parsed, or no `data:image/` URL is present.
 */
function stripImageDataFromToolOutput(
  toolName: string,
  content: string,
): string {
  if (!SCREENSHOT_TOOL_NAMES.has(toolName)) return content;

  const parsed = safeJsonParse<Record<string, unknown>>(content);
  const layer = parsed ? getScreenshotActualData(parsed) : null;
  if (!layer) return content;

  const image = layer.imageData;
  const hasInlineImage =
    typeof image === "string" && image.startsWith("data:image/");
  if (!hasInlineImage) return content;

  // `layer` is part of `parsed`, so the replacement shows up when the whole
  // result is serialized again.
  layer.imageData = IMAGE_DATA_PLACEHOLDER;
  return JSON.stringify(parsed);
}
/**
* Convert aipex-react UIMessage to runtime UIMessage for storage
*/
@@ -15,7 +99,7 @@ export function toStorageFormat(
return messages.map((msg) => ({
id: msg.id,
role: msg.role === "tool" ? "assistant" : msg.role, // Map "tool" to "assistant"
parts: msg.parts.map((part) => {
parts: msg.parts.flatMap((part) => {
switch (part.type) {
case "text":
return { type: "text", text: part.text };
@@ -27,19 +111,37 @@ export function toStorageFormat(
imageTitle: part.filename,
};
case "tool":
// Map tool to tool_use or tool_result based on state
// Map tool to tool_use + tool_result pair (when completed)
// or just tool_use (when pending/executing).
// Emitting both ensures fromStorageFormat can correlate them
// to restore the proper toolName and input.
if (part.output !== undefined) {
// Avoid double-stringifying if output is already a string
const content =
// Avoid double-stringifying if output is already a string.
let content =
typeof part.output === "string"
? part.output
: JSON.stringify(part.output);
return {
type: "tool_result",
tool_use_id: part.toolCallId,
content,
is_error: part.state === "error",
};
// Strip base64 imageData from screenshot tool results before
// persisting to keep stored conversations small and avoid
// storing large blobs. The screenshotUid is preserved in the
// output so images can be loaded from IndexedDB on restore.
content = stripImageDataFromToolOutput(part.toolName, content);
return [
{
type: "tool_use",
id: part.toolCallId,
name: part.toolName,
input: part.input as Record<string, unknown>,
},
{
type: "tool_result",
tool_use_id: part.toolCallId,
content,
is_error: part.state === "error",
},
];
}
return {
type: "tool_use",
@@ -210,7 +312,11 @@ export function fromStorageFormat(
};
}
// Normal successful completion
// Normal successful completion restore screenshot data
const screenshotInfo = extractScreenshotInfo(
toolName,
parsedOutput,
);
return {
type: "tool",
toolName,
@@ -218,6 +324,15 @@ export function fromStorageFormat(
input,
output: parsedOutput,
state: "completed" as const,
// Restore screenshotUid so UI can load from IndexedDB
...(screenshotInfo?.screenshotUid
? { screenshotUid: screenshotInfo.screenshotUid }
: {}),
// Restore inline screenshot only if actual base64 is present
// (not when it's been replaced with a placeholder)
...(screenshotInfo?.imageData
? { screenshot: screenshotInfo.imageData }
: {}),
};
}
default:

View File

@@ -19,6 +19,8 @@ export type {
} from "./lib/vm/zenfs-manager.js";
// Virtual File System
export { zenfs } from "./lib/vm/zenfs-manager.js";
// Screenshot Storage (IndexedDB)
export { RuntimeScreenshotStorage } from "./lib/screenshot-storage.js";
export * from "./runtime/automation-mode.js";
export * from "./runtime/browser-automation-host.js";
export * from "./runtime/context-providers.js";

View File

@@ -10,6 +10,7 @@
*/
import type { ElementCaptureEvent, ElementCaptureOptions } from "./types.js";
import { captureVisibleTabWithElementCrop } from "../tools/screenshot-helpers.js";
type CaptureCallback = (event: ElementCaptureEvent) => void;
@@ -232,34 +233,45 @@ export class ElementCaptureService {
}
/**
* Capture screenshot functionality (with highlight)
* Capture screenshot functionality (with highlight / element crop).
*
* Delegates to the shared `captureVisibleTabWithElementCrop` helper so that
* the element-rect resolution, DPR scaling, crop, and restricted-page
* checks are consistent with `captureScreenshotWithHighlightTool`.
*
* Falls back to a full-page screenshot if the selector cannot be resolved.
*/
async captureScreenshot(
_selector: string,
_options?: {
selector: string,
options?: {
cropToElement?: boolean;
padding?: number;
},
): Promise<string | null> {
try {
// Use Chrome's captureVisibleTab API directly
if (!this.currentTabId) {
console.warn("⚠️ [ElementCaptureService] No current tab for screenshot");
console.warn(
"⚠️ [ElementCaptureService] No current tab for screenshot",
);
return null;
}
// Get the tab to find its window ID
const tab = await chrome.tabs.get(this.currentTabId);
if (!tab.windowId) {
console.warn("⚠️ [ElementCaptureService] No window ID for tab");
return null;
}
const screenshot = await chrome.tabs.captureVisibleTab(tab.windowId, {
format: "png",
const result = await captureVisibleTabWithElementCrop({
tabId: this.currentTabId,
windowId: tab.windowId,
tabUrl: tab.url,
selector,
cropToElement: options?.cropToElement ?? true,
padding: options?.padding ?? 50,
});
return screenshot;
return result.dataUrl;
} catch (error) {
console.error("❌ [ElementCaptureService] Screenshot error:", error);
return null;

View File

@@ -0,0 +1,186 @@
/**
* Screenshot storage using IndexedDB.
* Stores screenshots with a uid for efficient reference and retrieval.
* Applies an LRU eviction policy (max 50 screenshots).
*
* Uses the same DB/store as the aipex ScreenshotStorage so both
* can share screenshots during the migration period.
*/
/**
 * Shape of one record in the screenshots object store.
 */
export interface ScreenshotData {
/** Primary key of the record (the object store's keyPath). */
uid: string;
/** Complete data URL: data:image/png;base64,... */
base64Data: string;
/** Milliseconds since epoch at save time; indexed and used for eviction order. */
timestamp: number;
/** Tab the screenshot was taken from, when the caller supplied it; indexed. */
tabId?: number;
/** Image and viewport dimensions; fields the caller omitted are stored as 0. */
metadata?: {
width: number;
height: number;
viewportWidth: number;
viewportHeight: number;
};
}
/** Name of the IndexedDB database that holds the screenshots. */
const DB_NAME = "aipex-screenshots-db";
/** Schema version passed to indexedDB.open(). */
const DB_VERSION = 1;
/** Object store holding ScreenshotData records, keyed by uid. */
const STORE_NAME = "screenshots";
/** Upper bound on stored screenshots; applyLRU() deletes the oldest beyond it. */
const MAX_SCREENSHOTS = 50;
/** Cached open connection; null while no connection is open. */
let db: IDBDatabase | null = null;
/** In-flight open shared by concurrent initialize() callers; null otherwise. */
let initPromise: Promise<void> | null = null;
/**
 * Open the screenshots database once and cache the connection in `db`.
 *
 * Concurrent callers share the same in-flight promise. The object store and
 * its `timestamp` / `tabId` indexes are created on first open.
 *
 * The cached connection is dropped when it stops being usable, so the next
 * call reopens it instead of reusing a dead handle:
 * - `versionchange`: another context wants to upgrade or delete the database.
 *   The connection is closed so it does not block that request.
 * - `close`: the browser closed the connection unexpectedly.
 *
 * @returns A promise that resolves once `db` holds an open connection and
 *   rejects with the open request's error when opening fails.
 */
function initialize(): Promise<void> {
  if (initPromise) return initPromise;
  if (db) return Promise.resolve();

  initPromise = new Promise<void>((resolve, reject) => {
    const request = indexedDB.open(DB_NAME, DB_VERSION);

    request.onerror = () => {
      initPromise = null;
      reject(request.error);
    };

    request.onsuccess = () => {
      const connection = request.result;
      // Only clear the cache if it still points at this connection; a newer
      // connection may already have replaced it.
      const forget = () => {
        if (db === connection) db = null;
      };
      connection.onversionchange = () => {
        connection.close();
        forget();
      };
      connection.onclose = forget;

      db = connection;
      initPromise = null;
      resolve();
    };

    request.onupgradeneeded = (event) => {
      const database = (event.target as IDBOpenDBRequest).result;
      if (!database.objectStoreNames.contains(STORE_NAME)) {
        const store = database.createObjectStore(STORE_NAME, {
          keyPath: "uid",
        });
        store.createIndex("timestamp", "timestamp", { unique: false });
        store.createIndex("tabId", "tabId", { unique: false });
      }
    };
  });
  return initPromise;
}
/**
 * Build a screenshot uid of the form `screenshot_<ms timestamp>_<suffix>`.
 *
 * The suffix is always exactly 9 base-36 characters. The base-36 rendering of
 * `Math.random()` can be shorter than 9 digits (and is empty for 0), which
 * would otherwise produce a uid with a missing suffix. Short values are
 * right-padded with "0".
 *
 * @returns A new uid string.
 */
function generateUid(): string {
  const suffix = Math.random().toString(36).slice(2, 11).padEnd(9, "0");
  return `screenshot_${Date.now()}_${suffix}`;
}
/**
 * Evict the oldest screenshots so that at most `MAX_SCREENSHOTS` remain.
 *
 * Runs in a single read-write transaction. The store is counted first; if it
 * is over the limit, the `timestamp` index is walked in ascending order with
 * a key cursor and the oldest records are deleted by primary key. No image
 * payload is loaded, so eviction cost does not depend on screenshot size.
 *
 * Does nothing when no connection is open.
 *
 * @returns A promise that resolves once the transaction has committed and
 *   rejects with the transaction error if it aborts.
 */
async function applyLRU(): Promise<void> {
  const database = db;
  if (!database) return;

  await new Promise<void>((resolve, reject) => {
    const tx = database.transaction([STORE_NAME], "readwrite");
    const store = tx.objectStore(STORE_NAME);

    tx.oncomplete = () => resolve();
    // An unhandled request error aborts the transaction, so this covers
    // failures of the count, cursor and delete requests as well.
    tx.onabort = () => reject(tx.error);

    const countReq = store.count();
    countReq.onsuccess = () => {
      let excess = countReq.result - MAX_SCREENSHOTS;
      if (excess <= 0) return; // under the limit: the transaction just commits

      // Ascending timestamp order means the oldest records come first.
      const cursorReq = store.index("timestamp").openKeyCursor();
      cursorReq.onsuccess = () => {
        const cursor = cursorReq.result;
        if (!cursor) return;
        store.delete(cursor.primaryKey);
        excess -= 1;
        if (excess > 0) cursor.continue();
      };
    };
  });
}
/**
 * Runtime-level screenshot storage (for use inside browser-runtime tools).
 * Shares the same IndexedDB database as the UI-level ScreenshotStorage
 * in aipex-react so screenshots are accessible across packages.
 *
 * Write operations resolve only after their transaction has committed. A
 * request's `success` event fires before commit and the transaction can
 * still abort afterwards, so waiting for `complete` is what guarantees the
 * data is actually persisted (or cleared) when the caller continues.
 */
export const RuntimeScreenshotStorage = {
  /**
   * Save a screenshot and return its uid.
   *
   * @param base64Data - Complete `data:image/...` URL of the screenshot.
   * @param metadata - Optional tab id and dimensions; omitted dimensions are
   *   stored as 0.
   * @returns The uid of the newly stored record.
   * @throws Error when `base64Data` is not a `data:image/` URL or the
   *   database handle is unset after initialization. The returned promise
   *   rejects with the transaction error if the write aborts.
   */
  async saveScreenshot(
    base64Data: string,
    metadata?: {
      tabId?: number;
      width?: number;
      height?: number;
      viewportWidth?: number;
      viewportHeight?: number;
    },
  ): Promise<string> {
    // Validate that it's a data URL (not arbitrary content)
    if (
      typeof base64Data !== "string" ||
      !base64Data.startsWith("data:image/")
    ) {
      throw new Error("Invalid screenshot data: expected data:image/ URL");
    }

    await initialize();
    const database = db;
    if (!database) throw new Error("Database not initialized");

    const uid = generateUid();
    const entry: ScreenshotData = {
      uid,
      base64Data,
      timestamp: Date.now(),
      tabId: metadata?.tabId,
      metadata: metadata
        ? {
            width: metadata.width ?? 0,
            height: metadata.height ?? 0,
            viewportWidth: metadata.viewportWidth ?? 0,
            viewportHeight: metadata.viewportHeight ?? 0,
          }
        : undefined,
    };

    await new Promise<void>((resolve, reject) => {
      const tx = database.transaction([STORE_NAME], "readwrite");
      tx.oncomplete = () => resolve();
      // An unhandled request error aborts the transaction, so this single
      // handler covers both a failed put and a failed commit.
      tx.onabort = () => reject(tx.error);
      tx.objectStore(STORE_NAME).put(entry);
    });

    // Eviction is best-effort and must not fail the save; report problems
    // instead of discarding them silently.
    void applyLRU().catch((err: unknown) => {
      console.warn("[ScreenshotStorage] LRU eviction failed:", err);
    });
    return uid;
  },

  /**
   * Get screenshot base64 data by uid.
   *
   * @param uid - Key the screenshot record was stored under.
   * @returns The stored data URL, or `null` when no record exists for `uid`.
   * @throws Error when the database handle is unset after initialization.
   */
  async getScreenshot(uid: string): Promise<string | null> {
    await initialize();
    const database = db;
    if (!database) throw new Error("Database not initialized");

    return new Promise<string | null>((resolve, reject) => {
      const tx = database.transaction([STORE_NAME], "readonly");
      const req = tx.objectStore(STORE_NAME).get(uid);
      req.onsuccess = () => {
        const data = req.result as ScreenshotData | undefined;
        resolve(data?.base64Data ?? null);
      };
      req.onerror = () => reject(req.error);
    });
  },

  /**
   * Clear all screenshots.
   *
   * @throws Error when the database handle is unset after initialization.
   *   The returned promise rejects with the transaction error if the clear
   *   aborts.
   */
  async clearAll(): Promise<void> {
    await initialize();
    const database = db;
    if (!database) throw new Error("Database not initialized");

    await new Promise<void>((resolve, reject) => {
      const tx = database.transaction([STORE_NAME], "readwrite");
      tx.oncomplete = () => resolve();
      tx.onabort = () => reject(tx.error);
      tx.objectStore(STORE_NAME).clear();
    });
  },
};

View File

@@ -15,7 +15,18 @@ import {
highlightTextInlineTool,
scrollToElementTool,
} from "./page";
import { captureScreenshotTool, captureTabScreenshotTool } from "./screenshot";
import {
captureScreenshotTool,
captureScreenshotWithHighlightTool,
captureTabScreenshotTool,
} from "./screenshot";
// Clipboard image tools available but not registered in the default bundle.
// Enable explicitly if the product decides to ship clipboard access.
// import {
// captureScreenshotToClipboardTool,
// readClipboardImageTool,
// getClipboardImageInfoTool,
// } from "./screenshot";
import { skillTools } from "./skill";
import { searchElementsTool } from "./snapshot";
import {
@@ -30,13 +41,15 @@ import { downloadChatImagesTool, downloadImageTool } from "./tools/downloads";
/**
* All browser tools registered for AI use
* Total: 31 tools (27 core + 4 intervention tools)
* Total: 32 tools (28 core + 4 intervention tools)
*
* Disabled tools (per aipex):
* - switch_to_tab (causes context switching issues)
* - duplicate_tab (not in aipex)
* - wait (replaced by computer tool's wait action)
* - capture_screenshot_to_clipboard (not enabled in aipex)
* - capture_screenshot_to_clipboard (not enabled in aipex default bundle)
* - read_clipboard_image (P1 clipboard tool not enabled by default; requires security review)
* - get_clipboard_image_info (P1 clipboard tool not enabled by default; requires security review)
* - download_text_as_markdown (not enabled in aipex)
* - download_current_chat_images (architecture issue, not enabled in aipex)
* - organize_tabs (stub implementation, temporarily disabled until AI grouping is complete)
@@ -72,8 +85,9 @@ const browserFunctionTools: BrowserFunctionTool[] = [
highlightElementTool,
highlightTextInlineTool,
// Screenshot (2 tools)
// Screenshot (3 tools)
captureScreenshotTool,
captureScreenshotWithHighlightTool,
captureTabScreenshotTool,
// Download (2 tools)

View File

@@ -0,0 +1,210 @@
/**
* Shared screenshot helpers.
*
* This module is intentionally kept free of imports from `./index` or any
* module that participates in the tools ↔ screenshot circular-import chain.
* Both `captureScreenshotWithHighlightTool` (in screenshot.ts) and
* `ElementCaptureService` (in intervention/element-capture.ts) import from
* here without triggering a cycle.
*/
/**
 * Maximum padding in pixels. Requested padding is clamped to
 * [0, MAX_PADDING] before it is applied around the element.
 */
export const MAX_PADDING = 200;
// ===================== Image utilities =====================
/**
 * Cut a rectangular region out of an image using a canvas.
 *
 * The image is decoded from `dataUrl`, the region is drawn onto a canvas of
 * exactly the region's size, and the canvas is exported as a PNG data URL.
 *
 * @param dataUrl - Source image as a data URL.
 * @param region - Rectangle to extract, in source-image pixels.
 * @returns A PNG data URL containing only the requested region. The promise
 *   rejects when the image fails to load or no 2D canvas context is available.
 */
export async function cropImage(
  dataUrl: string,
  region: { x: number; y: number; width: number; height: number },
): Promise<string> {
  return new Promise((resolve, reject) => {
    const source = new Image();

    source.onerror = () => reject(new Error("Failed to load image"));

    source.onload = () => {
      const canvas = document.createElement("canvas");
      const context = canvas.getContext("2d");
      if (!context) {
        reject(new Error("Failed to get canvas context"));
        return;
      }

      const { x, y, width, height } = region;
      canvas.width = width;
      canvas.height = height;
      // Source rectangle (x, y, width, height) → destination at the origin,
      // same size, i.e. a 1:1 copy of the region.
      context.drawImage(source, x, y, width, height, 0, 0, width, height);
      resolve(canvas.toDataURL("image/png", 0.9));
    };

    source.src = dataUrl;
  });
}
// ===================== Shared capture helper =====================
/**
 * Options for the shared capture + element-crop helper.
 */
export interface CaptureWithElementCropOptions {
/** Tab in which the selector is resolved (target of the injected script). */
tabId: number;
/** Window that is focused and whose visible tab is captured. */
windowId: number;
/** URL of the tab; when provided it is checked against the restricted-page prefixes. */
tabUrl?: string;
/** CSS selector of the element to focus on. Max length enforced by callers. */
selector?: string;
/** Whether to crop the screenshot to the element bounding box (plus padding). */
cropToElement?: boolean;
/** Padding around the element in CSS pixels when cropping (default 50, max 200). */
padding?: number;
}
/**
 * Result returned by the shared capture helper.
 */
export interface CaptureWithElementCropResult {
/** The captured (and optionally cropped) image as a data URL. */
dataUrl: string;
/**
 * Crop indicator. Only ever true when cropping was requested and the
 * selector resolved to an element.
 */
cropped: boolean;
/** True if the selector matched an element on the page. */
elementFound: boolean;
}
/**
 * Capture the visible tab and optionally crop the image to an element
 * identified by CSS selector.
 *
 * This is shared by `captureScreenshotWithHighlightTool` (the agent-facing
 * tool) and `ElementCaptureService.captureScreenshot` so that both use the
 * same element-rect resolution, DPR scaling, and crop logic.
 *
 * Steps:
 * 1. Reject browser-internal pages.
 * 2. If a selector is given, resolve the element's bounding rect in the page
 *    (scaled by the page's devicePixelRatio). A failure here is logged and
 *    the capture continues uncropped.
 * 3. Focus the window and capture the visible tab as PNG.
 * 4. If cropping was requested and the element was found, crop to the
 *    element's rect plus padding, clipped to the image bounds.
 *
 * Security notes:
 * - Rejects URLs starting with chrome://, chrome-extension://, edge:// or about:.
 * - Selector length must be bounded by the caller (tool uses zod `.max()`).
 * - Padding is clamped to [0, MAX_PADDING]; a NaN padding falls back to the
 *   default of 50.
 *
 * @param options - Capture target, selector and crop settings.
 * @returns The image plus flags: `elementFound` tells whether the selector
 *   matched, `cropped` whether a crop was actually applied to `dataUrl`.
 * @throws Error for restricted pages or when the captured data is not a
 *   `data:image/` URL.
 */
export async function captureVisibleTabWithElementCrop(
  options: CaptureWithElementCropOptions,
): Promise<CaptureWithElementCropResult> {
  const {
    tabId,
    windowId,
    tabUrl,
    selector,
    cropToElement = false,
    padding = 50,
  } = options;

  // Reject restricted pages
  if (
    tabUrl &&
    (tabUrl.startsWith("chrome://") ||
      tabUrl.startsWith("chrome-extension://") ||
      tabUrl.startsWith("edge://") ||
      tabUrl.startsWith("about:"))
  ) {
    throw new Error("Cannot capture browser internal pages");
  }

  // Clamp padding to safe range. NaN would otherwise poison every crop
  // coordinate and silently disable cropping.
  const requestedPadding = Number.isNaN(padding) ? 50 : padding;
  const safePadding = Math.max(0, Math.min(requestedPadding, MAX_PADDING));

  // If a selector is provided, resolve the element rect via content script
  let elementRect: {
    x: number;
    y: number;
    width: number;
    height: number;
    devicePixelRatio: number;
  } | null = null;

  if (selector) {
    try {
      const result = await chrome.scripting.executeScript({
        target: { tabId },
        func: (sel: string) => {
          const element = document.querySelector(sel);
          if (!element) return null;
          const rect = element.getBoundingClientRect();
          const dpr = window.devicePixelRatio || 1;
          // Viewport-relative CSS pixels → device pixels of the capture.
          return {
            x: rect.x * dpr,
            y: rect.y * dpr,
            width: rect.width * dpr,
            height: rect.height * dpr,
            devicePixelRatio: dpr,
          };
        },
        args: [selector],
      });
      if (result[0]?.result) {
        elementRect = result[0].result;
      }
    } catch (err) {
      console.warn("[Screenshot] Failed to get element rect:", err);
      // Continue with full-page screenshot if selector fails
    }
  }

  // Focus window and capture
  await chrome.windows.update(windowId, { focused: true });
  await new Promise((resolve) => setTimeout(resolve, 100));

  let dataUrl = await chrome.tabs.captureVisibleTab(windowId, {
    format: "png",
    quality: 90,
  });
  if (!dataUrl || !dataUrl.startsWith("data:image/")) {
    throw new Error("Invalid image data captured");
  }

  // Set only once a crop has really been applied.
  let cropped = false;

  // Crop to element if requested and the element was found
  if (cropToElement && elementRect) {
    const dpr = elementRect.devicePixelRatio || 1;
    const scaledPadding = safePadding * dpr;

    // Load image to get actual dimensions for bounds checking
    const img = new Image();
    await new Promise<void>((resolve, reject) => {
      img.onload = () => resolve();
      img.onerror = () => reject(new Error("Failed to load image for crop"));
      img.src = dataUrl;
    });

    // Padded element box in image pixels, before clipping.
    const boxLeft = Math.round(elementRect.x - scaledPadding);
    const boxTop = Math.round(elementRect.y - scaledPadding);
    const boxRight =
      boxLeft + Math.round(elementRect.width + scaledPadding * 2);
    const boxBottom =
      boxTop + Math.round(elementRect.height + scaledPadding * 2);

    // Clip each edge to the image. Deriving width/height from the clipped
    // edges keeps the right/bottom edge in place when the left/top edge is
    // clamped to 0, instead of growing the crop past the padded box.
    const x = Math.max(0, boxLeft);
    const y = Math.max(0, boxTop);
    const width = Math.min(boxRight, img.width) - x;
    const height = Math.min(boxBottom, img.height) - y;

    // A box entirely outside the capture has no positive area; keep the
    // full capture in that case.
    if (width > 0 && height > 0) {
      dataUrl = await cropImage(dataUrl, { x, y, width, height });
      cropped = true;
    }
  }

  return { dataUrl, cropped, elementFound: elementRect !== null };
}

View File

@@ -1,8 +1,20 @@
import { tool } from "@aipexstudio/aipex-core";
import { z } from "zod";
import { cacheScreenshotMetadata } from "../automation/computer";
import { RuntimeScreenshotStorage } from "../lib/screenshot-storage";
import { getAutomationMode } from "../runtime/automation-mode";
import { getActiveTab } from "./index";
import {
captureVisibleTabWithElementCrop,
MAX_PADDING,
} from "./screenshot-helpers.js";
// Re-export the shared helper types/function so existing consumers aren't broken
export type {
CaptureWithElementCropOptions,
CaptureWithElementCropResult,
} from "./screenshot-helpers.js";
export { captureVisibleTabWithElementCrop } from "./screenshot-helpers.js";
async function compressImage(
dataUrl: string,
@@ -93,15 +105,25 @@ export const captureScreenshotTool = tool({
throw new Error("Invalid image data captured");
}
// Get viewport dimensions for metadata caching
const viewportDimensions = await chrome.scripting.executeScript({
target: { tabId: tab.id },
func: () => ({
width: window.innerWidth,
height: window.innerHeight,
}),
});
const viewport = viewportDimensions[0]?.result;
// Get viewport dimensions for metadata caching (graceful degradation)
let viewport: { width: number; height: number } | undefined;
try {
const viewportDimensions = await chrome.scripting.executeScript({
target: { tabId: tab.id },
func: () => ({
width: window.innerWidth,
height: window.innerHeight,
}),
});
viewport = viewportDimensions[0]?.result ?? undefined;
} catch (e) {
console.warn("[Screenshot] Failed to get viewport dimensions:", e);
// Continue without viewport metadata screenshot still works
}
// Get image dimensions for metadata
let imageWidth = 0;
let imageHeight = 0;
if (sendToLLM) {
// Compress for LLM
@@ -114,6 +136,8 @@ export const captureScreenshotTool = tool({
img.onerror = reject;
img.src = dataUrl;
});
imageWidth = img.width;
imageHeight = img.height;
// Cache screenshot metadata for computer tool
if (viewport) {
@@ -125,12 +149,50 @@ export const captureScreenshotTool = tool({
viewport.height,
);
}
} else {
// Get original image dimensions for non-LLM screenshots
const img = new Image();
await new Promise((resolve, reject) => {
img.onload = resolve;
img.onerror = reject;
img.src = dataUrl;
});
imageWidth = img.width;
imageHeight = img.height;
}
// Save screenshot to IndexedDB and get uid
let screenshotUid: string | undefined;
try {
screenshotUid = await RuntimeScreenshotStorage.saveScreenshot(dataUrl, {
tabId: tab.id,
width: imageWidth,
height: imageHeight,
viewportWidth: viewport?.width ?? 0,
viewportHeight: viewport?.height ?? 0,
});
} catch (err) {
console.error("[Screenshot] Failed to save to IndexedDB:", err);
// Continue even if storage fails
}
if (sendToLLM) {
return {
success: true,
imageData: dataUrl,
sendToLLM: true,
screenshotUid,
tabId: tab.id,
url: tab.url,
title: tab.title,
};
}
return {
success: true,
imageData: sendToLLM ? dataUrl : undefined,
captured: !sendToLLM,
captured: true,
sendToLLM: false,
screenshotUid,
tabId: tab.id,
url: tab.url,
title: tab.title,
@@ -177,15 +239,25 @@ export const captureTabScreenshotTool = tool({
quality: 90,
});
// Get viewport dimensions for metadata caching
const viewportDimensions = await chrome.scripting.executeScript({
target: { tabId },
func: () => ({
width: window.innerWidth,
height: window.innerHeight,
}),
});
const viewport = viewportDimensions[0]?.result;
// Get viewport dimensions for metadata caching (graceful degradation)
let viewport: { width: number; height: number } | undefined;
try {
const viewportDimensions = await chrome.scripting.executeScript({
target: { tabId },
func: () => ({
width: window.innerWidth,
height: window.innerHeight,
}),
});
viewport = viewportDimensions[0]?.result ?? undefined;
} catch (e) {
console.warn("[Screenshot] Failed to get viewport dimensions:", e);
// Continue without viewport metadata screenshot still works
}
// Get image dimensions for metadata
let imageWidth = 0;
let imageHeight = 0;
if (sendToLLM) {
// Compress for LLM
@@ -198,6 +270,8 @@ export const captureTabScreenshotTool = tool({
img.onerror = reject;
img.src = dataUrl;
});
imageWidth = img.width;
imageHeight = img.height;
// Cache screenshot metadata for computer tool
if (viewport) {
@@ -209,12 +283,50 @@ export const captureTabScreenshotTool = tool({
viewport.height,
);
}
} else {
// Get original image dimensions for non-LLM screenshots
const img = new Image();
await new Promise((resolve, reject) => {
img.onload = resolve;
img.onerror = reject;
img.src = dataUrl;
});
imageWidth = img.width;
imageHeight = img.height;
}
// Save screenshot to IndexedDB and get uid
let screenshotUid: string | undefined;
try {
screenshotUid = await RuntimeScreenshotStorage.saveScreenshot(dataUrl, {
tabId,
width: imageWidth,
height: imageHeight,
viewportWidth: viewport?.width ?? 0,
viewportHeight: viewport?.height ?? 0,
});
} catch (err) {
console.error("[Screenshot] Failed to save to IndexedDB:", err);
// Continue even if storage fails
}
if (sendToLLM) {
return {
success: true,
imageData: dataUrl,
sendToLLM: true,
screenshotUid,
tabId,
url: tab.url,
title: tab.title,
};
}
return {
success: true,
imageData: sendToLLM ? dataUrl : undefined,
captured: !sendToLLM,
captured: true,
sendToLLM: false,
screenshotUid,
tabId,
url: tab.url,
title: tab.title,
@@ -222,6 +334,169 @@ export const captureTabScreenshotTool = tool({
},
});
/**
 * Maximum allowed CSS selector length to prevent injection of excessively
 * long strings. Applied to the `selector` parameter of
 * `capture_screenshot_with_highlight` through the zod schema
 * (`.max(MAX_SELECTOR_LENGTH)`).
 */
const MAX_SELECTOR_LENGTH = 500;
// ===================== Tool definition =====================
/**
 * Tool `capture_screenshot_with_highlight`.
 *
 * Captures the visible tab through `captureVisibleTabWithElementCrop`
 * (optionally cropping to the element matched by `selector`), stores the
 * result via `RuntimeScreenshotStorage.saveScreenshot`, and returns either the
 * image data (when it is to be sent to the LLM) or just a confirmation.
 *
 * Parameters:
 * - `selector`: optional CSS selector, at most `MAX_SELECTOR_LENGTH` chars.
 * - `cropToElement`: crop to the matched element plus padding (default false).
 * - `padding`: crop padding in pixels, 0..`MAX_PADDING` (default 50).
 * - `sendToLLM`: send the screenshot to the LLM. `undefined` and `null` both
 *   mean "use the default", which is `true`.
 *
 * Throws when the automation mode is "background" or when no active tab with
 * an id and window id is available. Failures to read the viewport size or to
 * save to IndexedDB are logged and do not abort the capture.
 */
export const captureScreenshotWithHighlightTool = tool({
  name: "capture_screenshot_with_highlight",
  description:
    "Capture screenshot of the current visible tab, optionally highlighting and cropping to a specific element identified by CSS selector. The screenshot is always sent to the LLM for visual analysis. NOTE: This tool requires focus mode.",
  parameters: z.object({
    selector: z
      .string()
      .max(MAX_SELECTOR_LENGTH)
      .optional()
      .describe("CSS selector of element to highlight/focus on"),
    cropToElement: z
      .boolean()
      .optional()
      .default(false)
      .describe(
        "Whether to crop the screenshot to the element region (plus padding)",
      ),
    padding: z
      .number()
      .min(0)
      .max(MAX_PADDING)
      .optional()
      .default(50)
      .describe("Padding around element in pixels when cropping (default: 50)"),
    sendToLLM: z
      .boolean()
      .nullable()
      .optional()
      .default(true)
      .describe(
        "Whether to send the screenshot to LLM for visual analysis. Defaults to true.",
      ),
  }),
  execute: async ({
    selector,
    cropToElement = false,
    padding = 50,
    sendToLLM,
  }) => {
    // The schema allows `null`, and neither zod's `.default()` nor a
    // destructuring default replaces `null` (both only act on `undefined`).
    // Normalise here so an explicit `null` gets the documented default of
    // `true` instead of silently behaving like `false`.
    const shouldSendToLLM = sendToLLM ?? true;

    const mode = await getAutomationMode();
    console.log(
      "🔧 [captureScreenshotWithHighlight] Automation mode:",
      mode,
    );
    if (mode === "background") {
      throw new Error(
        "Screenshot capture is disabled in background mode. Please switch to focus mode to use visual tools.",
      );
    }

    const tab = await getActiveTab();
    if (!tab.id || !tab.windowId) {
      throw new Error("No active tab found");
    }

    // Delegate to shared helper for capture + element crop
    const capture = await captureVisibleTabWithElementCrop({
      tabId: tab.id,
      windowId: tab.windowId,
      tabUrl: tab.url,
      selector,
      cropToElement,
      padding,
    });
    let { dataUrl } = capture;

    // Get viewport dimensions (graceful degradation): a failure here only
    // means the viewport metadata is missing, the screenshot is still usable.
    let viewport: { width: number; height: number } | undefined;
    try {
      const viewportDimensions = await chrome.scripting.executeScript({
        target: { tabId: tab.id },
        func: () => ({
          width: window.innerWidth,
          height: window.innerHeight,
        }),
      });
      viewport = viewportDimensions[0]?.result ?? undefined;
    } catch (e) {
      console.warn(
        "[ScreenshotHighlight] Failed to get viewport dimensions:",
        e,
      );
    }

    if (shouldSendToLLM) {
      // Compress for LLM
      dataUrl = await compressImage(dataUrl, 0.6, 1024);
    }

    // Extract image dimensions of the final (possibly cropped/compressed) image
    const finalImg = new Image();
    await new Promise<void>((resolve, reject) => {
      finalImg.onload = () => resolve();
      finalImg.onerror = () => reject(new Error("Failed to load image"));
      finalImg.src = dataUrl;
    });
    const imageWidth = finalImg.width;
    const imageHeight = finalImg.height;

    // Cache screenshot metadata for computer tool
    // NOTE(review): when `capture.cropped` is true the cached image size is
    // that of the cropped region while the viewport size is for the whole
    // page — confirm that consumers of cacheScreenshotMetadata expect this.
    if (shouldSendToLLM && viewport) {
      cacheScreenshotMetadata(
        tab.id,
        imageWidth,
        imageHeight,
        viewport.width,
        viewport.height,
      );
    }

    // Save screenshot to IndexedDB; the uid stays undefined if storage fails
    let screenshotUid: string | undefined;
    try {
      screenshotUid = await RuntimeScreenshotStorage.saveScreenshot(dataUrl, {
        tabId: tab.id,
        width: imageWidth,
        height: imageHeight,
        viewportWidth: viewport?.width ?? 0,
        viewportHeight: viewport?.height ?? 0,
      });
    } catch (err) {
      console.error(
        "[ScreenshotHighlight] Failed to save to IndexedDB:",
        err,
      );
    }

    if (shouldSendToLLM) {
      return {
        success: true,
        imageData: dataUrl,
        sendToLLM: true,
        screenshotUid,
        tabId: tab.id,
        url: tab.url,
        title: tab.title,
        selector: selector ?? undefined,
        cropped: capture.cropped,
      };
    }

    return {
      success: true,
      captured: true,
      sendToLLM: false,
      screenshotUid,
      tabId: tab.id,
      url: tab.url,
      title: tab.title,
      selector: selector ?? undefined,
      cropped: capture.cropped,
    };
  },
});
export const captureScreenshotToClipboardTool = tool({
name: "capture_screenshot_to_clipboard",
description:
@@ -267,3 +542,83 @@ export const captureScreenshotToClipboardTool = tool({
};
},
});
// ===================== Clipboard image tools (P1) =====================
/**
 * Tool `read_clipboard_image`.
 *
 * Reads the clipboard with `navigator.clipboard.read()` and returns the first
 * `image/*` entry found as a data URL (`{ success: true, imageData }`).
 * Returns `{ success: false, error }` when no image is present or when any
 * step throws.
 */
export const readClipboardImageTool = tool({
  name: "read_clipboard_image",
  description:
    "Read an image from the system clipboard and return it as a base64 data URL. " +
    "Useful for inspecting images the user has copied. Returns an error if no image is present.",
  parameters: z.object({}),
  execute: async () => {
    try {
      for (const entry of await navigator.clipboard.read()) {
        // First image MIME type offered by this clipboard entry, if any
        const imageType = entry.types.find((mime) =>
          mime.startsWith("image/"),
        );
        if (imageType === undefined) {
          continue;
        }

        const imageBlob = await entry.getType(imageType);
        // FileReader is callback based, so wrap it to obtain the data URL
        const imageData = await new Promise<string>((resolve, reject) => {
          const fileReader = new FileReader();
          fileReader.onload = () => resolve(fileReader.result as string);
          fileReader.onerror = () =>
            reject(new Error("Failed to read image data"));
          fileReader.readAsDataURL(imageBlob);
        });
        return { success: true, imageData };
      }
      return { success: false, error: "No image found in clipboard" };
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      return { success: false, error: `Failed to read clipboard: ${reason}` };
    }
  },
});
/**
 * Tool `get_clipboard_image_info`.
 *
 * Reports whether any clipboard entry offers an `image/*` MIME type, and
 * which one is found first, without fetching the image blob itself.
 * Returns `{ success: false, error }` if reading the clipboard throws.
 */
export const getClipboardImageInfoTool = tool({
  name: "get_clipboard_image_info",
  description:
    "Check whether the system clipboard contains an image, and if so return " +
    "its MIME type. Does NOT read the full image data.",
  parameters: z.object({}),
  execute: async () => {
    try {
      const entries = await navigator.clipboard.read();
      // Flatten in entry order so the first match equals the nested-loop result
      const imageType = entries
        .flatMap((entry) => entry.types)
        .find((mime) => mime.startsWith("image/"));

      if (imageType === undefined) {
        return { success: true, hasImage: false };
      }
      return { success: true, hasImage: true, imageType };
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      return { success: false, error: `Failed to read clipboard: ${reason}` };
    }
  },
});

View File

@@ -7,6 +7,7 @@ import type {
SessionTree,
} from "../types.js";
import { generateId } from "../utils/id-generator.js";
import { pruneTransientScreenshotItems } from "../utils/screenshot-shaping.js";
import type { ConversationCompressor } from "./compressor.js";
import { Session } from "./session.js";
@@ -87,7 +88,10 @@ export class ConversationManager {
}
private async doCompress(session: Session): Promise<{ summary: string }> {
const items = await session.getItems();
// Prune transient screenshot user-image messages before compression
// to avoid sending large base64 blobs to the compressor/LLM.
const rawItems = await session.getItems();
const items = pruneTransientScreenshotItems(rawItems);
const { summary, compressedItems } =
await this.compressor!.compressItems(items);

View File

@@ -8,6 +8,11 @@ import type {
SessionSummary,
} from "../types.js";
import { generateId } from "../utils/id-generator.js";
import {
isTransientScreenshotItem,
pruneTransientScreenshotItems,
shapeScreenshotItems,
} from "../utils/screenshot-shaping.js";
function createEmptySessionMetrics(): SessionMetrics {
return {
@@ -53,7 +58,11 @@ export class Session implements OpenAISession {
}
async addItems(items: AgentInputItem[]): Promise<void> {
this.items.push(...items);
// Shape screenshot tool results: strip base64 imageData from the tool
// result and inject a transient user message with the real image so the
// model can consume it via the standard vision path.
const shaped = shapeScreenshotItems(items);
this.items.push(...shaped);
this.metadata["lastActiveAt"] = Date.now();
this.updatePreview();
}
@@ -156,7 +165,12 @@ export class Session implements OpenAISession {
private updatePreview(): void {
const latestUserMessage = [...this.items]
.reverse()
.find((item) => item.type === "message" && item.role === "user");
.find(
(item) =>
item.type === "message" &&
item.role === "user" &&
!isTransientScreenshotItem(item),
);
const previewSource =
this.extractContent(latestUserMessage) ??
@@ -207,7 +221,9 @@ export class Session implements OpenAISession {
toJSON(): SerializedSession {
return {
id: this.id,
items: this.items,
// Prune transient screenshot user-image messages before persisting
// to avoid storing large base64 blobs in conversation history.
items: pruneTransientScreenshotItems(this.items),
metadata: this.metadata,
config: this.config,
metrics: this.sessionMetrics,

View File

@@ -3,3 +3,9 @@
*/
export { CancellationError, CancellationToken } from "./cancellation-token.js";
export {
isTransientScreenshotItem,
pruneTransientScreenshotItems,
shapeScreenshotItems,
TRANSIENT_SCREENSHOT_MARKER,
} from "./screenshot-shaping.js";

View File

@@ -0,0 +1,296 @@
import type { AgentInputItem } from "@openai/agents";
import { describe, expect, it } from "vitest";
import {
isTransientScreenshotItem,
pruneTransientScreenshotItems,
shapeScreenshotItems,
TRANSIENT_SCREENSHOT_MARKER,
} from "./screenshot-shaping.js";
// --- Helpers ---
// Short `data:image/jpeg` URL used as the screenshot payload in fixtures.
const TEST_IMAGE_DATA = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQ==";
// Screenshot uid attached to the fixture tool results.
const TEST_SCREENSHOT_UID = "screenshot_1234567890_abcdefghi";
// Copy of the placeholder text that screenshot-shaping.ts writes in place of
// imageData. The constant there is not exported, so the two must be kept in
// sync by hand.
const PLACEHOLDER = "[Image data removed - see following user message]";
/**
 * Build a `capture_screenshot` function_call_result fixture.
 *
 * The JSON payload defaults to a successful `sendToLLM: true` result carrying
 * the test image; `overrides` are merged on top of those defaults.
 */
function createScreenshotToolResult(
  overrides: Record<string, unknown> = {},
): AgentInputItem {
  const defaults: Record<string, unknown> = {
    success: true,
    imageData: TEST_IMAGE_DATA,
    sendToLLM: true,
    screenshotUid: TEST_SCREENSHOT_UID,
    tabId: 1,
    url: "https://example.com",
    title: "Example",
  };
  const serialized = JSON.stringify({ ...defaults, ...overrides });

  return {
    type: "function_call_result",
    name: "capture_screenshot",
    callId: "call_abc123",
    output: serialized,
  } as AgentInputItem;
}
/**
 * Build a function_call_result fixture for a tool (`get_tabs`) that is not a
 * screenshot tool.
 */
function createNonScreenshotToolResult(): AgentInputItem {
  const payload = { tabs: [{ id: 1, title: "Tab" }] };

  return {
    type: "function_call_result",
    name: "get_tabs",
    callId: "call_other",
    output: JSON.stringify(payload),
  } as AgentInputItem;
}
/** Build a plain-text user message fixture whose content is `text`. */
function createUserMessage(text: string): AgentInputItem {
  const message: AgentInputItem = {
    type: "message",
    role: "user",
    content: text,
  };
  return message;
}
// --- Tests ---
describe("shapeScreenshotItems", () => {
  /** Loose structural view of the injected user message, for assertions only. */
  type InjectedMessage = {
    type: string;
    role: string;
    content: Array<{ type: string; text?: string; image?: string }>;
    providerData?: Record<string, unknown>;
  };

  /** Build a function_call_result item for `toolName` with a JSON payload. */
  const toolResultFor = (
    toolName: string,
    callId: string,
    payload: Record<string, unknown>,
  ): AgentInputItem =>
    ({
      type: "function_call_result",
      name: toolName,
      callId,
      output: JSON.stringify(payload),
    }) as AgentInputItem;

  /** Parse the JSON payload carried by a function_call_result item. */
  const outputOf = (entry: unknown) =>
    JSON.parse((entry as { output: string }).output);

  /** Assert a tool result whose imageData was replaced by the placeholder. */
  const expectStrippedToolResult = (entry: unknown): void => {
    expect((entry as { type: string }).type).toBe("function_call_result");
    const payload = outputOf(entry);
    expect(payload.success).toBe(true);
    expect(payload.imageData).toBe(PLACEHOLDER);
    expect(payload.screenshotUid).toBe(TEST_SCREENSHOT_UID);
    expect(payload.sendToLLM).toBe(true);
  };

  /** Assert a transient user message that carries the test image. */
  const expectTransientImageMessage = (entry: unknown): InjectedMessage => {
    const message = entry as InjectedMessage;
    expect(message.type).toBe("message");
    expect(message.role).toBe("user");
    expect(message.providerData?.[TRANSIENT_SCREENSHOT_MARKER]).toBe(true);
    const imagePart = message.content.find(
      (part) => part.type === "input_image",
    );
    expect(imagePart).toBeTruthy();
    expect((imagePart as { image: string }).image).toBe(TEST_IMAGE_DATA);
    return message;
  };

  /** Assert that shaping a single item returns exactly that item. */
  const expectUnchanged = (entry: AgentInputItem): void => {
    const shaped = shapeScreenshotItems([entry]);
    expect(shaped).toHaveLength(1);
    expect(shaped[0]).toEqual(entry);
  };

  it("should strip imageData and inject transient user image message for sendToLLM=true", () => {
    const shaped = shapeScreenshotItems([createScreenshotToolResult()]);
    expect(shaped).toHaveLength(2);

    // Tool result first, then the injected image message
    expectStrippedToolResult(shaped[0]);
    const message = expectTransientImageMessage(shaped[1]);

    // The injected message also carries a text part next to the image
    const textPart = message.content.find(
      (part) => part.type === "input_text",
    );
    expect(textPart).toBeTruthy();
  });

  it("should pass through items when sendToLLM=false", () => {
    // No user image message may be injected for this result
    expectUnchanged(
      createScreenshotToolResult({
        sendToLLM: false,
        imageData: undefined,
        captured: true,
      }),
    );
  });

  it("should pass through non-screenshot tools unchanged", () => {
    expectUnchanged(createNonScreenshotToolResult());
  });

  it("should pass through non-tool items unchanged", () => {
    expectUnchanged(createUserMessage("hello"));
  });

  it("should handle capture_tab_screenshot the same way", () => {
    const tabShot = {
      ...createScreenshotToolResult(),
      name: "capture_tab_screenshot",
    } as AgentInputItem;

    const shaped = shapeScreenshotItems([tabShot]);
    expect(shaped).toHaveLength(2);
    expect((shaped[0] as { type: string }).type).toBe("function_call_result");
    expect((shaped[1] as { type: string; role: string }).role).toBe("user");
  });

  it("should handle capture_screenshot_with_highlight the same way", () => {
    const highlightShot = toolResultFor(
      "capture_screenshot_with_highlight",
      "call_highlight",
      {
        success: true,
        imageData: TEST_IMAGE_DATA,
        sendToLLM: true,
        screenshotUid: TEST_SCREENSHOT_UID,
        tabId: 1,
        url: "https://example.com",
        title: "Example",
        selector: ".my-element",
        cropped: true,
      },
    );

    const shaped = shapeScreenshotItems([highlightShot]);
    expect(shaped).toHaveLength(2);
    expectStrippedToolResult(shaped[0]);
    expectTransientImageMessage(shaped[1]);
  });

  it("should pass through capture_screenshot_with_highlight when sendToLLM=false", () => {
    // No imageData + sendToLLM=false → pass through unchanged
    expectUnchanged(
      toolResultFor(
        "capture_screenshot_with_highlight",
        "call_highlight_no_llm",
        {
          success: true,
          captured: true,
          sendToLLM: false,
          screenshotUid: TEST_SCREENSHOT_UID,
          tabId: 1,
          selector: ".my-element",
          cropped: true,
        },
      ),
    );
  });

  it("should handle mixed items correctly", () => {
    const shaped = shapeScreenshotItems([
      createUserMessage("Take a screenshot"),
      createNonScreenshotToolResult(),
      createScreenshotToolResult(),
      createUserMessage("What do you see?"),
    ]);

    // Original 4 items + 1 injected user image = 5
    expect(shaped).toHaveLength(5);

    // Order: user, non-screenshot tool, stripped screenshot, user image, user
    const [first, second, third, fourth, fifth] = shaped;
    expect((first as { role: string }).role).toBe("user");
    expect((second as { name: string }).name).toBe("get_tabs");
    expect((third as { type: string }).type).toBe("function_call_result");
    expect(
      (fourth as { providerData?: Record<string, unknown> }).providerData?.[
        TRANSIENT_SCREENSHOT_MARKER
      ],
    ).toBe(true);
    expect((fifth as { role: string }).role).toBe("user");
  });

  it("should handle nested data structure", () => {
    const nestedShot = toolResultFor("capture_screenshot", "call_nested", {
      success: true,
      data: {
        success: true,
        imageData: TEST_IMAGE_DATA,
        sendToLLM: true,
        screenshotUid: TEST_SCREENSHOT_UID,
      },
    });

    const shaped = shapeScreenshotItems([nestedShot]);
    expect(shaped).toHaveLength(2);

    const payload = outputOf(shaped[0]);
    expect(payload.success).toBe(true);
    expect(payload.data.imageData).toBe(PLACEHOLDER);
    expect(payload.data.screenshotUid).toBe(TEST_SCREENSHOT_UID);
  });
});
describe("pruneTransientScreenshotItems", () => {
  it("should remove transient screenshot items", () => {
    const kept = createUserMessage("hello");
    // User image message flagged with the transient marker in providerData
    const dropped = {
      type: "message",
      role: "user",
      content: [
        { type: "input_text", text: "screenshot" },
        { type: "input_image", image: TEST_IMAGE_DATA, detail: "auto" },
      ],
      providerData: { [TRANSIENT_SCREENSHOT_MARKER]: true },
    } as AgentInputItem;

    const remaining = pruneTransientScreenshotItems([kept, dropped]);

    expect(remaining).toHaveLength(1);
    expect(remaining[0]).toEqual(kept);
  });

  it("should keep all items when no transients exist", () => {
    const remaining = pruneTransientScreenshotItems([
      createUserMessage("a"),
      createUserMessage("b"),
    ]);

    expect(remaining).toHaveLength(2);
  });
});
describe("isTransientScreenshotItem", () => {
  it("should return true for transient items", () => {
    // A user message that additionally carries the transient marker
    const marked = {
      ...createUserMessage("test"),
      providerData: { [TRANSIENT_SCREENSHOT_MARKER]: true },
    } as unknown as AgentInputItem;

    expect(isTransientScreenshotItem(marked)).toBe(true);
  });

  it("should return false for normal items", () => {
    const plain = createUserMessage("hello");

    expect(isTransientScreenshotItem(plain)).toBe(false);
  });
});

View File

@@ -0,0 +1,199 @@
/**
* Screenshot message shaping utilities.
*
* When a screenshot tool returns `sendToLLM=true`, the large base64 imageData
* must NOT be sent inside the function_call_result output (models may not
* support images there, and it bloats token counts).
*
* Instead, the imageData is:
* 1. Stripped from the tool result (replaced with a placeholder string).
* 2. Injected as a follow-up user message with `input_image` content.
*
* This matches the message flow used in the original aipex codebase.
*/
import type { AgentInputItem } from "@openai/agents";
import { safeJsonParse } from "./json.js";
/**
 * Tool names whose results may include screenshot image data.
 * `shapeScreenshotItems` only inspects `function_call_result` items whose
 * `name` is in this set; results from every other tool pass through.
 */
const SCREENSHOT_TOOL_NAMES = new Set([
"capture_screenshot",
"capture_screenshot_with_highlight",
"capture_tab_screenshot",
]);
/**
 * Placeholder that replaces imageData in the tool result once the real image
 * has been moved into the follow-up user message.
 */
const IMAGE_DATA_PLACEHOLDER =
"[Image data removed - see following user message]";
/**
 * Marker on transient user-image messages so they can be pruned.
 * It is set as a key (with value `true`) on the message's `providerData`.
 */
export const TRANSIENT_SCREENSHOT_MARKER = "__transient_screenshot__";
/**
 * Process a batch of AgentInputItems. For any `function_call_result` from
 * a screenshot tool that contains `imageData` with `sendToLLM=true`:
 * - Replace imageData with a placeholder in the tool result.
 * - Insert a transient user message with the real image right after.
 *
 * Everything else is appended to the result as the same object, in the
 * original order: items that are not `function_call_result`, results from
 * tools outside `SCREENSHOT_TOOL_NAMES`, results whose output cannot be
 * parsed, and results without a qualifying image.
 *
 * @param items Items to shape; the input array is not modified.
 * @returns A new array with stripped tool results and injected image messages.
 */
export function shapeScreenshotItems(
  items: AgentInputItem[],
): AgentInputItem[] {
  // Text part that accompanies the image in every injected user message
  const messageText = "Here is the screenshot you requested:";

  const result: AgentInputItem[] = [];
  for (const item of items) {
    if (item.type !== "function_call_result") {
      result.push(item);
      continue;
    }

    const funcResult = item as {
      type: "function_call_result";
      name: string;
      callId: string;
      output: string;
      [key: string]: unknown;
    };
    if (!SCREENSHOT_TOOL_NAMES.has(funcResult.name)) {
      result.push(item);
      continue;
    }

    // Try to parse the output and extract imageData
    const parsed = safeJsonParse<Record<string, unknown>>(funcResult.output);
    if (!parsed) {
      result.push(item);
      continue;
    }
    const extracted = extractImageData(parsed);
    if (!extracted) {
      // No sendToLLM image data: pass through
      result.push(item);
      continue;
    }

    // 1. Rewrite the tool result with imageData stripped
    const strippedOutput = buildStrippedOutput(parsed, extracted.screenshotUid);
    const strippedItem: AgentInputItem = {
      ...item,
      output: JSON.stringify(strippedOutput),
    } as AgentInputItem;
    result.push(strippedItem);

    // 2. Insert a transient user message carrying the real image
    const userImageMessage: AgentInputItem = {
      type: "message",
      role: "user",
      content: [
        { type: "input_text", text: messageText },
        {
          type: "input_image",
          image: extracted.imageData,
          detail: "auto",
        },
      ],
      // Mark as transient so it can be pruned before persistence/compression
      providerData: { [TRANSIENT_SCREENSHOT_MARKER]: true },
    } as AgentInputItem;
    result.push(userImageMessage);
  }
  return result;
}
/**
 * Return a copy of `items` without the transient screenshot user-image
 * messages, i.e. without every item whose `providerData` carries a truthy
 * `TRANSIENT_SCREENSHOT_MARKER`. Used before persistence or compression.
 */
export function pruneTransientScreenshotItems(
  items: AgentInputItem[],
): AgentInputItem[] {
  return items.filter((candidate) => !isTransientScreenshotItem(candidate));
}
/**
 * Tell whether `item` is a transient screenshot user-image message, meaning
 * its `providerData` holds a truthy `TRANSIENT_SCREENSHOT_MARKER` entry.
 */
export function isTransientScreenshotItem(item: AgentInputItem): boolean {
  const { providerData } = item as {
    providerData?: Record<string, unknown>;
  };
  return Boolean(providerData?.[TRANSIENT_SCREENSHOT_MARKER]);
}
// ===================== Internal helpers =====================
/** Image payload pulled out of a screenshot tool result. */
interface ExtractedImage {
  /** The `data:image/...` URL found in the tool output. */
  imageData: string;
  /** The tool-provided screenshot uid, when it is a string. */
  screenshotUid?: string;
}

/**
 * Pull the image out of a parsed screenshot tool output.
 *
 * Two layouts are accepted:
 *   { success, data: { imageData, sendToLLM, screenshotUid } }
 *   { success, imageData, sendToLLM, screenshotUid }
 * When `data` is present (not null/undefined) only `data` is inspected.
 *
 * Returns null unless `success` is truthy, `sendToLLM` is exactly `true`
 * and `imageData` is a string starting with "data:image/".
 */
function extractImageData(
  parsed: Record<string, unknown>,
): ExtractedImage | null {
  if (!parsed.success) {
    return null;
  }

  // Prefer the nested payload when the output uses a `data` wrapper
  const payload =
    (parsed.data as Record<string, unknown> | undefined) ?? parsed;
  const { sendToLLM, imageData, screenshotUid } = payload;

  if (sendToLLM !== true) {
    return null;
  }
  if (typeof imageData !== "string") {
    return null;
  }
  if (!imageData.startsWith("data:image/")) {
    return null;
  }

  const uid = typeof screenshotUid === "string" ? screenshotUid : undefined;
  return { imageData, screenshotUid: uid };
}
/**
 * Build the stripped tool output object (imageData replaced with placeholder).
 *
 * The payload is `parsed.data` when a `data` wrapper is present, otherwise
 * `parsed` itself. All fields of the payload are kept; only `imageData` is
 * replaced, and `screenshotUid` is set when one is provided.
 *
 * With a `data` wrapper, the other top-level fields of `parsed` are kept as
 * well, so shaping a result does not silently drop information that sat
 * next to `data`.
 *
 * @param parsed Parsed tool output.
 * @param screenshotUid Uid to record on the payload, if any.
 * @returns A new object with `success: true`; `parsed` is not modified.
 */
function buildStrippedOutput(
  parsed: Record<string, unknown>,
  screenshotUid?: string,
): Record<string, unknown> {
  const data = parsed.data as Record<string, unknown> | undefined;
  const actual = data ?? parsed;
  const stripped: Record<string, unknown> = {
    ...actual,
    imageData: IMAGE_DATA_PLACEHOLDER,
  };
  if (screenshotUid) {
    stripped.screenshotUid = screenshotUid;
  }

  // If there was a `data` wrapper, preserve it together with its siblings
  if (data) {
    const wrapper: Record<string, unknown> = {
      ...parsed,
      success: true,
      data: stripped,
    };
    // Keeping siblings must not let a duplicated top-level image blob through
    if (typeof wrapper.imageData === "string") {
      wrapper.imageData = IMAGE_DATA_PLACEHOLDER;
    }
    return wrapper;
  }
  return { success: true, ...stripped };
}