mirror of
https://github.com/AIPexStudio/AIPex.git
synced 2026-05-13 18:51:35 +00:00
feat: enhance screenshot handling and display in chatbot components
- Added functionality to extract and apply screenshots from tool results in the ChatAdapter, improving user experience by allowing immediate rendering of screenshots. - Implemented a new ToolScreenshot component to display screenshots inline, supporting both base64 data and IndexedDB references. - Updated message item rendering to transform screenshot placeholders into actual images, enhancing the visual feedback for users. - Introduced collapsible message items for intermediate assistant messages, improving the organization of conversation turns in the message list. - Enhanced model fetching logic in the Chatbot component to ensure server models are prioritized, improving model selection reliability. - Updated localization files to include new translation keys for improved user guidance.
This commit is contained in:
@@ -1,5 +1,11 @@
|
||||
import type { AgentEvent } from "@aipexstudio/aipex-core";
|
||||
import { generateId } from "@aipexstudio/aipex-core";
|
||||
import { ScreenshotStorage } from "../lib/screenshot-storage";
|
||||
import {
|
||||
extractScreenshotFromToolResult,
|
||||
isCaptureScreenshotTool,
|
||||
type ScreenshotExtraction,
|
||||
} from "../lib/screenshot-utils";
|
||||
import type {
|
||||
ChatAdapterOptions,
|
||||
ChatAdapterState,
|
||||
@@ -412,6 +418,18 @@ export class ChatAdapter {
|
||||
return;
|
||||
}
|
||||
|
||||
// Extract screenshot data from screenshot tools
|
||||
if (isCaptureScreenshotTool(toolName)) {
|
||||
const screenshotInfo = extractScreenshotFromToolResult(
|
||||
toolName,
|
||||
result,
|
||||
);
|
||||
if (screenshotInfo) {
|
||||
this.applyScreenshotToolResult(callId, result, screenshotInfo);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
this.updateToolPart(callId, (toolPart) => ({
|
||||
...toolPart,
|
||||
state: "completed",
|
||||
@@ -419,6 +437,59 @@ export class ChatAdapter {
|
||||
}));
|
||||
}
|
||||
|
||||
/**
 * Handle a completed screenshot tool result.
 *
 * Marks the tool part identified by `callId` as completed and attaches
 * whatever screenshot information was extracted:
 *
 * - `info.screenshotUid` present: the uid supplied by the tool is stored on
 *   the part as-is (no new uid is generated); inline image data, when also
 *   present, is kept so the screenshot can render immediately.
 * - only `info.imageData` present: the inline data is attached right away and
 *   then saved through `ScreenshotStorage`; the part receives the resulting
 *   uid once that save resolves.
 * - neither present: the part is simply completed with the raw output.
 *
 * @param callId - Identifier of the tool call whose part is updated.
 * @param result - Raw tool result, stored unchanged as the part's `output`.
 * @param info - Screenshot data extracted from `result`.
 */
private applyScreenshotToolResult(
  callId: string,
  result: unknown,
  info: ScreenshotExtraction,
): void {
  // Read both fields once so the closures below work with narrowed locals
  // instead of re-reading `info` behind non-null assertions.
  const { screenshotUid, imageData } = info;

  this.updateToolPart(callId, (toolPart) => ({
    ...toolPart,
    state: "completed",
    output: result,
    ...(screenshotUid ? { screenshotUid } : {}),
    // Keep inline screenshot for immediate rendering if base64 is present
    ...(imageData ? { screenshot: imageData } : {}),
  }));

  // Nothing left to persist: either a uid was already provided, or there is
  // no image data at all.
  if (screenshotUid || !imageData) {
    return;
  }

  // Fallback: no uid was provided — save in UI and attach the uid afterwards.
  ScreenshotStorage.saveScreenshot(imageData)
    .then((uid) => {
      this.updateToolPart(callId, (toolPart) => ({
        ...toolPart,
        screenshotUid: uid,
      }));
    })
    .catch(() => {
      // Storage failed — screenshot still visible via inline data
    });
}
|
||||
|
||||
/**
|
||||
* Check if a tool result indicates a business-level failure.
|
||||
* Many tools return { success: false, error: "..." } instead of throwing.
|
||||
|
||||
@@ -9,7 +9,8 @@ import {
|
||||
WrenchIcon,
|
||||
XCircleIcon,
|
||||
} from "lucide-react";
|
||||
import type { ComponentProps, ReactNode } from "react";
|
||||
import { type ComponentProps, type ReactNode, useEffect, useState } from "react";
|
||||
import { ScreenshotStorage } from "../../lib/screenshot-storage";
|
||||
import { cn } from "../../lib/utils";
|
||||
import { Badge } from "../ui/badge";
|
||||
import {
|
||||
@@ -29,7 +30,8 @@ export const Tool = ({ className, ...props }: ToolProps) => (
|
||||
);
|
||||
|
||||
export type ToolHeaderProps = {
|
||||
type: ToolUIPart["type"];
|
||||
/** Display label for the tool – either a raw `tool-${name}` key or a translated name */
|
||||
type: string;
|
||||
state: ToolUIPart["state"] | "executing";
|
||||
className?: string;
|
||||
};
|
||||
@@ -154,3 +156,78 @@ export const ToolOutput = ({
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
// ============ Screenshot Display ============
|
||||
|
||||
export type ToolScreenshotProps = ComponentProps<"div"> & {
|
||||
/** Inline base64 screenshot data URL */
|
||||
screenshot?: string;
|
||||
/** UID referencing a screenshot stored in ScreenshotStorage (IndexedDB) */
|
||||
screenshotUid?: string;
|
||||
};
|
||||
|
||||
/**
 * ToolScreenshot – renders a screenshot captured by a tool.
 *
 * Supports both inline base64 data and IndexedDB uid references. Inline
 * `screenshot` data always wins; `screenshotUid` is only resolved through
 * `ScreenshotStorage.getScreenshot` when no inline data is given. Renders
 * nothing when neither prop is set.
 *
 * While a uid lookup is running a loading row is shown; a failed or empty
 * lookup shows an error message instead of the image.
 */
export const ToolScreenshot = ({
  className,
  screenshot,
  screenshotUid,
  ...props
}: ToolScreenshotProps) => {
  const [imageData, setImageData] = useState<string | null>(
    screenshot ?? null,
  );
  const [loading, setLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);

  useEffect(() => {
    // Prefer inline screenshot. Also clear any loading/error state left over
    // from an earlier uid lookup, otherwise the stale error would keep
    // hiding the image that is now available.
    if (screenshot) {
      setImageData(screenshot);
      setLoading(false);
      setError(null);
      return undefined;
    }

    if (!screenshotUid) {
      // Nothing to show; make sure a cancelled lookup does not leave the
      // loading flag stuck.
      setLoading(false);
      return undefined;
    }

    // Load from IndexedDB by uid. `cancelled` guards against a lookup that
    // resolves after the props changed or the component unmounted.
    let cancelled = false;
    setLoading(true);
    setError(null);

    void ScreenshotStorage.getScreenshot(screenshotUid)
      .then((data) => {
        if (cancelled) return;
        setImageData(data);
        if (!data) setError("Screenshot not found");
      })
      .catch(() => {
        if (!cancelled) setError("Failed to load screenshot");
      })
      .finally(() => {
        if (!cancelled) setLoading(false);
      });

    return () => {
      cancelled = true;
    };
  }, [screenshot, screenshotUid]);

  if (!screenshot && !screenshotUid) return null;

  return (
    <div className={cn("space-y-2 p-4", className)} {...props}>
      <h4 className="font-medium text-muted-foreground text-xs uppercase tracking-wide">
        Screenshot
      </h4>
      {loading ? (
        <div className="flex items-center gap-2 text-muted-foreground text-sm">
          <ClockIcon className="size-4 animate-spin" />
          <span>Loading screenshot...</span>
        </div>
      ) : error ? (
        <div className="text-destructive text-sm">{error}</div>
      ) : imageData ? (
        <img
          src={imageData}
          alt="Screenshot"
          className="cursor-pointer rounded-md max-w-full"
        />
      ) : null}
    </div>
  );
};
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { useCallback, useContext, useMemo, useState } from "react";
|
||||
import { useCallback, useContext, useEffect, useMemo, useState } from "react";
|
||||
import { useChat, useChatConfig } from "../../../hooks";
|
||||
import { useTranslation } from "../../../i18n/context";
|
||||
import { fetchModelsForSelector } from "../../../lib/models";
|
||||
import { cn } from "../../../lib/utils";
|
||||
import type { ChatbotThemeVariables, ContextItem } from "../../../types";
|
||||
import { DEFAULT_MODELS } from "../constants";
|
||||
@@ -237,6 +238,27 @@ function ChatbotContent({
|
||||
const [inputResetCount, setInputResetCount] = useState(0);
|
||||
const [isUxAuditDialogOpen, setIsUxAuditDialogOpen] = useState(false);
|
||||
|
||||
// Fetch server model list on mount, fall back to prop-provided models
|
||||
const [fetchedModels, setFetchedModels] = useState<
|
||||
Array<{ name: string; value: string }> | null
|
||||
>(null);
|
||||
useEffect(() => {
|
||||
let cancelled = false;
|
||||
fetchModelsForSelector()
|
||||
.then((serverModels) => {
|
||||
if (!cancelled && serverModels.length > 0) {
|
||||
setFetchedModels(serverModels);
|
||||
}
|
||||
})
|
||||
.catch(() => {
|
||||
// Fallback to prop-provided models — already used below
|
||||
});
|
||||
return () => {
|
||||
cancelled = true;
|
||||
};
|
||||
}, []);
|
||||
const effectiveModels = fetchedModels ?? models;
|
||||
|
||||
const handleSubmit = useCallback(
|
||||
(text: string, files?: File[], contexts?: ContextItem[]) => {
|
||||
void sendMessage?.(text, files, contexts);
|
||||
@@ -318,7 +340,7 @@ function ChatbotContent({
|
||||
onSubmit={handleSubmit}
|
||||
onStop={interrupt}
|
||||
status={status || "idle"}
|
||||
models={models}
|
||||
models={effectiveModels}
|
||||
placeholderTexts={placeholderTexts}
|
||||
/>
|
||||
</>
|
||||
|
||||
@@ -1,7 +1,15 @@
|
||||
import { CopyIcon, RefreshCcwIcon } from "lucide-react";
|
||||
import { Fragment } from "react";
|
||||
import { CopyIcon, RefreshCcwIcon, WrenchIcon } from "lucide-react";
|
||||
import { Fragment, useMemo } from "react";
|
||||
import { useTranslation } from "../../../i18n/context";
|
||||
import { translatedToolName } from "../../../i18n/tool-names";
|
||||
import { transformScreenshotPlaceholders } from "../../../lib/screenshot-utils";
|
||||
import { cn } from "../../../lib/utils";
|
||||
import type { MessageItemProps, UISourceUrlPart } from "../../../types";
|
||||
import type {
|
||||
MessageItemProps,
|
||||
UIMessage,
|
||||
UISourceUrlPart,
|
||||
UIToolPart,
|
||||
} from "../../../types";
|
||||
import { Action, Actions } from "../../ai-elements/actions";
|
||||
import { Message, MessageContent } from "../../ai-elements/message";
|
||||
import {
|
||||
@@ -55,6 +63,21 @@ export function DefaultMessageItem({
|
||||
return null;
|
||||
}
|
||||
|
||||
// Collect screenshot data from tool parts for placeholder resolution
|
||||
const { screenshotUidList, screenshotDataMap } = useMemo(() => {
|
||||
const uids: string[] = [];
|
||||
const dataMap = new Map<string, string>();
|
||||
for (const p of message.parts) {
|
||||
if (p.type === "tool" && p.screenshotUid) {
|
||||
uids.push(p.screenshotUid);
|
||||
if (p.screenshot) {
|
||||
dataMap.set(p.screenshotUid, p.screenshot);
|
||||
}
|
||||
}
|
||||
}
|
||||
return { screenshotUidList: uids, screenshotDataMap: dataMap };
|
||||
}, [message.parts]);
|
||||
|
||||
// Render sources if present
|
||||
const sourceUrls = message.parts.filter(
|
||||
(part): part is UISourceUrlPart => part.type === "source-url",
|
||||
@@ -79,12 +102,27 @@ export function DefaultMessageItem({
|
||||
const key = `${message.id}-${i}`;
|
||||
|
||||
switch (part.type) {
|
||||
case "text":
|
||||
case "text": {
|
||||
// Transform [[screenshot:...]] placeholders to markdown images.
|
||||
// First resolve to special URLs, then replace with actual
|
||||
// base64 data URLs when available for inline rendering.
|
||||
let processedText = part.text;
|
||||
if (screenshotUidList.length > 0) {
|
||||
processedText = transformScreenshotPlaceholders(
|
||||
processedText,
|
||||
screenshotUidList,
|
||||
);
|
||||
// Replace aipex-screenshot.invalid URLs with actual data
|
||||
for (const [uid, data] of screenshotDataMap) {
|
||||
const placeholder = `https://aipex-screenshot.invalid/${uid}`;
|
||||
processedText = processedText.split(placeholder).join(data);
|
||||
}
|
||||
}
|
||||
return (
|
||||
<Fragment key={key}>
|
||||
<Message from={message.role as "user" | "assistant" | "system"}>
|
||||
<MessageContent>
|
||||
<Response>{part.text}</Response>
|
||||
<Response>{processedText}</Response>
|
||||
</MessageContent>
|
||||
</Message>
|
||||
{/* Actions for last assistant message */}
|
||||
@@ -112,6 +150,7 @@ export function DefaultMessageItem({
|
||||
))}
|
||||
</Fragment>
|
||||
);
|
||||
}
|
||||
|
||||
case "file":
|
||||
return (
|
||||
@@ -241,6 +280,59 @@ export function DefaultMessageItem({
|
||||
);
|
||||
}
|
||||
|
||||
// ============ Collapsed tool display for folded messages ============
|
||||
|
||||
/**
 * One-line summary of a tool call: a wrench icon followed by the tool's
 * translated display name.
 */
function CollapsedToolDisplay({ tool }: { tool: UIToolPart }) {
  const translation = useTranslation();
  const label = translatedToolName(translation.t, tool.toolName);
  const rowClassName =
    "text-xs text-muted-foreground py-1 px-2 flex items-center gap-1.5";

  return (
    <div className={rowClassName}>
      <WrenchIcon className="size-3" />
      {label}
    </div>
  );
}
|
||||
|
||||
// ============ Collapsed message item for intermediate assistant messages ============
|
||||
|
||||
/**
 * CollapsedMessageItem – compact rendering of one assistant message.
 *
 * Each part of the message becomes at most one row:
 * - `text` parts are shown as a dash-prefixed line,
 * - `tool` parts are shown through `CollapsedToolDisplay`,
 * - `reasoning` parts are shown in italics, cut to 120 characters plus an
 *   ellipsis when longer.
 * Every other part type renders nothing.
 */
export function CollapsedMessageItem({ message }: { message: UIMessage }) {
  const rows = message.parts.map((part, index) => {
    const rowKey = `${message.id}-collapsed-${index}`;

    if (part.type === "text") {
      return (
        <div key={rowKey} className="text-sm text-muted-foreground py-1">
          - {part.text}
        </div>
      );
    }

    if (part.type === "tool") {
      return <CollapsedToolDisplay key={rowKey} tool={part} />;
    }

    if (part.type === "reasoning") {
      const preview =
        part.text.length > 120 ? `${part.text.slice(0, 120)}…` : part.text;
      return (
        <div
          key={rowKey}
          className="text-xs text-muted-foreground/70 py-0.5 italic"
        >
          {preview}
        </div>
      );
    }

    return null;
  });

  return <div>{rows}</div>;
}
|
||||
|
||||
/**
|
||||
* MessageItem - Renders either custom or default message item
|
||||
*/
|
||||
|
||||
@@ -1,15 +1,55 @@
|
||||
import { BrainIcon, ChevronDownIcon } from "lucide-react";
|
||||
import { useMemo } from "react";
|
||||
import { useTranslation } from "../../../i18n/context";
|
||||
import { cn } from "../../../lib/utils";
|
||||
import type { MessageListProps } from "../../../types";
|
||||
import type { MessageListProps, UIMessage } from "../../../types";
|
||||
import {
|
||||
Conversation,
|
||||
ConversationContent,
|
||||
ConversationScrollButton,
|
||||
} from "../../ai-elements/conversation";
|
||||
import { Loader } from "../../ai-elements/loader";
|
||||
import {
|
||||
Collapsible,
|
||||
CollapsibleContent,
|
||||
CollapsibleTrigger,
|
||||
} from "../../ui/collapsible";
|
||||
import { useComponentsContext } from "../context";
|
||||
import { MessageItem } from "./message-item";
|
||||
import { CollapsedMessageItem, MessageItem } from "./message-item";
|
||||
import { WelcomeScreen } from "./welcome-screen";
|
||||
|
||||
/**
 * One conversation turn: an optional user message plus every assistant
 * message that arrived before the next user message.
 */
interface ConversationTurn {
  userMessage?: UIMessage;
  assistantMessages: UIMessage[];
}

/**
 * Split a flat, ordered message list into conversation turns.
 *
 * A user message always opens a new turn. Assistant messages are appended to
 * the turn that is currently open; if none is open yet (the list starts with
 * an assistant message) a turn without a user message is created for them.
 * Messages with any other role are left out of the result.
 */
function groupIntoTurns(messages: UIMessage[]): ConversationTurn[] {
  const grouped: ConversationTurn[] = [];
  let openTurn: ConversationTurn | undefined;

  for (const entry of messages) {
    switch (entry.role) {
      case "user":
        // Close the previous turn before starting the next one.
        if (openTurn !== undefined) {
          grouped.push(openTurn);
        }
        openTurn = { userMessage: entry, assistantMessages: [] };
        break;
      case "assistant":
        if (openTurn === undefined) {
          openTurn = { assistantMessages: [] };
        }
        openTurn.assistantMessages.push(entry);
        break;
      default:
        break;
    }
  }

  // Flush the turn that was still open when the list ended.
  if (openTurn !== undefined) {
    grouped.push(openTurn);
  }
  return grouped;
}
|
||||
|
||||
/**
|
||||
* Default MessageList component
|
||||
*/
|
||||
@@ -27,10 +67,18 @@ export function DefaultMessageList({
|
||||
onUxAuditClick?: () => void;
|
||||
}) {
|
||||
const { slots } = useComponentsContext();
|
||||
const { t } = useTranslation();
|
||||
|
||||
// Filter out system messages for display
|
||||
const displayMessages = messages.filter((m) => m.role !== "system");
|
||||
|
||||
// Group into conversation turns for folding
|
||||
const turns = useMemo(() => groupIntoTurns(displayMessages), [displayMessages]);
|
||||
|
||||
// Determine if a message is the very last display message
|
||||
const lastMessage = displayMessages[displayMessages.length - 1];
|
||||
const lastMessageId = lastMessage?.id ?? null;
|
||||
|
||||
return (
|
||||
<div className={cn("flex-1 overflow-hidden", className)} {...props}>
|
||||
<Conversation className="h-full">
|
||||
@@ -45,15 +93,78 @@ export function DefaultMessageList({
|
||||
onUxAuditClick={onUxAuditClick}
|
||||
/>
|
||||
) : (
|
||||
displayMessages.map((message, index) => (
|
||||
<MessageItem
|
||||
key={message.id}
|
||||
message={message}
|
||||
isLast={index === displayMessages.length - 1}
|
||||
isStreaming={status === "streaming"}
|
||||
onRegenerate={onRegenerate}
|
||||
onCopy={onCopy}
|
||||
/>
|
||||
turns.map((turn, turnIndex) => (
|
||||
<div key={`turn-${turnIndex}`}>
|
||||
{/* Render user message */}
|
||||
{turn.userMessage && (
|
||||
<MessageItem
|
||||
key={turn.userMessage.id}
|
||||
message={turn.userMessage}
|
||||
isLast={turn.userMessage.id === lastMessageId}
|
||||
isStreaming={status === "streaming"}
|
||||
onRegenerate={onRegenerate}
|
||||
onCopy={onCopy}
|
||||
/>
|
||||
)}
|
||||
|
||||
{/* Render assistant messages with folding */}
|
||||
{turn.assistantMessages.length > 1 ? (
|
||||
(() => {
|
||||
const finalMsg =
|
||||
turn.assistantMessages[
|
||||
turn.assistantMessages.length - 1
|
||||
]!;
|
||||
return (
|
||||
<>
|
||||
{/* Intermediate messages – collapsed by default */}
|
||||
<Collapsible defaultOpen={false} className="mb-2">
|
||||
<CollapsibleTrigger className="flex w-full cursor-pointer items-center gap-2 rounded-md border border-muted bg-muted/30 px-3 py-2 text-sm text-muted-foreground transition-colors hover:bg-muted/50 hover:text-foreground">
|
||||
<BrainIcon className="size-4" />
|
||||
<span className="flex-1 text-left">
|
||||
{t("common.showThinkingDetails")}
|
||||
</span>
|
||||
<ChevronDownIcon className="size-4 transition-transform [[data-state=open]>&]:rotate-180" />
|
||||
</CollapsibleTrigger>
|
||||
<CollapsibleContent className="mt-2">
|
||||
<div className="rounded-md border border-muted/50 bg-muted/10 p-3 space-y-2">
|
||||
{turn.assistantMessages
|
||||
.slice(0, -1)
|
||||
.map((msg) => (
|
||||
<CollapsedMessageItem
|
||||
key={msg.id}
|
||||
message={msg}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
</CollapsibleContent>
|
||||
</Collapsible>
|
||||
|
||||
{/* Final assistant message – always expanded */}
|
||||
<MessageItem
|
||||
key={finalMsg.id}
|
||||
message={finalMsg}
|
||||
isLast={finalMsg.id === lastMessageId}
|
||||
isStreaming={status === "streaming"}
|
||||
onRegenerate={onRegenerate}
|
||||
onCopy={onCopy}
|
||||
/>
|
||||
</>
|
||||
);
|
||||
})()
|
||||
) : (
|
||||
// Single assistant message – render normally
|
||||
turn.assistantMessages.map((msg) => (
|
||||
<MessageItem
|
||||
key={msg.id}
|
||||
message={msg}
|
||||
isLast={msg.id === lastMessageId}
|
||||
isStreaming={status === "streaming"}
|
||||
onRegenerate={onRegenerate}
|
||||
onCopy={onCopy}
|
||||
/>
|
||||
))
|
||||
)}
|
||||
</div>
|
||||
))
|
||||
)}
|
||||
{/* Loading indicator */}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import type React from "react";
|
||||
import { useEffect, useState } from "react";
|
||||
import { useCallback, useEffect, useState } from "react";
|
||||
import { fetchModelsForPrompt } from "../../../lib/models";
|
||||
|
||||
export interface ModelInfo {
|
||||
id: string;
|
||||
@@ -46,24 +47,36 @@ export const ModelChangePrompt: React.FC<ModelChangePromptProps> = ({
|
||||
const [allModels, setAllModels] = useState<ModelInfo[]>(availableModels);
|
||||
const [isLoadingModels, setIsLoadingModels] = useState(false);
|
||||
|
||||
// Fetch models from API
|
||||
useEffect(() => {
|
||||
const loadModels = async () => {
|
||||
if (!onFetchModels) return;
|
||||
// Resolve the fetch function: use the provided callback or fall back to
|
||||
// the built-in fetchModelsForPrompt so models are always loaded.
|
||||
const resolvedFetch = useCallback(
|
||||
() => (onFetchModels ? onFetchModels() : fetchModelsForPrompt()),
|
||||
[onFetchModels],
|
||||
);
|
||||
|
||||
// Fetch models from API (always runs — no longer gated on onFetchModels)
|
||||
useEffect(() => {
|
||||
let cancelled = false;
|
||||
const loadModels = async () => {
|
||||
setIsLoadingModels(true);
|
||||
try {
|
||||
const fetchedModels = await onFetchModels();
|
||||
setAllModels(fetchedModels);
|
||||
} catch (error) {
|
||||
console.error("Failed to load models:", error);
|
||||
const fetched = await resolvedFetch();
|
||||
if (!cancelled) {
|
||||
setAllModels(fetched);
|
||||
}
|
||||
} catch (_error) {
|
||||
// Keep using availableModels as fallback
|
||||
} finally {
|
||||
setIsLoadingModels(false);
|
||||
if (!cancelled) {
|
||||
setIsLoadingModels(false);
|
||||
}
|
||||
}
|
||||
};
|
||||
loadModels();
|
||||
}, [onFetchModels]);
|
||||
return () => {
|
||||
cancelled = true;
|
||||
};
|
||||
}, [resolvedFetch]);
|
||||
|
||||
// Update models when availableModels prop changes
|
||||
useEffect(() => {
|
||||
|
||||
@@ -4,6 +4,8 @@ import {
|
||||
WrenchIcon,
|
||||
XCircleIcon,
|
||||
} from "lucide-react";
|
||||
import { useTranslation } from "../../../../i18n/context";
|
||||
import { translatedToolName } from "../../../../i18n/tool-names";
|
||||
import { cn } from "../../../../lib/utils";
|
||||
import type { ToolDisplaySlotProps } from "../../../../types";
|
||||
import { Response } from "../../../ai-elements/response";
|
||||
@@ -13,6 +15,7 @@ import {
|
||||
ToolHeader,
|
||||
ToolInput,
|
||||
ToolOutput,
|
||||
ToolScreenshot,
|
||||
} from "../../../ai-elements/tool";
|
||||
import {
|
||||
Collapsible,
|
||||
@@ -26,13 +29,15 @@ import { formatToolOutput, mapToolState } from "../../tools";
|
||||
* Opens by default when there's an error so users can see the failure reason
|
||||
*/
|
||||
export function DefaultToolDisplay({ tool }: ToolDisplaySlotProps) {
|
||||
const { t } = useTranslation();
|
||||
const displayName = translatedToolName(t, tool.toolName);
|
||||
// Expand by default when in error state to make failure reasons visible
|
||||
const shouldExpandByDefault = tool.state === "error";
|
||||
|
||||
return (
|
||||
<Tool defaultOpen={shouldExpandByDefault}>
|
||||
<ToolHeader
|
||||
type={`tool-${tool.toolName}`}
|
||||
type={displayName}
|
||||
state={mapToolState(tool.state)}
|
||||
/>
|
||||
<ToolContent>
|
||||
@@ -45,6 +50,10 @@ export function DefaultToolDisplay({ tool }: ToolDisplaySlotProps) {
|
||||
}
|
||||
errorText={tool.errorText}
|
||||
/>
|
||||
<ToolScreenshot
|
||||
screenshot={tool.screenshot}
|
||||
screenshotUid={tool.screenshotUid}
|
||||
/>
|
||||
</ToolContent>
|
||||
</Tool>
|
||||
);
|
||||
@@ -55,6 +64,8 @@ export function DefaultToolDisplay({ tool }: ToolDisplaySlotProps) {
|
||||
* Opens by default when there's an error so users can see the failure reason
|
||||
*/
|
||||
export function CompactToolDisplay({ tool }: ToolDisplaySlotProps) {
|
||||
const { t } = useTranslation();
|
||||
const displayName = translatedToolName(t, tool.toolName);
|
||||
const getStatusIcon = () => {
|
||||
switch (tool.state) {
|
||||
case "pending":
|
||||
@@ -75,7 +86,7 @@ export function CompactToolDisplay({ tool }: ToolDisplaySlotProps) {
|
||||
<Collapsible defaultOpen={shouldExpandByDefault}>
|
||||
<CollapsibleTrigger className="flex items-center gap-2 w-full p-2 rounded-md hover:bg-muted/50 transition-colors">
|
||||
{getStatusIcon()}
|
||||
<span className="text-sm font-medium">{tool.toolName}</span>
|
||||
<span className="text-sm font-medium">{displayName}</span>
|
||||
{tool.duration && (
|
||||
<span className="text-xs text-muted-foreground ml-auto">
|
||||
{tool.duration}ms
|
||||
@@ -118,6 +129,8 @@ export function CompactToolDisplay({ tool }: ToolDisplaySlotProps) {
|
||||
* Minimal tool display (just status indicator)
|
||||
*/
|
||||
export function MinimalToolDisplay({ tool }: ToolDisplaySlotProps) {
|
||||
const { t } = useTranslation();
|
||||
const displayName = translatedToolName(t, tool.toolName);
|
||||
const getStatusColor = () => {
|
||||
switch (tool.state) {
|
||||
case "pending":
|
||||
@@ -134,7 +147,7 @@ export function MinimalToolDisplay({ tool }: ToolDisplaySlotProps) {
|
||||
return (
|
||||
<div className="inline-flex items-center gap-1.5 px-2 py-1 text-xs rounded-full bg-muted">
|
||||
<div className={cn("w-2 h-2 rounded-full", getStatusColor())} />
|
||||
<span>{tool.toolName}</span>
|
||||
<span>{displayName}</span>
|
||||
{tool.state === "executing" && (
|
||||
<Loader2Icon className="size-3 animate-spin" />
|
||||
)}
|
||||
|
||||
@@ -91,6 +91,9 @@ export function useChat(
|
||||
const [sessionId, setSessionId] = useState<string | null>(null);
|
||||
const [metrics, setMetrics] = useState<AgentMetrics | null>(null);
|
||||
|
||||
// Cumulative session-level metrics (sum across all runs)
|
||||
const cumulativeMetricsRef = useRef<AgentMetrics | null>(null);
|
||||
|
||||
// Refs for stable callbacks
|
||||
const handlersRef = useRef(handlers);
|
||||
handlersRef.current = handlers;
|
||||
@@ -153,11 +156,28 @@ export function useChat(
|
||||
handlersRef.current?.onError?.(event.error);
|
||||
}
|
||||
|
||||
// Handle metrics update
|
||||
// Handle metrics update – accumulate across the session
|
||||
if (event.type === "metrics_update") {
|
||||
setMetrics(event.metrics);
|
||||
const prev = cumulativeMetricsRef.current;
|
||||
const cumulative: AgentMetrics = {
|
||||
tokensUsed:
|
||||
(prev?.tokensUsed ?? 0) + event.metrics.tokensUsed,
|
||||
promptTokens:
|
||||
(prev?.promptTokens ?? 0) + event.metrics.promptTokens,
|
||||
completionTokens:
|
||||
(prev?.completionTokens ?? 0) +
|
||||
event.metrics.completionTokens,
|
||||
// Non-cumulative fields: use latest values
|
||||
itemCount: event.metrics.itemCount,
|
||||
maxTurns: event.metrics.maxTurns,
|
||||
duration:
|
||||
(prev?.duration ?? 0) + event.metrics.duration,
|
||||
startTime: prev?.startTime ?? event.metrics.startTime,
|
||||
};
|
||||
cumulativeMetricsRef.current = cumulative;
|
||||
setMetrics(cumulative);
|
||||
handlersRef.current?.onMetricsUpdate?.(
|
||||
event.metrics,
|
||||
cumulative,
|
||||
event.sessionId,
|
||||
);
|
||||
}
|
||||
@@ -263,6 +283,7 @@ export function useChat(
|
||||
activeGeneratorRef.current = null;
|
||||
setSessionId(null);
|
||||
setMetrics(null);
|
||||
cumulativeMetricsRef.current = null;
|
||||
adapter.reset(configRef.current?.initialMessages ?? []);
|
||||
}, [adapter, agent, sessionId]);
|
||||
|
||||
|
||||
@@ -10,7 +10,9 @@
|
||||
"send": "Send",
|
||||
"stop": "Stop",
|
||||
"processing": "Processing...",
|
||||
"noActions": "No actions"
|
||||
"noActions": "No actions",
|
||||
"showThinkingDetails": "Show thinking details",
|
||||
"clickToExpand": "Click to expand"
|
||||
},
|
||||
"settings": {
|
||||
"title": "Settings",
|
||||
|
||||
@@ -10,7 +10,9 @@
|
||||
"send": "发送",
|
||||
"stop": "停止",
|
||||
"processing": "处理中...",
|
||||
"noActions": "无可用操作"
|
||||
"noActions": "无可用操作",
|
||||
"showThinkingDetails": "显示思考过程",
|
||||
"clickToExpand": "点击展开"
|
||||
},
|
||||
"settings": {
|
||||
"title": "设置",
|
||||
|
||||
@@ -13,6 +13,8 @@ export interface TranslationResources {
|
||||
stop: string;
|
||||
processing: string;
|
||||
noActions: string;
|
||||
showThinkingDetails: string;
|
||||
clickToExpand: string;
|
||||
};
|
||||
settings: {
|
||||
title: string;
|
||||
@@ -229,6 +231,8 @@ export type BaseTranslationKey =
|
||||
| "common.stop"
|
||||
| "common.processing"
|
||||
| "common.noActions"
|
||||
| "common.showThinkingDetails"
|
||||
| "common.clickToExpand"
|
||||
| "settings.title"
|
||||
| "settings.subtitle"
|
||||
| "settings.language"
|
||||
|
||||
180
packages/aipex-react/src/lib/models.ts
Normal file
180
packages/aipex-react/src/lib/models.ts
Normal file
@@ -0,0 +1,180 @@
|
||||
// API response types (must match server contract)
|
||||
interface ApiModelPricing {
|
||||
input: number;
|
||||
output: number;
|
||||
}
|
||||
|
||||
interface ApiModel {
|
||||
id: string;
|
||||
name: string;
|
||||
provider: string;
|
||||
description: string;
|
||||
pricing: ApiModelPricing;
|
||||
}
|
||||
|
||||
interface ApiResponse {
|
||||
success: boolean;
|
||||
data: {
|
||||
models: ApiModel[];
|
||||
count: number;
|
||||
cache: {
|
||||
lastUpdate: number;
|
||||
modelCount: number;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
// Internal model info used by the chatbot UI
|
||||
export interface ModelInfo {
|
||||
id: string;
|
||||
name: string;
|
||||
provider: string;
|
||||
description: string;
|
||||
supportsTools: boolean;
|
||||
contextLength?: number;
|
||||
pricing?: {
|
||||
input: string;
|
||||
output: string;
|
||||
};
|
||||
priceLevel: "cheap" | "normal" | "expensive";
|
||||
}
|
||||
|
||||
// Fallback models in case API fails
|
||||
const FALLBACK_MODELS: ModelInfo[] = [
|
||||
{
|
||||
id: "anthropic/claude-3-haiku",
|
||||
name: "Claude 3 Haiku",
|
||||
provider: "Anthropic",
|
||||
description: "Cost-effective choice for basic tasks",
|
||||
supportsTools: true,
|
||||
contextLength: 200_000,
|
||||
pricing: {
|
||||
input: "$0.30/1M tokens",
|
||||
output: "$1.50/1M tokens",
|
||||
},
|
||||
priceLevel: "cheap",
|
||||
},
|
||||
{
|
||||
id: "anthropic/claude-sonnet-4.5",
|
||||
name: "Claude Sonnet 4.5",
|
||||
provider: "Anthropic",
|
||||
description: "AI model for various tasks",
|
||||
supportsTools: true,
|
||||
contextLength: 200_000,
|
||||
pricing: {
|
||||
input: "$3.60/1M tokens",
|
||||
output: "$18.00/1M tokens",
|
||||
},
|
||||
priceLevel: "expensive",
|
||||
},
|
||||
];
|
||||
|
||||
const MODELS_API_URL = "https://www.claudechrome.com/api/models";
|
||||
|
||||
// Bucket a model by the sum of its input and output price:
// below 2 is "cheap", below 10 is "normal", anything else is "expensive".
function getPriceLevel(
  pricing: ApiModelPricing,
): "cheap" | "normal" | "expensive" {
  const combined = pricing.input + pricing.output;
  return combined < 2 ? "cheap" : combined < 10 ? "normal" : "expensive";
}
|
||||
|
||||
// Map one server-side model record onto the ModelInfo shape used by the UI.
// Prices are rendered as "$<two decimals>/1M tokens"; every converted model
// is marked as supporting tools and no context length is set.
function convertApiModel(apiModel: ApiModel): ModelInfo {
  const { id, name, provider, description, pricing } = apiModel;
  const formatPrice = (amount: number): string =>
    `$${amount.toFixed(2)}/1M tokens`;

  const converted: ModelInfo = {
    id,
    name,
    provider,
    description,
    supportsTools: true,
    pricing: {
      input: formatPrice(pricing.input),
      output: formatPrice(pricing.output),
    },
    priceLevel: getPriceLevel(pricing),
  };
  return converted;
}
|
||||
|
||||
/**
 * Validate that a parsed API payload has the shape the model loader relies on.
 *
 * Checks that `success` is a boolean, that `data.models` is an array, and
 * that EVERY entry in that array is an object with a string `id`, a string
 * `name` and a `pricing` object whose `input` and `output` are finite
 * numbers. These are the fields read when a model is converted for the UI,
 * so a payload that passes cannot make that conversion throw.
 *
 * Never throws: any malformed value (including `null` entries in `models`)
 * yields `false`.
 */
function isValidApiResponse(data: unknown): data is ApiResponse {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null;
  const isPrice = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value);

  if (!isRecord(data) || typeof data.success !== "boolean") return false;

  const payload = data.data;
  if (!isRecord(payload) || !Array.isArray(payload.models)) return false;

  // An empty list is structurally valid; callers decide how to treat it.
  return payload.models.every(
    (model: unknown) =>
      isRecord(model) &&
      typeof model.id === "string" &&
      typeof model.name === "string" &&
      isRecord(model.pricing) &&
      isPrice(model.pricing.input) &&
      isPrice(model.pricing.output),
  );
}
|
||||
|
||||
// In-memory cache for the server model list (module lifetime only).

/** How long a fetched model list is served from cache: 5 minutes, in ms. */
const CACHE_DURATION = 5 * 60 * 1000;

/** Safety cap on the number of models accepted from one API response. */
const MAX_MODELS = 200;

/** Last successfully fetched model list, or `null` before the first success. */
let cachedModels: ModelInfo[] | null = null;

/** `Date.now()` timestamp of the last successful fetch (0 = never). */
let lastFetchTime = 0;
|
||||
|
||||
/**
 * Load the model list from the server, with caching and a static fallback.
 *
 * - A previously fetched list is returned as-is while it is younger than
 *   `CACHE_DURATION` (5 minutes).
 * - Otherwise the API is queried; a successful, non-empty response is capped
 *   at `MAX_MODELS`, converted, cached, and returned.
 * - Any failure (network error, non-2xx status, malformed payload,
 *   `success: false`, empty list, conversion error) yields `FALLBACK_MODELS`
 *   and leaves the cache untouched.
 */
export async function fetchModels(): Promise<ModelInfo[]> {
  if (cachedModels !== null && Date.now() - lastFetchTime < CACHE_DURATION) {
    return cachedModels;
  }

  try {
    const response = await fetch(MODELS_API_URL);
    if (!response.ok) {
      throw new Error(`API request failed: ${response.status}`);
    }

    const payload: unknown = await response.json();
    if (!isValidApiResponse(payload)) {
      throw new Error("Invalid API response structure");
    }

    const serverModels = payload.data.models;
    if (!payload.success || serverModels.length === 0) {
      throw new Error("Empty model list from API");
    }

    // Cap first, then convert; the cache is only written once conversion succeeded.
    cachedModels = serverModels.slice(0, MAX_MODELS).map(convertApiModel);
    lastFetchTime = Date.now();
    return cachedModels;
  } catch (_error) {
    // Deliberately silent: fall back without logging request details.
    return FALLBACK_MODELS;
  }
}
|
||||
|
||||
/**
 * Resolve the current model list (see `fetchModels`) as `{ name, value }`
 * options for the model selector, where `value` is the model id.
 */
export async function fetchModelsForSelector(): Promise<
  Array<{ name: string; value: string }>
> {
  const options: Array<{ name: string; value: string }> = [];
  for (const model of await fetchModels()) {
    options.push({ name: model.name, value: model.id });
  }
  return options;
}
|
||||
|
||||
/**
 * Resolve the current model list as `ModelInfo[]`, unchanged from
 * `fetchModels`; exists so ModelChangePrompt has a dedicated entry point.
 */
export async function fetchModelsForPrompt(): Promise<ModelInfo[]> {
  const models = await fetchModels();
  return models;
}
|
||||
177
packages/aipex-react/src/lib/screenshot-storage.ts
Normal file
177
packages/aipex-react/src/lib/screenshot-storage.ts
Normal file
@@ -0,0 +1,177 @@
|
||||
/**
 * Screenshot storage using IndexedDB.
 * Stores screenshots with a uid for efficient reference and retrieval.
 * Keeps at most 50 screenshots, evicting the oldest-saved entries first
 * (reading a screenshot does not refresh its timestamp).
 */
|
||||
|
||||
/** One record in the screenshots object store. */
export interface ScreenshotData {
  /** Primary key of the record (the store's key path). */
  uid: string;
  /** Complete data URL: data:image/png;base64,... */
  base64Data: string;
  /** `Date.now()` at save time; indexed and used to decide eviction order. */
  timestamp: number;
  /** Tab the screenshot was taken from, when the caller supplied it. */
  tabId?: number;
  /**
   * Capture dimensions supplied by the caller. `saveScreenshot` stores 0 for
   * any dimension that was omitted. Units are not established here.
   */
  metadata?: {
    width: number;
    height: number;
    viewportWidth: number;
    viewportHeight: number;
  };
}
|
||||
|
||||
/** IndexedDB database name. */
const DB_NAME = "aipex-screenshots-db";
/** Schema version passed to `indexedDB.open`. */
const DB_VERSION = 1;
/** Object store holding `ScreenshotData` records, keyed by `uid`. */
const STORE_NAME = "screenshots";
/** Maximum number of screenshots kept before eviction kicks in. */
const MAX_SCREENSHOTS = 50;

/** Open database handle, or `null` until `initialize` has succeeded. */
let db: IDBDatabase | null = null;
/** In-flight open request, shared so concurrent callers open the DB once. */
let initPromise: Promise<void> | null = null;
|
||||
|
||||
/**
 * Open the screenshot database once and remember the handle in `db`.
 *
 * Concurrent callers share the same in-flight promise; once the database is
 * open, later calls resolve immediately. On first creation the object store
 * (key path `uid`) and its non-unique `timestamp` index are set up. A failed
 * open clears the in-flight promise so a later call can retry.
 */
function initialize(): Promise<void> {
  if (initPromise) return initPromise;
  if (db) return Promise.resolve();

  const opening = new Promise<void>((resolve, reject) => {
    const openRequest = indexedDB.open(DB_NAME, DB_VERSION);

    openRequest.onupgradeneeded = () => {
      const upgradeDb = openRequest.result;
      if (upgradeDb.objectStoreNames.contains(STORE_NAME)) return;
      const screenshots = upgradeDb.createObjectStore(STORE_NAME, {
        keyPath: "uid",
      });
      screenshots.createIndex("timestamp", "timestamp", { unique: false });
    };

    openRequest.onsuccess = () => {
      db = openRequest.result;
      initPromise = null;
      resolve();
    };

    openRequest.onerror = () => {
      initPromise = null;
      reject(openRequest.error);
    };
  });

  initPromise = opening;
  return opening;
}
|
||||
|
||||
/**
 * Build a screenshot uid of the form `screenshot_<epoch ms>_<9 base36 chars>`.
 *
 * The random suffix is right-padded with "0" to a fixed 9 characters:
 * `Math.random().toString(36)` yields fewer digits for values such as 0 or
 * 0.5, and an empty suffix would produce a uid that no longer matches the
 * `screenshot_<digits>_<alphanumerics>` shape expected elsewhere.
 */
function generateUid(): string {
  const suffix = Math.random().toString(36).slice(2, 11).padEnd(9, "0");
  return `screenshot_${Date.now()}_${suffix}`;
}
|
||||
|
||||
/**
 * Trim the store down to `MAX_SCREENSHOTS` by deleting the entries with the
 * smallest `timestamp` (oldest saved) first.
 *
 * Counting and deleting happen in ONE readwrite transaction, and the returned
 * promise settles only when that transaction has committed (or rejects if it
 * aborts). When the `timestamp` index exists, victims are found with a key
 * cursor so no image payload is loaded into memory; if the index is missing,
 * the records are loaded and sorted instead.
 *
 * Does nothing when the database has not been opened yet.
 */
async function applyLRU(): Promise<void> {
  const database = db;
  if (!database) return;

  await new Promise<void>((resolve, reject) => {
    const tx = database.transaction([STORE_NAME], "readwrite");
    const store = tx.objectStore(STORE_NAME);

    tx.oncomplete = () => resolve();
    // A failed request aborts the transaction by default, so abort covers both.
    tx.onabort = () =>
      reject(tx.error ?? new Error("Screenshot eviction aborted"));

    const countReq = store.count();
    countReq.onsuccess = () => {
      let excess = countReq.result - MAX_SCREENSHOTS;
      if (excess <= 0) return; // nothing queued: the transaction completes

      if (store.indexNames.contains("timestamp")) {
        // Index order is ascending timestamp, i.e. oldest first.
        const cursorReq = store.index("timestamp").openKeyCursor();
        cursorReq.onsuccess = () => {
          const cursor = cursorReq.result;
          if (!cursor || excess <= 0) return;
          store.delete(cursor.primaryKey);
          excess -= 1;
          cursor.continue();
        };
        return;
      }

      // Fallback for a store without the timestamp index.
      const allReq = store.getAll();
      allReq.onsuccess = () => {
        const oldestFirst = [...(allReq.result as ScreenshotData[])].sort(
          (a, b) => a.timestamp - b.timestamp,
        );
        for (const entry of oldestFirst.slice(0, excess)) {
          store.delete(entry.uid);
        }
      };
    };
  });
}
|
||||
|
||||
/** Open (or reuse) the screenshot database; throws if no handle is available. */
async function openDatabase(): Promise<IDBDatabase> {
  await initialize();
  if (!db) throw new Error("Database not initialized");
  return db;
}

/**
 * Run a single write against the screenshots store and settle only once the
 * surrounding transaction has committed. Rejects with the request error, or
 * with the transaction error if the transaction aborts (e.g. at commit time).
 */
function commitWrite(
  database: IDBDatabase,
  write: (store: IDBObjectStore) => IDBRequest,
): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const tx = database.transaction([STORE_NAME], "readwrite");
    tx.oncomplete = () => resolve();
    tx.onabort = () =>
      reject(tx.error ?? new Error("Screenshot transaction aborted"));
    const req = write(tx.objectStore(STORE_NAME));
    req.onerror = () => reject(req.error);
  });
}

/** IndexedDB-backed screenshot store. */
export const ScreenshotStorage = {
  /**
   * Save a screenshot and return its uid.
   *
   * @param base64Data Must be a `data:image/...` URL; anything else is rejected
   *   before the database is touched.
   * @param metadata Optional tab id and dimensions; omitted dimensions are
   *   stored as 0.
   * @returns The uid under which the screenshot was stored.
   * @throws Error if `base64Data` is not an image data URL, if the database is
   *   unavailable, or if the write transaction fails to commit.
   */
  async saveScreenshot(
    base64Data: string,
    metadata?: {
      tabId?: number;
      width?: number;
      height?: number;
      viewportWidth?: number;
      viewportHeight?: number;
    },
  ): Promise<string> {
    // Validate that it's a data URL (not arbitrary content)
    if (
      typeof base64Data !== "string" ||
      !base64Data.startsWith("data:image/")
    ) {
      throw new Error("Invalid screenshot data: expected data:image/ URL");
    }

    const database = await openDatabase();

    const uid = generateUid();
    const entry: ScreenshotData = {
      uid,
      base64Data,
      timestamp: Date.now(),
      tabId: metadata?.tabId,
      metadata: metadata
        ? {
            width: metadata.width ?? 0,
            height: metadata.height ?? 0,
            viewportWidth: metadata.viewportWidth ?? 0,
            viewportHeight: metadata.viewportHeight ?? 0,
          }
        : undefined,
    };

    // Wait for the commit, not just the request: a transaction can still
    // abort after `put` succeeded, and the uid must not be handed out then.
    await commitWrite(database, (store) => store.put(entry));

    // Eviction is best-effort and must not delay or fail the save.
    void applyLRU().catch(() => {});

    return uid;
  },

  /**
   * Look up a screenshot's data URL by uid.
   *
   * @returns The stored data URL, or `null` when no record has that uid.
   * @throws Error if the database is unavailable or the read fails.
   */
  async getScreenshot(uid: string): Promise<string | null> {
    const database = await openDatabase();

    return new Promise<string | null>((resolve, reject) => {
      const req = database
        .transaction([STORE_NAME], "readonly")
        .objectStore(STORE_NAME)
        .get(uid);
      req.onsuccess = () => {
        const data = req.result as ScreenshotData | undefined;
        resolve(data?.base64Data ?? null);
      };
      req.onerror = () => reject(req.error);
    });
  },

  /**
   * Remove every stored screenshot; resolves once the clear has committed.
   *
   * @throws Error if the database is unavailable or the transaction fails.
   */
  async clearAll(): Promise<void> {
    const database = await openDatabase();
    await commitWrite(database, (store) => store.clear());
  },
};
|
||||
185
packages/aipex-react/src/lib/screenshot-utils.ts
Normal file
185
packages/aipex-react/src/lib/screenshot-utils.ts
Normal file
@@ -0,0 +1,185 @@
|
||||
/**
|
||||
* Utilities for detecting screenshot tools and extracting image data
|
||||
* from tool results.
|
||||
*/
|
||||
|
||||
/** Names of the tools whose results carry screenshot image data. */
const SCREENSHOT_TOOL_NAMES = new Set([
  "capture_screenshot",
  "capture_screenshot_with_highlight",
  "capture_tab_screenshot",
]);

/** URL prefix used in markdown image links that reference a stored screenshot. */
export const AIPEX_SCREENSHOT_URL_PREFIX = "https://aipex-screenshot.invalid/";

/** Matches `[[screenshot:...]]` placeholders; group 1 is the text after the colon. */
const SCREENSHOT_PLACEHOLDER_REGEX = /\[\[screenshot:([^\]]+)\]\]/g;
|
||||
|
||||
/**
 * Report whether `uid` has the screenshot uid shape:
 * `screenshot_<digits>_<1-20 alphanumerics>` (case-insensitive).
 */
export function isValidScreenshotUid(uid: string): boolean {
  const uidShape = /^screenshot_\d+_[a-z0-9]{1,20}$/i;
  return uidShape.test(uid);
}
|
||||
|
||||
/**
 * Report whether `toolName` is one of the known screenshot-producing tools
 * (exact, case-sensitive match against `SCREENSHOT_TOOL_NAMES`).
 */
export function isCaptureScreenshotTool(toolName: string): boolean {
  const isKnownScreenshotTool = SCREENSHOT_TOOL_NAMES.has(toolName);
  return isKnownScreenshotTool;
}
|
||||
|
||||
/** Screenshot details pulled out of a tool result. */
export interface ScreenshotExtraction {
  /** The `data:image/...` URL when the result still carries one, else `null`. */
  imageData: string | null;
  /** Whether the result marks the screenshot as intended for LLM vision. */
  sendToLLM: boolean;
  /** Uid for loading the image from IndexedDB storage, when the result has one. */
  screenshotUid: string | null;
}
|
||||
|
||||
/**
 * Extract screenshot info from a tool result.
 *
 * Only results of the tools in `SCREENSHOT_TOOL_NAMES` are inspected; for any
 * other tool name the function returns `null` immediately.
 *
 * Supported result formats (a string result is JSON-parsed first):
 * - Object: { success, imageData, sendToLLM, screenshotUid }
 * - Nested object: { success, data: { ... } } or { success, data: { data: { ... } } }
 * - SDK structured array: [{ type: "text", text: JSON }, { type: "image", image: dataUrl }]
 *
 * For object results the top-level `success` must be truthy. A nesting level
 * is only followed when it is a plain object, so a primitive or array `data`
 * field no longer hides `imageData` / `screenshotUid` on the enclosing object.
 *
 * @returns The extracted details when a uid or an image data URL was found,
 *   otherwise `null`. Never throws; unparseable input yields `null`.
 */
export function extractScreenshotFromToolResult(
  toolName: string,
  result: unknown,
): ScreenshotExtraction | null {
  if (!isCaptureScreenshotTool(toolName)) return null;

  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  try {
    const content: unknown =
      typeof result === "string" ? JSON.parse(result) : result;
    if (content === null || content === undefined) return null;

    // SDK structured array format:
    // [{ type: "text", text: '{"success":true,...}' }, { type: "image", image: "data:..." }]
    if (Array.isArray(content)) {
      return extractFromStructuredArray(content);
    }

    if (!isRecord(content)) return null;
    if (!content.success) return null;

    // Descend through up to two `data` wrappers, but only into real objects.
    const middleLayer = isRecord(content.data) ? content.data : undefined;
    const innerLayer =
      middleLayer && isRecord(middleLayer.data) ? middleLayer.data : undefined;
    const actualData = innerLayer ?? middleLayer ?? content;

    const screenshotUid =
      typeof actualData.screenshotUid === "string"
        ? actualData.screenshotUid
        : null;

    // imageData may be a real data URL or a placeholder left by storage.
    const rawImageData = actualData.imageData;
    const imageData =
      typeof rawImageData === "string" &&
      rawImageData.startsWith("data:image/")
        ? rawImageData
        : null;

    const sendToLLM = actualData.sendToLLM === true;

    // Report a screenshot as soon as either a uid or image data is available.
    if (screenshotUid || imageData) {
      return { imageData, sendToLLM, screenshotUid };
    }
  } catch {
    // parse failed – treat as "not a screenshot result"
  }

  return null;
}
|
||||
|
||||
/**
 * Extract screenshot info from an SDK structured array such as
 * `[{ type: "text", text: JSON }, { type: "image", image: dataUrl }]`.
 *
 * - `image` parts contribute `imageData` when they hold a `data:image/` URL
 *   (the last such part wins).
 * - `text` parts are JSON-parsed; `screenshotUid` (string) and
 *   `sendToLLM === true` are read from the top level of the parsed object.
 *   Text that is not JSON, or not a JSON object, is ignored.
 *
 * A result is returned when either an image or a uid was found, matching the
 * object-shaped path. When an image part is present `sendToLLM` is reported
 * as `true` regardless of the text flag — this preserves the previous
 * behavior, which was written as the always-true `sendToLLM || true`.
 * NOTE(review): confirm that an image part always implies LLM vision.
 */
function extractFromStructuredArray(
  arr: unknown[],
): ScreenshotExtraction | null {
  let imageData: string | null = null;
  let screenshotUid: string | null = null;
  let sendToLLM = false;

  for (const item of arr) {
    if (typeof item !== "object" || item === null) continue;
    const part = item as Record<string, unknown>;

    if (part.type === "image" && typeof part.image === "string") {
      if (part.image.startsWith("data:image/")) {
        imageData = part.image;
      }
      continue;
    }

    if (part.type === "text" && typeof part.text === "string") {
      let parsed: unknown;
      try {
        parsed = JSON.parse(part.text);
      } catch {
        continue; // plain text part, not metadata
      }
      if (typeof parsed !== "object" || parsed === null) continue;

      const meta = parsed as Record<string, unknown>;
      if (meta.sendToLLM === true) sendToLLM = true;
      if (typeof meta.screenshotUid === "string") {
        screenshotUid = meta.screenshotUid;
      }
    }
  }

  if (imageData === null && screenshotUid === null) return null;

  return {
    imageData,
    sendToLLM: imageData !== null || sendToLLM,
    screenshotUid,
  };
}
|
||||
|
||||
/**
 * Transform [[screenshot:...]] placeholders in text into markdown images
 * whose URL is `AIPEX_SCREENSHOT_URL_PREFIX` followed by the screenshot uid.
 *
 * Supported formats:
 * - [[screenshot:screenshot_123_abc]] → ![Screenshot](https://aipex-screenshot.invalid/screenshot_123_abc)
 * - [[screenshot:1]] → 1-based index into screenshotUidList
 *
 * An index must consist of digits only (so "2abc" or "1.5" is not an index),
 * must be in range, and must resolve to a valid uid. Placeholders that match
 * neither format are left in the text unchanged.
 *
 * NOTE(review): the alt text "Screenshot" is a choice made here; confirm it
 * against whatever consumes these links.
 */
export function transformScreenshotPlaceholders(
  text: string,
  screenshotUidList: string[],
): string {
  const indexPattern = /^\d+$/;
  const toMarkdownImage = (uid: string): string =>
    `![Screenshot](${AIPEX_SCREENSHOT_URL_PREFIX}${uid})`;

  return text.replace(
    SCREENSHOT_PLACEHOLDER_REGEX,
    (match: string, content: string) => {
      const trimmed = content.trim();

      // Case 1: Direct uid
      if (isValidScreenshotUid(trimmed)) {
        return toMarkdownImage(trimmed);
      }

      // Case 2: Numeric 1-based index (index 0 and out-of-range give undefined)
      if (indexPattern.test(trimmed)) {
        const uid = screenshotUidList[Number(trimmed) - 1];
        if (uid !== undefined && isValidScreenshotUid(uid)) {
          return toMarkdownImage(uid);
        }
      }

      // Invalid – leave as-is
      return match;
    },
  );
}
|
||||
@@ -47,6 +47,10 @@ export interface UIToolPart {
|
||||
state: UIToolState;
|
||||
errorText?: string;
|
||||
duration?: number;
|
||||
/** Base64 data URL of the screenshot (inline) */
|
||||
screenshot?: string;
|
||||
/** UID referencing a screenshot in ScreenshotStorage (IndexedDB) */
|
||||
screenshotUid?: string;
|
||||
}
|
||||
|
||||
export interface UIContextPart {
|
||||
|
||||
@@ -41,32 +41,58 @@ export function ChatImagesListener() {
|
||||
|
||||
for (const msg of messages) {
|
||||
for (const part of msg.parts) {
|
||||
// Tool parts may carry screenshot data in their output
|
||||
// Tool parts may carry screenshot data inline (screenshot field)
|
||||
// or in their output (imageData field)
|
||||
if (part.type === "tool") {
|
||||
const output = (part as { output?: unknown }).output;
|
||||
const toolPart = part as {
|
||||
output?: unknown;
|
||||
screenshot?: string;
|
||||
toolName?: string;
|
||||
};
|
||||
|
||||
// Prefer the inline screenshot field (set by ChatAdapter)
|
||||
const screenshotData = toolPart.screenshot;
|
||||
if (
|
||||
output &&
|
||||
typeof output === "object" &&
|
||||
"imageData" in output
|
||||
screenshotData &&
|
||||
typeof screenshotData === "string" &&
|
||||
screenshotData.startsWith("data:image/")
|
||||
) {
|
||||
const imageData = (output as { imageData?: string }).imageData;
|
||||
images.push({
|
||||
id: msg.id,
|
||||
parts: [
|
||||
{
|
||||
type: "image",
|
||||
imageData: screenshotData,
|
||||
imageTitle: toolPart.toolName || "Screenshot",
|
||||
},
|
||||
],
|
||||
});
|
||||
} else {
|
||||
// Fall back to extracting from output
|
||||
const output = toolPart.output;
|
||||
if (
|
||||
imageData &&
|
||||
typeof imageData === "string" &&
|
||||
imageData.startsWith("data:image/")
|
||||
output &&
|
||||
typeof output === "object" &&
|
||||
"imageData" in output
|
||||
) {
|
||||
images.push({
|
||||
id: msg.id,
|
||||
parts: [
|
||||
{
|
||||
type: "image",
|
||||
imageData,
|
||||
imageTitle:
|
||||
(part as { toolName?: string }).toolName ||
|
||||
"Screenshot",
|
||||
},
|
||||
],
|
||||
});
|
||||
const imageData = (output as { imageData?: string })
|
||||
.imageData;
|
||||
if (
|
||||
imageData &&
|
||||
typeof imageData === "string" &&
|
||||
imageData.startsWith("data:image/")
|
||||
) {
|
||||
images.push({
|
||||
id: msg.id,
|
||||
parts: [
|
||||
{
|
||||
type: "image",
|
||||
imageData,
|
||||
imageTitle: toolPart.toolName || "Screenshot",
|
||||
},
|
||||
],
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
257
packages/browser-ext/src/lib/message-adapter.test.ts
Normal file
257
packages/browser-ext/src/lib/message-adapter.test.ts
Normal file
@@ -0,0 +1,257 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { fromStorageFormat, toStorageFormat } from "./message-adapter";
|
||||
|
||||
const TEST_IMAGE_DATA = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQ==";
|
||||
const TEST_SCREENSHOT_UID = "screenshot_1234567890_abcdefghi";
|
||||
const PLACEHOLDER = "[Image data removed - see following user message]";
|
||||
|
||||
describe("message-adapter", () => {
|
||||
describe("toStorageFormat – screenshot stripping", () => {
|
||||
it("should strip base64 imageData from screenshot tool results", () => {
|
||||
const output = {
|
||||
success: true,
|
||||
imageData: TEST_IMAGE_DATA,
|
||||
sendToLLM: true,
|
||||
screenshotUid: TEST_SCREENSHOT_UID,
|
||||
tabId: 1,
|
||||
url: "https://example.com",
|
||||
title: "Example",
|
||||
};
|
||||
|
||||
const messages = [
|
||||
{
|
||||
id: "msg-1",
|
||||
role: "assistant" as const,
|
||||
parts: [
|
||||
{
|
||||
type: "tool" as const,
|
||||
toolCallId: "call-1",
|
||||
toolName: "capture_screenshot",
|
||||
input: { sendToLLM: true },
|
||||
output,
|
||||
state: "completed" as const,
|
||||
screenshot: TEST_IMAGE_DATA,
|
||||
screenshotUid: TEST_SCREENSHOT_UID,
|
||||
},
|
||||
],
|
||||
timestamp: Date.now(),
|
||||
},
|
||||
];
|
||||
|
||||
const stored = toStorageFormat(messages as any);
|
||||
expect(stored.length).toBe(1);
|
||||
|
||||
// Find the tool_result part
|
||||
const toolResultPart = stored[0]!.parts.find(
|
||||
(p: any) => p.type === "tool_result",
|
||||
) as any;
|
||||
expect(toolResultPart).toBeTruthy();
|
||||
|
||||
// Parse the content and verify imageData is stripped
|
||||
const parsedContent = JSON.parse(toolResultPart.content);
|
||||
expect(parsedContent.imageData).toBe(PLACEHOLDER);
|
||||
expect(parsedContent.screenshotUid).toBe(TEST_SCREENSHOT_UID);
|
||||
expect(parsedContent.success).toBe(true);
|
||||
});
|
||||
|
||||
it("should not strip non-screenshot tool results", () => {
|
||||
const output = {
|
||||
tabs: [{ id: 1, title: "Tab" }],
|
||||
imageData: TEST_IMAGE_DATA, // Even if it has imageData
|
||||
};
|
||||
|
||||
const messages = [
|
||||
{
|
||||
id: "msg-1",
|
||||
role: "assistant" as const,
|
||||
parts: [
|
||||
{
|
||||
type: "tool" as const,
|
||||
toolCallId: "call-1",
|
||||
toolName: "get_tabs",
|
||||
input: {},
|
||||
output,
|
||||
state: "completed" as const,
|
||||
},
|
||||
],
|
||||
timestamp: Date.now(),
|
||||
},
|
||||
];
|
||||
|
||||
const stored = toStorageFormat(messages as any);
|
||||
const toolResultPart = stored[0]!.parts.find(
|
||||
(p: any) => p.type === "tool_result",
|
||||
) as any;
|
||||
const parsedContent = JSON.parse(toolResultPart.content);
|
||||
expect(parsedContent.imageData).toBe(TEST_IMAGE_DATA);
|
||||
});
|
||||
});
|
||||
|
||||
describe("fromStorageFormat – screenshotUid restoration", () => {
|
||||
it("should restore screenshotUid from stored tool result", () => {
|
||||
const storedOutput = {
|
||||
success: true,
|
||||
imageData: PLACEHOLDER,
|
||||
sendToLLM: true,
|
||||
screenshotUid: TEST_SCREENSHOT_UID,
|
||||
tabId: 1,
|
||||
};
|
||||
|
||||
const storedMessages = [
|
||||
{
|
||||
id: "msg-1",
|
||||
role: "assistant" as const,
|
||||
parts: [
|
||||
{
|
||||
type: "tool_use" as const,
|
||||
id: "call-1",
|
||||
name: "capture_screenshot",
|
||||
input: { sendToLLM: true },
|
||||
},
|
||||
{
|
||||
type: "tool_result" as const,
|
||||
tool_use_id: "call-1",
|
||||
content: JSON.stringify(storedOutput),
|
||||
is_error: false,
|
||||
},
|
||||
],
|
||||
timestamp: Date.now(),
|
||||
},
|
||||
];
|
||||
|
||||
const restored = fromStorageFormat(storedMessages as any);
|
||||
expect(restored.length).toBe(1);
|
||||
|
||||
// Find the tool part (merged from tool_use + tool_result)
|
||||
const toolPart = restored[0]!.parts.find(
|
||||
(p: any) => p.type === "tool",
|
||||
) as any;
|
||||
expect(toolPart).toBeTruthy();
|
||||
expect(toolPart.screenshotUid).toBe(TEST_SCREENSHOT_UID);
|
||||
// imageData is the placeholder, not a real data URL, so screenshot should NOT be set
|
||||
expect(toolPart.screenshot).toBeUndefined();
|
||||
expect(toolPart.state).toBe("completed");
|
||||
});
|
||||
|
||||
it("should restore both screenshotUid and screenshot when real imageData is present", () => {
|
||||
const storedOutput = {
|
||||
success: true,
|
||||
imageData: TEST_IMAGE_DATA,
|
||||
sendToLLM: true,
|
||||
screenshotUid: TEST_SCREENSHOT_UID,
|
||||
tabId: 1,
|
||||
};
|
||||
|
||||
const storedMessages = [
|
||||
{
|
||||
id: "msg-1",
|
||||
role: "assistant" as const,
|
||||
parts: [
|
||||
{
|
||||
type: "tool_use" as const,
|
||||
id: "call-1",
|
||||
name: "capture_screenshot",
|
||||
input: { sendToLLM: true },
|
||||
},
|
||||
{
|
||||
type: "tool_result" as const,
|
||||
tool_use_id: "call-1",
|
||||
content: JSON.stringify(storedOutput),
|
||||
is_error: false,
|
||||
},
|
||||
],
|
||||
timestamp: Date.now(),
|
||||
},
|
||||
];
|
||||
|
||||
const restored = fromStorageFormat(storedMessages as any);
|
||||
const toolPart = restored[0]!.parts.find(
|
||||
(p: any) => p.type === "tool",
|
||||
) as any;
|
||||
expect(toolPart.screenshotUid).toBe(TEST_SCREENSHOT_UID);
|
||||
expect(toolPart.screenshot).toBe(TEST_IMAGE_DATA);
|
||||
});
|
||||
});
|
||||
|
||||
describe("round-trip: toStorageFormat -> fromStorageFormat", () => {
|
||||
it("should preserve screenshotUid through round-trip", () => {
|
||||
const original = [
|
||||
{
|
||||
id: "msg-1",
|
||||
role: "assistant" as const,
|
||||
parts: [
|
||||
{
|
||||
type: "tool" as const,
|
||||
toolCallId: "call-1",
|
||||
toolName: "capture_screenshot",
|
||||
input: { sendToLLM: true },
|
||||
output: {
|
||||
success: true,
|
||||
imageData: TEST_IMAGE_DATA,
|
||||
sendToLLM: true,
|
||||
screenshotUid: TEST_SCREENSHOT_UID,
|
||||
tabId: 1,
|
||||
},
|
||||
state: "completed" as const,
|
||||
screenshot: TEST_IMAGE_DATA,
|
||||
screenshotUid: TEST_SCREENSHOT_UID,
|
||||
},
|
||||
],
|
||||
timestamp: Date.now(),
|
||||
},
|
||||
];
|
||||
|
||||
// Store -> Restore
|
||||
const stored = toStorageFormat(original as any);
|
||||
const restored = fromStorageFormat(stored);
|
||||
|
||||
const toolPart = restored[0]!.parts.find(
|
||||
(p: any) => p.type === "tool",
|
||||
) as any;
|
||||
|
||||
// screenshotUid should survive the round-trip
|
||||
expect(toolPart.screenshotUid).toBe(TEST_SCREENSHOT_UID);
|
||||
// imageData was stripped during storage, so inline screenshot is gone
|
||||
expect(toolPart.screenshot).toBeUndefined();
|
||||
expect(toolPart.state).toBe("completed");
|
||||
expect(toolPart.toolName).toBe("capture_screenshot");
|
||||
});
|
||||
|
||||
it("should handle capture_tab_screenshot round-trip", () => {
|
||||
const original = [
|
||||
{
|
||||
id: "msg-1",
|
||||
role: "assistant" as const,
|
||||
parts: [
|
||||
{
|
||||
type: "tool" as const,
|
||||
toolCallId: "call-1",
|
||||
toolName: "capture_tab_screenshot",
|
||||
input: { tabId: 42, sendToLLM: true },
|
||||
output: {
|
||||
success: true,
|
||||
imageData: TEST_IMAGE_DATA,
|
||||
sendToLLM: true,
|
||||
screenshotUid: TEST_SCREENSHOT_UID,
|
||||
tabId: 42,
|
||||
},
|
||||
state: "completed" as const,
|
||||
screenshot: TEST_IMAGE_DATA,
|
||||
screenshotUid: TEST_SCREENSHOT_UID,
|
||||
},
|
||||
],
|
||||
timestamp: Date.now(),
|
||||
},
|
||||
];
|
||||
|
||||
const stored = toStorageFormat(original as any);
|
||||
const restored = fromStorageFormat(stored);
|
||||
|
||||
const toolPart = restored[0]!.parts.find(
|
||||
(p: any) => p.type === "tool",
|
||||
) as any;
|
||||
expect(toolPart.screenshotUid).toBe(TEST_SCREENSHOT_UID);
|
||||
expect(toolPart.toolName).toBe("capture_tab_screenshot");
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -6,6 +6,90 @@
|
||||
import type { UIMessage as ReactUIMessage } from "@aipexstudio/aipex-react/types";
|
||||
import type { UIMessage as RuntimeUIMessage } from "@aipexstudio/browser-runtime";
|
||||
|
||||
/** Names of the tools whose results may carry screenshot image data. */
const SCREENSHOT_TOOL_NAMES = new Set([
  "capture_screenshot",
  "capture_screenshot_with_highlight",
  "capture_tab_screenshot",
]);

/** Text written in place of base64 `imageData` when a tool result is stored. */
const IMAGE_DATA_PLACEHOLDER =
  "[Image data removed - see following user message]";
|
||||
|
||||
/** Screenshot fields recovered from a parsed tool result. */
interface ScreenshotToolInfo {
  /** The `data:image/...` URL when still present; `null` once stripped. */
  imageData: string | null;
  /** The screenshot uid when the result carries one. */
  screenshotUid: string | null;
}
|
||||
|
||||
/**
 * Navigate into the parsed tool result to find the "actual data" layer.
 * Handles nesting: { data: { ... } }, { data: { data: { ... } } }, or flat.
 *
 * A `data` field is only followed when it is a plain object. If it is a
 * primitive, `null`, or an array, the enclosing object is returned instead,
 * so screenshot fields sitting next to such a `data` field stay reachable.
 *
 * @returns The innermost object layer (the same reference that lives inside
 *   `parsedOutput`, so callers may mutate it in place), or `null` when
 *   `parsedOutput` is not an object.
 */
function getScreenshotActualData(
  parsedOutput: unknown,
): Record<string, unknown> | null {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  if (typeof parsedOutput !== "object" || parsedOutput === null) return null;
  const obj = parsedOutput as Record<string, unknown>;

  if (!isRecord(obj.data)) return obj;
  const middleLayer = obj.data;

  return isRecord(middleLayer.data) ? middleLayer.data : middleLayer;
}
|
||||
|
||||
/**
 * Pull `imageData` and `screenshotUid` out of a parsed tool result.
 *
 * Returns `null` for tools outside `SCREENSHOT_TOOL_NAMES`, for results with
 * no data layer, and when neither field is usable. `imageData` only counts
 * when it is a `data:image/` URL, so a stored placeholder yields `null`.
 */
function extractScreenshotInfo(
  toolName: string,
  parsedOutput: unknown,
): ScreenshotToolInfo | null {
  if (!SCREENSHOT_TOOL_NAMES.has(toolName)) return null;

  const layer = getScreenshotActualData(parsedOutput);
  if (!layer) return null;

  const { imageData: rawImage, screenshotUid: rawUid } = layer;
  const imageData =
    typeof rawImage === "string" && rawImage.startsWith("data:image/")
      ? rawImage
      : null;
  const screenshotUid = typeof rawUid === "string" ? rawUid : null;

  return imageData || screenshotUid ? { imageData, screenshotUid } : null;
}
|
||||
|
||||
/**
 * Replace the base64 `imageData` inside a serialized screenshot tool result
 * with `IMAGE_DATA_PLACEHOLDER` and return the re-serialized JSON.
 *
 * The input string is returned untouched when the tool is not a screenshot
 * tool, when the content does not parse, or when the data layer holds no
 * `data:image/` URL.
 */
function stripImageDataFromToolOutput(
  toolName: string,
  content: string,
): string {
  if (!SCREENSHOT_TOOL_NAMES.has(toolName)) return content;

  const parsed = safeJsonParse<Record<string, unknown>>(content);
  const layer = parsed ? getScreenshotActualData(parsed) : null;
  if (!layer) return content;

  const image = layer.imageData;
  const hasInlineImage =
    typeof image === "string" && image.startsWith("data:image/");
  if (!hasInlineImage) return content;

  // `layer` is a reference into `parsed`, so this edits the tree being serialized.
  layer.imageData = IMAGE_DATA_PLACEHOLDER;
  return JSON.stringify(parsed);
}
|
||||
|
||||
/**
|
||||
* Convert aipex-react UIMessage to runtime UIMessage for storage
|
||||
*/
|
||||
@@ -15,7 +99,7 @@ export function toStorageFormat(
|
||||
return messages.map((msg) => ({
|
||||
id: msg.id,
|
||||
role: msg.role === "tool" ? "assistant" : msg.role, // Map "tool" to "assistant"
|
||||
parts: msg.parts.map((part) => {
|
||||
parts: msg.parts.flatMap((part) => {
|
||||
switch (part.type) {
|
||||
case "text":
|
||||
return { type: "text", text: part.text };
|
||||
@@ -27,19 +111,37 @@ export function toStorageFormat(
|
||||
imageTitle: part.filename,
|
||||
};
|
||||
case "tool":
|
||||
// Map tool to tool_use or tool_result based on state
|
||||
// Map tool to tool_use + tool_result pair (when completed)
|
||||
// or just tool_use (when pending/executing).
|
||||
// Emitting both ensures fromStorageFormat can correlate them
|
||||
// to restore the proper toolName and input.
|
||||
if (part.output !== undefined) {
|
||||
// Avoid double-stringifying if output is already a string
|
||||
const content =
|
||||
// Avoid double-stringifying if output is already a string.
|
||||
let content =
|
||||
typeof part.output === "string"
|
||||
? part.output
|
||||
: JSON.stringify(part.output);
|
||||
return {
|
||||
type: "tool_result",
|
||||
tool_use_id: part.toolCallId,
|
||||
content,
|
||||
is_error: part.state === "error",
|
||||
};
|
||||
|
||||
// Strip base64 imageData from screenshot tool results before
|
||||
// persisting to keep stored conversations small and avoid
|
||||
// storing large blobs. The screenshotUid is preserved in the
|
||||
// output so images can be loaded from IndexedDB on restore.
|
||||
content = stripImageDataFromToolOutput(part.toolName, content);
|
||||
|
||||
return [
|
||||
{
|
||||
type: "tool_use",
|
||||
id: part.toolCallId,
|
||||
name: part.toolName,
|
||||
input: part.input as Record<string, unknown>,
|
||||
},
|
||||
{
|
||||
type: "tool_result",
|
||||
tool_use_id: part.toolCallId,
|
||||
content,
|
||||
is_error: part.state === "error",
|
||||
},
|
||||
];
|
||||
}
|
||||
return {
|
||||
type: "tool_use",
|
||||
@@ -210,7 +312,11 @@ export function fromStorageFormat(
|
||||
};
|
||||
}
|
||||
|
||||
// Normal successful completion
|
||||
// Normal successful completion – restore screenshot data
|
||||
const screenshotInfo = extractScreenshotInfo(
|
||||
toolName,
|
||||
parsedOutput,
|
||||
);
|
||||
return {
|
||||
type: "tool",
|
||||
toolName,
|
||||
@@ -218,6 +324,15 @@ export function fromStorageFormat(
|
||||
input,
|
||||
output: parsedOutput,
|
||||
state: "completed" as const,
|
||||
// Restore screenshotUid so UI can load from IndexedDB
|
||||
...(screenshotInfo?.screenshotUid
|
||||
? { screenshotUid: screenshotInfo.screenshotUid }
|
||||
: {}),
|
||||
// Restore inline screenshot only if actual base64 is present
|
||||
// (not when it's been replaced with a placeholder)
|
||||
...(screenshotInfo?.imageData
|
||||
? { screenshot: screenshotInfo.imageData }
|
||||
: {}),
|
||||
};
|
||||
}
|
||||
default:
|
||||
|
||||
@@ -19,6 +19,8 @@ export type {
|
||||
} from "./lib/vm/zenfs-manager.js";
|
||||
// Virtual File System
|
||||
export { zenfs } from "./lib/vm/zenfs-manager.js";
|
||||
// Screenshot Storage (IndexedDB)
|
||||
export { RuntimeScreenshotStorage } from "./lib/screenshot-storage.js";
|
||||
export * from "./runtime/automation-mode.js";
|
||||
export * from "./runtime/browser-automation-host.js";
|
||||
export * from "./runtime/context-providers.js";
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
*/
|
||||
|
||||
import type { ElementCaptureEvent, ElementCaptureOptions } from "./types.js";
|
||||
import { captureVisibleTabWithElementCrop } from "../tools/screenshot-helpers.js";
|
||||
|
||||
type CaptureCallback = (event: ElementCaptureEvent) => void;
|
||||
|
||||
@@ -232,34 +233,45 @@ export class ElementCaptureService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Capture screenshot functionality (with highlight)
|
||||
* Capture screenshot functionality (with highlight / element crop).
|
||||
*
|
||||
* Delegates to the shared `captureVisibleTabWithElementCrop` helper so that
|
||||
* the element-rect resolution, DPR scaling, crop, and restricted-page
|
||||
* checks are consistent with `captureScreenshotWithHighlightTool`.
|
||||
*
|
||||
* Falls back to a full-page screenshot if the selector cannot be resolved.
|
||||
*/
|
||||
async captureScreenshot(
|
||||
_selector: string,
|
||||
_options?: {
|
||||
selector: string,
|
||||
options?: {
|
||||
cropToElement?: boolean;
|
||||
padding?: number;
|
||||
},
|
||||
): Promise<string | null> {
|
||||
try {
|
||||
// Use Chrome's captureVisibleTab API directly
|
||||
if (!this.currentTabId) {
|
||||
console.warn("⚠️ [ElementCaptureService] No current tab for screenshot");
|
||||
console.warn(
|
||||
"⚠️ [ElementCaptureService] No current tab for screenshot",
|
||||
);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Get the tab to find its window ID
|
||||
const tab = await chrome.tabs.get(this.currentTabId);
|
||||
if (!tab.windowId) {
|
||||
console.warn("⚠️ [ElementCaptureService] No window ID for tab");
|
||||
return null;
|
||||
}
|
||||
|
||||
const screenshot = await chrome.tabs.captureVisibleTab(tab.windowId, {
|
||||
format: "png",
|
||||
const result = await captureVisibleTabWithElementCrop({
|
||||
tabId: this.currentTabId,
|
||||
windowId: tab.windowId,
|
||||
tabUrl: tab.url,
|
||||
selector,
|
||||
cropToElement: options?.cropToElement ?? true,
|
||||
padding: options?.padding ?? 50,
|
||||
});
|
||||
|
||||
return screenshot;
|
||||
return result.dataUrl;
|
||||
} catch (error) {
|
||||
console.error("❌ [ElementCaptureService] Screenshot error:", error);
|
||||
return null;
|
||||
|
||||
186
packages/browser-runtime/src/lib/screenshot-storage.ts
Normal file
186
packages/browser-runtime/src/lib/screenshot-storage.ts
Normal file
@@ -0,0 +1,186 @@
|
||||
/**
|
||||
* Screenshot storage using IndexedDB.
|
||||
* Stores screenshots with a uid for efficient reference and retrieval.
|
||||
* Applies an LRU eviction policy (max 50 screenshots).
|
||||
*
|
||||
* Uses the same DB/store as the aipex ScreenshotStorage so both
|
||||
* can share screenshots during the migration period.
|
||||
*/
|
||||
|
||||
/**
 * One persisted screenshot record, as written to the `screenshots` object store.
 */
export interface ScreenshotData {
  /** Record key; the object store uses this field as its `keyPath`. */
  uid: string;
  /** Complete data URL: data:image/png;base64,... */
  base64Data: string;
  /** Epoch milliseconds at save time; indexed, and used to order eviction. */
  timestamp: number;
  /** Tab the screenshot came from, when the caller supplied one (indexed). */
  tabId?: number;
  /**
   * Dimensions recorded alongside the image. Individual fields are stored as
   * 0 when the caller did not provide them.
   */
  metadata?: {
    width: number;
    height: number;
    viewportWidth: number;
    viewportHeight: number;
  };
}
|
||||
|
||||
const DB_NAME = "aipex-screenshots-db";
|
||||
const DB_VERSION = 1;
|
||||
const STORE_NAME = "screenshots";
|
||||
const MAX_SCREENSHOTS = 50;
|
||||
|
||||
let db: IDBDatabase | null = null;
|
||||
let initPromise: Promise<void> | null = null;
|
||||
|
||||
/**
 * Lazily open the screenshots database and cache the connection in `db`.
 *
 * Concurrent callers share a single in-flight open request through
 * `initPromise`. On first creation the `screenshots` store is created with
 * `uid` as key path plus non-unique `timestamp` and `tabId` indexes.
 *
 * The cached connection is dropped when it stops being usable:
 * - `versionchange`: another connection to this database asked for an upgrade
 *   or deletion; closing here keeps that request from being blocked.
 * - `close`: the browser closed the connection abnormally.
 * In both cases the next call re-opens the database instead of reusing a
 * dead handle.
 *
 * @returns A promise that resolves once `db` holds an open connection.
 * @throws Rejects with the open request's error if the database cannot be opened.
 */
function initialize(): Promise<void> {
  if (initPromise) return initPromise;
  if (db) return Promise.resolve();

  initPromise = new Promise<void>((resolve, reject) => {
    const request = indexedDB.open(DB_NAME, DB_VERSION);

    request.onupgradeneeded = () => {
      const database = request.result;
      if (!database.objectStoreNames.contains(STORE_NAME)) {
        const store = database.createObjectStore(STORE_NAME, {
          keyPath: "uid",
        });
        store.createIndex("timestamp", "timestamp", { unique: false });
        store.createIndex("tabId", "tabId", { unique: false });
      }
    };

    request.onerror = () => {
      // Clear the in-flight marker so a later call can retry the open.
      initPromise = null;
      reject(request.error);
    };

    request.onsuccess = () => {
      const connection = request.result;

      // Only forget the handle if it is still the cached one; a newer
      // connection may already have replaced it.
      const forget = () => {
        if (db === connection) db = null;
      };
      connection.onversionchange = () => {
        connection.close();
        forget();
      };
      connection.onclose = forget;

      db = connection;
      initPromise = null;
      resolve();
    };
  });

  return initPromise;
}
|
||||
|
||||
/**
 * Build a fresh record key of the form
 * `screenshot_<epoch milliseconds>_<random base-36 suffix>`.
 */
function generateUid(): string {
  const issuedAt = Date.now();
  const suffix = Math.random().toString(36).slice(2, 11);
  return ["screenshot", issuedAt, suffix].join("_");
}
|
||||
|
||||
/**
 * Evict the oldest screenshots so that at most `MAX_SCREENSHOTS` remain.
 *
 * Counting and deleting happen inside one read-write transaction, and the
 * returned promise settles only when that transaction commits or aborts, so
 * callers that await (or `.catch`) it observe the real outcome.
 *
 * Victims are found by walking the `timestamp` index (created in
 * `initialize`) oldest-first with a key cursor, which yields primary keys
 * without loading the stored base64 payloads into memory.
 *
 * @returns Resolves after eviction has been committed, or immediately when
 *          no connection is open.
 * @throws Rejects with the transaction error if the transaction aborts.
 */
async function applyLRU(): Promise<void> {
  const database = db;
  if (!database) return;

  await new Promise<void>((resolve, reject) => {
    const tx = database.transaction([STORE_NAME], "readwrite");
    const store = tx.objectStore(STORE_NAME);

    tx.oncomplete = () => resolve();
    tx.onabort = () =>
      reject(tx.error ?? new Error("Screenshot eviction transaction aborted"));

    const countReq = store.count();
    countReq.onsuccess = () => {
      let excess = countReq.result - MAX_SCREENSHOTS;
      if (excess <= 0) return;

      // Default cursor direction is ascending, i.e. oldest timestamp first.
      const cursorReq = store.index("timestamp").openKeyCursor();
      cursorReq.onsuccess = () => {
        const cursor = cursorReq.result;
        if (!cursor || excess <= 0) return;
        store.delete(cursor.primaryKey);
        excess -= 1;
        cursor.continue();
      };
    };
  });
}
|
||||
|
||||
/**
 * Runtime-level screenshot storage (for use inside browser-runtime tools).
 * Shares the same IndexedDB database as the UI-level ScreenshotStorage
 * in aipex-react so screenshots are accessible across packages.
 *
 * Writes are reported as successful only once their transaction has
 * committed. A request-level `success` event fires before commit, so a
 * commit-time abort (for example a quota error) would otherwise go unnoticed.
 */
export const RuntimeScreenshotStorage = {
  /**
   * Save a screenshot and return its uid.
   *
   * @param base64Data - Image as a `data:image/...` URL; anything else is rejected.
   * @param metadata - Optional tab id and dimensions; missing dimensions are stored as 0.
   * @returns The uid under which the screenshot was stored.
   * @throws If `base64Data` is not an image data URL, the database cannot be
   *         opened, or the write transaction aborts.
   */
  async saveScreenshot(
    base64Data: string,
    metadata?: {
      tabId?: number;
      width?: number;
      height?: number;
      viewportWidth?: number;
      viewportHeight?: number;
    },
  ): Promise<string> {
    // Validate that it's a data URL (not arbitrary content)
    if (
      typeof base64Data !== "string" ||
      !base64Data.startsWith("data:image/")
    ) {
      throw new Error("Invalid screenshot data: expected data:image/ URL");
    }

    await initialize();
    const database = db;
    if (!database) throw new Error("Database not initialized");

    const uid = generateUid();
    const entry: ScreenshotData = {
      uid,
      base64Data,
      timestamp: Date.now(),
      tabId: metadata?.tabId,
      metadata: metadata
        ? {
            width: metadata.width ?? 0,
            height: metadata.height ?? 0,
            viewportWidth: metadata.viewportWidth ?? 0,
            viewportHeight: metadata.viewportHeight ?? 0,
          }
        : undefined,
    };

    await new Promise<void>((resolve, reject) => {
      const tx = database.transaction([STORE_NAME], "readwrite");
      // An unhandled request error aborts the transaction, so `abort`
      // covers both request-level and commit-level failures.
      tx.oncomplete = () => resolve();
      tx.onabort = () =>
        reject(tx.error ?? new Error("Screenshot write transaction aborted"));
      tx.objectStore(STORE_NAME).put(entry);
    });

    // Eviction is best-effort and must not delay or fail the save, but a
    // failure is still worth surfacing in the console.
    void applyLRU().catch((err: unknown) => {
      console.warn("[ScreenshotStorage] LRU eviction failed:", err);
    });

    return uid;
  },

  /**
   * Get screenshot base64 data by uid.
   *
   * @param uid - Key returned by `saveScreenshot`.
   * @returns The stored data URL, or `null` when no record has that uid.
   * @throws If the database cannot be opened or the read request fails.
   */
  async getScreenshot(uid: string): Promise<string | null> {
    await initialize();
    const database = db;
    if (!database) throw new Error("Database not initialized");

    return new Promise<string | null>((resolve, reject) => {
      const req = database
        .transaction([STORE_NAME], "readonly")
        .objectStore(STORE_NAME)
        .get(uid);
      req.onsuccess = () => {
        const data = req.result as ScreenshotData | undefined;
        resolve(data?.base64Data ?? null);
      };
      req.onerror = () => reject(req.error);
    });
  },

  /**
   * Clear all screenshots.
   *
   * @throws If the database cannot be opened or the clear transaction aborts.
   */
  async clearAll(): Promise<void> {
    await initialize();
    const database = db;
    if (!database) throw new Error("Database not initialized");

    await new Promise<void>((resolve, reject) => {
      const tx = database.transaction([STORE_NAME], "readwrite");
      tx.oncomplete = () => resolve();
      tx.onabort = () =>
        reject(tx.error ?? new Error("Screenshot clear transaction aborted"));
      tx.objectStore(STORE_NAME).clear();
    });
  },
};
|
||||
@@ -15,7 +15,18 @@ import {
|
||||
highlightTextInlineTool,
|
||||
scrollToElementTool,
|
||||
} from "./page";
|
||||
import { captureScreenshotTool, captureTabScreenshotTool } from "./screenshot";
|
||||
import {
|
||||
captureScreenshotTool,
|
||||
captureScreenshotWithHighlightTool,
|
||||
captureTabScreenshotTool,
|
||||
} from "./screenshot";
|
||||
// Clipboard image tools – available but not registered in the default bundle.
|
||||
// Enable explicitly if the product decides to ship clipboard access.
|
||||
// import {
|
||||
// captureScreenshotToClipboardTool,
|
||||
// readClipboardImageTool,
|
||||
// getClipboardImageInfoTool,
|
||||
// } from "./screenshot";
|
||||
import { skillTools } from "./skill";
|
||||
import { searchElementsTool } from "./snapshot";
|
||||
import {
|
||||
@@ -30,13 +41,15 @@ import { downloadChatImagesTool, downloadImageTool } from "./tools/downloads";
|
||||
|
||||
/**
|
||||
* All browser tools registered for AI use
|
||||
* Total: 31 tools (27 core + 4 intervention tools)
|
||||
* Total: 32 tools (28 core + 4 intervention tools)
|
||||
*
|
||||
* Disabled tools (per aipex):
|
||||
* - switch_to_tab (causes context switching issues)
|
||||
* - duplicate_tab (not in aipex)
|
||||
* - wait (replaced by computer tool's wait action)
|
||||
* - capture_screenshot_to_clipboard (not enabled in aipex)
|
||||
* - capture_screenshot_to_clipboard (not enabled in aipex default bundle)
|
||||
* - read_clipboard_image (P1 clipboard tool – not enabled by default; requires security review)
|
||||
* - get_clipboard_image_info (P1 clipboard tool – not enabled by default; requires security review)
|
||||
* - download_text_as_markdown (not enabled in aipex)
|
||||
* - download_current_chat_images (architecture issue, not enabled in aipex)
|
||||
* - organize_tabs (stub implementation, temporarily disabled until AI grouping is complete)
|
||||
@@ -72,8 +85,9 @@ const browserFunctionTools: BrowserFunctionTool[] = [
|
||||
highlightElementTool,
|
||||
highlightTextInlineTool,
|
||||
|
||||
// Screenshot (2 tools)
|
||||
// Screenshot (3 tools)
|
||||
captureScreenshotTool,
|
||||
captureScreenshotWithHighlightTool,
|
||||
captureTabScreenshotTool,
|
||||
|
||||
// Download (2 tools)
|
||||
|
||||
210
packages/browser-runtime/src/tools/screenshot-helpers.ts
Normal file
210
packages/browser-runtime/src/tools/screenshot-helpers.ts
Normal file
@@ -0,0 +1,210 @@
|
||||
/**
|
||||
* Shared screenshot helpers.
|
||||
*
|
||||
* This module is intentionally kept free of imports from `./index` or any
|
||||
* module that participates in the tools ↔ screenshot circular-import chain.
|
||||
* Both `captureScreenshotWithHighlightTool` (in screenshot.ts) and
|
||||
* `ElementCaptureService` (in intervention/element-capture.ts) import from
|
||||
* here without triggering a cycle.
|
||||
*/
|
||||
|
||||
/** Maximum padding in pixels */
|
||||
export const MAX_PADDING = 200;
|
||||
|
||||
// ===================== Image utilities =====================
|
||||
|
||||
/**
 * Crop an image to a rectangular region using a canvas.
 *
 * The image is decoded first and the canvas work runs afterwards in the
 * async function body, not inside the `onload` handler. Any exception thrown
 * while drawing or encoding therefore rejects the returned promise instead of
 * escaping from an event handler and leaving the promise pending forever.
 *
 * @param dataUrl - Source image as a data URL.
 * @param region - Rectangle to extract, in source-image pixels.
 * @returns The extracted region encoded as a PNG data URL.
 * @throws If the image cannot be loaded, no 2D context is available, or
 *         drawing/encoding fails.
 */
export async function cropImage(
  dataUrl: string,
  region: { x: number; y: number; width: number; height: number },
): Promise<string> {
  const source = new Image();
  await new Promise<void>((resolve, reject) => {
    source.onload = () => resolve();
    source.onerror = () => reject(new Error("Failed to load image"));
    source.src = dataUrl;
  });

  const canvas = document.createElement("canvas");
  const ctx = canvas.getContext("2d");
  if (!ctx) {
    throw new Error("Failed to get canvas context");
  }

  const { x, y, width, height } = region;
  canvas.width = width;
  canvas.height = height;

  // Copy the source rectangle 1:1 onto the canvas origin.
  ctx.drawImage(source, x, y, width, height, 0, 0, width, height);

  // PNG is lossless; the quality argument only applies to lossy formats.
  return canvas.toDataURL("image/png");
}
|
||||
|
||||
// ===================== Shared capture helper =====================
|
||||
|
||||
/**
 * Input accepted by the shared capture + element-crop helper.
 */
export interface CaptureWithElementCropOptions {
  /** Tab in which the selector is resolved. */
  tabId: number;
  /** Window that is focused and whose visible tab is captured. */
  windowId: number;
  /** URL of the tab, used to refuse browser-internal pages. */
  tabUrl?: string;
  /** CSS selector of the element to focus on. Max length enforced by callers. */
  selector?: string;
  /** Whether to crop the screenshot to the element bounding box (plus padding). */
  cropToElement?: boolean;
  /** Padding around the element in CSS pixels when cropping (default 50, max 200). */
  padding?: number;
}
|
||||
|
||||
/**
 * Outcome reported by the shared capture helper.
 */
export interface CaptureWithElementCropResult {
  /** The captured (and optionally cropped) image as a data URL. */
  dataUrl: string;
  /** True if the image was actually cropped to the element. */
  cropped: boolean;
  /** True if the selector matched an element on the page. */
  elementFound: boolean;
}
|
||||
|
||||
/**
 * Compute the rectangle to cut out of a captured image for an element.
 *
 * The element rect is grown by `paddingPx` on every side and then all four
 * edges are clamped to the image bounds. Width and height are derived from
 * the clamped edges, so an element that starts left of or above the image
 * does not produce a region that extends beyond the padded element.
 *
 * @returns The region in image pixels, or `null` when nothing of the padded
 *          element lies inside the image.
 */
function computeElementCropRegion(
  rect: { x: number; y: number; width: number; height: number },
  paddingPx: number,
  imageWidth: number,
  imageHeight: number,
): { x: number; y: number; width: number; height: number } | null {
  const left = Math.max(0, Math.round(rect.x - paddingPx));
  const top = Math.max(0, Math.round(rect.y - paddingPx));
  const right = Math.min(
    imageWidth,
    Math.round(rect.x + rect.width + paddingPx),
  );
  const bottom = Math.min(
    imageHeight,
    Math.round(rect.y + rect.height + paddingPx),
  );

  const width = right - left;
  const height = bottom - top;
  return width > 0 && height > 0 ? { x: left, y: top, width, height } : null;
}

/**
 * Core logic for capturing the visible tab and optionally cropping to an
 * element identified by CSS selector.
 *
 * This is shared by `captureScreenshotWithHighlightTool` (the agent-facing
 * tool) and `ElementCaptureService.captureScreenshot` so that both use the
 * same element-rect resolution, DPR scaling, and crop logic.
 *
 * Security notes:
 * - Rejects browser-internal pages (chrome://, chrome-extension://, edge://, about:).
 * - Selector length must be bounded by the caller (tool uses zod `.max()`).
 * - Padding is clamped to [0, MAX_PADDING].
 *
 * @param options - Target tab/window plus optional selector, crop flag and padding.
 * @returns The image data URL, whether a crop was actually applied, and
 *          whether the selector matched an element.
 * @throws If the tab URL is a browser-internal page, the capture does not
 *         yield an image data URL, or the image cannot be loaded for cropping.
 */
export async function captureVisibleTabWithElementCrop(
  options: CaptureWithElementCropOptions,
): Promise<CaptureWithElementCropResult> {
  const {
    tabId,
    windowId,
    tabUrl,
    selector,
    cropToElement = false,
    padding = 50,
  } = options;

  // Reject restricted pages
  const restrictedPrefixes = [
    "chrome://",
    "chrome-extension://",
    "edge://",
    "about:",
  ];
  if (tabUrl && restrictedPrefixes.some((prefix) => tabUrl.startsWith(prefix))) {
    throw new Error("Cannot capture browser internal pages");
  }

  // Clamp padding to safe range
  const safePadding = Math.max(0, Math.min(padding, MAX_PADDING));

  // If a selector is provided, resolve the element rect via content script.
  // The rect is returned already scaled to device pixels.
  let elementRect: {
    x: number;
    y: number;
    width: number;
    height: number;
    devicePixelRatio: number;
  } | null = null;

  if (selector) {
    try {
      const result = await chrome.scripting.executeScript({
        target: { tabId },
        func: (sel: string) => {
          const element = document.querySelector(sel);
          if (!element) return null;

          const rect = element.getBoundingClientRect();
          const dpr = window.devicePixelRatio || 1;

          return {
            x: rect.x * dpr,
            y: rect.y * dpr,
            width: rect.width * dpr,
            height: rect.height * dpr,
            devicePixelRatio: dpr,
          };
        },
        args: [selector],
      });

      if (result[0]?.result) {
        elementRect = result[0].result;
      }
    } catch (err) {
      console.warn("[Screenshot] Failed to get element rect:", err);
      // Continue with full-page screenshot if selector fails
    }
  }

  // Focus window and capture; the short pause gives the window time to
  // come to the front before the capture call.
  await chrome.windows.update(windowId, { focused: true });
  await new Promise((resolve) => setTimeout(resolve, 100));

  let dataUrl = await chrome.tabs.captureVisibleTab(windowId, {
    format: "png",
    quality: 90,
  });

  if (!dataUrl || !dataUrl.startsWith("data:image/")) {
    throw new Error("Invalid image data captured");
  }

  // Set only after a crop has really been applied, so the flag matches
  // what the result type promises.
  let cropped = false;

  if (cropToElement && elementRect) {
    const dpr = elementRect.devicePixelRatio || 1;

    // Load image to get actual dimensions for bounds checking
    const img = new Image();
    await new Promise<void>((resolve, reject) => {
      img.onload = () => resolve();
      img.onerror = () => reject(new Error("Failed to load image for crop"));
      img.src = dataUrl;
    });

    const region = computeElementCropRegion(
      elementRect,
      safePadding * dpr,
      img.width,
      img.height,
    );

    if (region) {
      dataUrl = await cropImage(dataUrl, region);
      cropped = true;
    }
  }

  return { dataUrl, cropped, elementFound: !!elementRect };
}
|
||||
@@ -1,8 +1,20 @@
|
||||
import { tool } from "@aipexstudio/aipex-core";
|
||||
import { z } from "zod";
|
||||
import { cacheScreenshotMetadata } from "../automation/computer";
|
||||
import { RuntimeScreenshotStorage } from "../lib/screenshot-storage";
|
||||
import { getAutomationMode } from "../runtime/automation-mode";
|
||||
import { getActiveTab } from "./index";
|
||||
import {
|
||||
captureVisibleTabWithElementCrop,
|
||||
MAX_PADDING,
|
||||
} from "./screenshot-helpers.js";
|
||||
|
||||
// Re-export the shared helper types/function so existing consumers aren't broken
|
||||
export type {
|
||||
CaptureWithElementCropOptions,
|
||||
CaptureWithElementCropResult,
|
||||
} from "./screenshot-helpers.js";
|
||||
export { captureVisibleTabWithElementCrop } from "./screenshot-helpers.js";
|
||||
|
||||
async function compressImage(
|
||||
dataUrl: string,
|
||||
@@ -93,15 +105,25 @@ export const captureScreenshotTool = tool({
|
||||
throw new Error("Invalid image data captured");
|
||||
}
|
||||
|
||||
// Get viewport dimensions for metadata caching
|
||||
const viewportDimensions = await chrome.scripting.executeScript({
|
||||
target: { tabId: tab.id },
|
||||
func: () => ({
|
||||
width: window.innerWidth,
|
||||
height: window.innerHeight,
|
||||
}),
|
||||
});
|
||||
const viewport = viewportDimensions[0]?.result;
|
||||
// Get viewport dimensions for metadata caching (graceful degradation)
|
||||
let viewport: { width: number; height: number } | undefined;
|
||||
try {
|
||||
const viewportDimensions = await chrome.scripting.executeScript({
|
||||
target: { tabId: tab.id },
|
||||
func: () => ({
|
||||
width: window.innerWidth,
|
||||
height: window.innerHeight,
|
||||
}),
|
||||
});
|
||||
viewport = viewportDimensions[0]?.result ?? undefined;
|
||||
} catch (e) {
|
||||
console.warn("[Screenshot] Failed to get viewport dimensions:", e);
|
||||
// Continue without viewport metadata – screenshot still works
|
||||
}
|
||||
|
||||
// Get image dimensions for metadata
|
||||
let imageWidth = 0;
|
||||
let imageHeight = 0;
|
||||
|
||||
if (sendToLLM) {
|
||||
// Compress for LLM
|
||||
@@ -114,6 +136,8 @@ export const captureScreenshotTool = tool({
|
||||
img.onerror = reject;
|
||||
img.src = dataUrl;
|
||||
});
|
||||
imageWidth = img.width;
|
||||
imageHeight = img.height;
|
||||
|
||||
// Cache screenshot metadata for computer tool
|
||||
if (viewport) {
|
||||
@@ -125,12 +149,50 @@ export const captureScreenshotTool = tool({
|
||||
viewport.height,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// Get original image dimensions for non-LLM screenshots
|
||||
const img = new Image();
|
||||
await new Promise((resolve, reject) => {
|
||||
img.onload = resolve;
|
||||
img.onerror = reject;
|
||||
img.src = dataUrl;
|
||||
});
|
||||
imageWidth = img.width;
|
||||
imageHeight = img.height;
|
||||
}
|
||||
|
||||
// Save screenshot to IndexedDB and get uid
|
||||
let screenshotUid: string | undefined;
|
||||
try {
|
||||
screenshotUid = await RuntimeScreenshotStorage.saveScreenshot(dataUrl, {
|
||||
tabId: tab.id,
|
||||
width: imageWidth,
|
||||
height: imageHeight,
|
||||
viewportWidth: viewport?.width ?? 0,
|
||||
viewportHeight: viewport?.height ?? 0,
|
||||
});
|
||||
} catch (err) {
|
||||
console.error("[Screenshot] Failed to save to IndexedDB:", err);
|
||||
// Continue even if storage fails
|
||||
}
|
||||
|
||||
if (sendToLLM) {
|
||||
return {
|
||||
success: true,
|
||||
imageData: dataUrl,
|
||||
sendToLLM: true,
|
||||
screenshotUid,
|
||||
tabId: tab.id,
|
||||
url: tab.url,
|
||||
title: tab.title,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
imageData: sendToLLM ? dataUrl : undefined,
|
||||
captured: !sendToLLM,
|
||||
captured: true,
|
||||
sendToLLM: false,
|
||||
screenshotUid,
|
||||
tabId: tab.id,
|
||||
url: tab.url,
|
||||
title: tab.title,
|
||||
@@ -177,15 +239,25 @@ export const captureTabScreenshotTool = tool({
|
||||
quality: 90,
|
||||
});
|
||||
|
||||
// Get viewport dimensions for metadata caching
|
||||
const viewportDimensions = await chrome.scripting.executeScript({
|
||||
target: { tabId },
|
||||
func: () => ({
|
||||
width: window.innerWidth,
|
||||
height: window.innerHeight,
|
||||
}),
|
||||
});
|
||||
const viewport = viewportDimensions[0]?.result;
|
||||
// Get viewport dimensions for metadata caching (graceful degradation)
|
||||
let viewport: { width: number; height: number } | undefined;
|
||||
try {
|
||||
const viewportDimensions = await chrome.scripting.executeScript({
|
||||
target: { tabId },
|
||||
func: () => ({
|
||||
width: window.innerWidth,
|
||||
height: window.innerHeight,
|
||||
}),
|
||||
});
|
||||
viewport = viewportDimensions[0]?.result ?? undefined;
|
||||
} catch (e) {
|
||||
console.warn("[Screenshot] Failed to get viewport dimensions:", e);
|
||||
// Continue without viewport metadata – screenshot still works
|
||||
}
|
||||
|
||||
// Get image dimensions for metadata
|
||||
let imageWidth = 0;
|
||||
let imageHeight = 0;
|
||||
|
||||
if (sendToLLM) {
|
||||
// Compress for LLM
|
||||
@@ -198,6 +270,8 @@ export const captureTabScreenshotTool = tool({
|
||||
img.onerror = reject;
|
||||
img.src = dataUrl;
|
||||
});
|
||||
imageWidth = img.width;
|
||||
imageHeight = img.height;
|
||||
|
||||
// Cache screenshot metadata for computer tool
|
||||
if (viewport) {
|
||||
@@ -209,12 +283,50 @@ export const captureTabScreenshotTool = tool({
|
||||
viewport.height,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// Get original image dimensions for non-LLM screenshots
|
||||
const img = new Image();
|
||||
await new Promise((resolve, reject) => {
|
||||
img.onload = resolve;
|
||||
img.onerror = reject;
|
||||
img.src = dataUrl;
|
||||
});
|
||||
imageWidth = img.width;
|
||||
imageHeight = img.height;
|
||||
}
|
||||
|
||||
// Save screenshot to IndexedDB and get uid
|
||||
let screenshotUid: string | undefined;
|
||||
try {
|
||||
screenshotUid = await RuntimeScreenshotStorage.saveScreenshot(dataUrl, {
|
||||
tabId,
|
||||
width: imageWidth,
|
||||
height: imageHeight,
|
||||
viewportWidth: viewport?.width ?? 0,
|
||||
viewportHeight: viewport?.height ?? 0,
|
||||
});
|
||||
} catch (err) {
|
||||
console.error("[Screenshot] Failed to save to IndexedDB:", err);
|
||||
// Continue even if storage fails
|
||||
}
|
||||
|
||||
if (sendToLLM) {
|
||||
return {
|
||||
success: true,
|
||||
imageData: dataUrl,
|
||||
sendToLLM: true,
|
||||
screenshotUid,
|
||||
tabId,
|
||||
url: tab.url,
|
||||
title: tab.title,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
imageData: sendToLLM ? dataUrl : undefined,
|
||||
captured: !sendToLLM,
|
||||
captured: true,
|
||||
sendToLLM: false,
|
||||
screenshotUid,
|
||||
tabId,
|
||||
url: tab.url,
|
||||
title: tab.title,
|
||||
@@ -222,6 +334,169 @@ export const captureTabScreenshotTool = tool({
|
||||
},
|
||||
});
|
||||
|
||||
/** Maximum allowed CSS selector length to prevent injection of excessively long strings */
|
||||
const MAX_SELECTOR_LENGTH = 500;
|
||||
|
||||
// ===================== Tool definition =====================
|
||||
|
||||
/**
 * Agent-facing tool: capture the active tab's visible area, optionally
 * cropped to the element matched by a CSS selector.
 *
 * Flow: refuse in background automation mode, resolve the active tab,
 * delegate capture/crop to `captureVisibleTabWithElementCrop`, read viewport
 * dimensions (best effort), compress when the image goes to the LLM, cache
 * metadata for the computer tool, persist to IndexedDB (best effort), and
 * return a result object carrying the screenshot uid.
 *
 * `sendToLLM` may arrive as `null` because the schema is `.nullable()`.
 * Neither the zod default nor a destructuring default replaces `null`, so it
 * is normalised explicitly to the documented default of `true`.
 */
export const captureScreenshotWithHighlightTool = tool({
  name: "capture_screenshot_with_highlight",
  description:
    "Capture screenshot of the current visible tab, optionally highlighting and cropping to a specific element identified by CSS selector. The screenshot is always sent to the LLM for visual analysis. NOTE: This tool requires focus mode.",
  parameters: z.object({
    selector: z
      .string()
      .max(MAX_SELECTOR_LENGTH)
      .optional()
      .describe("CSS selector of element to highlight/focus on"),
    cropToElement: z
      .boolean()
      .optional()
      .default(false)
      .describe(
        "Whether to crop the screenshot to the element region (plus padding)",
      ),
    padding: z
      .number()
      .min(0)
      .max(MAX_PADDING)
      .optional()
      .default(50)
      .describe("Padding around element in pixels when cropping (default: 50)"),
    sendToLLM: z
      .boolean()
      .nullable()
      .optional()
      .default(true)
      .describe(
        "Whether to send the screenshot to LLM for visual analysis. Defaults to true.",
      ),
  }),
  execute: async ({
    selector,
    cropToElement = false,
    padding = 50,
    sendToLLM,
  }) => {
    // `null` and `undefined` both mean "not specified" -> default to true.
    const shouldSendToLLM = sendToLLM ?? true;

    const mode = await getAutomationMode();
    console.log(
      "🔧 [captureScreenshotWithHighlight] Automation mode:",
      mode,
    );

    if (mode === "background") {
      throw new Error(
        "Screenshot capture is disabled in background mode. Please switch to focus mode to use visual tools.",
      );
    }

    const tab = await getActiveTab();

    if (!tab.id || !tab.windowId) {
      throw new Error("No active tab found");
    }

    // Delegate to shared helper for capture + element crop
    const capture = await captureVisibleTabWithElementCrop({
      tabId: tab.id,
      windowId: tab.windowId,
      tabUrl: tab.url,
      selector,
      cropToElement,
      padding,
    });

    let { dataUrl } = capture;

    // Get viewport dimensions (graceful degradation)
    let viewport: { width: number; height: number } | undefined;
    try {
      const viewportDimensions = await chrome.scripting.executeScript({
        target: { tabId: tab.id },
        func: () => ({
          width: window.innerWidth,
          height: window.innerHeight,
        }),
      });
      viewport = viewportDimensions[0]?.result ?? undefined;
    } catch (e) {
      console.warn(
        "[ScreenshotHighlight] Failed to get viewport dimensions:",
        e,
      );
    }

    if (shouldSendToLLM) {
      // Compress for LLM
      dataUrl = await compressImage(dataUrl, 0.6, 1024);
    }

    // Extract dimensions of the image that will actually be returned/stored
    const finalImg = new Image();
    await new Promise<void>((resolve, reject) => {
      finalImg.onload = () => resolve();
      finalImg.onerror = () => reject(new Error("Failed to load image"));
      finalImg.src = dataUrl;
    });
    const imageWidth = finalImg.width;
    const imageHeight = finalImg.height;

    // Cache screenshot metadata for computer tool
    // NOTE(review): when `capture.cropped` is true the image covers only part
    // of the viewport, yet it is cached against the full viewport size —
    // confirm the computer tool tolerates that, or skip caching for crops.
    if (shouldSendToLLM && viewport) {
      cacheScreenshotMetadata(
        tab.id,
        imageWidth,
        imageHeight,
        viewport.width,
        viewport.height,
      );
    }

    // Save screenshot to IndexedDB; a storage failure must not fail the tool
    let screenshotUid: string | undefined;
    try {
      screenshotUid = await RuntimeScreenshotStorage.saveScreenshot(dataUrl, {
        tabId: tab.id,
        width: imageWidth,
        height: imageHeight,
        viewportWidth: viewport?.width ?? 0,
        viewportHeight: viewport?.height ?? 0,
      });
    } catch (err) {
      console.error(
        "[ScreenshotHighlight] Failed to save to IndexedDB:",
        err,
      );
    }

    if (shouldSendToLLM) {
      return {
        success: true,
        imageData: dataUrl,
        sendToLLM: true,
        screenshotUid,
        tabId: tab.id,
        url: tab.url,
        title: tab.title,
        selector: selector ?? undefined,
        cropped: capture.cropped,
      };
    }

    return {
      success: true,
      captured: true,
      sendToLLM: false,
      screenshotUid,
      tabId: tab.id,
      url: tab.url,
      title: tab.title,
      selector: selector ?? undefined,
      cropped: capture.cropped,
    };
  },
});
|
||||
|
||||
export const captureScreenshotToClipboardTool = tool({
|
||||
name: "capture_screenshot_to_clipboard",
|
||||
description:
|
||||
@@ -267,3 +542,83 @@ export const captureScreenshotToClipboardTool = tool({
|
||||
};
|
||||
},
|
||||
});
|
||||
|
||||
// ===================== Clipboard image tools (P1) =====================
|
||||
|
||||
/**
 * Tool that returns the first image found on the system clipboard as a data
 * URL. Items are scanned in order, and within each item the first MIME type
 * starting with `image/` wins. Failures are reported in the result object
 * (`success: false`) rather than thrown.
 */
export const readClipboardImageTool = tool({
  name: "read_clipboard_image",
  description:
    "Read an image from the system clipboard and return it as a base64 data URL. " +
    "Useful for inspecting images the user has copied. Returns an error if no image is present.",
  parameters: z.object({}),
  execute: async () => {
    // Wrap the callback-style FileReader in a promise.
    const blobToDataUrl = (blob: Blob): Promise<string> =>
      new Promise<string>((resolve, reject) => {
        const reader = new FileReader();
        reader.onload = () => resolve(reader.result as string);
        reader.onerror = () => reject(new Error("Failed to read image data"));
        reader.readAsDataURL(blob);
      });

    try {
      const entries = await navigator.clipboard.read();

      for (const entry of entries) {
        const imageType = entry.types.find((mime) =>
          mime.startsWith("image/"),
        );
        if (imageType === undefined) continue;

        const blob = await entry.getType(imageType);
        const imageData = await blobToDataUrl(blob);
        return { success: true, imageData };
      }

      return { success: false, error: "No image found in clipboard" };
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      return {
        success: false,
        error: `Failed to read clipboard: ${reason}`,
      };
    }
  },
});
|
||||
|
||||
/**
 * Tool that reports whether the system clipboard holds an image and, if so,
 * the MIME type of the first one found. Only the advertised types are
 * inspected; the image bytes are never fetched. Failures are reported in the
 * result object (`success: false`) rather than thrown.
 */
export const getClipboardImageInfoTool = tool({
  name: "get_clipboard_image_info",
  description:
    "Check whether the system clipboard contains an image, and if so return " +
    "its MIME type. Does NOT read the full image data.",
  parameters: z.object({}),
  execute: async () => {
    try {
      const entries = await navigator.clipboard.read();

      // Flatten in item order so the first match is the same one a nested
      // item/type scan would reach first.
      const imageType = entries
        .flatMap((entry) => [...entry.types])
        .find((mime) => mime.startsWith("image/"));

      if (imageType !== undefined) {
        return {
          success: true,
          hasImage: true,
          imageType,
        };
      }

      return { success: true, hasImage: false };
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      return {
        success: false,
        error: `Failed to read clipboard: ${reason}`,
      };
    }
  },
});
|
||||
|
||||
@@ -7,6 +7,7 @@ import type {
|
||||
SessionTree,
|
||||
} from "../types.js";
|
||||
import { generateId } from "../utils/id-generator.js";
|
||||
import { pruneTransientScreenshotItems } from "../utils/screenshot-shaping.js";
|
||||
import type { ConversationCompressor } from "./compressor.js";
|
||||
import { Session } from "./session.js";
|
||||
|
||||
@@ -87,7 +88,10 @@ export class ConversationManager {
|
||||
}
|
||||
|
||||
private async doCompress(session: Session): Promise<{ summary: string }> {
|
||||
const items = await session.getItems();
|
||||
// Prune transient screenshot user-image messages before compression
|
||||
// to avoid sending large base64 blobs to the compressor/LLM.
|
||||
const rawItems = await session.getItems();
|
||||
const items = pruneTransientScreenshotItems(rawItems);
|
||||
const { summary, compressedItems } =
|
||||
await this.compressor!.compressItems(items);
|
||||
|
||||
|
||||
@@ -8,6 +8,11 @@ import type {
|
||||
SessionSummary,
|
||||
} from "../types.js";
|
||||
import { generateId } from "../utils/id-generator.js";
|
||||
import {
|
||||
isTransientScreenshotItem,
|
||||
pruneTransientScreenshotItems,
|
||||
shapeScreenshotItems,
|
||||
} from "../utils/screenshot-shaping.js";
|
||||
|
||||
function createEmptySessionMetrics(): SessionMetrics {
|
||||
return {
|
||||
@@ -53,7 +58,11 @@ export class Session implements OpenAISession {
|
||||
}
|
||||
|
||||
async addItems(items: AgentInputItem[]): Promise<void> {
|
||||
this.items.push(...items);
|
||||
// Shape screenshot tool results: strip base64 imageData from the tool
|
||||
// result and inject a transient user message with the real image so the
|
||||
// model can consume it via the standard vision path.
|
||||
const shaped = shapeScreenshotItems(items);
|
||||
this.items.push(...shaped);
|
||||
this.metadata["lastActiveAt"] = Date.now();
|
||||
this.updatePreview();
|
||||
}
|
||||
@@ -156,7 +165,12 @@ export class Session implements OpenAISession {
|
||||
private updatePreview(): void {
|
||||
const latestUserMessage = [...this.items]
|
||||
.reverse()
|
||||
.find((item) => item.type === "message" && item.role === "user");
|
||||
.find(
|
||||
(item) =>
|
||||
item.type === "message" &&
|
||||
item.role === "user" &&
|
||||
!isTransientScreenshotItem(item),
|
||||
);
|
||||
|
||||
const previewSource =
|
||||
this.extractContent(latestUserMessage) ??
|
||||
@@ -207,7 +221,9 @@ export class Session implements OpenAISession {
|
||||
toJSON(): SerializedSession {
|
||||
return {
|
||||
id: this.id,
|
||||
items: this.items,
|
||||
// Prune transient screenshot user-image messages before persisting
|
||||
// to avoid storing large base64 blobs in conversation history.
|
||||
items: pruneTransientScreenshotItems(this.items),
|
||||
metadata: this.metadata,
|
||||
config: this.config,
|
||||
metrics: this.sessionMetrics,
|
||||
|
||||
@@ -3,3 +3,9 @@
|
||||
*/
|
||||
|
||||
export { CancellationError, CancellationToken } from "./cancellation-token.js";
|
||||
export {
|
||||
isTransientScreenshotItem,
|
||||
pruneTransientScreenshotItems,
|
||||
shapeScreenshotItems,
|
||||
TRANSIENT_SCREENSHOT_MARKER,
|
||||
} from "./screenshot-shaping.js";
|
||||
|
||||
296
packages/core/src/utils/screenshot-shaping.test.ts
Normal file
296
packages/core/src/utils/screenshot-shaping.test.ts
Normal file
@@ -0,0 +1,296 @@
|
||||
import type { AgentInputItem } from "@openai/agents";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
isTransientScreenshotItem,
|
||||
pruneTransientScreenshotItems,
|
||||
shapeScreenshotItems,
|
||||
TRANSIENT_SCREENSHOT_MARKER,
|
||||
} from "./screenshot-shaping.js";
|
||||
|
||||
// --- Helpers ---
|
||||
|
||||
/** Tiny JPEG data URL used as the fake screenshot payload. */
const TEST_IMAGE_DATA = "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQ==";

/** Screenshot identifier carried next to the payload in the tool output. */
const TEST_SCREENSHOT_UID = "screenshot_1234567890_abcdefghi";

/** Text the tests expect in place of `imageData` once a result is stripped. */
const PLACEHOLDER = "[Image data removed - see following user message]";

/**
 * Build a `capture_screenshot` function_call_result whose JSON-encoded output
 * carries image data flagged with `sendToLLM: true`.
 *
 * @param overrides Fields merged over the default output; a key set to
 *   `undefined` disappears from the serialized output.
 */
function createScreenshotToolResult(
  overrides: Record<string, unknown> = {},
): AgentInputItem {
  const defaults: Record<string, unknown> = {
    success: true,
    imageData: TEST_IMAGE_DATA,
    sendToLLM: true,
    screenshotUid: TEST_SCREENSHOT_UID,
    tabId: 1,
    url: "https://example.com",
    title: "Example",
  };
  const payload = Object.assign(defaults, overrides);

  return {
    type: "function_call_result",
    name: "capture_screenshot",
    callId: "call_abc123",
    output: JSON.stringify(payload),
  } as AgentInputItem;
}
|
||||
|
||||
/**
 * Build a function_call_result for a tool (`get_tabs`) that is not a
 * screenshot tool, with a small JSON-encoded tab list as output.
 */
function createNonScreenshotToolResult(): AgentInputItem {
  const tabs = [{ id: 1, title: "Tab" }];
  const output = JSON.stringify({ tabs });

  return {
    type: "function_call_result",
    name: "get_tabs",
    callId: "call_other",
    output,
  } as AgentInputItem;
}
|
||||
|
||||
/**
 * Build a plain user message item whose content is the given text.
 *
 * @param text Message content, stored as a bare string.
 */
function createUserMessage(text: string): AgentInputItem {
  const message: AgentInputItem = {
    type: "message",
    role: "user",
    content: text,
  };
  return message;
}
|
||||
|
||||
// --- Tests ---
|
||||
|
||||
describe("shapeScreenshotItems", () => {
  /** Loose view of a shaped function_call_result item. */
  type ToolResultView = { type: string; output: string };

  /** Loose view of the injected user message. */
  type UserMessageView = {
    type: string;
    role: string;
    content: Array<{ type: string; text?: string; image?: string }>;
    providerData?: Record<string, unknown>;
  };

  /** Build a function_call_result item for `name` with a JSON-encoded payload. */
  const toolResultFor = (
    name: string,
    callId: string,
    payload: Record<string, unknown>,
  ): AgentInputItem =>
    ({
      type: "function_call_result",
      name,
      callId,
      output: JSON.stringify(payload),
    }) as AgentInputItem;

  /** Assert a flat tool result whose imageData was swapped for the placeholder. */
  const expectStrippedToolResult = (item: unknown): void => {
    const toolResult = item as ToolResultView;
    expect(toolResult.type).toBe("function_call_result");
    const parsed = JSON.parse(toolResult.output);
    expect(parsed.success).toBe(true);
    expect(parsed.imageData).toBe(PLACEHOLDER);
    expect(parsed.screenshotUid).toBe(TEST_SCREENSHOT_UID);
    expect(parsed.sendToLLM).toBe(true);
  };

  /** Assert the transient user-message envelope and hand the message back. */
  const expectTransientUserMessage = (item: unknown): UserMessageView => {
    const message = item as UserMessageView;
    expect(message.type).toBe("message");
    expect(message.role).toBe("user");
    expect(message.providerData?.[TRANSIENT_SCREENSHOT_MARKER]).toBe(true);
    return message;
  };

  /** Assert the message carries an input_image part holding the test image. */
  const expectImagePart = (message: UserMessageView): void => {
    const imagePart = message.content.find((c) => c.type === "input_image");
    expect(imagePart).toBeTruthy();
    expect((imagePart as { image: string }).image).toBe(TEST_IMAGE_DATA);
  };

  /** Assert that shaping a single item returns it untouched. */
  const expectPassThrough = (input: AgentInputItem): void => {
    const shaped = shapeScreenshotItems([input]);
    expect(shaped.length).toBe(1);
    expect(shaped[0]).toEqual(input);
  };

  it("should strip imageData and inject transient user image message for sendToLLM=true", () => {
    const shaped = shapeScreenshotItems([createScreenshotToolResult()]);

    expect(shaped.length).toBe(2);

    // First item: stripped tool result
    expectStrippedToolResult(shaped[0]);

    // Second item: transient user image message with text + image parts
    const userMsg = expectTransientUserMessage(shaped[1]);
    const textPart = userMsg.content.find((c) => c.type === "input_text");
    expect(textPart).toBeTruthy();
    expectImagePart(userMsg);
  });

  it("should pass through items when sendToLLM=false", () => {
    // No user image message may be injected for this result.
    expectPassThrough(
      createScreenshotToolResult({
        sendToLLM: false,
        imageData: undefined,
        captured: true,
      }),
    );
  });

  it("should pass through non-screenshot tools unchanged", () => {
    expectPassThrough(createNonScreenshotToolResult());
  });

  it("should pass through non-tool items unchanged", () => {
    expectPassThrough(createUserMessage("hello"));
  });

  it("should handle capture_tab_screenshot the same way", () => {
    const renamed = {
      ...createScreenshotToolResult(),
      name: "capture_tab_screenshot",
    } as AgentInputItem;

    const shaped = shapeScreenshotItems([renamed]);

    expect(shaped.length).toBe(2);
    expect((shaped[0] as { type: string }).type).toBe("function_call_result");
    expect((shaped[1] as { type: string; role: string }).role).toBe("user");
  });

  it("should handle capture_screenshot_with_highlight the same way", () => {
    const item = toolResultFor(
      "capture_screenshot_with_highlight",
      "call_highlight",
      {
        success: true,
        imageData: TEST_IMAGE_DATA,
        sendToLLM: true,
        screenshotUid: TEST_SCREENSHOT_UID,
        tabId: 1,
        url: "https://example.com",
        title: "Example",
        selector: ".my-element",
        cropped: true,
      },
    );

    const shaped = shapeScreenshotItems([item]);

    expect(shaped.length).toBe(2);

    // First item: stripped tool result
    expectStrippedToolResult(shaped[0]);

    // Second item: transient user image message
    expectImagePart(expectTransientUserMessage(shaped[1]));
  });

  it("should pass through capture_screenshot_with_highlight when sendToLLM=false", () => {
    // No imageData + sendToLLM=false → pass through unchanged
    expectPassThrough(
      toolResultFor(
        "capture_screenshot_with_highlight",
        "call_highlight_no_llm",
        {
          success: true,
          captured: true,
          sendToLLM: false,
          screenshotUid: TEST_SCREENSHOT_UID,
          tabId: 1,
          selector: ".my-element",
          cropped: true,
        },
      ),
    );
  });

  it("should handle mixed items correctly", () => {
    const shaped = shapeScreenshotItems([
      createUserMessage("Take a screenshot"),
      createNonScreenshotToolResult(),
      createScreenshotToolResult(),
      createUserMessage("What do you see?"),
    ]);

    // Original 4 items + 1 injected user image = 5
    expect(shaped.length).toBe(5);

    // Verify order: user, non-screenshot tool, stripped screenshot, user image, user
    const [first, second, third, fourth, fifth] = shaped;
    expect((first as { role: string }).role).toBe("user");
    expect((second as { name: string }).name).toBe("get_tabs");
    expect((third as { type: string }).type).toBe("function_call_result");
    expect(
      (fourth as { providerData?: Record<string, unknown> }).providerData?.[
        TRANSIENT_SCREENSHOT_MARKER
      ],
    ).toBe(true);
    expect((fifth as { role: string }).role).toBe("user");
  });

  it("should handle nested data structure", () => {
    const item = toolResultFor("capture_screenshot", "call_nested", {
      success: true,
      data: {
        success: true,
        imageData: TEST_IMAGE_DATA,
        sendToLLM: true,
        screenshotUid: TEST_SCREENSHOT_UID,
      },
    });

    const shaped = shapeScreenshotItems([item]);
    expect(shaped.length).toBe(2);

    const parsedOutput = JSON.parse((shaped[0] as ToolResultView).output);
    expect(parsedOutput.success).toBe(true);
    expect(parsedOutput.data.imageData).toBe(PLACEHOLDER);
    expect(parsedOutput.data.screenshotUid).toBe(TEST_SCREENSHOT_UID);
  });
});
|
||||
|
||||
describe("pruneTransientScreenshotItems", () => {
  it("should remove transient screenshot items", () => {
    const regular = createUserMessage("hello");

    // User image message flagged with the transient marker in providerData.
    const flagged = {
      type: "message",
      role: "user",
      content: [
        { type: "input_text", text: "screenshot" },
        { type: "input_image", image: TEST_IMAGE_DATA, detail: "auto" },
      ],
      providerData: { [TRANSIENT_SCREENSHOT_MARKER]: true },
    } as AgentInputItem;

    const remaining = pruneTransientScreenshotItems([regular, flagged]);

    expect(remaining.length).toBe(1);
    expect(remaining[0]).toEqual(regular);
  });

  it("should keep all items when no transients exist", () => {
    const remaining = pruneTransientScreenshotItems([
      createUserMessage("a"),
      createUserMessage("b"),
    ]);

    expect(remaining.length).toBe(2);
  });
});
|
||||
|
||||
describe("isTransientScreenshotItem", () => {
  it("should return true for transient items", () => {
    // A user message whose providerData carries the transient marker.
    const flagged = {
      type: "message",
      role: "user",
      content: "test",
      providerData: { [TRANSIENT_SCREENSHOT_MARKER]: true },
    } as unknown as AgentInputItem;

    const verdict = isTransientScreenshotItem(flagged);

    expect(verdict).toBe(true);
  });

  it("should return false for normal items", () => {
    const verdict = isTransientScreenshotItem(createUserMessage("hello"));

    expect(verdict).toBe(false);
  });
});
|
||||
199
packages/core/src/utils/screenshot-shaping.ts
Normal file
199
packages/core/src/utils/screenshot-shaping.ts
Normal file
@@ -0,0 +1,199 @@
|
||||
/**
|
||||
* Screenshot message shaping utilities.
|
||||
*
|
||||
* When a screenshot tool returns `sendToLLM=true`, the large base64 imageData
|
||||
* must NOT be sent inside the function_call_result output (models may not
|
||||
* support images there, and it bloats token counts).
|
||||
*
|
||||
* Instead, the imageData is:
|
||||
* 1. Stripped from the tool result (replaced with a placeholder string).
|
||||
* 2. Injected as a follow-up user message with `input_image` content.
|
||||
*
|
||||
* This matches the message flow used in the original aipex codebase.
|
||||
*/
|
||||
|
||||
import type { AgentInputItem } from "@openai/agents";
|
||||
import { safeJsonParse } from "./json.js";
|
||||
|
||||
/** Names of the tools whose results are inspected for screenshot image data. */
const SCREENSHOT_TOOL_NAMES = new Set<string>([
  "capture_screenshot",
  "capture_screenshot_with_highlight",
  "capture_tab_screenshot",
]);

/** Text written into a tool result in place of its stripped `imageData`. */
const IMAGE_DATA_PLACEHOLDER =
  "[Image data removed - see following user message]";

/**
 * `providerData` key set on injected user-image messages so they can be
 * recognised and pruned later.
 */
export const TRANSIENT_SCREENSHOT_MARKER = "__transient_screenshot__";
|
||||
|
||||
/**
 * Shape a batch of agent input items so screenshot images travel as user
 * messages instead of inside tool results.
 *
 * For every `function_call_result` whose tool name is in
 * `SCREENSHOT_TOOL_NAMES`, whose output parses as JSON, and from which
 * `extractImageData` yields an image:
 *  - the item is re-emitted with its output rebuilt by `buildStrippedOutput`
 *    (imageData replaced by a placeholder), and
 *  - a user message carrying the real image is inserted right after it,
 *    tagged with `TRANSIENT_SCREENSHOT_MARKER` in `providerData`.
 *
 * Every other item is passed through as the same object, in its original
 * position.
 *
 * @param items Items to shape; the array itself is not mutated.
 * @returns A new array containing the shaped items.
 */
export function shapeScreenshotItems(
  items: AgentInputItem[],
): AgentInputItem[] {
  const result: AgentInputItem[] = [];

  for (const item of items) {
    if (item.type !== "function_call_result") {
      result.push(item);
      continue;
    }

    const funcResult = item as {
      type: "function_call_result";
      name: string;
      callId: string;
      output: string;
      [key: string]: unknown;
    };

    if (!SCREENSHOT_TOOL_NAMES.has(funcResult.name)) {
      result.push(item);
      continue;
    }

    // Output that does not parse to an object is left untouched.
    const parsed = safeJsonParse<Record<string, unknown>>(funcResult.output);
    if (!parsed) {
      result.push(item);
      continue;
    }

    const extracted = extractImageData(parsed);
    if (!extracted) {
      // No sendToLLM image data – pass through
      result.push(item);
      continue;
    }

    // 1. Re-emit the tool result with imageData stripped.
    const strippedItem = {
      ...item,
      output: JSON.stringify(
        buildStrippedOutput(parsed, extracted.screenshotUid),
      ),
    } as AgentInputItem;

    // 2. Follow it with a transient user message carrying the real image.
    result.push(strippedItem, createTransientImageMessage(extracted.imageData));
  }

  return result;
}

/**
 * Build the user message that delivers a screenshot as `input_image` content.
 *
 * Every tool admitted by `SCREENSHOT_TOOL_NAMES` gets the same caption, so no
 * per-tool wording is selected here.
 */
function createTransientImageMessage(imageData: string): AgentInputItem {
  return {
    type: "message",
    role: "user",
    content: [
      { type: "input_text", text: "Here is the screenshot you requested:" },
      {
        type: "input_image",
        image: imageData,
        detail: "auto",
      },
    ],
    // Mark as transient so it can be pruned before persistence/compression
    providerData: { [TRANSIENT_SCREENSHOT_MARKER]: true },
  } as AgentInputItem;
}
|
||||
|
||||
/**
 * Return a copy of `items` without the entries whose `providerData` carries a
 * truthy `TRANSIENT_SCREENSHOT_MARKER`.
 *
 * Intended for use before persistence or compression.
 *
 * @param items Items to filter; the input array is left untouched.
 */
export function pruneTransientScreenshotItems(
  items: AgentInputItem[],
): AgentInputItem[] {
  const isPersistent = (candidate: AgentInputItem): boolean => {
    const providerData = (
      candidate as { providerData?: Record<string, unknown> }
    ).providerData;
    const flagged = providerData?.[TRANSIENT_SCREENSHOT_MARKER];
    return !flagged;
  };

  return items.filter(isPersistent);
}
|
||||
|
||||
/**
 * Tell whether `item` carries a truthy `TRANSIENT_SCREENSHOT_MARKER` in its
 * `providerData`, i.e. whether it is an injected screenshot user-image message.
 */
export function isTransientScreenshotItem(item: AgentInputItem): boolean {
  const { providerData } = item as {
    providerData?: Record<string, unknown>;
  };
  return Boolean(providerData?.[TRANSIENT_SCREENSHOT_MARKER]);
}
|
||||
|
||||
// ===================== Internal helpers =====================
|
||||
|
||||
/** Image payload pulled out of a screenshot tool result. */
interface ExtractedImage {
  /** The `data:image/...` URL found in the tool output. */
  imageData: string;
  /** Screenshot identifier, present only when the output held a string uid. */
  screenshotUid?: string;
}

/**
 * Pull the screenshot image out of a parsed tool output.
 *
 * Two layouts are read:
 *   { success, data: { imageData, sendToLLM, screenshotUid } }
 *   { success, imageData, sendToLLM, screenshotUid }
 * When `data` is present (not null/undefined) only `data` is consulted.
 *
 * @returns The image and optional uid, or `null` when `success` is falsy,
 *   `sendToLLM` is not exactly `true`, or `imageData` is not a string
 *   starting with `data:image/`.
 */
function extractImageData(
  parsed: Record<string, unknown>,
): ExtractedImage | null {
  if (!parsed.success) {
    return null;
  }

  // Prefer the nested `data` wrapper, otherwise read from the top level.
  const container =
    (parsed.data as Record<string, unknown> | undefined) ?? parsed;
  const { sendToLLM, imageData, screenshotUid } = container;

  // The image is only forwarded on an explicit opt-in.
  if (sendToLLM !== true) {
    return null;
  }
  if (typeof imageData !== "string") {
    return null;
  }
  if (!imageData.startsWith("data:image/")) {
    return null;
  }

  const uid = typeof screenshotUid === "string" ? screenshotUid : undefined;
  return { imageData, screenshotUid: uid };
}
|
||||
|
||||
/**
 * Build the tool output object with `imageData` replaced by
 * `IMAGE_DATA_PLACEHOLDER`.
 *
 * The layout of the input is kept:
 *  - flat output → `{ success: true, ...fields, imageData: placeholder }`
 *    (a `success` field in the input overrides the leading `true`);
 *  - output with a `data` wrapper → the wrapper is rebuilt with the
 *    placeholder, `success` is set to `true`, and all other top-level fields
 *    of the input are carried over rather than dropped.
 *
 * @param parsed Parsed tool output.
 * @param screenshotUid When non-empty, written into the stripped payload as
 *   `screenshotUid`.
 * @returns A new object; `parsed` is not mutated.
 */
function buildStrippedOutput(
  parsed: Record<string, unknown>,
  screenshotUid?: string,
): Record<string, unknown> {
  const data = parsed.data as Record<string, unknown> | undefined;

  const stripped: Record<string, unknown> = {
    ...(data ?? parsed),
    imageData: IMAGE_DATA_PLACEHOLDER,
  };

  if (screenshotUid) {
    stripped.screenshotUid = screenshotUid;
  }

  if (!data) {
    return { success: true, ...stripped };
  }

  // Preserve the `data` wrapper together with its top-level siblings so that
  // context stored next to `data` is not lost.
  const wrapped: Record<string, unknown> = {
    ...parsed,
    success: true,
    data: stripped,
  };

  // A stray top-level image string must not survive the stripping either.
  if (typeof wrapped.imageData === "string") {
    wrapped.imageData = IMAGE_DATA_PLACEHOLDER;
  }

  return wrapped;
}
|
||||
Reference in New Issue
Block a user