Compare commits

...

7 Commits

Author SHA1 Message Date
shivammittal274
8c62697a47 fix: remove unreachable CSS branch in recording waveform div 2026-03-17 18:27:53 +05:30
shivammittal274
b038d772e8 fix: keep mic button always visible inside input alongside send
Both mic and send buttons are always visible inside the input field,
positioned on the right side (ChatGPT-style). Mic is disabled while
AI is streaming. Send is disabled during recording/transcribing.
2026-03-17 18:16:29 +05:30
shivammittal274
c3229bd5f3 fix: keep mic button always visible alongside send button
Mic and send are now separate buttons, both always visible.
Mic is disabled while AI is streaming. Send is disabled during
recording/transcribing. Buttons are no longer absolutely positioned
inside the textarea — they sit beside it in the flex row.
2026-03-17 18:14:53 +05:30
shivammittal274
62f8107405 fix: analytics only tracks on success, clean up stream on failure, type API response
- startRecording returns boolean; track(RECORDING_STARTED) only fires on success
- Catch block cleans up MediaStream tracks and AudioContext on partial failure
- Type transcription API response with TranscribeResponse interface
2026-03-17 17:46:55 +05:30
shivammittal274
45da6526ab fix: await startRecording before tracking, narrow SurveyChat effect deps
- Await startRecording() so analytics only fires after mic permission granted
- Narrow SurveyChat useEffect dependency from [voice] to [voice.transcript, voice.isTranscribing]
2026-03-17 17:25:35 +05:30
shivammittal274
41e8461f01 fix: address review — add fetch timeout, await stopRecording, deduplicate VoiceInputState
- Add AbortSignal.timeout(30s) to transcription fetch
- Await stopRecording() and track analytics after completion
- Export VoiceInputState from useVoiceInput, import in consumers
2026-03-17 17:08:46 +05:30
shivammittal274
62440de783 feat: add voice input to agent chat sidebar
Allow users to record voice and transcribe to text in the chat input.
Mic button shows when input is empty, waveform visualizer during recording,
transcription via OpenAI (llm.browseros.com/api/transcribe).

- Extract shared useVoiceInput hook to lib/voice/
- Time-domain waveform bars that bounce per-frequency-band
- Bar height capped to fit input container
- Analytics events for recording lifecycle
2026-03-17 16:09:33 +05:30
6 changed files with 244 additions and 54 deletions

View File

@@ -4,8 +4,8 @@ import { MessageResponse } from '@/components/ai-elements/message'
import { Button } from '@/components/ui/button'
import { Textarea } from '@/components/ui/textarea'
import { cn } from '@/lib/utils'
import { useVoiceInput } from '@/lib/voice/useVoiceInput'
import type { Message } from './useSurveyChat'
import { useVoiceInput } from './useVoiceInput'
import { VoiceInputButton } from './VoiceInputButton'
interface Props {
@@ -81,6 +81,7 @@ export const Chat: FC<Props> = ({
}, [messagesLength])
// Insert transcript into input when transcription completes
// biome-ignore lint/correctness/useExhaustiveDependencies: only trigger on transcript/transcribing change
useEffect(() => {
if (voice.transcript && !voice.isTranscribing) {
setInput((prev) => {
@@ -89,7 +90,7 @@ export const Chat: FC<Props> = ({
})
voice.clearTranscript()
}
}, [voice])
}, [voice.transcript, voice.isTranscribing])
const handleSubmit = (e: FormEvent) => {
e.preventDefault()

View File

@@ -8,9 +8,14 @@ import {
SIDEPANEL_SUGGESTION_CLICKED_EVENT,
SIDEPANEL_TAB_REMOVED_EVENT,
SIDEPANEL_TAB_TOGGLED_EVENT,
SIDEPANEL_VOICE_ERROR_EVENT,
SIDEPANEL_VOICE_RECORDING_STARTED_EVENT,
SIDEPANEL_VOICE_RECORDING_STOPPED_EVENT,
SIDEPANEL_VOICE_TRANSCRIPTION_COMPLETED_EVENT,
} from '@/lib/constants/analyticsEvents'
import { useJtbdPopup } from '@/lib/jtbd-popup/useJtbdPopup'
import { track } from '@/lib/metrics/track'
import { useVoiceInput } from '@/lib/voice/useVoiceInput'
import { useChatSessionContext } from '../layout/ChatSessionContext'
import { ChatEmptyState } from './ChatEmptyState'
import { ChatError } from './ChatError'
@@ -48,6 +53,8 @@ export const Chat = () => {
onDismiss: onDismissJtbdPopup,
} = useJtbdPopup()
const voice = useVoiceInput()
const [input, setInput] = useState('')
const [attachedTabs, setAttachedTabs] = useState<chrome.tabs.Tab[]>([])
const [mounted, setMounted] = useState(false)
@@ -83,6 +90,26 @@ export const Chat = () => {
previousChatStatus.current = status
}, [status])
// Insert transcript into input when transcription completes
// biome-ignore lint/correctness/useExhaustiveDependencies: only trigger on transcript/transcribing change
useEffect(() => {
if (voice.transcript && !voice.isTranscribing) {
setInput((prev) => {
const separator = prev.trim() ? ' ' : ''
return prev + separator + voice.transcript
})
track(SIDEPANEL_VOICE_TRANSCRIPTION_COMPLETED_EVENT)
voice.clearTranscript()
}
}, [voice.transcript, voice.isTranscribing])
// Track voice errors
useEffect(() => {
if (voice.error) {
track(SIDEPANEL_VOICE_ERROR_EVENT, { error: voice.error })
}
}, [voice.error])
const handleModeChange = (newMode: ChatMode) => {
track(SIDEPANEL_MODE_CHANGED_EVENT, { from: mode, to: newMode })
setMode(newMode)
@@ -147,6 +174,27 @@ export const Chat = () => {
executeMessage(suggestion)
}
const handleStartRecording = async () => {
const started = await voice.startRecording()
if (started) {
track(SIDEPANEL_VOICE_RECORDING_STARTED_EVENT)
}
}
const handleStopRecording = async () => {
await voice.stopRecording()
track(SIDEPANEL_VOICE_RECORDING_STOPPED_EVENT)
}
const voiceState = {
isRecording: voice.isRecording,
isTranscribing: voice.isTranscribing,
audioLevels: voice.audioLevels,
error: voice.error,
onStartRecording: handleStartRecording,
onStopRecording: handleStopRecording,
}
return (
<>
<main className="mt-4 flex h-full flex-1 flex-col space-y-4 overflow-y-auto">
@@ -190,6 +238,7 @@ export const Chat = () => {
attachedTabs={attachedTabs}
onToggleTab={toggleTabSelection}
onRemoveTab={removeTab}
voice={voiceState}
/>
</>
)

View File

@@ -10,6 +10,7 @@ import { useCapabilities } from '@/lib/browseros/useCapabilities'
import { useMcpServers } from '@/lib/mcp/mcpServerStorage'
import { useSyncRemoteIntegrations } from '@/lib/mcp/useSyncRemoteIntegrations'
import { cn } from '@/lib/utils'
import type { VoiceInputState } from '@/lib/voice/useVoiceInput'
import { useWorkspace } from '@/lib/workspace/use-workspace'
import { ChatAttachedTabs } from './ChatAttachedTabs'
import { ChatInput, type ChatInputHandle } from './ChatInput'
@@ -27,6 +28,7 @@ interface ChatFooterProps {
attachedTabs: chrome.tabs.Tab[]
onToggleTab: (tab: chrome.tabs.Tab) => void
onRemoveTab: (tabId?: number) => void
voice?: VoiceInputState
}
export const ChatFooter: FC<ChatFooterProps> = ({
@@ -40,6 +42,7 @@ export const ChatFooter: FC<ChatFooterProps> = ({
attachedTabs,
onToggleTab,
onRemoveTab,
voice,
}) => {
const { selectedFolder } = useWorkspace()
const { supports } = useCapabilities()
@@ -172,6 +175,10 @@ export const ChatFooter: FC<ChatFooterProps> = ({
</div>
</div>
{voice?.error && (
<div className="mt-1 text-destructive text-xs">{voice.error}</div>
)}
<ChatInput
input={input}
status={status}
@@ -182,6 +189,7 @@ export const ChatFooter: FC<ChatFooterProps> = ({
selectedTabs={attachedTabs}
onToggleTab={onToggleTab}
onTabMentionOpenChange={setIsTabMentionOpen}
voice={voice}
ref={chatInputRef}
/>
</div>

View File

@@ -1,4 +1,4 @@
import { Send, SquareStop } from 'lucide-react'
import { Loader2, Mic, Send, Square, SquareStop } from 'lucide-react'
import type { FormEvent, KeyboardEvent } from 'react'
import {
forwardRef,
@@ -10,6 +10,7 @@ import {
} from 'react'
import { TabPickerPopover } from '@/components/elements/tab-picker-popover'
import { cn } from '@/lib/utils'
import type { VoiceInputState } from '@/lib/voice/useVoiceInput'
import type { ChatMode } from './chatTypes'
interface MentionState {
@@ -28,6 +29,7 @@ interface ChatInputProps {
selectedTabs: chrome.tabs.Tab[]
onToggleTab: (tab: chrome.tabs.Tab) => void
onTabMentionOpenChange?: (isOpen: boolean) => void
voice?: VoiceInputState
}
export interface ChatInputHandle {
@@ -49,6 +51,7 @@ export const ChatInput = forwardRef<ChatInputHandle, ChatInputProps>(
selectedTabs,
onToggleTab,
onTabMentionOpenChange,
voice,
},
ref,
) => {
@@ -259,6 +262,70 @@ export const ChatInput = forwardRef<ChatInputHandle, ChatInputProps>(
return () => document.removeEventListener('mousedown', handleClickOutside)
}, [mentionState.isOpen, closeMention])
const renderVoiceButton = () => {
if (!voice) return null
if (voice.isRecording) {
return (
<button
type="button"
onClick={voice.onStopRecording}
className="cursor-pointer rounded-full bg-red-600 p-2 text-white shadow-sm transition-all duration-200 hover:bg-red-900"
>
<Square className="h-3.5 w-3.5" />
<span className="sr-only">Stop recording</span>
</button>
)
}
if (voice.isTranscribing) {
return (
<button type="button" disabled className="rounded-full p-2 text-muted-foreground">
<Loader2 className="h-3.5 w-3.5 animate-spin" />
<span className="sr-only">Transcribing</span>
</button>
)
}
return (
<button
type="button"
onClick={voice.onStartRecording}
disabled={isBusy}
className="cursor-pointer rounded-full p-2 text-muted-foreground transition-all duration-200 hover:bg-muted hover:text-foreground disabled:cursor-not-allowed disabled:opacity-50"
>
<Mic className="h-3.5 w-3.5" />
<span className="sr-only">Voice input</span>
</button>
)
}
const renderSendButton = () => {
if (isBusy) {
return (
<button
type="button"
onClick={onStop}
className="cursor-pointer rounded-full bg-red-600 p-2 text-white shadow-sm transition-all duration-200 hover:bg-red-900"
>
<SquareStop className="h-3.5 w-3.5" />
<span className="sr-only">Stop</span>
</button>
)
}
return (
<button
type="submit"
disabled={!input.trim() || voice?.isRecording || voice?.isTranscribing}
className="cursor-pointer rounded-full bg-[var(--accent-orange)] p-2 text-white shadow-sm transition-all duration-200 hover:bg-[var(--accent-orange-bright)] disabled:cursor-not-allowed disabled:opacity-50"
>
<Send className="h-3.5 w-3.5" />
<span className="sr-only">Send</span>
</button>
)
}
return (
<form
onSubmit={handleSubmit}
@@ -273,38 +340,45 @@ export const ChatInput = forwardRef<ChatInputHandle, ChatInputProps>(
onClose={closeMention}
anchorRef={textareaRef}
/>
<textarea
ref={textareaRef}
className={cn(
'field-sizing-content max-h-60 min-h-[42px] flex-1 resize-none overflow-hidden rounded-2xl border border-border/50 bg-muted/50 px-4 py-2.5 pr-11 text-sm outline-none transition-colors placeholder:text-muted-foreground/70 hover:border-border focus:border-[var(--accent-orange)]',
)}
value={input}
onChange={(e) => handleInputChange(e.target.value)}
onKeyDown={handleKeyDown}
placeholder={
mode === 'chat' ? 'Ask about this page...' : 'What should I do?'
}
rows={1}
/>
{isBusy ? (
<button
type="button"
onClick={onStop}
className="absolute right-1.5 bottom-1.5 cursor-pointer rounded-full bg-red-600 p-2 text-white shadow-sm transition-all duration-200 hover:bg-red-900 disabled:cursor-not-allowed disabled:opacity-50"
{voice?.isRecording ? (
<div
className="flex min-h-[42px] flex-1 items-center justify-center gap-1 rounded-2xl border border-red-500/50 bg-muted/50 px-4 py-2.5 pr-[4.5rem]"
>
<SquareStop className="h-3.5 w-3.5" />
<span className="sr-only">Stop</span>
</button>
{voice.audioLevels.map((level, i) => (
<div
key={i}
className="w-1 rounded-full bg-red-500 transition-all duration-75"
style={{
height: `${Math.max(4, Math.min(20, level * 0.6))}px`,
}}
/>
))}
</div>
) : (
<button
type="submit"
disabled={!input.trim()}
className="absolute right-1.5 bottom-1.5 cursor-pointer rounded-full bg-[var(--accent-orange)] p-2 text-white shadow-sm transition-all duration-200 hover:bg-[var(--accent-orange-bright)] disabled:cursor-not-allowed disabled:opacity-50"
>
<Send className="h-3.5 w-3.5" />
<span className="sr-only">Send</span>
</button>
<textarea
ref={textareaRef}
className={cn(
'field-sizing-content max-h-60 min-h-[42px] flex-1 resize-none overflow-hidden rounded-2xl border border-border/50 bg-muted/50 px-4 py-2.5 text-sm outline-none transition-colors placeholder:text-muted-foreground/70 hover:border-border focus:border-[var(--accent-orange)]',
voice ? 'pr-[4.5rem]' : 'pr-11',
)}
value={input}
onChange={(e) => handleInputChange(e.target.value)}
onKeyDown={handleKeyDown}
placeholder={
voice?.isTranscribing
? 'Transcribing...'
: mode === 'chat'
? 'Ask about this page...'
: 'What should I do?'
}
disabled={voice?.isTranscribing}
rows={1}
/>
)}
<div className="absolute right-1.5 bottom-1.5 flex items-center gap-1">
{renderVoiceButton()}
{renderSendButton()}
</div>
</form>
)
},

View File

@@ -251,3 +251,18 @@ export const KIMI_RATE_LIMIT_DOCS_CLICKED_EVENT =
/** @public */
export const KIMI_RATE_LIMIT_PLATFORM_CLICKED_EVENT =
'ui.rate_limit.moonshot_platform_clicked'
/** @public */
export const SIDEPANEL_VOICE_RECORDING_STARTED_EVENT =
'sidepanel.voice.recording_started'
/** @public */
export const SIDEPANEL_VOICE_RECORDING_STOPPED_EVENT =
'sidepanel.voice.recording_stopped'
/** @public */
export const SIDEPANEL_VOICE_TRANSCRIPTION_COMPLETED_EVENT =
'sidepanel.voice.transcription_completed'
/** @public */
export const SIDEPANEL_VOICE_ERROR_EVENT = 'sidepanel.voice.error'

View File

@@ -1,18 +1,35 @@
import { useCallback, useEffect, useRef, useState } from 'react'
import { useEffect, useRef, useState } from 'react'
const GATEWAY_URL = 'https://llm.browseros.com'
const WAVEFORM_BAND_COUNT = 5
interface UseVoiceInputReturn {
export interface VoiceInputState {
isRecording: boolean
isTranscribing: boolean
audioLevels: number[]
error: string | null
onStartRecording: () => void
onStopRecording: () => void
}
export interface UseVoiceInputReturn {
isRecording: boolean
isTranscribing: boolean
transcript: string
audioLevel: number
audioLevels: number[]
error: string | null
startRecording: () => Promise<void>
startRecording: () => Promise<boolean>
stopRecording: () => Promise<void>
clearTranscript: () => void
}
const EMPTY_LEVELS = Array(WAVEFORM_BAND_COUNT).fill(0)
interface TranscribeResponse {
text: string
}
async function transcribeAudio(audioBlob: Blob): Promise<string> {
const formData = new FormData()
formData.append('file', audioBlob, 'recording.webm')
@@ -21,16 +38,17 @@ async function transcribeAudio(audioBlob: Blob): Promise<string> {
const response = await fetch(`${GATEWAY_URL}/api/transcribe`, {
method: 'POST',
body: formData,
signal: AbortSignal.timeout(30_000),
})
if (!response.ok) {
const error = await response
const errorBody: { error?: string } = await response
.json()
.catch(() => ({ error: 'Transcription failed' }))
throw new Error(error.error || `Transcription failed: ${response.status}`)
throw new Error(errorBody.error || `Transcription failed: ${response.status}`)
}
const result = await response.json()
const result: TranscribeResponse = await response.json()
return result.text || ''
}
@@ -39,6 +57,7 @@ export function useVoiceInput(): UseVoiceInputReturn {
const [isTranscribing, setIsTranscribing] = useState(false)
const [transcript, setTranscript] = useState('')
const [audioLevel, setAudioLevel] = useState(0)
const [audioLevels, setAudioLevels] = useState<number[]>(EMPTY_LEVELS)
const [error, setError] = useState<string | null>(null)
const mediaRecorderRef = useRef<MediaRecorder | null>(null)
@@ -48,7 +67,7 @@ export function useVoiceInput(): UseVoiceInputReturn {
const analyserRef = useRef<AnalyserNode | null>(null)
const animationFrameRef = useRef<number | null>(null)
const stopAudioLevelMonitoring = useCallback(() => {
const stopAudioLevelMonitoring = () => {
if (animationFrameRef.current) {
cancelAnimationFrame(animationFrameRef.current)
animationFrameRef.current = null
@@ -59,7 +78,8 @@ export function useVoiceInput(): UseVoiceInputReturn {
audioContextRef.current = null
analyserRef.current = null
setAudioLevel(0)
}, [])
setAudioLevels(EMPTY_LEVELS)
}
useEffect(() => {
return () => {
@@ -71,9 +91,9 @@ export function useVoiceInput(): UseVoiceInputReturn {
}
stopAudioLevelMonitoring()
}
}, [stopAudioLevelMonitoring])
}, [])
const startAudioLevelMonitoring = useCallback((stream: MediaStream) => {
const startAudioLevelMonitoring = (stream: MediaStream) => {
const audioContext = new AudioContext()
const analyser = audioContext.createAnalyser()
analyser.fftSize = 256
@@ -87,20 +107,36 @@ export function useVoiceInput(): UseVoiceInputReturn {
const updateLevel = () => {
if (!analyserRef.current) return
const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount)
analyserRef.current.getByteFrequencyData(dataArray)
const dataArray = new Uint8Array(analyserRef.current.fftSize)
analyserRef.current.getByteTimeDomainData(dataArray)
const average = dataArray.reduce((a, b) => a + b, 0) / dataArray.length
const normalized = Math.min(100, (average / 128) * 100)
setAudioLevel(Math.round(normalized))
const binCount = dataArray.length
const levels: number[] = []
let totalPeak = 0
for (let band = 0; band < WAVEFORM_BAND_COUNT; band++) {
const start = Math.floor((band / WAVEFORM_BAND_COUNT) * binCount)
const end = Math.floor(((band + 1) / WAVEFORM_BAND_COUNT) * binCount)
let peak = 0
for (let j = start; j < end; j++) {
const amplitude = Math.abs(dataArray[j] - 128)
if (amplitude > peak) peak = amplitude
}
const normalized = Math.round(Math.min(100, (peak / 50) * 100))
levels.push(normalized)
totalPeak += normalized
}
setAudioLevels(levels)
setAudioLevel(Math.round(totalPeak / WAVEFORM_BAND_COUNT))
animationFrameRef.current = requestAnimationFrame(updateLevel)
}
updateLevel()
}, [])
}
const startRecording = useCallback(async () => {
const startRecording = async (): Promise<boolean> => {
try {
setError(null)
setTranscript('')
@@ -133,7 +169,12 @@ export function useVoiceInput(): UseVoiceInputReturn {
mediaRecorder.start(250)
setIsRecording(true)
return true
} catch (err) {
streamRef.current?.getTracks().forEach((track) => track.stop())
streamRef.current = null
stopAudioLevelMonitoring()
if (err instanceof Error) {
if (err.name === 'NotAllowedError') {
setError('Microphone permission denied')
@@ -145,10 +186,11 @@ export function useVoiceInput(): UseVoiceInputReturn {
} else {
setError('Failed to start recording')
}
return false
}
}, [startAudioLevelMonitoring])
}
const stopRecording = useCallback(async () => {
const stopRecording = async () => {
const mediaRecorder = mediaRecorderRef.current
if (!mediaRecorder || mediaRecorder.state === 'inactive') {
@@ -188,18 +230,19 @@ export function useVoiceInput(): UseVoiceInputReturn {
} finally {
setIsTranscribing(false)
}
}, [stopAudioLevelMonitoring])
}
const clearTranscript = useCallback(() => {
const clearTranscript = () => {
setTranscript('')
setError(null)
}, [])
}
return {
isRecording,
isTranscribing,
transcript,
audioLevel,
audioLevels,
error,
startRecording,
stopRecording,