Files
BrowserOS/packages/browseros-agent/apps/agent/lib/voice/useVoiceInput.ts
Nikhil 2271277b4d feat: add voice input to new tab search bar (#509)
* feat: add voice recording UI with waveform overlay to new tab search bar

Add a microphone button to the NewTab search bar that opens a fullscreen
recording overlay powered by react-voice-visualizer. The overlay shows a
real-time waveform visualization during recording, recording time, and a
stop button. On completion, the audio is transcribed via the existing
gateway endpoint and the transcript auto-navigates to inline chat.

Changes:
- Extract transcribeAudio() to shared lib/voice/transcribe-audio.ts
- Add VoiceRecordingOverlay component with react-voice-visualizer
- Add Mic button to NewTab search bar
- Track analytics via existing NEWTAB_VOICE_* events
- Handle cancel (backdrop click) vs submit (stop button) correctly

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: address PR review comments for voice recording overlay

- Reset processingRef on transcription error to prevent stuck state
- Use stable callback refs to prevent useEffect re-runs from inline
  arrow function props (fixes timer reset and unnecessary re-processing)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: replace voice overlay with inline sidepanel-style voice UI

Remove react-voice-visualizer dependency and VoiceRecordingOverlay.
Instead use the same inline voice pattern as the sidepanel ChatInput:
- Waveform bars replace the search input during recording
- Mic/stop/loading button states in the search bar
- Transcript populates the search input on completion
- Voice error shown inline below the search bar

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-20 11:33:01 -07:00

229 lines
6.2 KiB
TypeScript

import { useEffect, useRef, useState } from 'react'
import { transcribeAudio } from './transcribe-audio'
const WAVEFORM_BAND_COUNT = 5
export interface VoiceInputState {
isRecording: boolean
isTranscribing: boolean
audioLevels: number[]
error: string | null
onStartRecording: () => void
onStopRecording: () => void
}
export interface UseVoiceInputReturn {
isRecording: boolean
isTranscribing: boolean
transcript: string
audioLevel: number
audioLevels: number[]
error: string | null
startRecording: () => Promise<boolean>
stopRecording: () => Promise<void>
clearTranscript: () => void
}
const EMPTY_LEVELS = Array(WAVEFORM_BAND_COUNT).fill(0)
export function useVoiceInput(): UseVoiceInputReturn {
const [isRecording, setIsRecording] = useState(false)
const [isTranscribing, setIsTranscribing] = useState(false)
const [transcript, setTranscript] = useState('')
const [audioLevel, setAudioLevel] = useState(0)
const [audioLevels, setAudioLevels] = useState<number[]>(EMPTY_LEVELS)
const [error, setError] = useState<string | null>(null)
const mediaRecorderRef = useRef<MediaRecorder | null>(null)
const streamRef = useRef<MediaStream | null>(null)
const chunksRef = useRef<Blob[]>([])
const audioContextRef = useRef<AudioContext | null>(null)
const analyserRef = useRef<AnalyserNode | null>(null)
const animationFrameRef = useRef<number | null>(null)
const stopAudioLevelMonitoring = () => {
if (animationFrameRef.current) {
cancelAnimationFrame(animationFrameRef.current)
animationFrameRef.current = null
}
if (audioContextRef.current?.state !== 'closed') {
audioContextRef.current?.close()
}
audioContextRef.current = null
analyserRef.current = null
setAudioLevel(0)
setAudioLevels(EMPTY_LEVELS)
}
// biome-ignore lint/correctness/useExhaustiveDependencies: cleanup only needs to run on unmount
useEffect(() => {
return () => {
streamRef.current?.getTracks().forEach((track) => {
track.stop()
})
if (mediaRecorderRef.current?.state === 'recording') {
mediaRecorderRef.current.stop()
}
stopAudioLevelMonitoring()
}
}, [])
const startAudioLevelMonitoring = (stream: MediaStream) => {
const audioContext = new AudioContext()
const analyser = audioContext.createAnalyser()
analyser.fftSize = 256
const source = audioContext.createMediaStreamSource(stream)
source.connect(analyser)
audioContextRef.current = audioContext
analyserRef.current = analyser
const updateLevel = () => {
if (!analyserRef.current) return
const dataArray = new Uint8Array(analyserRef.current.fftSize)
analyserRef.current.getByteTimeDomainData(dataArray)
const binCount = dataArray.length
const levels: number[] = []
let totalPeak = 0
for (let band = 0; band < WAVEFORM_BAND_COUNT; band++) {
const start = Math.floor((band / WAVEFORM_BAND_COUNT) * binCount)
const end = Math.floor(((band + 1) / WAVEFORM_BAND_COUNT) * binCount)
let peak = 0
for (let j = start; j < end; j++) {
const amplitude = Math.abs(dataArray[j] - 128)
if (amplitude > peak) peak = amplitude
}
const normalized = Math.round(Math.min(100, (peak / 50) * 100))
levels.push(normalized)
totalPeak += normalized
}
setAudioLevels(levels)
setAudioLevel(Math.round(totalPeak / WAVEFORM_BAND_COUNT))
animationFrameRef.current = requestAnimationFrame(updateLevel)
}
updateLevel()
}
const startRecording = async (): Promise<boolean> => {
try {
setError(null)
setTranscript('')
chunksRef.current = []
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
sampleRate: 16000,
echoCancellation: true,
noiseSuppression: true,
},
})
streamRef.current = stream
startAudioLevelMonitoring(stream)
const mimeType = MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
? 'audio/webm;codecs=opus'
: 'audio/webm'
const mediaRecorder = new MediaRecorder(stream, { mimeType })
mediaRecorderRef.current = mediaRecorder
mediaRecorder.ondataavailable = (e) => {
if (e.data.size > 0) {
chunksRef.current.push(e.data)
}
}
mediaRecorder.start(250)
setIsRecording(true)
return true
} catch (err) {
streamRef.current?.getTracks().forEach((track) => {
track.stop()
})
streamRef.current = null
stopAudioLevelMonitoring()
if (err instanceof Error) {
if (err.name === 'NotAllowedError') {
setError('Microphone permission denied')
} else if (err.name === 'NotFoundError') {
setError('No microphone found')
} else {
setError(err.message)
}
} else {
setError('Failed to start recording')
}
return false
}
}
const stopRecording = async () => {
const mediaRecorder = mediaRecorderRef.current
if (!mediaRecorder || mediaRecorder.state === 'inactive') {
return
}
await new Promise<void>((resolve) => {
mediaRecorder.onstop = () => resolve()
mediaRecorder.stop()
})
streamRef.current?.getTracks().forEach((track) => {
track.stop()
})
streamRef.current = null
stopAudioLevelMonitoring()
setIsRecording(false)
const audioBlob = new Blob(chunksRef.current, { type: 'audio/webm' })
chunksRef.current = []
if (audioBlob.size === 0) {
setError('No audio recorded')
return
}
setIsTranscribing(true)
try {
const text = await transcribeAudio(audioBlob)
if (text.trim()) {
setTranscript(text.trim())
} else {
setError('No speech detected')
}
} catch (err) {
setError(err instanceof Error ? err.message : 'Transcription failed')
} finally {
setIsTranscribing(false)
}
}
const clearTranscript = () => {
setTranscript('')
setError(null)
}
return {
isRecording,
isTranscribing,
transcript,
audioLevel,
audioLevels,
error,
startRecording,
stopRecording,
clearTranscript,
}
}