Mirror of https://github.com/browseros-ai/BrowserOS.git — synced 2026-05-14 08:03:58 +00:00
feat: jtbd agent add transcription support (#212)
* feat: v0.1 of voice transcription for JTBD survey

  Add voice input capability to the JTBD Product Survey chat:
  - useVoiceInput hook for audio recording and transcription
  - VoiceInputButton component for mic/stop/loading states
  - Waveform visualization during recording
  - Integration with BrowserOS gateway transcription endpoint

* style: make voice button orange like send button

* chore: refactor jtbd agent

* chore: update text

* fix: clean up stop recording if stopped midway

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
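For reference, here is a minimal usage sketch of the new useVoiceInput hook, based only on the return shape it exposes in this diff. The MyChatInput component and its markup are illustrative assumptions, not part of the commit; the actual integration lives in the chat component changes below.

import { type FC, useEffect, useState } from 'react'
import { useVoiceInput } from './use-voice-input'

// Sketch: record, transcribe via the hook, and append the transcript to a text field.
const MyChatInput: FC = () => {
  const [text, setText] = useState('')
  const voice = useVoiceInput()

  // When transcription completes, append the transcript and clear it (mirrors the chat component).
  useEffect(() => {
    if (voice.transcript && !voice.isTranscribing) {
      setText((prev) => (prev.trim() ? `${prev} ${voice.transcript}` : voice.transcript))
      voice.clearTranscript()
    }
  }, [voice.transcript, voice.isTranscribing, voice.clearTranscript])

  return (
    <div>
      <textarea value={text} onChange={(e) => setText(e.target.value)} />
      <button
        type="button"
        onClick={voice.isRecording ? voice.stopRecording : voice.startRecording}
        disabled={voice.isTranscribing}
      >
        {voice.isRecording ? 'Stop' : voice.isTranscribing ? 'Transcribing...' : 'Record'}
      </button>
      {voice.error && <p role="alert">{voice.error}</p>}
    </div>
  )
}

export default MyChatInput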
@@ -3,7 +3,7 @@ import { HashRouter, Navigate, Route, Routes } from 'react-router'
import { AISettingsPage } from './ai-settings/AISettingsPage'
import { ConnectMCP } from './connect-mcp/ConnectMCP'
import { CustomizationPage } from './customization/CustomizationPage'
import { JTBDAgentPage } from './jtbd-agent/jtbd-agent-page'
import { SurveyPage } from './jtbd-agent'
import { DashboardLayout } from './layout/DashboardLayout'
import { LlmHubPage } from './llm-hub/LlmHubPage'
import { MCPSettingsPage } from './mcp-settings/MCPSettingsPage'
@@ -36,7 +36,7 @@ export const App: FC = () => {
element={<AISettingsPage key="onboarding" />}
/>
<Route path="scheduled" element={<ScheduledTasksPage />} />
<Route path="jtbd-agent" element={<JTBDAgentPage />} />
<Route path="jtbd-agent" element={<SurveyPage />} />
</Route>
</Routes>
</HashRouter>
@@ -4,7 +4,9 @@ import { MessageResponse } from '@/components/ai-elements/message'
import { Button } from '@/components/ui/button'
import { Textarea } from '@/components/ui/textarea'
import { cn } from '@/lib/utils'
import type { Message } from './use-jtbd-agent-chat'
import type { Message } from './use-chat'
import { useVoiceInput } from './use-voice-input'
import { VoiceInputButton } from './voice-input-button'

interface Props {
messages: Message[]
@@ -39,7 +41,29 @@ const MessageBubble: FC<{ message: Message }> = ({ message }) => {
)
}

export const JTBDAgentChat: FC<Props> = ({
const WAVEFORM_BARS = [0, 1, 2, 3, 4] as const

const WaveformIndicator: FC<{ level: number }> = ({ level }) => {
return (
<div className="flex items-center justify-center gap-1">
{WAVEFORM_BARS.map((barIndex) => {
const barLevel = Math.max(
0.2,
Math.sin((barIndex / WAVEFORM_BARS.length) * Math.PI) * (level / 100),
)
return (
<div
key={barIndex}
className="w-1 rounded-full bg-destructive transition-all duration-75"
style={{ height: `${Math.max(4, barLevel * 20)}px` }}
/>
)
})}
</div>
)
}

export const Chat: FC<Props> = ({
messages,
isStreaming,
onSendMessage,
@@ -48,15 +72,28 @@ export const JTBDAgentChat: FC<Props> = ({
const [input, setInput] = useState('')
const messagesEndRef = useRef<HTMLDivElement>(null)

const voice = useVoiceInput()

const messagesLength = messages.length
// biome-ignore lint/correctness/useExhaustiveDependencies: intentionally scroll on message count change
useEffect(() => {
messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' })
}, [messagesLength])

// Insert transcript into input when transcription completes
useEffect(() => {
if (voice.transcript && !voice.isTranscribing) {
setInput((prev) => {
const separator = prev.trim() ? ' ' : ''
return prev + separator + voice.transcript
})
voice.clearTranscript()
}
}, [voice.transcript, voice.isTranscribing, voice.clearTranscript])

const handleSubmit = (e: FormEvent) => {
e.preventDefault()
if (!input.trim() || isStreaming) return
if (!input.trim() || isStreaming || voice.isRecording) return
onSendMessage(input.trim())
setInput('')
}
@@ -68,6 +105,9 @@ export const JTBDAgentChat: FC<Props> = ({
}
}

const isInputDisabled =
isStreaming || voice.isRecording || voice.isTranscribing

return (
<div className="flex h-[calc(100vh-250px)] flex-col rounded-xl border border-border bg-card shadow-sm">
<div className="flex-1 space-y-4 overflow-y-auto p-4">
@@ -78,16 +118,43 @@ export const JTBDAgentChat: FC<Props> = ({
</div>

<form onSubmit={handleSubmit} className="border-border border-t p-4">
{voice.error && (
<div className="mb-2 text-destructive text-sm">{voice.error}</div>
)}

<div className="flex gap-2">
<Textarea
value={input}
onChange={(e) => setInput(e.target.value)}
onKeyDown={handleKeyDown}
placeholder="Type your response..."
{voice.isRecording ? (
<div className="flex flex-1 items-center justify-center gap-2 rounded-md border border-input bg-muted/50 px-3 py-2">
<WaveformIndicator level={voice.audioLevel} />
<span className="text-muted-foreground text-sm">
Listening...
</span>
</div>
) : (
<Textarea
value={input}
onChange={(e) => setInput(e.target.value)}
onKeyDown={handleKeyDown}
placeholder={
voice.isTranscribing
? 'Transcribing...'
: 'Type your response...'
}
disabled={isInputDisabled}
className="max-h-40 min-h-[44px] resize-none"
rows={1}
/>
)}

<VoiceInputButton
isRecording={voice.isRecording}
isTranscribing={voice.isTranscribing}
audioLevel={voice.audioLevel}
disabled={isStreaming}
className="max-h-40 min-h-[44px] resize-none"
rows={1}
onStart={voice.startRecording}
onStop={voice.stopRecording}
/>

{isStreaming ? (
<Button
type="button"
@@ -98,7 +165,13 @@ export const JTBDAgentChat: FC<Props> = ({
<Square className="h-4 w-4" />
</Button>
) : (
<Button type="submit" disabled={!input.trim()} size="icon">
<Button
type="submit"
disabled={
!input.trim() || voice.isRecording || voice.isTranscribing
}
size="icon"
>
<Send className="h-4 w-4" />
</Button>
)}
@@ -1,7 +1,7 @@
import { MessageSquareHeart } from 'lucide-react'
import type { FC } from 'react'

export const JTBDAgentHeader: FC = () => {
export const Header: FC = () => {
return (
<div className="rounded-xl border border-border bg-card p-6 shadow-sm transition-all hover:shadow-md">
<div className="flex items-start gap-4">
@@ -11,7 +11,7 @@ export const JTBDAgentHeader: FC = () => {
<div className="flex-1">
<h2 className="mb-1 font-semibold text-xl">Product Survey</h2>
<p className="text-muted-foreground text-sm">
Share your experience with BrowserOS to help us improve
We'd love your honest feedback. All responses are anonymous.
</p>
</div>
</div>
@@ -1,10 +1,10 @@
import { AlertCircle, CheckCircle2, RotateCcw } from 'lucide-react'
import type { FC } from 'react'
import { Button } from '@/components/ui/button'
import { JTBDAgentChat } from './jtbd-agent-chat'
import { JTBDAgentHeader } from './jtbd-agent-header'
import { JTBDAgentWelcome } from './jtbd-agent-welcome'
import { useJTBDAgentChat } from './use-jtbd-agent-chat'
import { Chat } from './chat'
import { Header } from './header'
import { useChat } from './use-chat'
import { Welcome } from './welcome'

const ThankYouCard: FC<{ onReset: () => void }> = ({ onReset }) => (
<div className="rounded-xl border border-border bg-card p-8 text-center shadow-sm">
@@ -42,19 +42,19 @@ const ErrorCard: FC<{ error: Error; onRetry: () => void }> = ({
</div>
)

export const JTBDAgentPage: FC = () => {
const chat = useJTBDAgentChat()
export const SurveyPage: FC = () => {
const chat = useChat()

return (
<div className="fade-in slide-in-from-bottom-5 animate-in space-y-6 duration-500">
<JTBDAgentHeader />
<Header />

{chat.phase === 'idle' && (
<JTBDAgentWelcome onStart={chat.start} isLoading={chat.isStreaming} />
<Welcome onStart={chat.start} isLoading={chat.isStreaming} />
)}

{chat.phase === 'active' && (
<JTBDAgentChat
<Chat
messages={chat.messages}
isStreaming={chat.isStreaming}
onSendMessage={chat.respond}
@@ -76,7 +76,7 @@ async function* streamSSE(
}
}

export function useJTBDAgentChat() {
export function useChat() {
const [phase, setPhase] = useState<Phase>('idle')
const [messages, setMessages] = useState<Message[]>([])
const [isStreaming, setIsStreaming] = useState(false)
apps/agent/entrypoints/options/jtbd-agent/use-voice-input.ts — 208 lines (new file)
@@ -0,0 +1,208 @@
import { useCallback, useEffect, useRef, useState } from 'react'

const GATEWAY_URL = 'https://llm.browseros.com'

interface UseVoiceInputReturn {
  isRecording: boolean
  isTranscribing: boolean
  transcript: string
  audioLevel: number
  error: string | null
  startRecording: () => Promise<void>
  stopRecording: () => Promise<void>
  clearTranscript: () => void
}

async function transcribeAudio(audioBlob: Blob): Promise<string> {
  const formData = new FormData()
  formData.append('file', audioBlob, 'recording.webm')
  formData.append('response_format', 'json')

  const response = await fetch(`${GATEWAY_URL}/api/transcribe`, {
    method: 'POST',
    body: formData,
  })

  if (!response.ok) {
    const error = await response
      .json()
      .catch(() => ({ error: 'Transcription failed' }))
    throw new Error(error.error || `Transcription failed: ${response.status}`)
  }

  const result = await response.json()
  return result.text || ''
}

export function useVoiceInput(): UseVoiceInputReturn {
  const [isRecording, setIsRecording] = useState(false)
  const [isTranscribing, setIsTranscribing] = useState(false)
  const [transcript, setTranscript] = useState('')
  const [audioLevel, setAudioLevel] = useState(0)
  const [error, setError] = useState<string | null>(null)

  const mediaRecorderRef = useRef<MediaRecorder | null>(null)
  const streamRef = useRef<MediaStream | null>(null)
  const chunksRef = useRef<Blob[]>([])
  const audioContextRef = useRef<AudioContext | null>(null)
  const analyserRef = useRef<AnalyserNode | null>(null)
  const animationFrameRef = useRef<number | null>(null)

  const stopAudioLevelMonitoring = useCallback(() => {
    if (animationFrameRef.current) {
      cancelAnimationFrame(animationFrameRef.current)
      animationFrameRef.current = null
    }
    if (audioContextRef.current?.state !== 'closed') {
      audioContextRef.current?.close()
    }
    audioContextRef.current = null
    analyserRef.current = null
    setAudioLevel(0)
  }, [])

  useEffect(() => {
    return () => {
      streamRef.current?.getTracks().forEach((track) => {
        track.stop()
      })
      if (mediaRecorderRef.current?.state === 'recording') {
        mediaRecorderRef.current.stop()
      }
      stopAudioLevelMonitoring()
    }
  }, [stopAudioLevelMonitoring])

  const startAudioLevelMonitoring = useCallback((stream: MediaStream) => {
    const audioContext = new AudioContext()
    const analyser = audioContext.createAnalyser()
    analyser.fftSize = 256

    const source = audioContext.createMediaStreamSource(stream)
    source.connect(analyser)

    audioContextRef.current = audioContext
    analyserRef.current = analyser

    const updateLevel = () => {
      if (!analyserRef.current) return

      const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount)
      analyserRef.current.getByteFrequencyData(dataArray)

      const average = dataArray.reduce((a, b) => a + b, 0) / dataArray.length
      const normalized = Math.min(100, (average / 128) * 100)
      setAudioLevel(Math.round(normalized))

      animationFrameRef.current = requestAnimationFrame(updateLevel)
    }

    updateLevel()
  }, [])

  const startRecording = useCallback(async () => {
    try {
      setError(null)
      setTranscript('')
      chunksRef.current = []

      const stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          channelCount: 1,
          sampleRate: 16000,
          echoCancellation: true,
          noiseSuppression: true,
        },
      })

      streamRef.current = stream
      startAudioLevelMonitoring(stream)

      const mimeType = MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
        ? 'audio/webm;codecs=opus'
        : 'audio/webm'

      const mediaRecorder = new MediaRecorder(stream, { mimeType })
      mediaRecorderRef.current = mediaRecorder

      mediaRecorder.ondataavailable = (e) => {
        if (e.data.size > 0) {
          chunksRef.current.push(e.data)
        }
      }

      mediaRecorder.start(250)
      setIsRecording(true)
    } catch (err) {
      if (err instanceof Error) {
        if (err.name === 'NotAllowedError') {
          setError('Microphone permission denied')
        } else if (err.name === 'NotFoundError') {
          setError('No microphone found')
        } else {
          setError(err.message)
        }
      } else {
        setError('Failed to start recording')
      }
    }
  }, [startAudioLevelMonitoring])

  const stopRecording = useCallback(async () => {
    const mediaRecorder = mediaRecorderRef.current

    if (!mediaRecorder || mediaRecorder.state === 'inactive') {
      return
    }

    await new Promise<void>((resolve) => {
      mediaRecorder.onstop = () => resolve()
      mediaRecorder.stop()
    })

    streamRef.current?.getTracks().forEach((track) => {
      track.stop()
    })
    streamRef.current = null
    stopAudioLevelMonitoring()
    setIsRecording(false)

    const audioBlob = new Blob(chunksRef.current, { type: 'audio/webm' })
    chunksRef.current = []

    if (audioBlob.size === 0) {
      setError('No audio recorded')
      return
    }

    setIsTranscribing(true)
    try {
      const text = await transcribeAudio(audioBlob)
      if (text.trim()) {
        setTranscript(text.trim())
      } else {
        setError('No speech detected')
      }
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Transcription failed')
    } finally {
      setIsTranscribing(false)
    }
  }, [stopAudioLevelMonitoring])

  const clearTranscript = useCallback(() => {
    setTranscript('')
    setError(null)
  }, [])

  return {
    isRecording,
    isTranscribing,
    transcript,
    audioLevel,
    error,
    startRecording,
    stopRecording,
    clearTranscript,
  }
}
@@ -0,0 +1,53 @@
import { Loader2, Mic, Square } from 'lucide-react'
import type { FC } from 'react'
import { Button } from '@/components/ui/button'

interface Props {
  isRecording: boolean
  isTranscribing: boolean
  audioLevel: number
  disabled?: boolean
  onStart: () => void
  onStop: () => void
}

export const VoiceInputButton: FC<Props> = ({
  isRecording,
  isTranscribing,
  audioLevel,
  disabled,
  onStart,
  onStop,
}) => {
  if (isTranscribing) {
    return (
      <Button variant="outline" size="icon" disabled>
        <Loader2 className="h-4 w-4 animate-spin" />
      </Button>
    )
  }

  if (isRecording) {
    return (
      <Button
        type="button"
        variant="destructive"
        size="icon"
        onClick={onStop}
        className="relative"
      >
        <Square className="h-4 w-4" />
        <span
          className="absolute inset-0 animate-ping rounded-md bg-destructive/50"
          style={{ opacity: Math.min(0.7, audioLevel / 100) }}
        />
      </Button>
    )
  }

  return (
    <Button type="button" size="icon" onClick={onStart} disabled={disabled}>
      <Mic className="h-4 w-4" />
    </Button>
  )
}
@@ -7,7 +7,7 @@ interface Props {
isLoading?: boolean
}

export const JTBDAgentWelcome: FC<Props> = ({ onStart, isLoading }) => {
export const Welcome: FC<Props> = ({ onStart, isLoading }) => {
return (
<div className="rounded-xl border border-border bg-card p-8 text-center shadow-sm">
<div className="mx-auto mb-4 flex h-16 w-16 items-center justify-center rounded-full bg-[var(--accent-orange)]/10">