feat: jtbd agent add transcription support (#212)

* feat: v0.1 of voice transcription for JTBD survey

Add voice input capability to the JTBD Product Survey chat:
- useVoiceInput hook for audio recording and transcription
- VoiceInputButton component for mic/stop/loading states
- Waveform visualization during recording
- Integration with BrowserOS gateway transcription endpoint

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* style: make voice button orange like send button

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* chore: refactor jtbd agent

* chore: update text

* fix: clean up stop recording if stopped midway

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Felarof
2026-01-12 15:03:13 -08:00
committed by GitHub
parent c5f29c67f6
commit 7cfe55a360
8 changed files with 360 additions and 26 deletions

View File

@@ -3,7 +3,7 @@ import { HashRouter, Navigate, Route, Routes } from 'react-router'
import { AISettingsPage } from './ai-settings/AISettingsPage'
import { ConnectMCP } from './connect-mcp/ConnectMCP'
import { CustomizationPage } from './customization/CustomizationPage'
import { JTBDAgentPage } from './jtbd-agent/jtbd-agent-page'
import { SurveyPage } from './jtbd-agent'
import { DashboardLayout } from './layout/DashboardLayout'
import { LlmHubPage } from './llm-hub/LlmHubPage'
import { MCPSettingsPage } from './mcp-settings/MCPSettingsPage'
@@ -36,7 +36,7 @@ export const App: FC = () => {
element={<AISettingsPage key="onboarding" />}
/>
<Route path="scheduled" element={<ScheduledTasksPage />} />
<Route path="jtbd-agent" element={<JTBDAgentPage />} />
<Route path="jtbd-agent" element={<SurveyPage />} />
</Route>
</Routes>
</HashRouter>

View File

@@ -4,7 +4,9 @@ import { MessageResponse } from '@/components/ai-elements/message'
import { Button } from '@/components/ui/button'
import { Textarea } from '@/components/ui/textarea'
import { cn } from '@/lib/utils'
import type { Message } from './use-jtbd-agent-chat'
import type { Message } from './use-chat'
import { useVoiceInput } from './use-voice-input'
import { VoiceInputButton } from './voice-input-button'
interface Props {
messages: Message[]
@@ -39,7 +41,29 @@ const MessageBubble: FC<{ message: Message }> = ({ message }) => {
)
}
export const JTBDAgentChat: FC<Props> = ({
// Indices of the five bars rendered by the waveform indicator.
const WAVEFORM_BARS = [0, 1, 2, 3, 4] as const

/**
 * Tiny animated waveform shown while recording. Bar heights follow a
 * half-sine envelope across the bars, scaled by the current audio
 * level (0-100), with a minimum height so bars never vanish.
 */
const WaveformIndicator: FC<{ level: number }> = ({ level }) => (
  <div className="flex items-center justify-center gap-1">
    {WAVEFORM_BARS.map((bar) => {
      // Half-sine envelope: middle bars peak, outer bars stay shorter.
      const envelope = Math.sin((bar / WAVEFORM_BARS.length) * Math.PI)
      const scaled = Math.max(0.2, envelope * (level / 100))
      return (
        <div
          key={bar}
          className="w-1 rounded-full bg-destructive transition-all duration-75"
          style={{ height: `${Math.max(4, scaled * 20)}px` }}
        />
      )
    })}
  </div>
)
export const Chat: FC<Props> = ({
messages,
isStreaming,
onSendMessage,
@@ -48,15 +72,28 @@ export const JTBDAgentChat: FC<Props> = ({
const [input, setInput] = useState('')
const messagesEndRef = useRef<HTMLDivElement>(null)
const voice = useVoiceInput()
const messagesLength = messages.length
// biome-ignore lint/correctness/useExhaustiveDependencies: intentionally scroll on message count change
useEffect(() => {
messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' })
}, [messagesLength])
// Insert transcript into input when transcription completes
useEffect(() => {
if (voice.transcript && !voice.isTranscribing) {
setInput((prev) => {
const separator = prev.trim() ? ' ' : ''
return prev + separator + voice.transcript
})
voice.clearTranscript()
}
}, [voice.transcript, voice.isTranscribing, voice.clearTranscript])
const handleSubmit = (e: FormEvent) => {
e.preventDefault()
if (!input.trim() || isStreaming) return
if (!input.trim() || isStreaming || voice.isRecording) return
onSendMessage(input.trim())
setInput('')
}
@@ -68,6 +105,9 @@ export const JTBDAgentChat: FC<Props> = ({
}
}
const isInputDisabled =
isStreaming || voice.isRecording || voice.isTranscribing
return (
<div className="flex h-[calc(100vh-250px)] flex-col rounded-xl border border-border bg-card shadow-sm">
<div className="flex-1 space-y-4 overflow-y-auto p-4">
@@ -78,16 +118,43 @@ export const JTBDAgentChat: FC<Props> = ({
</div>
<form onSubmit={handleSubmit} className="border-border border-t p-4">
{voice.error && (
<div className="mb-2 text-destructive text-sm">{voice.error}</div>
)}
<div className="flex gap-2">
<Textarea
value={input}
onChange={(e) => setInput(e.target.value)}
onKeyDown={handleKeyDown}
placeholder="Type your response..."
{voice.isRecording ? (
<div className="flex flex-1 items-center justify-center gap-2 rounded-md border border-input bg-muted/50 px-3 py-2">
<WaveformIndicator level={voice.audioLevel} />
<span className="text-muted-foreground text-sm">
Listening...
</span>
</div>
) : (
<Textarea
value={input}
onChange={(e) => setInput(e.target.value)}
onKeyDown={handleKeyDown}
placeholder={
voice.isTranscribing
? 'Transcribing...'
: 'Type your response...'
}
disabled={isInputDisabled}
className="max-h-40 min-h-[44px] resize-none"
rows={1}
/>
)}
<VoiceInputButton
isRecording={voice.isRecording}
isTranscribing={voice.isTranscribing}
audioLevel={voice.audioLevel}
disabled={isStreaming}
className="max-h-40 min-h-[44px] resize-none"
rows={1}
onStart={voice.startRecording}
onStop={voice.stopRecording}
/>
{isStreaming ? (
<Button
type="button"
@@ -98,7 +165,13 @@ export const JTBDAgentChat: FC<Props> = ({
<Square className="h-4 w-4" />
</Button>
) : (
<Button type="submit" disabled={!input.trim()} size="icon">
<Button
type="submit"
disabled={
!input.trim() || voice.isRecording || voice.isTranscribing
}
size="icon"
>
<Send className="h-4 w-4" />
</Button>
)}

View File

@@ -1,7 +1,7 @@
import { MessageSquareHeart } from 'lucide-react'
import type { FC } from 'react'
export const JTBDAgentHeader: FC = () => {
export const Header: FC = () => {
return (
<div className="rounded-xl border border-border bg-card p-6 shadow-sm transition-all hover:shadow-md">
<div className="flex items-start gap-4">
@@ -11,7 +11,7 @@ export const JTBDAgentHeader: FC = () => {
<div className="flex-1">
<h2 className="mb-1 font-semibold text-xl">Product Survey</h2>
<p className="text-muted-foreground text-sm">
Share your experience with BrowserOS to help us improve
We'd love your honest feedback. All responses are anonymous.
</p>
</div>
</div>

View File

@@ -1,10 +1,10 @@
import { AlertCircle, CheckCircle2, RotateCcw } from 'lucide-react'
import type { FC } from 'react'
import { Button } from '@/components/ui/button'
import { JTBDAgentChat } from './jtbd-agent-chat'
import { JTBDAgentHeader } from './jtbd-agent-header'
import { JTBDAgentWelcome } from './jtbd-agent-welcome'
import { useJTBDAgentChat } from './use-jtbd-agent-chat'
import { Chat } from './chat'
import { Header } from './header'
import { useChat } from './use-chat'
import { Welcome } from './welcome'
const ThankYouCard: FC<{ onReset: () => void }> = ({ onReset }) => (
<div className="rounded-xl border border-border bg-card p-8 text-center shadow-sm">
@@ -42,19 +42,19 @@ const ErrorCard: FC<{ error: Error; onRetry: () => void }> = ({
</div>
)
export const JTBDAgentPage: FC = () => {
const chat = useJTBDAgentChat()
export const SurveyPage: FC = () => {
const chat = useChat()
return (
<div className="fade-in slide-in-from-bottom-5 animate-in space-y-6 duration-500">
<JTBDAgentHeader />
<Header />
{chat.phase === 'idle' && (
<JTBDAgentWelcome onStart={chat.start} isLoading={chat.isStreaming} />
<Welcome onStart={chat.start} isLoading={chat.isStreaming} />
)}
{chat.phase === 'active' && (
<JTBDAgentChat
<Chat
messages={chat.messages}
isStreaming={chat.isStreaming}
onSendMessage={chat.respond}

View File

@@ -76,7 +76,7 @@ async function* streamSSE(
}
}
export function useJTBDAgentChat() {
export function useChat() {
const [phase, setPhase] = useState<Phase>('idle')
const [messages, setMessages] = useState<Message[]>([])
const [isStreaming, setIsStreaming] = useState(false)

View File

@@ -0,0 +1,208 @@
import { useCallback, useEffect, useRef, useState } from 'react'
// Base URL of the BrowserOS LLM gateway hosting the transcription endpoint.
const GATEWAY_URL = 'https://llm.browseros.com'

// Public contract returned by the useVoiceInput hook.
interface UseVoiceInputReturn {
  isRecording: boolean // true while the microphone is actively capturing
  isTranscribing: boolean // true while a transcription request is in flight
  transcript: string // latest transcription result; cleared via clearTranscript()
  audioLevel: number // current input level, 0-100, for waveform display
  error: string | null // user-facing error message, or null when healthy
  startRecording: () => Promise<void>
  stopRecording: () => Promise<void>
  clearTranscript: () => void
}
/**
 * Upload a recorded audio blob to the BrowserOS gateway transcription
 * endpoint and return the recognized text.
 *
 * @param audioBlob - Recorded audio (webm) to transcribe.
 * @returns The transcribed text, or an empty string when the response
 *   has no `text` field.
 * @throws Error when the gateway responds with a non-2xx status.
 */
async function transcribeAudio(audioBlob: Blob): Promise<string> {
  const body = new FormData()
  body.append('file', audioBlob, 'recording.webm')
  body.append('response_format', 'json')

  const response = await fetch(`${GATEWAY_URL}/api/transcribe`, {
    method: 'POST',
    body,
  })

  if (!response.ok) {
    // Fall back to a generic message when the error payload is not JSON.
    const payload = await response
      .json()
      .catch(() => ({ error: 'Transcription failed' }))
    throw new Error(payload.error || `Transcription failed: ${response.status}`)
  }

  const result = await response.json()
  return result.text || ''
}
/**
 * React hook encapsulating voice capture and transcription for the chat.
 *
 * Lifecycle: startRecording() opens the microphone, starts a
 * MediaRecorder (webm/opus when supported) plus an AudioContext-based
 * level meter; stopRecording() finalizes the recording, releases the
 * hardware, and posts the audio to the gateway for transcription. The
 * consumer reads `transcript` and calls clearTranscript() once used.
 * Failures surface through `error` rather than thrown exceptions.
 */
export function useVoiceInput(): UseVoiceInputReturn {
  const [isRecording, setIsRecording] = useState(false)
  const [isTranscribing, setIsTranscribing] = useState(false)
  const [transcript, setTranscript] = useState('')
  const [audioLevel, setAudioLevel] = useState(0)
  const [error, setError] = useState<string | null>(null)

  // Browser media objects live in refs (not state) so the unmount
  // cleanup and stable callbacks always see the current instances.
  const mediaRecorderRef = useRef<MediaRecorder | null>(null)
  const streamRef = useRef<MediaStream | null>(null)
  const chunksRef = useRef<Blob[]>([]) // audio chunks accumulated while recording
  const audioContextRef = useRef<AudioContext | null>(null)
  const analyserRef = useRef<AnalyserNode | null>(null)
  const animationFrameRef = useRef<number | null>(null)

  // Stop the requestAnimationFrame level loop, close the AudioContext,
  // and reset the displayed level to zero.
  const stopAudioLevelMonitoring = useCallback(() => {
    if (animationFrameRef.current) {
      cancelAnimationFrame(animationFrameRef.current)
      animationFrameRef.current = null
    }
    // Closing an already-closed AudioContext throws; guard on its state.
    if (audioContextRef.current?.state !== 'closed') {
      audioContextRef.current?.close()
    }
    audioContextRef.current = null
    analyserRef.current = null
    setAudioLevel(0)
  }, [])

  // On unmount: release the microphone, stop any in-flight recording,
  // and tear down the level meter so nothing leaks if the user
  // navigates away mid-recording.
  useEffect(() => {
    return () => {
      streamRef.current?.getTracks().forEach((track) => {
        track.stop()
      })
      if (mediaRecorderRef.current?.state === 'recording') {
        mediaRecorderRef.current.stop()
      }
      stopAudioLevelMonitoring()
    }
  }, [stopAudioLevelMonitoring])

  // Start a rAF loop that samples the stream's frequency data and maps
  // the average magnitude to a 0-100 level for the waveform UI.
  const startAudioLevelMonitoring = useCallback((stream: MediaStream) => {
    const audioContext = new AudioContext()
    const analyser = audioContext.createAnalyser()
    analyser.fftSize = 256
    const source = audioContext.createMediaStreamSource(stream)
    source.connect(analyser)

    audioContextRef.current = audioContext
    analyserRef.current = analyser

    const updateLevel = () => {
      // Bail out once stopAudioLevelMonitoring() has cleared the analyser.
      if (!analyserRef.current) return
      const dataArray = new Uint8Array(analyserRef.current.frequencyBinCount)
      analyserRef.current.getByteFrequencyData(dataArray)
      const average = dataArray.reduce((a, b) => a + b, 0) / dataArray.length
      // 128 (half the 0-255 byte range) maps to full scale; clamp to 100.
      const normalized = Math.min(100, (average / 128) * 100)
      setAudioLevel(Math.round(normalized))
      animationFrameRef.current = requestAnimationFrame(updateLevel)
    }
    updateLevel()
  }, [])

  // Request the microphone and begin recording. getUserMedia failures
  // (permission denied, no device, ...) are mapped to friendly messages.
  const startRecording = useCallback(async () => {
    try {
      setError(null)
      setTranscript('')
      chunksRef.current = []

      // Mono 16 kHz is sufficient for speech and keeps uploads small.
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          channelCount: 1,
          sampleRate: 16000,
          echoCancellation: true,
          noiseSuppression: true,
        },
      })
      streamRef.current = stream
      startAudioLevelMonitoring(stream)

      // Prefer opus-in-webm when available; fall back to plain webm.
      const mimeType = MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
        ? 'audio/webm;codecs=opus'
        : 'audio/webm'
      const mediaRecorder = new MediaRecorder(stream, { mimeType })
      mediaRecorderRef.current = mediaRecorder

      mediaRecorder.ondataavailable = (e) => {
        if (e.data.size > 0) {
          chunksRef.current.push(e.data)
        }
      }

      // Emit a data chunk every 250 ms so audio is preserved even if
      // the recorder is stopped abruptly.
      mediaRecorder.start(250)
      setIsRecording(true)
    } catch (err) {
      if (err instanceof Error) {
        if (err.name === 'NotAllowedError') {
          setError('Microphone permission denied')
        } else if (err.name === 'NotFoundError') {
          setError('No microphone found')
        } else {
          setError(err.message)
        }
      } else {
        setError('Failed to start recording')
      }
    }
  }, [startAudioLevelMonitoring])

  // Stop recording, release the microphone, then transcribe whatever
  // audio was captured. No-op when nothing is recording.
  const stopRecording = useCallback(async () => {
    const mediaRecorder = mediaRecorderRef.current
    if (!mediaRecorder || mediaRecorder.state === 'inactive') {
      return
    }

    // Wait for the recorder to flush its final chunk before reading.
    await new Promise<void>((resolve) => {
      mediaRecorder.onstop = () => resolve()
      mediaRecorder.stop()
    })

    streamRef.current?.getTracks().forEach((track) => {
      track.stop()
    })
    streamRef.current = null
    stopAudioLevelMonitoring()
    setIsRecording(false)

    const audioBlob = new Blob(chunksRef.current, { type: 'audio/webm' })
    chunksRef.current = []

    if (audioBlob.size === 0) {
      setError('No audio recorded')
      return
    }

    setIsTranscribing(true)
    try {
      const text = await transcribeAudio(audioBlob)
      if (text.trim()) {
        setTranscript(text.trim())
      } else {
        setError('No speech detected')
      }
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Transcription failed')
    } finally {
      setIsTranscribing(false)
    }
  }, [stopAudioLevelMonitoring])

  // Consume the pending transcript (and clear any stale error).
  const clearTranscript = useCallback(() => {
    setTranscript('')
    setError(null)
  }, [])

  return {
    isRecording,
    isTranscribing,
    transcript,
    audioLevel,
    error,
    startRecording,
    stopRecording,
    clearTranscript,
  }
}

View File

@@ -0,0 +1,53 @@
import { Loader2, Mic, Square } from 'lucide-react'
import type { FC } from 'react'
import { Button } from '@/components/ui/button'
// Props for VoiceInputButton.
interface Props {
  isRecording: boolean // recording in progress -> show the stop control
  isTranscribing: boolean // transcription request in flight -> show spinner
  audioLevel: number // current input level (0-100) driving the ping halo
  disabled?: boolean // disable the idle mic button (e.g. while streaming)
  onStart: () => void // begin recording
  onStop: () => void // stop recording and kick off transcription
}
export const VoiceInputButton: FC<Props> = ({
isRecording,
isTranscribing,
audioLevel,
disabled,
onStart,
onStop,
}) => {
if (isTranscribing) {
return (
<Button variant="outline" size="icon" disabled>
<Loader2 className="h-4 w-4 animate-spin" />
</Button>
)
}
if (isRecording) {
return (
<Button
type="button"
variant="destructive"
size="icon"
onClick={onStop}
className="relative"
>
<Square className="h-4 w-4" />
<span
className="absolute inset-0 animate-ping rounded-md bg-destructive/50"
style={{ opacity: Math.min(0.7, audioLevel / 100) }}
/>
</Button>
)
}
return (
<Button type="button" size="icon" onClick={onStart} disabled={disabled}>
<Mic className="h-4 w-4" />
</Button>
)
}

View File

@@ -7,7 +7,7 @@ interface Props {
isLoading?: boolean
}
export const JTBDAgentWelcome: FC<Props> = ({ onStart, isLoading }) => {
export const Welcome: FC<Props> = ({ onStart, isLoading }) => {
return (
<div className="rounded-xl border border-border bg-card p-8 text-center shadow-sm">
<div className="mx-auto mb-4 flex h-16 w-16 items-center justify-center rounded-full bg-[var(--accent-orange)]/10">