mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-05-13 23:53:50 +00:00
Remove unused benchmark and cascade bridge scripts
This commit is contained in:
@@ -1,124 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Standalone Voxtral benchmark — no whisperlivekit imports."""
|
||||
import json, logging, re, time, wave, queue, threading
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
# Only warnings and above from our own code; silence chatty third-party loggers.
logging.basicConfig(level=logging.WARNING)
for noisy in ("transformers", "torch", "httpx"):
    logging.getLogger(noisy).setLevel(logging.ERROR)
|
||||
|
||||
from jiwer import wer as compute_wer
|
||||
from transformers import AutoProcessor, VoxtralRealtimeForConditionalGeneration, TextIteratorStreamer
|
||||
|
||||
def norm(t):
    """Normalize text for WER scoring: lowercase, alnum-only, single spaces."""
    alnum_only = re.sub(r'[^a-z0-9 ]', ' ', t.lower())
    return re.sub(r' +', ' ', alnum_only).strip()
|
||||
|
||||
def load_audio(path):
    """Read a 16-bit PCM WAV file and return float32 samples scaled to [-1, 1)."""
    with wave.open(path, 'r') as wf:
        raw = wf.readframes(wf.getnframes())
    pcm = np.frombuffer(raw, dtype=np.int16)
    return pcm.astype(np.float32) / 32768.0
|
||||
|
||||
# Load model once at import time: single GPU, bfloat16 weights.
print("Loading Voxtral-Mini-4B...", flush=True)
MODEL_ID = "mistralai/Voxtral-Mini-4B-Realtime-2602"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = VoxtralRealtimeForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
)
print(f"Loaded, GPU: {torch.cuda.memory_allocated()/1e9:.1f} GB", flush=True)
|
||||
|
||||
def transcribe_batch(audio_np):
    """Transcribe a whole utterance in one generate() call (non-streaming).

    Returns (decoded text, seconds spent inside generate()).
    """
    # The processor converts raw float audio into model input_features.
    batch = processor(
        audio=audio_np,
        sampling_rate=16000,
        return_tensors="pt",
    ).to("cuda:0").to(torch.bfloat16)

    start = time.perf_counter()
    with torch.inference_mode():
        out_ids = model.generate(**batch, max_new_tokens=1024)
    gen_seconds = time.perf_counter() - start

    decoded = processor.batch_decode(out_ids, skip_special_tokens=True)[0]
    return decoded.strip(), gen_seconds
|
||||
|
||||
# 1. LibriSpeech test-clean
print("\n=== Voxtral / LibriSpeech test-clean ===", flush=True)
# with-block closes the metadata file; the original json.load(open(...))
# leaked the file descriptor.
with open("/home/cloud/benchmark_data/metadata.json") as f:
    clean = json.load(f)
wers = []          # per-sample WER
ta = tp = 0        # total audio seconds / total processing seconds
for i, s in enumerate(clean):
    audio = load_audio(s['path'])
    hyp, pt = transcribe_batch(audio)
    w = compute_wer(norm(s['reference']), norm(hyp))
    wers.append(w); ta += s['duration']; tp += pt
    # Print the first few samples plus every 20th for progress visibility.
    if i < 3 or i % 20 == 0:
        print(f" [{i}] {s['duration']:.1f}s RTF={pt/s['duration']:.2f} WER={w:.1%} | {hyp[:60]}", flush=True)
clean_wer = np.mean(wers); clean_rtf = tp/ta
print(f" CLEAN: WER {clean_wer:.2%}, RTF {clean_rtf:.3f} ({len(clean)} samples, {ta:.0f}s)")
|
||||
|
||||
# 2. LibriSpeech test-other
print("\n=== Voxtral / LibriSpeech test-other ===", flush=True)
# with-block closes the metadata file; the original json.load(open(...))
# leaked the file descriptor.
with open("/home/cloud/benchmark_data/metadata_other.json") as f:
    other = json.load(f)
wers2 = []           # per-sample WER
ta2 = tp2 = 0        # total audio seconds / total processing seconds
for i, s in enumerate(other):
    audio = load_audio(s['path'])
    hyp, pt = transcribe_batch(audio)
    w = compute_wer(norm(s['reference']), norm(hyp))
    wers2.append(w); ta2 += s['duration']; tp2 += pt
    if i < 3 or i % 20 == 0:
        print(f" [{i}] {s['duration']:.1f}s RTF={pt/s['duration']:.2f} WER={w:.1%}", flush=True)
other_wer = np.mean(wers2); other_rtf = tp2/ta2
print(f" OTHER: WER {other_wer:.2%}, RTF {other_rtf:.3f} ({len(other)} samples, {ta2:.0f}s)")
|
||||
|
||||
# 3. ACL6060
print("\n=== Voxtral / ACL6060 ===", flush=True)
acl_results = []
for talk in ["110", "117", "268", "367", "590"]:
    audio = load_audio(f"/home/cloud/acl6060_audio/2022.acl-long.{talk}.wav")
    dur = len(audio) / 16000

    # Gold transcript: concatenate the per-segment reference lines.
    gw = []
    with open(f"/home/cloud/iwslt26-sst/inputs/en/acl6060.ts/gold-jsonl/2022.acl-long.{talk}.jsonl") as f:
        for line in f:
            gw.append(json.loads(line)["text"].strip())
    gold = " ".join(gw)

    # For long audio, process in 30s chunks
    all_hyp = []
    t0 = time.perf_counter()
    chunk_size = 30 * 16000
    for start in range(0, len(audio), chunk_size):
        piece = audio[start:start + chunk_size]
        if len(piece) < 1600:  # skip very short tail
            continue
        piece_text, _ = transcribe_batch(piece)
        all_hyp.append(piece_text)
    t1 = time.perf_counter()

    full_hyp = " ".join(all_hyp)
    w = compute_wer(norm(gold), norm(full_hyp))
    rtf = (t1 - t0) / dur
    acl_results.append({"talk": talk, "wer": w, "rtf": rtf, "dur": dur})
    print(f" Talk {talk}: {dur:.0f}s, WER {w:.2%}, RTF {rtf:.3f}", flush=True)

acl_wer = np.mean([r["wer"] for r in acl_results])
acl_rtf = np.mean([r["rtf"] for r in acl_results])
print(f" ACL6060 AVERAGE: WER {acl_wer:.2%}, RTF {acl_rtf:.3f}")
|
||||
|
||||
# Summary
print(f"\n{'='*60}")
print(f" VOXTRAL BENCHMARK SUMMARY (H100 80GB)")
print(f"{'='*60}")
print(f" {'Dataset':>25} {'WER':>7} {'RTF':>7}")
print(f" {'-'*42}")
print(f" {'LibriSpeech clean':>25} {clean_wer:>6.2%} {clean_rtf:>7.3f}")
print(f" {'LibriSpeech other':>25} {other_wer:>6.2%} {other_rtf:>7.3f}")
print(f" {'ACL6060 (5 talks)':>25} {acl_wer:>6.2%} {acl_rtf:>7.3f}")

# Machine-readable results; numpy scalars are cast to JSON-native floats.
results = {
    "clean": {"avg_wer": round(float(clean_wer), 4), "rtf": round(float(clean_rtf), 3)},
    "other": {"avg_wer": round(float(other_wer), 4), "rtf": round(float(other_rtf), 3)},
    "acl6060": {"avg_wer": round(float(acl_wer), 4), "avg_rtf": round(float(acl_rtf), 3),
                "talks": [{k: (round(float(v), 4) if isinstance(v, (float, np.floating)) else v) for k, v in r.items()} for r in acl_results]},
}
# with-block guarantees the results file is flushed and closed; the original
# json.dump(results, open(..., "w")) leaked the handle.
with open("/home/cloud/bench_voxtral_results.json", "w") as f:
    json.dump(results, f, indent=2)
print(f"\nSaved to /home/cloud/bench_voxtral_results.json")
|
||||
@@ -1,122 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Benchmark Voxtral via vLLM WebSocket /v1/realtime — proper streaming."""
|
||||
import asyncio, json, base64, time, wave, re, os
|
||||
import numpy as np
|
||||
import websockets
|
||||
import librosa
|
||||
from jiwer import wer as compute_wer
|
||||
|
||||
# Model id announced in session.update, and the local vLLM realtime endpoint.
MODEL = "mistralai/Voxtral-Mini-4B-Realtime-2602"
WS_URI = "ws://localhost:8000/v1/realtime"
|
||||
|
||||
def norm(t):
    """Normalize for WER: lowercase, drop non-alphanumerics, collapse spaces."""
    lowered = t.lower()
    return re.sub(r' +', ' ', re.sub(r'[^a-z0-9 ]', ' ', lowered)).strip()
|
||||
|
||||
async def transcribe(audio_path, max_tokens=4096):
    """Stream one file over the realtime websocket.

    Returns (transcript, audio seconds, real-time factor, first-word latency).
    """
    audio, _ = librosa.load(audio_path, sr=16000, mono=True)
    pcm16 = (audio * 32767).astype(np.int16).tobytes()
    dur = len(audio) / 16000

    t0 = time.time()
    transcript = ""
    first_token_time = None

    async with websockets.connect(WS_URI, max_size=2**24) as ws:
        await ws.recv()  # session.created
        await ws.send(json.dumps({"type": "session.update", "model": MODEL}))
        await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))  # signal ready

        # Stream the PCM as 4 KB base64 chunks.
        for off in range(0, len(pcm16), 4096):
            payload = {
                "type": "input_audio_buffer.append",
                "audio": base64.b64encode(pcm16[off:off + 4096]).decode(),
            }
            await ws.send(json.dumps(payload))

        await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": True}))

        # Collect deltas until the server reports done/error, or we time out.
        while True:
            try:
                msg = json.loads(await asyncio.wait_for(ws.recv(), timeout=120))
            except asyncio.TimeoutError:
                break
            kind = msg["type"]
            if kind == "transcription.delta":
                delta = msg.get("delta", "")
                if delta.strip() and first_token_time is None:
                    first_token_time = time.time() - t0
                transcript += delta
            elif kind == "transcription.done":
                transcript = msg.get("text", transcript)
                break
            elif kind == "error":
                break

    elapsed = time.time() - t0
    return transcript.strip(), dur, elapsed / dur, first_token_time or elapsed
|
||||
|
||||
async def main():
    """Benchmark driver: warmup, LibriSpeech clean/other, ACL6060, JSON summary."""
    # Warmup — first request pays connection/model spin-up cost.
    print("Warmup...", flush=True)
    await transcribe("/home/cloud/benchmark_data/librispeech_clean_0000.wav")

    # LibriSpeech clean (full 91 samples)
    print("\n=== Voxtral vLLM Realtime / LibriSpeech clean ===", flush=True)
    # with-blocks close the metadata files; the original json.load(open(...))
    # leaked the descriptors.
    with open("/home/cloud/benchmark_data/metadata.json") as f:
        clean = json.load(f)
    wers = []; ta = tp = 0
    for i, s in enumerate(clean):
        hyp, dur, rtf, fwl = await transcribe(s['path'])
        # An empty hypothesis scores 100% WER rather than crashing jiwer.
        w = compute_wer(norm(s['reference']), norm(hyp)) if hyp else 1.0
        wers.append(w); ta += dur; tp += dur * rtf
        if i < 3 or i % 20 == 0:
            print(f" [{i}] {dur:.1f}s RTF={rtf:.3f} FWL={fwl:.2f}s WER={w:.1%} | {hyp[:60]}", flush=True)
    clean_wer = np.mean(wers); clean_rtf = tp / ta
    print(f" CLEAN ({len(clean)}): WER {clean_wer:.2%}, RTF {clean_rtf:.3f}\n", flush=True)

    # LibriSpeech other (full 133 samples)
    print("=== Voxtral vLLM Realtime / LibriSpeech other ===", flush=True)
    with open("/home/cloud/benchmark_data/metadata_other.json") as f:
        other = json.load(f)
    wers2 = []; ta2 = tp2 = 0
    for i, s in enumerate(other):
        hyp, dur, rtf, fwl = await transcribe(s['path'])
        w = compute_wer(norm(s['reference']), norm(hyp)) if hyp else 1.0
        wers2.append(w); ta2 += dur; tp2 += dur * rtf
        if i < 3 or i % 20 == 0:
            print(f" [{i}] {dur:.1f}s RTF={rtf:.3f} WER={w:.1%}", flush=True)
    other_wer = np.mean(wers2); other_rtf = tp2 / ta2
    print(f" OTHER ({len(other)}): WER {other_wer:.2%}, RTF {other_rtf:.3f}\n", flush=True)

    # ACL6060 talks
    print("=== Voxtral vLLM Realtime / ACL6060 ===", flush=True)
    acl = []
    for talk in ["110", "117", "268", "367", "590"]:
        # Gold transcript: concatenation of per-segment reference lines.
        gw = []
        with open(f"/home/cloud/iwslt26-sst/inputs/en/acl6060.ts/gold-jsonl/2022.acl-long.{talk}.jsonl") as f:
            for line in f:
                gw.append(json.loads(line)["text"].strip())
        gold = " ".join(gw)

        hyp, dur, rtf, fwl = await transcribe(f"/home/cloud/acl6060_audio/2022.acl-long.{talk}.wav")
        w = compute_wer(norm(gold), norm(hyp)) if hyp else 1.0
        acl.append({"talk": talk, "wer": round(float(w), 4), "rtf": round(float(rtf), 3), "dur": round(dur, 1)})
        print(f" Talk {talk}: {dur:.0f}s, WER {w:.2%}, RTF {rtf:.3f}, FWL {fwl:.2f}s", flush=True)

    acl_wer = np.mean([r["wer"] for r in acl])
    acl_rtf = np.mean([r["rtf"] for r in acl])
    print(f" ACL6060 AVERAGE: WER {acl_wer:.2%}, RTF {acl_rtf:.3f}\n", flush=True)

    # Summary
    print(f"{'='*55}")
    print(f" VOXTRAL vLLM REALTIME BENCHMARK (H100)")
    print(f"{'='*55}")
    print(f" LS clean ({len(clean)}): WER {clean_wer:.2%}, RTF {clean_rtf:.3f}")
    print(f" LS other ({len(other)}): WER {other_wer:.2%}, RTF {other_rtf:.3f}")
    print(f" ACL6060 (5): WER {acl_wer:.2%}, RTF {acl_rtf:.3f}")

    results = {
        "clean": {"avg_wer": round(float(clean_wer), 4), "rtf": round(float(clean_rtf), 3), "n": len(clean)},
        "other": {"avg_wer": round(float(other_wer), 4), "rtf": round(float(other_rtf), 3), "n": len(other)},
        "acl6060": {"avg_wer": round(float(acl_wer), 4), "avg_rtf": round(float(acl_rtf), 3), "talks": acl},
    }
    # with-block flushes and closes the results file; the original
    # json.dump(results, open(..., "w")) leaked the handle.
    with open("/home/cloud/bench_voxtral_realtime_results.json", "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n Saved to /home/cloud/bench_voxtral_realtime_results.json")
|
||||
|
||||
# Guard the entry point so importing this module does not start the benchmark.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,116 +0,0 @@
|
||||
"""
|
||||
Bridge between WhisperLiveKit STT and IWSLT26 MT pipeline.
|
||||
|
||||
Converts streaming ASRToken output from SimulStreaming into the JSONL
|
||||
format expected by the AlignAtt MT agent (iwslt26-sst).
|
||||
|
||||
Output format (one JSON per line):
|
||||
{"text": "word or phrase", "emission_time": 1.234, "is_final": false, "speech_time": 1.0}
|
||||
|
||||
Where:
|
||||
- text: the emitted word/phrase
|
||||
- emission_time: wall-clock time when the word was emitted (for compute-aware eval)
|
||||
- speech_time: timestamp in the audio (for compute-unaware eval)
|
||||
- is_final: whether this is the last word of a segment/silence boundary
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from typing import List, TextIO
|
||||
|
||||
from whisperlivekit.timed_objects import ASRToken
|
||||
|
||||
|
||||
class CascadeBridge:
    """Converts ASRToken stream to JSONL for the MT agent."""

    def __init__(self, output_file: "TextIO | None" = None):
        """
        Args:
            output_file: optional already-open text stream; when provided,
                every emitted entry is also written (and flushed) to it as
                one JSON object per line. The annotation is quoted and made
                Optional — the original declared plain ``TextIO`` despite the
                ``None`` default.
        """
        self.output_file = output_file
        self.start_time = time.time()  # wall-clock origin for emission_time
        self.entries: List[dict] = []  # all emitted entries, in order

    def emit_tokens(self, tokens: "List[ASRToken]", is_final: bool = False):
        """Emit a batch of tokens from the STT.

        Each token becomes one entry; only the last token of the batch carries
        ``is_final=True``, and only when the caller flagged the batch final.
        All tokens in one batch share the same emission_time (one wall-clock
        read per call).
        """
        wall_clock = time.time() - self.start_time

        for i, token in enumerate(tokens):
            entry = {
                "text": token.text.strip(),
                "emission_time": round(wall_clock, 3),
                "speech_time": round(token.start, 3),
                "is_final": is_final and (i == len(tokens) - 1),
            }
            self.entries.append(entry)
            if self.output_file:
                self.output_file.write(json.dumps(entry) + "\n")
                self.output_file.flush()

    def get_entries(self) -> List[dict]:
        """Return all emitted entries (the live list, not a copy)."""
        return self.entries

    def get_text(self) -> str:
        """Get the full transcribed text."""
        return " ".join(e["text"] for e in self.entries if e["text"])

    def save(self, path: str):
        """Save all entries to a JSONL file."""
        with open(path, "w") as f:
            for entry in self.entries:
                f.write(json.dumps(entry) + "\n")
|
||||
|
||||
|
||||
def run_stt_to_jsonl(
    audio_path: str,
    output_path: str,
    model_id: str = "Qwen/Qwen3-ASR-0.6B",
    alignment_heads_path: "str | None" = None,
    border_fraction: float = 0.20,
    language: str = "en",
    chunk_sec: float = 1.0,
):
    """Run STT on an audio file and save JSONL output for the MT agent.

    This is the main entry point for the cascade: audio file → JSONL.

    Args:
        audio_path: path to a 16-bit PCM WAV file (read via ``wave``;
            assumed 16 kHz mono — TODO confirm against callers).
        output_path: destination JSONL file.
        model_id: model identifier forwarded to Qwen3SimulKVASR.
        alignment_heads_path: optional config path forwarded to the ASR
            (annotation made Optional — original said plain ``str`` with a
            ``None`` default).
        border_fraction: forwarded to the ASR; semantics defined there.
        language: language code forwarded to the ASR.
        chunk_sec: streaming chunk size in seconds of audio.

    Returns:
        The CascadeBridge holding every emitted entry.
    """
    import wave

    import numpy as np
    from whisperlivekit.qwen3_simul_kv import Qwen3SimulKVASR, Qwen3SimulKVOnlineProcessor

    # Load audio as float32 scaled to [-1, 1).
    with wave.open(audio_path, 'r') as wf:
        audio = np.frombuffer(
            wf.readframes(wf.getnframes()), dtype=np.int16
        ).astype(np.float32) / 32768.0

    # Initialize STT
    asr = Qwen3SimulKVASR(
        model_dir=model_id,
        lan=language,
        alignment_heads_path=alignment_heads_path,
        border_fraction=border_fraction,
    )
    proc = Qwen3SimulKVOnlineProcessor(asr)
    bridge = CascadeBridge()

    # Stream audio in fixed-size chunks, emitting words as the processor
    # commits them.
    chunk_samples = int(chunk_sec * 16000)
    offset = 0
    stream_time = 0.0  # seconds of audio fed so far (drives speech_time)

    while offset < len(audio):
        chunk = audio[offset:offset + chunk_samples]
        stream_time += len(chunk) / 16000
        proc.insert_audio_chunk(chunk, stream_time)
        words, _ = proc.process_iter(is_last=False)
        if words:
            bridge.emit_tokens(words, is_final=False)
        offset += chunk_samples

    # Final flush: let the processor commit any pending words.
    final_words, _ = proc.finish()
    if final_words:
        bridge.emit_tokens(final_words, is_final=True)

    # Save
    bridge.save(output_path)
    return bridge
|
||||
Reference in New Issue
Block a user