Remove unused benchmark and cascade bridge scripts

This commit is contained in:
Quentin Fuxa
2026-03-31 23:10:44 +02:00
parent 3f233dc36c
commit 7fc341683a
3 changed files with 0 additions and 362 deletions

View File

@@ -1,124 +0,0 @@
#!/usr/bin/env python3
"""Standalone Voxtral benchmark — no whisperlivekit imports."""
import json, logging, re, time, wave, queue, threading
import numpy as np
import torch
# Keep benchmark output readable: warnings only globally, and silence the
# especially chatty libraries down to errors.
logging.basicConfig(level=logging.WARNING)
for n in ["transformers","torch","httpx"]:
    logging.getLogger(n).setLevel(logging.ERROR)
from jiwer import wer as compute_wer
from transformers import AutoProcessor, VoxtralRealtimeForConditionalGeneration, TextIteratorStreamer
def norm(t):
    """Normalize text for WER scoring: lowercase, alphanumerics only, single spaces."""
    alnum_only = re.sub(r'[^a-z0-9 ]', ' ', t.lower())
    return re.sub(r' +', ' ', alnum_only).strip()
def load_audio(path):
    """Read a WAV file as 16-bit PCM and return float32 samples scaled to [-1, 1)."""
    with wave.open(path, 'r') as wf:
        raw = wf.readframes(wf.getnframes())
    pcm = np.frombuffer(raw, dtype=np.int16)
    return pcm.astype(np.float32) / 32768.0
# Load model (module-level: the script transcribes with these globals below).
print("Loading Voxtral-Mini-4B...", flush=True)
MODEL_ID = "mistralai/Voxtral-Mini-4B-Realtime-2602"
processor = AutoProcessor.from_pretrained(MODEL_ID)
# bfloat16 halves memory vs fp32; device_map pins all weights to the first GPU.
model = VoxtralRealtimeForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, device_map="cuda:0",
)
print(f"Loaded, GPU: {torch.cuda.memory_allocated()/1e9:.1f} GB", flush=True)
def transcribe_batch(audio_np):
    """Transcribe one audio array in a single non-streaming generate() call.

    Returns a (text, generation_seconds) pair; timing excludes preprocessing
    and decoding.
    """
    # Voxtral expects audio as input_features from processor
    batch = processor(
        audio=audio_np, sampling_rate=16000, return_tensors="pt",
    ).to("cuda:0").to(torch.bfloat16)
    start = time.perf_counter()
    with torch.inference_mode():
        out_tokens = model.generate(**batch, max_new_tokens=1024)
    elapsed = time.perf_counter() - start
    text = processor.batch_decode(out_tokens, skip_special_tokens=True)[0].strip()
    return text, elapsed
# 1. LibriSpeech test-clean
print("\n=== Voxtral / LibriSpeech test-clean ===", flush=True)
# Fix: close the metadata file deterministically — the original
# json.load(open(...)) leaked the handle.
with open("/home/cloud/benchmark_data/metadata.json") as fh:
    clean = json.load(fh)
wers = []    # per-sample WER
ta = tp = 0  # total audio seconds / total processing seconds
for i, s in enumerate(clean):
    audio = load_audio(s['path'])
    hyp, pt = transcribe_batch(audio)
    w = compute_wer(norm(s['reference']), norm(hyp))
    wers.append(w)
    ta += s['duration']
    tp += pt
    # Progress: first three samples, then every 20th.
    if i < 3 or i % 20 == 0:
        print(f" [{i}] {s['duration']:.1f}s RTF={pt/s['duration']:.2f} WER={w:.1%} | {hyp[:60]}", flush=True)
clean_wer = np.mean(wers)
clean_rtf = tp/ta  # RTF = processing time / audio time (lower is faster)
print(f" CLEAN: WER {clean_wer:.2%}, RTF {clean_rtf:.3f} ({len(clean)} samples, {ta:.0f}s)")
# 2. LibriSpeech test-other
print("\n=== Voxtral / LibriSpeech test-other ===", flush=True)
# Fix: close the metadata file deterministically — the original
# json.load(open(...)) leaked the handle.
with open("/home/cloud/benchmark_data/metadata_other.json") as fh:
    other = json.load(fh)
wers2 = []     # per-sample WER
ta2 = tp2 = 0  # total audio seconds / total processing seconds
for i, s in enumerate(other):
    audio = load_audio(s['path'])
    hyp, pt = transcribe_batch(audio)
    w = compute_wer(norm(s['reference']), norm(hyp))
    wers2.append(w)
    ta2 += s['duration']
    tp2 += pt
    # Progress: first three samples, then every 20th.
    if i < 3 or i % 20 == 0:
        print(f" [{i}] {s['duration']:.1f}s RTF={pt/s['duration']:.2f} WER={w:.1%}", flush=True)
other_wer = np.mean(wers2)
other_rtf = tp2/ta2
print(f" OTHER: WER {other_wer:.2%}, RTF {other_rtf:.3f} ({len(other)} samples, {ta2:.0f}s)")
# 3. ACL6060
print("\n=== Voxtral / ACL6060 ===", flush=True)
acl_results = []
for talk in ["110", "117", "268", "367", "590"]:
    audio = load_audio(f"/home/cloud/acl6060_audio/2022.acl-long.{talk}.wav")
    dur = len(audio) / 16000
    # Gold reference = concatenation of all segment texts for the talk.
    with open(f"/home/cloud/iwslt26-sst/inputs/en/acl6060.ts/gold-jsonl/2022.acl-long.{talk}.jsonl") as f:
        gold = " ".join(json.loads(line)["text"].strip() for line in f)
    # For long audio, process in 30s chunks
    chunk_size = 30 * 16000
    hyps = []
    t0 = time.perf_counter()
    for start in range(0, len(audio), chunk_size):
        chunk = audio[start:start + chunk_size]
        if len(chunk) >= 1600:  # skip very short tail (< 0.1 s)
            hyp, _ = transcribe_batch(chunk)
            hyps.append(hyp)
    t1 = time.perf_counter()
    full_hyp = " ".join(hyps)
    w = compute_wer(norm(gold), norm(full_hyp))
    rtf = (t1 - t0) / dur
    acl_results.append({"talk": talk, "wer": w, "rtf": rtf, "dur": dur})
    print(f" Talk {talk}: {dur:.0f}s, WER {w:.2%}, RTF {rtf:.3f}", flush=True)
acl_wer = np.mean([r["wer"] for r in acl_results])
acl_rtf = np.mean([r["rtf"] for r in acl_results])
print(f" ACL6060 AVERAGE: WER {acl_wer:.2%}, RTF {acl_rtf:.3f}")
# Summary table + machine-readable results dump.
print(f"\n{'='*60}")
print(f" VOXTRAL BENCHMARK SUMMARY (H100 80GB)")
print(f"{'='*60}")
print(f" {'Dataset':>25} {'WER':>7} {'RTF':>7}")
print(f" {'-'*42}")
print(f" {'LibriSpeech clean':>25} {clean_wer:>6.2%} {clean_rtf:>7.3f}")
print(f" {'LibriSpeech other':>25} {other_wer:>6.2%} {other_rtf:>7.3f}")
print(f" {'ACL6060 (5 talks)':>25} {acl_wer:>6.2%} {acl_rtf:>7.3f}")
results = {
    "clean": {"avg_wer": round(float(clean_wer), 4), "rtf": round(float(clean_rtf), 3)},
    "other": {"avg_wer": round(float(other_wer), 4), "rtf": round(float(other_rtf), 3)},
    "acl6060": {"avg_wer": round(float(acl_wer), 4), "avg_rtf": round(float(acl_rtf), 3),
                # Round every float field per talk so the JSON stays compact.
                "talks": [{k: (round(float(v), 4) if isinstance(v, (float, np.floating)) else v) for k, v in r.items()} for r in acl_results]},
}
# Fix: use a context manager so the results file is flushed and closed
# deterministically — json.dump(..., open(...)) leaked the handle.
with open("/home/cloud/bench_voxtral_results.json", "w") as fh:
    json.dump(results, fh, indent=2)
print(f"\nSaved to /home/cloud/bench_voxtral_results.json")

View File

@@ -1,122 +0,0 @@
#!/usr/bin/env python3
"""Benchmark Voxtral via vLLM WebSocket /v1/realtime — proper streaming."""
import asyncio, json, base64, time, wave, re, os
import numpy as np
import websockets
import librosa
from jiwer import wer as compute_wer
# vLLM-served model id and the realtime WebSocket endpoint it exposes.
MODEL = "mistralai/Voxtral-Mini-4B-Realtime-2602"
WS_URI = "ws://localhost:8000/v1/realtime"
def norm(t):
    """Normalize for WER: lowercase, strip non-alphanumerics, collapse spaces."""
    letters = re.sub(r'[^a-z0-9 ]', ' ', t.lower())
    # After the substitution the only whitespace left is spaces, so
    # split/join collapses runs and trims the ends in one step.
    return ' '.join(letters.split())
async def transcribe(audio_path, max_tokens=4096):
    """Stream one audio file over the realtime WebSocket and collect the transcript.

    Returns (transcript, audio_duration_s, rtf, first_word_latency_s).
    NOTE(review): max_tokens is currently unused.
    """
    audio, _ = librosa.load(audio_path, sr=16000, mono=True)
    # 16-bit little-endian PCM, base64-chunked below per the realtime protocol.
    pcm16 = (audio * 32767).astype(np.int16).tobytes()
    dur = len(audio) / 16000
    t0 = time.time()
    transcript = ""
    first_token_time = None
    async with websockets.connect(WS_URI, max_size=2**24) as ws:
        await ws.recv()  # session.created
        await ws.send(json.dumps({"type": "session.update", "model": MODEL}))
        await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))  # signal ready
        # Send audio in 4KB chunks
        for i in range(0, len(pcm16), 4096):
            await ws.send(json.dumps({
                "type": "input_audio_buffer.append",
                "audio": base64.b64encode(pcm16[i:i+4096]).decode(),
            }))
        # Final commit tells the server no more audio is coming.
        await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": True}))
        while True:
            try:
                msg = json.loads(await asyncio.wait_for(ws.recv(), timeout=120))
                if msg["type"] == "transcription.delta":
                    d = msg.get("delta", "")
                    # First non-whitespace delta marks first-word latency.
                    if d.strip() and first_token_time is None:
                        first_token_time = time.time() - t0
                    transcript += d
                elif msg["type"] == "transcription.done":
                    # Server's final text supersedes the accumulated deltas.
                    transcript = msg.get("text", transcript)
                    break
                elif msg["type"] == "error":
                    break
            except asyncio.TimeoutError:
                # Give up on a stalled stream; keep whatever arrived so far.
                break
    elapsed = time.time() - t0
    # If no token was ever seen, report total elapsed as the latency.
    return transcript.strip(), dur, elapsed / dur, first_token_time or elapsed
async def main():
    """Run the full realtime benchmark: warmup, LibriSpeech clean/other, ACL6060."""
    # Warmup request so model/server init cost doesn't skew the first timing.
    print("Warmup...", flush=True)
    await transcribe("/home/cloud/benchmark_data/librispeech_clean_0000.wav")
    # LibriSpeech clean (full 91 samples)
    print("\n=== Voxtral vLLM Realtime / LibriSpeech clean ===", flush=True)
    clean = json.load(open("/home/cloud/benchmark_data/metadata.json"))
    wers = []; ta = tp = 0  # per-sample WERs / total audio s / total processing s
    for i, s in enumerate(clean):
        hyp, dur, rtf, fwl = await transcribe(s['path'])
        # Empty transcript counts as 100% WER instead of crashing jiwer.
        w = compute_wer(norm(s['reference']), norm(hyp)) if hyp else 1.0
        wers.append(w); ta += dur; tp += dur * rtf
        if i < 3 or i % 20 == 0:
            print(f" [{i}] {dur:.1f}s RTF={rtf:.3f} FWL={fwl:.2f}s WER={w:.1%} | {hyp[:60]}", flush=True)
    clean_wer = np.mean(wers); clean_rtf = tp / ta
    print(f" CLEAN ({len(clean)}): WER {clean_wer:.2%}, RTF {clean_rtf:.3f}\n", flush=True)
    # LibriSpeech other (full 133 samples)
    print("=== Voxtral vLLM Realtime / LibriSpeech other ===", flush=True)
    other = json.load(open("/home/cloud/benchmark_data/metadata_other.json"))
    wers2 = []; ta2 = tp2 = 0
    for i, s in enumerate(other):
        hyp, dur, rtf, fwl = await transcribe(s['path'])
        w = compute_wer(norm(s['reference']), norm(hyp)) if hyp else 1.0
        wers2.append(w); ta2 += dur; tp2 += dur * rtf
        if i < 3 or i % 20 == 0:
            print(f" [{i}] {dur:.1f}s RTF={rtf:.3f} WER={w:.1%}", flush=True)
    other_wer = np.mean(wers2); other_rtf = tp2 / ta2
    print(f" OTHER ({len(other)}): WER {other_wer:.2%}, RTF {other_rtf:.3f}\n", flush=True)
    # ACL6060 talks
    print("=== Voxtral vLLM Realtime / ACL6060 ===", flush=True)
    acl = []
    for talk in ["110", "117", "268", "367", "590"]:
        # Gold reference = concatenation of all segment texts for the talk.
        gw = []
        with open(f"/home/cloud/iwslt26-sst/inputs/en/acl6060.ts/gold-jsonl/2022.acl-long.{talk}.jsonl") as f:
            for line in f: gw.append(json.loads(line)["text"].strip())
        gold = " ".join(gw)
        hyp, dur, rtf, fwl = await transcribe(f"/home/cloud/acl6060_audio/2022.acl-long.{talk}.wav")
        w = compute_wer(norm(gold), norm(hyp)) if hyp else 1.0
        acl.append({"talk": talk, "wer": round(float(w),4), "rtf": round(float(rtf),3), "dur": round(dur,1)})
        print(f" Talk {talk}: {dur:.0f}s, WER {w:.2%}, RTF {rtf:.3f}, FWL {fwl:.2f}s", flush=True)
    acl_wer = np.mean([r["wer"] for r in acl])
    acl_rtf = np.mean([r["rtf"] for r in acl])
    print(f" ACL6060 AVERAGE: WER {acl_wer:.2%}, RTF {acl_rtf:.3f}\n", flush=True)
    # Summary
    print(f"{'='*55}")
    print(f" VOXTRAL vLLM REALTIME BENCHMARK (H100)")
    print(f"{'='*55}")
    print(f" LS clean ({len(clean)}): WER {clean_wer:.2%}, RTF {clean_rtf:.3f}")
    print(f" LS other ({len(other)}): WER {other_wer:.2%}, RTF {other_rtf:.3f}")
    print(f" ACL6060 (5): WER {acl_wer:.2%}, RTF {acl_rtf:.3f}")
    results = {
        "clean": {"avg_wer": round(float(clean_wer),4), "rtf": round(float(clean_rtf),3), "n": len(clean)},
        "other": {"avg_wer": round(float(other_wer),4), "rtf": round(float(other_rtf),3), "n": len(other)},
        "acl6060": {"avg_wer": round(float(acl_wer),4), "avg_rtf": round(float(acl_rtf),3), "talks": acl},
    }
    json.dump(results, open("/home/cloud/bench_voxtral_realtime_results.json", "w"), indent=2)
    print(f"\n Saved to /home/cloud/bench_voxtral_realtime_results.json")
# Script entry point: drive the async benchmark to completion.
asyncio.run(main())

View File

@@ -1,116 +0,0 @@
"""
Bridge between WhisperLiveKit STT and IWSLT26 MT pipeline.
Converts streaming ASRToken output from SimulStreaming into the JSONL
format expected by the AlignAtt MT agent (iwslt26-sst).
Output format (one JSON per line):
{"text": "word or phrase", "emission_time": 1.234, "is_final": false, "speech_time": 1.0}
Where:
- text: the emitted word/phrase
- emission_time: wall-clock time when the word was emitted (for compute-aware eval)
- speech_time: timestamp in the audio (for compute-unaware eval)
- is_final: whether this is the last word of a segment/silence boundary
"""
import json
import time
from typing import List, Optional, TextIO

from whisperlivekit.timed_objects import ASRToken
class CascadeBridge:
    """Converts an ASRToken stream into JSONL entries for the MT agent.

    Each entry carries both the wall-clock emission time (compute-aware
    evaluation) and the in-audio speech time (compute-unaware evaluation).
    """

    def __init__(self, output_file: Optional[TextIO] = None):
        """
        Args:
            output_file: optional open text stream; when given, every entry
                is also written (and flushed) as one JSON line on emit.
                Fix: annotated Optional — the old ``TextIO = None`` was an
                implicit-Optional violation (PEP 484).
        """
        self.output_file = output_file
        # Wall-clock reference point for emission_time values.
        self.start_time = time.time()
        self.entries: List[dict] = []

    def emit_tokens(self, tokens: "List[ASRToken]", is_final: bool = False):
        """Emit a batch of tokens from the STT.

        Only the LAST token of a final batch is flagged is_final, so a
        segment/silence boundary maps to exactly one final entry.
        (ASRToken annotation kept as a string so it is not evaluated at
        class-creation time.)
        """
        wall_clock = time.time() - self.start_time
        for i, token in enumerate(tokens):
            entry = {
                "text": token.text.strip(),
                "emission_time": round(wall_clock, 3),
                "speech_time": round(token.start, 3),
                "is_final": is_final and (i == len(tokens) - 1),
            }
            self.entries.append(entry)
            if self.output_file:
                # Flush per line so a live consumer sees entries immediately.
                self.output_file.write(json.dumps(entry) + "\n")
                self.output_file.flush()

    def get_entries(self) -> List[dict]:
        """Return every entry emitted so far (live list, not a copy)."""
        return self.entries

    def get_text(self) -> str:
        """Get the full transcribed text (entries with empty text skipped)."""
        return " ".join(e["text"] for e in self.entries if e["text"])

    def save(self, path: str):
        """Save all entries to a JSONL file (one JSON object per line)."""
        with open(path, "w") as f:
            for entry in self.entries:
                f.write(json.dumps(entry) + "\n")
def run_stt_to_jsonl(
    audio_path: str,
    output_path: str,
    model_id: str = "Qwen/Qwen3-ASR-0.6B",
    alignment_heads_path: Optional[str] = None,
    border_fraction: float = 0.20,
    language: str = "en",
    chunk_sec: float = 1.0,
):
    """Run STT on an audio file and save JSONL output for the MT agent.

    This is the main entry point for the cascade: audio file → JSONL.

    Args:
        audio_path: input WAV, read as 16-bit PCM (presumably 16 kHz mono —
            TODO confirm; no resampling is performed here).
        output_path: destination JSONL file.
        model_id: HF model id for the Qwen3 streaming ASR.
        alignment_heads_path: optional alignment-heads file passed to the ASR.
        border_fraction: passed through to Qwen3SimulKVASR.
        language: source language code.
        chunk_sec: size of the simulated streaming chunks, in seconds.

    Returns:
        The CascadeBridge holding all emitted entries.
    """
    # Heavy dependencies imported lazily so importing this module stays cheap.
    import wave
    import numpy as np
    from whisperlivekit.qwen3_simul_kv import Qwen3SimulKVASR, Qwen3SimulKVOnlineProcessor
    # Load audio as float32 in [-1, 1)
    with wave.open(audio_path, 'r') as wf:
        audio = np.frombuffer(
            wf.readframes(wf.getnframes()), dtype=np.int16
        ).astype(np.float32) / 32768.0
    # Initialize STT
    asr = Qwen3SimulKVASR(
        model_dir=model_id,
        lan=language,
        alignment_heads_path=alignment_heads_path,
        border_fraction=border_fraction,
    )
    proc = Qwen3SimulKVOnlineProcessor(asr)
    bridge = CascadeBridge()
    # Stream audio in fixed-size chunks to simulate real-time input.
    chunk_samples = int(chunk_sec * 16000)
    offset = 0
    stream_time = 0.0  # seconds of audio fed so far
    while offset < len(audio):
        chunk = audio[offset:offset + chunk_samples]
        stream_time += len(chunk) / 16000
        proc.insert_audio_chunk(chunk, stream_time)
        words, _ = proc.process_iter(is_last=False)
        if words:
            bridge.emit_tokens(words, is_final=False)
        offset += chunk_samples
    # Final flush: drain any words still buffered in the processor.
    final_words, _ = proc.finish()
    if final_words:
        bridge.emit_tokens(final_words, is_final=True)
    # Save the collected entries as JSONL for the MT agent.
    bridge.save(output_path)
    return bridge