mirror of
https://github.com/QuentinFuxa/WhisperLiveKit.git
synced 2026-05-13 23:53:50 +00:00
Remove unused benchmark and cascade bridge scripts
This commit is contained in:
@@ -1,124 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Standalone Voxtral benchmark — no whisperlivekit imports."""
|
||||
import json, logging, re, time, wave, queue, threading
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
# Only warnings and above from our own code; silence chatty third-party loggers.
logging.basicConfig(level=logging.WARNING)
for noisy in ("transformers", "torch", "httpx"):
    logging.getLogger(noisy).setLevel(logging.ERROR)
|
||||
|
||||
from jiwer import wer as compute_wer
|
||||
from transformers import AutoProcessor, VoxtralRealtimeForConditionalGeneration, TextIteratorStreamer
|
||||
|
||||
def norm(t):
    """Normalize text for WER scoring: lowercase, alnum-only, single spaces."""
    alnum_only = re.sub(r'[^a-z0-9 ]', ' ', t.lower())
    return re.sub(r' +', ' ', alnum_only).strip()
|
||||
|
||||
def load_audio(path):
    """Read a 16-bit PCM WAV file and return float32 samples scaled to [-1, 1)."""
    with wave.open(path, 'r') as wf:
        raw = wf.readframes(wf.getnframes())
    pcm = np.frombuffer(raw, dtype=np.int16)
    return pcm.astype(np.float32) / 32768.0
|
||||
|
||||
# Load model once at import time: single GPU, bfloat16 weights.
print("Loading Voxtral-Mini-4B...", flush=True)
MODEL_ID = "mistralai/Voxtral-Mini-4B-Realtime-2602"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = VoxtralRealtimeForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
)
print(f"Loaded, GPU: {torch.cuda.memory_allocated()/1e9:.1f} GB", flush=True)
|
||||
|
||||
def transcribe_batch(audio_np):
    """Transcribe a whole utterance in one generate() call (non-streaming).

    Returns (decoded text, seconds spent inside generate()).
    """
    # The processor converts raw float audio into model input_features.
    batch = processor(
        audio=audio_np,
        sampling_rate=16000,
        return_tensors="pt",
    ).to("cuda:0").to(torch.bfloat16)

    start = time.perf_counter()
    with torch.inference_mode():
        out_ids = model.generate(**batch, max_new_tokens=1024)
    gen_seconds = time.perf_counter() - start

    decoded = processor.batch_decode(out_ids, skip_special_tokens=True)[0]
    return decoded.strip(), gen_seconds
|
||||
|
||||
# 1. LibriSpeech test-clean
print("\n=== Voxtral / LibriSpeech test-clean ===", flush=True)
# with-block closes the metadata file; the original json.load(open(...))
# leaked the file descriptor.
with open("/home/cloud/benchmark_data/metadata.json") as f:
    clean = json.load(f)
wers = []          # per-sample WER
ta = tp = 0        # total audio seconds / total processing seconds
for i, s in enumerate(clean):
    audio = load_audio(s['path'])
    hyp, pt = transcribe_batch(audio)
    w = compute_wer(norm(s['reference']), norm(hyp))
    wers.append(w); ta += s['duration']; tp += pt
    # Print the first few samples plus every 20th for progress visibility.
    if i < 3 or i % 20 == 0:
        print(f" [{i}] {s['duration']:.1f}s RTF={pt/s['duration']:.2f} WER={w:.1%} | {hyp[:60]}", flush=True)
clean_wer = np.mean(wers); clean_rtf = tp/ta
print(f" CLEAN: WER {clean_wer:.2%}, RTF {clean_rtf:.3f} ({len(clean)} samples, {ta:.0f}s)")
|
||||
|
||||
# 2. LibriSpeech test-other
print("\n=== Voxtral / LibriSpeech test-other ===", flush=True)
# with-block closes the metadata file; the original json.load(open(...))
# leaked the file descriptor.
with open("/home/cloud/benchmark_data/metadata_other.json") as f:
    other = json.load(f)
wers2 = []           # per-sample WER
ta2 = tp2 = 0        # total audio seconds / total processing seconds
for i, s in enumerate(other):
    audio = load_audio(s['path'])
    hyp, pt = transcribe_batch(audio)
    w = compute_wer(norm(s['reference']), norm(hyp))
    wers2.append(w); ta2 += s['duration']; tp2 += pt
    if i < 3 or i % 20 == 0:
        print(f" [{i}] {s['duration']:.1f}s RTF={pt/s['duration']:.2f} WER={w:.1%}", flush=True)
other_wer = np.mean(wers2); other_rtf = tp2/ta2
print(f" OTHER: WER {other_wer:.2%}, RTF {other_rtf:.3f} ({len(other)} samples, {ta2:.0f}s)")
|
||||
|
||||
# 3. ACL6060
print("\n=== Voxtral / ACL6060 ===", flush=True)
acl_results = []
for talk in ["110", "117", "268", "367", "590"]:
    audio = load_audio(f"/home/cloud/acl6060_audio/2022.acl-long.{talk}.wav")
    dur = len(audio) / 16000

    # Gold transcript: concatenate the per-segment reference lines.
    gw = []
    with open(f"/home/cloud/iwslt26-sst/inputs/en/acl6060.ts/gold-jsonl/2022.acl-long.{talk}.jsonl") as f:
        for line in f:
            gw.append(json.loads(line)["text"].strip())
    gold = " ".join(gw)

    # For long audio, process in 30s chunks
    all_hyp = []
    t0 = time.perf_counter()
    chunk_size = 30 * 16000
    for start in range(0, len(audio), chunk_size):
        piece = audio[start:start + chunk_size]
        if len(piece) < 1600:  # skip very short tail
            continue
        piece_text, _ = transcribe_batch(piece)
        all_hyp.append(piece_text)
    t1 = time.perf_counter()

    full_hyp = " ".join(all_hyp)
    w = compute_wer(norm(gold), norm(full_hyp))
    rtf = (t1 - t0) / dur
    acl_results.append({"talk": talk, "wer": w, "rtf": rtf, "dur": dur})
    print(f" Talk {talk}: {dur:.0f}s, WER {w:.2%}, RTF {rtf:.3f}", flush=True)

acl_wer = np.mean([r["wer"] for r in acl_results])
acl_rtf = np.mean([r["rtf"] for r in acl_results])
print(f" ACL6060 AVERAGE: WER {acl_wer:.2%}, RTF {acl_rtf:.3f}")
|
||||
|
||||
# Summary
print(f"\n{'='*60}")
print(f" VOXTRAL BENCHMARK SUMMARY (H100 80GB)")
print(f"{'='*60}")
print(f" {'Dataset':>25} {'WER':>7} {'RTF':>7}")
print(f" {'-'*42}")
print(f" {'LibriSpeech clean':>25} {clean_wer:>6.2%} {clean_rtf:>7.3f}")
print(f" {'LibriSpeech other':>25} {other_wer:>6.2%} {other_rtf:>7.3f}")
print(f" {'ACL6060 (5 talks)':>25} {acl_wer:>6.2%} {acl_rtf:>7.3f}")

# Machine-readable results; numpy scalars are cast to JSON-native floats.
results = {
    "clean": {"avg_wer": round(float(clean_wer), 4), "rtf": round(float(clean_rtf), 3)},
    "other": {"avg_wer": round(float(other_wer), 4), "rtf": round(float(other_rtf), 3)},
    "acl6060": {"avg_wer": round(float(acl_wer), 4), "avg_rtf": round(float(acl_rtf), 3),
                "talks": [{k: (round(float(v), 4) if isinstance(v, (float, np.floating)) else v) for k, v in r.items()} for r in acl_results]},
}
# with-block guarantees the results file is flushed and closed; the original
# json.dump(results, open(..., "w")) leaked the handle.
with open("/home/cloud/bench_voxtral_results.json", "w") as f:
    json.dump(results, f, indent=2)
print(f"\nSaved to /home/cloud/bench_voxtral_results.json")
|
||||
@@ -1,122 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Benchmark Voxtral via vLLM WebSocket /v1/realtime — proper streaming."""
|
||||
import asyncio, json, base64, time, wave, re, os
|
||||
import numpy as np
|
||||
import websockets
|
||||
import librosa
|
||||
from jiwer import wer as compute_wer
|
||||
|
||||
# Model id announced in session.update, and the local vLLM realtime endpoint.
MODEL = "mistralai/Voxtral-Mini-4B-Realtime-2602"
WS_URI = "ws://localhost:8000/v1/realtime"
|
||||
|
||||
def norm(t):
    """Normalize for WER: lowercase, drop non-alphanumerics, collapse spaces."""
    lowered = t.lower()
    return re.sub(r' +', ' ', re.sub(r'[^a-z0-9 ]', ' ', lowered)).strip()
|
||||
|
||||
async def transcribe(audio_path, max_tokens=4096):
    """Stream one file over the realtime websocket.

    Returns (transcript, audio seconds, real-time factor, first-word latency).
    """
    audio, _ = librosa.load(audio_path, sr=16000, mono=True)
    pcm16 = (audio * 32767).astype(np.int16).tobytes()
    dur = len(audio) / 16000

    t0 = time.time()
    transcript = ""
    first_token_time = None

    async with websockets.connect(WS_URI, max_size=2**24) as ws:
        await ws.recv()  # session.created
        await ws.send(json.dumps({"type": "session.update", "model": MODEL}))
        await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))  # signal ready

        # Stream the PCM as 4 KB base64 chunks.
        for off in range(0, len(pcm16), 4096):
            payload = {
                "type": "input_audio_buffer.append",
                "audio": base64.b64encode(pcm16[off:off + 4096]).decode(),
            }
            await ws.send(json.dumps(payload))

        await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": True}))

        # Collect deltas until the server reports done/error, or we time out.
        while True:
            try:
                msg = json.loads(await asyncio.wait_for(ws.recv(), timeout=120))
            except asyncio.TimeoutError:
                break
            kind = msg["type"]
            if kind == "transcription.delta":
                delta = msg.get("delta", "")
                if delta.strip() and first_token_time is None:
                    first_token_time = time.time() - t0
                transcript += delta
            elif kind == "transcription.done":
                transcript = msg.get("text", transcript)
                break
            elif kind == "error":
                break

    elapsed = time.time() - t0
    return transcript.strip(), dur, elapsed / dur, first_token_time or elapsed
|
||||
|
||||
async def main():
    """Benchmark driver: warmup, LibriSpeech clean/other, ACL6060, JSON summary."""
    # Warmup — first request pays connection/model spin-up cost.
    print("Warmup...", flush=True)
    await transcribe("/home/cloud/benchmark_data/librispeech_clean_0000.wav")

    # LibriSpeech clean (full 91 samples)
    print("\n=== Voxtral vLLM Realtime / LibriSpeech clean ===", flush=True)
    # with-blocks close the metadata files; the original json.load(open(...))
    # leaked the descriptors.
    with open("/home/cloud/benchmark_data/metadata.json") as f:
        clean = json.load(f)
    wers = []; ta = tp = 0
    for i, s in enumerate(clean):
        hyp, dur, rtf, fwl = await transcribe(s['path'])
        # An empty hypothesis scores 100% WER rather than crashing jiwer.
        w = compute_wer(norm(s['reference']), norm(hyp)) if hyp else 1.0
        wers.append(w); ta += dur; tp += dur * rtf
        if i < 3 or i % 20 == 0:
            print(f" [{i}] {dur:.1f}s RTF={rtf:.3f} FWL={fwl:.2f}s WER={w:.1%} | {hyp[:60]}", flush=True)
    clean_wer = np.mean(wers); clean_rtf = tp / ta
    print(f" CLEAN ({len(clean)}): WER {clean_wer:.2%}, RTF {clean_rtf:.3f}\n", flush=True)

    # LibriSpeech other (full 133 samples)
    print("=== Voxtral vLLM Realtime / LibriSpeech other ===", flush=True)
    with open("/home/cloud/benchmark_data/metadata_other.json") as f:
        other = json.load(f)
    wers2 = []; ta2 = tp2 = 0
    for i, s in enumerate(other):
        hyp, dur, rtf, fwl = await transcribe(s['path'])
        w = compute_wer(norm(s['reference']), norm(hyp)) if hyp else 1.0
        wers2.append(w); ta2 += dur; tp2 += dur * rtf
        if i < 3 or i % 20 == 0:
            print(f" [{i}] {dur:.1f}s RTF={rtf:.3f} WER={w:.1%}", flush=True)
    other_wer = np.mean(wers2); other_rtf = tp2 / ta2
    print(f" OTHER ({len(other)}): WER {other_wer:.2%}, RTF {other_rtf:.3f}\n", flush=True)

    # ACL6060 talks
    print("=== Voxtral vLLM Realtime / ACL6060 ===", flush=True)
    acl = []
    for talk in ["110", "117", "268", "367", "590"]:
        # Gold transcript: concatenation of per-segment reference lines.
        gw = []
        with open(f"/home/cloud/iwslt26-sst/inputs/en/acl6060.ts/gold-jsonl/2022.acl-long.{talk}.jsonl") as f:
            for line in f:
                gw.append(json.loads(line)["text"].strip())
        gold = " ".join(gw)

        hyp, dur, rtf, fwl = await transcribe(f"/home/cloud/acl6060_audio/2022.acl-long.{talk}.wav")
        w = compute_wer(norm(gold), norm(hyp)) if hyp else 1.0
        acl.append({"talk": talk, "wer": round(float(w), 4), "rtf": round(float(rtf), 3), "dur": round(dur, 1)})
        print(f" Talk {talk}: {dur:.0f}s, WER {w:.2%}, RTF {rtf:.3f}, FWL {fwl:.2f}s", flush=True)

    acl_wer = np.mean([r["wer"] for r in acl])
    acl_rtf = np.mean([r["rtf"] for r in acl])
    print(f" ACL6060 AVERAGE: WER {acl_wer:.2%}, RTF {acl_rtf:.3f}\n", flush=True)

    # Summary
    print(f"{'='*55}")
    print(f" VOXTRAL vLLM REALTIME BENCHMARK (H100)")
    print(f"{'='*55}")
    print(f" LS clean ({len(clean)}): WER {clean_wer:.2%}, RTF {clean_rtf:.3f}")
    print(f" LS other ({len(other)}): WER {other_wer:.2%}, RTF {other_rtf:.3f}")
    print(f" ACL6060 (5): WER {acl_wer:.2%}, RTF {acl_rtf:.3f}")

    results = {
        "clean": {"avg_wer": round(float(clean_wer), 4), "rtf": round(float(clean_rtf), 3), "n": len(clean)},
        "other": {"avg_wer": round(float(other_wer), 4), "rtf": round(float(other_rtf), 3), "n": len(other)},
        "acl6060": {"avg_wer": round(float(acl_wer), 4), "avg_rtf": round(float(acl_rtf), 3), "talks": acl},
    }
    # with-block flushes and closes the results file; the original
    # json.dump(results, open(..., "w")) leaked the handle.
    with open("/home/cloud/bench_voxtral_realtime_results.json", "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n Saved to /home/cloud/bench_voxtral_realtime_results.json")
|
||||
|
||||
# Guard the entry point so importing this module does not start the benchmark.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,116 +0,0 @@
|
||||
"""
|
||||
Bridge between WhisperLiveKit STT and IWSLT26 MT pipeline.
|
||||
|
||||
Converts streaming ASRToken output from SimulStreaming into the JSONL
|
||||
format expected by the AlignAtt MT agent (iwslt26-sst).
|
||||
|
||||
Output format (one JSON per line):
|
||||
{"text": "word or phrase", "emission_time": 1.234, "is_final": false, "speech_time": 1.0}
|
||||
|
||||
Where:
|
||||
- text: the emitted word/phrase
|
||||
- emission_time: wall-clock time when the word was emitted (for compute-aware eval)
|
||||
- speech_time: timestamp in the audio (for compute-unaware eval)
|
||||
- is_final: whether this is the last word of a segment/silence boundary
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from typing import List, TextIO
|
||||
|
||||
from whisperlivekit.timed_objects import ASRToken
|
||||
|
||||
|
||||
class CascadeBridge:
    """Converts ASRToken stream to JSONL for the MT agent."""

    def __init__(self, output_file: "TextIO | None" = None):
        """
        Args:
            output_file: optional already-open text stream; when provided,
                every emitted entry is also written (and flushed) to it as
                one JSON object per line. The annotation is quoted and made
                Optional — the original declared plain ``TextIO`` despite the
                ``None`` default.
        """
        self.output_file = output_file
        self.start_time = time.time()  # wall-clock origin for emission_time
        self.entries: List[dict] = []  # all emitted entries, in order

    def emit_tokens(self, tokens: "List[ASRToken]", is_final: bool = False):
        """Emit a batch of tokens from the STT.

        Each token becomes one entry; only the last token of the batch carries
        ``is_final=True``, and only when the caller flagged the batch final.
        All tokens in one batch share the same emission_time (one wall-clock
        read per call).
        """
        wall_clock = time.time() - self.start_time

        for i, token in enumerate(tokens):
            entry = {
                "text": token.text.strip(),
                "emission_time": round(wall_clock, 3),
                "speech_time": round(token.start, 3),
                "is_final": is_final and (i == len(tokens) - 1),
            }
            self.entries.append(entry)
            if self.output_file:
                self.output_file.write(json.dumps(entry) + "\n")
                self.output_file.flush()

    def get_entries(self) -> List[dict]:
        """Return all emitted entries (the live list, not a copy)."""
        return self.entries

    def get_text(self) -> str:
        """Get the full transcribed text."""
        return " ".join(e["text"] for e in self.entries if e["text"])

    def save(self, path: str):
        """Save all entries to a JSONL file."""
        with open(path, "w") as f:
            for entry in self.entries:
                f.write(json.dumps(entry) + "\n")
|
||||
|
||||
|
||||
def run_stt_to_jsonl(
    audio_path: str,
    output_path: str,
    model_id: str = "Qwen/Qwen3-ASR-0.6B",
    alignment_heads_path: "str | None" = None,
    border_fraction: float = 0.20,
    language: str = "en",
    chunk_sec: float = 1.0,
):
    """Run STT on an audio file and save JSONL output for the MT agent.

    This is the main entry point for the cascade: audio file → JSONL.

    Args:
        audio_path: path to a 16-bit PCM WAV file (read via ``wave``;
            assumed 16 kHz mono — TODO confirm against callers).
        output_path: destination JSONL file.
        model_id: model identifier forwarded to Qwen3SimulKVASR.
        alignment_heads_path: optional config path forwarded to the ASR
            (annotation made Optional — original said plain ``str`` with a
            ``None`` default).
        border_fraction: forwarded to the ASR; semantics defined there.
        language: language code forwarded to the ASR.
        chunk_sec: streaming chunk size in seconds of audio.

    Returns:
        The CascadeBridge holding every emitted entry.
    """
    import wave

    import numpy as np
    from whisperlivekit.qwen3_simul_kv import Qwen3SimulKVASR, Qwen3SimulKVOnlineProcessor

    # Load audio as float32 scaled to [-1, 1).
    with wave.open(audio_path, 'r') as wf:
        audio = np.frombuffer(
            wf.readframes(wf.getnframes()), dtype=np.int16
        ).astype(np.float32) / 32768.0

    # Initialize STT
    asr = Qwen3SimulKVASR(
        model_dir=model_id,
        lan=language,
        alignment_heads_path=alignment_heads_path,
        border_fraction=border_fraction,
    )
    proc = Qwen3SimulKVOnlineProcessor(asr)
    bridge = CascadeBridge()

    # Stream audio in fixed-size chunks, emitting words as the processor
    # commits them.
    chunk_samples = int(chunk_sec * 16000)
    offset = 0
    stream_time = 0.0  # seconds of audio fed so far (drives speech_time)

    while offset < len(audio):
        chunk = audio[offset:offset + chunk_samples]
        stream_time += len(chunk) / 16000
        proc.insert_audio_chunk(chunk, stream_time)
        words, _ = proc.process_iter(is_last=False)
        if words:
            bridge.emit_tokens(words, is_final=False)
        offset += chunk_samples

    # Final flush: let the processor commit any pending words.
    final_words, _ = proc.finish()
    if final_words:
        bridge.emit_tokens(final_words, is_final=True)

    # Save
    bridge.save(output_path)
    return bridge
|
||||
Reference in New Issue
Block a user