# Real-time speech-to-text using OpenAI Whisper (faster-whisper).
# Features browser audio capture, WebSocket streaming, and customizable
# display settings.
"""
|
|
Whisper transcription module using faster-whisper.
|
|
"""
|
|
|
|
import os
|
|
import io
|
|
import tempfile
|
|
import logging
|
|
from faster_whisper import WhisperModel
|
|
from pydub import AudioSegment
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Global model instance (loaded once)
|
|
_model = None
|
|
|
|
|
|
def get_model():
    """Get or lazily initialize the shared Whisper model (singleton).

    Configuration is read from environment variables on first call:
        WHISPER_MODEL        model size/name (default: 'base')
        WHISPER_DEVICE       inference device (default: 'cpu')
        WHISPER_COMPUTE_TYPE quantization/compute type (default: 'int8')

    Returns:
        The process-wide WhisperModel instance.
    """
    global _model

    if _model is None:
        model_size = os.environ.get('WHISPER_MODEL', 'base')
        device = os.environ.get('WHISPER_DEVICE', 'cpu')
        compute_type = os.environ.get('WHISPER_COMPUTE_TYPE', 'int8')

        # Lazy %-style args: formatting is skipped if INFO logging is
        # disabled (preferred over f-strings in logging calls).
        logger.info("Loading Whisper model: %s on %s (%s)",
                    model_size, device, compute_type)

        _model = WhisperModel(
            model_size,
            device=device,
            compute_type=compute_type
        )

        logger.info("Whisper model loaded successfully")

    return _model
|
|
|
|
|
|
def transcribe_audio(audio_bytes, format='webm'):
    """
    Transcribe audio bytes to text.

    Args:
        audio_bytes: Raw audio data (e.g. a browser-recorded chunk).
        format: Container format of audio_bytes (default: webm).
                NOTE: shadows the builtin ``format``; name kept for
                backward compatibility with keyword callers.

    Returns:
        Transcribed text string; "" for empty input or on any error
        (errors are logged, never raised — this is a best-effort
        streaming path).
    """
    if not audio_bytes:
        return ""

    try:
        # Decode the incoming container, then resample to 16 kHz mono —
        # the input format Whisper expects.
        audio = AudioSegment.from_file(
            io.BytesIO(audio_bytes),
            format=format
        )
        audio = audio.set_frame_rate(16000).set_channels(1)

        # faster-whisper needs a file path. Create the temp file and
        # close the descriptor BEFORE exporting: with NamedTemporaryFile
        # the handle stays open, and on Windows a file held open cannot
        # be reopened by name, so pydub's export would fail.
        fd, tmp_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)

        try:
            audio.export(tmp_path, format='wav')

            # Transcribe with voice-activity detection to skip silences.
            model = get_model()
            segments, _info = model.transcribe(
                tmp_path,
                beam_size=5,
                vad_filter=True,
                vad_parameters=dict(
                    min_silence_duration_ms=500
                )
            )

            # `segments` is a lazy generator; joining consumes it.
            text = ' '.join(segment.text.strip() for segment in segments)
            return text.strip()

        finally:
            # Always remove the temp file, even if transcription fails.
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

    except Exception:
        # logger.exception preserves the full traceback (logger.error
        # with just the message would drop it).
        logger.exception("Transcription error")
        return ""
|
|
|
|
|
|
def preload_model():
    """Eagerly load the Whisper model during startup.

    Returns:
        True if the model loaded, False if loading failed (the error is
        logged rather than raised so startup can continue).
    """
    try:
        get_model()
    except Exception as e:
        logger.error(f"Failed to preload model: {e}")
        return False
    return True
|