""" Whisper transcription module using faster-whisper. """ import os import io import tempfile import logging from faster_whisper import WhisperModel from pydub import AudioSegment logger = logging.getLogger(__name__) # Global model instance (loaded once) _model = None def get_model(): """Get or initialize the Whisper model.""" global _model if _model is None: model_size = os.environ.get('WHISPER_MODEL', 'base') device = os.environ.get('WHISPER_DEVICE', 'cpu') compute_type = os.environ.get('WHISPER_COMPUTE_TYPE', 'int8') logger.info(f"Loading Whisper model: {model_size} on {device} ({compute_type})") _model = WhisperModel( model_size, device=device, compute_type=compute_type ) logger.info("Whisper model loaded successfully") return _model def transcribe_audio(audio_bytes, format='webm'): """ Transcribe audio bytes to text. Args: audio_bytes: Raw audio data format: Audio format (default: webm) Returns: Transcribed text string """ if not audio_bytes: return "" try: # Convert audio to WAV format that Whisper expects audio = AudioSegment.from_file( io.BytesIO(audio_bytes), format=format ) # Convert to 16kHz mono WAV (Whisper's expected format) audio = audio.set_frame_rate(16000).set_channels(1) # Export to temporary file (faster-whisper needs a file path) with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp: audio.export(tmp.name, format='wav') tmp_path = tmp.name try: # Transcribe model = get_model() segments, info = model.transcribe( tmp_path, beam_size=5, vad_filter=True, vad_parameters=dict( min_silence_duration_ms=500 ) ) # Combine all segments into text text = ' '.join(segment.text.strip() for segment in segments) return text.strip() finally: # Clean up temp file if os.path.exists(tmp_path): os.unlink(tmp_path) except Exception as e: logger.error(f"Transcription error: {e}") return "" def preload_model(): """Preload the model during startup.""" try: get_model() return True except Exception as e: logger.error(f"Failed to preload model: {e}") return False