#!/usr/bin/env python3
"""
Whisper Transcription Service

A simple Flask API that wraps faster-whisper for audio transcription.
"""

import os
import tempfile

from flask import Flask, request, jsonify
from faster_whisper import WhisperModel

app = Flask(__name__)

# Configuration from environment
WHISPER_MODEL = os.getenv('WHISPER_MODEL', 'base')
WHISPER_DEVICE = os.getenv('WHISPER_DEVICE', 'cuda')
WHISPER_COMPUTE = os.getenv('WHISPER_COMPUTE', 'float16')

# Announce the configuration at import time. The model itself is NOT loaded
# here: get_model() loads it lazily on first use (the __main__ block below
# pre-loads it when the file is run as a script).
print(f"Loading Whisper model: {WHISPER_MODEL} on {WHISPER_DEVICE} with {WHISPER_COMPUTE}")
model = None


def get_model():
    """Return the shared WhisperModel, creating it on the first call.

    The instance is cached in the module-level ``model`` global so later
    calls reuse it instead of reloading the weights.
    """
    global model
    if model is None:
        model = WhisperModel(
            WHISPER_MODEL,
            device=WHISPER_DEVICE,
            compute_type=WHISPER_COMPUTE,
        )
        print("Model loaded successfully")
    return model


def _serialize_segment(seg):
    """Convert one transcription segment into a JSON-serializable dict.

    The ``words`` key is only included when the segment carries word-level
    timestamps (``seg.words`` is non-empty).
    """
    segment_data = {
        'start': seg.start,
        'end': seg.end,
        'text': seg.text.strip(),
    }
    if seg.words:
        segment_data['words'] = [
            {
                'word': w.word.strip(),
                'start': w.start,
                'end': w.end,
            }
            for w in seg.words
        ]
    return segment_data


@app.route('/health', methods=['GET'])
def health():
    """Health check endpoint: report status and the configured model settings."""
    return jsonify({
        'status': 'ok',
        'model': WHISPER_MODEL,
        'device': WHISPER_DEVICE,
        'compute': WHISPER_COMPUTE,
    })


@app.route('/transcribe', methods=['POST'])
def transcribe():
    """
    Transcribe an audio file.

    Expects a multipart form with an 'audio' file and an optional 'language'
    form field that overrides language detection.

    Returns JSON with 'segments', 'language', 'language_probability' and
    'duration'. Responds 400 when no audio file is supplied and 500 (with the
    exception message under 'error') when saving or transcription fails.
    """
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file provided'}), 400

    audio_file = request.files['audio']
    # `not filename` also covers a missing (None) filename, not just ''.
    if not audio_file.filename:
        return jsonify({'error': 'No audio file selected'}), 400

    tmp_path = None
    try:
        # Reserve a temp path and close the handle BEFORE writing to it by
        # name: writing to a still-open NamedTemporaryFile by path fails on
        # Windows. Doing this inside the try ensures the file is removed even
        # if saving the upload raises. The suffix is always '.wav' regardless
        # of the upload's original extension.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
            tmp_path = tmp.name
        audio_file.save(tmp_path)

        whisper = get_model()

        # Optional language override; empty/missing means auto-detect.
        language = request.form.get('language')

        # Transcribe with word timestamps.
        # VAD filter enabled to reduce hallucination on non-speech audio
        # (e.g. moans, breathing).
        # condition_on_previous_text=False prevents hallucination cascading
        # (e.g. one Korean hallucination causing all subsequent segments to
        # also be Korean).
        transcribe_kwargs = {
            'word_timestamps': True,
            'vad_filter': True,
            'vad_parameters': {
                'min_silence_duration_ms': 500,
                'speech_pad_ms': 200,
            },
            'condition_on_previous_text': False,
        }
        if language:
            transcribe_kwargs['language'] = language

        # Named `transcription_info` so it does not shadow the `info` view.
        segments, transcription_info = whisper.transcribe(tmp_path, **transcribe_kwargs)

        # faster-whisper returns `segments` as a generator, so decoding
        # happens while it is consumed here; this must stay inside the try
        # and before the temp file is deleted.
        result = [_serialize_segment(seg) for seg in segments]

        return jsonify({
            'segments': result,
            'language': transcription_info.language,
            'language_probability': transcription_info.language_probability,
            'duration': transcription_info.duration,
        })

    except Exception as e:
        # Top-level request boundary: keep the JSON error contract for
        # clients, but record the traceback server-side.
        app.logger.exception('Transcription failed')
        return jsonify({'error': str(e)}), 500

    finally:
        # Best-effort cleanup of the temp file; never mask the real response.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                app.logger.warning(
                    'Could not remove temp file %s: %s', tmp_path, cleanup_error
                )


@app.route('/info', methods=['GET'])
def info():
    """Return the configured model settings and a list of known model names."""
    return jsonify({
        'model': WHISPER_MODEL,
        'device': WHISPER_DEVICE,
        'compute_type': WHISPER_COMPUTE,
        'available_models': ['tiny', 'base', 'small', 'medium', 'large-v2', 'large-v3'],
    })


if __name__ == '__main__':
    # Pre-load the model so the first request does not pay the load cost
    get_model()

    # Run with Flask's built-in server for development
    app.run(host='0.0.0.0', port=5000, debug=False)