151 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""
Whisper Transcription Service
A simple Flask API that wraps faster-whisper for audio transcription.
"""
import os
import tempfile
from flask import Flask, request, jsonify
from faster_whisper import WhisperModel
# WSGI application object; the route decorators in this module register on it.
app = Flask(__name__)
# Configuration from environment (read once, at import time).
# Model name/size handed to WhisperModel as its first argument.
WHISPER_MODEL = os.getenv('WHISPER_MODEL', 'base')
# Value forwarded as WhisperModel's `device` argument.
WHISPER_DEVICE = os.getenv('WHISPER_DEVICE', 'cuda')
# Value forwarded as WhisperModel's `compute_type` argument.
WHISPER_COMPUTE = os.getenv('WHISPER_COMPUTE', 'float16')
# Announce the configured model. NOTE: nothing is loaded at this point; the
# message is printed on import, while the model is only built by get_model().
print(f"Loading Whisper model: {WHISPER_MODEL} on {WHISPER_DEVICE} with {WHISPER_COMPUTE}")
# Process-wide model cache: None until get_model() populates it.
model = None
def get_model():
    """Return the shared Whisper model, building it on the first call.

    The instance is cached in the module-level ``model`` global, so every
    later call returns the same object without reloading anything.

    Returns:
        The cached ``WhisperModel`` configured from ``WHISPER_MODEL``,
        ``WHISPER_DEVICE`` and ``WHISPER_COMPUTE``.
    """
    global model
    # Fast path: a previous call already built the model.
    if model is not None:
        return model
    model = WhisperModel(
        WHISPER_MODEL,
        device=WHISPER_DEVICE,
        compute_type=WHISPER_COMPUTE,
    )
    print("Model loaded successfully")
    return model
@app.route('/health', methods=['GET'])
def health():
    """Report liveness along with the configured model settings.

    Returns:
        A JSON response with a constant ``status`` of ``'ok'`` plus the
        configured model name, device and compute type. The model itself is
        not accessed, so calling this endpoint never triggers a model load.
    """
    payload = dict(
        status='ok',
        model=WHISPER_MODEL,
        device=WHISPER_DEVICE,
        compute=WHISPER_COMPUTE,
    )
    return jsonify(payload)
def _segment_to_dict(seg):
    """Convert one transcription segment into a JSON-serialisable dict."""
    segment_data = {
        'start': seg.start,
        'end': seg.end,
        'text': seg.text.strip(),
    }
    # Word-level timestamps are attached only when the segment carries any.
    if seg.words:
        segment_data['words'] = [
            {
                'word': w.word.strip(),
                'start': w.start,
                'end': w.end,
            }
            for w in seg.words
        ]
    return segment_data


@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe an uploaded audio file.

    Expects a multipart form with an ``audio`` file part and, optionally, a
    ``language`` form field that is forwarded to the transcriber as a
    language override.

    Returns:
        * 200 with JSON ``{'segments', 'language', 'language_probability',
          'duration'}``; each segment has ``start``, ``end``, ``text`` and,
          when present, a ``words`` list of ``{'word', 'start', 'end'}``.
        * 400 with ``{'error': ...}`` when the ``audio`` part is missing or
          has an empty filename.
        * 500 with ``{'error': str(exc)}`` when saving or transcribing
          fails; the traceback is written to the application log.
    """
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file provided'}), 400
    audio_file = request.files['audio']
    if audio_file.filename == '':
        return jsonify({'error': 'No audio file selected'}), 400

    # Reserve a temp path only. The handle is closed before anything is
    # written so the upload can be saved (and later read) by path on every
    # platform. delete=False keeps the file around until the `finally` below.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        tmp_path = tmp.name

    try:
        # Saving happens inside the try so a failed write is still cleaned up.
        audio_file.save(tmp_path)

        whisper = get_model()

        # Optional language override; empty or missing means auto-detect.
        language = request.form.get('language')

        # Transcribe with word timestamps.
        # The VAD filter reduces hallucination on non-speech audio (e.g.
        # breathing). condition_on_previous_text=False prevents hallucination
        # cascading (e.g. one Korean hallucination causing all subsequent
        # segments to also be Korean).
        transcribe_kwargs = {
            'word_timestamps': True,
            'vad_filter': True,
            'vad_parameters': {
                'min_silence_duration_ms': 500,
                'speech_pad_ms': 200,
            },
            'condition_on_previous_text': False,
        }
        if language:
            transcribe_kwargs['language'] = language

        segments, transcription_info = whisper.transcribe(
            tmp_path, **transcribe_kwargs
        )

        # faster-whisper documents `segments` as a generator, so it must be
        # consumed here, while the temp file still exists.
        result = [_segment_to_dict(seg) for seg in segments]

        return jsonify({
            'segments': result,
            'language': transcription_info.language,
            'language_probability': transcription_info.language_probability,
            'duration': transcription_info.duration,
        })
    except Exception as e:
        # Top-level request boundary: record the traceback, then report the
        # failure to the client in the same shape as before.
        app.logger.exception('Transcription failed')
        return jsonify({'error': str(e)}), 500
    finally:
        # Best-effort cleanup: a file that cannot be removed must not mask
        # the response, but only filesystem errors are tolerated.
        try:
            os.unlink(tmp_path)
        except OSError:
            app.logger.warning('Could not remove temp file %s', tmp_path)
@app.route('/info', methods=['GET'])
def info():
    """Describe the configured model.

    Returns:
        A JSON response with the configured model name, device and compute
        type, plus ``available_models``: a fixed list of model names
        hard-coded in this function (it is not queried from the library).
    """
    supported = ['tiny', 'base', 'small', 'medium', 'large-v2', 'large-v3']
    details = {
        'model': WHISPER_MODEL,
        'device': WHISPER_DEVICE,
        'compute_type': WHISPER_COMPUTE,
        'available_models': supported,
    }
    return jsonify(details)
if __name__ == '__main__':
    # Build the model up front so the first request does not pay for the load.
    get_model()
    # Flask's built-in development server, listening on all interfaces.
    app.run(debug=False, port=5000, host='0.0.0.0')