#!/usr/bin/env python3
"""
Whisper Transcription Service

A simple Flask API that wraps faster-whisper for audio transcription.
"""

import os
import tempfile

from flask import Flask, request, jsonify
from faster_whisper import WhisperModel

app = Flask(__name__)
|
|
|
|
# Configuration from environment
|
|
WHISPER_MODEL = os.getenv('WHISPER_MODEL', 'base')
|
|
WHISPER_DEVICE = os.getenv('WHISPER_DEVICE', 'cuda')
|
|
WHISPER_COMPUTE = os.getenv('WHISPER_COMPUTE', 'float16')
|
|
|
|
# Initialize model (loaded once at startup)
|
|
print(f"Loading Whisper model: {WHISPER_MODEL} on {WHISPER_DEVICE} with {WHISPER_COMPUTE}")
|
|
model = None
|
|
|
|
def get_model():
|
|
"""Lazy load the model on first request"""
|
|
global model
|
|
if model is None:
|
|
model = WhisperModel(
|
|
WHISPER_MODEL,
|
|
device=WHISPER_DEVICE,
|
|
compute_type=WHISPER_COMPUTE
|
|
)
|
|
print(f"Model loaded successfully")
|
|
return model
|
|
|
|
|
|
@app.route('/health', methods=['GET'])
def health():
    """Liveness probe: report status plus the active model configuration."""
    payload = {
        'status': 'ok',
        'model': WHISPER_MODEL,
        'device': WHISPER_DEVICE,
        'compute': WHISPER_COMPUTE,
    }
    return jsonify(payload)
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """
    Transcribe an uploaded audio file.

    Expects a multipart form with an 'audio' file. An optional 'language'
    form field overrides automatic language detection.

    Returns JSON with:
        segments: list of {start, end, text[, words]} dicts, where words
                  carries word-level timestamps when available
        language / language_probability: detected language info
        duration: audio duration in seconds
    Responds 400 on a missing/empty upload, 500 on transcription failure.
    """
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file provided'}), 400

    audio_file = request.files['audio']

    if audio_file.filename == '':
        return jsonify({'error': 'No audio file selected'}), 400

    # Spool the upload to disk: faster-whisper takes a file path.
    # delete=False so the path remains valid after the 'with' block closes
    # the handle (required on Windows); removal happens in the finally.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        audio_file.save(tmp.name)
        tmp_path = tmp.name

    try:
        whisper = get_model()

        # Optional caller-supplied language override.
        language = request.form.get('language')

        # VAD filter enabled to reduce hallucination on non-speech audio
        # (moans, breathing). condition_on_previous_text=False prevents
        # hallucination cascading (e.g. one Korean hallucination causing
        # all subsequent segments to also be Korean).
        transcribe_kwargs = {
            'word_timestamps': True,
            'vad_filter': True,
            'vad_parameters': {
                'min_silence_duration_ms': 500,
                'speech_pad_ms': 200,
            },
            'condition_on_previous_text': False,
        }
        if language:
            transcribe_kwargs['language'] = language

        segments, info = whisper.transcribe(tmp_path, **transcribe_kwargs)

        # Flatten the segment objects into JSON-serializable dicts.
        result = []
        for seg in segments:
            segment_data = {
                'start': seg.start,
                'end': seg.end,
                'text': seg.text.strip()
            }

            # Word-level timestamps when the model produced them.
            if seg.words:
                segment_data['words'] = [
                    {'word': w.word.strip(), 'start': w.start, 'end': w.end}
                    for w in seg.words
                ]

            result.append(segment_data)

        return jsonify({
            'segments': result,
            'language': info.language,
            'language_probability': info.language_probability,
            'duration': info.duration
        })

    except Exception as e:
        # Top-level request boundary: surface any failure as a JSON 500.
        return jsonify({'error': str(e)}), 500

    finally:
        # Best-effort temp-file cleanup. Narrowed from a bare 'except:',
        # which would also swallow KeyboardInterrupt/SystemExit.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
@app.route('/info', methods=['GET'])
def info():
    """Describe the configured model and the model sizes this image supports."""
    model_info = {
        'model': WHISPER_MODEL,
        'device': WHISPER_DEVICE,
        'compute_type': WHISPER_COMPUTE,
        'available_models': [
            'tiny', 'base', 'small', 'medium', 'large-v2', 'large-v3'
        ],
    }
    return jsonify(model_info)
if __name__ == '__main__':
    # Pre-load the model so the first request doesn't pay the load cost.
    get_model()

    # Flask's built-in server — development only; use a WSGI server in prod.
    # 0.0.0.0 binds all interfaces so the service is reachable from outside
    # a container; debug=False avoids the reloader re-importing the model.
    app.run(host='0.0.0.0', port=5000, debug=False)