#!/usr/bin/env python3
"""
Whisper Transcription Service

A simple Flask API that wraps faster-whisper for audio transcription.
"""

import os
import tempfile

from flask import Flask, request, jsonify
from faster_whisper import WhisperModel

app = Flask(__name__)
|
|
|
|
# Configuration from environment
|
|
WHISPER_MODEL = os.getenv('WHISPER_MODEL', 'base')
|
|
WHISPER_DEVICE = os.getenv('WHISPER_DEVICE', 'cuda')
|
|
WHISPER_COMPUTE = os.getenv('WHISPER_COMPUTE', 'float16')
|
|
|
|
# Initialize model (loaded once at startup)
|
|
print(f"Loading Whisper model: {WHISPER_MODEL} on {WHISPER_DEVICE} with {WHISPER_COMPUTE}")
|
|
model = None
|
|
|
|
def get_model():
|
|
"""Lazy load the model on first request"""
|
|
global model
|
|
if model is None:
|
|
model = WhisperModel(
|
|
WHISPER_MODEL,
|
|
device=WHISPER_DEVICE,
|
|
compute_type=WHISPER_COMPUTE
|
|
)
|
|
print(f"Model loaded successfully")
|
|
return model
|
|
|
|
|
|
@app.route('/health', methods=['GET'])
def health():
    """Liveness probe: report status plus the active model configuration."""
    payload = {
        'status': 'ok',
        'model': WHISPER_MODEL,
        'device': WHISPER_DEVICE,
        'compute': WHISPER_COMPUTE,
    }
    return jsonify(payload)
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """
    Transcribe an uploaded audio file.

    Expects a multipart form with an 'audio' file. An optional 'language'
    form field overrides automatic language detection.

    Returns JSON with:
        segments: list of {start, end, text[, words]} dicts, where words
                  carries word-level timestamps when available
        language / language_probability: detected language info
        duration: audio duration in seconds
    Responds 400 on a missing/empty upload, 500 on transcription failure.
    """
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file provided'}), 400

    audio_file = request.files['audio']

    if audio_file.filename == '':
        return jsonify({'error': 'No audio file selected'}), 400

    # Spool the upload to disk: faster-whisper takes a file path.
    # delete=False so the path remains valid after the 'with' block closes
    # the handle (required on Windows); removal happens in the finally.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        audio_file.save(tmp.name)
        tmp_path = tmp.name

    try:
        whisper = get_model()

        # Optional caller-supplied language override.
        language = request.form.get('language')

        # VAD filter enabled to reduce hallucination on non-speech audio
        # (moans, breathing). condition_on_previous_text=False prevents
        # hallucination cascading (e.g. one Korean hallucination causing
        # all subsequent segments to also be Korean).
        transcribe_kwargs = {
            'word_timestamps': True,
            'vad_filter': True,
            'vad_parameters': {
                'min_silence_duration_ms': 500,
                'speech_pad_ms': 200,
            },
            'condition_on_previous_text': False,
        }
        if language:
            transcribe_kwargs['language'] = language

        segments, info = whisper.transcribe(tmp_path, **transcribe_kwargs)

        # Flatten the segment objects into JSON-serializable dicts.
        result = []
        for seg in segments:
            segment_data = {
                'start': seg.start,
                'end': seg.end,
                'text': seg.text.strip()
            }

            # Word-level timestamps when the model produced them.
            if seg.words:
                segment_data['words'] = [
                    {'word': w.word.strip(), 'start': w.start, 'end': w.end}
                    for w in seg.words
                ]

            result.append(segment_data)

        return jsonify({
            'segments': result,
            'language': info.language,
            'language_probability': info.language_probability,
            'duration': info.duration
        })

    except Exception as e:
        # Top-level request boundary: surface any failure as a JSON 500.
        return jsonify({'error': str(e)}), 500

    finally:
        # Best-effort temp-file cleanup. Narrowed from a bare 'except:',
        # which would also swallow KeyboardInterrupt/SystemExit.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
@app.route('/info', methods=['GET'])
def info():
    """Describe the configured model and the model sizes this image supports."""
    model_info = {
        'model': WHISPER_MODEL,
        'device': WHISPER_DEVICE,
        'compute_type': WHISPER_COMPUTE,
        'available_models': [
            'tiny', 'base', 'small', 'medium', 'large-v2', 'large-v3'
        ],
    }
    return jsonify(model_info)
if __name__ == '__main__':
    # Pre-load the model so the first request doesn't pay the load cost.
    get_model()

    # Flask's built-in server — development only; use a WSGI server in prod.
    # 0.0.0.0 binds all interfaces so the service is reachable from outside
    # a container; debug=False avoids the reloader re-importing the model.
    app.run(host='0.0.0.0', port=5000, debug=False)