import gradio as gr
from transformers import AutomaticSpeechRecognitionPipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
import numpy as np
import torch
import torchaudio

# Fine-tuned Nepali Whisper checkpoints
model_urls = [
    "kiranpantha/whisper-tiny-ne",
    "kiranpantha/whisper-base-ne",
    "kiranpantha/whisper-small-np",
    "kiranpantha/whisper-medium-nepali",
    "kiranpantha/whisper-large-v3-nepali",
    "kiranpantha/whisper-large-v3-turbo-nepali",
]

# Map each fine-tuned checkpoint to the base processor it was trained from
processor_mappings = {
    "kiranpantha/whisper-tiny-ne": "openai/whisper-tiny",
    "kiranpantha/whisper-base-ne": "openai/whisper-base",
    "kiranpantha/whisper-small-np": "openai/whisper-small",
    "kiranpantha/whisper-medium-nepali": "openai/whisper-medium",
    "kiranpantha/whisper-large-v3-nepali": "openai/whisper-large-v3",
    "kiranpantha/whisper-large-v3-turbo-nepali": "openai/whisper-large-v3",
}

# Cache models and processors so each checkpoint is only loaded once
model_cache = {}


def load_model(model_name):
    """Loads and caches the model and processor with proper device management."""
    if model_name not in model_cache:
        processor_name = processor_mappings.get(model_name, model_name)  # fall back to the checkpoint itself
        processor = AutoProcessor.from_pretrained(processor_name)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name).to(device)
        model_cache[model_name] = (processor, model, device)
    return model_cache[model_name]


def create_pipeline(model_name):
    """Creates an ASR pipeline with proper configuration."""
    processor, model, device = load_model(model_name)
    return AutomaticSpeechRecognitionPipeline(
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        # A bare torch.device("cuda") has index None, so pass an explicit GPU index
        device=0 if device.type == "cuda" else -1,
        generate_kwargs={"task": "transcribe", "language": "ne"},  # ISO code; "nepali" may not be accepted
    )


def process_audio(model_url, audio_chunk):
    """Processes audio and returns a transcription, with error handling."""
    try:
        if audio_chunk is None:
            return "Error: no audio provided."

        # gr.Audio(type="numpy") yields (sample_rate, audio_array) — in that order
        sample_rate, audio_array = audio_chunk

        # Convert stereo (samples, channels) to mono by averaging the channel axis
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # Gradio delivers int16 PCM; Whisper expects float32 in [-1, 1]
        if audio_array.dtype != np.float32:
            audio_array = audio_array.astype(np.float32) / 32768.0

        # Resample to the 16 kHz rate Whisper was trained on
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            audio_array = resampler(torch.from_numpy(audio_array).unsqueeze(0)).squeeze(0).numpy()

        # Create pipeline and transcribe
        asr_pipeline = create_pipeline(model_url)
        transcription = asr_pipeline({"raw": audio_array, "sampling_rate": 16000})["text"]
        return transcription
    except Exception as e:
        return f"Error: {str(e)}"


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Nepali Speech Recognition with Whisper Models")
    model_dropdown = gr.Dropdown(choices=model_urls, label="Select Model", value=model_urls[0])
    audio_input = gr.Audio(type="numpy", label="Input Audio")
    output_text = gr.Textbox(label="Transcription")
    transcribe_button = gr.Button("Transcribe")
    transcribe_button.click(
        fn=process_audio,
        inputs=[model_dropdown, audio_input],
        outputs=output_text,
    )

demo.launch()
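
# A minimal smoke test (an assumption, not part of the original app): uncomment and run
# this *before* demo.launch() to exercise process_audio() without the UI. It feeds a
# synthetic 440 Hz int16 tone at 48 kHz through the mono/float/resample path; the
# transcription of a pure tone is meaningless, but it verifies the plumbing end to end.
# sr = 48000
# t = np.linspace(0, 1, sr, endpoint=False)
# tone = (0.1 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
# print(process_audio(model_urls[0], (sr, tone)))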