import spaces
import gradio as gr
import numpy as np
import torch
from peft import PeftModel, PeftConfig
from transformers import (
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
    AutomaticSpeechRecognitionPipeline,
)

peft_model_id = "mfidabel/Modelo_3_Whisper_Large_V3"
language = "guarani"
task = "transcribe"

# Load the base Whisper model and merge the LoRA adapter into it
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path,
    load_in_8bit=False,
    device_map="cuda:0",
)
model = PeftModel.from_pretrained(model, peft_model_id)
model = model.merge_and_unload()

tokenizer = WhisperTokenizer.from_pretrained(
    peft_config.base_model_name_or_path, language=language, task=task
)
processor = WhisperProcessor.from_pretrained(
    peft_config.base_model_name_or_path, language=language, task=task
)
feature_extractor = processor.feature_extractor

# Whisper has no Guarani language token, so the English token is used as the
# decoder prompt, matching how the adapter was fine-tuned
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task=task)

pipeline = AutomaticSpeechRecognitionPipeline(
    model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
)


@spaces.GPU
def transcribe(audio):
    if audio is None:
        return "¡Espera a que la grabación termine de subirse al servidor! Inténtalo de nuevo en unos segundos."

    sr, y = audio
    y = y.astype(np.float32)
    # Downmix stereo recordings to mono and normalize to [-1, 1],
    # guarding against division by zero on silent clips
    if y.ndim > 1:
        y = y.mean(axis=1)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    with torch.autocast("cuda"):
        return pipeline(
            {"sampling_rate": sr, "raw": y},
            generate_kwargs={"forced_decoder_ids": forced_decoder_ids},
            max_new_tokens=255,
        )["text"]


examples = [
    "./examples/audio_1.mp3",
    "./examples/audio_2.mp3",
    "./examples/audio_3.mp3",
    "./examples/audio_4.mp3",
]

title = "# 🇵🇾 Reconocimiento de Voz en Guaraní"

description = """Esta es una demostración del reconocimiento de voz en Guaraní utilizando el modelo speech-to-text [Whisper](https://arxiv.org/pdf/2212.04356.pdf).

Autores:

- Mateo Andrés Fidabel Gill
- Santiago Ruben Acevedo Zarza
"""

audio_input = gr.Audio(
    value="./examples/audio_1.mp3",
    sources=["upload", "microphone"],
    label="🎤 Audio a transcribir",
    interactive=True,
)
transcription = gr.Textbox(label="📝 Transcripción", interactive=False)

with gr.Blocks() as demo:
    with gr.Row():
        # Model title and description
        gr.Markdown(title)
        gr.Markdown(description)

    with gr.Row():
        # Audio input
        audio_input.render()

    with gr.Row():
        # Text output
        transcription.render()

    with gr.Row():
        # Submit button
        submit = gr.Button("📝 Transcribir el Audio")

    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[audio_input],
            outputs=[transcription],
            fn=transcribe,
            label="Ejemplos",
        )

    submit.click(transcribe, inputs=[audio_input], outputs=[transcription])

demo.queue()
demo.launch(share=True)