import gradio as gr
import torch
from transformers import pipeline
import numpy as np
import time

# Hugging Face task identifier shared by every pipeline built below.
_TAREA = "automatic-speech-recognition"

# One checkpoint per pipeline, in the order modelo_1, modelo_2, modelo_3.
# NOTE(review): all three entries are the same checkpoint, so the comparison
# in the UI runs one model three times -- confirm whether modelo_2 / modelo_3
# were meant to point at different checkpoints.
_CHECKPOINTS = (
    "IABDs8a/MODELO1_EQUIPO2",
    "IABDs8a/MODELO1_EQUIPO2",
    "IABDs8a/MODELO1_EQUIPO2",
)

# Pipelines are created once at import time so each request reuses them.
modelo_1, modelo_2, modelo_3 = [
    pipeline(_TAREA, model=nombre) for nombre in _CHECKPOINTS
]

def greet(grabacion):
    """Transcribe one recording with the three ASR pipelines and time each run.

    Args:
        grabacion: ``(sample_rate, samples)`` tuple as delivered by the
            ``gr.Audio`` input, or ``None`` when the form is submitted
            without any audio.

    Returns:
        A 7-tuple ``(text_1, seconds_1, text_2, seconds_2, text_3, seconds_3,
        total_seconds)``. Each text is prefixed with the model label
        (``"small:"``). Each ``seconds_i`` covers only that pipeline call;
        ``total_seconds`` covers the whole function, preprocessing included.

    Raises:
        gr.Error: if no audio was provided or the recording has no samples,
            so the UI shows a readable message instead of a traceback.
    """
    # perf_counter is monotonic, so intervals are immune to clock adjustments.
    inicio = time.perf_counter()

    if grabacion is None:
        raise gr.Error("No se ha recibido ningún audio. Graba o sube un audio.")

    sr, y = grabacion
    y = np.asarray(y)
    if y.size == 0:
        raise gr.Error("El audio recibido está vacío.")

    # Multi-channel input arrives as (samples, channels); the pipeline expects
    # a 1-D mono signal, so average the channels.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Convert the samples to 32-bit floats (astype copies, so the caller's
    # array is never modified by the in-place scaling below).
    y = y.astype(np.float32)

    # Peak-normalise to [-1, 1]; skip it for silent audio, where the peak is 0
    # and the division would fill the signal with NaNs.
    pico = np.max(np.abs(y))
    if pico > 0:
        y /= pico

    # (label, pipeline) pairs, in the order expected by the output widgets.
    modelos = (
        ("small", modelo_1),
        ("small", modelo_2),
        ("small", modelo_3),
    )

    salidas = []
    for etiqueta, pipe in modelos:
        # A fresh dict per call: the pipeline may consume/alter its input dict.
        comienzo = time.perf_counter()
        texto = pipe({"sampling_rate": sr, "raw": y})["text"]
        duracion = time.perf_counter() - comienzo
        salidas.append(etiqueta + ":" + texto)
        salidas.append(duracion)

    total = time.perf_counter() - inicio
    return (*salidas, total)

# Single audio input; its value is handed to `greet` as the only argument.
_entradas = [gr.Audio()]

# Output widgets, listed in the same order as the tuple returned by `greet`:
# (text, time) for each of the three models, followed by the total time.
_salidas = [
    gr.Text(label="Resultado modelo usuario con más audios"),
    gr.Number(label="Tiempo modelo 1:"),
    gr.Text(label="Resultado modelo 3 usuarios con más audios"),
    gr.Number(label="Tiempo modelo 2:"),
    gr.Text(label="Resultado modelo entida con más audios"),
    gr.Number(label="Tiempo modelo 3:"),
    gr.Number(label="Tiempo total"),
]

# Build the web UI around `greet` and start serving it.
demo = gr.Interface(fn=greet, inputs=_entradas, outputs=_salidas)
demo.launch()