from nemo.collections.asr.models import EncDecMultiTaskModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    VitsModel,
    VitsTokenizer,
    pipeline,
    set_seed,
)
import gradio as gr
import numpy as np
import os
import soundfile as sf
import tempfile
import torch

# Dependencies needed for the imports above:
#   gradio, transformers, torch, soundfile, numpy,
#   nemo_toolkit[asr] (which pulls in hydra, librosa, and sentencepiece),
#   accelerate (required by the device_map argument below).

# Load the Canary ASR model and switch to greedy decoding (beam size 1).
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)

# Load the Phi-3 chat model that generates a text reply to the transcript.
torch.random.manual_seed(0)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,  # ignored when do_sample=False
    "do_sample": False,
}

# Load the MMS-TTS (VITS) model that speaks the generated reply.
tokenizer_vits = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model_vits = VitsModel.from_pretrained("facebook/mms-tts-eng")


def transcribe_audio(audio):
    """Transcribe input audio, generate a text reply, and synthesize it to speech."""
    audio_list, sample_rate = sf.read(audio)
    # Downmix multichannel audio to mono.
    if audio_list.ndim > 1:
        audio_list = np.mean(audio_list, axis=1)

    # Create a temporary file to hand the audio to NeMo
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
        temp_audio_path = temp_audio_file.name
        # Save the audio data to the temporary file
        sf.write(temp_audio_path, audio_list, sample_rate)

    # Transcribe audio using the Canary model
    predicted_text = canary_model.transcribe(
        paths2audio_files=[temp_audio_path], batch_size=16
    )

    # Remove the temporary file
    os.remove(temp_audio_path)

    # Generate a reply to the transcript with Phi-3
    messages = [{"role": "user", "content": predicted_text[0]}]
    output_text = pipe(messages, **generation_args)

    # Synthesize the reply with VITS
    inputs_vits = tokenizer_vits(
        text=output_text[0]["generated_text"], return_tensors="pt"
    )
    set_seed(555)  # make synthesis deterministic
    with torch.no_grad():
        outputs_vits = model_vits(**inputs_vits)
    waveform = outputs_vits.waveform[0]

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file_2:
        temp_audio_path_2 = temp_audio_file_2.name
        # Save the synthesized audio to the temporary file
        sf.write(temp_audio_path_2, waveform.numpy(), model_vits.config.sampling_rate)
    return temp_audio_path_2


# Create the Gradio interface. Current Gradio versions expose components
# directly as gr.Audio (the old .inputs/.outputs modules are gone).
audio_input = gr.Audio(
    sources=["upload", "microphone"], type="filepath", label="Record Audio"
)
audio_output = gr.Audio(label="Audio Output")
interface = gr.Interface(fn=transcribe_audio, inputs=audio_input, outputs=audio_output)

# Launch the interface
interface.launch()
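
# --- Optional smoke test (a sketch, not part of the original app) ---
# Assumption: "sample.wav" is a hypothetical local recording. Comment out
# interface.launch() above before running this, since launch() blocks.
# This exercises the full ASR -> LLM -> TTS chain without the web UI:
#
#   out_path = transcribe_audio("sample.wav")
#   print("Synthesized reply written to", out_path)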