from nemo.collections.asr.models import EncDecMultiTaskModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    VitsModel,
    VitsTokenizer,
    pipeline,
    set_seed,
)
import gradio as gr
import numpy as np
import os
import soundfile as sf
import tempfile
import torch

# Dependencies needed for the imports above:
#   gradio, transformers, torch, soundfile, numpy,
#   nemo_toolkit[asr] (which pulls in hydra, librosa, and sentencepiece),
#   accelerate (required by the device_map argument below).

# Load the Canary ASR model and switch to greedy decoding (beam size 1).
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)

# Load the Phi-3 chat model that generates a text reply to the transcript.
torch.random.manual_seed(0)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,  # ignored when do_sample=False
    "do_sample": False,
}

# Load the MMS-TTS (VITS) model that speaks the generated reply.
tokenizer_vits = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model_vits = VitsModel.from_pretrained("facebook/mms-tts-eng")


def transcribe_audio(audio):
    """Transcribe input audio, generate a text reply, and synthesize it to speech."""
    audio_list, sample_rate = sf.read(audio)
    # Downmix multichannel audio to mono.
    if audio_list.ndim > 1:
        audio_list = np.mean(audio_list, axis=1)

    # Create a temporary file to hand the audio to NeMo
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
        temp_audio_path = temp_audio_file.name
        # Save the audio data to the temporary file
        sf.write(temp_audio_path, audio_list, sample_rate)

    # Transcribe audio using the Canary model
    predicted_text = canary_model.transcribe(
        paths2audio_files=[temp_audio_path], batch_size=16
    )

    # Remove the temporary file
    os.remove(temp_audio_path)

    # Generate a reply to the transcript with Phi-3
    messages = [{"role": "user", "content": predicted_text[0]}]
    output_text = pipe(messages, **generation_args)

    # Synthesize the reply with VITS
    inputs_vits = tokenizer_vits(
        text=output_text[0]["generated_text"], return_tensors="pt"
    )
    set_seed(555)  # make synthesis deterministic
    with torch.no_grad():
        outputs_vits = model_vits(**inputs_vits)
    waveform = outputs_vits.waveform[0]

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file_2:
        temp_audio_path_2 = temp_audio_file_2.name
        # Save the synthesized audio to the temporary file
        sf.write(temp_audio_path_2, waveform.numpy(), model_vits.config.sampling_rate)
    return temp_audio_path_2


# Create the Gradio interface. Current Gradio versions expose components
# directly as gr.Audio (the old .inputs/.outputs modules are gone).
audio_input = gr.Audio(
    sources=["upload", "microphone"], type="filepath", label="Record Audio"
)
audio_output = gr.Audio(label="Audio Output")
interface = gr.Interface(fn=transcribe_audio, inputs=audio_input, outputs=audio_output)

# Launch the interface
interface.launch()
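
# --- Optional smoke test (a sketch, not part of the original app) ---
# Assumption: "sample.wav" is a hypothetical local recording. Comment out
# interface.launch() above before running this, since launch() blocks.
# This exercises the full ASR -> LLM -> TTS chain without the web UI:
#
#   out_path = transcribe_audio("sample.wav")
#   print("Synthesized reply written to", out_path)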