""" Resources: Canary 1B: https://huggingface.co/nvidia/canary-1b Phi-3-Mini-4K-Instruct: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct VITS TTS: https://huggingface.co/docs/transformers/en/model_doc/vits Blocks and Event Listeners, Gradio Guide: https://www.gradio.app/guides/blocks-and-event-listeners """ import torch from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline import scipy import numpy as np ########################################## Phi 3 torch.random.manual_seed(0) model = AutoModelForCausalLM.from_pretrained( "microsoft/Phi-3-mini-4k-instruct", torch_dtype="auto", trust_remote_code=True, ) tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") pipe = pipeline( "text-generation", model=model, tokenizer=tokenizer, ) generation_args = { "max_new_tokens": 64, "return_full_text": False, "temperature": 0.0, "do_sample": False, } def phi(user_question): messages = [{"role": "system", "content": "What can I do for you today"}, {"role": "user", "content": user_question}] output = pipe(messages, **generation_args) return output ########################################## Canary from nemo.collections.asr.models import EncDecMultiTaskModel # load model canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b') # update dcode params decode_cfg = canary_model.cfg.decoding decode_cfg.beam.beam_size = 1 canary_model.change_decoding_strategy(decode_cfg) ########################################## VITS from transformers import VitsTokenizer, VitsModel, set_seed tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng") vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng") set_seed(555) # make deterministic ########################################## Main import gradio as gr def fromvoice(input): query = canary_model.transcribe(input, batch_size=16) resp = phi(query[0]) voice = tokenizer(text=resp[0]['generated_text'], return_tensors="pt") with torch.no_grad(): v = vits_model(**voice) output=v.waveform[0].numpy() return (vits_model.config.sampling_rate, output) def fromtext(input): resp = phi(input) voice = tokenizer(text=resp[0]['generated_text'], return_tensors="pt") with torch.no_grad(): v = vits_model(**voice) output=v.waveform[0].numpy() return (vits_model.config.sampling_rate, output) Alexa = gr.Blocks() with Alexa: audio_file = gr.Audio(type="filepath") text = gr.Textbox() output=gr.Audio() b1 = gr.Button("From Speech") b2 = gr.Button("From Text") b1.click(fromvoice, inputs=audio_file, outputs=output) b2.click(fromtext, inputs=text, outputs=output) Alexa.launch()