# Page header captured along with this file (not part of the program):
#   197zAlexa / app.py -- commit dff36fd "Create app.py" (Kindler, verified)
#   3.13 kB, no virus
from nemo.collections.asr.models import EncDecMultiTaskModel
import gradio as gr
import torch
import json
import numpy as np
import soundfile as sf
import tempfile
from transformers import VitsTokenizer, VitsModel, set_seed
# The imports above only work once these packages are installed:
#   gradio, transformers
#   nemo
#   hydra
#   librosa
#   sentencepiece
# Load the pretrained 'nvidia/canary-1b' checkpoint; transcribe_audio below
# calls canary_model.transcribe() on it to turn speech into text.
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
# Update the decode params: take the model's current decoding config, set the
# beam size to 1, and hand the modified config back to the model.
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Requires the `accelerate` package to be installed.
# Seed torch's RNG before the language model is loaded.
torch.random.manual_seed(0)

# Causal language model, placed on the CPU, with its dtype left to "auto".
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

# Module-level placeholder; transcribe_audio assigns its own local `messages`.
messages = []

# Text-generation pipeline wrapping the model/tokenizer pair loaded above.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Keyword arguments passed to every `pipe(...)` call: sampling is switched
# off, at most 500 new tokens are produced, and the prompt is not echoed back.
generation_args = dict(
    max_new_tokens=500,
    return_full_text=False,
    temperature=0.0,
    do_sample=False,
)
# Tokenizer and model loaded from the "facebook/mms-tts-eng" checkpoint;
# transcribe_audio uses them to turn the generated reply text into a waveform.
tokenizer_vits = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model_vits = VitsModel.from_pretrained("facebook/mms-tts-eng")
# Define the function that transcribes audio, answers it, and speaks the answer
def transcribe_audio(audio):
    """Turn a spoken request into a spoken reply.

    Pipeline: read the recording, average multi-channel audio down to one
    channel, transcribe it with ``canary_model``, send the transcript as a
    single user message to ``pipe``, and synthesise the generated text with
    ``model_vits``.

    Args:
        audio: Path of the recorded/uploaded audio file, or ``None`` when the
            request was submitted without any audio.

    Returns:
        Path of a WAV file containing the synthesised reply, or ``None`` when
        ``audio`` is ``None``.
    """
    import os  # local import: the module's import block does not provide it

    # Nothing was recorded or uploaded: return an empty result instead of
    # crashing inside sf.read().
    if audio is None:
        return None

    audio_list, sample_rate = sf.read(audio)
    if audio_list.ndim > 1:
        # Collapse the second axis (soundfile's channel axis) by averaging --
        # presumably because the ASR model expects single-channel input.
        audio_list = np.mean(audio_list, axis=1)

    # Reserve a temporary file name for the (possibly downmixed) audio. The
    # handle is closed right away so the file can be reopened by path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
        temp_audio_path = temp_audio_file.name
    try:
        # Save the audio data to the temporary file
        sf.write(temp_audio_path, audio_list, sample_rate)
        # Transcribe audio using the canary model
        predicted_text = canary_model.transcribe(
            paths2audio_files=[temp_audio_path], batch_size=16
        )
    finally:
        # Remove the temporary file, even when writing or transcription
        # fails, so requests do not pile up stray WAV files.
        os.remove(temp_audio_path)

    # Ask the language model to answer the transcript as one user turn.
    messages = [{"role": "user", "content": predicted_text[0]}]
    output_text = pipe(messages, **generation_args)

    # Synthesise speech from the generated reply.
    inputs_vits = tokenizer_vits(
        text=output_text[0]["generated_text"], return_tensors="pt"
    )
    set_seed(555)  # make deterministic
    with torch.no_grad():
        outputs_vits = model_vits(**inputs_vits)
    waveform = outputs_vits.waveform[0]

    # The output file is intentionally left on disk: its path is the return
    # value, so it must still exist after this function has finished.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file_2:
        temp_audio_path_2 = temp_audio_file_2.name
    sf.write(temp_audio_path_2, waveform.numpy(), model_vits.config.sampling_rate)
    return temp_audio_path_2
# Build the Gradio UI: one audio input, one audio output.
import gradio as gr

# Original author's note: gradio replaced `.input` / `.output` with
# `.components`, hence the `gr.components.Audio` spelling below.
audio_input = gr.components.Audio(
    sources=["upload", "microphone"],
    type="filepath",
    label="Record Audio",
)
audio_output = gr.components.Audio(label="Audio Output")

# Wire transcribe_audio between the two components.
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs=audio_output,
)

# Start serving the interface.
interface.launch()