Spaces:

Kindler
/

197zAlexa

Runtime error

App Files Files Community

Kindler committed on May 2

Commit

dff36fd

•

1 Parent(s): b40931a

Create app.py

Browse files

Files changed (1) hide show

app.py +129 -0

app.py ADDED Viewed

	@@ -0,0 +1,129 @@

+from nemo.collections.asr.models import EncDecMultiTaskModel
+import gradio as gr
+import torch
+import json
+import numpy as np
+import soundfile as sf
+import tempfile
+from transformers import VitsTokenizer, VitsModel, set_seed
+# The imports above require the following packages to be installed:
+#gradio transformers
+#nemo
+#hydra
+#librosa
+#sentencepiece
+#
+#
+# Load the pretrained 'nvidia/canary-1b' multi-task speech model once, at import time.
+canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
+# Take the model's decoding config, set the beam size to 1, and apply the
+# modified config back to the model.
+decode_cfg = canary_model.cfg.decoding
+decode_cfg.beam.beam_size = 1
+canary_model.change_decoding_strategy(decode_cfg)
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+# NOTE: the `accelerate` package must be installed (presumably required by the
+# device_map argument used below -- confirm against the transformers docs).
+# Seed torch's RNG before the language model is loaded.
+torch.random.manual_seed(0)
+# Causal language model used to generate the text reply; loaded on CPU.
+# trust_remote_code=True allows code shipped with the model repository to run.
+model = AutoModelForCausalLM.from_pretrained(
+    "microsoft/Phi-3-mini-128k-instruct",
+    device_map="cpu",
+    torch_dtype="auto",
+    trust_remote_code=True,
+)
+tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
+# NOTE(review): this module-level list is not read anywhere in the visible
+# code; transcribe_audio builds its own local `messages`.
+messages = []
+# Text-generation pipeline wrapping the model and tokenizer loaded above.
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+)
+# Keyword arguments forwarded to every `pipe(...)` call: cap the reply at 500
+# new tokens, return only the generated continuation, and disable sampling.
+generation_args = {
+    "max_new_tokens": 500,
+    "return_full_text": False,
+    "temperature": 0.0,
+    "do_sample": False,
+}
+# Tokenizer and model of the "facebook/mms-tts-eng" VITS checkpoint, used to
+# turn the generated reply into a waveform.
+tokenizer_vits = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
+model_vits = VitsModel.from_pretrained("facebook/mms-tts-eng")
+# Define the function to transcribe audio
+def transcribe_audio(audio):
+    """Answer a spoken request with synthesized speech.
+
+    Despite its name, this function runs the whole voice pipeline:
+    speech-to-text with ``canary_model``, reply generation with ``pipe``,
+    and text-to-speech with ``model_vits``.
+
+    Args:
+        audio: Path of the audio file handed over by the Gradio audio input,
+            or ``None`` when nothing was recorded or uploaded.
+
+    Returns:
+        Path of a temporary ``.wav`` file holding the spoken reply, or
+        ``None`` when no input audio was provided. The returned file is left
+        on disk on purpose so the caller can read it.
+    """
+    import os  # local import: only needed for temp-file cleanup
+
+    # Nothing was recorded or uploaded: there is nothing to answer.
+    if audio is None:
+        return None
+
+    audio_list, sample_rate = sf.read(audio)
+    # Down-mix multi-channel audio to a single channel by averaging.
+    if audio_list.ndim > 1:
+        audio_list = np.mean(audio_list, axis=1)
+
+    # Reserve a temporary path and close the handle before writing to it, so
+    # the write also works on platforms that forbid reopening an open file.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
+        temp_audio_path = temp_audio_file.name
+    try:
+        sf.write(temp_audio_path, audio_list, sample_rate)
+        # Transcribe audio using the canary model
+        predicted_text = canary_model.transcribe(paths2audio_files=[temp_audio_path], batch_size=16)
+    finally:
+        # The intermediate file is only needed during transcription; remove
+        # it so repeated requests do not pile up files in the temp directory.
+        os.remove(temp_audio_path)
+
+    # Feed the transcription to the language model as a single user turn.
+    messages = [{"role": "user", "content": predicted_text[0]}]
+    output_text = pipe(messages, **generation_args)
+
+    # Synthesize the generated reply.
+    inputs_vits = tokenizer_vits(text=output_text[0]["generated_text"], return_tensors="pt")
+    set_seed(555)  # make deterministic
+    with torch.no_grad():
+        outputs_vits = model_vits(**inputs_vits)
+    waveform = outputs_vits.waveform[0]
+
+    # The reply file must outlive this function, hence delete=False and no cleanup.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file_2:
+        temp_audio_path_2 = temp_audio_file_2.name
+    sf.write(temp_audio_path_2, waveform.numpy(), model_vits.config.sampling_rate)
+    return temp_audio_path_2
+# Create the Gradio interface
+import gradio as gr
+# Gradio replaced the old .inputs / .outputs namespaces with .components.
+# Input widget: audio that is uploaded or recorded from the microphone and
+# passed to the callback as a file path (type="filepath").
+audio_input = gr.components.Audio(sources=["upload","microphone"], type="filepath", label="Record Audio")
+# Output widget: plays back whatever the callback returns.
+audio_output = gr.components.Audio(label="Audio Output")
+# Wire transcribe_audio between the input and output widgets.
+interface = gr.Interface(fn=transcribe_audio, inputs=audio_input, outputs=audio_output)
+# Launch the interface
+interface.launch()