Kindler committed on
Commit
dff36fd
1 Parent(s): b40931a

Create app.py

Files changed (1)
  1. app.py +129 -0
app.py ADDED
@@ -0,0 +1,129 @@
+ from nemo.collections.asr.models import EncDecMultiTaskModel
+ import gradio as gr
+ import torch
+ import numpy as np
+ import soundfile as sf
+ import os
+ import tempfile
+ from transformers import VitsTokenizer, VitsModel, set_seed
+
+
+ # Packages required for the imports above:
+ # gradio, transformers, nemo, hydra, librosa, sentencepiece
+
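+ # A sketch of the install command; the PyPI names "nemo_toolkit[asr]" and
+ # "hydra-core" (for nemo and hydra) are assumptions:
+ #   pip install gradio transformers "nemo_toolkit[asr]" hydra-core librosa sentencepiece
+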
+
+ # Load the ASR model (NVIDIA Canary)
+ canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
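+ # (per the model card, canary-1b supports both transcription and translation;
+ # only transcription is used here)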
+
+ # Use greedy decoding (beam size 1)
+ decode_cfg = canary_model.cfg.decoding
+ decode_cfg.beam.beam_size = 1
+ canary_model.change_decoding_strategy(decode_cfg)
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
+ # The accelerate package must also be installed for device_map support
+
+ torch.random.manual_seed(0)
+
+ # Load the LLM that answers the transcribed prompt
+ model = AutoModelForCausalLM.from_pretrained(
+     "microsoft/Phi-3-mini-128k-instruct",
+     device_map="cpu",
+     torch_dtype="auto",
+     trust_remote_code=True,
+ )
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
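+ # device_map="cpu" keeps the model on the CPU; on a GPU machine one could
+ # presumably use device_map="cuda" instead (untested assumption)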
+
+ pipe = pipeline(
+     "text-generation",
+     model=model,
+     tokenizer=tokenizer,
+ )
+
+ generation_args = {
+     "max_new_tokens": 500,
+     "return_full_text": False,
+     "temperature": 0.0,
+     "do_sample": False,
+ }
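+ # Note: with do_sample=False decoding is greedy, so the temperature value is
+ # effectively ignored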
+
+ # Load the TTS model (MMS English VITS) that voices the reply
+ tokenizer_vits = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
+ model_vits = VitsModel.from_pretrained("facebook/mms-tts-eng")
+
+ # Full pipeline: transcribe the audio, generate a reply with the LLM,
+ # and synthesize the reply as speech
+ def transcribe_audio(audio):
+     audio_list, sample_rate = sf.read(audio)
+
+     # Downmix multi-channel audio to mono
+     if audio_list.ndim > 1:
+         audio_list = np.mean(audio_list, axis=1)
+
+     # Save the audio to a temporary WAV file for the ASR model
+     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
+         temp_audio_path = temp_audio_file.name
+     sf.write(temp_audio_path, audio_list, sample_rate)
+
+     # Transcribe the audio with the Canary model
+     predicted_text = canary_model.transcribe(paths2audio_files=[temp_audio_path], batch_size=16)
+
+     # Remove the temporary file
+     os.remove(temp_audio_path)
+
+     # Send the transcription to the LLM as a user message
+     messages = [{"role": "user", "content": predicted_text[0]}]
+     output_text = pipe(messages, **generation_args)
+
+     # Synthesize the generated reply with VITS
+     inputs_vits = tokenizer_vits(text=output_text[0]["generated_text"], return_tensors="pt")
+
+     set_seed(555)  # make the TTS output deterministic
+
+     with torch.no_grad():
+         outputs_vits = model_vits(**inputs_vits)
+
+     waveform = outputs_vits.waveform[0]
+
+     # Write the synthesized speech to a WAV file that Gradio can play
+     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file_2:
+         temp_audio_path_2 = temp_audio_file_2.name
+     sf.write(temp_audio_path_2, waveform.numpy(), model_vits.config.sampling_rate)
+
+     return temp_audio_path_2
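+
+ # A quick sanity check, assuming a local recording "sample.wav" (hypothetical file):
+ #   reply_path = transcribe_audio("sample.wav")  # path to the synthesized spoken reply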
+
+
+ # Create the Gradio interface
+ # (recent Gradio versions replaced gr.inputs/gr.outputs with gr.components)
+ audio_input = gr.components.Audio(sources=["upload", "microphone"], type="filepath", label="Record Audio")
+ audio_output = gr.components.Audio(label="Audio Output")
+ interface = gr.Interface(fn=transcribe_audio, inputs=audio_input, outputs=audio_output)
+
+ # Launch the interface
+ interface.launch()
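+
+ # When running outside Spaces, one could pass share=True to get a public link:
+ #   interface.launch(share=True)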