jiuuee committed on
Commit
2f54f28
•
1 Parent(s): 894f789

Update app.py

Files changed (1)
  1. app.py +168 -30
app.py CHANGED
@@ -1,57 +1,195 @@
+'''
+import gradio as gr
+from transformers import pipeline
+# Load pipelines for Canary ASR, LLama3 QA, and VITS TTS
+asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b", device=0)
+qa_pipeline = pipeline("question-answering", model="LLAMA/llama3-base-qa", tokenizer="LLAMA/llama3-base-qa")
+tts_pipeline = pipeline("text-to-speech", model="patrickvonplaten/vits-large", device=0)
+'''
+
 import gradio as gr
 import json
+import librosa
 import os
+import soundfile as sf
 import tempfile
 import uuid
 from transformers import pipeline
-import librosa
-import soundfile as sf
 
-SAMPLE_RATE = 16000 # Hz
-MAX_AUDIO_SECS = 30 # Maximum duration of audio in seconds
+import torch
+
+from nemo.collections.asr.models import ASRModel
+from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
+from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
+
+SAMPLE_RATE = 16000 # Hz
+MAX_AUDIO_SECS = 30 # won't try to transcribe if longer than this
 src_lang = "en"
 tgt_lang = "en"
-pnc = "no"
+pnc="no"
 
-# Load the ASR pipeline
-asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b")
+model = ASRModel.from_pretrained("nvidia/canary-1b")
+model.eval()
+
+# make sure beam size always 1 for consistency
+model.change_decoding_strategy(None)
+decoding_cfg = model.cfg.decoding
+decoding_cfg.beam.beam_size = 1
+model.change_decoding_strategy(decoding_cfg)
+
+# setup for buffered inference
+model.cfg.preprocessor.dither = 0.0
+model.cfg.preprocessor.pad_to = 0
+
+feature_stride = model.cfg.preprocessor['window_stride']
+model_stride_in_secs = feature_stride * 8 # 8 = model stride, which is 8 for FastConformer
+
+frame_asr = FrameBatchMultiTaskAED(
+    asr_model=model,
+    frame_len=40.0,
+    total_buffer=40.0,
+    batch_size=16,
+)
+
+amp_dtype = torch.float16
 
 
 def convert_audio(audio_filepath, tmpdir, utt_id):
-    """
-    Convert audio file to 16 kHz mono WAV format.
-    Returns output filename and duration.
-    """
-    data, sr = librosa.load(audio_filepath, sr=None, mono=True)
-    duration = librosa.get_duration(y=data, sr=sr)
-
-    if duration > MAX_AUDIO_SECS:
-        raise gr.Error(f"Maximum audio duration exceeded. Please provide an audio file of up to {MAX_AUDIO_SECS} seconds.")
-
-    if sr != SAMPLE_RATE:
-        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
-
-    out_filename = os.path.join(tmpdir, f"{utt_id}.wav")
-    sf.write(out_filename, data, SAMPLE_RATE)
-
-    return out_filename, duration
+    """
+    Convert all files to monochannel 16 kHz wav files.
+    Do not convert and raise error if audio too long.
+    Returns output filename and duration.
+    """
+    data, sr = librosa.load(audio_filepath, sr=None, mono=True)
+
+    duration = librosa.get_duration(y=data, sr=sr)
+
+    if duration > MAX_AUDIO_SECS:
+        raise gr.Error(
+            f"This demo can transcribe up to {MAX_AUDIO_SECS} seconds of audio. "
+            "If you wish, you may trim the audio using the Audio viewer in Step 1 "
+            "(click on the scissors icon to start trimming audio)."
+        )
+
+    if sr != SAMPLE_RATE:
+        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
+
+    out_filename = os.path.join(tmpdir, utt_id + '.wav')
+
+    # save output audio
+    sf.write(out_filename, data, SAMPLE_RATE)
+
+    return out_filename, duration
 
 
-def transcribe(audio_filepath):
-    if audio_filepath is None:
-        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
-
-    utt_id = uuid.uuid4()
-    with tempfile.TemporaryDirectory() as tmpdir:
-        converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
-        transcribed_text = asr_pipeline(converted_audio_filepath, sampling_rate=SAMPLE_RATE)[0]["transcription"]
-
-    return transcribed_text
-
-
-with gr.Interface(transcribe, gr.inputs.Audio(), "text", title="ASR with NeMo Canary Model") as iface:
-    iface.launch()
+def transcribe(audio_filepath, src_lang=src_lang, tgt_lang=tgt_lang, pnc=pnc):
+
+    if audio_filepath is None:
+        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
+
+    utt_id = uuid.uuid4()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
+
+        # make manifest file and save
+        manifest_data = {
+            "audio_filepath": converted_audio_filepath,
+            "source_lang": src_lang,
+            "target_lang": tgt_lang,
+            "taskname": "asr",  # src_lang == tgt_lang, so the task is plain transcription
+            "pnc": pnc,
+            "answer": "predict",
+            "duration": str(duration),
+        }
+
+        manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
+
+        with open(manifest_filepath, 'w') as fout:
+            line = json.dumps(manifest_data)
+            fout.write(line + '\n')
+
+        # call transcribe, passing in manifest filepath
+        if duration < 40:
+            output_text = model.transcribe(manifest_filepath)[0]
+        else: # do buffered inference
+            with torch.cuda.amp.autocast(dtype=amp_dtype): # TODO: make it work if no cuda
+                with torch.no_grad():
+                    hyps = get_buffered_pred_feat_multitaskAED(
+                        frame_asr,
+                        model.cfg.preprocessor,
+                        model_stride_in_secs,
+                        model.device,
+                        manifest=manifest_filepath,
+                        filepaths=None,
+                    )
+
+                    output_text = hyps[0].text
+
+    return output_text
+
+
+with gr.Blocks(
+    title="NeMo Canary Model",
+    css="""
+    textarea { font-size: 18px;}
+    #model_output_text_box span {
+        font-size: 18px;
+        font-weight: bold;
+    }
+    """,
+    theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md)
+) as demo:
+
+    gr.HTML("<h1 style='text-align: center'>NeMo Canary model: Transcribe & Translate audio</h1>")
+
+    with gr.Row():
+        with gr.Column():
+            gr.HTML(
+                "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
+
+                "<p style='color: #A0A0A0;'>This demo supports audio files up to 30 seconds long. "
+                "You can transcribe longer files locally with this NeMo "
+                "<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py'>script</a>.</p>"
+            )
+
+            audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
+
+            gr.HTML("<p><b>Step 2:</b> Choose the input and output language.</p>")
+
+        with gr.Column():
+
+            gr.HTML("<p><b>Step 3:</b> Run the model.</p>")
+
+            go_button = gr.Button(
+                value="Run model",
+                variant="primary", # make "primary" so it stands out (default is "secondary")
+            )
+
+            model_output_text_box = gr.Textbox(
+                label="Model Output",
+                elem_id="model_output_text_box",
+            )
+
+    with gr.Row():
+
+        gr.HTML(
+            "<p style='text-align: center'>"
+            "🐤 <a href='https://huggingface.co/nvidia/canary-1b' target='_blank'>Canary model</a> | "
+            "🧑‍💻 <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
+            "</p>"
+        )
 
+    go_button.click(
+        fn=transcribe,
+        inputs=[audio_file],
+        outputs=[model_output_text_box]
+    )
 
+demo.queue()
+demo.launch()
 
 '''
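
For reference, a minimal sketch of the single manifest line that the new transcribe() writes before calling model.transcribe(). The field names come straight from the diff above; the temp path, utterance id, and duration values here are purely illustrative:

# illustrative only -- shows the shape of the Canary manifest line built in transcribe()
import json

manifest_line = {
    "audio_filepath": "/tmp/tmpdir/example-utt.wav",  # hypothetical converted 16 kHz wav
    "source_lang": "en",
    "target_lang": "en",
    "taskname": "asr",
    "pnc": "no",
    "answer": "predict",
    "duration": "12.5",  # illustrative duration in seconds, stored as a string
}
print(json.dumps(manifest_line))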