jiuuee committed on
Commit 9b700a4
1 Parent(s): 50f06a3

Update app.py

Files changed (1)
  1. app.py +29 -171
app.py CHANGED
@@ -1,199 +1,57 @@
- '''
- import gradio as gr
- from transformers import pipeline
-
- # Load pipelines for Canary ASR, LLama3 QA, and VITS TTS
- asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b", device=0)
- qa_pipeline = pipeline("question-answering", model="LLAMA/llama3-base-qa", tokenizer="LLAMA/llama3-base-qa")
- tts_pipeline = pipeline("text-to-speech", model="patrickvonplaten/vits-large", device=0)
- '''
-
  import gradio as gr
  import json
- import librosa
  import os
- import soundfile as sf
  import tempfile
  import uuid
  from transformers import pipeline
+ import librosa
+ import soundfile as sf

- import torch
-
- from nemo.collections.asr.models import ASRModel
- from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
- from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
-
- SAMPLE_RATE = 16000 # Hz
- MAX_AUDIO_SECS = 30 # wont try to transcribe if longer than this
+ SAMPLE_RATE = 16000 # Hz
+ MAX_AUDIO_SECS = 30 # Maximum duration of audio in seconds
  src_lang = "en"
  tgt_lang = "en"
- pnc="no"
-
- model = ASRModel.from_pretrained("nvidia/canary-1b")
- model.eval()
-
- # make sure beam size always 1 for consistency
- model.change_decoding_strategy(None)
- decoding_cfg = model.cfg.decoding
- decoding_cfg.beam.beam_size = 1
- model.change_decoding_strategy(decoding_cfg)
-
- # setup for buffered inference
- model.cfg.preprocessor.dither = 0.0
- model.cfg.preprocessor.pad_to = 0
-
- feature_stride = model.cfg.preprocessor['window_stride']
- model_stride_in_secs = feature_stride * 8 # 8 = model stride, which is 8 for FastConformer
-
- frame_asr = FrameBatchMultiTaskAED(
-     asr_model=model,
-     frame_len=40.0,
-     total_buffer=40.0,
-     batch_size=16,
- )
-
- amp_dtype = torch.float16
-
-
- def convert_audio(audio_filepath, tmpdir, utt_id):
-     """
-     Convert all files to monochannel 16 kHz wav files.
-     Do not convert and raise error if audio too long.
-     Returns output filename and duration.
-     """
-     data, sr = librosa.load(audio_filepath, sr=None, mono=True)
-
-     duration = librosa.get_duration(y=data, sr=sr)
-
-     if duration > MAX_AUDIO_SECS:
-         raise gr.Error(
-             f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio. "
-             "If you wish, you may trim the audio using the Audio viewer in Step 1 "
-             "(click on the scissors icon to start trimming audio)."
-         )
-
-     if sr != SAMPLE_RATE:
-         data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
-
-     out_filename = os.path.join(tmpdir, utt_id + '.wav')
-
-     # save output audio
-     sf.write(out_filename, data, SAMPLE_RATE)
-
-     return out_filename, duration
-
+ pnc = "no"

  # Load the ASR pipeline
  asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b")

- def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
-
-     if audio_filepath is None:
-         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
-
-     utt_id = uuid.uuid4()
-     with tempfile.TemporaryDirectory() as tmpdir:
-         converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
-
-         # make manifest file and save
-         manifest_data = {
-             "audio_filepath": converted_audio_filepath,
-             "source_lang": src_lang,
-             "target_lang": tgt_lang,
-             "taskname": taskname,
-             "pnc": pnc,
-             "answer": "predict",
-             "duration": str(duration),
-         }
-
-         manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')

-         with open(manifest_filepath, 'w') as fout:
-             line = json.dumps(manifest_data)
-             fout.write(line + '\n')
-
-         # call transcribe, passing in manifest filepath
-         if duration < 40:
-             output_text = model.transcribe(manifest_filepath)[0]
-         else: # do buffered inference
-             with torch.cuda.amp.autocast(dtype=amp_dtype): # TODO: make it work if no cuda
-                 with torch.no_grad():
-                     hyps = get_buffered_pred_feat_multitaskAED(
-                         frame_asr,
-                         model.cfg.preprocessor,
-                         model_stride_in_secs,
-                         model.device,
-                         manifest=manifest_filepath,
-                         filepaths=None,
-                     )
-
-                     output_text = hyps[0].text
-
-     return output_text
-
-
-
- with gr.Blocks(
-     title="NeMo Canary Model",
-     css="""
-     textarea { font-size: 18px;}
-     #model_output_text_box span {
-         font-size: 18px;
-         font-weight: bold;
-     }
-     """,
-     theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
- ) as demo:
-
-     gr.HTML("<h1 style='text-align: center'>NeMo Canary model: Transcribe & Translate audio</h1>")
-
-     with gr.Row():
-         with gr.Column():
-             gr.HTML(
-                 "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
-
-                 "<p style='color: #A0A0A0;'>This demo supports audio files up to 10 mins long. "
-                 "You can transcribe longer files locally with this NeMo "
-                 "<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py'>script</a>.</p>"
-             )
+ def convert_audio(audio_filepath, tmpdir, utt_id):
+     """
+     Convert audio file to 16 kHz mono WAV format.
+     Returns output filename and duration.
+     """
+     data, sr = librosa.load(audio_filepath, sr=None, mono=True)
+     duration = librosa.get_duration(y=data, sr=sr)

-             audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
+     if duration > MAX_AUDIO_SECS:
+         raise gr.Error(f"Maximum audio duration exceeded. Please provide an audio file of up to {MAX_AUDIO_SECS} seconds.")

-             gr.HTML("<p><b>Step 2:</b> Choose the input and output language.</p>")
+     if sr != SAMPLE_RATE:
+         data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)

-
-         with gr.Column():
+     out_filename = os.path.join(tmpdir, f"{utt_id}.wav")
+     sf.write(out_filename, data, SAMPLE_RATE)

-             gr.HTML("<p><b>Step 3:</b> Run the model.</p>")
+     return out_filename, duration

-             go_button = gr.Button(
-                 value="Run model",
-                 variant="primary", # make "primary" so it stands out (default is "secondary")
-             )

-             model_output_text_box = gr.Textbox(
-                 label="Model Output",
-                 elem_id="model_output_text_box",
-             )
+ def transcribe(audio_filepath):
+     if audio_filepath is None:
+         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

-     with gr.Row():
+     utt_id = uuid.uuid4()
+     with tempfile.TemporaryDirectory() as tmpdir:
+         converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
+         transcribed_text = asr_pipeline(converted_audio_filepath, sampling_rate=SAMPLE_RATE)[0]["transcription"]

-         gr.HTML(
-             "<p style='text-align: center'>"
-             "🐤 <a href='https://huggingface.co/nvidia/canary-1b' target='_blank'>Canary model</a> | "
-             "🧑‍💻 <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
-             "</p>"
-         )
+     return transcribed_text

-     go_button.click(
-         fn=transcribe,
-         inputs = [audio_file],
-         outputs = [model_output_text_box]
-     )

+ with gr.Interface(transcribe, gr.inputs.Audio(), "text", title="ASR with NeMo Canary Model") as iface:
+     iface.launch()

-     demo.queue()
-     demo.launch()

  '''
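
A note on the added code path, since two of the new calls follow older API conventions: for a single input, the transformers automatic-speech-recognition pipeline returns a dict whose transcript is under the "text" key (not a list with a "transcription" field), and the gr.inputs namespace was removed in Gradio 4 in favour of gr.Audio. The sketch below is one possible wiring under current transformers and Gradio 4.x APIs; it reuses the names from this commit (convert_audio, transcribe, asr_pipeline) but is not the committed file, and whether nvidia/canary-1b, which is distributed as a NeMo checkpoint, loads through pipeline() at all is an assumption left open here.

# Hedged sketch, not the committed file: one way to wire the simplified
# transcribe() to Gradio 4.x and the transformers ASR pipeline.
import os
import tempfile
import uuid

import gradio as gr
import librosa
import soundfile as sf
from transformers import pipeline

SAMPLE_RATE = 16000   # Hz
MAX_AUDIO_SECS = 30   # maximum accepted duration in seconds

# Assumption: the checkpoint is loadable by the transformers pipeline.
# nvidia/canary-1b is a NeMo model, so a transformers-native ASR model
# may have to be substituted here.
asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b")


def convert_audio(audio_filepath, tmpdir, utt_id):
    """Convert the input to 16 kHz mono WAV; return the new path and duration."""
    data, sr = librosa.load(audio_filepath, sr=None, mono=True)
    duration = librosa.get_duration(y=data, sr=sr)

    if duration > MAX_AUDIO_SECS:
        raise gr.Error(f"Please provide an audio file of up to {MAX_AUDIO_SECS} seconds.")

    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)

    out_filename = os.path.join(tmpdir, f"{utt_id}.wav")
    sf.write(out_filename, data, SAMPLE_RATE)
    return out_filename, duration


def transcribe(audio_filepath):
    if audio_filepath is None:
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

    utt_id = uuid.uuid4()
    with tempfile.TemporaryDirectory() as tmpdir:
        converted_audio_filepath, _duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
        # For a single input the pipeline returns a dict; the transcript is under "text".
        result = asr_pipeline(converted_audio_filepath)
    return result["text"]


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="ASR with NeMo Canary Model",
)

if __name__ == "__main__":
    iface.launch()

The removed code shows the alternative the upstream demo takes: loading the checkpoint with nemo.collections.asr.models.ASRModel and transcribing via a manifest (with buffered inference for long inputs), which avoids the transformers-compatibility question entirely.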