Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,3 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
'''
|
6 |
import gradio as gr
|
7 |
from transformers import pipeline
|
@@ -11,6 +7,7 @@ asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b"
|
|
11 |
qa_pipeline = pipeline("question-answering", model="LLAMA/llama3-base-qa", tokenizer="LLAMA/llama3-base-qa")
|
12 |
tts_pipeline = pipeline("text-to-speech", model="patrickvonplaten/vits-large", device=0)
|
13 |
'''
|
|
|
14 |
import gradio as gr
|
15 |
import json
|
16 |
import librosa
|
@@ -26,7 +23,7 @@ from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTask
|
|
26 |
from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
|
27 |
|
28 |
SAMPLE_RATE = 16000 # Hz
|
29 |
-
|
30 |
|
31 |
model = ASRModel.from_pretrained("nvidia/canary-1b")
|
32 |
model.eval()
|
@@ -59,12 +56,11 @@ def convert_audio(audio_filepath, tmpdir, utt_id):
|
|
59 |
Do not convert and raise error if audio too long.
|
60 |
Returns output filename and duration.
|
61 |
"""
|
62 |
-
|
63 |
data, sr = librosa.load(audio_filepath, sr=None, mono=True)
|
64 |
|
65 |
duration = librosa.get_duration(y=data, sr=sr)
|
66 |
|
67 |
-
if duration
|
68 |
raise gr.Error(
|
69 |
f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio. "
|
70 |
"If you wish, you may trim the audio using the Audio viewer in Step 1 "
|
@@ -92,9 +88,9 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
|
|
92 |
|
93 |
utt_id = uuid.uuid4()
|
94 |
with tempfile.TemporaryDirectory() as tmpdir:
|
95 |
-
converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(
|
96 |
|
97 |
-
|
98 |
manifest_data = {
|
99 |
"audio_filepath": converted_audio_filepath,
|
100 |
"source_lang": src_lang,
|
|
|
|
|
|
|
|
|
|
|
1 |
'''
|
2 |
import gradio as gr
|
3 |
from transformers import pipeline
|
|
|
7 |
qa_pipeline = pipeline("question-answering", model="LLAMA/llama3-base-qa", tokenizer="LLAMA/llama3-base-qa")
|
8 |
tts_pipeline = pipeline("text-to-speech", model="patrickvonplaten/vits-large", device=0)
|
9 |
'''
|
10 |
+
|
11 |
import gradio as gr
|
12 |
import json
|
13 |
import librosa
|
|
|
23 |
from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
|
24 |
|
25 |
SAMPLE_RATE = 16000 # Hz
|
26 |
+
MAX_AUDIO_SECS = 30 # won't try to transcribe if longer than this
|
27 |
|
28 |
model = ASRModel.from_pretrained("nvidia/canary-1b")
|
29 |
model.eval()
|
|
|
56 |
Do not convert and raise error if audio too long.
|
57 |
Returns output filename and duration.
|
58 |
"""
|
|
|
59 |
data, sr = librosa.load(audio_filepath, sr=None, mono=True)
|
60 |
|
61 |
duration = librosa.get_duration(y=data, sr=sr)
|
62 |
|
63 |
+
if duration > MAX_AUDIO_SECS:
|
64 |
raise gr.Error(
|
65 |
f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio. "
|
66 |
"If you wish, you may trim the audio using the Audio viewer in Step 1 "
|
|
|
88 |
|
89 |
utt_id = uuid.uuid4()
|
90 |
with tempfile.TemporaryDirectory() as tmpdir:
|
91 |
+
converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
|
92 |
|
93 |
+
# make manifest file and save
|
94 |
manifest_data = {
|
95 |
"audio_filepath": converted_audio_filepath,
|
96 |
"source_lang": src_lang,
|