Spaces:
Runtime error
Runtime error
Harshad Bhandwaldar
committed on
Commit
·
ef69a46
1
Parent(s):
a606014
model added
Browse files
- app.py +2 -11
- requirements.txt +0 -1
app.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import os
|
2 |
os.system("pip install nemo_toolkit['all']")
|
3 |
-
import pytube
|
4 |
import gradio as gr
|
5 |
|
6 |
import nemo.collections.asr as nemo_asr
|
@@ -10,11 +9,6 @@ model = nemo_asr.models.EncDecCTCModel.from_pretrained(
|
|
10 |
model_name="stt_en_quartznet15x5"
|
11 |
)
|
12 |
|
13 |
-
def speech_youtube(x):
|
14 |
-
data = pytube.YouTube([f"{x}"])
|
15 |
-
audio = data.streams.get_audio_only()
|
16 |
-
text = model.transcribe(audio.download())
|
17 |
-
return text
|
18 |
|
19 |
def speech_file(x):
|
20 |
# print(x)
|
@@ -115,15 +109,13 @@ with gr.Blocks(css = css) as demo:
|
|
115 |
# Speech to Text - NVIDIA Qaurtznet15x5 (English)
|
116 |
QuartzNet is a Jasper-like network that uses separable convolutions and larger filter sizes. It has comparable accuracy to Jasper while having much fewer parameters. This particular model has 15 blocks each repeated 5 times.
|
117 |
""")
|
118 |
-
|
119 |
-
audio_input = gr.Textbox(label="YouTube Link", placeholder="paste the youtube link here")
|
120 |
-
text_output = gr.Textbox(label="Transcription", show_label=False)
|
121 |
-
youtube_button = gr.Button("Transcribe")
|
122 |
with gr.Tab("Audio File"):
|
123 |
with gr.Row().style(equal_height=True):
|
124 |
audio_input2 = gr.Audio(label="Audio File", type="filepath")
|
125 |
text_output2 = gr.Textbox(label="Transcription", show_label=False)
|
126 |
file_button = gr.Button("Transcribe")
|
|
|
127 |
with gr.Tab("Record"):
|
128 |
with gr.Row().style(equal_height=True):
|
129 |
audio_input3 = gr.Audio(label="Input Audio", source="microphone", type="filepath")
|
@@ -136,7 +128,6 @@ with gr.Blocks(css = css) as demo:
|
|
136 |
</div>
|
137 |
''')
|
138 |
|
139 |
-
youtube_button.click(speech_youtube, inputs=audio_input, outputs=text_output)
|
140 |
file_button.click(speech_file, inputs=audio_input2, outputs=text_output2)
|
141 |
rec_button.click(speech_record, inputs=audio_input3, outputs=text_output3)
|
142 |
|
|
|
1 |
import os
|
2 |
os.system("pip install nemo_toolkit['all']")
|
|
|
3 |
import gradio as gr
|
4 |
|
5 |
import nemo.collections.asr as nemo_asr
|
|
|
9 |
model_name="stt_en_quartznet15x5"
|
10 |
)
|
11 |
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
def speech_file(x):
|
14 |
# print(x)
|
|
|
109 |
# Speech to Text - NVIDIA Qaurtznet15x5 (English)
|
110 |
QuartzNet is a Jasper-like network that uses separable convolutions and larger filter sizes. It has comparable accuracy to Jasper while having much fewer parameters. This particular model has 15 blocks each repeated 5 times.
|
111 |
""")
|
112 |
+
|
|
|
|
|
|
|
113 |
with gr.Tab("Audio File"):
|
114 |
with gr.Row().style(equal_height=True):
|
115 |
audio_input2 = gr.Audio(label="Audio File", type="filepath")
|
116 |
text_output2 = gr.Textbox(label="Transcription", show_label=False)
|
117 |
file_button = gr.Button("Transcribe")
|
118 |
+
|
119 |
with gr.Tab("Record"):
|
120 |
with gr.Row().style(equal_height=True):
|
121 |
audio_input3 = gr.Audio(label="Input Audio", source="microphone", type="filepath")
|
|
|
128 |
</div>
|
129 |
''')
|
130 |
|
|
|
131 |
file_button.click(speech_file, inputs=audio_input2, outputs=text_output2)
|
132 |
rec_button.click(speech_record, inputs=audio_input3, outputs=text_output3)
|
133 |
|
requirements.txt
CHANGED
@@ -2,5 +2,4 @@ fastapi==0.88.0
|
|
2 |
gradio==3.15.0
|
3 |
nemo==4.3.2
|
4 |
nemo_toolkit==1.14.0
|
5 |
-
pytube==12.1.2
|
6 |
uvicorn==0.20.0
|
|
|
2 |
gradio==3.15.0
|
3 |
nemo==4.3.2
|
4 |
nemo_toolkit==1.14.0
|
|
|
5 |
uvicorn==0.20.0
|