Spaces:

hashb
/

stt-quartznet15x5-en-nvidia

Runtime error

App Files Community

Harshad Bhandwaldar committed on Jan 9, 2023

Commit

ef69a46

1 Parent(s): a606014

model added

Browse files

Files changed (2) hide show

app.py +2 -11
requirements.txt +0 -1

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import os
 os.system("pip install nemo_toolkit['all']")
-import pytube
 import gradio as gr
 import nemo.collections.asr as nemo_asr
@@ -10,11 +9,6 @@ model = nemo_asr.models.EncDecCTCModel.from_pretrained(
     model_name="stt_en_quartznet15x5"
 )
-def speech_youtube(x):
-    data = pytube.YouTube([f"{x}"])
-    audio = data.streams.get_audio_only()
-    text = model.transcribe(audio.download())
-    return text
 def speech_file(x):
     # print(x)
@@ -115,15 +109,13 @@ with gr.Blocks(css = css) as demo:
     # Speech to Text - NVIDIA Qaurtznet15x5 (English)
     QuartzNet is a Jasper-like network that uses separable convolutions and larger filter sizes. It has comparable accuracy to Jasper while having much fewer parameters. This particular model has 15 blocks each repeated 5 times.
     """)
-    with gr.Tab("YouTube"):
-        audio_input = gr.Textbox(label="YouTube Link", placeholder="paste the youtube link here")
-        text_output = gr.Textbox(label="Transcription", show_label=False)
-        youtube_button = gr.Button("Transcribe")
     with gr.Tab("Audio File"):
         with gr.Row().style(equal_height=True):
             audio_input2 = gr.Audio(label="Audio File", type="filepath")
             text_output2 = gr.Textbox(label="Transcription", show_label=False)
         file_button = gr.Button("Transcribe")
     with gr.Tab("Record"):
         with gr.Row().style(equal_height=True):
             audio_input3 = gr.Audio(label="Input Audio", source="microphone", type="filepath")
@@ -136,7 +128,6 @@ with gr.Blocks(css = css) as demo:
         </div>
         ''')
-    youtube_button.click(speech_youtube, inputs=audio_input, outputs=text_output)
     file_button.click(speech_file, inputs=audio_input2, outputs=text_output2)
     rec_button.click(speech_record, inputs=audio_input3, outputs=text_output3)

 import os
 os.system("pip install nemo_toolkit['all']")
 import gradio as gr
 import nemo.collections.asr as nemo_asr
     model_name="stt_en_quartznet15x5"
 )
 def speech_file(x):
     # print(x)
     # Speech to Text - NVIDIA Qaurtznet15x5 (English)
     QuartzNet is a Jasper-like network that uses separable convolutions and larger filter sizes. It has comparable accuracy to Jasper while having much fewer parameters. This particular model has 15 blocks each repeated 5 times.
     """)
     with gr.Tab("Audio File"):
         with gr.Row().style(equal_height=True):
             audio_input2 = gr.Audio(label="Audio File", type="filepath")
             text_output2 = gr.Textbox(label="Transcription", show_label=False)
         file_button = gr.Button("Transcribe")
     with gr.Tab("Record"):
         with gr.Row().style(equal_height=True):
             audio_input3 = gr.Audio(label="Input Audio", source="microphone", type="filepath")
         </div>
         ''')
     file_button.click(speech_file, inputs=audio_input2, outputs=text_output2)
     rec_button.click(speech_record, inputs=audio_input3, outputs=text_output3)

requirements.txt CHANGED Viewed

@@ -2,5 +2,4 @@ fastapi==0.88.0
 gradio==3.15.0
 nemo==4.3.2
 nemo_toolkit==1.14.0
-pytube==12.1.2
 uvicorn==0.20.0

 gradio==3.15.0
 nemo==4.3.2
 nemo_toolkit==1.14.0
 uvicorn==0.20.0