Harshad Bhandwaldar committed on
Commit
ef69a46
·
1 Parent(s): a606014

model added

Browse files
Files changed (2) hide show
  1. app.py +2 -11
  2. requirements.txt +0 -1
app.py CHANGED
@@ -1,6 +1,5 @@
1
  import os
2
  os.system("pip install nemo_toolkit['all']")
3
- import pytube
4
  import gradio as gr
5
 
6
  import nemo.collections.asr as nemo_asr
@@ -10,11 +9,6 @@ model = nemo_asr.models.EncDecCTCModel.from_pretrained(
10
  model_name="stt_en_quartznet15x5"
11
  )
12
 
13
- def speech_youtube(x):
14
- data = pytube.YouTube([f"{x}"])
15
- audio = data.streams.get_audio_only()
16
- text = model.transcribe(audio.download())
17
- return text
18
 
19
  def speech_file(x):
20
  # print(x)
@@ -115,15 +109,13 @@ with gr.Blocks(css = css) as demo:
115
  # Speech to Text - NVIDIA Qaurtznet15x5 (English)
116
  QuartzNet is a Jasper-like network that uses separable convolutions and larger filter sizes. It has comparable accuracy to Jasper while having much fewer parameters. This particular model has 15 blocks each repeated 5 times.
117
  """)
118
- with gr.Tab("YouTube"):
119
- audio_input = gr.Textbox(label="YouTube Link", placeholder="paste the youtube link here")
120
- text_output = gr.Textbox(label="Transcription", show_label=False)
121
- youtube_button = gr.Button("Transcribe")
122
  with gr.Tab("Audio File"):
123
  with gr.Row().style(equal_height=True):
124
  audio_input2 = gr.Audio(label="Audio File", type="filepath")
125
  text_output2 = gr.Textbox(label="Transcription", show_label=False)
126
  file_button = gr.Button("Transcribe")
 
127
  with gr.Tab("Record"):
128
  with gr.Row().style(equal_height=True):
129
  audio_input3 = gr.Audio(label="Input Audio", source="microphone", type="filepath")
@@ -136,7 +128,6 @@ with gr.Blocks(css = css) as demo:
136
  </div>
137
  ''')
138
 
139
- youtube_button.click(speech_youtube, inputs=audio_input, outputs=text_output)
140
  file_button.click(speech_file, inputs=audio_input2, outputs=text_output2)
141
  rec_button.click(speech_record, inputs=audio_input3, outputs=text_output3)
142
 
 
1
  import os
2
  os.system("pip install nemo_toolkit['all']")
 
3
  import gradio as gr
4
 
5
  import nemo.collections.asr as nemo_asr
 
9
  model_name="stt_en_quartznet15x5"
10
  )
11
 
 
 
 
 
 
12
 
13
  def speech_file(x):
14
  # print(x)
 
109
  # Speech to Text - NVIDIA Qaurtznet15x5 (English)
110
  QuartzNet is a Jasper-like network that uses separable convolutions and larger filter sizes. It has comparable accuracy to Jasper while having much fewer parameters. This particular model has 15 blocks each repeated 5 times.
111
  """)
112
+
 
 
 
113
  with gr.Tab("Audio File"):
114
  with gr.Row().style(equal_height=True):
115
  audio_input2 = gr.Audio(label="Audio File", type="filepath")
116
  text_output2 = gr.Textbox(label="Transcription", show_label=False)
117
  file_button = gr.Button("Transcribe")
118
+
119
  with gr.Tab("Record"):
120
  with gr.Row().style(equal_height=True):
121
  audio_input3 = gr.Audio(label="Input Audio", source="microphone", type="filepath")
 
128
  </div>
129
  ''')
130
 
 
131
  file_button.click(speech_file, inputs=audio_input2, outputs=text_output2)
132
  rec_button.click(speech_record, inputs=audio_input3, outputs=text_output3)
133
 
requirements.txt CHANGED
@@ -2,5 +2,4 @@ fastapi==0.88.0
2
  gradio==3.15.0
3
  nemo==4.3.2
4
  nemo_toolkit==1.14.0
5
- pytube==12.1.2
6
  uvicorn==0.20.0
 
2
  gradio==3.15.0
3
  nemo==4.3.2
4
  nemo_toolkit==1.14.0
 
5
  uvicorn==0.20.0