ayymen committed on
Commit
7622552
1 Parent(s): 0d63b76

Use the new model

Browse files
Files changed (2) hide show
  1. app.py +27 -9
  2. common_voice_zgh_37838337.mp3 +0 -0
app.py CHANGED
@@ -1,20 +1,30 @@
1
- from nemo.collections.asr.models import EncDecRNNTBPEModel
2
  import yt_dlp as youtube_dl
3
  import os
4
  import tempfile
5
  import torch
6
  import gradio as gr
7
  from pydub import AudioSegment
 
8
 
9
  device = "cuda" if torch.cuda.is_available() else "cpu"
10
- MODEL_NAME="nvidia/stt_kab_conformer_transducer_large"
11
  YT_LENGTH_LIMIT_S=3600
12
 
13
- model = EncDecRNNTBPEModel.from_pretrained(model_name=MODEL_NAME).to(device)
14
  model.eval()
15
 
16
  def get_transcripts(audio_path):
17
- text = model.transcribe([audio_path])[0][0]
 
 
 
 
 
 
 
 
 
18
  return text
19
 
20
  '''
@@ -27,14 +37,20 @@ article = (
27
  )
28
  '''
29
 
30
- examples = [
31
  ["135.wav"],
32
- ["common_voice_zgh_37837257.mp3"],
33
- ["common_voice_zgh_37838337.mp3"]
 
 
 
 
34
  ]
35
 
36
  def _return_yt_html_embed(yt_url):
37
  video_id = yt_url.split("?v=")[-1]
 
 
38
  HTML_str = (
39
  f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
40
  " </center>"
@@ -80,10 +96,11 @@ def yt_transcribe(yt_url, max_filesize=75.0):
80
  filepath = os.path.join(tmpdirname, "video.mp4")
81
  download_yt_audio(yt_url, filepath)
82
  audio = AudioSegment.from_file(filepath)
 
83
  wav_filepath = os.path.join(tmpdirname, "audio.wav")
84
  audio.export(wav_filepath, format="wav")
 
85
 
86
- text = get_transcripts(wav_filepath)
87
  return html_embed_str, text
88
 
89
 
@@ -110,7 +127,7 @@ file_transcribe = gr.Interface(
110
  gr.Audio(sources="upload", type="filepath", label="Audio file"),
111
  ],
112
  outputs="text",
113
- examples=examples,
114
  title="Transcribe Audio",
115
  description=(
116
  "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
@@ -126,6 +143,7 @@ youtube_transcribe = gr.Interface(
126
  gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
127
  ],
128
  outputs=["html", "text"],
 
129
  title="Transcribe Audio",
130
  description=(
131
  "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
 
1
+ from nemo.collections.asr.models import EncDecCTCModelBPE
2
  import yt_dlp as youtube_dl
3
  import os
4
  import tempfile
5
  import torch
6
  import gradio as gr
7
  from pydub import AudioSegment
8
+ import time
9
 
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
+ MODEL_NAME="ayymen/stt_zgh_fastconformer_ctc_small"
12
  YT_LENGTH_LIMIT_S=3600
13
 
14
+ model = EncDecCTCModelBPE.from_pretrained(model_name=MODEL_NAME).to(device)
15
  model.eval()
16
 
17
  def get_transcripts(audio_path):
18
+ audio = AudioSegment.from_file(audio_path)
19
+ # check if audio is mono 16kHz
20
+ if audio.channels != 1 or audio.frame_rate != 16000:
21
+ audio = audio.set_channels(1).set_frame_rate(16000) # convert to mono 16kHz
22
+ with tempfile.TemporaryDirectory() as tmpdirname:
23
+ audio_path = os.path.join(tmpdirname, "audio.wav")
24
+ audio.export(audio_path, format="wav")
25
+ text = model.transcribe([audio_path])[0]
26
+ else:
27
+ text = model.transcribe([audio_path])[0]
28
  return text
29
 
30
  '''
 
37
  )
38
  '''
39
 
40
+ EXAMPLES = [
41
  ["135.wav"],
42
+ ["common_voice_zgh_37837257.mp3"]
43
+ ]
44
+
45
+ YT_EXAMPLES = [
46
+ ["https://www.youtube.com/shorts/CSgTSE50MHY"],
47
+ ["https://www.youtube.com/shorts/OxQtqOyAFLE"]
48
  ]
49
 
50
  def _return_yt_html_embed(yt_url):
51
  video_id = yt_url.split("?v=")[-1]
52
+ if "youtube.com/shorts/" in video_id:
53
+ video_id = video_id.split("/")[-1]
54
  HTML_str = (
55
  f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
56
  " </center>"
 
96
  filepath = os.path.join(tmpdirname, "video.mp4")
97
  download_yt_audio(yt_url, filepath)
98
  audio = AudioSegment.from_file(filepath)
99
+ audio = audio.set_channels(1).set_frame_rate(16000) # convert to mono 16kHz
100
  wav_filepath = os.path.join(tmpdirname, "audio.wav")
101
  audio.export(wav_filepath, format="wav")
102
+ text = get_transcripts(wav_filepath)
103
 
 
104
  return html_embed_str, text
105
 
106
 
 
127
  gr.Audio(sources="upload", type="filepath", label="Audio file"),
128
  ],
129
  outputs="text",
130
+ examples=EXAMPLES,
131
  title="Transcribe Audio",
132
  description=(
133
  "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
 
143
  gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
144
  ],
145
  outputs=["html", "text"],
146
+ examples=YT_EXAMPLES,
147
  title="Transcribe Audio",
148
  description=(
149
  "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
common_voice_zgh_37838337.mp3 DELETED
Binary file (17.3 kB)