ASR

Running

App Files Files Community

ayymen committed on 24 days ago

Commit

7622552

•

1 Parent(s): 0d63b76

Use the new model

Browse files

Files changed (2) hide show

app.py +27 -9
common_voice_zgh_37838337.mp3 +0 -0

app.py CHANGED Viewed

@@ -1,20 +1,30 @@
-from nemo.collections.asr.models import EncDecRNNTBPEModel
 import yt_dlp as youtube_dl
 import os
 import tempfile
 import torch
 import gradio as gr
 from pydub import AudioSegment
 device = "cuda" if torch.cuda.is_available() else "cpu"
-MODEL_NAME="nvidia/stt_kab_conformer_transducer_large"
 YT_LENGTH_LIMIT_S=3600
-model = EncDecRNNTBPEModel.from_pretrained(model_name=MODEL_NAME).to(device)
 model.eval()
 def get_transcripts(audio_path):
-    text = model.transcribe([audio_path])[0][0]
     return text
 '''
@@ -27,14 +37,20 @@ article = (
 )
 '''
-examples = [
     ["135.wav"],
-    ["common_voice_zgh_37837257.mp3"],
-    ["common_voice_zgh_37838337.mp3"]
 ]
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
         f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
         " </center>"
@@ -80,10 +96,11 @@ def yt_transcribe(yt_url, max_filesize=75.0):
         filepath = os.path.join(tmpdirname, "video.mp4")
         download_yt_audio(yt_url, filepath)
         audio = AudioSegment.from_file(filepath)
         wav_filepath = os.path.join(tmpdirname, "audio.wav")
         audio.export(wav_filepath, format="wav")
-    text = get_transcripts(wav_filepath)
     return html_embed_str, text
@@ -110,7 +127,7 @@ file_transcribe = gr.Interface(
         gr.Audio(sources="upload", type="filepath", label="Audio file"),
     ],
     outputs="text",
-    examples=examples,
     title="Transcribe Audio",
     description=(
         "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
@@ -126,6 +143,7 @@ youtube_transcribe = gr.Interface(
         gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
     ],
     outputs=["html", "text"],
     title="Transcribe Audio",
     description=(
         "Transcribe microphone or audio inputs with the click of a button! Demo uses the"

+from nemo.collections.asr.models import EncDecCTCModelBPE
 import yt_dlp as youtube_dl
 import os
 import tempfile
 import torch
 import gradio as gr
 from pydub import AudioSegment
+import time
 device = "cuda" if torch.cuda.is_available() else "cpu"
+MODEL_NAME="ayymen/stt_zgh_fastconformer_ctc_small"
 YT_LENGTH_LIMIT_S=3600
+model = EncDecCTCModelBPE.from_pretrained(model_name=MODEL_NAME).to(device)
 model.eval()
 def get_transcripts(audio_path):
+    # Transcribe the audio file at audio_path with the module-level model and
+    # return the first element of the transcription result.
+    audio = AudioSegment.from_file(audio_path)
+    # Re-encode only when the file is not already mono at 16 kHz.
+    if audio.channels != 1 or audio.frame_rate != 16000:
+        audio = audio.set_channels(1).set_frame_rate(16000) # convert to mono 16kHz
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            audio_path = os.path.join(tmpdirname, "audio.wav")
+            audio.export(audio_path, format="wav")
+            # Transcribe inside the with-block: the temporary directory (and
+            # the converted WAV in it) is removed when the block exits.
+            text = model.transcribe([audio_path])[0]
+    else:
+        # Already mono 16 kHz: pass the original file through unchanged.
+        text = model.transcribe([audio_path])[0]
     return text
 '''
 )
 '''
+EXAMPLES = [
     ["135.wav"],
+    ["common_voice_zgh_37837257.mp3"]
+]
+YT_EXAMPLES = [
+    ["https://www.youtube.com/shorts/CSgTSE50MHY"],
+    ["https://www.youtube.com/shorts/OxQtqOyAFLE"]
 ]
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
+    if "youtube.com/shorts/" in video_id:
+        video_id = video_id.split("/")[-1]
     HTML_str = (
         f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
         " </center>"
         filepath = os.path.join(tmpdirname, "video.mp4")
         download_yt_audio(yt_url, filepath)
         audio = AudioSegment.from_file(filepath)
+        audio = audio.set_channels(1).set_frame_rate(16000) # convert to mono 16kHz
         wav_filepath = os.path.join(tmpdirname, "audio.wav")
         audio.export(wav_filepath, format="wav")
+        text = get_transcripts(wav_filepath)
     return html_embed_str, text
         gr.Audio(sources="upload", type="filepath", label="Audio file"),
     ],
     outputs="text",
+    examples=EXAMPLES,
     title="Transcribe Audio",
     description=(
         "Transcribe microphone or audio inputs with the click of a button! Demo uses the"
         gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
     ],
     outputs=["html", "text"],
+    examples=YT_EXAMPLES,
     title="Transcribe Audio",
     description=(
         "Transcribe microphone or audio inputs with the click of a button! Demo uses the"

common_voice_zgh_37838337.mp3 DELETED Viewed

Binary file (17.3 kB)