Commit d76d8fd: Duplicate from smajumdar/nemo_multilingual_language_id

Co-authored-by: Somshubra Majumdar <smajumdar@users.noreply.huggingface.co>

Files changed:
- .gitattributes +33 -0
- README.md +14 -0
- app.py +641 -0
- packages.txt +2 -0
- requirements.txt +2 -0
- speech_to_text_buffered_infer_ctc.py +193 -0
- speech_to_text_buffered_infer_rnnt.py +247 -0
.gitattributes
ADDED
@@ -0,0 +1,33 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,14 @@
---
title: Nemo Multilingual Language Id
emoji: 🐠
colorFrom: blue
colorTo: gray
sdk: gradio
sdk_version: 3.17.1
app_file: app.py
pinned: false
license: apache-2.0
duplicated_from: smajumdar/nemo_multilingual_language_id
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,641 @@
import os
import json
import shutil
import uuid
import tempfile
import subprocess
import re
import time
import traceback

import gradio as gr
import pytube as pt

import nemo.collections.asr as nemo_asr
import torch

import speech_to_text_buffered_infer_ctc as buffered_ctc
import speech_to_text_buffered_infer_rnnt as buffered_rnnt
from nemo.utils import logging

# Set NeMo cache dir as /tmp
from nemo import constants

os.environ[constants.NEMO_ENV_CACHE_DIR] = "/tmp/nemo/"


SAMPLE_RATE = 16000  # Default sample rate for ASR
BUFFERED_INFERENCE_DURATION_THRESHOLD = 60.0  # 60 seconds and above will require chunked inference.
CHUNK_LEN_IN_SEC = 20.0  # Chunk size
BUFFER_LEN_IN_SEC = 30.0  # Total buffer size

TITLE = "NeMo ASR Inference on Hugging Face"
DESCRIPTION = "Demo of all languages supported by NeMo ASR"
DEFAULT_EN_MODEL = "nvidia/stt_en_conformer_transducer_xlarge"
DEFAULT_BUFFERED_EN_MODEL = "nvidia/stt_en_conformer_transducer_large"

# Pre-download and cache the model in disk space
logging.setLevel(logging.ERROR)
tmp_model = nemo_asr.models.ASRModel.from_pretrained(DEFAULT_BUFFERED_EN_MODEL, map_location='cpu')
del tmp_model
logging.setLevel(logging.INFO)

MARKDOWN = f"""
# {TITLE}

## {DESCRIPTION}
"""

CSS = """
p.big {
  font-size: 20px;
}

/* From https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/blob/main/app.py */

.result {display:flex;flex-direction:column}
.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%;font-size:20px;}
.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
.result_item_error {background-color:#ff7070;color:white;align-self:start}
"""

ARTICLE = """
<br><br>
<p class='big' style='text-align: center'>
<a href='https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/intro.html' target='_blank'>NeMo ASR</a>
|
<a href='https://github.com/NVIDIA/NeMo#nvidia-nemo' target='_blank'>Github Repo</a>
</p>
"""

SUPPORTED_LANGUAGES = set([])
SUPPORTED_MODEL_NAMES = set([])

# HF models, grouped by language identifier
hf_filter = nemo_asr.models.ASRModel.get_hf_model_filter()
hf_filter.task = "automatic-speech-recognition"

hf_infos = nemo_asr.models.ASRModel.search_huggingface_models(model_filter=hf_filter)
for info in hf_infos:
    print("Model ID:", info.modelId)
    try:
        lang_id = info.modelId.split("_")[1]  # obtains lang id as str
    except Exception:
        print("WARNING: Skipping model id -", info)
        continue

    SUPPORTED_LANGUAGES.add(lang_id)
    SUPPORTED_MODEL_NAMES.add(info.modelId)

SUPPORTED_MODEL_NAMES = sorted(list(SUPPORTED_MODEL_NAMES))

# DEBUG FILTER
# SUPPORTED_MODEL_NAMES = list(filter(lambda x: "en" in x and "conformer_transducer_large" in x, SUPPORTED_MODEL_NAMES))

model_dict = {}
for model_name in SUPPORTED_MODEL_NAMES:
    try:
        iface = gr.Interface.load(f'models/{model_name}')
        model_dict[model_name] = iface

        # model_dict[model_name] = None
    except:
        pass

if DEFAULT_EN_MODEL in model_dict:
    # Preemptively load the default EN model
    if model_dict[DEFAULT_EN_MODEL] is None:
        model_dict[DEFAULT_EN_MODEL] = gr.Interface.load(f'models/{DEFAULT_EN_MODEL}')

SUPPORTED_LANG_MODEL_DICT = {}
for lang in SUPPORTED_LANGUAGES:
    for model_id in SUPPORTED_MODEL_NAMES:
        if ("_" + lang + "_") in model_id:
            # create new lang in dict
            if lang not in SUPPORTED_LANG_MODEL_DICT:
                SUPPORTED_LANG_MODEL_DICT[lang] = [model_id]
            else:
                SUPPORTED_LANG_MODEL_DICT[lang].append(model_id)

# Sort model names
for lang in SUPPORTED_LANG_MODEL_DICT.keys():
    model_ids = SUPPORTED_LANG_MODEL_DICT[lang]
    model_ids = sorted(model_ids)
    SUPPORTED_LANG_MODEL_DICT[lang] = model_ids


def get_device():
    gpu_available = torch.cuda.is_available()
    if gpu_available:
        return torch.cuda.get_device_name()
    else:
        return "CPU"


def parse_duration(audio_file):
    """
    Use ffmpeg to calculate durations. Libraries can do it too, but different filetypes cause
    different libraries to behave differently.
    """
    process = subprocess.Popen(['ffmpeg', '-i', audio_file], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stdout, stderr = process.communicate()
    matches = re.search(
        r"Duration:\s{1}(?P<hours>\d+?):(?P<minutes>\d+?):(?P<seconds>\d+\.\d+?),", stdout.decode(), re.DOTALL
    ).groupdict()

    duration = 0.0
    duration += float(matches['hours']) * 60.0 * 60.0
    duration += float(matches['minutes']) * 60.0
    duration += float(matches['seconds']) * 1.0
    return duration


def resolve_model_type(model_name: str) -> str:
    """
    Map model name to a class type, without loading the model. Has some hardcoded assumptions in
    the semantics of model naming.
    """
    # Loss specific maps
    if 'hybrid' in model_name or 'hybrid_ctc' in model_name or 'hybrid_transducer' in model_name:
        return 'hybrid'
    elif 'transducer' in model_name or 'rnnt' in model_name:
        return 'transducer'
    elif 'ctc' in model_name:
        return 'ctc'

    # Model specific maps
    if 'jasper' in model_name:
        return 'ctc'
    elif 'quartznet' in model_name:
        return 'ctc'
    elif 'citrinet' in model_name:
        return 'ctc'
    elif 'contextnet' in model_name:
        return 'transducer'

    return None


def resolve_model_stride(model_name) -> int:
    """
    Model specific pre-calculation of stride levels.
    Don't load the model to get this info.
    """
    if 'jasper' in model_name:
        return 2
    if 'quartznet' in model_name:
        return 2
    if 'conformer' in model_name:
        return 4
    if 'squeezeformer' in model_name:
        return 4
    if 'citrinet' in model_name:
        return 8
    if 'contextnet' in model_name:
        return 8

    return -1


def convert_audio(audio_filepath):
    """
    Transcode all mp3 files to monochannel 16 kHz wav files.
    """
    filedir = os.path.split(audio_filepath)[0]
    filename, ext = os.path.splitext(audio_filepath)

    if ext == '.wav':  # os.path.splitext keeps the leading dot
        return audio_filepath

    out_filename = os.path.join(filedir, filename + '.wav')

    process = subprocess.Popen(
        ['ffmpeg', '-y', '-i', audio_filepath, '-ac', '1', '-ar', str(SAMPLE_RATE), out_filename],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        close_fds=True,
    )

    stdout, stderr = process.communicate()

    if os.path.exists(out_filename):
        return out_filename
    else:
        return None


def extract_result_from_manifest(filepath, model_name) -> (bool, str):
    """
    Parse the written manifest which is the result of the buffered inference process.
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                line = json.loads(line)
                data.append(line['pred_text'])
            except Exception as e:
                pass

    if len(data) > 0:
        return True, data[0]
    else:
        return False, f"Could not perform inference on model with name : {model_name}"


def build_html_output(s: str, style: str = "result_item_success"):
    return f"""
    <div class='result'>
        <div class='result_item {style}'>
            {s}
        </div>
    </div>
    """


def infer_audio(model_name: str, audio_file: str) -> str:
    """
    Main method that switches from HF inference for small audio files to Buffered CTC/RNNT mode for long audio files.

    Args:
        model_name: Str name of the model (potentially with / to denote HF models)
        audio_file: Path to an audio file (mp3 or wav)

    Returns:
        str which is the transcription if successful.
        str which is HTML output of logs.
    """
    # Parse the duration of the audio file
    duration = parse_duration(audio_file)

    if duration > BUFFERED_INFERENCE_DURATION_THRESHOLD:  # Longer than one minute; use buffered mode
        # Process audio to be of wav type (possible youtube audio)
        audio_file = convert_audio(audio_file)

        # If audio file transcoding failed, let user know
        if audio_file is None:
            return "Error:- Failed to convert audio file to wav."

        # Extract audio dir from resolved audio filepath
        audio_dir = os.path.split(audio_file)[0]

        # Next calculate the stride of each model
        model_stride = resolve_model_stride(model_name)

        if model_stride < 0:
            return f"Error:- Failed to compute the model stride for model with name : {model_name}"

        # Process model type (CTC/RNNT/Hybrid)
        model_type = resolve_model_type(model_name)

        if model_type is None:

            # Model type could not be inferred.
            # Try all feasible options
            RESULT = None

            try:
                ctc_config = buffered_ctc.TranscriptionConfig(
                    pretrained_name=model_name,
                    audio_dir=audio_dir,
                    output_filename="output.json",
                    audio_type="wav",
                    overwrite_transcripts=True,
                    model_stride=model_stride,
                    chunk_len_in_secs=20.0,
                    total_buffer_in_secs=30.0,
                )

                buffered_ctc.main(ctc_config)
                result = extract_result_from_manifest('output.json', model_name)
                if result[0]:
                    RESULT = result[1]

            except Exception as e:
                pass

            try:
                rnnt_config = buffered_rnnt.TranscriptionConfig(
                    pretrained_name=model_name,
                    audio_dir=audio_dir,
                    output_filename="output.json",
                    audio_type="wav",
                    overwrite_transcripts=True,
                    model_stride=model_stride,
                    chunk_len_in_secs=20.0,
                    total_buffer_in_secs=30.0,
                )

                buffered_rnnt.main(rnnt_config)
                result = extract_result_from_manifest('output.json', model_name)

                if result[0]:
                    RESULT = result[1]
            except Exception as e:
                pass

            if RESULT is None:
                return f"Error:- Could not parse model type; failed to perform inference with model {model_name}!"

            return RESULT

        elif model_type == 'ctc':

            # CTC Buffered Inference
            ctc_config = buffered_ctc.TranscriptionConfig(
                pretrained_name=model_name,
                audio_dir=audio_dir,
                output_filename="output.json",
                audio_type="wav",
                overwrite_transcripts=True,
                model_stride=model_stride,
                chunk_len_in_secs=20.0,
                total_buffer_in_secs=30.0,
            )

            buffered_ctc.main(ctc_config)
            return extract_result_from_manifest('output.json', model_name)[-1]

        elif model_type == 'transducer':

            # RNNT Buffered Inference
            rnnt_config = buffered_rnnt.TranscriptionConfig(
                pretrained_name=model_name,
                audio_dir=audio_dir,
                output_filename="output.json",
                audio_type="wav",
                overwrite_transcripts=True,
                model_stride=model_stride,
                chunk_len_in_secs=20.0,
                total_buffer_in_secs=30.0,
            )

            buffered_rnnt.main(rnnt_config)
            return extract_result_from_manifest('output.json', model_name)[-1]

        else:
            return f"Error:- Could not parse model type; failed to perform inference with model {model_name}!"

    else:
        # Obtain Gradio Model function from cache of models
        if model_name in model_dict:
            model = model_dict[model_name]

            if model is None:
                # Load the gradio interface
                # try:
                iface = gr.Interface.load(f'models/{model_name}')
                print(iface)
                # except:
                #     iface = None

                if iface is not None:
                    # Update model cache and use the freshly loaded interface
                    model_dict[model_name] = iface
                    model = iface
        else:
            model = None

        if model is not None:
            # Use HF API for transcription
            try:
                transcriptions = model(audio_file)
                return transcriptions
            except Exception as e:
                transcriptions = ""
                error = ""

                error += (
                    f"The model `{model_name}` is currently loading and cannot be used "
                    f"for transcription.<br>"
                    f"Please try another model or wait a few minutes."
                )

                return error

        else:
            error = (
                f"Error:- Could not find model {model_name} in list of available models : "
                f"{list([k for k in model_dict.keys()])}"
            )
            return error


def transcribe(microphone, audio_file, model_name):

    audio_data = None
    warn_output = ""
    if (microphone is not None) and (audio_file is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
        audio_data = microphone

    elif (microphone is None) and (audio_file is None):
        warn_output = "ERROR: You have to either use the microphone or upload an audio file"

    elif microphone is not None:
        audio_data = microphone
    else:
        audio_data = audio_file

    if audio_data is not None:
        audio_duration = parse_duration(audio_data)
    else:
        audio_duration = None

    time_diff = None
    try:
        with tempfile.TemporaryDirectory() as tempdir:
            filename = os.path.split(audio_data)[-1]
            new_audio_data = os.path.join(tempdir, filename)
            shutil.copy2(audio_data, new_audio_data)

            if os.path.exists(audio_data):
                os.remove(audio_data)

            audio_data = new_audio_data

            # Run transcription (HF API or local buffered inference)
            start = time.time()
            transcriptions = infer_audio(model_name, audio_data)
            end = time.time()
            time_diff = end - start

    except Exception as e:
        transcriptions = ""

        if warn_output != "":
            warn_output += "<br><br>"

        warn_output += (
            f"The model `{model_name}` is currently loading and cannot be used "
            f"for transcription.<br>"
            f"Please try another model or wait a few minutes."
        )

    # Build HTML output
    if warn_output != "":
        html_output = build_html_output(warn_output, style="result_item_error")
    else:
        if transcriptions.startswith("Error:-"):
            html_output = build_html_output(transcriptions, style="result_item_error")
        else:
            output = f"Successfully transcribed on {get_device()} ! <br>" f"Transcription Time : {time_diff: 0.3f} s"

            if audio_duration > BUFFERED_INFERENCE_DURATION_THRESHOLD:
                output += f""" <br><br>
                Note: Audio duration was {audio_duration: 0.3f} s, so the model had to be downloaded, initialized, and then
                buffered inference was used. <br>
                """

            html_output = build_html_output(output)

    return transcriptions, html_output


def _return_yt_html_embed(yt_url):
    """ Obtained from https://huggingface.co/spaces/whisper-event/whisper-demo """
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str


def yt_transcribe(yt_url: str, model_name: str):
    """ Modified from https://huggingface.co/spaces/whisper-event/whisper-demo """
    if yt_url == "":
        text = ""
        html_embed_str = ""
        html_output = build_html_output(f"""
            Error:- No YouTube URL was provided!
            """, style='result_item_error')
        return text, html_embed_str, html_output

    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)

    with tempfile.TemporaryDirectory() as tempdir:
        file_uuid = str(uuid.uuid4().hex)
        file_uuid = f"{tempdir}/{file_uuid}.mp3"

        # Download YT Audio temporarily
        download_time_start = time.time()

        stream = yt.streams.filter(only_audio=True)[0]
        stream.download(filename=file_uuid)

        download_time_end = time.time()

        # Get audio duration
        audio_duration = parse_duration(file_uuid)

        # Perform transcription
        infer_time_start = time.time()

        text = infer_audio(model_name, file_uuid)

        infer_time_end = time.time()

        if text.startswith("Error:-"):
            html_output = build_html_output(text, style='result_item_error')
        else:
            html_output = f"""
                Successfully transcribed on {get_device()} ! <br>
                Audio Download Time : {download_time_end - download_time_start: 0.3f} s <br>
                Transcription Time : {infer_time_end - infer_time_start: 0.3f} s <br>
                """

            if audio_duration > BUFFERED_INFERENCE_DURATION_THRESHOLD:
                html_output += f""" <br>
                    Note: Audio duration was {audio_duration: 0.3f} s, so the model had to be downloaded, initialized, and then
                    buffered inference was used. <br>
                    """

            html_output = build_html_output(html_output)

    return text, html_embed_str, html_output


def create_lang_selector_component(default_en_model=DEFAULT_EN_MODEL):
    """
    Utility function to select a language from a dropdown menu, and simultaneously update another dropdown
    containing the corresponding model checkpoints for that language.

    Args:
        default_en_model: str name of a default English model that should be the set default.

    Returns:
        Gradio components for lang_selector (Dropdown menu) and models_in_lang (Dropdown menu)
    """
    lang_selector = gr.components.Dropdown(
        choices=sorted(list(SUPPORTED_LANGUAGES)), value="en", type="value", label="Languages", interactive=True,
    )
    models_in_lang = gr.components.Dropdown(
        choices=sorted(list(SUPPORTED_LANG_MODEL_DICT["en"])),
        value=default_en_model,
        label="Models",
        interactive=True,
    )

    def update_models_with_lang(lang):
        models_names = sorted(list(SUPPORTED_LANG_MODEL_DICT[lang]))
        default = models_names[0]

        if lang == 'en':
            default = default_en_model
        return models_in_lang.update(choices=models_names, value=default)

    lang_selector.change(update_models_with_lang, inputs=[lang_selector], outputs=[models_in_lang])

    return lang_selector, models_in_lang


"""
Define the GUI
"""
demo = gr.Blocks(title=TITLE, css=CSS)

with demo:
    header = gr.Markdown(MARKDOWN)

    with gr.Tab("Transcribe Audio"):
        with gr.Row() as row:
            file_upload = gr.components.Audio(source="upload", type='filepath', label='Upload File')
            microphone = gr.components.Audio(source="microphone", type='filepath', label='Microphone')

        lang_selector, models_in_lang = create_lang_selector_component()

        run = gr.components.Button('Transcribe')

        transcript = gr.components.Label(label='Transcript')
        audio_html_output = gr.components.HTML()

        run.click(
            transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript, audio_html_output]
        )

    with gr.Tab("Transcribe Youtube"):
        yt_url = gr.components.Textbox(
            lines=1, label="Youtube URL", placeholder="Paste the URL to a YouTube video here"
        )

        lang_selector_yt, models_in_lang_yt = create_lang_selector_component(
            default_en_model=DEFAULT_BUFFERED_EN_MODEL
        )

        with gr.Row():
            run = gr.components.Button('Transcribe YouTube')
            embedded_video = gr.components.HTML()

        transcript = gr.components.Label(label='Transcript')
        yt_html_output = gr.components.HTML()

        run.click(
            yt_transcribe, inputs=[yt_url, models_in_lang_yt], outputs=[transcript, embedded_video, yt_html_output]
        )

    gr.components.HTML(ARTICLE)

demo.queue(concurrency_count=1)
demo.launch(enable_queue=True)
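
The flow of a single request through app.py can be summarized with a minimal sketch that uses only the helpers defined in the file above; the model name and audio path below are hypothetical placeholders, and infer_audio() performs the branching internally.

# Minimal sketch of the dispatch performed by infer_audio() above.
# The model name and audio path are hypothetical placeholders.
model_name = "nvidia/stt_en_conformer_transducer_large"
audio_path = "sample.mp3"

duration = parse_duration(audio_path)  # ffmpeg-based duration probe

if duration > BUFFERED_INFERENCE_DURATION_THRESHOLD:
    # Long audio: infer_audio() transcodes to 16 kHz mono wav via convert_audio(),
    # then runs the buffered CTC/RNNT scripts locally, choosing the decoder path
    # with resolve_model_type() and the downsampling factor with resolve_model_stride().
    pass
else:
    # Short audio: infer_audio() calls the cached Gradio interface that wraps the
    # hosted Hugging Face model.
    pass

transcript = infer_audio(model_name, audio_path)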
packages.txt
ADDED
@@ -0,0 +1,2 @@
ffmpeg
libsndfile1
requirements.txt
ADDED
@@ -0,0 +1,2 @@
nemo_toolkit[all]
pytube
speech_to_text_buffered_infer_ctc.py
ADDED
@@ -0,0 +1,193 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script serves three goals:
    (1) Demonstrate how to use NeMo Models outside of PyTorch Lightning
    (2) Show an example of batch ASR inference
    (3) Serve as a CI test for pre-trained checkpoints

python speech_to_text_buffered_infer_ctc.py \
    model_path=null \
    pretrained_name=null \
    audio_dir="<remove or path to folder of audio files>" \
    dataset_manifest="<remove or path to manifest>" \
    output_filename="<remove or specify output filename>" \
    total_buffer_in_secs=4.0 \
    chunk_len_in_secs=1.6 \
    model_stride=4 \
    batch_size=32

# NOTE:
    You can use `DEBUG=1 python speech_to_text_buffered_infer_ctc.py ...` to print out the
    predictions of the model, and ground-truth text if present in the manifest.
"""
import contextlib
import copy
import glob
import math
import os
from dataclasses import dataclass, is_dataclass
from typing import Optional

import torch
from omegaconf import OmegaConf

from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchASR
from nemo.collections.asr.parts.utils.transcribe_utils import (
    compute_output_filename,
    get_buffered_pred_feat,
    setup_model,
    write_transcription,
)
from nemo.core.config import hydra_runner
from nemo.utils import logging

can_gpu = torch.cuda.is_available()


@dataclass
class TranscriptionConfig:
    # Required configs
    model_path: Optional[str] = None  # Path to a .nemo file
    pretrained_name: Optional[str] = None  # Name of a pretrained model
    audio_dir: Optional[str] = None  # Path to a directory which contains audio files
    dataset_manifest: Optional[str] = None  # Path to dataset's JSON manifest

    # General configs
    output_filename: Optional[str] = None
    batch_size: int = 32
    num_workers: int = 0
    append_pred: bool = False  # Sets mode of work, if True it will add new field transcriptions.
    pred_name_postfix: Optional[str] = None  # If you need to use another model name, rather than standard one.

    # Chunked configs
    chunk_len_in_secs: float = 1.6  # Chunk length in seconds
    total_buffer_in_secs: float = 4.0  # Length of buffer (chunk + left and right padding) in seconds
    model_stride: int = 8  # Model downsampling factor, 8 for Citrinet models and 4 for Conformer models

    # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA
    # device anyway, and do inference on CPU only if CUDA device is not found.
    # If `cuda` is a negative number, inference will be on CPU only.
    cuda: Optional[int] = None
    amp: bool = False
    audio_type: str = "wav"

    # Recompute model transcription, even if the output folder exists with scores.
    overwrite_transcripts: bool = True


@hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
    torch.set_grad_enabled(False)

    if is_dataclass(cfg):
        cfg = OmegaConf.structured(cfg)

    if cfg.model_path is None and cfg.pretrained_name is None:
        raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!")
    if cfg.audio_dir is None and cfg.dataset_manifest is None:
        raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!")

    filepaths = None
    manifest = cfg.dataset_manifest
    if cfg.audio_dir is not None:
        filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True))
        manifest = None  # ignore dataset_manifest if audio_dir and dataset_manifest are both present

    # setup GPU
    if cfg.cuda is None:
        if torch.cuda.is_available():
            device = [0]  # use 0th CUDA device
            accelerator = 'gpu'
        else:
            device = 1
            accelerator = 'cpu'
    else:
        device = [cfg.cuda]
        accelerator = 'gpu'
    map_location = torch.device('cuda:{}'.format(device[0]) if accelerator == 'gpu' else 'cpu')
    logging.info(f"Inference will be done on device : {device}")

    asr_model, model_name = setup_model(cfg, map_location)

    model_cfg = copy.deepcopy(asr_model._cfg)
    OmegaConf.set_struct(model_cfg.preprocessor, False)
    # some changes for streaming scenario
    model_cfg.preprocessor.dither = 0.0
    model_cfg.preprocessor.pad_to = 0

    if model_cfg.preprocessor.normalize != "per_feature":
        logging.error("Only EncDecCTCModelBPE models trained with per_feature normalization are supported currently")

    # Disable config overwriting
    OmegaConf.set_struct(model_cfg.preprocessor, True)

    # setup AMP (optional)
    if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
        logging.info("AMP enabled!\n")
        autocast = torch.cuda.amp.autocast
    else:

        @contextlib.contextmanager
        def autocast():
            yield

    # Compute output filename
    cfg = compute_output_filename(cfg, model_name)

    # if transcripts should not be overwritten, and already exist, skip the re-transcription step and return
    if not cfg.overwrite_transcripts and os.path.exists(cfg.output_filename):
        logging.info(
            f"Previous transcripts found at {cfg.output_filename}, and flag `overwrite_transcripts`"
            f"is {cfg.overwrite_transcripts}. Returning without re-transcribing text."
        )
        return cfg

    asr_model.eval()
    asr_model = asr_model.to(asr_model.device)

    feature_stride = model_cfg.preprocessor['window_stride']
    model_stride_in_secs = feature_stride * cfg.model_stride
    total_buffer = cfg.total_buffer_in_secs
    chunk_len = float(cfg.chunk_len_in_secs)

    tokens_per_chunk = math.ceil(chunk_len / model_stride_in_secs)
    mid_delay = math.ceil((chunk_len + (total_buffer - chunk_len) / 2) / model_stride_in_secs)
    logging.info(f"tokens_per_chunk is {tokens_per_chunk}, mid_delay is {mid_delay}")

    frame_asr = FrameBatchASR(
        asr_model=asr_model, frame_len=chunk_len, total_buffer=cfg.total_buffer_in_secs, batch_size=cfg.batch_size,
    )

    hyps = get_buffered_pred_feat(
        frame_asr,
        chunk_len,
        tokens_per_chunk,
        mid_delay,
        model_cfg.preprocessor,
        model_stride_in_secs,
        asr_model.device,
        manifest,
        filepaths,
    )
    output_filename = write_transcription(hyps, cfg, model_name, filepaths=filepaths, compute_langs=False)
    logging.info(f"Finished writing predictions to {output_filename}!")

    return cfg


if __name__ == '__main__':
    main()  # noqa pylint: disable=no-value-for-parameter
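
As a worked example of the chunking arithmetic in the script above, the small sketch below assumes a preprocessor `window_stride` of 0.01 s (common for NeMo Conformer and Citrinet configs, but an assumption here rather than a value read from a checkpoint) together with the script defaults of a 1.6 s chunk, a 4.0 s buffer, and model stride 4.

import math

window_stride = 0.01   # seconds per feature frame (assumed, not read from a checkpoint)
model_stride = 4       # e.g. Conformer downsampling factor
chunk_len = 1.6        # chunk_len_in_secs default
total_buffer = 4.0     # total_buffer_in_secs default

model_stride_in_secs = window_stride * model_stride  # 0.04 s of audio per output token
tokens_per_chunk = math.ceil(chunk_len / model_stride_in_secs)
mid_delay = math.ceil((chunk_len + (total_buffer - chunk_len) / 2) / model_stride_in_secs)

print(tokens_per_chunk, mid_delay)  # 40 70 with these values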
speech_to_text_buffered_infer_rnnt.py
ADDED
@@ -0,0 +1,247 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Script to perform buffered inference using RNNT models.

Buffered inference is the primary form of audio transcription when the audio segment is longer than 20-30 seconds.
This is especially useful for models such as Conformers, which have quadratic time and memory scaling with
audio duration.

The difference between streaming and buffered inference is the chunk size (or the latency of inference).
Buffered inference will use large chunk sizes (5-10 seconds) + some additional buffer for context.
Streaming inference will use small chunk sizes (0.1 to 0.25 seconds) + some additional buffer for context.

# Middle Token merge algorithm

python speech_to_text_buffered_infer_rnnt.py \
    model_path=null \
    pretrained_name=null \
    audio_dir="<remove or path to folder of audio files>" \
    dataset_manifest="<remove or path to manifest>" \
    output_filename="<remove or specify output filename>" \
    total_buffer_in_secs=4.0 \
    chunk_len_in_secs=1.6 \
    model_stride=4 \
    batch_size=32

# Longest Common Subsequence (LCS) Merge algorithm

python speech_to_text_buffered_infer_rnnt.py \
    model_path=null \
    pretrained_name=null \
    audio_dir="<remove or path to folder of audio files>" \
    dataset_manifest="<remove or path to manifest>" \
    output_filename="<remove or specify output filename>" \
    total_buffer_in_secs=4.0 \
    chunk_len_in_secs=1.6 \
    model_stride=4 \
    batch_size=32 \
    merge_algo="lcs" \
    lcs_alignment_dir=<OPTIONAL: Some path to store the LCS alignments>

# NOTE:
    You can use `DEBUG=1 python speech_to_text_buffered_infer_rnnt.py ...` to print out the
    predictions of the model, and ground-truth text if present in the manifest.
"""
import copy
import glob
import math
import os
from dataclasses import dataclass, is_dataclass
from typing import Optional

import torch
from omegaconf import OmegaConf, open_dict

from nemo.collections.asr.parts.utils.streaming_utils import (
    BatchedFrameASRRNNT,
    LongestCommonSubsequenceBatchedFrameASRRNNT,
)
from nemo.collections.asr.parts.utils.transcribe_utils import (
    compute_output_filename,
    get_buffered_pred_feat_rnnt,
    setup_model,
    write_transcription,
)
from nemo.core.config import hydra_runner
from nemo.utils import logging

can_gpu = torch.cuda.is_available()


@dataclass
class TranscriptionConfig:
    # Required configs
    model_path: Optional[str] = None  # Path to a .nemo file
    pretrained_name: Optional[str] = None  # Name of a pretrained model
    audio_dir: Optional[str] = None  # Path to a directory which contains audio files
    dataset_manifest: Optional[str] = None  # Path to dataset's JSON manifest

    # General configs
    output_filename: Optional[str] = None
    batch_size: int = 32
    num_workers: int = 0
    append_pred: bool = False  # Sets mode of work, if True it will add new field transcriptions.
    pred_name_postfix: Optional[str] = None  # If you need to use another model name, rather than standard one.

    # Chunked configs
    chunk_len_in_secs: float = 1.6  # Chunk length in seconds
    total_buffer_in_secs: float = 4.0  # Length of buffer (chunk + left and right padding) in seconds
    model_stride: int = 8  # Model downsampling factor, 8 for Citrinet models and 4 for Conformer models

    # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA
    # device anyway, and do inference on CPU only if CUDA device is not found.
    # If `cuda` is a negative number, inference will be on CPU only.
    cuda: Optional[int] = None
    audio_type: str = "wav"

    # Recompute model transcription, even if the output folder exists with scores.
    overwrite_transcripts: bool = True

    # Decoding configs
    max_steps_per_timestep: int = 5  # Maximum number of tokens decoded per acoustic timestep
    stateful_decoding: bool = False  # Whether to perform stateful decoding

    # Merge algorithm for transducers
    merge_algo: Optional[str] = 'middle'  # choices=['middle', 'lcs'], choice of algorithm to apply during inference.
    lcs_alignment_dir: Optional[str] = None  # Path to a directory to store LCS algo alignments


@hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
    torch.set_grad_enabled(False)

    if is_dataclass(cfg):
        cfg = OmegaConf.structured(cfg)

    if cfg.model_path is None and cfg.pretrained_name is None:
        raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!")
    if cfg.audio_dir is None and cfg.dataset_manifest is None:
        raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!")

    filepaths = None
    manifest = cfg.dataset_manifest
    if cfg.audio_dir is not None:
        filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True))
        manifest = None  # ignore dataset_manifest if audio_dir and dataset_manifest are both present

    # setup GPU
    if cfg.cuda is None:
        if torch.cuda.is_available():
            device = [0]  # use 0th CUDA device
            accelerator = 'gpu'
        else:
            device = 1
            accelerator = 'cpu'
    else:
        device = [cfg.cuda]
        accelerator = 'gpu'
    map_location = torch.device('cuda:{}'.format(device[0]) if accelerator == 'gpu' else 'cpu')
    logging.info(f"Inference will be done on device : {device}")

    asr_model, model_name = setup_model(cfg, map_location)

    model_cfg = copy.deepcopy(asr_model._cfg)
    OmegaConf.set_struct(model_cfg.preprocessor, False)
    # some changes for streaming scenario
    model_cfg.preprocessor.dither = 0.0
    model_cfg.preprocessor.pad_to = 0

    if model_cfg.preprocessor.normalize != "per_feature":
        logging.error("Only EncDecRNNTBPEModel models trained with per_feature normalization are supported currently")

    # Disable config overwriting
    OmegaConf.set_struct(model_cfg.preprocessor, True)

    # Compute output filename
    cfg = compute_output_filename(cfg, model_name)

    # if transcripts should not be overwritten, and already exist, skip the re-transcription step and return
    if not cfg.overwrite_transcripts and os.path.exists(cfg.output_filename):
        logging.info(
            f"Previous transcripts found at {cfg.output_filename}, and flag `overwrite_transcripts`"
            f"is {cfg.overwrite_transcripts}. Returning without re-transcribing text."
        )
        return cfg

    asr_model.freeze()
    asr_model = asr_model.to(asr_model.device)

    # Change Decoding Config
    decoding_cfg = asr_model.cfg.decoding
    with open_dict(decoding_cfg):
        if cfg.stateful_decoding:
            decoding_cfg.strategy = "greedy"
        else:
            decoding_cfg.strategy = "greedy_batch"
        decoding_cfg.preserve_alignments = True  # required to compute the middle token for transducers.
        decoding_cfg.fused_batch_size = -1  # temporarily stop fused batch during inference.

    asr_model.change_decoding_strategy(decoding_cfg)

    feature_stride = model_cfg.preprocessor['window_stride']
    model_stride_in_secs = feature_stride * cfg.model_stride
    total_buffer = cfg.total_buffer_in_secs
    chunk_len = float(cfg.chunk_len_in_secs)

    tokens_per_chunk = math.ceil(chunk_len / model_stride_in_secs)
    mid_delay = math.ceil((chunk_len + (total_buffer - chunk_len) / 2) / model_stride_in_secs)
    logging.info(f"tokens_per_chunk is {tokens_per_chunk}, mid_delay is {mid_delay}")

    if cfg.merge_algo == 'middle':
        frame_asr = BatchedFrameASRRNNT(
            asr_model=asr_model,
            frame_len=chunk_len,
            total_buffer=cfg.total_buffer_in_secs,
            batch_size=cfg.batch_size,
            max_steps_per_timestep=cfg.max_steps_per_timestep,
            stateful_decoding=cfg.stateful_decoding,
        )

    elif cfg.merge_algo == 'lcs':
        frame_asr = LongestCommonSubsequenceBatchedFrameASRRNNT(
            asr_model=asr_model,
            frame_len=chunk_len,
            total_buffer=cfg.total_buffer_in_secs,
            batch_size=cfg.batch_size,
            max_steps_per_timestep=cfg.max_steps_per_timestep,
            stateful_decoding=cfg.stateful_decoding,
            alignment_basepath=cfg.lcs_alignment_dir,
        )
        # Set the LCS algorithm delay.
        frame_asr.lcs_delay = math.floor(((total_buffer - chunk_len)) / model_stride_in_secs)

    else:
        raise ValueError("Invalid choice of merge algorithm for transducer buffered inference.")

    hyps = get_buffered_pred_feat_rnnt(
        asr=frame_asr,
        tokens_per_chunk=tokens_per_chunk,
        delay=mid_delay,
        model_stride_in_secs=model_stride_in_secs,
        batch_size=cfg.batch_size,
        manifest=manifest,
        filepaths=filepaths,
    )

    output_filename = write_transcription(hyps, cfg, model_name, filepaths=filepaths, compute_langs=False)
    logging.info(f"Finished writing predictions to {output_filename}!")

    return cfg


if __name__ == '__main__':
    main()  # noqa pylint: disable=no-value-for-parameter
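
The Space's app.py drives this script programmatically rather than through Hydra's command line; a comparable call with the LCS merge enabled might look like the sketch below. The model name, audio directory, and output paths are hypothetical placeholders; the config fields mirror the TranscriptionConfig defined above.

import speech_to_text_buffered_infer_rnnt as buffered_rnnt

# Hypothetical paths and model name; mirrors how app.py builds a TranscriptionConfig above.
rnnt_config = buffered_rnnt.TranscriptionConfig(
    pretrained_name="nvidia/stt_en_conformer_transducer_large",
    audio_dir="/tmp/long_audio",        # directory of 16 kHz mono wav files
    output_filename="output.json",
    audio_type="wav",
    overwrite_transcripts=True,
    model_stride=4,                     # Conformer downsampling factor
    chunk_len_in_secs=20.0,
    total_buffer_in_secs=30.0,
    merge_algo="lcs",                   # Longest Common Subsequence merge instead of the middle-token merge
    lcs_alignment_dir="/tmp/lcs_alignments",
)

buffered_rnnt.main(rnnt_config)          # writes one JSON line with 'pred_text' per utterance to output.json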