Spaces:

k2-fsa
/

automatic-speech-recognition-with-whisper

Running

App Files Files Community

csukuangfj committed on Aug 16, 2023

Commit

3330d20

1 Parent(s): 3761eac

first commit

Browse files

Files changed (4) hide show

README.md +6 -5
app.py +290 -0
model.py +126 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,10 +1,11 @@
 ---
-title: Automatic Speech Recognition With Whisper
-emoji: 👀
-colorFrom: purple
-colorTo: purple
 sdk: gradio
-sdk_version: 3.40.1
 app_file: app.py
 pinned: false
 license: apache-2.0

 ---
+title: Automatic Speech Recognition
+emoji: 🌖
+colorFrom: yellow
+colorTo: green
 sdk: gradio
+python_version: 3.8.9
+sdk_version: 3.0.26
 app_file: app.py
 pinned: false
 license: apache-2.0

app.py ADDED Viewed

	@@ -0,0 +1,290 @@

+#!/usr/bin/env python3
+#
+# Copyright      2022-2023  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# References:
+# https://gradio.app/docs/#dropdown
+import logging
+import os
+import tempfile
+import time
+from datetime import datetime
+import gradio as gr
+import soundfile as sf
+import urllib.request
+from examples import examples
+from model import decode, get_pretrained_model, whisper_models
+# model.py exposes only `whisper_models`; no `language_to_models` table is
+# imported into this file, so referencing it raised NameError at import time.
+# Derive the table from the available models instead: names ending in ".en"
+# go under "English", the remaining ones under "Multilingual".
+language_to_models = {
+    "English": [name for name in whisper_models if name.endswith(".en")],
+    "Multilingual": [name for name in whisper_models if not name.endswith(".en")],
+}
+languages = list(language_to_models.keys())
+def convert_to_wav(in_filename: str) -> str:
+    """Convert the input audio file to a 16 kHz, single-channel wave file.
+
+    Args:
+      in_filename:
+        Path to the input audio file.
+    Returns:
+      Return the path of the converted file, i.e., ``in_filename`` with
+      ".wav" appended. The path is returned even if ffmpeg cannot be run or
+      exits with an error; the failure is logged and the file may not exist.
+    """
+    import subprocess
+
+    out_filename = in_filename + ".wav"
+    logging.info(f"Converting '{in_filename}' to '{out_filename}'")
+    # Pass the arguments as a list (no shell) so that file names containing
+    # quotes or shell metacharacters can neither break nor inject into the
+    # command line.
+    cmd = [
+        "ffmpeg",
+        "-hide_banner",
+        "-y",  # overwrite a stale output instead of prompting on stdin
+        "-i",
+        in_filename,
+        "-ar",
+        "16000",
+        "-ac",
+        "1",
+        out_filename,
+    ]
+    try:
+        ret = subprocess.run(cmd, stdin=subprocess.DEVNULL, check=False)
+    except OSError as e:
+        # e.g., ffmpeg is not installed. Keep the original contract of
+        # never raising from this function.
+        logging.error(f"Failed to run ffmpeg: {e}")
+        return out_filename
+    if ret.returncode != 0:
+        logging.error(f"ffmpeg exited with code {ret.returncode}")
+    return out_filename
+def build_html_output(s: str, style: str = "result_item_success"):
+    """Wrap ``s`` in the nested <div> markup used to show results.
+
+    Args:
+      s:
+        The content to display. It is inserted as is, without escaping.
+      style:
+        Extra CSS class added next to "result_item" on the inner <div>.
+    Returns:
+      Return a str containing the HTML snippet.
+    """
+    html_lines = (
+        "",
+        "    <div class='result'>",
+        f"        <div class='result_item {style}'>",
+        f"          {s}",
+        "        </div>",
+        "    </div>",
+        "    ",
+    )
+    return "\n".join(html_lines)
+def process_url(
+    repo_id: str,
+    url: str,
+):
+    """Download an audio file from ``url`` and recognize it.
+
+    Args:
+      repo_id:
+        Name of the model to use. It is forwarded to process().
+      url:
+        An http:// or https:// URL pointing to an audio file.
+    Returns:
+      Return a tuple containing:
+       - The recognized text; it is "" on failure
+       - An HTML snippet with either timing information or the error
+    """
+    from urllib.parse import urlparse
+
+    logging.info(f"Processing URL: {url}")
+    # The URL is typed in by the user. urlretrieve() also accepts other
+    # schemes such as file:, which would let a visitor read files from the
+    # machine running this app, so only allow http(s).
+    scheme = urlparse(url or "").scheme.lower()
+    if scheme not in ("http", "https"):
+        msg = "Please input a URL starting with http:// or https://"
+        logging.info(msg)
+        return "", build_html_output(msg, "result_item_error")
+
+    with tempfile.NamedTemporaryFile() as f:
+        try:
+            urllib.request.urlretrieve(url, f.name)
+            return process(
+                in_filename=f.name,
+                repo_id=repo_id,
+            )
+        except Exception as e:
+            logging.info(str(e))
+            return "", build_html_output(str(e), "result_item_error")
+        finally:
+            # convert_to_wav() writes its output to "<input>.wav". The
+            # temporary file is removed by the context manager, but the
+            # converted copy is not, so remove it here.
+            try:
+                os.remove(f.name + ".wav")
+            except OSError:
+                pass
+def process_uploaded_file(
+    repo_id: str,
+    in_filename: str,
+):
+    """Recognize speech from a file selected in the "Upload from disk" tab.
+
+    Args:
+      repo_id:
+        Name of the model to use. It is forwarded to process().
+      in_filename:
+        Path to the uploaded file. It is None or "" if nothing was uploaded.
+    Returns:
+      Return a tuple containing:
+       - The recognized text; it is "" on failure
+       - An HTML snippet with either timing information or the error
+    """
+    if in_filename in (None, ""):
+        hint = (
+            "Please first upload a file and then click "
+            'the button "submit for recognition"'
+        )
+        return "", build_html_output(hint, "result_item_error")
+
+    logging.info(f"Processing uploaded file: {in_filename}")
+    try:
+        result = process(repo_id=repo_id, in_filename=in_filename)
+    except Exception as e:
+        # Show the failure in the page instead of propagating it.
+        err = str(e)
+        logging.info(err)
+        return "", build_html_output(err, "result_item_error")
+    return result
+def process_microphone(
+    repo_id: str,
+    in_filename: str,
+):
+    """Recognize speech recorded in the "Record from microphone" tab.
+
+    Args:
+      repo_id:
+        Name of the model to use. It is forwarded to process().
+      in_filename:
+        Path to the recording. It is None or "" if nothing was recorded.
+    Returns:
+      Return a tuple containing:
+       - The recognized text; it is "" on failure
+       - An HTML snippet with either timing information or the error
+    """
+    if in_filename in (None, ""):
+        hint = (
+            "Please first click 'Record from microphone', speak, "
+            "click 'Stop recording', and then "
+            "click the button 'submit for recognition'"
+        )
+        return "", build_html_output(hint, "result_item_error")
+
+    logging.info(f"Processing microphone: {in_filename}")
+    try:
+        result = process(repo_id=repo_id, in_filename=in_filename)
+    except Exception as e:
+        # Show the failure in the page instead of propagating it.
+        err = str(e)
+        logging.info(err)
+        return "", build_html_output(err, "result_item_error")
+    return result
+def process(
+    repo_id: str,
+    in_filename: str,
+):
+    """Convert an audio file to wave, decode it, and report timing.
+
+    Args:
+      repo_id:
+        Name of the model to use. It is passed to get_pretrained_model().
+      in_filename:
+        Path to the input audio file. It is converted with convert_to_wav()
+        before decoding.
+    Returns:
+      Return a tuple containing:
+       - The recognized text
+       - An HTML snippet showing the wave duration, the processing time,
+         and the real-time factor (RTF)
+    """
+    logging.info(f"repo_id: {repo_id}")
+    logging.info(f"in_filename: {in_filename}")
+    filename = convert_to_wav(in_filename)
+    date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
+    logging.info(f"Started at {date_time}")
+    start = time.time()
+    recognizer = get_pretrained_model(
+        repo_id,
+    )
+    text = decode(recognizer, filename)
+    end = time.time()
+    # Take a fresh timestamp; reusing the one from the start would log the
+    # start time as the finish time.
+    date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
+    # soundfile is imported by this file as sf; torchaudio is not imported.
+    duration = sf.info(filename).duration
+    elapsed = end - start
+    # An empty wave has zero duration; avoid a ZeroDivisionError.
+    rtf = elapsed / duration if duration > 0 else float("inf")
+    logging.info(f"Finished at {date_time}. Elapsed: {elapsed: .3f} s")
+    info = f"""
+    Wave duration  : {duration: .3f} s <br/>
+    Processing time: {elapsed: .3f} s <br/>
+    RTF: {elapsed: .3f}/{duration: .3f} = {rtf:.3f} <br/>
+    """
+    # The elapsed time includes get_pretrained_model(), so the first run
+    # with a model is slower than the following ones.
+    if duration > 0 and rtf > 1:
+        info += (
+            "<br/>We are loading the model for the first run. "
+            "Please run again to measure the real RTF.<br/>"
+        )
+    logging.info(info)
+    logging.info(f"\nrepo_id: {repo_id}\nhyp: {text}")
+    return text, build_html_output(info)
+# Markdown heading rendered with gr.Markdown() at the top of the page.
+title = "# Automatic Speech Recognition with Next-gen Kaldi using Whisper models"
+# Markdown text rendered with gr.Markdown() at the bottom of the page.
+description = """
+This space shows how to do automatic speech recognition with Next-gen Kaldi
+using Whisper models.
+It is running on CPU within a docker container provided by Hugging Face.
+See more information by visiting the following links:
+- <https://github.com/k2-fsa/sherpa-onnx>
+If you want to deploy it locally, please see
+<https://k2-fsa.github.io/sherpa/>
+"""
+# css style is copied from
+# https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
+# It is passed to gr.Blocks() and defines the classes that
+# build_html_output() puts on its <div> elements.
+css = """
+.result {display:flex;flex-direction:column}
+.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
+.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
+.result_item_error {background-color:#ff7070;color:white;align-self:start}
+"""
+def update_model_dropdown(language: str):
+    """Return a dropdown update listing the models of ``language``.
+
+    NOTE(review): ``language_to_models`` is not imported in the visible
+    part of this file -- confirm where it is supposed to come from.
+
+    Args:
+      language:
+        A key of ``language_to_models``.
+    Returns:
+      Return the value of gr.Dropdown.update() with the models of
+      ``language`` as choices and the first one selected.
+    Raises:
+      ValueError: If ``language`` is not a key of ``language_to_models``.
+    """
+    if language not in language_to_models:
+        raise ValueError(f"Unsupported language: {language}")
+    models = language_to_models[language]
+    return gr.Dropdown.update(choices=models, value=models[0])
+# Build the page: a model selector followed by three tabs, one per input
+# source. Every tab has its own submit button, text output, and info box.
+demo = gr.Blocks(css=css)
+with demo:
+    gr.Markdown(title)
+    # The unused `language_choices` lookup was dropped here: it read
+    # `language_to_models`, which is not imported by this file, and so
+    # raised NameError while the page was being built.
+    model_choices = list(whisper_models.keys())
+    model_dropdown = gr.Dropdown(
+        choices=model_choices,
+        label="Select a model",
+        value=model_choices[0],
+    )
+    with gr.Tabs():
+        with gr.TabItem("Upload from disk"):
+            uploaded_file = gr.Audio(
+                source="upload",  # Choose between "microphone", "upload"
+                type="filepath",
+                optional=False,
+                label="Upload from disk",
+            )
+            upload_button = gr.Button("Submit for recognition")
+            uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
+            uploaded_html_info = gr.HTML(label="Info")
+        with gr.TabItem("Record from microphone"):
+            microphone = gr.Audio(
+                source="microphone",  # Choose between "microphone", "upload"
+                type="filepath",
+                optional=False,
+                label="Record from microphone",
+            )
+            record_button = gr.Button("Submit for recognition")
+            recorded_output = gr.Textbox(label="Recognized speech from recordings")
+            recorded_html_info = gr.HTML(label="Info")
+        with gr.TabItem("From URL"):
+            url_textbox = gr.Textbox(
+                max_lines=1,
+                placeholder="URL to an audio file",
+                label="URL",
+                interactive=True,
+            )
+            url_button = gr.Button("Submit for recognition")
+            url_output = gr.Textbox(label="Recognized speech from URL")
+            url_html_info = gr.HTML(label="Info")
+        # Each handler receives the selected model name and the input of
+        # its tab, and returns (text, html) for the two outputs of the tab.
+        upload_button.click(
+            process_uploaded_file,
+            inputs=[
+                model_dropdown,
+                uploaded_file,
+            ],
+            outputs=[uploaded_output, uploaded_html_info],
+        )
+        record_button.click(
+            process_microphone,
+            inputs=[
+                model_dropdown,
+                microphone,
+            ],
+            outputs=[recorded_output, recorded_html_info],
+        )
+        url_button.click(
+            process_url,
+            inputs=[
+                model_dropdown,
+                url_textbox,
+            ],
+            outputs=[url_output, url_html_info],
+        )
+    gr.Markdown(description)
+if __name__ == "__main__":
+    # Log the time, the level, and the source location with every message.
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+    )
+    demo.launch()

model.py ADDED Viewed

	@@ -0,0 +1,126 @@

+# Copyright      2022  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import lru_cache
+from huggingface_hub import hf_hub_download
+import sherpa_onnx
+import numpy as np
+from typing import Tuple
+import wave
+# Module-level sample rate, presumably in Hz.
+# NOTE(review): it is not read by any function visible in this file;
+# decode() uses the rate returned by read_wave() instead -- confirm
+# whether this constant is still needed.
+sample_rate = 16000
+def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
+    """Read a single-channel, 16-bit wave file.
+
+    Args:
+      wave_filename:
+        Path to a wave file. It should be single channel and each sample should
+        be 16-bit. Its sample rate does not need to be 16kHz.
+    Returns:
+      Return a tuple containing:
+       - A 1-D array of dtype np.float32 containing the samples, which are
+       normalized to the range [-1, 1].
+       - sample rate of the wave file
+    Raises:
+      ValueError: If the file is not single channel or its samples are not
+        16-bit. An explicit exception is used instead of assert so that the
+        check is not removed when Python runs with -O.
+    """
+    with wave.open(wave_filename) as f:
+        num_channels = f.getnchannels()
+        if num_channels != 1:
+            raise ValueError(
+                f"Expected a single channel wave file. Given: {num_channels} channels"
+            )
+        sample_width = f.getsampwidth()  # it is in bytes
+        if sample_width != 2:
+            raise ValueError(
+                f"Expected 16-bit samples. Given: {sample_width * 8}-bit samples"
+            )
+        num_samples = f.getnframes()
+        samples = f.readframes(num_samples)
+        samples_int16 = np.frombuffer(samples, dtype=np.int16)
+        samples_float32 = samples_int16.astype(np.float32)
+        # 32768 == 2**15, so that int16 values are mapped into [-1, 1]
+        samples_float32 = samples_float32 / 32768
+        return samples_float32, f.getframerate()
+def decode(
+    recognizer: sherpa_onnx.OfflineRecognizer,
+    filename: str,
+) -> str:
+    """Decode a wave file with the given recognizer.
+
+    Args:
+      recognizer:
+        The recognizer used to create and decode the stream.
+      filename:
+        Path to a wave file. It is read with read_wave().
+    Returns:
+      Return the recognized text converted to lowercase.
+    """
+    stream = recognizer.create_stream()
+    waveform, rate = read_wave(filename)
+    stream.accept_waveform(rate, waveform)
+    recognizer.decode_stream(stream)
+    hyp = stream.result.text
+    return hyp.lower()
+def _get_nn_model_filename(
+    repo_id: str,
+    filename: str,
+    subfolder: str = ".",
+) -> str:
+    """Fetch a model file with hf_hub_download().
+
+    Args:
+      repo_id:
+        The repo to fetch from.
+      filename:
+        Name of the model file inside the repo.
+      subfolder:
+        Folder inside the repo that contains ``filename``.
+    Returns:
+      Return the value returned by hf_hub_download().
+    """
+    return hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        subfolder=subfolder,
+    )
+def _get_token_filename(
+    repo_id: str,
+    filename: str,
+    subfolder: str = ".",
+) -> str:
+    """Fetch a tokens file with hf_hub_download().
+
+    Args:
+      repo_id:
+        The repo to fetch from.
+      filename:
+        Name of the tokens file inside the repo.
+      subfolder:
+        Folder inside the repo that contains ``filename``.
+    Returns:
+      Return the value returned by hf_hub_download().
+    """
+    return hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        subfolder=subfolder,
+    )
+@lru_cache(maxsize=8)
+def get_pretrained_model(name: str) -> sherpa_onnx.OfflineRecognizer:
+    """Fetch a Whisper model and create a recognizer from it.
+
+    Results are cached with lru_cache, so calling this function again with
+    the same name returns the recognizer created before.
+
+    Args:
+      name:
+        Name of the model. It must be one of
+        tiny.en, base.en, small.en, tiny, base, small.
+    Returns:
+      Return a recognizer created by
+      sherpa_onnx.OfflineRecognizer.from_whisper().
+    Raises:
+      ValueError: If ``name`` is not supported. An explicit exception is
+        used instead of assert so that the check is not removed when
+        Python runs with -O; ``name`` is used to build the repo id below.
+    """
+    supported = ("tiny.en", "base.en", "small.en", "tiny", "base", "small")
+    if name not in supported:
+        raise ValueError(
+            f"Unsupported model: {name!r}. Supported models: {', '.join(supported)}"
+        )
+    full_repo_id = "csukuangfj/sherpa-onnx-whisper-" + name
+    encoder = _get_nn_model_filename(
+        repo_id=full_repo_id,
+        filename=f"{name}-encoder.int8.ort",
+    )
+    decoder = _get_nn_model_filename(
+        repo_id=full_repo_id,
+        filename=f"{name}-decoder.int8.ort",
+    )
+    tokens = _get_token_filename(repo_id=full_repo_id, filename=f"{name}-tokens.txt")
+    recognizer = sherpa_onnx.OfflineRecognizer.from_whisper(
+        encoder=encoder,
+        decoder=decoder,
+        tokens=tokens,
+        num_threads=2,
+    )
+    return recognizer
+# Maps every supported model name to the function that loads it.
+# All names share the same loader, which takes the name as its argument.
+whisper_models = dict.fromkeys(
+    ("tiny.en", "base.en", "small.en", "tiny", "base", "small"),
+    get_pretrained_model,
+)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+soundfile
+sentencepiece>=0.1.96
+numpy
+huggingface_hub
+sherpa-onnx>=1.7.7