Spaces:

k2-fsa
/

spoken-language-identification

Running

App Files Files Community

csukuangfj commited on Mar 24, 2024

Commit

060b671

1 Parent(s): ef9137c

add app

Browse files

Files changed (1) hide show

app.py +294 -1

app.py CHANGED Viewed

	@@ -1 +1,294 @@
1	- ~~print()~~

+#!/usr/bin/env python3
+#
+# Copyright      2022-2024  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# References:
+# https://gradio.app/docs/#dropdown
+import logging
+import os
+import tempfile
+import time
+import urllib.request
+from datetime import datetime
+from examples import examples
+import gradio as gr
+import soundfile as sf
+from model import decode, get_pretrained_model, whisper_models
+def convert_to_wav(in_filename: str) -> str:
+    """Convert the input audio file to a wave file"""
+    out_filename = in_filename + ".wav"
+    logging.info(f"Converting '{in_filename}' to '{out_filename}'")
+    _ = os.system(
+        f"ffmpeg -hide_banner -i '{in_filename}' -ar 16000 -ac 1 '{out_filename}' -y"
+    )
+    return out_filename
+def build_html_output(s: str, style: str = "result_item_success"):
+    return f"""
+    <div class='result'>
+        <div class='result_item {style}'>
+          {s}
+        </div>
+    </div>
+    """
+def process_url(
+    repo_id: str,
+    url: str,
+):
+    logging.info(f"Processing URL: {url}")
+    with tempfile.NamedTemporaryFile() as f:
+        try:
+            urllib.request.urlretrieve(url, f.name)
+            return process(
+                in_filename=f.name,
+                repo_id=repo_id,
+            )
+        except Exception as e:
+            logging.info(str(e))
+            return "", build_html_output(str(e), "result_item_error")
+def process_uploaded_file(
+    repo_id: str,
+    in_filename: str,
+):
+    if in_filename is None or in_filename == "":
+        return "", build_html_output(
+            "Please first upload a file and then click "
+            'the button "submit for recognition"',
+            "result_item_error",
+        )
+    logging.info(f"Processing uploaded file: {in_filename}")
+    try:
+        return process(
+            in_filename=in_filename,
+            repo_id=repo_id,
+        )
+    except Exception as e:
+        logging.info(str(e))
+        return "", build_html_output(str(e), "result_item_error")
+def process_microphone(
+    repo_id: str,
+    in_filename: str,
+):
+    if in_filename is None or in_filename == "":
+        return "", build_html_output(
+            "Please first click 'Record from microphone', speak, "
+            "click 'Stop recording', and then "
+            "click the button 'submit for recognition'",
+            "result_item_error",
+        )
+    logging.info(f"Processing microphone: {in_filename}")
+    try:
+        return process(
+            in_filename=in_filename,
+            repo_id=repo_id,
+        )
+    except Exception as e:
+        logging.info(str(e))
+        return "", build_html_output(str(e), "result_item_error")
+def process(
+    repo_id: str,
+    in_filename: str,
+):
+    logging.info(f"repo_id: {repo_id}")
+    logging.info(f"in_filename: {in_filename}")
+    filename = convert_to_wav(in_filename)
+    now = datetime.now()
+    date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
+    logging.info(f"Started at {date_time}")
+    start = time.time()
+    slid = get_pretrained_model(repo_id)
+    lang = decode(slid, filename)
+    date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
+    end = time.time()
+    info = sf.info(filename)
+    duration = info.duration
+    elapsed = end - start
+    rtf = elapsed / duration
+    logging.info(f"Finished at {date_time} s. Elapsed: {elapsed: .3f} s")
+    info = f"""
+    Wave duration  : {duration: .3f} s <br/>
+    Processing time: {elapsed: .3f} s <br/>
+    RTF: {elapsed: .3f}/{duration: .3f} = {rtf:.3f} <br/>
+    """
+    if rtf > 1:
+        info += (
+            "<br/>We are loading the model for the first run. "
+            "Please run again to measure the real RTF.<br/>"
+        )
+    logging.info(info)
+    logging.info(f"\nrepo_id: {repo_id}\nDetected language: {lang}")
+    return text, build_html_output(info)
+title = "# Spoken Language Identification: [Next-gen Kaldi](https://github.com/k2-fsa) + [Whisper](https://github.com/openai/whisper/)"
+description = """
+This space shows how to do spoken language identification with [Next-gen Kaldi](https://github.com/k2-fsa)
+using [Whisper](https://github.com/openai/whisper/) multilingual models.
+It is running on a machine with 2 vCPUs with 16 GB RAM within a docker container provided by Hugging Face.
+See more information by visiting the following links:
+- <https://github.com/k2-fsa/sherpa-onnx>
+If you want to deploy it locally, please see
+<https://k2-fsa.github.io/sherpa/onnx>
+"""
+# css style is copied from
+# https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
+css = """
+.result {display:flex;flex-direction:column}
+.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
+.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
+.result_item_error {background-color:#ff7070;color:white;align-self:start}
+"""
+demo = gr.Blocks(css=css)
+with demo:
+    gr.Markdown(title)
+    model_choices = list(whisper_models.keys())
+    model_dropdown = gr.Dropdown(
+        choices=model_choices,
+        label="Select a model",
+        value=model_choices[0],
+    )
+    with gr.Tabs():
+        with gr.TabItem("Upload from disk"):
+            uploaded_file = gr.Audio(
+                sources=["upload"],  # Choose between "microphone", "upload"
+                type="filepath",
+                label="Upload from disk",
+            )
+            upload_button = gr.Button("Submit for recognition")
+            uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
+            uploaded_html_info = gr.HTML(label="Info")
+            gr.Examples(
+                examples=examples,
+                inputs=[
+                    model_dropdown,
+                    uploaded_file,
+                ],
+                outputs=[uploaded_output, uploaded_html_info],
+                fn=process_uploaded_file,
+            )
+        with gr.TabItem("Record from microphone"):
+            microphone = gr.Audio(
+                sources=["microphone"],  # Choose between "microphone", "upload"
+                type="filepath",
+                label="Record from microphone",
+            )
+            record_button = gr.Button("Submit for recognition")
+            recorded_output = gr.Textbox(label="Recognized speech from recordings")
+            recorded_html_info = gr.HTML(label="Info")
+            gr.Examples(
+                examples=examples,
+                inputs=[
+                    model_dropdown,
+                    microphone,
+                ],
+                outputs=[recorded_output, recorded_html_info],
+                fn=process_microphone,
+            )
+        with gr.TabItem("From URL"):
+            url_textbox = gr.Textbox(
+                max_lines=1,
+                placeholder="URL to an audio file",
+                label="URL",
+                interactive=True,
+            )
+            url_button = gr.Button("Submit for recognition")
+            url_output = gr.Textbox(label="Recognized speech from URL")
+            url_html_info = gr.HTML(label="Info")
+        upload_button.click(
+            process_uploaded_file,
+            inputs=[
+                model_dropdown,
+                uploaded_file,
+            ],
+            outputs=[uploaded_output, uploaded_html_info],
+        )
+        record_button.click(
+            process_microphone,
+            inputs=[
+                model_dropdown,
+                microphone,
+            ],
+            outputs=[recorded_output, recorded_html_info],
+        )
+        url_button.click(
+            process_url,
+            inputs=[
+                model_dropdown,
+                url_textbox,
+            ],
+            outputs=[url_output, url_html_info],
+        )
+    gr.Markdown(description)
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    demo.launch()