Yehor Smoliakov committed

Commit: 2483350
Parent(s): d395278

Files changed (16):
  1. .gitignore +3 -1
  2. README.md +9 -3
  3. app.py +141 -29
  4. example_1.wav +0 -0
  5. example_2.wav +0 -0
  6. example_3.wav +0 -0
  7. example_4.wav +0 -0
  8. example_5.wav +0 -0
  9. example_6.wav +0 -0
  10. requirements.txt +6 -2
  11. sample_1.wav +0 -3
  12. sample_2.wav +0 -3
  13. sample_3.wav +0 -3
  14. sample_4.wav +0 -3
  15. sample_5.wav +0 -3
  16. sample_6.wav +0 -3
.gitignore CHANGED
@@ -1,3 +1,5 @@
 .idea/
 .venv/
+.ruff_cache/
+
 flagged/
README.md CHANGED
@@ -5,14 +5,14 @@ colorFrom: blue
 colorTo: gray
 sdk: gradio
 app_file: app.py
-pinned: false
+pinned: true
 sdk_version: 4.41.0
 ---
 
 ## Install
 
 ```shell
-uv venv --python 3.12
+uv venv --python 3.10
 
 source .venv/bin/activate
 
@@ -20,4 +20,10 @@ uv pip install -r requirements.txt
 
 # in development mode
 uv pip install -r requirements-dev.txt
 ```
+
+## Run
+
+```shell
+python app.py
+```
app.py CHANGED
@@ -1,54 +1,139 @@
+import sys
 import time
 
+from importlib.metadata import version
+
 import torch
-import librosa
+import torchaudio
+import torchaudio.transforms as T
 
 import gradio as gr
 
 from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
 
+# Config
 model_name = "Yehor/w2v-bert-uk"
-device = "cpu"
-max_duration = 30
 
-asr_model = AutoModelForCTC.from_pretrained(model_name).to(device)
+min_duration = 0.5
+max_duration = 60
+
+concurrency_limit = 5
+use_torch_compile = False
+
+# Torch
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+# Load the model
+asr_model = AutoModelForCTC.from_pretrained(model_name, torch_dtype=torch_dtype, device_map=device)
 processor = Wav2Vec2BertProcessor.from_pretrained(model_name)
 
-audio_samples = [
-    "sample_1.wav",
-    "sample_2.wav",
-    "sample_3.wav",
-    "sample_4.wav",
-    "sample_5.wav",
-    "sample_6.wav",
+if use_torch_compile:
+    asr_model = torch.compile(asr_model)
+
+# Elements
+examples = [
+    "example_1.wav",
+    "example_2.wav",
+    "example_3.wav",
+    "example_4.wav",
+    "example_5.wav",
+    "example_6.wav",
 ]
 
-description_head = """
+examples_table = """
+| File | Text |
+| ------------- | ------------- |
+| `example_1.wav` | тема про яку не люблять говорити офіційні джерела у генштабі і міноборони це хімічна зброя окупанти вже тривалий час використовують хімічну зброю заборонену |
+| `example_2.wav` | всіма конвенціями якщо спочатку це були гранати з дронів то тепер фіксують випадки застосування |
+| `example_3.wav` | хімічних снарядів причому склад отруйної речовони різний а отже й наслідки для наших військових теж різні |
+| `example_4.wav` | використовує на фронті все що має і хімічна зброя не вийняток тож з чим маємо справу розбиралася марія моганисян |
+| `example_5.wav` | двох тисяч випадків застосування росіянами боєприпасів споряджених небезпечними хімічними речовинами |
+| `example_6.wav` | на всі писані норми марія моганисян олександр моторний спецкор марафон єдині новини |
+""".strip()
+
+# https://www.tablesgenerator.com/markdown_tables
+authors_table = """
+## Authors
+
+Follow them on social networks and **contact** them if you need any help or have any questions:
+
+| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
+|-------------------------------------------------------------------------------------------------|
+| https://t.me/smlkw in Telegram |
+| https://x.com/yehor_smoliakov at X |
+| https://github.com/egorsmkv at GitHub |
+| https://huggingface.co/Yehor at Hugging Face |
+| or use egorsmkv@gmail.com |
+""".strip()
+
+description_head = f"""
 # Speech-to-Text for Ukrainian
 
 ## Overview
 
-This space uses https://huggingface.co/Yehor/w2v-bert-uk model that solves
-a Speech-to-Text task for the Ukrainian language.
+This space uses the https://huggingface.co/Yehor/w2v-bert-uk model to recognize audio files.
+
+> Due to resource limitations, audio duration **must not** exceed **{max_duration}** seconds.
 """.strip()
 
-description_foot = """
+description_foot = f"""
 ## Community
 
 - **Discord**: https://discord.gg/yVAjkBgmt4
 - Speech Recognition: https://t.me/speech_recognition_uk
 - Speech Synthesis: https://t.me/speech_synthesis_uk
+
+## More
+
+Check out other ASR models: https://github.com/egorsmkv/speech-recognition-uk
+
+{authors_table}
+""".strip()
+
+transcription_value = """
+Recognized text will appear here.
+
+Choose **an example file** below the Recognize button, upload **your audio file**, or use **the microphone** to record your own voice.
+""".strip()
+
+tech_env = f"""
+#### Environment
+
+- Python: {sys.version}
+- Torch device: {device}
+- Torch dtype: {torch_dtype}
+- Use torch.compile: {use_torch_compile}
+""".strip()
+
+tech_libraries = f"""
+#### Libraries
+
+- torch: {version('torch')}
+- torchaudio: {version('torchaudio')}
+- transformers: {version('transformers')}
+- accelerate: {version('accelerate')}
+- gradio: {version('gradio')}
 """.strip()
 
 
 def inference(audio_path, progress=gr.Progress()):
-    gr.Info("Starting process", duration=2)
+    if not audio_path:
+        raise gr.Error("Please upload an audio file.")
 
-    progress(0, desc="Starting")
+    gr.Info("Starting recognition", duration=2)
 
-    duration = librosa.get_duration(path=audio_path)
+    progress(0, desc="Recognizing")
+
+    meta = torchaudio.info(audio_path)
+    duration = meta.num_frames / meta.sample_rate
+
+    if duration < min_duration:
+        raise gr.Error(
+            f"The duration of the file is less than {min_duration} seconds, it is {round(duration, 2)} seconds."
+        )
     if duration > max_duration:
-        raise gr.Error("The duration of the file exceeds 10 seconds.")
+        raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")
 
     paths = [
         audio_path,
@@ -59,18 +144,35 @@ def inference(audio_path, progress=gr.Progress()):
     for path in progress.tqdm(paths, desc="Recognizing...", unit="file"):
         t0 = time.time()
 
-        audio_duration = librosa.get_duration(path=path, sr=16_000)
-        audio_input, _ = librosa.load(path, mono=True, sr=16_000)
+        meta = torchaudio.info(path)
+        audio_duration = meta.num_frames / meta.sample_rate
+
+        audio_input, sr = torchaudio.load(path)
+
+        if meta.num_channels > 1:
+            audio_input = torch.mean(audio_input, dim=0, keepdim=True)
+
+        if meta.sample_rate != 16_000:
+            resampler = T.Resample(sr, 16_000, dtype=audio_input.dtype)
+            audio_input = resampler(audio_input)
+
+        audio_input = audio_input.squeeze().numpy()
 
         features = processor([audio_input], sampling_rate=16_000).input_features
         features = torch.tensor(features).to(device)
 
+        if torch_dtype == torch.float16:
+            features = features.half()
+
         with torch.inference_mode():
            logits = asr_model(features).logits
 
         predicted_ids = torch.argmax(logits, dim=-1)
         predictions = processor.batch_decode(predicted_ids)
 
+        if not predictions:
+            predictions = "-"
+
         elapsed_time = round(time.time() - t0, 2)
         rtf = round(elapsed_time / audio_duration, 4)
         audio_duration = round(audio_duration, 2)
@@ -84,7 +186,7 @@ def inference(audio_path, progress=gr.Progress()):
             }
         )
 
-    gr.Info("Finished...", duration=2)
+    gr.Info("Finished!", duration=2)
 
     result_texts = []
 
@@ -103,29 +205,39 @@ def inference(audio_path, progress=gr.Progress()):
 demo = gr.Blocks(
     title="Speech-to-Text for Ukrainian",
     analytics_enabled=False,
+    theme=gr.themes.Base(),
 )
 
 with demo:
     gr.Markdown(description_head)
 
-    gr.Markdown(f"## Demo (max. duration: **{max_duration}** seconds)")
+    gr.Markdown("## Usage")
 
     with gr.Row():
         audio_file = gr.Audio(label="Audio file", type="filepath")
         transcription = gr.Markdown(
             label="Transcription",
-            value="Recognized text will appear here. Use **an example file** below the Recognize button,"
-            "upload **your audio file**, or use **the microphone** to record something...",
+            value=transcription_value,
         )
 
-    gr.Button("Recognize").click(inference, inputs=audio_file, outputs=transcription)
+    gr.Button("Recognize").click(
+        inference,
+        concurrency_limit=concurrency_limit,
+        inputs=audio_file,
+        outputs=transcription,
+    )
 
     with gr.Row():
-        gr.Examples(
-            label="Choose an example audio", inputs=audio_file, examples=audio_samples
-        )
+        gr.Examples(label="Choose an example", inputs=audio_file, examples=examples)
+
+    gr.Markdown(examples_table)
 
     gr.Markdown(description_foot)
 
+    gr.Markdown("### This Gradio app uses:")
+    gr.Markdown(tech_env)
+    gr.Markdown(tech_libraries)
+
 if __name__ == "__main__":
+    demo.queue()
     demo.launch()
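For a quick check outside the Gradio UI, the commit's new load → mono → 16 kHz → greedy CTC decode path boils down to the following minimal sketch. It is distilled from the diff above under simplifying assumptions: CPU/float32 only (the float16 branch, duration guards, and timing bookkeeping from app.py are omitted), and `example_1.wav` from this commit is assumed to sit in the working directory.

```python
import torch
import torchaudio
import torchaudio.transforms as T
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor

model_name = "Yehor/w2v-bert-uk"

asr_model = AutoModelForCTC.from_pretrained(model_name)
processor = Wav2Vec2BertProcessor.from_pretrained(model_name)

audio_input, sr = torchaudio.load("example_1.wav")  # (channels, frames)

# Downmix multi-channel audio to mono, as app.py does.
if audio_input.shape[0] > 1:
    audio_input = torch.mean(audio_input, dim=0, keepdim=True)

# The model expects 16 kHz input; resample anything else.
if sr != 16_000:
    audio_input = T.Resample(sr, 16_000, dtype=audio_input.dtype)(audio_input)

features = processor(
    [audio_input.squeeze().numpy()], sampling_rate=16_000
).input_features
features = torch.tensor(features)

with torch.inference_mode():
    logits = asr_model(features).logits

# Greedy CTC decoding: argmax over the vocabulary, collapsed by the tokenizer.
predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids))
```

On top of this, app.py reports a real-time factor per file, i.e. elapsed processing time divided by audio duration.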
example_1.wav ADDED
Binary file (273 kB)

example_2.wav ADDED
Binary file (200 kB)

example_3.wav ADDED
Binary file (193 kB)

example_4.wav ADDED
Binary file (241 kB)

example_5.wav ADDED
Binary file (193 kB)

example_6.wav ADDED
Binary file (186 kB)
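The six replacement clips can be checked against the new duration guards with the same `torchaudio.info` call app.py relies on; a minimal sketch, assuming the files sit in the working directory:

```python
import torchaudio

# Bounds from the commit's config block in app.py.
min_duration, max_duration = 0.5, 60

for i in range(1, 7):
    meta = torchaudio.info(f"example_{i}.wav")
    duration = meta.num_frames / meta.sample_rate
    assert min_duration <= duration <= max_duration, f"example_{i}.wav is out of bounds"
    print(f"example_{i}.wav: {duration:.2f} s @ {meta.sample_rate} Hz, {meta.num_channels} ch")
```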
 
requirements.txt CHANGED
@@ -3,6 +3,10 @@ gradio
 torch
 torchaudio
 
+soundfile
+
+triton
+setuptools
+
 transformers
-
-librosa
+accelerate
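After installing, the updated dependency set (librosa dropped; soundfile, triton, setuptools, and accelerate added) can be verified with the same `importlib.metadata` calls the app's `tech_libraries` block uses; a minimal sketch over the packages listed in the new requirements.txt:

```python
from importlib.metadata import PackageNotFoundError, version

for pkg in ("gradio", "torch", "torchaudio", "soundfile",
            "triton", "setuptools", "transformers", "accelerate"):
    try:
        print(f"{pkg}: {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
```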
sample_1.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:172ade978b299f4a0c47e3b76666d1a06161e6001fbb5591b82038a1bbc4b5ad
-size 272568

sample_2.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:98fe42f22f8ea632714081a958dc035f3d507523fd340b320a1223ac2f55ccac
-size 199942

sample_3.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:83c0b7375beada8cee74b5de226da494368fcc6a3ce692913b3302dcda0bd9a2
-size 192842

sample_4.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:19e466ee9c0c129c1eecf93eb6791a44c2ee8d68dce2c3e8fd3734b87f28324a
-size 241442

sample_5.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5af19120c92859846a08496e0a617c21877cae2db5807d211f0a431d95163a3e
-size 193388

sample_6.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ac877968d5749438930339497f7548046003390a848496136f6cbe8a74c51629
-size 186290