Yehor Smoliakov committed

Commit: 2483350
Parent(s): d395278

Files changed (16):
  1. .gitignore +3 -1
  2. README.md +9 -3
  3. app.py +141 -29
  4. example_1.wav +0 -0
  5. example_2.wav +0 -0
  6. example_3.wav +0 -0
  7. example_4.wav +0 -0
  8. example_5.wav +0 -0
  9. example_6.wav +0 -0
  10. requirements.txt +6 -2
  11. sample_1.wav +0 -3
  12. sample_2.wav +0 -3
  13. sample_3.wav +0 -3
  14. sample_4.wav +0 -3
  15. sample_5.wav +0 -3
  16. sample_6.wav +0 -3
.gitignore CHANGED
@@ -1,3 +1,5 @@
 .idea/
 .venv/
+.ruff_cache/
+
 flagged/
README.md CHANGED
@@ -5,14 +5,14 @@ colorFrom: blue
 colorTo: gray
 sdk: gradio
 app_file: app.py
-pinned: false
+pinned: true
 sdk_version: 4.41.0
 ---
 
 ## Install
 
 ```shell
-uv venv --python 3.12
+uv venv --python 3.10
 
 source .venv/bin/activate
 
@@ -20,4 +20,10 @@ uv pip install -r requirements.txt
 
 # in development mode
 uv pip install -r requirements-dev.txt
 ```
+
+## Run
+
+```shell
+python app.py
+```
app.py CHANGED
@@ -1,54 +1,139 @@
+import sys
 import time
 
+from importlib.metadata import version
+
 import torch
-import librosa
+import torchaudio
+import torchaudio.transforms as T
 
 import gradio as gr
 
 from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
 
+# Config
 model_name = "Yehor/w2v-bert-uk"
-device = "cpu"
-max_duration = 30
 
-asr_model = AutoModelForCTC.from_pretrained(model_name).to(device)
+min_duration = 0.5
+max_duration = 60
+
+concurrency_limit = 5
+use_torch_compile = False
+
+# Torch
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+# Load the model
+asr_model = AutoModelForCTC.from_pretrained(model_name, torch_dtype=torch_dtype, device_map=device)
 processor = Wav2Vec2BertProcessor.from_pretrained(model_name)
 
-audio_samples = [
-    "sample_1.wav",
-    "sample_2.wav",
-    "sample_3.wav",
-    "sample_4.wav",
-    "sample_5.wav",
-    "sample_6.wav",
+if use_torch_compile:
+    asr_model = torch.compile(asr_model)
+
+# Elements
+examples = [
+    "example_1.wav",
+    "example_2.wav",
+    "example_3.wav",
+    "example_4.wav",
+    "example_5.wav",
+    "example_6.wav",
 ]
 
-description_head = """
+examples_table = """
+| File | Text |
+| ------------- | ------------- |
+| `example_1.wav` | тема про яку не люблять говорити офіційні джерела у генштабі і міноборони це хімічна зброя окупанти вже тривалий час використовують хімічну зброю заборонену |
+| `example_2.wav` | всіма конвенціями якщо спочатку це були гранати з дронів то тепер фіксують випадки застосування |
+| `example_3.wav` | хімічних снарядів причому склад отруйної речовони різний а отже й наслідки для наших військових теж різні |
+| `example_4.wav` | використовує на фронті все що має і хімічна зброя не вийняток тож з чим маємо справу розбиралася марія моганисян |
+| `example_5.wav` | двох тисяч випадків застосування росіянами боєприпасів споряджених небезпечними хімічними речовинами |
+| `example_6.wav` | на всі писані норми марія моганисян олександр моторний спецкор марафон єдині новини |
+""".strip()
+
+# https://www.tablesgenerator.com/markdown_tables
+authors_table = """
+## Authors
+
+Follow them on social networks and **contact** them if you need any help or have any questions:
+
+| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
+|-------------------------------------------------------------------------------------------------|
+| https://t.me/smlkw in Telegram |
+| https://x.com/yehor_smoliakov at X |
+| https://github.com/egorsmkv at GitHub |
+| https://huggingface.co/Yehor at Hugging Face |
+| or use egorsmkv@gmail.com |
+""".strip()
+
+description_head = f"""
 # Speech-to-Text for Ukrainian
 
 ## Overview
 
-This space uses https://huggingface.co/Yehor/w2v-bert-uk model that solves
-a Speech-to-Text task for the Ukrainian language.
+This space uses the https://huggingface.co/Yehor/w2v-bert-uk model to recognize audio files.
+
+> Due to resource limitations, audio duration **must not** exceed **{max_duration}** seconds.
 """.strip()
 
-description_foot = """
+description_foot = f"""
 ## Community
 
 - **Discord**: https://discord.gg/yVAjkBgmt4
 - Speech Recognition: https://t.me/speech_recognition_uk
 - Speech Synthesis: https://t.me/speech_synthesis_uk
+
+## More
+
+Check out other ASR models: https://github.com/egorsmkv/speech-recognition-uk
+
+{authors_table}
+""".strip()
+
+transcription_value = """
+Recognized text will appear here.
+
+Choose **an example file** below the Recognize button, upload **your audio file**, or use **the microphone** to record your own voice.
+""".strip()
+
+tech_env = f"""
+#### Environment
+
+- Python: {sys.version}
+- Torch device: {device}
+- Torch dtype: {torch_dtype}
+- Use torch.compile: {use_torch_compile}
+""".strip()
+
+tech_libraries = f"""
+#### Libraries
+
+- torch: {version('torch')}
+- torchaudio: {version('torchaudio')}
+- transformers: {version('transformers')}
+- accelerate: {version('accelerate')}
+- gradio: {version('gradio')}
 """.strip()
 
 
 def inference(audio_path, progress=gr.Progress()):
-    gr.Info("Starting process", duration=2)
+    if not audio_path:
+        raise gr.Error("Please upload an audio file.")
 
-    progress(0, desc="Starting")
+    gr.Info("Starting recognition", duration=2)
 
-    duration = librosa.get_duration(path=audio_path)
+    progress(0, desc="Recognizing")
+
+    meta = torchaudio.info(audio_path)
+    duration = meta.num_frames / meta.sample_rate
+
+    if duration < min_duration:
+        raise gr.Error(
+            f"The duration of the file is less than {min_duration} seconds, it is {round(duration, 2)} seconds."
+        )
     if duration > max_duration:
-        raise gr.Error("The duration of the file exceeds 10 seconds.")
+        raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")
 
     paths = [
         audio_path,
@@ -59,18 +144,35 @@ def inference(audio_path, progress=gr.Progress()):
     for path in progress.tqdm(paths, desc="Recognizing...", unit="file"):
         t0 = time.time()
 
-        audio_duration = librosa.get_duration(path=path, sr=16_000)
-        audio_input, _ = librosa.load(path, mono=True, sr=16_000)
+        meta = torchaudio.info(path)
+        audio_duration = meta.num_frames / meta.sample_rate
+
+        audio_input, sr = torchaudio.load(path)
+
+        if meta.num_channels > 1:
+            audio_input = torch.mean(audio_input, dim=0, keepdim=True)
+
+        if meta.sample_rate != 16_000:
+            resampler = T.Resample(sr, 16_000, dtype=audio_input.dtype)
+            audio_input = resampler(audio_input)
+
+        audio_input = audio_input.squeeze().numpy()
 
         features = processor([audio_input], sampling_rate=16_000).input_features
         features = torch.tensor(features).to(device)
 
+        if torch_dtype == torch.float16:
+            features = features.half()
+
         with torch.inference_mode():
            logits = asr_model(features).logits
 
         predicted_ids = torch.argmax(logits, dim=-1)
         predictions = processor.batch_decode(predicted_ids)
 
+        if not predictions:
+            predictions = "-"
+
         elapsed_time = round(time.time() - t0, 2)
         rtf = round(elapsed_time / audio_duration, 4)
         audio_duration = round(audio_duration, 2)
@@ -84,7 +186,7 @@ def inference(audio_path, progress=gr.Progress()):
             }
         )
 
-    gr.Info("Finished...", duration=2)
+    gr.Info("Finished!", duration=2)
 
     result_texts = []
 
@@ -103,29 +205,39 @@ def inference(audio_path, progress=gr.Progress()):
 demo = gr.Blocks(
     title="Speech-to-Text for Ukrainian",
     analytics_enabled=False,
+    theme=gr.themes.Base(),
 )
 
 with demo:
     gr.Markdown(description_head)
 
-    gr.Markdown(f"## Demo (max. duration: **{max_duration}** seconds)")
+    gr.Markdown("## Usage")
 
     with gr.Row():
         audio_file = gr.Audio(label="Audio file", type="filepath")
         transcription = gr.Markdown(
             label="Transcription",
-            value="Recognized text will appear here. Use **an example file** below the Recognize button,"
-            "upload **your audio file**, or use **the microphone** to record something...",
+            value=transcription_value,
         )
 
-    gr.Button("Recognize").click(inference, inputs=audio_file, outputs=transcription)
+    gr.Button("Recognize").click(
+        inference,
+        concurrency_limit=concurrency_limit,
+        inputs=audio_file,
+        outputs=transcription,
+    )
 
     with gr.Row():
-        gr.Examples(
-            label="Choose an example audio", inputs=audio_file, examples=audio_samples
-        )
+        gr.Examples(label="Choose an example", inputs=audio_file, examples=examples)
+
+    gr.Markdown(examples_table)
 
     gr.Markdown(description_foot)
 
+    gr.Markdown("### This Gradio app uses:")
+    gr.Markdown(tech_env)
+    gr.Markdown(tech_libraries)
+
 if __name__ == "__main__":
+    demo.queue()
     demo.launch()
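For a quick check outside the Gradio UI, the commit's new load → mono → 16 kHz → greedy CTC decode path boils down to the following minimal sketch. It is distilled from the diff above under simplifying assumptions: CPU/float32 only (the float16 branch, duration guards, and timing bookkeeping from app.py are omitted), and `example_1.wav` from this commit is assumed to sit in the working directory.

```python
import torch
import torchaudio
import torchaudio.transforms as T
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor

model_name = "Yehor/w2v-bert-uk"

asr_model = AutoModelForCTC.from_pretrained(model_name)
processor = Wav2Vec2BertProcessor.from_pretrained(model_name)

audio_input, sr = torchaudio.load("example_1.wav")  # (channels, frames)

# Downmix multi-channel audio to mono, as app.py does.
if audio_input.shape[0] > 1:
    audio_input = torch.mean(audio_input, dim=0, keepdim=True)

# The model expects 16 kHz input; resample anything else.
if sr != 16_000:
    audio_input = T.Resample(sr, 16_000, dtype=audio_input.dtype)(audio_input)

features = processor(
    [audio_input.squeeze().numpy()], sampling_rate=16_000
).input_features
features = torch.tensor(features)

with torch.inference_mode():
    logits = asr_model(features).logits

# Greedy CTC decoding: argmax over the vocabulary, collapsed by the tokenizer.
predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids))
```

On top of this, app.py reports a real-time factor per file, i.e. elapsed processing time divided by audio duration.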
example_1.wav ADDED
Binary file (273 kB)

example_2.wav ADDED
Binary file (200 kB)

example_3.wav ADDED
Binary file (193 kB)

example_4.wav ADDED
Binary file (241 kB)

example_5.wav ADDED
Binary file (193 kB)

example_6.wav ADDED
Binary file (186 kB)
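The six replacement clips can be checked against the new duration guards with the same `torchaudio.info` call app.py relies on; a minimal sketch, assuming the files sit in the working directory:

```python
import torchaudio

# Bounds from the commit's config block in app.py.
min_duration, max_duration = 0.5, 60

for i in range(1, 7):
    meta = torchaudio.info(f"example_{i}.wav")
    duration = meta.num_frames / meta.sample_rate
    assert min_duration <= duration <= max_duration, f"example_{i}.wav is out of bounds"
    print(f"example_{i}.wav: {duration:.2f} s @ {meta.sample_rate} Hz, {meta.num_channels} ch")
```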
 
requirements.txt CHANGED
@@ -3,6 +3,10 @@ gradio
 torch
 torchaudio
 
+soundfile
+
+triton
+setuptools
+
 transformers
-
-librosa
+accelerate
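After installing, the updated dependency set (librosa dropped; soundfile, triton, setuptools, and accelerate added) can be verified with the same `importlib.metadata` calls the app's `tech_libraries` block uses; a minimal sketch over the packages listed in the new requirements.txt:

```python
from importlib.metadata import PackageNotFoundError, version

for pkg in ("gradio", "torch", "torchaudio", "soundfile",
            "triton", "setuptools", "transformers", "accelerate"):
    try:
        print(f"{pkg}: {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
```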
sample_1.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:172ade978b299f4a0c47e3b76666d1a06161e6001fbb5591b82038a1bbc4b5ad
-size 272568

sample_2.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:98fe42f22f8ea632714081a958dc035f3d507523fd340b320a1223ac2f55ccac
-size 199942

sample_3.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:83c0b7375beada8cee74b5de226da494368fcc6a3ce692913b3302dcda0bd9a2
-size 192842

sample_4.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:19e466ee9c0c129c1eecf93eb6791a44c2ee8d68dce2c3e8fd3734b87f28324a
-size 241442

sample_5.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5af19120c92859846a08496e0a617c21877cae2db5807d211f0a431d95163a3e
-size 193388

sample_6.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ac877968d5749438930339497f7548046003390a848496136f6cbe8a74c51629
-size 186290