Commit d6446fc · Yehor Smoliakov committed
Parent(s): 3f925e7

Remove librosa

Files changed:
- README.md (+7, -1)
- app.py (+25, -11)
- requirements.txt (+2, -2)
README.md CHANGED

````diff
@@ -20,4 +20,10 @@ uv pip install -r requirements.txt
 
 # in development mode
 uv pip install -r requirements-dev.txt
-```
+```
+
+## Run
+
+```shell
+python app.py
+```
````
app.py CHANGED

```diff
@@ -2,7 +2,8 @@ import sys
 import time
 
 import torch
-import librosa
+import torchaudio
+import torchaudio.transforms as T
 
 import gradio as gr
 
```
```diff
@@ -74,7 +75,7 @@ description_head = f"""
 
 This space uses https://huggingface.co/Yehor/w2v-bert-2.0-uk-v2 model to recognize audio files.
 
-> …
+> Due to resource limitations, audio duration **must not** exceed **{max_duration}** seconds.
 """.strip()
 
 description_foot = f"""
```
```diff
@@ -93,7 +94,7 @@ Check out other ASR models: https://github.com/egorsmkv/speech-recognition-uk
 transcription_value = """
 Recognized text will appear here.
 
-Choose **an example file** below the Recognize button, upload **your audio file**, or use **the microphone** to record
+Choose **an example file** below the Recognize button, upload **your audio file**, or use **the microphone** to record own voice.
 """.strip()
 
 tech_env = f"""
```
```diff
@@ -108,10 +109,10 @@ tech_env = f"""
 tech_libraries = f"""
 #### Libraries
 
-- …
-- …
-- …
-- …
+- torch: {torch.__version__}
+- torchaudio: {torchaudio.__version__}
+- transformers: {transformers_version}
+- gradio: {gr.__version__}
 """.strip()
 
 
```
```diff
@@ -122,8 +123,10 @@ def inference(audio_path, progress=gr.Progress()):
     gr.Info("Starting recognition", duration=2)
 
     progress(0, desc="Recognizing")
+
+    meta = torchaudio.info(audio_path)
+    duration = meta.num_frames / meta.sample_rate
 
-    duration = librosa.get_duration(path=audio_path)
     if duration < min_duration:
         raise gr.Error(
             f"The duration of the file is less than {min_duration} seconds, it is {round(duration, 2)} seconds."
```
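The duration check now relies on torchaudio.info, which reads stream metadata without decoding any samples. A minimal standalone sketch of the same computation (the function name is illustrative, not from the commit):

```python
import torchaudio

def get_duration_seconds(path: str) -> float:
    # Reads header/stream metadata only; no audio is decoded.
    meta = torchaudio.info(path)
    return meta.num_frames / meta.sample_rate
```

One caveat: depending on the backend and container format, num_frames may be reported as 0, which would make every such file fail the minimum-duration check, so guarding against a zero value may be worthwhile.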
```diff
@@ -140,8 +143,19 @@ def inference(audio_path, progress=gr.Progress()):
     for path in progress.tqdm(paths, desc="Recognizing...", unit="file"):
         t0 = time.time()
 
-        …
-        …
+        meta = torchaudio.info(audio_path)
+        audio_duration = meta.num_frames / meta.sample_rate
+
+        audio_input, sr = torchaudio.load(path)
+
+        if meta.num_channels > 1:
+            audio_input = torch.mean(audio_input, dim=0, keepdim=True)
+
+        if meta.sample_rate != 16_000:
+            resampler = T.Resample(sr, 16_000, dtype=audio_input.dtype)
+            audio_input = resampler(audio_input)
+
+        audio_input = audio_input.squeeze().numpy()
 
         features = processor([audio_input], sampling_rate=16_000).input_features
         features = torch.tensor(features).to(device)
```
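This hunk rebuilds librosa's load-and-resample behavior from torchaudio primitives: load, optional mono downmix, optional resample to 16 kHz. Note that it reads metadata from audio_path while loading path; the two coincide for a single upload, but keying the checks off the loaded file is safer. A self-contained sketch under that assumption (the helper name is illustrative):

```python
import torch
import torchaudio
import torchaudio.transforms as T

def load_mono_16k(path: str) -> torch.Tensor:
    # (channels, frames) float tensor plus the file's native sample rate.
    waveform, sr = torchaudio.load(path)

    # Downmix multi-channel audio to a single channel.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample to the 16 kHz rate the processor expects.
    if sr != 16_000:
        waveform = T.Resample(sr, 16_000, dtype=waveform.dtype)(waveform)

    # Drop the channel dimension: shape (frames,).
    return waveform.squeeze(0)
```

The result feeds processor([waveform.numpy()], sampling_rate=16_000) exactly as the hunk does.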
```diff
@@ -196,7 +210,7 @@ demo = gr.Blocks(
 with demo:
     gr.Markdown(description_head)
 
-    gr.Markdown("## …
+    gr.Markdown("## Usage")
 
     with gr.Row():
         audio_file = gr.Audio(label="Audio file", type="filepath")
```
requirements.txt CHANGED

```diff
@@ -3,9 +3,9 @@ gradio
 torch
 torchaudio
 
+soundfile
+
 triton
 setuptools
 
 transformers
-
-librosa
```
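soundfile presumably steps in as the audio I/O backend that librosa previously pulled in, since torchaudio can use it for its load and info calls. Assuming a torchaudio build that exposes backend discovery, a quick sanity check:

```python
import torchaudio

# 'soundfile' should be listed once the package from requirements.txt is installed.
print(torchaudio.list_audio_backends())
```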