fully fledged

- app.py +25 -9
- requirements.txt +1 -1
- vad_utils.py +8 -4
app.py
CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
 import numpy as np
 from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps, read_audio
 import torch
+import pandas as pd
+import gdown
 
 probs = None
 audio_length_samples = None
@@ -14,26 +16,37 @@ def process_audio(audio_input):
     return make_visualization(probs, 512 / 16_000)
 
 def process_parameters(threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
-    print(probs)
     timestamps = probs2speech_timestamps(probs, audio_length_samples,
                                          threshold = threshold,
                                          min_speech_duration_ms = min_speech_duration_ms,
                                          min_silence_duration_ms=min_silence_duration_ms,
                                          window_size_samples=window_size_samples,
                                          speech_pad_ms=speech_pad_ms)
-
-
+    df = pd.DataFrame(timestamps)
+    df["note"] = ""
+    df.to_csv("timestamps.txt", sep = '\t', header=False, index=False)
+    return "timestamps.txt", df
 
-def
-
+def download_gdrive(id):
+    output_file = "audio.wav" # Replace "data_file.ext" with the desired output filename and extension
+
+    gdown.download(f"https://drive.google.com/uc?id={id}", output_file)
+    return "output_file.wav"
 
+def main():
 
 with gr.Blocks() as demo:
+    with gr.Row():
+        gdrive_str = gr.Text("File ID")
+        download_button = gr.Button("Download Audio")
+
     with gr.Row():
         audio_input = gr.Audio(type="filepath")
-        button1 = gr.Button("Compute Probabilities")
+        button1 = gr.Button("Compute Speech Probabilities")
         figure = gr.Plot()
 
+    download_button.click(download_gdrive, inputs=[gdrive_str], outputs=audio_input)
+
     button1.click(process_audio, inputs=[audio_input], outputs=figure)
 
     with gr.Row():
@@ -42,10 +55,13 @@ def main():
         min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)", value=100)
         window_size_samples = gr.Dropdown(label="Window Size Samples", choices=[512, 1024, 1536], value=1536)
         speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
-        button2 = gr.Button("
-
+        button2 = gr.Button("Compute Speech Timestamps")
+        output_file = gr.File()
+    with gr.Row():
+        output_df = gr.DataFrame()
 
-    button2.click(process_parameters, inputs=[threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms],
+    button2.click(process_parameters, inputs=[threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms],
+                  outputs=[output_file, output_df])
 
 demo.launch()
 
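For reference, the tail added to process_parameters simply dumps the timestamps to a tab-separated file with an empty note column. A standalone sketch of what it produces (the timestamp values below are made up; probs2speech_timestamps returns start/end sample positions):

    import pandas as pd

    # Hypothetical output of probs2speech_timestamps (start/end in samples):
    timestamps = [{"start": 1536, "end": 24064}, {"start": 35840, "end": 60928}]

    df = pd.DataFrame(timestamps)
    df["note"] = ""  # empty third column, handy for adding labels later
    df.to_csv("timestamps.txt", sep="\t", header=False, index=False)
    # timestamps.txt now holds tab-separated rows: start, end, note

One thing worth double-checking in download_gdrive: it saves to audio.wav but returns the literal string "output_file.wav"; returning output_file instead would keep the saved path and the value handed to gr.Audio in sync.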
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
 torchaudio
 onnxruntime
 gradio
-
+gdown
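gdown backs the new download_gdrive helper in app.py. A quick standalone sanity check of the dependency (the file id is a placeholder for a publicly shared Drive file):

    import gdown

    # <FILE_ID> stands in for a real, publicly shared Google Drive file id.
    gdown.download("https://drive.google.com/uc?id=<FILE_ID>", "audio.wav")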
vad_utils.py
CHANGED
@@ -4,6 +4,8 @@ import torchaudio
 from typing import Callable, List
 import torch.nn.functional as F
 import warnings
+import pandas as pd
+from matplotlib import pyplot as plt
 
 def get_speech_probs(audio: torch.Tensor,
                      # model,
@@ -156,13 +158,15 @@ def probs2speech_timestamps(speech_probs, audio_length_samples,
     return speeches
 
 def make_visualization(probs, step):
-    import pandas as pd
-    pd.DataFrame({'probs': probs},
-                 index=[x * step for x in range(len(probs))]).plot(figsize=(16, 8),
+    fig, ax = plt.subplots(figsize=(16, 8),)
+
+    pd.DataFrame({'probs': probs},
+                 index=[x * step for x in range(len(probs))]).plot(ax = ax,
                  kind='area', ylim=[0, 1.05], xlim=[0, len(probs) * step],
                  xlabel='seconds',
                  ylabel='speech probability',
                  colormap='tab20')
+    return fig
 
 torch.set_num_threads(1)
 
@@ -172,7 +176,7 @@ USE_ONNX = True # change this to True if you want to test onnx model
 
 model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                               model='silero_vad',
-                              force_reload=True,
+                              # force_reload=True,
                               onnx=USE_ONNX)
 (_,
  _, read_audio,
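Since make_visualization now returns the Matplotlib figure instead of plotting as a side effect, callers such as app.py's process_audio can hand the result straight to gr.Plot. A minimal usage sketch, with synthetic probabilities standing in for real model output:

    import numpy as np

    # Stand-in for get_speech_probs output: one probability per 512-sample window.
    probs = np.random.rand(200).tolist()

    fig = make_visualization(probs, 512 / 16_000)  # step = window length / sample rate
    fig.savefig("speech_probs.png")                # or return fig to a gr.Plot output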