Everything working fine, all needed info
Browse files
- app.py +12 -9
- vad_utils.py +0 -1
app.py
CHANGED
@@ -5,14 +5,15 @@ import torch
|
|
5 |
import pandas as pd
|
6 |
import gdown
|
7 |
|
8 |
-
def process_audio(audio_input):
|
9 |
wav = read_audio(audio_input, sampling_rate=16_000)
|
10 |
audio_length_samples = len(wav)
|
11 |
-
probs = get_speech_probs(wav, sampling_rate=16_000)
|
12 |
return make_visualization(probs, 512 / 16_000), probs, audio_length_samples
|
13 |
|
14 |
def process_parameters(probs, audio_length_samples, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
|
15 |
-
|
|
|
16 |
timestamps = probs2speech_timestamps(probs, audio_length_samples,
|
17 |
threshold = threshold,
|
18 |
min_speech_duration_ms = min_speech_duration_ms,
|
@@ -21,7 +22,7 @@ def process_parameters(probs, audio_length_samples, threshold, min_speech_durati
|
|
21 |
speech_pad_ms=speech_pad_ms,
|
22 |
return_seconds=True,
|
23 |
rounding=3)
|
24 |
-
|
25 |
df = pd.DataFrame(timestamps)
|
26 |
df["note"] = ""
|
27 |
df.to_csv("timestamps.txt", sep = '\t', header=False, index=False)
|
@@ -46,18 +47,20 @@ def main():
|
|
46 |
|
47 |
with gr.Row():
|
48 |
audio_input = gr.Audio(type="filepath")
|
49 |
-
|
|
|
|
|
|
|
50 |
figure = gr.Plot()
|
51 |
|
52 |
download_button.click(download_gdrive, inputs=[gdrive_str], outputs=audio_input)
|
53 |
|
54 |
-
button1.click(process_audio, inputs=[audio_input], outputs=[figure, probs, audio_length_samples])
|
55 |
|
56 |
with gr.Row():
|
57 |
threshold = gr.Number(label="Threshold", value=0.6, minimum=0.0, maximum=1.0)
|
58 |
-
min_speech_duration_ms = gr.Number(label="
|
59 |
-
min_silence_duration_ms = gr.Number(label="
|
60 |
-
window_size_samples = gr.Dropdown(label="Window Size Samples", choices=[512, 1024, 1536], value=1536)
|
61 |
speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
|
62 |
button2 = gr.Button("Compute Speech Timestamps")
|
63 |
output_file = gr.File()
|
|
|
5 |
import pandas as pd
|
6 |
import gdown
|
7 |
|
8 |
+
def process_audio(audio_input, window_size_samples):
    """Compute speech probabilities for an audio file and plot them.

    Args:
        audio_input: Audio source handed to ``read_audio`` (the UI wires in
            a ``gr.Audio(type="filepath")`` component, so a file path).
        window_size_samples: Number of samples per analysis window; it is
            forwarded to ``get_speech_probs`` and also determines the time
            step used for the plot.

    Returns:
        A tuple ``(figure, probs, audio_length_samples)``: the result of
        ``make_visualization``, the speech probabilities, and the length of
        the loaded waveform in samples.
    """
    sampling_rate = 16_000
    wav = read_audio(audio_input, sampling_rate=sampling_rate)
    audio_length_samples = len(wav)
    probs = get_speech_probs(
        wav,
        window_size_samples=window_size_samples,
        sampling_rate=sampling_rate,
    )
    # The plot step must follow the selected window size. The previous
    # hard-coded 512 / 16_000 mislabels the time axis whenever 1024 or 1536
    # is chosen (assumes get_speech_probs yields one probability per window
    # -- TODO confirm against vad_utils).
    step_seconds = window_size_samples / sampling_rate
    return make_visualization(probs, step_seconds), probs, audio_length_samples
|
13 |
|
14 |
def process_parameters(probs, audio_length_samples, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
|
15 |
+
min_speech_duration_ms *= 1000
|
16 |
+
min_silence_duration_ms *= 1000
|
17 |
timestamps = probs2speech_timestamps(probs, audio_length_samples,
|
18 |
threshold = threshold,
|
19 |
min_speech_duration_ms = min_speech_duration_ms,
|
|
|
22 |
speech_pad_ms=speech_pad_ms,
|
23 |
return_seconds=True,
|
24 |
rounding=3)
|
25 |
+
print(timestamps)
|
26 |
df = pd.DataFrame(timestamps)
|
27 |
df["note"] = ""
|
28 |
df.to_csv("timestamps.txt", sep = '\t', header=False, index=False)
|
|
|
47 |
|
48 |
with gr.Row():
|
49 |
audio_input = gr.Audio(type="filepath")
|
50 |
+
with gr.Column():
|
51 |
+
md = gr.Markdown("[Parameter Documentation](https://github.com/snakers4/silero-vad/blob/master/utils_vad.py#L198)")
|
52 |
+
window_size_samples = gr.Dropdown(label="Window Size (samples)", choices=[512, 1024, 1536], value=512)
|
53 |
+
button1 = gr.Button("Compute Speech Probabilities")
|
54 |
figure = gr.Plot()
|
55 |
|
56 |
download_button.click(download_gdrive, inputs=[gdrive_str], outputs=audio_input)
|
57 |
|
58 |
+
button1.click(process_audio, inputs=[audio_input, window_size_samples], outputs=[figure, probs, audio_length_samples])
|
59 |
|
60 |
with gr.Row():
|
61 |
threshold = gr.Number(label="Threshold", value=0.6, minimum=0.0, maximum=1.0)
|
62 |
+
min_speech_duration_ms = gr.Number(label="Mininmum Speech Duration (s)", value=10.5)
|
63 |
+
min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (s)", value=5.5)
|
|
|
64 |
speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
|
65 |
button2 = gr.Button("Compute Speech Timestamps")
|
66 |
output_file = gr.File()
|
vad_utils.py
CHANGED
@@ -9,7 +9,6 @@ from matplotlib import pyplot as plt
|
|
9 |
|
10 |
def get_speech_probs(audio: torch.Tensor,
|
11 |
# model,
|
12 |
-
threshold: float = 0.5,
|
13 |
sampling_rate: int = 16000,
|
14 |
window_size_samples: int = 512,
|
15 |
progress_tracking_callback: Callable[[float], None] = None):
|
|
|
9 |
|
10 |
def get_speech_probs(audio: torch.Tensor,
|
11 |
# model,
|
|
|
12 |
sampling_rate: int = 16000,
|
13 |
window_size_samples: int = 512,
|
14 |
progress_tracking_callback: Callable[[float], None] = None):
|