mizoru committed on
Commit
3b8e519
1 Parent(s): 9117637

Everything working fine, all needed info

Browse files
Files changed (2) hide show
  1. app.py +12 -9
  2. vad_utils.py +0 -1
app.py CHANGED
@@ -5,14 +5,15 @@ import torch
5
  import pandas as pd
6
  import gdown
7
 
8
- def process_audio(audio_input):
9
  wav = read_audio(audio_input, sampling_rate=16_000)
10
  audio_length_samples = len(wav)
11
- probs = get_speech_probs(wav, sampling_rate=16_000)
12
  return make_visualization(probs, 512 / 16_000), probs, audio_length_samples
13
 
14
  def process_parameters(probs, audio_length_samples, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
15
- print(probs, audio_length_samples)
 
16
  timestamps = probs2speech_timestamps(probs, audio_length_samples,
17
  threshold = threshold,
18
  min_speech_duration_ms = min_speech_duration_ms,
@@ -21,7 +22,7 @@ def process_parameters(probs, audio_length_samples, threshold, min_speech_durati
21
  speech_pad_ms=speech_pad_ms,
22
  return_seconds=True,
23
  rounding=3)
24
-
25
  df = pd.DataFrame(timestamps)
26
  df["note"] = ""
27
  df.to_csv("timestamps.txt", sep = '\t', header=False, index=False)
@@ -46,18 +47,20 @@ def main():
46
 
47
  with gr.Row():
48
  audio_input = gr.Audio(type="filepath")
49
- button1 = gr.Button("Compute Speech Probabilities")
 
 
 
50
  figure = gr.Plot()
51
 
52
  download_button.click(download_gdrive, inputs=[gdrive_str], outputs=audio_input)
53
 
54
- button1.click(process_audio, inputs=[audio_input], outputs=[figure, probs, audio_length_samples])
55
 
56
  with gr.Row():
57
  threshold = gr.Number(label="Threshold", value=0.6, minimum=0.0, maximum=1.0)
58
- min_speech_duration_ms = gr.Number(label="Min Speech Duration (ms)", value=10_000)
59
- min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)", value=5_000)
60
- window_size_samples = gr.Dropdown(label="Window Size Samples", choices=[512, 1024, 1536], value=1536)
61
  speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
62
  button2 = gr.Button("Compute Speech Timestamps")
63
  output_file = gr.File()
 
5
  import pandas as pd
6
  import gdown
7
 
8
+ def process_audio(audio_input, window_size_samples):
9
  wav = read_audio(audio_input, sampling_rate=16_000)
10
  audio_length_samples = len(wav)
11
+ probs = get_speech_probs(wav, window_size_samples=window_size_samples, sampling_rate=16_000)
12
  return make_visualization(probs, 512 / 16_000), probs, audio_length_samples
13
 
14
  def process_parameters(probs, audio_length_samples, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
15
+ min_speech_duration_ms *= 1000
16
+ min_silence_duration_ms *= 1000
17
  timestamps = probs2speech_timestamps(probs, audio_length_samples,
18
  threshold = threshold,
19
  min_speech_duration_ms = min_speech_duration_ms,
 
22
  speech_pad_ms=speech_pad_ms,
23
  return_seconds=True,
24
  rounding=3)
25
+ print(timestamps)
26
  df = pd.DataFrame(timestamps)
27
  df["note"] = ""
28
  df.to_csv("timestamps.txt", sep = '\t', header=False, index=False)
 
47
 
48
  with gr.Row():
49
  audio_input = gr.Audio(type="filepath")
50
+ with gr.Column():
51
+ md = gr.Markdown("[Parameter Documentation](https://github.com/snakers4/silero-vad/blob/master/utils_vad.py#L198)")
52
+ window_size_samples = gr.Dropdown(label="Window Size (samples)", choices=[512, 1024, 1536], value=512)
53
+ button1 = gr.Button("Compute Speech Probabilities")
54
  figure = gr.Plot()
55
 
56
  download_button.click(download_gdrive, inputs=[gdrive_str], outputs=audio_input)
57
 
58
+ button1.click(process_audio, inputs=[audio_input, window_size_samples], outputs=[figure, probs, audio_length_samples])
59
 
60
  with gr.Row():
61
  threshold = gr.Number(label="Threshold", value=0.6, minimum=0.0, maximum=1.0)
62
+ min_speech_duration_ms = gr.Number(label="Mininmum Speech Duration (s)", value=10.5)
63
+ min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (s)", value=5.5)
 
64
  speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
65
  button2 = gr.Button("Compute Speech Timestamps")
66
  output_file = gr.File()
vad_utils.py CHANGED
@@ -9,7 +9,6 @@ from matplotlib import pyplot as plt
9
 
10
  def get_speech_probs(audio: torch.Tensor,
11
  # model,
12
- threshold: float = 0.5,
13
  sampling_rate: int = 16000,
14
  window_size_samples: int = 512,
15
  progress_tracking_callback: Callable[[float], None] = None):
 
9
 
10
  def get_speech_probs(audio: torch.Tensor,
11
  # model,
 
12
  sampling_rate: int = 16000,
13
  window_size_samples: int = 512,
14
  progress_tracking_callback: Callable[[float], None] = None):