mizoru committed on
Commit
0412962
1 Parent(s): 0c8ffff

almost working

Browse files
Files changed (2) hide show
  1. app.py +14 -13
  2. vad_utils.py +15 -2
app.py CHANGED
@@ -1,18 +1,19 @@
1
  import gradio as gr
2
  import numpy as np
3
- from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps
 
4
 
5
-
6
- def process_audio(audio_input, model):
7
- wav = np.array(audio_input)
8
- probs = get_speech_probs(wav, model, sampling_rate=16_000)
9
  return make_visualization(probs, 512 / 16_000)
10
 
11
  def process_parameters(probs, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
12
  return probs2speech_timestamps(probs, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms)
13
 
14
- def main():
15
- model = None #load_your_model() # replace with your model loading code
 
16
 
17
  with gr.Blocks() as demo:
18
  with gr.Row():
@@ -20,15 +21,15 @@ def main():
20
  button1 = gr.Button("Process Audio")
21
  figure = gr.Image()
22
 
23
- button1.click(process_audio, inputs=[audio_input, model], outputs=figure)
24
 
25
  with gr.Row():
26
  probs = gr.State(None)
27
- threshold = gr.Number(label="Threshold", default=0.5, minimum=0.0, maximum=1.0)
28
- min_speech_duration_ms = gr.Number(label="Min Speech Duration (ms)", default=250)
29
- min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)", default=100)
30
- window_size_samples = gr.Dropdown(label="Window Size Samples", choices=[512, 1024, 1536], default=1536)
31
- speech_pad_ms = gr.Number(label="Speech Pad (ms)", default=30)
32
  button2 = gr.Button("Process Parameters")
33
  output_text = gr.Textbox()
34
 
 
1
  import gradio as gr
2
  import numpy as np
3
+ from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps, read_audio
4
+ import torch
5
 
6
+ def process_audio(audio_input):
7
+ wav = read_audio(audio_input, sampling_rate=16_000)
8
+ probs = get_speech_probs(wav, sampling_rate=16_000)
 
9
  return make_visualization(probs, 512 / 16_000)
10
 
11
  def process_parameters(probs, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
12
  return probs2speech_timestamps(probs, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms)
13
 
14
+ def main():
15
+
16
+
17
 
18
  with gr.Blocks() as demo:
19
  with gr.Row():
 
21
  button1 = gr.Button("Process Audio")
22
  figure = gr.Image()
23
 
24
+ button1.click(process_audio, inputs=[audio_input], outputs=figure)
25
 
26
  with gr.Row():
27
  probs = gr.State(None)
28
+ threshold = gr.Number(label="Threshold", value=0.5, minimum=0.0, maximum=1.0)
29
+ min_speech_duration_ms = gr.Number(label="Min Speech Duration (ms)", value=250)
30
+ min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)", value=100)
31
+ window_size_samples = gr.Dropdown(label="Window Size Samples", choices=[512, 1024, 1536], value=1536)
32
+ speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
33
  button2 = gr.Button("Process Parameters")
34
  output_text = gr.Textbox()
35
 
vad_utils.py CHANGED
@@ -6,7 +6,7 @@ import torch.nn.functional as F
6
  import warnings
7
 
8
  def get_speech_probs(audio: torch.Tensor,
9
- model,
10
  threshold: float = 0.5,
11
  sampling_rate: int = 16000,
12
  window_size_samples: int = 512,
@@ -163,4 +163,17 @@ def make_visualization(probs, step):
163
  xlabel='seconds',
164
  ylabel='speech probability',
165
  colormap='tab20')
166
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import warnings
7
 
8
  def get_speech_probs(audio: torch.Tensor,
9
+ # model,
10
  threshold: float = 0.5,
11
  sampling_rate: int = 16000,
12
  window_size_samples: int = 512,
 
163
  xlabel='seconds',
164
  ylabel='speech probability',
165
  colormap='tab20')
166
+
167
+ torch.set_num_threads(1)
168
+
169
+
170
+ USE_ONNX = True # set to False to load the non-ONNX (PyTorch JIT) model instead
171
+
172
+
173
+ model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
174
+ model='silero_vad',
175
+ force_reload=True,
176
+ onnx=USE_ONNX)
177
+ (_,
178
+ _, read_audio,
179
+ *_) = utils