mizoru committed
Commit 50a5992
1 Parent(s): a00d114

fully fledged

Files changed (3)
  1. app.py +25 -9
  2. requirements.txt +1 -1
  3. vad_utils.py +8 -4
app.py CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
 import numpy as np
 from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps, read_audio
 import torch
+import pandas as pd
+import gdown
 
 probs = None
 audio_length_samples = None
@@ -14,26 +16,37 @@ def process_audio(audio_input):
     return make_visualization(probs, 512 / 16_000)
 
 def process_parameters(threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
-    print(probs)
     timestamps = probs2speech_timestamps(probs, audio_length_samples,
                                          threshold=threshold,
                                          min_speech_duration_ms=min_speech_duration_ms,
                                          min_silence_duration_ms=min_silence_duration_ms,
                                          window_size_samples=window_size_samples,
                                          speech_pad_ms=speech_pad_ms)
-    print(timestamps)
-    return timestamps
+    # Save the timestamps as a tab-separated label file and also return them as a table
+    df = pd.DataFrame(timestamps)
+    df["note"] = ""
+    df.to_csv("timestamps.txt", sep='\t', header=False, index=False)
+    return "timestamps.txt", df
 
-def main():
-
+def download_gdrive(id):
+    output_file = "audio.wav"  # the downloaded audio is saved under this name
+
+    gdown.download(f"https://drive.google.com/uc?id={id}", output_file)
+    return output_file
 
+def main():
 
     with gr.Blocks() as demo:
+        with gr.Row():
+            gdrive_str = gr.Text("File ID")
+            download_button = gr.Button("Download Audio")
+
         with gr.Row():
             audio_input = gr.Audio(type="filepath")
-            button1 = gr.Button("Compute Probabilities")
+            button1 = gr.Button("Compute Speech Probabilities")
             figure = gr.Plot()
 
+        download_button.click(download_gdrive, inputs=[gdrive_str], outputs=audio_input)
+
         button1.click(process_audio, inputs=[audio_input], outputs=figure)
 
         with gr.Row():
@@ -42,10 +55,13 @@ def main():
             min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)", value=100)
             window_size_samples = gr.Dropdown(label="Window Size Samples", choices=[512, 1024, 1536], value=1536)
             speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
-            button2 = gr.Button("Process Parameters")
-            output_text = gr.Textbox()
+            button2 = gr.Button("Compute Speech Timestamps")
+            output_file = gr.File()
+        with gr.Row():
+            output_df = gr.DataFrame()
 
-        button2.click(process_parameters, inputs=[threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms], outputs=output_text)
+        button2.click(process_parameters, inputs=[threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms],
+                      outputs=[output_file, output_df])
 
     demo.launch()
 
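For reference, the reworked process_parameters now produces both a downloadable file and a table: it builds a DataFrame from the detected timestamps, adds an empty note column, and writes everything to timestamps.txt as tab-separated rows with no header. Below is a minimal sketch (not part of the commit) of reading that file back, assuming probs2speech_timestamps returns Silero-VAD-style dicts with start and end sample indices, which this diff does not show directly.

import pandas as pd

# Read back the label file written by process_parameters().
# Assumed column layout: start sample, end sample, empty note column.
segments = pd.read_csv("timestamps.txt", sep="\t", header=None,
                       names=["start", "end", "note"])

# app.py works at 16 kHz, so sample indices convert to seconds like this.
segments["start_s"] = segments["start"] / 16_000
segments["end_s"] = segments["end"] / 16_000
print(segments[["start_s", "end_s"]].head())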
requirements.txt CHANGED
@@ -1,4 +1,4 @@
 torchaudio
 onnxruntime
 gradio
-pandas
+gdown
vad_utils.py CHANGED
@@ -4,6 +4,8 @@ import torchaudio
 from typing import Callable, List
 import torch.nn.functional as F
 import warnings
+import pandas as pd
+from matplotlib import pyplot as plt
 
 def get_speech_probs(audio: torch.Tensor,
                      # model,
@@ -156,13 +158,15 @@ def probs2speech_timestamps(speech_probs, audio_length_samples,
     return speeches
 
 def make_visualization(probs, step):
-    import pandas as pd
-    return pd.DataFrame({'probs': probs},
-                        index=[x * step for x in range(len(probs))]).plot(figsize=(16, 8),
+    fig, ax = plt.subplots(figsize=(16, 8))
+
+    pd.DataFrame({'probs': probs},
+                 index=[x * step for x in range(len(probs))]).plot(ax=ax,
                  kind='area', ylim=[0, 1.05], xlim=[0, len(probs) * step],
                  xlabel='seconds',
                  ylabel='speech probability',
                  colormap='tab20')
+    return fig
 
 torch.set_num_threads(1)
 
@@ -172,7 +176,7 @@ USE_ONNX = True # change this to True if you want to test onnx model
 
 model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                               model='silero_vad',
-                              force_reload=True,
+                              # force_reload=True,
                               onnx=USE_ONNX)
 (_,
  _, read_audio,
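Since vad_utils.py loads the Silero VAD model through torch.hub at import time, its helpers can also be exercised outside the Gradio interface. The following is a minimal sketch (not part of the commit) mirroring the calls app.py makes; it assumes get_speech_probs accepts just the waveform (its remaining parameters are not visible in this diff), that read_audio is the Silero utility that resamples to 16 kHz, and the timestamp parameters are placeholder values apart from the defaults visible in the app.

from vad_utils import (get_speech_probs, make_visualization,
                       probs2speech_timestamps, read_audio)

wav = read_audio("audio.wav", sampling_rate=16_000)   # mono waveform at 16 kHz
probs = get_speech_probs(wav)                         # per-window speech probabilities

# Same step app.py uses for plotting: one probability per 512 samples at 16 kHz.
fig = make_visualization(probs, 512 / 16_000)
fig.savefig("speech_probs.png")

# threshold and min_speech_duration_ms are assumed values; 100 ms silence and
# 30 ms padding match the defaults shown in the app's widgets.
timestamps = probs2speech_timestamps(probs, len(wav),
                                     threshold=0.5,
                                     min_speech_duration_ms=250,
                                     min_silence_duration_ms=100,
                                     window_size_samples=512,
                                     speech_pad_ms=30)
print(timestamps)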