qgyd2021 committed on
Commit
fdbda89
1 Parent(s): acb6654
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.xlsx filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.xlsx filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,15 @@
+
+ .git/
+ .idea/
+
+ data/
+ pretrained_models/
+ temp/
+
+ **/cache/
+ **/__pycache__/
+
+ **/*.env
+ **/*.mp3
+ **/*.png
+ **/*.xlsx
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM python:3.8
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --upgrade pip
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+
+ # Switch to the "user" user
+ USER user
+
+ RUN apt-get install -y git
+
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ CMD ["python", "main.py"]
README.md CHANGED
@@ -1,13 +1,10 @@
  ---
  title: Voice Activity Detection
- emoji: 🐢
- colorFrom: yellow
- colorTo: indigo
- sdk: gradio
- sdk_version: 4.16.0
- app_file: app.py
+ emoji: 🌍
+ colorFrom: purple
+ colorTo: gray
+ sdk: docker
  pinned: false
- license: apache-2.0
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
examples/webrtcvad/vad.py ADDED
@@ -0,0 +1,173 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import argparse
+ import collections
+ import contextlib
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from scipy.io import wavfile
+ import wave
+ import webrtcvad
+
+ from project_settings import project_path
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--wav_file",
+         default=(project_path / "data/3300999628164249998.wav").as_posix(),
+         type=str,
+     )
+     parser.add_argument(
+         "--agg",
+         default=3,
+         type=int,
+         help="The level of aggressiveness of the VAD: [0-3]"
+     )
+     parser.add_argument(
+         "--frame_duration_ms",
+         default=30,
+         type=int,
+     )
+     parser.add_argument(
+         "--silence_duration_threshold",
+         default=0.3,
+         type=float,
+         help="minimum silence duration, in seconds."
+     )
+     args = parser.parse_args()
+     return args
+
+
+ def read_wave(path):
+     with contextlib.closing(wave.open(path, 'rb')) as wf:
+         num_channels = wf.getnchannels()
+         assert num_channels == 1
+         sample_width = wf.getsampwidth()
+         assert sample_width == 2
+         sample_rate = wf.getframerate()
+         assert sample_rate in (8000, 16000, 32000, 48000)
+         pcm_data = wf.readframes(wf.getnframes())
+         return pcm_data, sample_rate
+
+
+ class Frame(object):
+     def __init__(self, audio_bytes, timestamp, duration):
+         self.audio_bytes = audio_bytes
+         self.timestamp = timestamp
+         self.duration = duration
+
+
+ def frame_generator(frame_duration_ms, audio, sample_rate):
+     n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
+     offset = 0
+     timestamp = 0.0
+     duration = (float(n) / sample_rate) / 2.0
+     while offset + n < len(audio):
+         yield Frame(audio[offset:offset + n], timestamp, duration)
+         timestamp += duration
+         offset += n
+
+
+ def vad_collector(sample_rate, frame_duration_ms,
+                   padding_duration_ms, vad, frames):
+
+     num_padding_frames = int(padding_duration_ms / frame_duration_ms)
+     ring_buffer = collections.deque(maxlen=num_padding_frames)
+     triggered = False
+
+     voiced_frames = []
+     for frame in frames:
+         is_speech = vad.is_speech(frame.audio_bytes, sample_rate)
+
+         if not triggered:
+             ring_buffer.append((frame, is_speech))
+             num_voiced = len([f for f, speech in ring_buffer if speech])
+
+             if num_voiced > 0.9 * ring_buffer.maxlen:
+                 triggered = True
+
+                 for f, _ in ring_buffer:
+                     voiced_frames.append(f)
+                 ring_buffer.clear()
+         else:
+             voiced_frames.append(frame)
+             ring_buffer.append((frame, is_speech))
+             num_unvoiced = len([f for f, speech in ring_buffer if not speech])
+             if num_unvoiced > 0.9 * ring_buffer.maxlen:
+                 triggered = False
+                 yield [b''.join([f.audio_bytes for f in voiced_frames]),
+                        voiced_frames[0].timestamp, voiced_frames[-1].timestamp]
+                 ring_buffer.clear()
+                 voiced_frames = []
+
+     if voiced_frames:
+         yield [b''.join([f.audio_bytes for f in voiced_frames]),
+                voiced_frames[0].timestamp, voiced_frames[-1].timestamp]
+
+
+ def main():
+     args = get_args()
+
+     vad = webrtcvad.Vad(mode=args.agg)
+
+     audio_pcm_data, sample_rate = read_wave(args.wav_file)
+     _, audio_data = wavfile.read(args.wav_file)
+     # audio_data_ = bytes(audio_data)
+
+     frames = frame_generator(
+         frame_duration_ms=args.frame_duration_ms,
+         audio=audio_pcm_data, sample_rate=sample_rate
+     )
+     frames = list(frames)
+
+     segments = vad_collector(sample_rate, args.frame_duration_ms, 300, vad, frames)
+     segments = list(segments)
+
+     vad_segments = list()
+     timestamp_start = 0.0
+     timestamp_end = 0.0
+
+     last_i = len(segments) - 1
+     for i, segment in enumerate(segments):
+         start = round(segment[1], 4)
+         end = round(segment[2], 4)
+
+         flag_first = i == 0
+         flag_last = i == last_i
+         if flag_first:
+             timestamp_start = start
+             timestamp_end = end
+             continue
+
+         if timestamp_start:
+             sil_duration = start - timestamp_end
+             if sil_duration > args.silence_duration_threshold:
+                 vad_segments.append([timestamp_start, timestamp_end])
+                 timestamp_start = start
+                 timestamp_end = end
+                 if flag_last:
+                     vad_segments.append([timestamp_start, timestamp_end])
+             else:
+                 timestamp_end = end
+
+     print(vad_segments)
+
+     time = np.arange(0, len(audio_data)) / sample_rate
+
+     plt.figure(figsize=(12, 5))
+
+     plt.plot(time, audio_data / 32768, color='b')
+
+     for start, end in vad_segments:
+         plt.axvline(x=start, ymin=0.25, ymax=0.75, color='g', linestyle='--', label='开始端点')  # mark the start endpoint
+         plt.axvline(x=end, ymin=0.25, ymax=0.75, color='r', linestyle='--', label='结束端点')  # mark the end endpoint
+
+     plt.show()
+     return
+
+
+ if __name__ == '__main__':
+     main()
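
The "* 2" and "/ 2.0" factors in frame_generator above come from 16-bit (2-byte) mono PCM, which read_wave() enforces through its sample-width assertion. A minimal sketch of that arithmetic (illustrative only, not part of the commit), using the 8000 Hz / 30 ms case:

    # bytes per frame and frame duration for 16-bit mono PCM (assumed example values)
    sample_rate = 8000          # Hz, one of the rates read_wave() accepts
    frame_duration_ms = 30      # ms, the --frame_duration_ms default
    bytes_per_sample = 2        # sample_width == 2 is asserted in read_wave()

    n = int(sample_rate * (frame_duration_ms / 1000.0) * bytes_per_sample)
    duration = (float(n) / sample_rate) / bytes_per_sample

    print(n)         # 480 bytes of PCM per frame
    print(duration)  # 0.03 seconds per frame
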
main.py ADDED
@@ -0,0 +1,135 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import argparse
+ import json
+ import platform
+ from typing import Tuple
+
+ import gradio as gr
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from PIL import Image
+
+ from project_settings import project_path, temp_directory
+ from toolbox.webrtcvad.vad import WebRTCVad
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--webrtcvad_examples_file",
+         default=(project_path / "webrtcvad_examples.json").as_posix(),
+         type=str
+     )
+     args = parser.parse_args()
+     return args
+
+
+ webrtcvad: WebRTCVad = None
+
+
+ def click_webrtcvad_button(audio: Tuple[int, np.ndarray],
+                            agg: int = 3,
+                            frame_duration_ms: int = 30,
+                            padding_duration_ms: int = 300,
+                            silence_duration_threshold: float = 0.3,
+                            ):
+     global webrtcvad
+
+     sample_rate, signal = audio
+
+     webrtcvad = WebRTCVad(agg=int(agg),
+                           frame_duration_ms=frame_duration_ms,
+                           padding_duration_ms=padding_duration_ms,
+                           silence_duration_threshold=silence_duration_threshold,
+                           sample_rate=sample_rate,
+                           )
+
+     vad_segments = list()
+     segments = webrtcvad.vad(signal)
+     vad_segments += segments
+     segments = webrtcvad.last_vad_segments()
+     vad_segments += segments
+
+     time = np.arange(0, len(signal)) / sample_rate
+     plt.figure(figsize=(12, 5))
+     plt.plot(time, signal / 32768, color='b')
+     for start, end in vad_segments:
+         plt.axvline(x=start, ymin=0.25, ymax=0.75, color='g', linestyle='--', label='开始端点')  # mark the start endpoint
+         plt.axvline(x=end, ymin=0.25, ymax=0.75, color='r', linestyle='--', label='结束端点')  # mark the end endpoint
+
+     temp_image_file = temp_directory / "temp.jpg"
+     plt.savefig(temp_image_file)
+     image = Image.open(open(temp_image_file, "rb"))
+
+     return image, vad_segments
+
+
+ def main():
+     args = get_args()
+
+     brief_description = """
+ ## Voice Activity Detection
+
+ """
+
+     # examples
+     with open(args.webrtcvad_examples_file, "r", encoding="utf-8") as f:
+         webrtcvad_examples = json.load(f)
+
+     # ui
+     with gr.Blocks() as blocks:
+         gr.Markdown(value=brief_description)
+
+         with gr.Row():
+             with gr.Column(scale=5):
+                 with gr.Tabs():
+                     with gr.TabItem("webrtcvad"):
+                         gr.Markdown(value="")
+
+                         with gr.Row():
+                             with gr.Column(scale=1):
+                                 webrtcvad_wav = gr.Audio(label="wav")
+
+                                 with gr.Row():
+                                     webrtcvad_agg = gr.Dropdown(choices=[1, 2, 3], value=3, label="agg")
+                                     webrtcvad_frame_duration_ms = gr.Slider(minimum=0, maximum=100, value=30, label="frame_duration_ms")
+
+                                 with gr.Row():
+                                     webrtcvad_padding_duration_ms = gr.Slider(minimum=0, maximum=1000, value=300, label="padding_duration_ms")
+                                     webrtcvad_silence_duration_threshold = gr.Slider(minimum=0, maximum=1.0, value=0.3, step=0.1, label="silence_duration_threshold")
+
+                                 webrtcvad_button = gr.Button("retrieval", variant="primary")
+
+                             with gr.Column(scale=1):
+                                 webrtcvad_image = gr.Image(label="image", height=300, width=720, show_label=False)
+                                 webrtcvad_end_points = gr.TextArea(label="end_points", max_lines=35)
+
+                         gr.Examples(
+                             examples=webrtcvad_examples,
+                             inputs=[
+                                 webrtcvad_wav, webrtcvad_agg, webrtcvad_frame_duration_ms,
+                                 webrtcvad_padding_duration_ms, webrtcvad_silence_duration_threshold
+                             ],
+                             outputs=[webrtcvad_image, webrtcvad_end_points],
+                             fn=click_webrtcvad_button
+                         )
+
+         # click event
+         webrtcvad_button.click(
+             click_webrtcvad_button,
+             inputs=[
+                 webrtcvad_wav, webrtcvad_agg, webrtcvad_frame_duration_ms,
+                 webrtcvad_padding_duration_ms, webrtcvad_silence_duration_threshold
+             ],
+             outputs=[webrtcvad_image, webrtcvad_end_points],
+         )
+
+     blocks.queue().launch(
+         share=False if platform.system() == "Windows" else False
+     )
+     return
+
+
+ if __name__ == "__main__":
+     main()
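
click_webrtcvad_button above receives the (sample_rate, ndarray) tuple that gr.Audio produces and returns a PIL image plus the list of [start, end] segments in seconds. A rough sketch of calling it outside Gradio with a synthetic int16 signal (the tone parameters below are made up for illustration and are not part of the commit):

    import numpy as np

    from main import click_webrtcvad_button

    # one second of silence followed by one second of a 440 Hz tone, int16, 8000 Hz (assumed values)
    sample_rate = 8000
    t = np.arange(sample_rate) / sample_rate
    silence = np.zeros(sample_rate, dtype=np.int16)
    tone = (0.5 * 32767 * np.sin(2 * np.pi * 440 * t)).astype(np.int16)
    signal = np.concatenate([silence, tone])

    image, vad_segments = click_webrtcvad_button((sample_rate, signal))
    print(vad_segments)  # [start, end] pairs; may be empty if nothing is judged to be speech
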
project_settings.py ADDED
@@ -0,0 +1,16 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import os
+ from pathlib import Path
+
+
+ project_path = os.path.abspath(os.path.dirname(__file__))
+ project_path = Path(project_path)
+
+
+ temp_directory = project_path / "temp"
+ temp_directory.mkdir(exist_ok=True)
+
+
+ if __name__ == '__main__':
+     pass
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio==4.1.2
+ webrtcvad==2.0.10
+ wave==0.0.2
+ matplotlib==3.7.4
+ scipy==1.10.1
+ pillow==10.2.0
toolbox/__init__.py ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+
+
+ if __name__ == '__main__':
+     pass
toolbox/webrtcvad/__init__.py ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+
+
+ if __name__ == '__main__':
+     pass
toolbox/webrtcvad/vad.py ADDED
@@ -0,0 +1,233 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import argparse
+ import collections
+ from typing import List
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from scipy.io import wavfile
+ import webrtcvad
+
+ from project_settings import project_path
+
+
+ class Frame(object):
+     def __init__(self, signal: np.ndarray, timestamp, duration):
+         self.signal = signal
+         self.timestamp = timestamp
+         self.duration = duration
+
+
+ class WebRTCVad(object):
+     def __init__(self,
+                  agg: int = 3,
+                  frame_duration_ms: int = 30,
+                  padding_duration_ms: int = 300,
+                  silence_duration_threshold: float = 0.3,
+                  sample_rate: int = 8000
+                  ):
+         self.agg = agg
+         self.frame_duration_ms = frame_duration_ms
+         self.padding_duration_ms = padding_duration_ms
+         self.silence_duration_threshold = silence_duration_threshold
+         self.sample_rate = sample_rate
+
+         self._vad = webrtcvad.Vad(mode=agg)
+
+         # frames
+         self.frame_length = int(sample_rate * (frame_duration_ms / 1000.0))
+         self.frame_timestamp = 0.0
+         self.signal_cache = None
+
+         # segments
+         self.num_padding_frames = int(padding_duration_ms / frame_duration_ms)
+         self.ring_buffer = collections.deque(maxlen=self.num_padding_frames)
+         self.triggered = False
+         self.voiced_frames: List[Frame] = list()
+         self.segments = list()
+
+         # vad segments
+         self.is_first_segment = True
+         self.timestamp_start = 0.0
+         self.timestamp_end = 0.0
+
+     def signal_to_frames(self, signal: np.ndarray):
+         frames = list()
+
+         l = len(signal)
+
+         duration = (float(self.frame_length) / self.sample_rate)
+
+         for offset in range(0, l, self.frame_length):
+             sub_signal = signal[offset:offset+self.frame_length]
+
+             frame = Frame(sub_signal, self.frame_timestamp, duration)
+             self.frame_timestamp += duration
+
+             frames.append(frame)
+         return frames
+
+     def segments_generator(self, signal: np.ndarray):
+         # signal rounding
+         if self.signal_cache is not None:
+             signal = np.concatenate([self.signal_cache, signal])
+
+         rest = len(signal) % self.frame_length
+
+         if rest == 0:
+             self.signal_cache = None
+             signal_ = signal
+         else:
+             self.signal_cache = signal[-rest:]
+             signal_ = signal[:-rest]
+
+         # frames
+         frames = self.signal_to_frames(signal_)
+
+         for frame in frames:
+             audio_bytes = bytes(frame.signal)
+             is_speech = self._vad.is_speech(audio_bytes, self.sample_rate)
+
+             if not self.triggered:
+                 self.ring_buffer.append((frame, is_speech))
+                 num_voiced = len([f for f, speech in self.ring_buffer if speech])
+
+                 if num_voiced > 0.9 * self.ring_buffer.maxlen:
+                     self.triggered = True
+
+                     for f, _ in self.ring_buffer:
+                         self.voiced_frames.append(f)
+                     self.ring_buffer.clear()
+             else:
+                 self.voiced_frames.append(frame)
+                 self.ring_buffer.append((frame, is_speech))
+                 num_unvoiced = len([f for f, speech in self.ring_buffer if not speech])
+                 if num_unvoiced > 0.9 * self.ring_buffer.maxlen:
+                     self.triggered = False
+                     segment = [
+                         np.concatenate([f.signal for f in self.voiced_frames]),
+                         self.voiced_frames[0].timestamp,
+                         self.voiced_frames[-1].timestamp
+                     ]
+                     yield segment
+                     self.ring_buffer.clear()
+                     self.voiced_frames = []
+
+     def vad_segments_generator(self, segments_generator):
+         segments = list(segments_generator)
+
+         for i, segment in enumerate(segments):
+             start = round(segment[1], 4)
+             end = round(segment[2], 4)
+
+             if self.is_first_segment:
+                 self.timestamp_start = start
+                 self.timestamp_end = end
+                 self.is_first_segment = False
+                 continue
+
+             if self.timestamp_start:
+                 sil_duration = start - self.timestamp_end
+                 if sil_duration > self.silence_duration_threshold:
+                     vad_segment = [self.timestamp_start, self.timestamp_end]
+                     yield vad_segment
+
+                     self.timestamp_start = start
+                     self.timestamp_end = end
+                 else:
+                     self.timestamp_end = end
+
+     def vad(self, signal: np.ndarray) -> List[list]:
+         segments = self.segments_generator(signal)
+         vad_segments = self.vad_segments_generator(segments)
+         vad_segments = list(vad_segments)
+         return vad_segments
+
+     def last_vad_segments(self) -> List[list]:
+         # last segments
+         if len(self.voiced_frames) == 0:
+             segments = []
+         else:
+             segment = [
+                 np.concatenate([f.signal for f in self.voiced_frames]),
+                 self.voiced_frames[0].timestamp,
+                 self.voiced_frames[-1].timestamp
+             ]
+             segments = [segment]
+
+         # last vad segments
+         vad_segments = self.vad_segments_generator(segments)
+         vad_segments = list(vad_segments)
+
+         vad_segments = vad_segments + [[self.timestamp_start, self.timestamp_end]]
+         return vad_segments
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--wav_file",
+         default=(project_path / "data/3300999628164249998.wav").as_posix(),
+         type=str,
+     )
+     parser.add_argument(
+         "--agg",
+         default=3,
+         type=int,
+         help="The level of aggressiveness of the VAD: [0-3]"
+     )
+     parser.add_argument(
+         "--frame_duration_ms",
+         default=30,
+         type=int,
+     )
+     parser.add_argument(
+         "--silence_duration_threshold",
+         default=0.3,
+         type=float,
+         help="minimum silence duration, in seconds."
+     )
+     args = parser.parse_args()
+     return args
+
+
+ SAMPLE_RATE = 8000
+
+
+ def main():
+     args = get_args()
+
+     w_vad = WebRTCVad(sample_rate=SAMPLE_RATE)
+
+     sample_rate, signal = wavfile.read(args.wav_file)
+     if SAMPLE_RATE != sample_rate:
+         raise AssertionError
+
+     vad_segments = list()
+
+     segments = w_vad.vad(signal)
+     vad_segments += segments
+     for segment in segments:
+         print(segment)
+
+     # last vad segment
+     segments = w_vad.last_vad_segments()
+     vad_segments += segments
+     for segment in segments:
+         print(segment)
+
+     # plot
+     time = np.arange(0, len(signal)) / sample_rate
+     plt.figure(figsize=(12, 5))
+     plt.plot(time, signal / 32768, color='b')
+     for start, end in vad_segments:
+         plt.axvline(x=start, ymin=0.25, ymax=0.75, color='g', linestyle='--', label='开始端点')  # mark the start endpoint
+         plt.axvline(x=end, ymin=0.25, ymax=0.75, color='r', linestyle='--', label='结束端点')  # mark the end endpoint
+
+     plt.show()
+     return
+
+
+ if __name__ == '__main__':
+     main()
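
WebRTCVad above is written so that vad() can be fed successive chunks of one stream (samples that do not fill a whole frame are kept in signal_cache) and last_vad_segments() flushes whatever is still buffered. A minimal streaming sketch, assuming an 8000 Hz int16 wav at the path used by the get_args() default (the 4000-sample chunk size is arbitrary and not part of the commit):

    from scipy.io import wavfile

    from toolbox.webrtcvad.vad import WebRTCVad

    sample_rate, signal = wavfile.read("data/3300999628164249998.wav")
    w_vad = WebRTCVad(sample_rate=sample_rate)

    vad_segments = []
    chunk_size = 4000  # samples per chunk, chosen arbitrarily for the sketch
    for offset in range(0, len(signal), chunk_size):
        chunk = signal[offset:offset + chunk_size]
        vad_segments += w_vad.vad(chunk)       # segments closed so far
    vad_segments += w_vad.last_vad_segments()  # flush the buffered tail

    for start, end in vad_segments:
        print(round(start, 4), round(end, 4))
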
webrtcvad_examples.json ADDED
@@ -0,0 +1,8 @@
+ [
+     [
+         "data/early_media/3300999628164249998.wav"
+     ],
+     [
+         "data/early_media/3300999628164852605.wav"
+     ]
+ ]