qgyd2021 committed on
Commit
b3f2891
·
1 Parent(s): 3a820e8
data/call_monitor/id-ID/voice/000a3f9a-b2bf-46fd-9c69-477fc62cda51_id-ID_1671935534167 - 副本.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf9e6ef0ee87be308c8a59a1459836dc9229c83be37c5e7204586c385d8d7a84
3
- size 32044
 
 
 
 
main.py CHANGED
@@ -41,6 +41,8 @@ def click_ring_vad_button(audio: Tuple[int, np.ndarray],
41
  max_silence_length_ms: int = 300,
42
  start_ring_rate: float = 0.9,
43
  end_ring_rate: float = 0.1,
 
 
44
  ):
45
  global vad
46
 
@@ -64,8 +66,11 @@ def click_ring_vad_button(audio: Tuple[int, np.ndarray],
64
  start_ring_rate=start_ring_rate,
65
  end_ring_rate=end_ring_rate,
66
  frame_length_ms=frame_length_ms,
 
67
  padding_length_ms=padding_length_ms,
68
  max_silence_length_ms=max_silence_length_ms,
 
 
69
  sample_rate=sample_rate,
70
  )
71
 
@@ -88,7 +93,7 @@ def click_ring_vad_button(audio: Tuple[int, np.ndarray],
88
  time = np.arange(0, len(signal)) / sample_rate
89
  plt.figure(figsize=(12, 5))
90
  plt.plot(time, signal / 32768, color="b")
91
- plt.plot(time, speech_probs * 2, color="gray")
92
 
93
  for start, end in vad_segments:
94
  plt.axvline(x=start, ymin=0.15, ymax=0.85, color="g", linestyle="--", label="开始端点")
@@ -143,6 +148,10 @@ def main():
143
  ring_start_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.9, step=0.05, label="start_ring_rate")
144
  ring_end_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="end_ring_rate")
145
 
 
 
 
 
146
  ring_button = gr.Button("retrieval", variant="primary")
147
 
148
  with gr.Column(scale=1):
@@ -156,7 +165,8 @@ def main():
156
  ring_model_name, ring_agg,
157
  ring_frame_length_ms, ring_frame_step_ms,
158
  ring_padding_length_ms, ring_max_silence_length_ms,
159
- ring_start_ring_rate, ring_end_ring_rate
 
160
  ],
161
  outputs=[ring_image, ring_end_points],
162
  fn=click_ring_vad_button
@@ -170,7 +180,8 @@ def main():
170
  ring_model_name, ring_agg,
171
  ring_frame_length_ms, ring_frame_step_ms,
172
  ring_padding_length_ms, ring_max_silence_length_ms,
173
- ring_start_ring_rate, ring_end_ring_rate
 
174
  ],
175
  outputs=[ring_image, ring_end_points],
176
  )
 
41
  max_silence_length_ms: int = 300,
42
  start_ring_rate: float = 0.9,
43
  end_ring_rate: float = 0.1,
44
+ max_speech_length_s: float = 2.0,
45
+ min_speech_length_s: float = 0.3,
46
  ):
47
  global vad
48
 
 
66
  start_ring_rate=start_ring_rate,
67
  end_ring_rate=end_ring_rate,
68
  frame_length_ms=frame_length_ms,
69
+ frame_step_ms=frame_step_ms,
70
  padding_length_ms=padding_length_ms,
71
  max_silence_length_ms=max_silence_length_ms,
72
+ max_speech_length_s=max_speech_length_s,
73
+ min_speech_length_s=min_speech_length_s,
74
  sample_rate=sample_rate,
75
  )
76
 
 
93
  time = np.arange(0, len(signal)) / sample_rate
94
  plt.figure(figsize=(12, 5))
95
  plt.plot(time, signal / 32768, color="b")
96
+ plt.plot(time, speech_probs, color="gray")
97
 
98
  for start, end in vad_segments:
99
  plt.axvline(x=start, ymin=0.15, ymax=0.85, color="g", linestyle="--", label="开始端点")
 
148
  ring_start_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.9, step=0.05, label="start_ring_rate")
149
  ring_end_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="end_ring_rate")
150
 
151
+ with gr.Row():
152
+ ring_max_speech_length_s = gr.Slider(minimum=0.0, maximum=10.0, value=2.0, step=0.05, label="max_speech_length_s")
153
+ ring_min_speech_length_s = gr.Slider(minimum=0.0, maximum=2.0, value=0.3, step=0.05, label="min_speech_length_s")
154
+
155
  ring_button = gr.Button("retrieval", variant="primary")
156
 
157
  with gr.Column(scale=1):
 
165
  ring_model_name, ring_agg,
166
  ring_frame_length_ms, ring_frame_step_ms,
167
  ring_padding_length_ms, ring_max_silence_length_ms,
168
+ ring_start_ring_rate, ring_end_ring_rate,
169
+ ring_max_speech_length_s, ring_min_speech_length_s
170
  ],
171
  outputs=[ring_image, ring_end_points],
172
  fn=click_ring_vad_button
 
180
  ring_model_name, ring_agg,
181
  ring_frame_length_ms, ring_frame_step_ms,
182
  ring_padding_length_ms, ring_max_silence_length_ms,
183
+ ring_start_ring_rate, ring_end_ring_rate,
184
+ ring_max_speech_length_s, ring_min_speech_length_s
185
  ],
186
  outputs=[ring_image, ring_end_points],
187
  )
ring_vad_examples.json CHANGED
@@ -1,38 +1,66 @@
1
  [
2
  [
3
  "data/early_media/3300999628164249998.wav",
4
- "webrtcvad", 3, 30, 300, 300, 300, 0.9, 0.1
5
  ],
6
  [
7
  "data/early_media/3300999628164852605.wav",
8
- "webrtcvad", 3, 30, 300, 300, 300, 0.9, 0.1
9
  ],
10
  [
11
  "data/early_media/3300999628164249998.wav",
12
- "silerovad", 3, 35, 350, 350, 350, 0.7, 0.3
13
  ],
14
  [
15
  "data/early_media/3300999628164852605.wav",
16
- "silerovad", 3, 35, 350, 350, 350, 0.5, 0.5
 
 
 
 
 
 
 
 
17
  ],
18
  [
19
  "data/early_media/3300999628164852605.wav",
20
- "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1
21
  ],
22
  [
23
  "data/early_media/62/3300999628999191096.wav",
24
- "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1
25
  ],
26
  [
27
  "data/early_media/62/33009996287818451333.wav",
28
- "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1
29
  ],
30
  [
31
- "data/call_monitor/id-ID/noise_mute/000d7fba-80ce-4bd7-84fe-e9c43de30f4a_id-ID_1678495379262.wav",
32
- "silerovad", 3, 35, 350, 350, 350, 0.7, 0.3
33
  ],
34
  [
35
- "data/call_monitor/id-ID/noise/00a0a2a3-14ff-4a84-8aee-b18b2fb65355_id-ID_1680237229413.wav",
36
- "silerovad", 3, 35, 350, 350, 350, 0.7, 0.3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  ]
38
  ]
 
1
  [
2
  [
3
  "data/early_media/3300999628164249998.wav",
4
+ "webrtcvad", 3, 30, 30, 300, 300, 0.9, 0.1, 2.0, 0.3
5
  ],
6
  [
7
  "data/early_media/3300999628164852605.wav",
8
+ "webrtcvad", 3, 30, 30, 300, 300, 0.9, 0.1, 2.0, 0.3
9
  ],
10
  [
11
  "data/early_media/3300999628164249998.wav",
12
+ "silerovad", 3, 35, 35, 350, 350, 0.7, 0.3, 2.0, 0.3
13
  ],
14
  [
15
  "data/early_media/3300999628164852605.wav",
16
+ "silerovad", 3, 35, 35, 350, 350, 0.5, 0.5, 2.0, 0.3
17
+ ],
18
+ [
19
+ "data/call_monitor/id-ID/noise_mute/000d7fba-80ce-4bd7-84fe-e9c43de30f4a_id-ID_1678495379262.wav",
20
+ "silerovad", 3, 35, 35, 350, 350, 0.7, 0.3, 2.0, 0.3
21
+ ],
22
+ [
23
+ "data/call_monitor/id-ID/noise/00a0a2a3-14ff-4a84-8aee-b18b2fb65355_id-ID_1680237229413.wav",
24
+ "silerovad", 3, 35, 35, 350, 350, 0.7, 0.3, 2.0, 0.3
25
  ],
26
  [
27
  "data/early_media/3300999628164852605.wav",
28
+ "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
29
  ],
30
  [
31
  "data/early_media/62/3300999628999191096.wav",
32
+ "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
33
  ],
34
  [
35
  "data/early_media/62/33009996287818451333.wav",
36
+ "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
37
  ],
38
  [
39
+ "data/call_monitor/id-ID/noise/00a0a2a3-14ff-4a84-8aee-b18b2fb65355_id-ID_1680237229413.wav",
40
+ "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
41
  ],
42
  [
43
+ "data/call_monitor/id-ID/noise/000ad44a-fbad-4a22-ba5a-c6dc855779b2_id-ID_1672040947119.wav",
44
+ "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
45
+ ],
46
+ [
47
+ "data/call_monitor/id-ID/noise/000da369-6652-4601-b241-33ffbd52a224_id-ID_1676000326981.wav",
48
+ "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
49
+ ],
50
+ [
51
+ "data/call_monitor/id-ID/voicemail/00a20d31-e1cb-4c70-821b-6fd151b260ae_id-ID_1671762897272.wav",
52
+ "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
53
+ ],
54
+ [
55
+ "data/call_monitor/id-ID/voicemail/000b03b3-172e-4784-8510-24cf37e205ba_id-ID_1672193551438.wav",
56
+ "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
57
+ ],
58
+ [
59
+ "data/call_monitor/id-ID/voice/000a3f9a-b2bf-46fd-9c69-477fc62cda51_id-ID_1671935534167.wav",
60
+ "call_voice", 3, 300, 30, 120, 300, 0.4, 0.1, 2.0, 0.3
61
+ ],
62
+ [
63
+ "data/call_monitor/id-ID/voice/000cb369-a0ee-44aa-a213-18b036f1baf7_id-ID_1678762306513.wav",
64
+ "call_voice", 3, 300, 30, 120, 300, 0.4, 0.1, 2.0, 0.3
65
  ]
66
  ]
toolbox/vad/vad.py CHANGED
@@ -107,6 +107,8 @@ class Vad(object):
107
  frame_step_ms: int = 30,
108
  padding_length_ms: int = 300,
109
  max_silence_length_ms: int = 300,
 
 
110
  sample_rate: int = 8000
111
  ):
112
  self.model = model
@@ -115,13 +117,16 @@ class Vad(object):
115
  self.frame_length_ms = frame_length_ms
116
  self.padding_length_ms = padding_length_ms
117
  self.max_silence_length_ms = max_silence_length_ms
 
 
118
  self.sample_rate = sample_rate
119
 
120
  # frames
121
  self.frame_length = int(sample_rate * (frame_length_ms / 1000.0))
122
  self.frame_step = int(sample_rate * (frame_step_ms / 1000.0))
123
  self.frame_timestamp_s = 0.0
124
- self.signal_cache = np.zeros(shape=(self.frame_length,), dtype=np.int16)
 
125
 
126
  # segments
127
  self.num_padding_frames = int(padding_length_ms / frame_step_ms)
@@ -184,22 +189,23 @@ class Vad(object):
184
 
185
  for f, _ in self.ring_buffer:
186
  self.voiced_frames.append(f)
187
- self.ring_buffer.clear()
188
- else:
189
- self.voiced_frames.append(frame)
190
- self.ring_buffer.append((frame, speech_prob))
191
- num_voiced = sum([p for _, p in self.ring_buffer])
192
 
193
- if num_voiced < self.end_ring_rate * self.ring_buffer.maxlen:
194
- self.triggered = False
195
- segment = [
196
- np.concatenate([f.signal for f in self.voiced_frames]),
197
- self.voiced_frames[0].timestamp_s,
198
- self.voiced_frames[-1].timestamp_s,
199
- ]
200
- yield segment
201
- self.ring_buffer.clear()
202
- self.voiced_frames = []
 
 
 
 
 
203
 
204
  def vad_segments_generator(self, segments_generator):
205
  segments = list(segments_generator)
@@ -208,22 +214,31 @@ class Vad(object):
208
  start = round(segment[1], 4)
209
  end = round(segment[2], 4)
210
 
211
- if self.is_first_segment:
212
  self.timestamp_start_s = start
213
  self.timestamp_end_s = end
214
- self.is_first_segment = False
215
  continue
216
 
217
- if self.timestamp_start_s:
218
- silence_length_s = (start - self.timestamp_end_s) * 1000
219
- if silence_length_s > self.max_silence_length_ms:
220
- vad_segment = [self.timestamp_start_s, self.timestamp_end_s]
221
- yield vad_segment
 
 
 
 
 
 
 
 
 
 
222
 
223
- self.timestamp_start_s = start
224
- self.timestamp_end_s = end
225
- else:
226
- self.timestamp_end_s = end
227
 
228
  def vad(self, signal: np.ndarray) -> List[list]:
229
  segments = self.segments_generator(signal)
 
107
  frame_step_ms: int = 30,
108
  padding_length_ms: int = 300,
109
  max_silence_length_ms: int = 300,
110
+ max_speech_length_s: float = 2.0,
111
+ min_speech_length_s: float = 0.3,
112
  sample_rate: int = 8000
113
  ):
114
  self.model = model
 
117
  self.frame_length_ms = frame_length_ms
118
  self.padding_length_ms = padding_length_ms
119
  self.max_silence_length_ms = max_silence_length_ms
120
+ self.max_speech_length_s = max_speech_length_s
121
+ self.min_speech_length_s = min_speech_length_s
122
  self.sample_rate = sample_rate
123
 
124
  # frames
125
  self.frame_length = int(sample_rate * (frame_length_ms / 1000.0))
126
  self.frame_step = int(sample_rate * (frame_step_ms / 1000.0))
127
  self.frame_timestamp_s = 0.0
128
+ # self.signal_cache = np.zeros(shape=(self.frame_length,), dtype=np.int16)
129
+ self.signal_cache = None
130
 
131
  # segments
132
  self.num_padding_frames = int(padding_length_ms / frame_step_ms)
 
189
 
190
  for f, _ in self.ring_buffer:
191
  self.voiced_frames.append(f)
192
+ continue
 
 
 
 
193
 
194
+ self.voiced_frames.append(frame)
195
+ self.ring_buffer.append((frame, speech_prob))
196
+ num_voiced = sum([p for _, p in self.ring_buffer])
197
+
198
+ if num_voiced < self.end_ring_rate * self.ring_buffer.maxlen:
199
+ segment = [
200
+ np.concatenate([f.signal for f in self.voiced_frames]),
201
+ self.voiced_frames[0].timestamp_s,
202
+ self.voiced_frames[-1].timestamp_s,
203
+ ]
204
+ yield segment
205
+ self.triggered = False
206
+ self.ring_buffer.clear()
207
+ self.voiced_frames = []
208
+ continue
209
 
210
  def vad_segments_generator(self, segments_generator):
211
  segments = list(segments_generator)
 
214
  start = round(segment[1], 4)
215
  end = round(segment[2], 4)
216
 
217
+ if self.timestamp_start_s is None and self.timestamp_end_s is None:
218
  self.timestamp_start_s = start
219
  self.timestamp_end_s = end
 
220
  continue
221
 
222
+ if self.timestamp_end_s - self.timestamp_start_s > self.max_speech_length_s:
223
+ end_ = self.timestamp_start_s + self.max_speech_length_s
224
+ vad_segment = [self.timestamp_start_s, end_]
225
+ yield vad_segment
226
+ self.timestamp_start_s = end_
227
+
228
+ silence_length_ms = (start - self.timestamp_end_s) * 1000
229
+ if silence_length_ms < self.max_silence_length_ms:
230
+ self.timestamp_end_s = end
231
+ continue
232
+
233
+ if self.timestamp_end_s - self.timestamp_start_s < self.min_speech_length_s:
234
+ self.timestamp_start_s = start
235
+ self.timestamp_end_s = end
236
+ continue
237
 
238
+ vad_segment = [self.timestamp_start_s, self.timestamp_end_s]
239
+ yield vad_segment
240
+ self.timestamp_start_s = start
241
+ self.timestamp_end_s = end
242
 
243
  def vad(self, signal: np.ndarray) -> List[list]:
244
  segments = self.segments_generator(signal)