Upload 2 files
- VAD/vad_handler.py +64 -0
- VAD/vad_iterator.py +100 -0
VAD/vad_handler.py
ADDED
@@ -0,0 +1,64 @@
+from VAD.vad_iterator import VADIterator
+from baseHandler import BaseHandler
+import numpy as np
+import torch
+from rich.console import Console
+
+from utils.utils import int2float
+
+import logging
+
+logging.basicConfig(
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+console = Console()
+
+
+class VADHandler(BaseHandler):
+    """
+    Handles voice activity detection. When voice activity is detected, audio is accumulated
+    until the end of speech is detected, and the full utterance is then passed to the next stage.
+    """
+
+    def setup(
+        self,
+        should_listen,
+        thresh=0.3,
+        sample_rate=16000,
+        min_silence_ms=1000,
+        min_speech_ms=500,
+        max_speech_ms=float("inf"),
+        speech_pad_ms=30,
+    ):
+        self.should_listen = should_listen
+        self.sample_rate = sample_rate
+        self.min_silence_ms = min_silence_ms
+        self.min_speech_ms = min_speech_ms
+        self.max_speech_ms = max_speech_ms
+        self.model, _ = torch.hub.load("snakers4/silero-vad", "silero_vad")
+        self.iterator = VADIterator(
+            self.model,
+            threshold=thresh,
+            sampling_rate=sample_rate,
+            min_silence_duration_ms=min_silence_ms,
+            speech_pad_ms=speech_pad_ms,
+        )
+
+    def process(self, audio_chunk):
+        # audio_chunk arrives as raw int16 PCM bytes; convert to float32 for the model.
+        audio_int16 = np.frombuffer(audio_chunk, dtype=np.int16)
+        audio_float32 = int2float(audio_int16)
+        vad_output = self.iterator(torch.from_numpy(audio_float32))
+        if vad_output is not None and len(vad_output) != 0:
+            logger.debug("VAD: end of speech detected")
+            array = torch.cat(vad_output).cpu().numpy()
+            duration_ms = len(array) / self.sample_rate * 1000
+            if duration_ms < self.min_speech_ms or duration_ms > self.max_speech_ms:
+                logger.debug(
+                    f"audio input of duration: {len(array) / self.sample_rate}s, skipping"
+                )
+            else:
+                self.should_listen.clear()
+                logger.debug("Stop listening")
+                yield array
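For reference, int2float is imported from utils.utils, which is not part of this diff. By convention (it matches the helper used in the silero-vad examples) it rescales int16 PCM into the float32 range the model expects; a minimal sketch of such a helper, assuming this repo follows that convention:

import numpy as np

def int2float(sound: np.ndarray) -> np.ndarray:
    # Rescale int16 PCM ([-32768, 32767]) to float32 in roughly [-1, 1].
    abs_max = np.abs(sound).max()
    sound = sound.astype("float32")
    if abs_max > 0:
        sound *= 1 / 32768
    return sound.squeeze()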
VAD/vad_iterator.py
ADDED
@@ -0,0 +1,100 @@
+import torch
+
+
+class VADIterator:
+    def __init__(
+        self,
+        model,
+        threshold: float = 0.5,
+        sampling_rate: int = 16000,
+        min_silence_duration_ms: int = 100,
+        speech_pad_ms: int = 30,
+    ):
+        """
+        Mainly taken from https://github.com/snakers4/silero-vad
+        Class for stream simulation
+
+        Parameters
+        ----------
+        model: preloaded .jit/.onnx silero VAD model
+
+        threshold: float (default - 0.5)
+            Speech threshold. Silero VAD outputs speech probabilities for each audio chunk; probabilities ABOVE this value are considered SPEECH.
+            It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
+
+        sampling_rate: int (default - 16000)
+            Currently silero VAD models support 8000 and 16000 Hz sample rates.
+
+        min_silence_duration_ms: int (default - 100 milliseconds)
+            At the end of each speech chunk, wait for min_silence_duration_ms before separating it.
+
+        speech_pad_ms: int (default - 30 milliseconds)
+            Final speech chunks are padded by speech_pad_ms on each side.
+        """
+
+        self.model = model
+        self.threshold = threshold
+        self.sampling_rate = sampling_rate
+        self.is_speaking = False
+        self.buffer = []
+
+        if sampling_rate not in [8000, 16000]:
+            raise ValueError(
+                "VADIterator does not support sampling rates other than [8000, 16000]"
+            )
+
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        self.reset_states()
+
+    def reset_states(self):
+        self.model.reset_states()
+        self.triggered = False
+        self.temp_end = 0
+        self.current_sample = 0
+
+    @torch.no_grad()
+    def __call__(self, x):
+        """
+        x: torch.Tensor
+            audio chunk (see examples in repo)
+
+        Returns None while listening; once the end of an utterance is
+        detected, returns the buffered list of speech chunks.
+        """
+
+        if not torch.is_tensor(x):
+            try:
+                x = torch.Tensor(x)
+            except Exception:
+                raise TypeError("Audio cannot be cast to a tensor. Cast it manually")
+
+        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+        self.current_sample += window_size_samples
+
+        speech_prob = self.model(x, self.sampling_rate).item()
+
+        if (speech_prob >= self.threshold) and self.temp_end:
+            self.temp_end = 0
+
+        if (speech_prob >= self.threshold) and not self.triggered:
+            self.triggered = True
+            return None
+
+        # Hysteresis: speech only ends once the probability falls 0.15 below the trigger threshold.
+        if (speech_prob < self.threshold - 0.15) and self.triggered:
+            if not self.temp_end:
+                self.temp_end = self.current_sample
+            if self.current_sample - self.temp_end < self.min_silence_samples:
+                return None
+            else:
+                # end of speech
+                self.temp_end = 0
+                self.triggered = False
+                spoken_utterance = self.buffer
+                self.buffer = []
+                return spoken_utterance
+
+        if self.triggered:
+            self.buffer.append(x)
+
+        return None
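To try VADIterator on its own, outside VADHandler, a minimal sketch (assuming the same silero-vad torch.hub model used in vad_handler.py; the 512-sample chunk size matches what recent silero-vad releases expect at 16 kHz, and the random input is purely illustrative):

import torch
from VAD.vad_iterator import VADIterator

model, _ = torch.hub.load("snakers4/silero-vad", "silero_vad")
vad = VADIterator(model, threshold=0.5, sampling_rate=16000)

stream = torch.randn(16000 * 2) * 0.01  # 2 s of quiet noise at 16 kHz
for start in range(0, len(stream) - 511, 512):
    out = vad(stream[start : start + 512])
    # None means "still listening"; a non-empty list of chunks is a finished utterance.
    if out:
        print(f"utterance of {len(torch.cat(out))} samples")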