update
- data/call_monitor/id-ID/noise/000ad44a-fbad-4a22-ba5a-c6dc855779b2_id-ID_1672040947119.wav +3 -0
- data/call_monitor/id-ID/noise/000da369-6652-4601-b241-33ffbd52a224_id-ID_1676000326981.wav +3 -0
- data/call_monitor/id-ID/noise/00a0a2a3-14ff-4a84-8aee-b18b2fb65355_id-ID_1680237229413.wav +3 -0
- data/call_monitor/id-ID/noise_mute/000d7fba-80ce-4bd7-84fe-e9c43de30f4a_id-ID_1678495379262.wav +3 -0
- data/call_monitor/id-ID/voice/000a3f9a-b2bf-46fd-9c69-477fc62cda51_id-ID_1671935534167 - 副本.wav +3 -0
- data/call_monitor/id-ID/voice/000a3f9a-b2bf-46fd-9c69-477fc62cda51_id-ID_1671935534167.wav +3 -0
- data/call_monitor/id-ID/voice/000cb369-a0ee-44aa-a213-18b036f1baf7_id-ID_1678762306513.wav +3 -0
- data/call_monitor/id-ID/voicemail/000b03b3-172e-4784-8510-24cf37e205ba_id-ID_1672193551438.wav +3 -0
- data/call_monitor/id-ID/voicemail/00a20d31-e1cb-4c70-821b-6fd151b260ae_id-ID_1671762897272.wav +3 -0
- data/early_media/62/33009996287818451333.wav +3 -0
- data/early_media/62/3300999628999191096.wav +3 -0
- main.py +38 -23
- ring_vad_examples.json +24 -4
- toolbox/torch/__init__.py +6 -0
- toolbox/torch/utils/__init__.py +6 -0
- toolbox/torch/utils/data/__init__.py +6 -0
- toolbox/torch/utils/data/vocabulary.py +211 -0
- toolbox/torch/utils/utils.py +26 -0
- toolbox/vad/vad.py +114 -61
- trained_models/cnn_voicemail_common_20231130/cnn_voicemail.pth +3 -0
- trained_models/cnn_voicemail_common_20231130/labels.json +10 -0
data/call_monitor/id-ID/noise/000ad44a-fbad-4a22-ba5a-c6dc855779b2_id-ID_1672040947119.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7deca6895788f2fe7f7d2324dffabc39581ee6edfa4c6619d458790a2ca79b65
+size 32044
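The `.wav` payloads in this commit are stored through Git LFS, so what actually lands in the repository is the three-line pointer shown in each diff: the spec version, the SHA-256 of the real file, and its byte size. A minimal sketch (not part of this commit) of reading those fields back out of a pointer:

```python
# Sketch only: split a Git LFS pointer into its "version", "oid" and
# "size" fields. The pointer text is copied from the diff above.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:7deca6895788f2fe7f7d2324dffabc39581ee6edfa4c6619d458790a2ca79b65\n"
    "size 32044\n"
)
print(parse_lfs_pointer(pointer))
```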
data/call_monitor/id-ID/noise/000da369-6652-4601-b241-33ffbd52a224_id-ID_1676000326981.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0678b0d7a759bdefe33725b2f224661beecce0e9fda52998d3535acab9e1c6e8
+size 32044
data/call_monitor/id-ID/noise/00a0a2a3-14ff-4a84-8aee-b18b2fb65355_id-ID_1680237229413.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cae813da4e2628586537cb41f0db6dc18f2021725a0a4827e1f7794dad727381
+size 32044
data/call_monitor/id-ID/noise_mute/000d7fba-80ce-4bd7-84fe-e9c43de30f4a_id-ID_1678495379262.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc09aeec1b37f75c65df3e83065ff54d5c2390b69847d174f65d3cb69f95da52
+size 32044
data/call_monitor/id-ID/voice/000a3f9a-b2bf-46fd-9c69-477fc62cda51_id-ID_1671935534167 - 副本.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf9e6ef0ee87be308c8a59a1459836dc9229c83be37c5e7204586c385d8d7a84
+size 32044
data/call_monitor/id-ID/voice/000a3f9a-b2bf-46fd-9c69-477fc62cda51_id-ID_1671935534167.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf9e6ef0ee87be308c8a59a1459836dc9229c83be37c5e7204586c385d8d7a84
+size 32044
data/call_monitor/id-ID/voice/000cb369-a0ee-44aa-a213-18b036f1baf7_id-ID_1678762306513.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f18b06b287ed16faf1bb231b5758b127562e90633e17f4ca931c48a0373b6b5
+size 32044
data/call_monitor/id-ID/voicemail/000b03b3-172e-4784-8510-24cf37e205ba_id-ID_1672193551438.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50f33d1c4b76ebeb028041d465a6f75965f0c6a584f19c38da4bfb104d0b3e26
+size 32044
data/call_monitor/id-ID/voicemail/00a20d31-e1cb-4c70-821b-6fd151b260ae_id-ID_1671762897272.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64c8496fa1fc98d40b145d5f4a1e07d2b1bf742348549d2518cb52a36130be05
+size 32044
data/early_media/62/33009996287818451333.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a68356976cde2101182663b90c2272be5730b73f341f1ef7aa76f2716dae7637
+size 155884
data/early_media/62/3300999628999191096.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12a24a8927ac75d5bf0549c7ac4f0fe9339b73b4a035b3953a375676761c71c3
+size 186604
main.py
CHANGED
@@ -15,7 +15,7 @@ from PIL import Image
 
 from project_settings import project_path, temp_directory
 from toolbox.webrtcvad.vad import WebRTCVad
-from toolbox.vad.vad import Vad, WebRTCVoiceClassifier, SileroVoiceClassifier
+from toolbox.vad.vad import Vad, WebRTCVoiceClassifier, SileroVoiceClassifier, CallVoiceClassifier, process_speech_probs
 
 
 def get_args():
@@ -35,9 +35,10 @@ vad: Vad = None
 def click_ring_vad_button(audio: Tuple[int, np.ndarray],
                           model_name: str,
                           agg: int = 3,
-
-
-
+                          frame_length_ms: int = 30,
+                          frame_step_ms: int = 30,
+                          padding_length_ms: int = 300,
+                          max_silence_length_ms: int = 300,
                           start_ring_rate: float = 0.9,
                           end_ring_rate: float = 0.1,
                           ):
@@ -47,22 +48,24 @@ def click_ring_vad_button(audio: Tuple[int, np.ndarray],
         return None, "please upload audio."
     sample_rate, signal = audio
 
-    if model_name == "webrtcvad" and
+    if model_name == "webrtcvad" and frame_length_ms not in (10, 20, 30):
         return None, "only 10, 20, 30 available for `frame_duration_ms`."
 
     if model_name == "webrtcvad":
         model = WebRTCVoiceClassifier(agg=agg)
     elif model_name == "silerovad":
-        model = SileroVoiceClassifier(
+        model = SileroVoiceClassifier(model_path=(project_path / "pretrained_models/silero_vad/silero_vad.jit").as_posix())
+    elif model_name == "call_voice":
+        model = CallVoiceClassifier(model_path=(project_path / "trained_models/cnn_voicemail_common_20231130").as_posix())
     else:
        return None, "`model_name` not valid."
 
     vad = Vad(model=model,
              start_ring_rate=start_ring_rate,
              end_ring_rate=end_ring_rate,
-
-
-
+              frame_length_ms=frame_length_ms,
+              padding_length_ms=padding_length_ms,
+              max_silence_length_ms=max_silence_length_ms,
              sample_rate=sample_rate,
              )
@@ -75,12 +78,21 @@ def click_ring_vad_button(audio: Tuple[int, np.ndarray],
     except Exception as e:
         return None, str(e)
 
+    # speech_probs
+    speech_probs = process_speech_probs(
+        signal=signal,
+        speech_probs=vad.speech_probs,
+        frame_step=vad.frame_step,
+    )
+
     time = np.arange(0, len(signal)) / sample_rate
     plt.figure(figsize=(12, 5))
-    plt.plot(time, signal / 32768, color=
+    plt.plot(time, signal / 32768, color="b")
+    plt.plot(time, speech_probs * 2, color="gray")
+
     for start, end in vad_segments:
-        plt.axvline(x=start, ymin=0.
-        plt.axvline(x=end, ymin=0.
+        plt.axvline(x=start, ymin=0.15, ymax=0.85, color="g", linestyle="--", label="start endpoint")
+        plt.axvline(x=end, ymin=0.15, ymax=0.85, color="r", linestyle="--", label="end endpoint")
 
     temp_image_file = temp_directory / "temp.jpg"
     plt.savefig(temp_image_file)
@@ -116,19 +128,20 @@ def main():
         ring_wav = gr.Audio(label="wav")
 
         with gr.Row():
-            ring_model_name = gr.Dropdown(choices=["webrtcvad", "silerovad"], value="webrtcvad", label="model_name")
+            ring_model_name = gr.Dropdown(choices=["webrtcvad", "silerovad", "call_voice"], value="webrtcvad", label="model_name")
+            ring_agg = gr.Dropdown(choices=[1, 2, 3], value=3, label="agg")
 
         with gr.Row():
-
-
+            ring_frame_length_ms = gr.Slider(minimum=0, maximum=1000, value=30, label="frame_length_ms")
+            ring_frame_step_ms = gr.Slider(minimum=0, maximum=100, value=30, label="frame_step_ms")
 
         with gr.Row():
-
-
+            ring_padding_length_ms = gr.Slider(minimum=0, maximum=1000, value=300, label="padding_length_ms")
+            ring_max_silence_length_ms = gr.Slider(minimum=0, maximum=1000, value=300, step=0.1, label="max_silence_length_ms")
 
         with gr.Row():
-            ring_start_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.9, step=0.
-            ring_end_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.
+            ring_start_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.9, step=0.05, label="start_ring_rate")
+            ring_end_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="end_ring_rate")
 
         ring_button = gr.Button("retrieval", variant="primary")
@@ -140,8 +153,9 @@ def main():
             examples=ring_vad_examples,
             inputs=[
                 ring_wav,
-                ring_model_name, ring_agg,
-
+                ring_model_name, ring_agg,
+                ring_frame_length_ms, ring_frame_step_ms,
+                ring_padding_length_ms, ring_max_silence_length_ms,
                 ring_start_ring_rate, ring_end_ring_rate
             ],
             outputs=[ring_image, ring_end_points],
@@ -153,8 +167,9 @@ def main():
             click_ring_vad_button,
             inputs=[
                 ring_wav,
-                ring_model_name, ring_agg,
-
+                ring_model_name, ring_agg,
+                ring_frame_length_ms, ring_frame_step_ms,
+                ring_padding_length_ms, ring_max_silence_length_ms,
                 ring_start_ring_rate, ring_end_ring_rate
             ],
             outputs=[ring_image, ring_end_points],
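Outside of Gradio, the reworked handler boils down to: pick a frame classifier by `model_name`, build a `Vad` with the new frame/padding/silence parameters, run it over the signal, then expand the per-frame probabilities with `process_speech_probs` for plotting. A hedged sketch of that flow, assuming an 8 kHz int16 wav and `scipy` as the reader (neither is pinned down by this diff):

```python
# Sketch only: mirrors click_ring_vad_button without the Gradio plumbing.
from scipy.io import wavfile

from toolbox.vad.vad import Vad, WebRTCVoiceClassifier, process_speech_probs

sample_rate, signal = wavfile.read("data/early_media/3300999628164249998.wav")

model = WebRTCVoiceClassifier(agg=3)
vad = Vad(model=model,
          start_ring_rate=0.9, end_ring_rate=0.1,
          frame_length_ms=30, frame_step_ms=30,
          padding_length_ms=300, max_silence_length_ms=300,
          sample_rate=sample_rate)

vad_segments = vad.vad(signal)                     # [[start_s, end_s], ...]
speech_probs = process_speech_probs(signal=signal,
                                    speech_probs=vad.speech_probs,
                                    frame_step=vad.frame_step)
```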
ring_vad_examples.json
CHANGED
@@ -1,18 +1,38 @@
 [
     [
         "data/early_media/3300999628164249998.wav",
-        "webrtcvad", 3, 30, 300,
+        "webrtcvad", 3, 30, 300, 300, 300, 0.9, 0.1
     ],
     [
         "data/early_media/3300999628164852605.wav",
-        "webrtcvad", 3, 30, 300,
+        "webrtcvad", 3, 30, 300, 300, 300, 0.9, 0.1
     ],
     [
         "data/early_media/3300999628164249998.wav",
-        "silerovad", 3, 35, 350,
+        "silerovad", 3, 35, 350, 350, 350, 0.7, 0.3
     ],
     [
         "data/early_media/3300999628164852605.wav",
-        "silerovad", 3, 35, 350,
+        "silerovad", 3, 35, 350, 350, 350, 0.5, 0.5
+    ],
+    [
+        "data/early_media/3300999628164852605.wav",
+        "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1
+    ],
+    [
+        "data/early_media/62/3300999628999191096.wav",
+        "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1
+    ],
+    [
+        "data/early_media/62/33009996287818451333.wav",
+        "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1
+    ],
+    [
+        "data/call_monitor/id-ID/noise_mute/000d7fba-80ce-4bd7-84fe-e9c43de30f4a_id-ID_1678495379262.wav",
+        "silerovad", 3, 35, 350, 350, 350, 0.7, 0.3
+    ],
+    [
+        "data/call_monitor/id-ID/noise/00a0a2a3-14ff-4a84-8aee-b18b2fb65355_id-ID_1680237229413.wav",
+        "silerovad", 3, 35, 350, 350, 350, 0.7, 0.3
     ]
 ]
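Each example row is applied positionally to the `inputs` list wired up in `main.py`, so the nine values are: wav path, `model_name`, `agg`, `frame_length_ms`, `frame_step_ms`, `padding_length_ms`, `max_silence_length_ms`, `start_ring_rate`, `end_ring_rate`. Making the mapping explicit:

```python
# The order of each JSON row matches the Gradio `inputs` list in main.py.
names = ["wav", "model_name", "agg",
         "frame_length_ms", "frame_step_ms",
         "padding_length_ms", "max_silence_length_ms",
         "start_ring_rate", "end_ring_rate"]
row = ["data/early_media/62/3300999628999191096.wav",
       "call_voice", 3, 300, 30, 300, 300, 0.2, 0.1]
print(dict(zip(names, row)))
```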
toolbox/torch/__init__.py
ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == '__main__':
+    pass
toolbox/torch/utils/__init__.py
ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == '__main__':
+    pass
toolbox/torch/utils/data/__init__.py
ADDED
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+
+if __name__ == '__main__':
+    pass
toolbox/torch/utils/data/vocabulary.py
ADDED
@@ -0,0 +1,211 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from collections import defaultdict, OrderedDict
+import os
+from typing import Any, Callable, Dict, Iterable, List, Set
+
+
+def namespace_match(pattern: str, namespace: str):
+    """
+    Matches a namespace pattern against a namespace string. For example, ``*tags`` matches
+    ``passage_tags`` and ``question_tags`` and ``tokens`` matches ``tokens`` but not
+    ``stemmed_tokens``.
+    """
+    if pattern[0] == '*' and namespace.endswith(pattern[1:]):
+        return True
+    elif pattern == namespace:
+        return True
+    return False
+
+
+class _NamespaceDependentDefaultDict(defaultdict):
+    def __init__(self,
+                 non_padded_namespaces: Set[str],
+                 padded_function: Callable[[], Any],
+                 non_padded_function: Callable[[], Any]) -> None:
+        self._non_padded_namespaces = set(non_padded_namespaces)
+        self._padded_function = padded_function
+        self._non_padded_function = non_padded_function
+        super(_NamespaceDependentDefaultDict, self).__init__()
+
+    def __missing__(self, key: str):
+        if any(namespace_match(pattern, key) for pattern in self._non_padded_namespaces):
+            value = self._non_padded_function()
+        else:
+            value = self._padded_function()
+        dict.__setitem__(self, key, value)
+        return value
+
+    def add_non_padded_namespaces(self, non_padded_namespaces: Set[str]):
+        # add non_padded_namespaces which weren't already present
+        self._non_padded_namespaces.update(non_padded_namespaces)
+
+
+class _TokenToIndexDefaultDict(_NamespaceDependentDefaultDict):
+    def __init__(self, non_padded_namespaces: Set[str], padding_token: str, oov_token: str) -> None:
+        super(_TokenToIndexDefaultDict, self).__init__(non_padded_namespaces,
+                                                       lambda: {padding_token: 0, oov_token: 1},
+                                                       lambda: {})
+
+
+class _IndexToTokenDefaultDict(_NamespaceDependentDefaultDict):
+    def __init__(self, non_padded_namespaces: Set[str], padding_token: str, oov_token: str) -> None:
+        super(_IndexToTokenDefaultDict, self).__init__(non_padded_namespaces,
+                                                       lambda: {0: padding_token, 1: oov_token},
+                                                       lambda: {})
+
+
+DEFAULT_NON_PADDED_NAMESPACES = ("*tags", "*labels")
+DEFAULT_PADDING_TOKEN = '[PAD]'
+DEFAULT_OOV_TOKEN = '[UNK]'
+NAMESPACE_PADDING_FILE = 'non_padded_namespaces.txt'
+
+
+class Vocabulary(object):
+    def __init__(self, non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES):
+        self._non_padded_namespaces = set(non_padded_namespaces)
+        self._padding_token = DEFAULT_PADDING_TOKEN
+        self._oov_token = DEFAULT_OOV_TOKEN
+        self._token_to_index = _TokenToIndexDefaultDict(self._non_padded_namespaces,
+                                                        self._padding_token,
+                                                        self._oov_token)
+        self._index_to_token = _IndexToTokenDefaultDict(self._non_padded_namespaces,
+                                                        self._padding_token,
+                                                        self._oov_token)
+
+    def add_token_to_namespace(self, token: str, namespace: str = 'tokens') -> int:
+        if token not in self._token_to_index[namespace]:
+            index = len(self._token_to_index[namespace])
+            self._token_to_index[namespace][token] = index
+            self._index_to_token[namespace][index] = token
+            return index
+        else:
+            return self._token_to_index[namespace][token]
+
+    def get_index_to_token_vocabulary(self, namespace: str = 'tokens') -> Dict[int, str]:
+        return self._index_to_token[namespace]
+
+    def get_token_to_index_vocabulary(self, namespace: str = 'tokens') -> Dict[str, int]:
+        return self._token_to_index[namespace]
+
+    def get_token_index(self, token: str, namespace: str = 'tokens') -> int:
+        if token in self._token_to_index[namespace]:
+            return self._token_to_index[namespace][token]
+        else:
+            return self._token_to_index[namespace][self._oov_token]
+
+    def get_token_from_index(self, index: int, namespace: str = 'tokens'):
+        return self._index_to_token[namespace][index]
+
+    def get_vocab_size(self, namespace: str = 'tokens') -> int:
+        return len(self._token_to_index[namespace])
+
+    def save_to_files(self, directory: str):
+        os.makedirs(directory, exist_ok=True)
+        with open(os.path.join(directory, NAMESPACE_PADDING_FILE), 'w', encoding='utf-8') as f:
+            for namespace_str in self._non_padded_namespaces:
+                f.write('{}\n'.format(namespace_str))
+
+        for namespace, token_to_index in self._token_to_index.items():
+            filename = os.path.join(directory, '{}.txt'.format(namespace))
+            with open(filename, 'w', encoding='utf-8') as f:
+                for token, _ in token_to_index.items():
+                    f.write('{}\n'.format(token))
+
+    @classmethod
+    def from_files(cls, directory: str) -> 'Vocabulary':
+        with open(os.path.join(directory, NAMESPACE_PADDING_FILE), 'r', encoding='utf-8') as f:
+            non_padded_namespaces = [namespace_str.strip() for namespace_str in f]
+
+        vocab = cls(non_padded_namespaces=non_padded_namespaces)
+
+        for namespace_filename in os.listdir(directory):
+            if namespace_filename == NAMESPACE_PADDING_FILE:
+                continue
+            if namespace_filename.startswith("."):
+                continue
+            namespace = namespace_filename.replace('.txt', '')
+            if any(namespace_match(pattern, namespace) for pattern in non_padded_namespaces):
+                is_padded = False
+            else:
+                is_padded = True
+            filename = os.path.join(directory, namespace_filename)
+            vocab.set_from_file(filename, is_padded, namespace=namespace)
+
+        return vocab
+
+    def set_from_file(self,
+                      filename: str,
+                      is_padded: bool = True,
+                      oov_token: str = DEFAULT_OOV_TOKEN,
+                      namespace: str = "tokens"
+                      ):
+        if is_padded:
+            self._token_to_index[namespace] = {self._padding_token: 0}
+            self._index_to_token[namespace] = {0: self._padding_token}
+        else:
+            self._token_to_index[namespace] = {}
+            self._index_to_token[namespace] = {}
+
+        with open(filename, 'r', encoding='utf-8') as f:
+            index = 1 if is_padded else 0
+            for row in f:
+                token = str(row).strip()
+                if token == oov_token:
+                    token = self._oov_token
+                self._token_to_index[namespace][token] = index
+                self._index_to_token[namespace][index] = token
+                index += 1
+
+    def convert_tokens_to_ids(self, tokens: List[str], namespace: str = "tokens"):
+        result = list()
+        for token in tokens:
+            idx = self._token_to_index[namespace].get(token)
+            if idx is None:
+                idx = self._token_to_index[namespace][self._oov_token]
+            result.append(idx)
+        return result
+
+    def convert_ids_to_tokens(self, ids: List[int], namespace: str = "tokens"):
+        result = list()
+        for idx in ids:
+            idx = self._index_to_token[namespace][idx]
+            result.append(idx)
+        return result
+
+    def pad_or_truncate_ids_by_max_length(self, ids: List[int], max_length: int, namespace: str = "tokens"):
+        pad_idx = self._token_to_index[namespace][self._padding_token]
+
+        length = len(ids)
+        if length > max_length:
+            result = ids[:max_length]
+        else:
+            result = ids + [pad_idx] * (max_length - length)
+        return result
+
+
+def demo1():
+    import jieba
+
+    vocabulary = Vocabulary()
+    vocabulary.add_token_to_namespace('白天', 'tokens')
+    vocabulary.add_token_to_namespace('晚上', 'tokens')
+
+    text = '不是在白天, 就是在晚上'
+    tokens = jieba.lcut(text)
+
+    print(tokens)
+
+    ids = vocabulary.convert_tokens_to_ids(tokens)
+    print(ids)
+
+    padded_idx = vocabulary.pad_or_truncate_ids_by_max_length(ids, 10)
+    print(padded_idx)
+
+    tokens = vocabulary.convert_ids_to_tokens(padded_idx)
+    print(tokens)
+    return
+
+
+if __name__ == '__main__':
+    demo1()
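A short usage sketch for the new `Vocabulary` (the directory name is illustrative): `labels` matches the `*labels` non-padded pattern, so it gets no `[PAD]`/`[UNK]` entries and indices start at 0, and `save_to_files`/`from_files` round-trip it through plain text files:

```python
from toolbox.torch.utils.data.vocabulary import Vocabulary

vocab = Vocabulary()
for token in ["voice", "voicemail", "noise"]:
    vocab.add_token_to_namespace(token, namespace="labels")

vocab.save_to_files("temp/vocabulary")   # writes labels.txt + non_padded_namespaces.txt
vocab2 = Vocabulary.from_files("temp/vocabulary")

print(vocab2.get_token_index("voice", namespace="labels"))                    # 0
print(vocab2.convert_tokens_to_ids(["noise", "voice"], namespace="labels"))   # [2, 0]
```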
toolbox/torch/utils/utils.py
ADDED
@@ -0,0 +1,26 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+from typing import Dict
+
+import torch
+
+
+def get_text_field_mask(text_field_tensors: Dict[str, torch.Tensor],
+                        num_wrapping_dims: int = 0) -> torch.LongTensor:
+
+    tensor_dims = [(tensor.dim(), tensor) for tensor in text_field_tensors.values()]
+    tensor_dims.sort(key=lambda x: x[0])
+
+    smallest_dim = tensor_dims[0][0] - num_wrapping_dims
+    if smallest_dim == 2:
+        token_tensor = tensor_dims[0][1]
+        return (token_tensor != 0).long()
+    elif smallest_dim == 3:
+        character_tensor = tensor_dims[0][1]
+        return ((character_tensor > 0).long().sum(dim=-1) > 0).long()
+    else:
+        raise ValueError("Expected a tensor with dimension 2 or 3, found {}".format(smallest_dim))
+
+
+if __name__ == '__main__':
+    pass
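`get_text_field_mask` takes the dict of token-id tensors produced for a text field and returns 1 where a position holds a real token and 0 where it holds padding (index 0). A minimal check:

```python
import torch

from toolbox.torch.utils.utils import get_text_field_mask

text_field_tensors = {
    "tokens": torch.tensor([[5, 9, 2, 0, 0],
                            [7, 3, 0, 0, 0]]),  # 0 is the padding index
}
print(get_text_field_mask(text_field_tensors))
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 0, 0, 0]])
```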
toolbox/vad/vad.py
CHANGED
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 import argparse
 import collections
+import os
 from typing import List
 
 import matplotlib.pyplot as plt
@@ -11,6 +12,7 @@ import torch
 import webrtcvad
 
 from project_settings import project_path
+from toolbox.torch.utils.data.vocabulary import Vocabulary
 
 
 class FrameVoiceClassifier(object):
@@ -39,12 +41,12 @@ class WebRTCVoiceClassifier(FrameVoiceClassifier):
 
 class SileroVoiceClassifier(FrameVoiceClassifier):
     def __init__(self,
-
+                 model_path: str,
                  sample_rate: int = 8000):
-        self.
+        self.model_path = model_path
         self.sample_rate = sample_rate
 
-        with open(self.
+        with open(self.model_path, "rb") as f:
             model = torch.jit.load(f, map_location="cpu")
         self.model = model
         self.model.reset_states()
@@ -61,11 +63,39 @@ class SileroVoiceClassifier(FrameVoiceClassifier):
         return float(speech_prob)
 
 
+class CallVoiceClassifier(FrameVoiceClassifier):
+    def __init__(self,
+                 model_path: str,
+                 sample_rate: int = 8000):
+        self.model_path = model_path
+        self.sample_rate = sample_rate
+
+        self.model = torch.jit.load(os.path.join(model_path, "cnn_voicemail.pth"))
+
+    def predict(self, chunk: np.ndarray) -> float:
+        if chunk.dtype != np.int16:
+            raise AssertionError("signal dtype should be np.int16, instead of {}".format(chunk.dtype))
+
+        chunk = chunk / 32768
+
+        inputs = torch.tensor(chunk, dtype=torch.float32)
+        inputs = torch.unsqueeze(inputs, dim=0)
+
+        try:
+            outputs = self.model(inputs)
+        except RuntimeError as e:
+            print(inputs.shape)
+            raise e
+
+        probs = outputs["probs"]
+        voice_prob = probs[0][2]
+        return float(voice_prob)
+
+
 class Frame(object):
-    def __init__(self, signal: np.ndarray,
+    def __init__(self, signal: np.ndarray, timestamp_s: float):
         self.signal = signal
-        self.
-        self.duration = duration
+        self.timestamp_s = timestamp_s
 
 
 class Vad(object):
@@ -73,26 +103,28 @@ class Vad(object):
                  model: FrameVoiceClassifier,
                  start_ring_rate: float = 0.5,
                  end_ring_rate: float = 0.5,
-
-
-
+                 frame_length_ms: int = 30,
+                 frame_step_ms: int = 30,
+                 padding_length_ms: int = 300,
+                 max_silence_length_ms: int = 300,
                  sample_rate: int = 8000
                  ):
         self.model = model
         self.start_ring_rate = start_ring_rate
         self.end_ring_rate = end_ring_rate
-        self.
-        self.
-        self.
+        self.frame_length_ms = frame_length_ms
+        self.padding_length_ms = padding_length_ms
+        self.max_silence_length_ms = max_silence_length_ms
         self.sample_rate = sample_rate
 
         # frames
-        self.frame_length = int(sample_rate * (
-        self.
-        self.
+        self.frame_length = int(sample_rate * (frame_length_ms / 1000.0))
+        self.frame_step = int(sample_rate * (frame_step_ms / 1000.0))
+        self.frame_timestamp_s = 0.0
+        self.signal_cache = np.zeros(shape=(self.frame_length,), dtype=np.int16)
 
         # segments
-        self.num_padding_frames = int(
+        self.num_padding_frames = int(padding_length_ms / frame_step_ms)
         self.ring_buffer = collections.deque(maxlen=self.num_padding_frames)
         self.triggered = False
         self.voiced_frames: List[Frame] = list()
@@ -100,21 +132,23 @@ class Vad(object):
 
         # vad segments
         self.is_first_segment = True
-        self.
-        self.
+        self.timestamp_start_s = 0.0
+        self.timestamp_end_s = 0.0
+
+        # speech probs
+        self.speech_probs: List[float] = list()
 
     def signal_to_frames(self, signal: np.ndarray):
         frames = list()
 
         l = len(signal)
 
-
+        duration_s = float(self.frame_step) / self.sample_rate
 
-        for offset in range(0, l, self.
+        for offset in range(0, l - self.frame_length + 1, self.frame_step):
             sub_signal = signal[offset:offset+self.frame_length]
-
-
-            self.frame_timestamp += duration
+            frame = Frame(sub_signal, self.frame_timestamp_s)
+            self.frame_timestamp_s += duration_s
 
             frames.append(frame)
         return frames
@@ -124,7 +158,8 @@ class Vad(object):
         if self.signal_cache is not None:
             signal = np.concatenate([self.signal_cache, signal])
 
-        rest
+        # rest
+        rest = (len(signal) - self.frame_length) % self.frame_step
 
         if rest == 0:
             self.signal_cache = None
@@ -138,6 +173,7 @@ class Vad(object):
 
         for frame in frames:
             speech_prob = self.model.predict(frame.signal)
+            self.speech_probs.append(speech_prob)
 
             if not self.triggered:
                 self.ring_buffer.append((frame, speech_prob))
@@ -158,8 +194,8 @@ class Vad(object):
                     self.triggered = False
                     segment = [
                         np.concatenate([f.signal for f in self.voiced_frames]),
-                        self.voiced_frames[0].
-                        self.voiced_frames[-1].
+                        self.voiced_frames[0].timestamp_s,
+                        self.voiced_frames[-1].timestamp_s,
                     ]
                     yield segment
                     self.ring_buffer.clear()
@@ -173,21 +209,21 @@ class Vad(object):
             end = round(segment[2], 4)
 
             if self.is_first_segment:
-                self.
-                self.
+                self.timestamp_start_s = start
+                self.timestamp_end_s = end
                 self.is_first_segment = False
                 continue
 
-            if self.
-
-                if
-                    vad_segment = [self.
+            if self.timestamp_start_s:
+                silence_length_ms = (start - self.timestamp_end_s) * 1000
+                if silence_length_ms > self.max_silence_length_ms:
+                    vad_segment = [self.timestamp_start_s, self.timestamp_end_s]
                     yield vad_segment
 
-                    self.
-                    self.
+                    self.timestamp_start_s = start
+                    self.timestamp_end_s = end
                 else:
-                    self.
+                    self.timestamp_end_s = end
 
     def vad(self, signal: np.ndarray) -> List[list]:
         segments = self.segments_generator(signal)
@@ -202,8 +238,8 @@ class Vad(object):
         else:
             segment = [
                 np.concatenate([f.signal for f in self.voiced_frames]),
-                self.voiced_frames[0].
-                self.voiced_frames[-1].
+                self.voiced_frames[0].timestamp_s,
+                self.voiced_frames[-1].timestamp_s
             ]
            segments = [segment]
@@ -211,17 +247,33 @@ class Vad(object):
         vad_segments = self.vad_segments_generator(segments)
         vad_segments = list(vad_segments)
 
-
+        if self.timestamp_start_s > 1e-5 or self.timestamp_end_s > 1e-5:
+            vad_segments = vad_segments + [[self.timestamp_start_s, self.timestamp_end_s]]
         return vad_segments
 
 
-def
+def process_speech_probs(signal: np.ndarray, speech_probs: List[float], frame_step: int) -> np.ndarray:
+    speech_probs_ = list()
+    for p in speech_probs[1:]:
+        speech_probs_.extend([p] * frame_step)
+
+    pad = (signal.shape[0] - len(speech_probs_))
+    speech_probs_ = speech_probs_ + [0.0] * pad
+    speech_probs_ = np.array(speech_probs_, dtype=np.float32)
+
+    if len(speech_probs_) != len(signal):
+        raise AssertionError
+    return speech_probs_
+
+
+def make_visualization(signal: np.ndarray, speech_probs, sample_rate: int, vad_segments: list):
     time = np.arange(0, len(signal)) / sample_rate
     plt.figure(figsize=(12, 5))
     plt.plot(time, signal / 32768, color='b')
+    plt.plot(time, speech_probs, color='gray')
     for start, end in vad_segments:
-        plt.axvline(x=start, ymin=0.
-        plt.axvline(x=end, ymin=0.
+        plt.axvline(x=start, ymin=0.15, ymax=0.85, color="g", linestyle="--", label="start endpoint")
+        plt.axvline(x=end, ymin=0.15, ymax=0.85, color="r", linestyle="--", label="end endpoint")
 
     plt.show()
     return
@@ -231,25 +283,14 @@ def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--wav_file",
-        default=(project_path / "data/early_media/
+        default=(project_path / "data/early_media/62/3300999628999191096.wav").as_posix(),
         type=str,
     )
     parser.add_argument(
-        "--
+        "--model_path",
         default=(project_path / "pretrained_models/silero_vad/silero_vad.jit").as_posix(),
         type=str,
     )
-    parser.add_argument(
-        "--frame_duration_ms",
-        default=30,
-        type=int,
-    )
-    parser.add_argument(
-        "--silence_duration_threshold",
-        default=0.3,
-        type=float,
-        help="minimum silence duration, in seconds."
-    )
     args = parser.parse_args()
     return args
@@ -264,15 +305,17 @@ def main():
     if SAMPLE_RATE != sample_rate:
         raise AssertionError
 
-    # model = SileroVoiceClassifier(
-    model = WebRTCVoiceClassifier(agg=1, sample_rate=SAMPLE_RATE)
+    # model = SileroVoiceClassifier(model_path=args.model_path, sample_rate=SAMPLE_RATE)
+    # model = WebRTCVoiceClassifier(agg=1, sample_rate=SAMPLE_RATE)
+    model = CallVoiceClassifier(model_path=(project_path / "trained_models/cnn_voicemail_common_20231130").as_posix())
 
     vad = Vad(model=model,
-              start_ring_rate=0.
+              start_ring_rate=0.2,
               end_ring_rate=0.1,
-
-
-
+              frame_length_ms=300,
+              frame_step_ms=30,
+              padding_length_ms=300,
+              max_silence_length_ms=300,
              sample_rate=SAMPLE_RATE,
              )
     print(vad)
@@ -290,8 +333,18 @@ def main():
     for segment in segments:
         print(segment)
 
+    print(vad.speech_probs)
+    print(len(vad.speech_probs))
+
+    # speech_probs
+    speech_probs = process_speech_probs(
+        signal=signal,
+        speech_probs=vad.speech_probs,
+        frame_step=vad.frame_step,
+    )
+
     # plot
-    make_visualization(signal, SAMPLE_RATE, vad_segments)
+    make_visualization(signal, speech_probs, SAMPLE_RATE, vad_segments)
     return
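The new `process_speech_probs` turns the per-frame probabilities collected in `Vad.speech_probs` into a per-sample curve: every probability after the first (which it skips) is repeated `frame_step` times, and the tail is zero-padded up to the signal length. A tiny numeric check:

```python
import numpy as np

from toolbox.vad.vad import process_speech_probs

signal = np.zeros(10, dtype=np.int16)
probs = [0.1, 0.9, 0.4]        # the first entry is dropped by the function
print(process_speech_probs(signal=signal, speech_probs=probs, frame_step=4))
# [0.9 0.9 0.9 0.9 0.4 0.4 0.4 0.4 0.  0. ]
```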
trained_models/cnn_voicemail_common_20231130/cnn_voicemail.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f27b715f5c240b56c60bc80c9325bbe0ee1a80311b2e51a8f6e531985f8d8e61
+size 155558
trained_models/cnn_voicemail_common_20231130/labels.json
ADDED
@@ -0,0 +1,10 @@
+[
+    "white_noise",
+    "voicemail",
+    "voice",
+    "noise",
+    "bell",
+    "mute",
+    "noise_mute",
+    "music"
+]
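Note how this label list lines up with the new `CallVoiceClassifier`: `predict` reads `probs[0][2]`, and index 2 here is `"voice"`. A quick cross-check, run from the repository root:

```python
import json

# Index 2 of labels.json is the "voice" class that
# CallVoiceClassifier.predict reads out of probs[0][2].
with open("trained_models/cnn_voicemail_common_20231130/labels.json", "r", encoding="utf-8") as f:
    labels = json.load(f)

print(labels.index("voice"))  # 2
print(labels[2])              # voice
```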