csukuangfj
committed on
Commit
•
5ec554b
1
Parent(s):
1b239e0
add models
Browse files
- LICENSE +48 -0
- README.md +12 -0
- export-onnx.py +130 -0
- model.int8.onnx +3 -0
- model.onnx +3 -0
- run.sh +46 -0
- show-onnx.py +43 -0
- speaker-diarization-onnx.py +488 -0
- speaker-diarization-torch.py +86 -0
- vad-onnx.py +244 -0
- vad-torch.py +38 -0
LICENSE
ADDED
@@ -0,0 +1,48 @@
Rev Model Non-Production License

1. Scope and acceptance
1.1. Scope of the Agreement. This Agreement applies to any use, modification, or Distribution of any Rev Model by You, regardless of the source from which You obtained a copy of such Rev Model.
1.2. Acceptance. By accessing, using, modifying, Distributing a Rev Model, or by creating, using or distributing a Derivative of the Rev Model, You agree to be bound by this Agreement.
1.3. Acceptance on Behalf of a Third-Party. If You accept this Agreement on behalf of Your employer or another person or entity, You represent and warrant that You have the authority to act and accept this Agreement on their behalf. In such a case, the word “You” in this Agreement will refer to Your employer or such other person or entity.

2. License
2.1. Grant of Rights. Subject to Section 3 (Limitations) below, Rev hereby grants You a non-exclusive, royalty-free, worldwide, non-sublicensable, non-transferable, limited license to (a) use, copy, modify, and Distribute the Rev Model and any Derivatives, subject to the conditions provided in Section 2.2 below; and (b) use the Rev Model and any Derivatives to generate Output.
2.2. Distribution of Rev Model. Subject to Section 3 below, You may Distribute copies of the Rev Model under the following conditions: (a) You must make available a copy of this Agreement to third-party recipients of the Rev Model, it being specified that any rights to use the Rev Models shall be directly granted by Rev to said third-party recipients pursuant to the Rev Model Non-Production License agreement executed between these parties; and (b) You must retain in all copies of the Rev Models the following attribution notice within a “Notice” text file distributed as part of such copies: “Licensed by Rev under the Rev Model Non-Production License”.
2.3. Distribution of Derivatives. Subject to Section 3 below, You may Distribute any Derivatives made by or for You under additional or different terms and conditions, provided that: (a) in any event, Your use and modification of Rev Model and/or Derivatives shall remain governed by the terms and conditions of this Agreement; (b) You include in any such Derivatives made by or for You prominent notices stating that You modified the applicable Rev Model; (c) You will impose the same terms and conditions with respect to the Rev Model, Derivatives and Output that apply to You under this Agreement; and (d) any terms and conditions You impose on any third-party recipients relating to Derivatives shall neither limit such third-party recipients’ use of the Rev Model in accordance with this Rev Model Non-Production License.

3. Limitations
3.1. Misrepresentation. You must not misrepresent or imply, through any means, that the Derivatives made by or for You and/or any modified version of the Rev Model that You Distribute under your name and responsibility is an official product of Rev or has been endorsed, approved or validated by Rev, unless You are authorized by Us to do so in writing.
3.2. Usage Limitation. You shall only use the Rev Models, Derivatives (whether or not created by Rev) and Outputs for testing, research, Personal, or evaluation purposes in Non-Production Environments. Subject to the foregoing, You shall not supply the Rev Models, Derivatives, or Outputs in the course of a commercial activity, whether in return for payment or free of charge, in any medium or form, including but not limited to through a hosted or managed service (e.g. SaaS, cloud instances, etc.), or behind a software layer.
3.3. Usage Not Permitted Under this Agreement. If You want to use a Rev Model or a Derivative for any purpose that is not expressly authorized under this Agreement, You must request a license from Rev, which Rev may grant to You in Rev’s sole discretion. Please contact Rev at the following e-mail address if You want to discuss such a license: licensing@rev.com.

4. Intellectual Property
4.1. Trademarks. No trademark licenses are granted under this Agreement, and in connection with the Rev Models, You may not use any name or mark owned by or associated with Rev or any of its affiliates, except (a) as required for reasonable and customary use in describing and Distributing the Rev Models and Derivatives made by or for Rev and (b) for attribution purposes as required by this Agreement.
4.2. Outputs. We claim no ownership rights in and to the Outputs. You are solely responsible for the Outputs that You generate and their subsequent uses in accordance with this Agreement. Notwithstanding the foregoing, You agree that Your use of the Outputs is subject to the Usage Limitations set forth in Section 3.2 above.
4.3. Derivatives. By entering into this Agreement, You accept that any Derivatives that You may create or that may be created for You shall be subject to the restrictions set out in Section 3 of this Agreement.

5. Liability
5.1. Limitation of Liability. In no event, unless required by applicable law (such as the deliberate and grossly negligent acts of Rev) or agreed to in writing, shall Rev be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this Agreement or out of the use or inability to use the Rev Models and Derivatives (including but not limited to damages for loss of data, loss of goodwill, loss of expected profit or savings, work stoppage, computer failure or malfunction, or any damage caused by malware or security breaches), even if Rev has been advised of the possibility of such damages.
5.2. Indemnification. You agree to indemnify and hold harmless Rev from and against any claims, damages, or losses arising out of or related to Your use or Distribution of the Rev Models, Derivatives and Output.

6. Warranty Disclaimer. Unless required by applicable law or agreed to in writing, Rev provides the Rev Models and Derivatives on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. Rev does not represent nor warrant that the Rev Models, Derivatives or Output will be error-free, meet Your or any third party’s requirements, be secure or will allow You or any third party to achieve any kind of result or generate any kind of content. You are solely responsible for determining the appropriateness of using or Distributing the Rev Models, Derivatives and Output and assume any risks associated with Your exercise of rights under this Agreement.

7. Termination
7.1. Term. This Agreement is effective as of the date of your acceptance of this Agreement or access to the applicable Rev Models or Derivatives and will continue until terminated in accordance with the following terms.
7.2. Termination. Rev may terminate this Agreement at any time if You are in breach of this Agreement. Upon termination of this Agreement, You must cease to use all Rev Models and Derivatives and shall permanently delete any copy thereof. Sections 3, 4, 5, 6, 7 and 8 shall survive the termination of this Agreement.
7.3. Litigation. If You initiate any legal action or proceedings against Us or any other entity (including a cross-claim or counterclaim in a lawsuit), alleging that the Rev Model or a Derivative, or any part thereof, infringe upon intellectual property or other rights owned or licensable by You, then any licenses granted to You under this Agreement will immediately terminate as of the date such legal action or claim is filed or initiated.

8. General provisions
8.1. Governing Law. This Agreement will be governed by the laws of the State of Texas, without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
8.2. Forum. The United States District Court for the Western District of Texas and any appellate court therefrom shall have exclusive jurisdiction of any dispute arising out of this Agreement.
8.3. Severability. If any provision of this Agreement is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.

9. Definitions
“Agreement” means this Rev Model Non-Production License agreement governing the access, use, and Distribution of the Rev Models, Derivatives and Output.
“Derivative” means any (a) modified version of the Rev Model (including but not limited to any customized or fine-tuned version thereof), (b) work based on the Rev Model, or (c) any other derivative work thereof. For the avoidance of doubt, Outputs are not considered as Derivatives under this Agreement.
“Distribution”, “Distributing”, “Distribute” or “Distributed” means providing or making available, by any means, a copy of the Rev Models and/or the Derivatives as the case may be, subject to Section 3 of this Agreement.
“Rev”, “We” or “Us” means Rev.com, Inc.
“Rev Model” means the AI model(s), and its elements which include algorithms, software, instructed checkpoints, parameters, source code (inference code, evaluation code and, if applicable, fine-tuning code) and any other elements associated thereto made available by Rev under this Agreement, including, if any, the technical documentation, manuals and instructions for the use and operation thereof.
“Non-Production Environment” means any setting, use case, or application of the Rev Models or Derivatives that expressly excludes live, real-world conditions, commercial operations, revenue-generating activities, or direct interactions with or impacts on end users (such as, for instance, Your employees or customers). Non-Production Environment may include, but is not limited to, any setting, use case, or application for research, development, testing, quality assurance, training, internal evaluation (other than any internal usage by employees in the context of the employer’s commercial business activities), and demonstration purposes by You.
“Outputs” mean any content generated by the operation of the Rev Models or the Derivatives from a prompt (i.e., audio files) provided by users. For the avoidance of doubt, Outputs do not include any components of a Rev Models, such as any fine-tuned versions of the Rev Models, the weights, or parameters.
“Personal” means any use of a Rev Model or a Derivative that is (a) solely for personal, non-profit and non-commercial purposes and (b) not directly or indirectly connected to any commercial activities, business operations, or employment responsibilities. For illustration purposes, Personal use of a Rev Model or a Derivative does not include any usage by individuals employed in companies in the context of their daily tasks, any activity that is intended to generate revenue, or that is performed on behalf of a commercial entity.
“You” means the individual or entity entering into this Agreement with Rev.
README.md
ADDED
@@ -0,0 +1,12 @@
# Introduction

Models in this repository are converted from
https://huggingface.co/Revai/reverb-diarization-v1/tree/main

Note that they are available under a non-commercial license.

Please see ./LICENSE for details.

See also
https://www.rev.com/blog/speech-to-text-technology/introducing-reverb-open-source-asr-diarization
export-onnx.py
ADDED
@@ -0,0 +1,130 @@
#!/usr/bin/env python3

from typing import Any, Dict

import onnx
import torch
from onnxruntime.quantization import QuantType, quantize_dynamic
from pyannote.audio import Model
from pyannote.audio.core.task import Problem, Resolution


def add_meta_data(filename: str, meta_data: Dict[str, Any]):
    """Add meta data to an ONNX model. It is changed in-place.

    Args:
      filename:
        Filename of the ONNX model to be changed.
      meta_data:
        Key-value pairs.
    """
    model = onnx.load(filename)

    while len(model.metadata_props):
        model.metadata_props.pop()

    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        meta.value = str(value)

    onnx.save(model, filename)


@torch.no_grad()
def main():
    # You can download ./pytorch_model.bin from
    # https://hf-mirror.com/csukuangfj/pyannote-models/tree/main/segmentation-3.0
    # or from
    # https://huggingface.co/Revai/reverb-diarization-v1/tree/main
    pt_filename = "./pytorch_model.bin"
    model = Model.from_pretrained(pt_filename)
    model.eval()
    assert model.dimension == 7, model.dimension
    print(model.specifications)

    assert (
        model.specifications.problem == Problem.MONO_LABEL_CLASSIFICATION
    ), model.specifications.problem

    assert (
        model.specifications.resolution == Resolution.FRAME
    ), model.specifications.resolution

    assert model.specifications.duration == 10.0, model.specifications.duration

    assert model.audio.sample_rate == 16000, model.audio.sample_rate

    # (batch, num_channels, num_samples)
    assert list(model.example_input_array.shape) == [
        1,
        1,
        16000 * 10,
    ], model.example_input_array.shape

    example_output = model(model.example_input_array)

    # (batch, num_frames, num_classes)
    assert list(example_output.shape) == [1, 589, 7], example_output.shape

    assert model.receptive_field.step == 0.016875, model.receptive_field.step
    assert model.receptive_field.duration == 0.0619375, model.receptive_field.duration
    assert model.receptive_field.step * 16000 == 270, model.receptive_field.step * 16000
    assert model.receptive_field.duration * 16000 == 991, (
        model.receptive_field.duration * 16000
    )

    opset_version = 13

    filename = "model.onnx"
    torch.onnx.export(
        model,
        model.example_input_array,
        filename,
        opset_version=opset_version,
        input_names=["x"],
        output_names=["y"],
        dynamic_axes={
            "x": {0: "N", 2: "T"},
            "y": {0: "N", 1: "T"},
        },
    )

    sample_rate = model.audio.sample_rate

    window_size = int(model.specifications.duration) * 16000
    receptive_field_size = int(model.receptive_field.duration * 16000)
    receptive_field_shift = int(model.receptive_field.step * 16000)

    meta_data = {
        "num_speakers": len(model.specifications.classes),
        "powerset_max_classes": model.specifications.powerset_max_classes,
        "num_classes": model.dimension,
        "sample_rate": sample_rate,
        "window_size": window_size,
        "receptive_field_size": receptive_field_size,
        "receptive_field_shift": receptive_field_shift,
        "model_type": "pyannote-segmentation-3.0",
        "version": "1",
        "model_author": "pyannote",
        "maintainer": "k2-fsa",
        "url_1": "https://huggingface.co/pyannote/segmentation-3.0",
        "url_2": "https://huggingface.co/csukuangfj/pyannote-models/tree/main/segmentation-3.0",
        "license": "https://huggingface.co/pyannote/segmentation-3.0/blob/main/LICENSE",
    }
    add_meta_data(filename=filename, meta_data=meta_data)

    print("Generate int8 quantization models")

    filename_int8 = "model.int8.onnx"
    quantize_dynamic(
        model_input=filename,
        model_output=filename_int8,
        weight_type=QuantType.QUInt8,
    )

    print(f"Saved to {filename} and {filename_int8}")


if __name__ == "__main__":
    main()
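After export, model.onnx can be sanity-checked by loading it back with onnxruntime. The snippet below is a minimal sketch (not one of the committed files), assuming model.onnx produced by the script above is in the current directory and that onnxruntime and numpy are installed; the input/output names and expected output shape come from the asserts in the export script.

#!/usr/bin/env python3
# Minimal sketch: load model.onnx back, print the metadata written by
# add_meta_data(), and run a single 10-second chunk of silence through it.
import numpy as np
import onnxruntime as ort


def check(filename: str = "model.onnx"):
    sess = ort.InferenceSession(filename, providers=["CPUExecutionProvider"])

    # Custom metadata written by export-onnx.py (sample_rate, window_size, ...)
    print(sess.get_modelmeta().custom_metadata_map)

    # Input "x" has shape (N, num_channels, num_samples); here 10 s of 16 kHz silence.
    x = np.zeros((1, 1, 16000 * 10), dtype=np.float32)
    (y,) = sess.run(["y"], {"x": x})
    print(y.shape)  # expected: (1, 589, 7) = (batch, num_frames, num_classes)


if __name__ == "__main__":
    check()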
model.int8.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c0e83b2ca69b379eea37b80e2b739b1f6e43f3964c95aaac0bdeb5e2e225ec6e
size 2415974
model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d4d3926911a214a1e56df418f8d967f6dee3e139348a0738b6ef982fb3108fd4
size 9512165
run.sh
ADDED
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)

set -ex

function install_pyannote() {
  pip install pyannote.audio onnx onnxruntime
}

function download_test_files() {
  curl -SL -O https://huggingface.co/Revai/reverb-diarization-v1/resolve/main/pytorch_model.bin
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
}

install_pyannote
download_test_files

./export-onnx.py
./preprocess.sh

echo "----------torch----------"
./vad-torch.py

echo "----------onnx model.onnx----------"
./vad-onnx.py --model ./model.onnx --wav ./lei-jun-test.wav

echo "----------onnx model.int8.onnx----------"
./vad-onnx.py --model ./model.int8.onnx --wav ./lei-jun-test.wav

curl -SL -O https://huggingface.co/Revai/reverb-diarization-v1/resolve/main/LICENSE

cat >README.md << EOF
# Introduction

Models in this repository are converted from
https://huggingface.co/Revai/reverb-diarization-v1/tree/main

Note that they are available under a non-commercial license.

Please see ./LICENSE for details.

See also
https://www.rev.com/blog/speech-to-text-technology/introducing-reverb-open-source-asr-diarization

EOF
show-onnx.py
ADDED
@@ -0,0 +1,43 @@
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)

import argparse

import onnxruntime


def get_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--filename",
        type=str,
        required=True,
        help="Path to model.onnx",
    )

    return parser.parse_args()


def show(filename):
    session_opts = onnxruntime.SessionOptions()
    session_opts.log_severity_level = 3
    sess = onnxruntime.InferenceSession(filename, session_opts)
    for i in sess.get_inputs():
        print(i)

    print("-----")

    for i in sess.get_outputs():
        print(i)


def main():
    args = get_args()
    print(f"========={args.filename}==========")
    show(args.filename)


if __name__ == "__main__":
    main()
speaker-diarization-onnx.py
ADDED
@@ -0,0 +1,488 @@
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)

"""
Please refer to
https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/speaker-diarization.yaml
for usages.
"""

import argparse
from datetime import timedelta
from pathlib import Path
from typing import List

import librosa
import numpy as np
import onnxruntime as ort
import sherpa_onnx
import soundfile as sf
from numpy.lib.stride_tricks import as_strided


class Segment:
    def __init__(
        self,
        start,
        end,
        speaker,
    ):
        assert start < end
        self.start = start
        self.end = end
        self.speaker = speaker

    def merge(self, other, gap=0.5):
        assert self.speaker == other.speaker, (self.speaker, other.speaker)
        if self.end < other.start and self.end + gap >= other.start:
            return Segment(start=self.start, end=other.end, speaker=self.speaker)
        elif other.end < self.start and other.end + gap >= self.start:
            return Segment(start=other.start, end=self.end, speaker=self.speaker)
        else:
            return None

    @property
    def duration(self):
        return self.end - self.start

    def __str__(self):
        s = f"{timedelta(seconds=self.start)}"[:-3]
        s += " --> "
        s += f"{timedelta(seconds=self.end)}"[:-3]
        s += f" speaker_{self.speaker:02d}"
        return s


def merge_segment_list(in_out: List[Segment], min_duration_off: float):
    changed = True
    while changed:
        changed = False
        for i in range(len(in_out)):
            if i + 1 >= len(in_out):
                continue

            new_segment = in_out[i].merge(in_out[i + 1], gap=min_duration_off)
            if new_segment is None:
                continue
            del in_out[i + 1]
            in_out[i] = new_segment
            changed = True
            break


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--seg-model",
        type=str,
        required=True,
        help="Path to model.onnx for segmentation",
    )
    parser.add_argument(
        "--speaker-embedding-model",
        type=str,
        required=True,
        help="Path to model.onnx for speaker embedding extractor",
    )
    parser.add_argument("--wav", type=str, required=True, help="Path to test.wav")

    return parser.parse_args()


class OnnxSegmentationModel:
    def __init__(self, filename):
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1

        self.session_opts = session_opts

        self.model = ort.InferenceSession(
            filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )

        meta = self.model.get_modelmeta().custom_metadata_map
        print(meta)

        self.window_size = int(meta["window_size"])
        self.sample_rate = int(meta["sample_rate"])
        self.window_shift = int(0.1 * self.window_size)
        self.receptive_field_size = int(meta["receptive_field_size"])
        self.receptive_field_shift = int(meta["receptive_field_shift"])
        self.num_speakers = int(meta["num_speakers"])
        self.powerset_max_classes = int(meta["powerset_max_classes"])
        self.num_classes = int(meta["num_classes"])

    def __call__(self, x):
        """
        Args:
          x: (N, num_samples)
        Returns:
          A tensor of shape (N, num_frames, num_classes)
        """
        x = np.expand_dims(x, axis=1)

        (y,) = self.model.run(
            [self.model.get_outputs()[0].name], {self.model.get_inputs()[0].name: x}
        )

        return y


def load_wav(filename, expected_sample_rate) -> np.ndarray:
    audio, sample_rate = sf.read(filename, dtype="float32", always_2d=True)
    audio = audio[:, 0]  # only use the first channel
    if sample_rate != expected_sample_rate:
        audio = librosa.resample(
            audio,
            orig_sr=sample_rate,
            target_sr=expected_sample_rate,
        )
    return audio


def get_powerset_mapping(num_classes, num_speakers, powerset_max_classes):
    mapping = np.zeros((num_classes, num_speakers))

    k = 1
    for i in range(1, powerset_max_classes + 1):
        if i == 1:
            for j in range(0, num_speakers):
                mapping[k, j] = 1
                k += 1
        elif i == 2:
            for j in range(0, num_speakers):
                for m in range(j + 1, num_speakers):
                    mapping[k, j] = 1
                    mapping[k, m] = 1
                    k += 1
        elif i == 3:
            raise RuntimeError("Unsupported")

    return mapping


def to_multi_label(y, mapping):
    """
    Args:
      y: (num_chunks, num_frames, num_classes)
    Returns:
      A tensor of shape (num_chunks, num_frames, num_speakers)
    """
    y = np.argmax(y, axis=-1)
    labels = mapping[y.reshape(-1)].reshape(y.shape[0], y.shape[1], -1)
    return labels


# speaker count per frame
def speaker_count(labels, seg_m):
    """
    Args:
      labels: (num_chunks, num_frames, num_speakers)
      seg_m: Segmentation model
    Returns:
      An integer array of shape (num_total_frames,)
    """
    labels = labels.sum(axis=-1)
    # Now labels: (num_chunks, num_frames)

    num_frames = (
        int(
            (seg_m.window_size + (labels.shape[0] - 1) * seg_m.window_shift)
            / seg_m.receptive_field_shift
        )
        + 1
    )
    ans = np.zeros((num_frames,))
    count = np.zeros((num_frames,))

    for i in range(labels.shape[0]):
        this_chunk = labels[i]
        start = int(i * seg_m.window_shift / seg_m.receptive_field_shift + 0.5)
        end = start + this_chunk.shape[0]
        ans[start:end] += this_chunk
        count[start:end] += 1

    ans /= np.maximum(count, 1e-12)

    return (ans + 0.5).astype(np.int8)


def load_speaker_embedding_model(filename):
    config = sherpa_onnx.SpeakerEmbeddingExtractorConfig(
        model=filename,
        num_threads=1,
        debug=0,
    )
    if not config.validate():
        raise ValueError(f"Invalid config. {config}")
    extractor = sherpa_onnx.SpeakerEmbeddingExtractor(config)
    return extractor


def get_embeddings(embedding_filename, audio, labels, seg_m, exclude_overlap):
    """
    Args:
      embedding_filename: Path to the speaker embedding extractor model
      audio: (num_samples,)
      labels: (num_chunks, num_frames, num_speakers)
      seg_m: segmentation model
    Returns:
      Return (num_chunks, num_speakers, embedding_dim)
    """
    if exclude_overlap:
        labels = labels * (labels.sum(axis=-1, keepdims=True) < 2)

    extractor = load_speaker_embedding_model(embedding_filename)
    buffer = np.empty(seg_m.window_size)
    num_chunks, num_frames, num_speakers = labels.shape

    ans_chunk_speaker_pair = []
    ans_embeddings = []

    for i in range(num_chunks):
        labels_T = labels[i].T
        # labels_T: (num_speakers, num_frames)

        sample_offset = i * seg_m.window_shift

        for j in range(num_speakers):
            frames = labels_T[j]
            if frames.sum() < 10:
                # skip speakers with fewer than 10 active frames in this chunk,
                # i.e., about 0.17 seconds of speech
                continue

            start = None
            start_samples = 0
            idx = 0
            for k in range(num_frames):
                if frames[k] != 0:
                    if start is None:
                        start = k
                elif start is not None:
                    start_samples = (
                        int(start / num_frames * seg_m.window_size) + sample_offset
                    )
                    end_samples = (
                        int(k / num_frames * seg_m.window_size) + sample_offset
                    )
                    num_samples = end_samples - start_samples
                    buffer[idx : idx + num_samples] = audio[start_samples:end_samples]
                    idx += num_samples

                    start = None
            if start is not None:
                start_samples = (
                    int(start / num_frames * seg_m.window_size) + sample_offset
                )
                end_samples = int(k / num_frames * seg_m.window_size) + sample_offset
                num_samples = end_samples - start_samples
                buffer[idx : idx + num_samples] = audio[start_samples:end_samples]
                idx += num_samples

            stream = extractor.create_stream()
            stream.accept_waveform(sample_rate=seg_m.sample_rate, waveform=buffer[:idx])
            stream.input_finished()

            assert extractor.is_ready(stream)
            embedding = extractor.compute(stream)
            embedding = np.array(embedding)

            ans_chunk_speaker_pair.append([i, j])
            ans_embeddings.append(embedding)

    assert len(ans_chunk_speaker_pair) == len(ans_embeddings), (
        len(ans_chunk_speaker_pair),
        len(ans_embeddings),
    )
    return ans_chunk_speaker_pair, np.array(ans_embeddings)


def main():
    args = get_args()
    assert Path(args.seg_model).is_file(), args.seg_model
    assert Path(args.wav).is_file(), args.wav

    seg_m = OnnxSegmentationModel(args.seg_model)
    audio = load_wav(args.wav, seg_m.sample_rate)
    # audio: (num_samples,)

    num = (audio.shape[0] - seg_m.window_size) // seg_m.window_shift + 1

    samples = as_strided(
        audio,
        shape=(num, seg_m.window_size),
        strides=(seg_m.window_shift * audio.strides[0], audio.strides[0]),
    )

    # or use torch.Tensor.unfold
    # samples = torch.from_numpy(audio).unfold(0, seg_m.window_size, seg_m.window_shift).numpy()

    if (
        audio.shape[0] < seg_m.window_size
        or (audio.shape[0] - seg_m.window_size) % seg_m.window_shift > 0
    ):
        has_last_chunk = True
    else:
        has_last_chunk = False

    num_chunks = samples.shape[0]
    batch_size = 32
    output = []
    for i in range(0, num_chunks, batch_size):
        start = i
        end = i + batch_size
        # it's perfectly ok to use end > num_chunks
        y = seg_m(samples[start:end])
        output.append(y)

    if has_last_chunk:
        last_chunk = audio[num_chunks * seg_m.window_shift :]  # noqa
        pad_size = seg_m.window_size - last_chunk.shape[0]
        last_chunk = np.pad(last_chunk, (0, pad_size))
        last_chunk = np.expand_dims(last_chunk, axis=0)
        y = seg_m(last_chunk)
        output.append(y)

    y = np.vstack(output)
    # y: (num_chunks, num_frames, num_classes)

    mapping = get_powerset_mapping(
        num_classes=seg_m.num_classes,
        num_speakers=seg_m.num_speakers,
        powerset_max_classes=seg_m.powerset_max_classes,
    )
    labels = to_multi_label(y, mapping=mapping)
    # labels: (num_chunks, num_frames, num_speakers)

    inactive = (labels.sum(axis=1) == 0).astype(np.int8)
    # inactive: (num_chunks, num_speakers)

    speakers_per_frame = speaker_count(labels=labels, seg_m=seg_m)
    # speakers_per_frame: (num_frames,)

    if speakers_per_frame.max() == 0:
        print("No speakers found in the audio file!")
        return

    # if users specify only 1 speaker for clustering, then return the
    # result directly

    # Now, get embeddings
    chunk_speaker_pair, embeddings = get_embeddings(
        args.speaker_embedding_model,
        audio=audio,
        labels=labels,
        seg_m=seg_m,
        # exclude_overlap=True,
        exclude_overlap=False,
    )
    # chunk_speaker_pair: a list of (chunk_idx, speaker_idx)
    # embeddings: (batch_size, embedding_dim)

    # Please change num_clusters or threshold by yourself.
    clustering_config = sherpa_onnx.FastClusteringConfig(num_clusters=2)
    # clustering_config = sherpa_onnx.FastClusteringConfig(threshold=0.8)
    clustering = sherpa_onnx.FastClustering(clustering_config)
    cluster_labels = clustering(embeddings)

    chunk_speaker_to_cluster = dict()
    for (chunk_idx, speaker_idx), cluster_idx in zip(
        chunk_speaker_pair, cluster_labels
    ):
        if inactive[chunk_idx, speaker_idx] == 1:
            print("skip ", chunk_idx, speaker_idx)
            continue
        chunk_speaker_to_cluster[(chunk_idx, speaker_idx)] = cluster_idx

    num_speakers = max(cluster_labels) + 1
    relabels = np.zeros((labels.shape[0], labels.shape[1], num_speakers))
    for i in range(labels.shape[0]):
        for j in range(labels.shape[1]):
            for k in range(labels.shape[2]):
                if (i, k) not in chunk_speaker_to_cluster:
                    continue
                t = chunk_speaker_to_cluster[(i, k)]

                if labels[i, j, k] == 1:
                    relabels[i, j, t] = 1

    num_frames = (
        int(
            (seg_m.window_size + (relabels.shape[0] - 1) * seg_m.window_shift)
            / seg_m.receptive_field_shift
        )
        + 1
    )

    count = np.zeros((num_frames, relabels.shape[-1]))
    for i in range(relabels.shape[0]):
        this_chunk = relabels[i]
        start = int(i * seg_m.window_shift / seg_m.receptive_field_shift + 0.5)
        end = start + this_chunk.shape[0]
        count[start:end] += this_chunk

    if has_last_chunk:
        stop_frame = int(audio.shape[0] / seg_m.receptive_field_shift)
        count = count[:stop_frame]

    sorted_count = np.argsort(-count, axis=-1)
    final = np.zeros((count.shape[0], count.shape[1]))

    for i, (c, sc) in enumerate(zip(speakers_per_frame, sorted_count)):
        for k in range(c):
            final[i, sc[k]] = 1

    min_duration_off = 0.5
    min_duration_on = 0.3
    onset = 0.5
    offset = 0.5
    # final: (num_frames, num_speakers)

    final = final.T
    for kk in range(final.shape[0]):
        segment_list = []
        frames = final[kk]

        is_active = frames[0] > onset

        start = None
        if is_active:
            start = 0
        scale = seg_m.receptive_field_shift / seg_m.sample_rate
        scale_offset = seg_m.receptive_field_size / seg_m.sample_rate * 0.5
        for i in range(1, len(frames)):
            if is_active:
                if frames[i] < offset:
                    segment = Segment(
                        start=start * scale + scale_offset,
                        end=i * scale + scale_offset,
                        speaker=kk,
                    )
                    segment_list.append(segment)
                    is_active = False
            else:
                if frames[i] > onset:
                    start = i
                    is_active = True

        if is_active:
            segment = Segment(
                start=start * scale + scale_offset,
                end=(len(frames) - 1) * scale + scale_offset,
                speaker=kk,
            )
            segment_list.append(segment)

        if len(segment_list) > 1:
            merge_segment_list(segment_list, min_duration_off=min_duration_off)
        for s in segment_list:
            if s.duration < min_duration_on:
                continue
            print(s)


if __name__ == "__main__":
    main()
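For reference, the powerset decoding performed by get_powerset_mapping and to_multi_label above turns the model's 7 output classes into per-speaker activity for 3 speakers with at most 2 simultaneous speakers. The snippet below is a worked illustration only (not one of the committed files); the mapping matrix is exactly what get_powerset_mapping(7, 3, 2) produces.

# Illustration of the powerset-to-multilabel decoding used above.
import numpy as np

# get_powerset_mapping(7, 3, 2) enumerates: silence, the three single
# speakers, and the three unordered speaker pairs.
mapping = np.array(
    [
        [0, 0, 0],  # class 0: no active speaker
        [1, 0, 0],  # class 1: speaker 0
        [0, 1, 0],  # class 2: speaker 1
        [0, 0, 1],  # class 3: speaker 2
        [1, 1, 0],  # class 4: speakers 0 + 1 overlapping
        [1, 0, 1],  # class 5: speakers 0 + 2 overlapping
        [0, 1, 1],  # class 6: speakers 1 + 2 overlapping
    ]
)

# One chunk with one frame whose argmax is class 4 decodes to simultaneous
# speech from speaker 0 and speaker 1, exactly as in to_multi_label().
y = np.array([[[0.1, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0]]])  # (num_chunks, num_frames, num_classes)
idx = np.argmax(y, axis=-1)
labels = mapping[idx.reshape(-1)].reshape(y.shape[0], y.shape[1], -1)
print(labels)  # [[[1 1 0]]]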
speaker-diarization-torch.py
ADDED
@@ -0,0 +1,86 @@
#!/usr/bin/env python3

"""
Please refer to
https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/speaker-diarization.yaml
for usages.
"""

"""
1. Go to https://huggingface.co/hbredin/wespeaker-voxceleb-resnet34-LM/tree/main

   wget https://huggingface.co/hbredin/wespeaker-voxceleb-resnet34-LM/resolve/main/speaker-embedding.onnx

2. Change line 166 of pyannote/audio/pipelines/speaker_diarization.py

```
# self._embedding = PretrainedSpeakerEmbedding(
#     self.embedding, use_auth_token=use_auth_token
# )
self._embedding = embedding
```
"""

import argparse
from pathlib import Path

import torch
from pyannote.audio import Model
from pyannote.audio.pipelines import SpeakerDiarization as SpeakerDiarizationPipeline
from pyannote.audio.pipelines.speaker_verification import (
    ONNXWeSpeakerPretrainedSpeakerEmbedding,
)


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--wav", type=str, required=True, help="Path to test.wav")

    return parser.parse_args()


def build_pipeline():
    embedding_filename = "./speaker-embedding.onnx"
    if Path(embedding_filename).is_file():
        # You need to modify line 166
        # of pyannote/audio/pipelines/speaker_diarization.py
        # Please see the comments at the start of this script for details
        embedding = ONNXWeSpeakerPretrainedSpeakerEmbedding(embedding_filename)
    else:
        embedding = "hbredin/wespeaker-voxceleb-resnet34-LM"

    pt_filename = "./pytorch_model.bin"
    segmentation = Model.from_pretrained(pt_filename)
    segmentation.eval()

    pipeline = SpeakerDiarizationPipeline(
        segmentation=segmentation,
        embedding=embedding,
        embedding_exclude_overlap=True,
    )

    params = {
        "clustering": {
            "method": "centroid",
            "min_cluster_size": 12,
            "threshold": 0.7045654963945799,
        },
        "segmentation": {"min_duration_off": 0.5},
    }

    pipeline.instantiate(params)
    return pipeline


@torch.no_grad()
def main():
    args = get_args()
    assert Path(args.wav).is_file(), args.wav
    pipeline = build_pipeline()
    print(pipeline)
    t = pipeline(args.wav)
    print(type(t))
    print(t)


if __name__ == "__main__":
    main()
vad-onnx.py
ADDED
@@ -0,0 +1,244 @@
#!/usr/bin/env python3

"""
./export-onnx.py
./preprocess.sh

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
./vad-onnx.py --model ./model.onnx --wav ./lei-jun-test.wav
"""

import argparse
from pathlib import Path

import librosa
import numpy as np
import onnxruntime as ort
import soundfile as sf
from numpy.lib.stride_tricks import as_strided


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True, help="Path to model.onnx")
    parser.add_argument("--wav", type=str, required=True, help="Path to test.wav")

    return parser.parse_args()


class OnnxModel:
    def __init__(self, filename):
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1

        self.session_opts = session_opts

        self.model = ort.InferenceSession(
            filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )

        meta = self.model.get_modelmeta().custom_metadata_map
        print(meta)

        self.window_size = int(meta["window_size"])
        self.sample_rate = int(meta["sample_rate"])
        self.window_shift = int(0.1 * self.window_size)
        self.receptive_field_size = int(meta["receptive_field_size"])
        self.receptive_field_shift = int(meta["receptive_field_shift"])
        self.num_speakers = int(meta["num_speakers"])
        self.powerset_max_classes = int(meta["powerset_max_classes"])
        self.num_classes = int(meta["num_classes"])

    def __call__(self, x):
        """
        Args:
          x: (N, num_samples)
        Returns:
          A tensor of shape (N, num_frames, num_classes)
        """
        x = np.expand_dims(x, axis=1)

        (y,) = self.model.run(
            [self.model.get_outputs()[0].name], {self.model.get_inputs()[0].name: x}
        )

        return y


def load_wav(filename, expected_sample_rate) -> np.ndarray:
    audio, sample_rate = sf.read(filename, dtype="float32", always_2d=True)
    audio = audio[:, 0]  # only use the first channel
    if sample_rate != expected_sample_rate:
        audio = librosa.resample(
            audio,
            orig_sr=sample_rate,
            target_sr=expected_sample_rate,
        )
    return audio


def get_powerset_mapping(num_classes, num_speakers, powerset_max_classes):
    mapping = np.zeros((num_classes, num_speakers))

    k = 1
    for i in range(1, powerset_max_classes + 1):
        if i == 1:
            for j in range(0, num_speakers):
                mapping[k, j] = 1
                k += 1
        elif i == 2:
            for j in range(0, num_speakers):
                for m in range(j + 1, num_speakers):
                    mapping[k, j] = 1
                    mapping[k, m] = 1
                    k += 1
        elif i == 3:
            raise RuntimeError("Unsupported")

    return mapping


def to_multi_label(y, mapping):
    """
    Args:
      y: (num_chunks, num_frames, num_classes)
    Returns:
      A tensor of shape (num_chunks, num_frames, num_speakers)
    """
    y = np.argmax(y, axis=-1)
    labels = mapping[y.reshape(-1)].reshape(y.shape[0], y.shape[1], -1)
    return labels


def main():
    args = get_args()
    assert Path(args.model).is_file(), args.model
    assert Path(args.wav).is_file(), args.wav

    m = OnnxModel(args.model)
    audio = load_wav(args.wav, m.sample_rate)
    # audio: (num_samples,)
    print("audio", audio.shape, audio.min(), audio.max(), audio.sum())

    num = (audio.shape[0] - m.window_size) // m.window_shift + 1

    samples = as_strided(
        audio,
        shape=(num, m.window_size),
        strides=(m.window_shift * audio.strides[0], audio.strides[0]),
    )

    # or use torch.Tensor.unfold
    # samples = torch.from_numpy(audio).unfold(0, m.window_size, m.window_shift).numpy()

    print(
        "samples",
        samples.shape,
        samples.mean(),
        samples.sum(),
        samples[:3, :3].sum(axis=-1),
    )

    if (
        audio.shape[0] < m.window_size
        or (audio.shape[0] - m.window_size) % m.window_shift > 0
    ):
        has_last_chunk = True
    else:
        has_last_chunk = False

    num_chunks = samples.shape[0]
    batch_size = 32
    output = []
    for i in range(0, num_chunks, batch_size):
        start = i
        end = i + batch_size
        # it's perfectly ok to use end > num_chunks
        y = m(samples[start:end])
        output.append(y)

    if has_last_chunk:
        last_chunk = audio[num_chunks * m.window_shift :]  # noqa
        pad_size = m.window_size - last_chunk.shape[0]
        last_chunk = np.pad(last_chunk, (0, pad_size))
        last_chunk = np.expand_dims(last_chunk, axis=0)
        y = m(last_chunk)
        output.append(y)

    y = np.vstack(output)
    # y: (num_chunks, num_frames, num_classes)

    mapping = get_powerset_mapping(
        num_classes=m.num_classes,
        num_speakers=m.num_speakers,
        powerset_max_classes=m.powerset_max_classes,
    )
    labels = to_multi_label(y, mapping=mapping)
    # labels: (num_chunks, num_frames, num_speakers)

    # binary classification
    labels = np.max(labels, axis=-1)
    # labels: (num_chunks, num_frames)

    num_frames = (
        int(
            (m.window_size + (labels.shape[0] - 1) * m.window_shift)
            / m.receptive_field_shift
        )
        + 1
    )

    count = np.zeros((num_frames,))
    classification = np.zeros((num_frames,))
    weight = np.hamming(labels.shape[1])

    for i in range(labels.shape[0]):
        this_chunk = labels[i]
        start = int(i * m.window_shift / m.receptive_field_shift + 0.5)
        end = start + this_chunk.shape[0]

        classification[start:end] += this_chunk * weight
        count[start:end] += weight

    classification /= np.maximum(count, 1e-12)

    if has_last_chunk:
        stop_frame = int(audio.shape[0] / m.receptive_field_shift)
        classification = classification[:stop_frame]

    classification = classification.tolist()

    onset = 0.5
    offset = 0.5

    is_active = classification[0] > onset
    start = None
    if is_active:
        start = 0

    scale = m.receptive_field_shift / m.sample_rate
    scale_offset = m.receptive_field_size / m.sample_rate * 0.5

    for i in range(len(classification)):
        if is_active:
            if classification[i] < offset:
                print(
                    f"{start*scale + scale_offset:.3f} -- {i*scale + scale_offset:.3f}"
                )
                is_active = False
        else:
            if classification[i] > onset:
                start = i
                is_active = True

    if is_active:
        print(
            f"{start*scale + scale_offset:.3f} -- {(len(classification)-1)*scale + scale_offset:.3f}"
        )


if __name__ == "__main__":
    main()
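The printed segment boundaries above come from a frame-to-time conversion: frame index i maps to i * scale + scale_offset seconds, i.e. roughly the centre of that frame's receptive field. The small sketch below (not one of the committed files) checks the arithmetic using the receptive-field values asserted in export-onnx.py; it is illustrative only.

# Quick arithmetic check of the frame/time conversion used in vad-onnx.py.
receptive_field_shift = 270  # samples between consecutive output frames
receptive_field_size = 991   # samples covered by one output frame
sample_rate = 16000

scale = receptive_field_shift / sample_rate               # 0.016875 s per frame
scale_offset = receptive_field_size / sample_rate * 0.5   # ~0.031 s half-window

for i in (0, 100, 589):
    print(f"frame {i:3d} -> {i * scale + scale_offset:.3f} s")
# frame   0 -> 0.031 s
# frame 100 -> 1.718 s
# frame 589 -> 9.970 s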
vad-torch.py
ADDED
@@ -0,0 +1,38 @@
#!/usr/bin/env python3

import torch
from pyannote.audio import Model
from pyannote.audio.pipelines import (
    VoiceActivityDetection as VoiceActivityDetectionPipeline,
)


@torch.no_grad()
def main():
    # Please download it from
    # https://huggingface.co/csukuangfj/pyannote-models/tree/main/segmentation-3.0
    pt_filename = "./pytorch_model.bin"
    model = Model.from_pretrained(pt_filename)
    model.eval()

    pipeline = VoiceActivityDetectionPipeline(segmentation=model)

    # https://huggingface.co/pyannote/voice-activity-detection/blob/main/config.yaml
    # https://github.com/pyannote/pyannote-audio/issues/1215
    initial_params = {
        "min_duration_on": 0.0,
        "min_duration_off": 0.0,
    }
    pipeline.onset = 0.5
    pipeline.offset = 0.5

    pipeline.instantiate(initial_params)

    # wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
    t = pipeline("./lei-jun-test.wav")
    print(type(t))
    print(t)


if __name__ == "__main__":
    main()