Upload folder using huggingface_hub
- app.py +42 -0
- musc/__init__.py +0 -0
- musc/dtw/__init__.py +0 -0
- musc/dtw/anchor.py +147 -0
- musc/dtw/core.py +205 -0
- musc/dtw/cost.py +79 -0
- musc/dtw/mrmsdtw.py +616 -0
- musc/dtw/utils.py +377 -0
- musc/dtw/visualization.py +216 -0
- musc/model.py +220 -0
- musc/pathway.py +114 -0
- musc/pitch_estimator.py +206 -0
- musc/postprocessing.py +533 -0
- musc/representations.py +212 -0
- musc/synchronizer.py +299 -0
- musc/transcriber.py +163 -0
- requirements.txt +9 -0
- violin.json +17 -0
- violin_model.pt +3 -0
app.py
ADDED
@@ -0,0 +1,42 @@
import gradio as gr
from musc.model import PretrainedModel
from json import load as json_load
from mido import MidiFile, MidiTrack
from os import remove as os_remove
Model = PretrainedModel(json_load(open("violin.json")), "violin_model.pt").to("cpu")
def merge_violin_tracks(input_midi, output_midi):
    mid = MidiFile(input_midi)
    new_mid = MidiFile(ticks_per_beat=mid.ticks_per_beat)
    new_track = MidiTrack()
    new_mid.tracks.append(new_track)
    events = []
    for track in mid.tracks:
        current_time = 0
        for msg in track:
            current_time += msg.time
            events.append((current_time, msg))
    events.sort(key=lambda x: x[0])
    last_time = 0
    for event_time, msg in events:
        delta_time = event_time - last_time
        new_track.append(msg.copy(time=delta_time))
        last_time = event_time
    for track in mid.tracks:
        for msg in track:
            if msg.type == 'set_tempo':
                new_track.insert(0, msg)
    new_mid.save(output_midi)

def transcribe_and_generate_midi(music_file_path, model=Model, batch_size=32):
    model.transcribe(music_file_path, batch_size=batch_size).write("output.midi")
    merge_violin_tracks("output.midi", "output.midi")
    os_remove(music_file_path)
    return "output.midi"

gr.Interface(
    fn=transcribe_and_generate_midi,
    inputs=gr.Audio(label="Upload your Audio file", type="filepath"),
    outputs=gr.File(label="Download MIDI file"),
    title="Audio2Violin",
    description="Upload an audio file, and it will be transcribed into violin MIDI format."
).launch()
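As a quick sanity check, the function above can also be driven without the Gradio UI. The sketch below is illustrative only: it assumes it runs in the same module context as app.py (model and functions already defined), and "example.wav" is a placeholder path, not a file shipped with this commit.

import shutil

def run_local_demo(audio_path: str = "example.wav") -> str:
    # transcribe_and_generate_midi deletes its input file, so work on a copy
    tmp_copy = "input_copy.wav"
    shutil.copy(audio_path, tmp_copy)
    midi_path = transcribe_and_generate_midi(tmp_copy, batch_size=16)
    print(f"MIDI written to {midi_path}")
    return midi_path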
musc/__init__.py
ADDED
File without changes
musc/dtw/__init__.py
ADDED
File without changes
musc/dtw/anchor.py
ADDED
@@ -0,0 +1,147 @@
from numba import jit
import numpy as np
from typing import Tuple


def project_alignment_on_a_new_feature_rate(alignment: np.ndarray,
                                            feature_rate_old: int,
                                            feature_rate_new: int,
                                            cost_matrix_size_old: tuple = (),
                                            cost_matrix_size_new: tuple = ()) -> np.ndarray:
    """Projects an alignment computed for a cost matrix on a certain
    feature resolution on a cost matrix having a different feature
    resolution.

    Parameters
    ----------
    alignment : np.ndarray [shape=(2, N)]
        Alignment matrix

    feature_rate_old : int
        Feature rate of the old cost matrix

    feature_rate_new : int
        Feature rate of the new cost matrix

    cost_matrix_size_old : tuple
        Size of the old cost matrix. Possibly needed to deal with border cases

    cost_matrix_size_new : tuple
        Size of the new cost matrix. Possibly needed to deal with border cases

    Returns
    -------
    np.ndarray [shape=(2, N)]
        Anchor sequence for the new cost matrix
    """
    # Project the alignment on the new feature rate
    fac = feature_rate_new / feature_rate_old
    anchors = np.round(alignment * fac) + 1

    # In case the sizes of the cost matrices are given explicitly and the
    # alignment specifies to align the first and last elements, handle this case
    # separately since this might cause problems in the general projection
    # procedure.
    if cost_matrix_size_old is not None and cost_matrix_size_new is not None:
        if np.array_equal(alignment[:, 0], np.array([0, 0])):
            anchors[:, 0] = np.array([1, 1])

        if np.array_equal(alignment[:, -1], np.array(cost_matrix_size_old) - 1):
            anchors[:, -1] = np.array(cost_matrix_size_new)

    return anchors - 1


def derive_anchors_from_projected_alignment(projected_alignment: np.ndarray,
                                            threshold: int) -> np.ndarray:
    """Derive anchors from a projected alignment such that the area of the rectangle
    defined by two subsequent anchors a1 and a2 is below a given threshold.

    Parameters
    ----------
    projected_alignment : np.ndarray [shape=(2, N)]
        Projected alignment array

    threshold : int
        Maximum area of the constraint rectangle

    Returns
    -------
    anchors_res : np.ndarray [shape=(2, M)]
        Resulting anchor sequence
    """
    L = projected_alignment.shape[1]

    a1 = np.array(projected_alignment[:, 0], copy=True).reshape(-1, 1)
    a2 = np.array(projected_alignment[:, -1], copy=True).reshape(-1, 1)

    if __compute_area(a1, a2) <= threshold:
        anchors_res = np.concatenate([a1, a2], axis=1)

    elif L > 2:
        center = int(np.floor(L/2 + 1))

        a1 = np.array(projected_alignment[:, 0], copy=True).reshape(-1, 1)
        a2 = np.array(projected_alignment[:, center - 1], copy=True).reshape(-1, 1)
        a3 = np.array(projected_alignment[:, -1], copy=True).reshape(-1, 1)

        if __compute_area(a1, a2) > threshold:
            anchors_1 = derive_anchors_from_projected_alignment(projected_alignment[:, 0:center], threshold)
        else:
            anchors_1 = np.concatenate([a1, a2], axis=1)

        if __compute_area(a2, a3) > threshold:
            anchors_2 = derive_anchors_from_projected_alignment(projected_alignment[:, center - 1:], threshold)
        else:
            anchors_2 = np.concatenate([a2, a3], axis=1)

        anchors_res = np.concatenate([anchors_1, anchors_2[:, 1:]], axis=1)

    else:
        if __compute_area(a1, a2) > threshold:
            print('Only two anchor points are given which do not fulfill the constraint.')
        anchors_res = np.concatenate([a1, a2], axis=1)

    return anchors_res


def derive_neighboring_anchors(warping_path: np.ndarray,
                               anchor_indices: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Compute anchor points in the neighborhood of previous anchor points.

    Parameters
    ----------
    warping_path : np.ndarray [shape=(2, N)]
        Warping path

    anchor_indices : np.ndarray
        Indices corresponding to the anchor points in the ``warping_path``

    Returns
    -------
    neighboring_anchors : np.ndarray [shape=(2, N-1)]
        Sequence of neighboring anchors

    neighboring_anchor_indices : np.ndarray
        Indices into ``warping_path`` corresponding to ``neighboring_anchors``
    """
    L = anchor_indices.shape[0]
    neighboring_anchor_indices = np.zeros(L-1, dtype=int)
    neighboring_anchors = np.zeros((2, L-1), dtype=int)

    for k in range(1, L):
        i1 = anchor_indices[k-1]
        i2 = anchor_indices[k]

        neighboring_anchor_indices[k-1] = i1 + np.floor((i2 - i1) / 2)
        neighboring_anchors[:, k-1] = warping_path[:, neighboring_anchor_indices[k - 1]]

    return neighboring_anchors, neighboring_anchor_indices


@jit(nopython=True)
def __compute_area(a: tuple,
                   b: tuple):
    """Computes the area between two points, given as tuples"""
    return (b[0] - a[0] + 1) * (b[1] - a[1] + 1)
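To illustrate how derive_anchors_from_projected_alignment subdivides an alignment, here is a small toy example; the numbers are invented for illustration and are not part of the repository.

import numpy as np

# Toy coarse alignment: four corresponding frame pairs (sequence 1 on top row,
# sequence 2 on the bottom row).
toy_alignment = np.array([[0, 10, 20, 30],
                          [0,  8, 22, 28]])
# With threshold=500, the full rectangle (31 x 29 = 899) is too large, so the
# alignment is split at its middle column, yielding three anchors.
anchors = derive_anchors_from_projected_alignment(toy_alignment, threshold=500)
print(anchors)  # columns (0, 0), (20, 22), (30, 28)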
musc/dtw/core.py
ADDED
@@ -0,0 +1,205 @@
import librosa
from numba import jit
import numpy as np


@jit(nopython=True, cache=True)
def __C_to_DE(C: np.ndarray = None,
              dn: np.ndarray = np.array([1, 1, 0], np.int64),
              dm: np.ndarray = np.array([1, 0, 1], np.int64),
              dw: np.ndarray = np.array([1.0, 1.0, 1.0], np.float64),
              sub_sequence: bool = False) -> tuple[np.ndarray, np.ndarray]:
    """This function computes the accumulated cost matrix D and the step index
    matrix E.

    Parameters
    ----------
    C : np.ndarray (np.float32 / np.float64) [shape=(N, M)]
        Cost matrix

    dn : np.ndarray (np.int64) [shape=(1, S)]
        Integer array defining valid steps (N direction of C), default: [1, 1, 0]

    dm : np.ndarray (np.int64) [shape=(1, S)]
        Integer array defining valid steps (M direction of C), default: [1, 0, 1]

    dw : np.ndarray (np.float64) [shape=(1, S)]
        Double array defining the weight of each step, default: [1.0, 1.0, 1.0]

    sub_sequence : bool
        Set `True` for SubSequence DTW, default: False

    Returns
    -------
    D : np.ndarray (np.float64) [shape=(N, M)]
        Accumulated cost matrix of type double

    E : np.ndarray (np.int64) [shape=(N, M)]
        Step index matrix.
        E[n, m] holds the index of the step taken to determine the value of D[n, m].
        If E[n, m] is zero, no valid step was possible.
        NaNs in the cost matrix are preserved, invalid fields in the cost matrix are NaNs.
    """
    if C is None:
        raise ValueError('C must be a 2D numpy array.')

    N, M = C.shape
    S = dn.size

    if S != dm.size or S != dw.size:
        raise ValueError('The parameters dn, dm, and dw must be of equal length.')

    # calc bounding box size of steps
    sbbn = np.max(dn)
    sbbm = np.max(dm)

    # initialize E
    E = np.zeros((N, M), np.int64) - 1

    # initialize extended D matrix
    D = np.ones((sbbn + N, sbbm + M), np.float64) * np.inf

    if sub_sequence:
        for m in range(M):
            D[sbbn, sbbm + m] = C[0, m]
    else:
        D[sbbn, sbbm] = C[0, 0]

    # accumulate
    for m in range(sbbm, M + sbbm):
        for n in range(sbbn, N + sbbn):
            for s in range(S):
                cost = D[n - dn[s], m - dm[s]] + C[n - sbbn, m - sbbm] * dw[s]
                if cost < D[n, m]:
                    D[n, m] = cost
                    E[n - sbbn, m - sbbm] = s

    D = D[sbbn: N + sbbn, sbbm: M + sbbm]

    return D, E


@jit(nopython=True, cache=True)
def __E_to_warping_path(E: np.ndarray,
                        dn: np.ndarray = np.array([1, 1, 0], np.int64),
                        dm: np.ndarray = np.array([1, 0, 1], np.int64),
                        sub_sequence: bool = False,
                        end_index: int = -1) -> np.ndarray:
    """This function computes a warping path based on the provided matrix E
    and the allowed steps.

    Parameters
    ----------
    E : np.ndarray (np.int64) [shape=(N, M)]
        Step index matrix

    dn : np.ndarray (np.int64) [shape=(1, S)]
        Integer array defining valid steps (N direction of C), default: [1, 1, 0]

    dm : np.ndarray (np.int64) [shape=(1, S)]
        Integer array defining valid steps (M direction of C), default: [1, 0, 1]

    sub_sequence : bool
        Set `True` for SubSequence DTW, default: False

    end_index : int
        In case of SubSequence DTW

    Returns
    -------
    warping_path : np.ndarray (np.int64) [shape=(2, M)]
        Resulting optimal warping path
    """
    N, M = E.shape

    if not sub_sequence and end_index == -1:
        end_index = M - 1

    m = end_index
    n = N - 1

    warping_path = np.zeros((2, n + m + 1))

    index = 0

    def _loop(m, n, index):
        warping_path[:, index] = np.array([n, m])
        step_index = E[n, m]
        m -= dm[step_index]
        n -= dn[step_index]
        index += 1
        return m, n, index

    if sub_sequence:
        while n > 0:
            m, n, index = _loop(m, n, index)
    else:
        while m > 0 or n > 0:
            m, n, index = _loop(m, n, index)

    warping_path[:, index] = np.array([n, m])
    warping_path = warping_path[:, index::-1]

    return warping_path


def compute_warping_path(C: np.ndarray,
                         step_sizes: np.ndarray = np.array([[1, 0], [0, 1], [1, 1]], np.int64),
                         step_weights: np.ndarray = np.array([1.0, 1.0, 1.0], np.float64),
                         implementation: str = 'synctoolbox'):
    """Applies DTW on cost matrix C.

    Parameters
    ----------
    C : np.ndarray (np.float32 / np.float64) [shape=(N, M)]
        Cost matrix

    step_sizes : np.ndarray (np.int64) [shape=(2, S)]
        Array of step sizes

    step_weights : np.ndarray (np.float64) [shape=(2, S)]
        Array of step weights

    implementation: str
        Choose among ``synctoolbox`` and ``librosa``. (default: ``synctoolbox``)

    Returns
    -------
    D : np.ndarray (np.float64) [shape=(N, M)]
        Accumulated cost matrix

    E : np.ndarray (np.int64) [shape=(N, M)]
        Step index matrix

    wp : np.ndarray (np.int64) [shape=(2, M)]
        Warping path
    """
    if implementation == 'librosa':
        D, wp, E = librosa.sequence.dtw(C=C,
                                        step_sizes_sigma=step_sizes,
                                        weights_add=np.array([0, 0, 0]),
                                        weights_mul=step_weights,
                                        return_steps=True,
                                        subseq=False)
        wp = wp[::-1].T

    elif implementation == 'synctoolbox':
        dn = step_sizes[:, 0]
        dm = step_sizes[:, 1]

        D, E = __C_to_DE(C,
                         dn=dn,
                         dm=dm,
                         dw=step_weights,
                         sub_sequence=False)

        wp = __E_to_warping_path(E=E,
                                 dn=dn,
                                 dm=dm,
                                 sub_sequence=False)

    else:
        raise NotImplementedError(f'No implementation found called {implementation}')

    return D, E, wp
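A minimal sketch of compute_warping_path on a toy cost matrix (the values below are illustrative only):

import numpy as np

# 3x3 cost matrix whose cheapest path runs along the diagonal.
C = np.array([[0.1, 0.8, 0.9],
              [0.7, 0.2, 0.8],
              [0.9, 0.7, 0.1]], dtype=np.float64)
D, E, wp = compute_warping_path(C, implementation='synctoolbox')
print(D[-1, -1])  # accumulated cost of the optimal path (0.4 here)
print(wp)         # path from (0, 0) to (2, 2)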
musc/dtw/cost.py
ADDED
@@ -0,0 +1,79 @@
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

#@jit(nopython=True)
def cosine_distance(f1, f2, cos_meas_max=2.0, cos_meas_min=1.0):
    """For all pairs of vectors f1' and f2' in f1 and f2, computes 1 - (f1.f2),
    where '.' is the dot product, and rescales the results to lie in the
    range [cos_meas_min, cos_meas_max].
    Corresponds to regular cosine distance if f1' and f2' are normalized and
    cos_meas_min==0.0 and cos_meas_max==1.0."""
    return (1 - f1.T @ f2) * (cos_meas_max - cos_meas_min) + cos_meas_min


#@jit(nopython=True)
def euclidean_distance(f1, f2, l2_meas_max=1.0, l2_meas_min=0.0):
    """Computes euclidean distances between the vectors in f1 and f2, and
    rescales the results to lie in the range [l2_meas_min, l2_meas_max]."""

    #S1 = np.zeros((f1.shape[1], f2.shape[1]))
    #for n in range(f2.shape[1]):
    #    S1[:, n] = np.sqrt(np.sum((f1.T - f2[:, n]) ** 2, axis=1))
    S1 = euclidean_distances(f1.T, f2.T)

    return S1 * (l2_meas_max - l2_meas_min) + l2_meas_min


def compute_high_res_cost_matrix(f_chroma1: np.ndarray,
                                 f_chroma2: np.ndarray,
                                 f_onset1: np.ndarray,
                                 f_onset2: np.ndarray,
                                 weights: np.ndarray = np.array([1.0, 1.0]),
                                 cos_meas_min: float = 1.0,
                                 cos_meas_max: float = 2.0,
                                 l2_meas_min: float = 0.0,
                                 l2_meas_max: float = 1.0):
    """Computes cost matrix of two sequences using two feature matrices
    for each sequence. Cosine distance is used for the chroma sequences and
    euclidean distance is used for the DLNCO sequences.

    Parameters
    ----------
    f_chroma1 : np.ndarray [shape=(12, N)]
        Chroma feature matrix of the first sequence (assumed to be normalized).

    f_chroma2 : np.ndarray [shape=(12, M)]
        Chroma feature matrix of the second sequence (assumed to be normalized).

    f_onset1 : np.ndarray [shape=(12, N)]
        DLNCO feature matrix of the first sequence

    f_onset2 : np.ndarray [shape=(12, M)]
        DLNCO feature matrix of the second sequence

    weights : np.ndarray [shape=[2,]]
        Weights array for the high-resolution cost computation.
        weights[0] * cosine_distance + weights[1] * euclidean_distance

    cos_meas_min : float
        Cosine distances are shifted to be at least ``cos_meas_min``

    cos_meas_max : float
        Cosine distances are scaled to be at most ``cos_meas_max``

    l2_meas_min : float
        Euclidean distances are shifted to be at least ``l2_meas_min``

    l2_meas_max : float
        Euclidean distances are scaled to be at most ``l2_meas_max``

    Returns
    -------
    C: np.ndarray [shape=(N, M)]
        Cost matrix
    """
    cos_dis = cosine_distance(f_chroma1, f_chroma2, cos_meas_min=cos_meas_min, cos_meas_max=cos_meas_max)
    euc_dis = euclidean_distance(f_onset1, f_onset2, l2_meas_min=l2_meas_min, l2_meas_max=l2_meas_max)

    return weights[0] * cos_dis + weights[1] * euc_dis
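A short sketch of compute_high_res_cost_matrix on random, column-normalized features, just to show the expected shapes; the random data is illustrative and not part of the repository.

import numpy as np

rng = np.random.default_rng(0)
f_chroma1 = rng.random((12, 40)); f_chroma1 /= np.linalg.norm(f_chroma1, axis=0)
f_chroma2 = rng.random((12, 50)); f_chroma2 /= np.linalg.norm(f_chroma2, axis=0)
f_onset1 = rng.random((12, 40))   # stand-in for DLNCO features
f_onset2 = rng.random((12, 50))

C = compute_high_res_cost_matrix(f_chroma1, f_chroma2, f_onset1, f_onset2,
                                 weights=np.array([0.5, 0.5]))
print(C.shape)  # (40, 50): one cost entry per frame pair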
musc/dtw/mrmsdtw.py
ADDED
@@ -0,0 +1,616 @@
from numba import jit
import numpy as np
import time
from typing import List, Tuple, Optional

from .anchor import derive_anchors_from_projected_alignment, derive_neighboring_anchors, \
    project_alignment_on_a_new_feature_rate
from .utils import build_path_from_warping_paths, compute_cost_matrices_between_anchors, smooth_downsample_feature, normalize_feature, compute_warping_paths_from_cost_matrices, find_anchor_indices_in_warping_path
from .visualization import sync_visualize_step1, sync_visualize_step2


def sync_via_mrmsdtw_with_anchors(f_chroma1: np.ndarray,
                                  f_chroma2: np.ndarray,
                                  f_onset1: np.ndarray = None,
                                  f_onset2: np.ndarray = None,
                                  input_feature_rate: float = 50,
                                  step_sizes: np.ndarray = np.array([[1, 0], [0, 1], [1, 1]], np.int32),
                                  step_weights: np.ndarray = np.array([1.0, 1.0, 1.0], np.float64),
                                  threshold_rec: int = 10000,
                                  win_len_smooth: np.ndarray = np.array([201, 101, 21, 1]),
                                  downsamp_smooth: np.ndarray = np.array([50, 25, 5, 1]),
                                  verbose: bool = False,
                                  dtw_implementation: str = 'synctoolbox',
                                  normalize_chroma: bool = True,
                                  chroma_norm_ord: int = 2,
                                  chroma_norm_threshold: float = 0.001,
                                  visualization_title: str = "MrMsDTW result",
                                  anchor_pairs: List[Tuple] = None,
                                  linear_inp_idx: List[int] = [],
                                  alpha=0.5) -> np.ndarray:
    """Compute memory-restricted multi-scale DTW (MrMsDTW) using chroma and (optionally) onset features.
    MrMsDTW is performed on multiple levels that get progressively finer, with rectangular constraint
    regions defined by the alignment found on the previous, coarser level.
    If onset features are provided, these are used on the finest level in addition to chroma
    to provide higher synchronization accuracy.

    Parameters
    ----------
    f_chroma1 : np.ndarray [shape=(12, N)]
        Chroma feature matrix of the first sequence

    f_chroma2 : np.ndarray [shape=(12, M)]
        Chroma feature matrix of the second sequence

    f_onset1 : np.ndarray [shape=(L, N)]
        Onset feature matrix of the first sequence (optional, default: None)

    f_onset2 : np.ndarray [shape=(L, M)]
        Onset feature matrix of the second sequence (optional, default: None)

    input_feature_rate: int
        Input feature rate of the chroma features (default: 50)

    step_sizes: np.ndarray
        DTW step sizes (default: np.array([[1, 0], [0, 1], [1, 1]]))

    step_weights: np.ndarray
        DTW step weights (np.array([1.0, 1.0, 1.0]))

    threshold_rec: int
        Defines the maximum area that is spanned by the rectangle of two
        consecutive elements in the alignment (default: 10000)

    win_len_smooth : np.ndarray
        Window lengths for chroma feature smoothing (default: np.array([201, 101, 21, 1]))

    downsamp_smooth : np.ndarray
        Downsampling factors (default: np.array([50, 25, 5, 1]))

    verbose : bool
        Set `True` for visualization (default: False)

    dtw_implementation : str
        DTW implementation, librosa or synctoolbox (default: synctoolbox)

    normalize_chroma : bool
        Set `True` to normalize input chroma features after each downsampling
        and smoothing operation.

    chroma_norm_ord: int
        Order of chroma normalization, relevant if ``normalize_chroma`` is True.
        (default: 2)

    chroma_norm_threshold: float
        If the norm falls below threshold for a feature vector, then the
        normalized feature vector is set to be the unit vector. Relevant, if
        ``normalize_chroma`` is True (default: 0.001)

    visualization_title : str
        Title for the visualization plots. Only relevant if 'verbose' is True
        (default: "MrMsDTW result")

    anchor_pairs: List[Tuple]
        Anchor pairs given in seconds. Note that
        * (0, 0) and (<audio-len1>, <audio-len2>) are not allowed.
        * Anchors must be monotonically increasing.

    linear_inp_idx: List[int]
        List of the indices of intervals created by anchor pairs, for which
        MrMsDTW shouldn't be run, e.g., if the interval only involves silence.

        0     ap1    ap2    ap3
        |      |      |      |
        | idx0 | idx1 | idx2 | idx3 OR idx-1
        |      |      |      |

        Note that index -1 corresponds to the last interval, which begins with
        the last anchor pair until the end of the audio files.

    alpha: float
        Coefficient for the Chroma cost matrix in the finest scale of the MrMsDTW algorithm.
        C = alpha * C_Chroma + (1 - alpha) * C_act (default: 0.5)

    Returns
    -------
    wp : np.ndarray [shape=(2, T)]
        Resulting warping path which indicates synchronized indices.
    """
    if anchor_pairs is None:
        wp = sync_via_mrmsdtw(f_chroma1=f_chroma1,
                              f_chroma2=f_chroma2,
                              f_onset1=f_onset1,
                              f_onset2=f_onset2,
                              input_feature_rate=input_feature_rate,
                              step_sizes=step_sizes,
                              step_weights=step_weights,
                              threshold_rec=threshold_rec,
                              win_len_smooth=win_len_smooth,
                              downsamp_smooth=downsamp_smooth,
                              verbose=verbose,
                              dtw_implementation=dtw_implementation,
                              normalize_chroma=normalize_chroma,
                              chroma_norm_ord=chroma_norm_ord,
                              chroma_norm_threshold=chroma_norm_threshold,
                              visualization_title=visualization_title,
                              alpha=alpha)
    else:
        # constant_intervals = [((0, x1), (0, y1), False),
        #                       ((x1, x2), (y1, y2), True),
        #                       ((x2, -1), (y2, -1), False)]
        wp = None

        if verbose:
            print('Anchor points are given!')

        __check_anchor_pairs(anchor_pairs, f_chroma1.shape[1], f_chroma2.shape[1], input_feature_rate)

        # Add ending as the anchor point
        anchor_pairs.append((-1, -1))

        prev_a1 = 0
        prev_a2 = 0

        for idx, anchor_pair in enumerate(anchor_pairs):
            cur_a1, cur_a2 = anchor_pair

            # Split the features
            f_chroma1_split, f_onset1_split, f_chroma2_split, f_onset2_split = __split_features(f_chroma1,
                                                                                                f_onset1,
                                                                                                f_chroma2,
                                                                                                f_onset2,
                                                                                                cur_a1,
                                                                                                cur_a2,
                                                                                                prev_a1,
                                                                                                prev_a2,
                                                                                                input_feature_rate)

            if idx in linear_inp_idx or idx == len(anchor_pairs) - 1 and -1 in linear_inp_idx:
                # Generate a diagonal warping path, if the algorithm is not supposed to be executed.
                # A typical scenario is the silence breaks which are enclosed by two anchor points.
                if verbose:
                    print('A diagonal warping path is generated for the interval \n\t Feature sequence 1: %.2f - %.2f'
                          '\n\t Feature sequence 2: %.2f - %.2f\n' % (prev_a1, cur_a1, prev_a2, cur_a2))
                wp_cur = __diagonal_warping_path(f_chroma1_split, f_chroma2_split)

            else:
                if verbose:
                    if cur_a1 != -1 and cur_a2 != -1:
                        print('MrMsDTW is applied for the interval \n\t Feature sequence 1: %.2f - %.2f'
                              '\n\t Feature sequence 2: %.2f - %.2f\n' % (prev_a1, cur_a1, prev_a2, cur_a2))
                    else:
                        print('MrMsDTW is applied for the interval \n\t Feature sequence 1: %.2f - end'
                              '\n\t Feature sequence 2: %.2f - end\n' % (prev_a1, prev_a2))
                wp_cur = sync_via_mrmsdtw(f_chroma1=f_chroma1_split,
                                          f_chroma2=f_chroma2_split,
                                          f_onset1=f_onset1_split,
                                          f_onset2=f_onset2_split,
                                          input_feature_rate=input_feature_rate,
                                          step_sizes=step_sizes,
                                          step_weights=step_weights,
                                          threshold_rec=threshold_rec,
                                          win_len_smooth=win_len_smooth,
                                          downsamp_smooth=downsamp_smooth,
                                          verbose=verbose,
                                          dtw_implementation=dtw_implementation,
                                          normalize_chroma=normalize_chroma,
                                          chroma_norm_ord=chroma_norm_ord,
                                          chroma_norm_threshold=chroma_norm_threshold,
                                          alpha=alpha)

            if wp is None:
                wp = np.array(wp_cur, copy=True)

            # Concatenate warping paths
            else:
                wp = np.concatenate([wp, wp_cur + wp[:, -1].reshape(2, 1) + 1], axis=1)

            prev_a1 = cur_a1
            prev_a2 = cur_a2

        anchor_pairs.pop()

    return wp


def sync_via_mrmsdtw(f_chroma1: np.ndarray,
                     f_chroma2: np.ndarray,
                     f_onset1: np.ndarray = None,
                     f_onset2: np.ndarray = None,
                     input_feature_rate: float = 50,
                     step_sizes: np.ndarray = np.array([[1, 0], [0, 1], [1, 1]], np.int32),
                     step_weights: np.ndarray = np.array([1.0, 1.0, 1.0], np.float64),
                     threshold_rec: int = 10000,
                     win_len_smooth: np.ndarray = np.array([201, 101, 21, 1]),
                     downsamp_smooth: np.ndarray = np.array([50, 25, 5, 1]),
                     verbose: bool = False,
                     dtw_implementation: str = 'synctoolbox',
                     normalize_chroma: bool = True,
                     chroma_norm_ord: int = 2,
                     chroma_norm_threshold: float = 0.001,
                     visualization_title: str = "MrMsDTW result",
                     alpha=0.5) -> np.ndarray:
    """Compute memory-restricted multi-scale DTW (MrMsDTW) using chroma and (optionally) onset features.
    MrMsDTW is performed on multiple levels that get progressively finer, with rectangular constraint
    regions defined by the alignment found on the previous, coarser level.
    If onset features are provided, these are used on the finest level in addition to chroma
    to provide higher synchronization accuracy.

    Parameters
    ----------
    f_chroma1 : np.ndarray [shape=(12, N)]
        Chroma feature matrix of the first sequence

    f_chroma2 : np.ndarray [shape=(12, M)]
        Chroma feature matrix of the second sequence

    f_onset1 : np.ndarray [shape=(L, N)]
        Onset feature matrix of the first sequence (optional, default: None)

    f_onset2 : np.ndarray [shape=(L, M)]
        Onset feature matrix of the second sequence (optional, default: None)

    input_feature_rate: int
        Input feature rate of the chroma features (default: 50)

    step_sizes: np.ndarray
        DTW step sizes (default: np.array([[1, 0], [0, 1], [1, 1]]))

    step_weights: np.ndarray
        DTW step weights (np.array([1.0, 1.0, 1.0]))

    threshold_rec: int
        Defines the maximum area that is spanned by the rectangle of two
        consecutive elements in the alignment (default: 10000)

    win_len_smooth : np.ndarray
        Window lengths for chroma feature smoothing (default: np.array([201, 101, 21, 1]))

    downsamp_smooth : np.ndarray
        Downsampling factors (default: np.array([50, 25, 5, 1]))

    verbose : bool
        Set `True` for visualization (default: False)

    dtw_implementation : str
        DTW implementation, librosa or synctoolbox (default: synctoolbox)

    normalize_chroma : bool
        Set `True` to normalize input chroma features after each downsampling
        and smoothing operation.

    chroma_norm_ord: int
        Order of chroma normalization, relevant if ``normalize_chroma`` is True.
        (default: 2)

    chroma_norm_threshold: float
        If the norm falls below threshold for a feature vector, then the
        normalized feature vector is set to be the unit vector. Relevant, if
        ``normalize_chroma`` is True (default: 0.001)

    visualization_title : str
        Title for the visualization plots. Only relevant if 'verbose' is True
        (default: "MrMsDTW result")

    alpha: float
        Coefficient for the Chroma cost matrix in the finest scale of the MrMsDTW algorithm.
        C = alpha * C_Chroma + (1 - alpha) * C_act (default: 0.5)

    Returns
    -------
    alignment: np.ndarray [shape=(2, T)]
        Resulting warping path which indicates synchronized indices.
    """
    # If onset features are given as input, high resolution MrMsDTW is activated.
    high_res = False
    if f_onset1 is not None and f_onset2 is not None:
        high_res = True

    if high_res and (f_chroma1.shape[1] != f_onset1.shape[1] or f_chroma2.shape[1] != f_onset2.shape[1]):
        raise ValueError('Chroma and onset features must be of the same length.')

    if downsamp_smooth[-1] != 1 or win_len_smooth[-1] != 1:
        raise ValueError('The downsampling factor of the last iteration must be equal to 1, i.e.'
                         'at the last iteration, it is computed at the input feature rate!')

    num_iterations = win_len_smooth.shape[0]
    cost_matrix_size_old = tuple()
    feature_rate_old = input_feature_rate / downsamp_smooth[0]
    alignment = None
    total_computation_time = 0.0

    # If the area is less than the threshold_rec, don't apply the multiscale DTW.
    it = (num_iterations - 1) if __compute_area(f_chroma1, f_chroma2) < threshold_rec else 0

    while it < num_iterations:
        tic1 = time.perf_counter()

        # Smooth and downsample given raw features
        f_chroma1_cur, _ = smooth_downsample_feature(f_chroma1,
                                                     input_feature_rate=input_feature_rate,
                                                     win_len_smooth=win_len_smooth[it],
                                                     downsamp_smooth=downsamp_smooth[it])

        f_chroma2_cur, feature_rate_new = smooth_downsample_feature(f_chroma2,
                                                                    input_feature_rate=input_feature_rate,
                                                                    win_len_smooth=win_len_smooth[it],
                                                                    downsamp_smooth=downsamp_smooth[it])

        if normalize_chroma:
            f_chroma1_cur = normalize_feature(f_chroma1_cur,
                                              norm_ord=chroma_norm_ord,
                                              threshold=chroma_norm_threshold)

            f_chroma2_cur = normalize_feature(f_chroma2_cur,
                                              norm_ord=chroma_norm_ord,
                                              threshold=chroma_norm_threshold)

        # Project path onto new resolution
        cost_matrix_size_new = (f_chroma1_cur.shape[1], f_chroma2_cur.shape[1])

        if alignment is None:
            # Initialize the alignment with the start and end frames of the feature sequence
            anchors = np.array([[0, f_chroma1_cur.shape[1] - 1], [0, f_chroma2_cur.shape[1] - 1]])

        else:
            projected_alignment = project_alignment_on_a_new_feature_rate(alignment=alignment,
                                                                          feature_rate_old=feature_rate_old,
                                                                          feature_rate_new=feature_rate_new,
                                                                          cost_matrix_size_old=cost_matrix_size_old,
                                                                          cost_matrix_size_new=cost_matrix_size_new)

            anchors = derive_anchors_from_projected_alignment(projected_alignment=projected_alignment,
                                                              threshold=threshold_rec)

        # Cost matrix and warping path computation
        if high_res and it == num_iterations - 1:
            # Compute cost considering chroma and pitch onset features and alignment only in the last iteration,
            # where the features are at the finest level.
            cost_matrices_step1 = compute_cost_matrices_between_anchors(f_chroma1=f_chroma1_cur,
                                                                        f_chroma2=f_chroma2_cur,
                                                                        f_onset1=f_onset1,
                                                                        f_onset2=f_onset2,
                                                                        anchors=anchors,
                                                                        alpha=alpha)

        else:
            cost_matrices_step1 = compute_cost_matrices_between_anchors(f_chroma1=f_chroma1_cur,
                                                                        f_chroma2=f_chroma2_cur,
                                                                        anchors=anchors,
                                                                        alpha=alpha)

        wp_list = compute_warping_paths_from_cost_matrices(cost_matrices_step1,
                                                           step_sizes=step_sizes,
                                                           step_weights=step_weights,
                                                           implementation=dtw_implementation)

        # Concatenate warping paths
        wp = build_path_from_warping_paths(warping_paths=wp_list,
                                           anchors=anchors)

        anchors_step1 = None
        wp_step1 = None
        num_rows_step1 = 0
        num_cols_step1 = 0
        ax = None

        toc1 = time.perf_counter()
        if verbose and cost_matrices_step1 is not None:
            anchors_step1 = np.array(anchors, copy=True)
            wp_step1 = np.array(wp, copy=True)
            num_rows_step1, num_cols_step1 = np.sum(np.array([dtw_mat.shape for dtw_mat in cost_matrices_step1], int),
                                                    axis=0)
            fig, ax = sync_visualize_step1(cost_matrices_step1,
                                           num_rows_step1,
                                           num_cols_step1,
                                           anchors,
                                           wp)
        tic2 = time.perf_counter()

        # Compute neighboring anchors and refine alignment using local path between neighboring anchors
        anchor_indices_in_warping_path = find_anchor_indices_in_warping_path(wp, anchors=anchors)

        # Compute neighboring anchors for refinement
        neighboring_anchors, neighboring_anchor_indices = \
            derive_neighboring_anchors(wp, anchor_indices=anchor_indices_in_warping_path)

        if neighboring_anchor_indices.shape[0] > 1 \
                and it == num_iterations - 1 and high_res:
            cost_matrices_step2 = compute_cost_matrices_between_anchors(f_chroma1=f_chroma1_cur,
                                                                        f_chroma2=f_chroma2_cur,
                                                                        f_onset1=f_onset1,
                                                                        f_onset2=f_onset2,
                                                                        anchors=neighboring_anchors,
                                                                        alpha=alpha)

        else:
            cost_matrices_step2 = compute_cost_matrices_between_anchors(f_chroma1=f_chroma1_cur,
                                                                        f_chroma2=f_chroma2_cur,
                                                                        anchors=neighboring_anchors,
                                                                        alpha=alpha)

        wp_list_refine = compute_warping_paths_from_cost_matrices(cost_matrices=cost_matrices_step2,
                                                                  step_sizes=step_sizes,
                                                                  step_weights=step_weights,
                                                                  implementation=dtw_implementation)

        wp = __refine_wp(wp, anchors, wp_list_refine, neighboring_anchors, neighboring_anchor_indices)

        toc2 = time.perf_counter()
        computation_time_it = toc2 - tic2 + toc1 - tic1
        total_computation_time += computation_time_it

        alignment = wp
        feature_rate_old = feature_rate_new
        cost_matrix_size_old = cost_matrix_size_new

        if verbose and cost_matrices_step2 is not None:
            sync_visualize_step2(ax,
                                 cost_matrices_step2,
                                 wp,
                                 wp_step1,
                                 num_rows_step1,
                                 num_cols_step1,
                                 anchors_step1,
                                 neighboring_anchors,
                                 plot_title=f"{visualization_title} - Level {it + 1}")
            print('Level {} computation time: {:.2f} seconds'.format(it, computation_time_it))

        it += 1

    if verbose:
        print('Computation time of MrMsDTW: {:.2f} seconds'.format(total_computation_time))

    return alignment


def __diagonal_warping_path(f1: np.ndarray,
                            f2: np.ndarray) -> np.ndarray:
    """Generates a diagonal warping path given two feature sequences.

    Parameters
    ----------
    f1: np.ndarray [shape=(_, N)]
        First feature sequence

    f2: np.ndarray [shape=(_, M)]
        Second feature sequence

    Returns
    -------
    np.ndarray: Diagonal warping path [shape=(2, T)]
    """
    max_size = np.maximum(f1.shape[1], f2.shape[1])
    min_size = np.minimum(f1.shape[1], f2.shape[1])

    if min_size == 1:
        return np.array([max_size - 1, 0]).reshape(-1, 1)

    elif max_size == f1.shape[1]:
        return np.array([np.round(np.linspace(0, max_size - 1, min_size)), np.linspace(0, min_size - 1, min_size)])

    else:
        return np.array([np.linspace(0, min_size-1, min_size), np.round(np.linspace(0, max_size - 1, min_size))])


@jit(nopython=True)
def __compute_area(f1, f2):
    """Computes the area of the cost matrix given two feature sequences

    Parameters
    ----------
    f1: np.ndarray
        First feature sequence

    f2: np.ndarray
        Second feature sequence

    Returns
    -------
    int: Area of the cost matrix
    """
    return f1.shape[1] * f2.shape[1]


def __split_features(f_chroma1: np.ndarray,
                     f_onset1: np.ndarray,
                     f_chroma2: np.ndarray,
                     f_onset2: np.ndarray,
                     cur_a1: float,
                     cur_a2: float,
                     prev_a1: float,
                     prev_a2: float,
                     feature_rate: int) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray, Optional[np.ndarray]]:

    if cur_a1 == -1:
        f_chroma1_split = f_chroma1[:, int(prev_a1 * feature_rate):]
        if f_onset1 is not None:
            f_onset1_split = f_onset1[:, int(prev_a1 * feature_rate):]
        else:
            f_onset1_split = None

    else:
        # Split the features
        f_chroma1_split = f_chroma1[:, int(prev_a1 * feature_rate):int(cur_a1 * feature_rate)]
        if f_onset1 is not None:
            f_onset1_split = f_onset1[:, int(prev_a1 * feature_rate):int(cur_a1 * feature_rate)]
        else:
            f_onset1_split = None

    if cur_a2 == -1:
        f_chroma2_split = f_chroma2[:, int(prev_a2 * feature_rate):]
        if f_onset2 is not None:
            f_onset2_split = f_onset2[:, int(prev_a2 * feature_rate):]
        else:
            f_onset2_split = None

    else:
        f_chroma2_split = f_chroma2[:, int(prev_a2 * feature_rate):int(cur_a2 * feature_rate)]
        if f_onset2 is not None:
            f_onset2_split = f_onset2[:, int(prev_a2 * feature_rate):int(cur_a2 * feature_rate)]
        else:
            f_onset2_split = None

    return f_chroma1_split, f_onset1_split, f_chroma2_split, f_onset2_split


def __refine_wp(wp: np.ndarray,
                anchors: np.ndarray,
                wp_list_refine: List,
                neighboring_anchors: np.ndarray,
                neighboring_anchor_indices: np.ndarray) -> np.ndarray:
    wp_length = wp[:, neighboring_anchor_indices[-1]:].shape[1]
    last_list = wp[:, neighboring_anchor_indices[-1]:] - np.tile(
        wp[:, neighboring_anchor_indices[-1]].reshape(-1, 1), wp_length)
    wp_list_tmp = [wp[:, :neighboring_anchor_indices[0] + 1]] + wp_list_refine + [last_list]
    A_tmp = np.concatenate([anchors[:, 0].reshape(-1, 1), neighboring_anchors, anchors[:, -1].reshape(-1, 1)],
                           axis=1)
    wp_res = build_path_from_warping_paths(warping_paths=wp_list_tmp,
                                           anchors=A_tmp)

    return wp_res


def __check_anchor_pairs(anchor_pairs: List,
                         f_len1: int,
                         f_len2: int,
                         feature_rate: int):
    """Ensures that the anchors satisfy the conditions

    Parameters
    ----------
    anchor_pairs: List[Tuple]
        List of anchor pairs

    f_len1: int
        Length of the first feature sequence

    f_len2: int
        Length of the second feature sequence

    feature_rate: int
        Input feature rate of the features
    """
    prev_a1 = 0
    prev_a2 = 0
    for anchor_pair in anchor_pairs:
        a1, a2 = anchor_pair

        if a1 <= 0 or a2 <= 0:
            raise ValueError('Starting point must be a positive number!')

        if a1 > f_len1 / feature_rate or a2 > f_len2 / feature_rate:
            raise ValueError('Anchor points cannot be greater than the length of the input audio files!')

        if a1 == f_len1 and a2 == f_len2:
            raise ValueError('Both anchor points cannot be equal to the length of the audio files.')

        if a1 == prev_a1 and a2 == prev_a2:
            raise ValueError('Duplicate anchor pairs are not allowed!')

        if a1 < prev_a1 or a2 < prev_a2:
            raise ValueError('Anchor points must be monotonically increasing.')

        prev_a1 = a1
        prev_a2 = a2
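A minimal usage sketch for sync_via_mrmsdtw on synthetic chroma features; the random data and the large threshold_rec are chosen only so the example stays small, whereas real callers would pass chroma (and optionally DLNCO) features extracted from audio at the stated feature rate.

import numpy as np

rng = np.random.default_rng(0)
f_chroma1 = rng.random((12, 300))   # 6 s at 50 Hz feature rate
f_chroma2 = rng.random((12, 360))   # 7.2 s at 50 Hz feature rate

wp = sync_via_mrmsdtw(f_chroma1=f_chroma1,
                      f_chroma2=f_chroma2,
                      input_feature_rate=50,
                      threshold_rec=10 ** 6,  # large enough that only the finest level runs
                      verbose=False)
print(wp.shape)  # (2, T): paired frame indices into sequence 1 and sequence 2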
musc/dtw/utils.py
ADDED
@@ -0,0 +1,377 @@
import numpy as np
from typing import List
from numba import jit
from scipy import signal
from typing import Tuple
from .core import compute_warping_path
from .cost import cosine_distance, compute_high_res_cost_matrix


def compute_warping_paths_from_cost_matrices(cost_matrices: List,
                                             step_sizes: np.array = np.array([[1, 0], [0, 1], [1, 1]], int),
                                             step_weights: np.array = np.array([1.0, 1.0, 1.0], np.float64),
                                             implementation: str = 'synctoolbox') -> List:
    """Computes a path via DTW on each matrix in cost_matrices

    Parameters
    ----------
    cost_matrices : list
        List of cost matrices

    step_sizes : np.ndarray
        DTW step sizes (default: np.array([[1, 0], [0, 1], [1, 1]]))

    step_weights : np.ndarray
        DTW step weights (default: np.array([1.0, 1.0, 1.0]))

    implementation : str
        Choose among 'synctoolbox' and 'librosa' (default: 'synctoolbox')

    Returns
    -------
    wp_list : list
        List of warping paths
    """
    return [compute_warping_path(C=C,
                                 step_sizes=step_sizes,
                                 step_weights=step_weights,
                                 implementation=implementation)[2] for C in cost_matrices]


def compute_cost_matrices_between_anchors(f_chroma1: np.ndarray,
                                          f_chroma2: np.ndarray,
                                          anchors: np.ndarray,
                                          f_onset1: np.ndarray = None,
                                          f_onset2: np.ndarray = None,
                                          alpha: float = 0.5) -> List:
    """Computes cost matrices for the given features between subsequent
    pairs of anchor points.

    Parameters
    ----------
    f_chroma1 : np.ndarray [shape=(12, N)]
        Chroma feature matrix of the first sequence

    f_chroma2 : np.ndarray [shape=(12, M)]
        Chroma feature matrix of the second sequence

    anchors : np.ndarray [shape=(2, R)]
        Anchor sequence

    f_onset1 : np.ndarray [shape=(L, N)]
        Onset feature matrix of the first sequence

    f_onset2 : np.ndarray [shape=(L, M)]
        Onset feature matrix of the second sequence

    alpha: float
        Alpha parameter to weight the cost functions.

    Returns
    -------
    cost_matrices: list
        List containing cost matrices
    """
    high_res = False
    if f_onset1 is not None and f_onset2 is not None:
        high_res = True

    cost_matrices = list()
    for k in range(anchors.shape[1] - 1):
        a1 = np.array(anchors[:, k].astype(int), copy=True)
        a2 = np.array(anchors[:, k + 1].astype(int), copy=True)

        if high_res:
            cost_matrices.append(compute_high_res_cost_matrix(f_chroma1[:, a1[0]: a2[0] + 1],
                                                              f_chroma2[:, a1[1]: a2[1] + 1],
                                                              f_onset1[:, a1[0]: a2[0] + 1],
                                                              f_onset2[:, a1[1]: a2[1] + 1],
                                                              weights=np.array([alpha, 1-alpha])))
        else:
            cost_matrices.append(cosine_distance(f_chroma1[:, a1[0]: a2[0] + 1],
                                                 f_chroma2[:, a1[1]: a2[1] + 1]))
    return cost_matrices


def build_path_from_warping_paths(warping_paths: List,
                                  anchors: np.ndarray = None) -> np.ndarray:
    """The function builds a path from a given list of warping paths
    and the anchors used to obtain these paths. The indices of the original
    warping paths are adapted such that they cross the anchors.

    Parameters
    ----------
    warping_paths : list
        List of warping paths

    anchors : np.ndarray [shape=(2, N)]
        Anchor sequence

    Returns
    -------
    path : np.ndarray [shape=(2, M)]
        Merged path
    """

    if anchors is None:
        # When no anchor points are given, we can construct them from the
        # subpaths in the wp_list

        # To do this, we assume that the first path's element is the starting
        # anchor
        anchors = warping_paths[0][:, 0]

        # Retrieve the last element of each path
        anchors_tmp = np.zeros(len(warping_paths), np.float32)
        for idx, x in enumerate(warping_paths):
            anchors_tmp[idx] = x[:, -1]

        # Correct indices, such that the indices of the anchors are given on a
        # common path. Each anchor a_l = [Nnew_[l+1];Mnew_[l+1]]
        # Nnew_[l+1] = N_l + N_[l+1] -1
        # Mnew_[l+1] = M_l + M_[l+1] -1

        anchors_tmp = np.cumsum(anchors_tmp, axis=1)
        anchors_tmp[:, 1:] = anchors_tmp[:, 1:] - [np.arange(1, anchors_tmp.shape[1]),
                                                   np.arange(1, anchors_tmp.shape[1])]

        anchors = np.concatenate([anchors, anchors_tmp], axis=1)

    L = len(warping_paths) + 1
    path = None
    wp = None

    for anchor_idx in range(1, L):
        anchor1 = anchors[:, anchor_idx - 1]
        anchor2 = anchors[:, anchor_idx]

        wp = np.array(warping_paths[anchor_idx - 1], copy=True)

        # correct indices in warpingPath
        wp += np.repeat(anchor1.reshape(-1, 1), wp.shape[1], axis=1).astype(wp.dtype)

        # consistency checks
        assert np.array_equal(wp[:, 0], anchor1), 'First entry of warping path does not coincide with anchor point'
        assert np.array_equal(wp[:, -1], anchor2), 'Last entry of warping path does not coincide with anchor point'

        if path is None:
            path = np.array(wp[:, :-1], copy=True)
        else:
            path = np.concatenate([path, wp[:, :-1]], axis=1)

    # append last index of warping path
    path = np.concatenate([path, wp[:, -1].reshape(-1, 1)], axis=1)

    return path


def find_anchor_indices_in_warping_path(warping_path: np.ndarray,
                                        anchors: np.ndarray) -> np.ndarray:
    """Compute the indices in the warping path that correspond
    to the elements in 'anchors'

    Parameters
    ----------
    warping_path : np.ndarray [shape=(2, N)]
        Warping path

    anchors : np.ndarray [shape=(2, M)]
        Anchor sequence

    Returns
    -------
    indices : np.ndarray [shape=(2, M)]
        Anchor indices in the ``warping_path``
    """
    indices = np.zeros(anchors.shape[1])

    for k in range(anchors.shape[1]):
        a = anchors[:, k]
        indices[k] = np.where((a[0] == warping_path[0, :]) & (a[1] == warping_path[1, :]))[0]

    return indices


def make_path_strictly_monotonic(P: np.ndarray) -> np.ndarray:
    """Compute strict alignment path from a warping path

    Wrapper around "compute_strict_alignment_path_mask" from libfmp.

    Parameters
    ----------
    P: np.ndarray [shape=(2, N)]
        Warping path

    Returns
    -------
    P_mod: np.ndarray [shape=(2, M)]
        Strict alignment path, M <= N
    """
    P_mod = compute_strict_alignment_path_mask(P.T)

    return P_mod.T

def compute_strict_alignment_path_mask(P):
    """Compute strict alignment path from a warping path

    Notebook: C3/C3S3_MusicAppTempoCurve.ipynb

    Args:
        P (list or np.ndarray): Warping path

    Returns:
        P_mod (list or np.ndarray): Strict alignment path
    """
    P = np.array(P, copy=True)
    N, M = P[-1]
    # Get indices for strict monotonicity
    keep_mask = (P[1:, 0] > P[:-1, 0]) & (P[1:, 1] > P[:-1, 1])
    # Add first index to enforce start boundary condition
    keep_mask = np.concatenate(([True], keep_mask))
    # Remove all indices of the last row or column
    keep_mask[(P[:, 0] == N) | (P[:, 1] == M)] = False
    # Add last index to enforce end boundary condition
    keep_mask[-1] = True
    P_mod = P[keep_mask, :]

    return P_mod


def evaluate_synchronized_positions(ground_truth_positions: np.ndarray,
                                    synchronized_positions: np.ndarray,
                                    tolerances: List = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 250]):
    """Compute standard evaluation measures for evaluating the quality of synchronized (musical) positions.

    When synchronizing two versions of a piece of music, one can evaluate the quality of the resulting alignment
    by comparing errors at musical positions (e.g. beats or measures) that appear in both versions.
    This function implements two measures: mean absolute error at positions and the percentage of correctly transferred
    measures given a threshold.

    Parameters
    ----------
    ground_truth_positions: np.ndarray [shape=N]
        Positions (e.g. beat or measure positions) annotated in the target version of a piece of music, in milliseconds.

    synchronized_positions: np.ndarray [shape=N]
        The same musical positions as in 'ground_truth_positions' obtained by transfer using music synchronization,
        in milliseconds.

    tolerances: list of integers
        Tolerances (in milliseconds) used for comparing annotated and synchronized positions.

    Returns
    -------
    mean_absolute_error: float
        Mean absolute error for synchronized positions, in milliseconds.

    accuracy_at_tolerances: list of floats
        Percentages of correctly transferred measures, for each entry in 'tolerances'.

    """
    absolute_errors_at_positions = np.abs(synchronized_positions - ground_truth_positions)

    print('Measure transfer from recording 1 to 2 yielded:')
    mean_absolute_error = np.mean(absolute_errors_at_positions)
    print('\nMean absolute error (MAE): %.2fms (standard deviation: %.2fms)' % (mean_absolute_error,
                                                                                np.std(absolute_errors_at_positions)))
    print('\nAccuracy of transferred positions at different tolerances:')
    print('\t\t\tAccuracy')
    print('################################')
    accuracy_at_tolerances = []
    for tolerance in tolerances:
        accuracy = np.mean((absolute_errors_at_positions < tolerance)) * 100.0
        accuracy_at_tolerances.append(accuracy)
        print('Tolerance: {} ms \t{:.2f} %'.format(tolerance, accuracy))
|
285 |
+
|
286 |
+
return mean_absolute_error, accuracy_at_tolerances
|
287 |
+
|
288 |
+
|
289 |
+
def smooth_downsample_feature(f_feature: np.ndarray,
|
290 |
+
input_feature_rate: float,
|
291 |
+
win_len_smooth: int = 0,
|
292 |
+
downsamp_smooth: int = 1) -> Tuple[np.ndarray, float]:
|
293 |
+
"""Temporal smoothing and downsampling of a feature sequence
|
294 |
+
|
295 |
+
Parameters
|
296 |
+
----------
|
297 |
+
f_feature : np.ndarray
|
298 |
+
Input feature sequence, size dxN
|
299 |
+
|
300 |
+
input_feature_rate : float
|
301 |
+
Input feature rate in Hz
|
302 |
+
|
303 |
+
win_len_smooth : int
|
304 |
+
Smoothing window length. For 0, no smoothing is applied.
|
305 |
+
|
306 |
+
downsamp_smooth : int
|
307 |
+
Downsampling factor. For 1, no downsampling is applied.
|
308 |
+
|
309 |
+
Returns
|
310 |
+
-------
|
311 |
+
f_feature_stat : np.ndarray
|
312 |
+
Downsampled & smoothed feature.
|
313 |
+
|
314 |
+
new_feature_rate : float
|
315 |
+
New feature rate after downsampling
|
316 |
+
"""
|
317 |
+
if win_len_smooth != 0 or downsamp_smooth != 1:
|
318 |
+
# hack to get the same results as on MATLAB
|
319 |
+
stat_window = np.hanning(win_len_smooth+2)[1:-1]
|
320 |
+
stat_window /= np.sum(stat_window)
|
321 |
+
|
322 |
+
# upfirdn filters and downsamples each column of f_stat_help
|
323 |
+
f_feature_stat = signal.upfirdn(h=stat_window, x=f_feature, up=1, down=downsamp_smooth)
|
324 |
+
seg_num = f_feature.shape[1]
|
325 |
+
stat_num = int(np.ceil(seg_num / downsamp_smooth))
|
326 |
+
cut = int(np.floor((win_len_smooth - 1) / (2 * downsamp_smooth)))
|
327 |
+
f_feature_stat = f_feature_stat[:, cut: stat_num + cut]
|
328 |
+
else:
|
329 |
+
f_feature_stat = f_feature
|
330 |
+
|
331 |
+
new_feature_rate = input_feature_rate / downsamp_smooth
|
332 |
+
|
333 |
+
return f_feature_stat, new_feature_rate
|
334 |
+
|
335 |
+
|
336 |
+
@jit(nopython=True)
|
337 |
+
def normalize_feature(feature: np.ndarray,
|
338 |
+
norm_ord: int,
|
339 |
+
threshold: float) -> np.ndarray:
|
340 |
+
"""Normalizes a feature sequence according to the l^norm_ord norm.
|
341 |
+
|
342 |
+
Parameters
|
343 |
+
----------
|
344 |
+
feature : np.ndarray
|
345 |
+
Input feature sequence of size d x N
|
346 |
+
d: dimensionality of feature vectors
|
347 |
+
N: number of feature vectors (time in frames)
|
348 |
+
|
349 |
+
norm_ord : int
|
350 |
+
Norm degree
|
351 |
+
|
352 |
+
threshold : float
|
353 |
+
If the norm falls below threshold for a feature vector, then the
|
354 |
+
normalized feature vector is set to be the normalized unit vector.
|
355 |
+
|
356 |
+
Returns
|
357 |
+
-------
|
358 |
+
f_normalized : np.ndarray
|
359 |
+
Normalized feature sequence
|
360 |
+
"""
|
361 |
+
# TODO rewrite in vectorized fashion
|
362 |
+
d, N = feature.shape
|
363 |
+
f_normalized = np.zeros((d, N))
|
364 |
+
|
365 |
+
# normalize the vectors according to the l^norm_ord norm
|
366 |
+
unit_vec = np.ones(d)
|
367 |
+
unit_vec = unit_vec / np.linalg.norm(unit_vec, norm_ord)
|
368 |
+
|
369 |
+
for k in range(N):
|
370 |
+
cur_norm = np.linalg.norm(feature[:, k], norm_ord)
|
371 |
+
|
372 |
+
if cur_norm < threshold:
|
373 |
+
f_normalized[:, k] = unit_vec
|
374 |
+
else:
|
375 |
+
f_normalized[:, k] = feature[:, k] / cur_norm
|
376 |
+
|
377 |
+
return f_normalized
|
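A quick usage sketch for the two public helpers above (make_path_strictly_monotonic and evaluate_synchronized_positions); the warping path and the millisecond positions are made-up toy values, not data from this repository:

import numpy as np
from musc.dtw.utils import make_path_strictly_monotonic, evaluate_synchronized_positions

# a 2 x N warping path with repeated indices (rows: version 1 / version 2 frames)
wp = np.array([[0, 1, 1, 2, 3],
               [0, 0, 1, 2, 3]])
strict_wp = make_path_strictly_monotonic(wp)  # keeps only strictly increasing steps

# annotated vs. synchronized measure positions, in milliseconds
ground_truth = np.array([0, 1000, 2000, 3000])
synchronized = np.array([12, 980, 2055, 2990])
mae, accuracies = evaluate_synchronized_positions(ground_truth, synchronized, tolerances=[20, 50, 100])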
musc/dtw/visualization.py
ADDED
@@ -0,0 +1,216 @@
import matplotlib
import matplotlib.cm
import matplotlib.patches
import matplotlib.pyplot as plt
import numpy as np
from typing import Tuple, List


def sync_visualize_step1(cost_matrices: List,
                         num_rows: int,
                         num_cols: int,
                         anchors: np.ndarray,
                         wp: np.ndarray) -> Tuple[plt.Figure, plt.Axes]:

    fig, ax = plt.subplots(1, 1, dpi=72)
    ax = __visualize_cost_matrices(ax, cost_matrices)
    __visualize_constraint_rectangles(anchors[[1, 0], :],
                                      edgecolor='firebrick')

    __visualize_path_in_matrix(ax=ax,
                               wp=wp,
                               axisX=np.arange(0, num_rows),
                               axisY=np.arange(0, num_cols),
                               path_color='firebrick')

    return fig, ax


def sync_visualize_step2(ax: plt.Axes,
                         cost_matrices: list,
                         wp_step2: np.ndarray,
                         wp_step1: np.ndarray,
                         num_rows_step1: int,
                         num_cols_step1: int,
                         anchors_step1: np.ndarray,
                         neighboring_anchors: np.ndarray,
                         plot_title: str = ""):

    offset_x = neighboring_anchors[0, 0] - 1
    offset_y = neighboring_anchors[1, 0] - 1
    ax = __visualize_cost_matrices(ax=ax,
                                   cost_matrices=cost_matrices,
                                   offset_x=offset_x,
                                   offset_y=offset_y)

    __visualize_constraint_rectangles(anchors_step1[[1, 0], :],
                                      edgecolor='firebrick')

    __visualize_path_in_matrix(ax=ax,
                               wp=wp_step1,
                               axisX=np.arange(0, num_rows_step1),
                               axisY=np.arange(0, num_cols_step1),
                               path_color='firebrick')

    __visualize_constraint_rectangles(neighboring_anchors[[1, 0], :] - 1,
                                      edgecolor='orangered',
                                      linestyle='--')

    __visualize_path_in_matrix(ax=ax,
                               wp=wp_step2,
                               axisX=np.arange(0, num_rows_step1),
                               axisY=np.arange(0, num_cols_step1),
                               path_color='orangered')

    ax.set_title(plot_title)
    ax.set_ylabel("Version 1 (frames)")
    ax.set_xlabel("Version 2 (frames)")

    ax = plt.gca()  # get the current axes
    pcm = None
    for pcm in ax.get_children():
        if isinstance(pcm, matplotlib.cm.ScalarMappable):
            break
    plt.colorbar(pcm, ax=ax)
    plt.tight_layout()
    plt.show()


def __size_dtw_matrices(dtw_matrices: List) -> Tuple[List[np.ndarray], List[np.ndarray]]:
    """Gives information about the dimensionality of a DTW matrix
    given in form of a list of submatrices

    Parameters
    ----------
    dtw_matrices: list
        The DTW matrix (cost matrix or accumulated cost matrix) given in form of a list.

    Returns
    -------
    axis_x_list: list
        A list containing a horizontal axis for each of the sub matrices
        which specifies the horizontal position of the respective submatrix
        in the overall cost matrix.

    axis_y_list: list
        A list containing a vertical axis for each of the
        sub matrices which specifies the vertical position of the
        respective submatrix in the overall cost matrix.
    """
    num_matrices = len(dtw_matrices)
    size_list = [dtw_mat.shape for dtw_mat in dtw_matrices]

    axis_x_list = list()
    axis_y_list = list()

    x_acc = 0
    y_acc = 0

    for i in range(num_matrices):
        curr_size_list = size_list[i]
        axis_x_list.append(np.arange(x_acc, x_acc + curr_size_list[0]))
        axis_y_list.append(np.arange(y_acc, y_acc + curr_size_list[1]))
        x_acc += curr_size_list[0] - 1
        y_acc += curr_size_list[1] - 1

    return axis_x_list, axis_y_list


def __visualize_cost_matrices(ax: plt.Axes,
                              cost_matrices: list = None,
                              offset_x: float = 0.0,
                              offset_y: float = 0.0) -> plt.Axes:
    """Visualizes cost matrices

    Parameters
    ----------
    ax : axes
        The Axes instance to plot on

    cost_matrices : list
        List of DTW cost matrices.

    offset_x : float
        Offset on the x axis.

    offset_y : float
        Offset on the y axis.

    Returns
    -------
    ax: axes
        The Axes instance to plot on
    """
    x_ax, y_ax = __size_dtw_matrices(dtw_matrices=cost_matrices)

    for i, cur_cost in enumerate(cost_matrices[::-1]):
        curr_x_ax = x_ax[i] + offset_x
        curr_y_ax = y_ax[i] + offset_y
        cur_cost = cost_matrices[i]
        ax.imshow(cur_cost, cmap='gray_r', aspect='auto', origin='lower',
                  extent=[curr_y_ax[0], curr_y_ax[-1], curr_x_ax[0], curr_x_ax[-1]])

    return ax


def __visualize_path_in_matrix(ax,
                               wp: np.ndarray = None,
                               axisX: np.ndarray = None,
                               axisY: np.ndarray = None,
                               path_color: str = 'r'):
    """Plots a warping path on top of a given matrix. The matrix is
    usually an accumulated cost matrix.

    Parameters
    ----------
    ax : axes
        The Axes instance to plot on

    wp : np.ndarray
        Warping path

    axisX : np.ndarray
        Array of X axis

    axisY : np.ndarray
        Array of Y axis

    path_color : str
        Color of the warping path to be plotted. (default: r)
    """
    assert axisX is not None and isinstance(axisX, np.ndarray), 'axisX must be a numpy array!'
    assert axisY is not None and isinstance(axisY, np.ndarray), 'axisY must be a numpy array!'

    wp = wp.astype(int)

    ax.plot(axisY[wp[1, :]], axisX[wp[0, :]], '-k', linewidth=5)
    ax.plot(axisY[wp[1, :]], axisX[wp[0, :]], color=path_color, linewidth=3)


def __visualize_constraint_rectangles(anchors: np.ndarray,
                                      linestyle: str = '-',
                                      edgecolor: str = 'royalblue',
                                      linewidth: float = 1.0):

    for k in range(anchors.shape[1] - 1):
        a1 = anchors[:, k]
        a2 = anchors[:, k + 1]

        # a rectangle is defined by [x y width height]
        x = a1[0]
        y = a1[1]
        w = a2[0] - a1[0] + np.finfo(float).eps
        h = a2[1] - a1[1] + np.finfo(float).eps

        rect = matplotlib.patches.Rectangle((x, y), w, h,
                                            linewidth=linewidth,
                                            edgecolor=edgecolor,
                                            linestyle=linestyle,
                                            facecolor='none')

        plt.gca().add_patch(rect)
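For orientation, a small self-contained sketch of sync_visualize_step1 with random toy cost matrices; the anchors and the dummy warping path below are illustrative values, not outputs of the DTW code in this repository:

import numpy as np
from musc.dtw.visualization import sync_visualize_step1

cost_a = np.random.rand(40, 50)    # first sub cost matrix (version 1 frames x version 2 frames)
cost_b = np.random.rand(31, 36)    # second sub cost matrix
anchors = np.array([[0, 39, 69],   # anchor points, version 1 frame indices
                    [0, 49, 84]])  # anchor points, version 2 frame indices
# dummy path through the combined 70 x 85 cost matrix
wp = np.array([np.arange(70), np.linspace(0, 84, 70).astype(int)])
fig, ax = sync_visualize_step1([cost_a, cost_b], num_rows=70, num_cols=85, anchors=anchors, wp=wp)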
musc/model.py
ADDED
@@ -0,0 +1,220 @@
from .pathway import TinyPathway
from .synchronizer import Synchronizer
from .representations import PerformanceLabel
from torchaudio.models.conformer import ConformerLayer
import torch
from torch import nn
import numpy as np


class FourHeads(Synchronizer):

    def __init__(
            self,
            pathway_multiscale: int = 32,
            num_pathway_layers: int = 2,
            chunk_size: int = 256,
            hop_length: int = 256,
            encoder_dim: int = 256,
            sr: int = 44100,
            num_heads: int = 4,
            ffn_dim: int = 128,
            num_separator_layers: int = 16,
            num_representation_layers: int = 4,
            depthwise_conv_kernel_size: int = 31,
            dropout: float = 0.25,
            use_group_norm: bool = False,
            convolution_first: bool = False,
            labeling=PerformanceLabel(),
            wiring='tiktok'
    ):
        super().__init__(labeling, sr=sr, hop_length=hop_length)
        self.main = TinyPathway(dilation=1, hop=hop_length, localize=True,
                                n_layers=num_pathway_layers, chunk_size=chunk_size)
        self.attendant = TinyPathway(dilation=pathway_multiscale, hop=hop_length, localize=False,
                                     n_layers=num_pathway_layers, chunk_size=chunk_size)
        assert self.main.hop == self.attendant.hop  # they should output with the same sample rate
        print('hop in samples:', self.main.hop)
        self.input_window = self.attendant.input_window

        self.encoder_dim = encoder_dim
        self.dropout = nn.Dropout(dropout)

        # merge two streams into a conformer input
        self.stream_merger = nn.Sequential(self.dropout,
                                           nn.Linear(self.main.out_dim + self.attendant.out_dim, self.encoder_dim))

        print('main stream window:', self.main.input_window,
              ', attendant stream window:', self.attendant.input_window,
              ', conformer input dim:', self.encoder_dim)

        center = ((chunk_size - 1) * self.main.hop)  # region labeled with pitch track
        main_overlap = self.main.input_window - center
        main_overlap = [int(np.floor(main_overlap / 2)), int(np.ceil(main_overlap / 2))]
        attendant_overlap = self.attendant.input_window - center
        attendant_overlap = [int(np.floor(attendant_overlap / 2)), int(np.ceil(attendant_overlap / 2))]
        print('main frame overlap:', main_overlap, ', attendant frame overlap:', attendant_overlap)
        main_crop_relative = [attendant_overlap[0] - main_overlap[0], main_overlap[1] - attendant_overlap[1]]
        print('crop for main pathway', main_crop_relative)
        print("Total sequence duration is", self.attendant.input_window, 'samples')
        print('Main stream receptive field for one frame is', (self.main.input_window - center), 'samples')
        print('Attendant stream receptive field for one frame is', (self.attendant.input_window - center), 'samples')
        self.frame_overlap = attendant_overlap

        self.main_stream_crop = main_crop_relative
        self.max_window_size = self.attendant.input_window
        self.chunk_size = chunk_size

        self.separator_stream = nn.ModuleList(  # source-separation, reinvented
            [
                ConformerLayer(
                    input_dim=self.encoder_dim,
                    ffn_dim=ffn_dim,
                    num_attention_heads=num_heads,
                    depthwise_conv_kernel_size=depthwise_conv_kernel_size,
                    dropout=dropout,
                    use_group_norm=use_group_norm,
                    convolution_first=convolution_first,
                )
                for _ in range(num_separator_layers)
            ]
        )

        self.f0_stream = nn.ModuleList(
            [
                ConformerLayer(
                    input_dim=self.encoder_dim,
                    ffn_dim=ffn_dim,
                    num_attention_heads=num_heads,
                    depthwise_conv_kernel_size=depthwise_conv_kernel_size,
                    dropout=dropout,
                    use_group_norm=use_group_norm,
                    convolution_first=convolution_first,
                )
                for _ in range(num_representation_layers)
            ]
        )
        self.f0_head = nn.Linear(self.encoder_dim, len(self.labeling.f0_centers_c))

        self.note_stream = nn.ModuleList(
            [
                ConformerLayer(
                    input_dim=self.encoder_dim,
                    ffn_dim=ffn_dim,
                    num_attention_heads=num_heads,
                    depthwise_conv_kernel_size=depthwise_conv_kernel_size,
                    dropout=dropout,
                    use_group_norm=use_group_norm,
                    convolution_first=convolution_first,
                )
                for _ in range(num_representation_layers)
            ]
        )
        self.note_head = nn.Linear(self.encoder_dim, len(self.labeling.midi_centers))

        self.onset_stream = nn.ModuleList(
            [
                ConformerLayer(
                    input_dim=self.encoder_dim,
                    ffn_dim=ffn_dim,
                    num_attention_heads=num_heads,
                    depthwise_conv_kernel_size=depthwise_conv_kernel_size,
                    dropout=dropout,
                    use_group_norm=use_group_norm,
                    convolution_first=convolution_first,
                )
                for _ in range(num_representation_layers)
            ]
        )
        self.onset_head = nn.Linear(self.encoder_dim, len(self.labeling.midi_centers))

        self.offset_stream = nn.ModuleList(
            [
                ConformerLayer(
                    input_dim=self.encoder_dim,
                    ffn_dim=ffn_dim,
                    num_attention_heads=num_heads,
                    depthwise_conv_kernel_size=depthwise_conv_kernel_size,
                    dropout=dropout,
                    use_group_norm=use_group_norm,
                    convolution_first=convolution_first,
                )
                for _ in range(num_representation_layers)
            ]
        )
        self.offset_head = nn.Linear(self.encoder_dim, len(self.labeling.midi_centers))

        self.labeling = labeling
        self.double_merger = nn.Sequential(self.dropout, nn.Linear(2 * self.encoder_dim, self.encoder_dim))
        self.triple_merger = nn.Sequential(self.dropout, nn.Linear(3 * self.encoder_dim, self.encoder_dim))
        self.wiring = wiring

        print('Total parameter count: ', self.count_parameters())

    def count_parameters(self) -> int:
        """ Count parameters of encoder """
        return sum([p.numel() for p in self.parameters()])

    def stream(self, x, representation, key_padding_mask=None):
        for i, layer in enumerate(self.__getattr__('{}_stream'.format(representation))):
            x = layer(x, key_padding_mask)
        return x

    def head(self, x, representation):
        return self.__getattr__('{}_head'.format(representation))(x)

    def forward(self, x, key_padding_mask=None):

        # two auditory streams followed by the separator stream to ensure timbre-awareness
        x_attendant = self.attendant(x)
        x_main = self.main(x[:, self.main_stream_crop[0]:self.main_stream_crop[1]])
        x = self.stream_merger(torch.cat((x_attendant, x_main), -1).squeeze(1))
        x = self.stream(x, 'separator', key_padding_mask)

        f0 = self.stream(x, 'f0', key_padding_mask)  # they say this is a low level feature :)

        if self.wiring == 'parallel':
            note = self.stream(x, 'note', key_padding_mask)
            onset = self.stream(x, 'onset', key_padding_mask)
            offset = self.stream(x, 'offset', key_padding_mask)

        elif self.wiring == 'tiktok':
            onset = self.stream(x, 'onset', key_padding_mask)
            offset = self.stream(x, 'offset', key_padding_mask)
            # f0 is disconnected; note relies on separator, onset, and offset
            note = self.stream(self.triple_merger(torch.cat((x, onset, offset), -1)), 'note', key_padding_mask)

        elif self.wiring == 'tiktok2':
            onset = self.stream(x, 'onset', key_padding_mask)
            offset = self.stream(x, 'offset', key_padding_mask)
            # note is connected to f0, onset, and offset
            note = self.stream(self.triple_merger(torch.cat((f0, onset, offset), -1)), 'note', key_padding_mask)

        elif self.wiring == 'spotify':
            # note is connected to f0 only
            note = self.stream(f0, 'note', key_padding_mask)
            # here onsets and offsets are higher-level features informed by the separator and note
            onset = self.stream(self.double_merger(torch.cat((x, note), -1)), 'onset', key_padding_mask)
            offset = self.stream(self.double_merger(torch.cat((x, note), -1)), 'offset', key_padding_mask)

        else:
            # onset and offset are connected to f0 and separator streams
            onset = self.stream(self.double_merger(torch.cat((x, f0), -1)), 'onset', key_padding_mask)
            offset = self.stream(self.double_merger(torch.cat((x, f0), -1)), 'offset', key_padding_mask)
            # note is connected to f0, onset, and offset streams
            note = self.stream(self.triple_merger(torch.cat((f0, onset, offset), -1)), 'note', key_padding_mask)

        return {'f0': self.head(f0, 'f0'),
                'note': self.head(note, 'note'),
                'onset': self.head(onset, 'onset'),
                'offset': self.head(offset, 'offset')}


class PretrainedModel(FourHeads):
    def __init__(self, model_json: dict, model: str):
        super().__init__(pathway_multiscale=model_json['pathway_multiscale'],
                         num_pathway_layers=model_json['num_pathway_layers'],
                         wiring=model_json['wiring'],
                         hop_length=model_json['hop_length'],
                         chunk_size=model_json['chunk_size'],
                         labeling=PerformanceLabel(note_min=model_json['note_low'],
                                                   note_max=model_json['note_high'],
                                                   f0_bins_per_semitone=model_json['f0_bins_per_semitone'],
                                                   f0_tolerance_c=200,
                                                   f0_smooth_std_c=model_json['f0_smooth_std_c'],
                                                   onset_smooth_std=model_json['onset_smooth_std']),
                         sr=model_json['sampling_rate'])
        self.load_state_dict(torch.load(model, map_location=torch.device('cpu'), weights_only=True))
        self.eval()
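To make the stream wiring above concrete, here is a rough sketch of a forward pass using the constructor defaults and the 'spotify' wiring; the dummy input length comes from the window size that TinyPathway computes, and nothing in this snippet is part of the uploaded files:

import torch
from musc.model import FourHeads

model = FourHeads(wiring='spotify')            # builds both pathways and the conformer streams
model.eval()
dummy = torch.randn(1, model.max_window_size)  # one chunk of raw audio samples
with torch.no_grad():
    out = model(dummy)
print({k: tuple(v.shape) for k, v in out.items()})  # per-frame f0 / note / onset / offset logits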
musc/pathway.py
ADDED
@@ -0,0 +1,114 @@
import numpy as np
import torch.nn as nn


class ConvBlock(nn.Module):
    def __init__(self, f, w, s, d, in_channels):
        super().__init__()
        p1 = d * (w - 1) // 2
        p2 = d * (w - 1) - p1
        self.pad = nn.ZeroPad2d((0, 0, p1, p2))

        self.conv2d = nn.Conv2d(in_channels=in_channels, out_channels=f, kernel_size=(w, 1), stride=(s, 1), dilation=(d, 1))
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm2d(f)
        self.pool = nn.MaxPool2d(kernel_size=(2, 1))
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        x = self.pad(x)
        x = self.conv2d(x)
        x = self.relu(x)
        x = self.bn(x)
        x = self.pool(x)
        x = self.dropout(x)
        return x


class NoPadConvBlock(nn.Module):
    def __init__(self, f, w, s, d, in_channels):
        super().__init__()

        self.conv2d = nn.Conv2d(in_channels=in_channels, out_channels=f, kernel_size=(w, 1), stride=(s, 1),
                                dilation=(d, 1))
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm2d(f)
        self.pool = nn.MaxPool2d(kernel_size=(2, 1))
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        x = self.conv2d(x)
        x = self.relu(x)
        x = self.bn(x)
        x = self.pool(x)
        x = self.dropout(x)
        return x


class TinyPathway(nn.Module):
    def __init__(self, dilation=1, hop=256, localize=False,
                 model_capacity="full", n_layers=6, chunk_size=256):
        super().__init__()

        capacity_multiplier = {
            'tiny': 4, 'small': 8, 'medium': 16, 'large': 24, 'full': 32
        }[model_capacity]
        self.layers = [1, 2, 3, 4, 5, 6]
        self.layers = self.layers[:n_layers]
        filters = [n * capacity_multiplier for n in [32, 8, 8, 8, 8, 8]]
        filters = [1] + filters
        widths = [512, 64, 64, 64, 32, 32]
        strides = self.deter_dilations(hop // (4 * (2 ** n_layers)), localize=localize)
        strides[0] = strides[0] * 4  # apply 4 times more stride at the first layer
        dilations = self.deter_dilations(dilation)

        for i in range(len(self.layers)):
            f, w, s, d, in_channel = filters[i + 1], widths[i], strides[i], dilations[i], filters[i]
            self.add_module("conv%d" % i, NoPadConvBlock(f, w, s, d, in_channel))
        self.chunk_size = chunk_size
        self.input_window, self.hop = self.find_input_size_for_pathway()
        self.out_dim = filters[n_layers]

    def find_input_size_for_pathway(self):
        def find_input_size(output_size, kernel_size, stride, dilation, padding):
            num = (stride * (output_size - 1)) + 1
            input_size = num - 2 * padding + dilation * (kernel_size - 1)
            return input_size
        conv_calc, n = {}, 0
        for i in self.layers:
            layer = self.__getattr__("conv%d" % (i - 1))
            for mm in layer.modules():
                if hasattr(mm, 'kernel_size'):
                    try:
                        d = mm.dilation[0]
                    except TypeError:
                        d = mm.dilation
                    conv_calc[n] = [mm.kernel_size[0], mm.stride[0], 0, d]
                    n += 1
        out = self.chunk_size
        hop = 1
        for n in sorted(conv_calc.keys())[::-1]:
            kernel_size_n, stride_n, padding_n, dilation_n = conv_calc[n]
            out = find_input_size(out, kernel_size_n, stride_n, dilation_n, padding_n)
            hop = hop * stride_n
        return out, hop

    def deter_dilations(self, total_dilation, localize=False):
        n_layers = len(self.layers)
        if localize:  # e.g., 32*1023 window and 3 layers -> [1, 1, 32]
            a = [total_dilation] + [1 for _ in range(n_layers - 1)]
        else:  # e.g., 32*1023 window and 3 layers -> [4, 4, 2]
            total_dilation = int(np.log2(total_dilation))
            a = []
            for layer in range(n_layers):
                this_dilation = int(np.ceil(total_dilation / (n_layers - layer)))
                a.append(2 ** this_dilation)
                total_dilation = total_dilation - this_dilation
        return a[::-1]

    def forward(self, x):
        x = x.view(x.shape[0], 1, -1, 1)
        for i in range(len(self.layers)):
            x = self.__getattr__("conv%d" % i)(x)
        x = x.permute(0, 3, 2, 1)
        return x
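A short sketch contrasting the two TinyPathway configurations that FourHeads builds (a localized pathway with dilation 1 and a dilated attendant pathway); the printed window and hop sizes are whatever find_input_size_for_pathway computes for these settings:

from musc.pathway import TinyPathway

local_path = TinyPathway(dilation=1, hop=256, localize=True, n_layers=2, chunk_size=256)
wide_path = TinyPathway(dilation=32, hop=256, localize=False, n_layers=2, chunk_size=256)
print(local_path.input_window, local_path.hop)  # narrow receptive field, frame hop in samples
print(wide_path.input_window, wide_path.hop)    # 32x dilated context, same frame hop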
musc/pitch_estimator.py
ADDED
@@ -0,0 +1,206 @@
from torch import nn
import torch
import torchaudio
from typing import List, Optional, Tuple
import pathlib
from scipy.signal import medfilt
import numpy as np
import librosa
from librosa.sequence import viterbi_discriminative
from scipy.ndimage import gaussian_filter1d
from .postprocessing import spotify_create_notes


class PitchEstimator(nn.Module):
    """
    This is the base class that everything else inherits from. The hierarchy is:
    PitchEstimator -> Transcriber -> Synchronizer -> AutonomousAgent -> The n-Head Music Performance Analysis Models
    PitchEstimator can handle reading the audio, predicting all the features,
    estimating a single frame-level f0 using viterbi,
    MIDI pitch bend creation for the predicted note events when used inside a Transcriber, or
    score-informed f0 estimation when used inside a Synchronizer.
    """
    def __init__(self, labeling, instrument='Violin', sr=16000, window_size=1024, hop_length=160):
        super().__init__()
        self.labeling = labeling
        self.sr = sr
        self.window_size = window_size
        self.hop_length = hop_length
        self.instrument = instrument
        self.f0_bins_per_semitone = int(np.round(100 / self.labeling.f0_granularity_c))

    def read_audio(self, audio):
        """
        Read and resample an audio file, convert to mono, and unfold into representation frames.
        The time array represents the center of each small frame with 5.8ms hop length. This is different from the
        chunk-level frames: the chunk-level frames represent the entire sequence the model sees, whereas it predicts
        at the small-frame interval (5.8ms).
        :param audio: str, pathlib.Path, np.ndarray, or torch.Tensor
        :return: frames: (n_big_frames, frame_length), times: (n_small_frames,)
        """
        if isinstance(audio, str) or isinstance(audio, pathlib.Path):
            audio, sample_rate = torchaudio.load(audio, normalize=True)
            audio = audio.mean(axis=0)  # convert to mono
            if sample_rate != self.sr:
                audio = torchaudio.functional.resample(audio, sample_rate, self.sr)
        elif isinstance(audio, np.ndarray):
            audio = torch.from_numpy(audio)
        else:
            assert isinstance(audio, torch.Tensor)
        len_audio = audio.shape[-1]
        n_frames = int(np.ceil((len_audio + sum(self.frame_overlap)) / (self.hop_length * self.chunk_size)))
        audio = nn.functional.pad(audio, (self.frame_overlap[0],
                                          self.frame_overlap[1] + (n_frames * self.hop_length * self.chunk_size) - len_audio))
        frames = audio.unfold(0, self.max_window_size, self.hop_length * self.chunk_size)
        times = np.arange(0, len_audio, self.hop_length) / self.sr  # not tensor, we don't compute anything with it
        return frames, times

    def predict(self, audio, batch_size):
        frames, times = self.read_audio(audio)
        performance = {'f0': [], 'note': [], 'onset': [], 'offset': []}
        self.eval()
        device = self.main.conv0.conv2d.weight.device
        with torch.no_grad():
            for i in range(0, len(frames), batch_size):
                f = frames[i:min(i + batch_size, len(frames))].to(device)
                f -= (torch.mean(f, axis=1).unsqueeze(-1))
                f /= (torch.std(f, axis=1).unsqueeze(-1))
                out = self.forward(f)
                for key, value in out.items():
                    value = torch.sigmoid(value)
                    value = torch.nan_to_num(value)  # the model outputs nan when the frame is silent (this is an expected behavior due to normalization)
                    value = value.view(-1, value.shape[-1])
                    value = value.detach().cpu().numpy()
                    performance[key].append(value)
        performance = {key: np.concatenate(value, axis=0)[:len(times)] for key, value in performance.items()}
        performance['time'] = times
        return performance

    def estimate_pitch(self, audio, batch_size, viterbi=False):
        out = self.predict(audio, batch_size)
        f0_hz = self.out2f0(out, viterbi)
        return out['time'], f0_hz

    def out2f0(self, out, viterbi=False):
        """
        Monophonic f0 estimation from the model output. The viterbi postprocessing is specialized for the violin family.
        """
        salience = out['f0']
        if viterbi == 'constrained':
            assert hasattr(self, 'out2note')
            notes = spotify_create_notes(out["note"], out["onset"], note_low=self.labeling.midi_centers[0],
                                         note_high=self.labeling.midi_centers[-1], onset_thresh=0.5, frame_thresh=0.3,
                                         infer_onsets=True, melodia_trick=True,
                                         min_note_len=int(np.round(127.70 / 1000 * (self.sr / self.hop_length))))
            note_cents = self.get_pitch_bends(salience, notes, to_midi=False, timing_refinement_range=0)
            cents = np.zeros_like(out['time'])
            cents[note_cents[:, 0].astype(int)] = note_cents[:, 1]
        elif viterbi:
            # transition probabilities inducing continuous pitch
            # big changes are penalized with one order of magnitude
            transition = gaussian_filter1d(np.eye(self.labeling.f0_n_bins), 30) + 99 * gaussian_filter1d(
                np.eye(self.labeling.f0_n_bins), 2)
            transition = transition / np.sum(transition, axis=1)[:, None]

            p = salience / salience.sum(axis=1)[:, None]
            p[np.isnan(p.sum(axis=1)), :] = np.ones(self.labeling.f0_n_bins) * 1 / self.labeling.f0_n_bins
            path = viterbi_discriminative(p.T, transition)
            cents = np.array([self.labeling.f0_label2c(salience[i, :], path[i]) for i in range(len(path))])
        else:
            cents = self.labeling.f0_label2c(salience, center=None)  # use argmax for center

        f0_hz = self.labeling.f0_c2hz(cents)
        f0_hz[np.isnan(f0_hz)] = 0
        return f0_hz

    def get_pitch_bends(
            self,
            contours: np.ndarray, note_events: List[Tuple[int, int, int, float]],
            timing_refinement_range: int = 0, to_midi: bool = True,
    ) -> List[Tuple[int, int, int, float, Optional[List[int]]]]:
        """Modified version of an excellent script from Spotify/basic_pitch!! Thank you!!!!
        Given note events and contours, estimate pitch bends per note.
        Pitch bends are represented as a sequence of evenly spaced midi pitch bend control units.
        The time stamps of each pitch bend can be inferred by computing an evenly spaced grid between
        the start and end times of each note event.
        Args:
            contours: Matrix of estimated pitch contours
            note_events: note event tuple
            timing_refinement_range: if > 0, refine onset/offset boundaries with f0 confidence
            to_midi: whether to convert pitch bends to midi pitch bends. If False, return pitch estimates in the format
                [time (index), pitch (Hz), confidence in range [0, 1]].
        Returns:
            note events with pitch bends
        """

        f0_matrix = []  # [time (index), pitch (Hz), confidence in range [0, 1]]
        note_events_with_pitch_bends = []
        for start_idx, end_idx, pitch_midi, amplitude in note_events:
            if timing_refinement_range:
                start_idx = np.max([0, start_idx - timing_refinement_range])
                end_idx = np.min([contours.shape[0], end_idx + timing_refinement_range])
            freq_idx = int(np.round(self.midi_pitch_to_contour_bin(pitch_midi)))
            freq_start_idx = np.max([freq_idx - self.labeling.f0_tolerance_bins, 0])
            freq_end_idx = np.min([self.labeling.f0_n_bins, freq_idx + self.labeling.f0_tolerance_bins + 1])

            trans_start_idx = np.max([0, self.labeling.f0_tolerance_bins - freq_idx])
            trans_end_idx = (2 * self.labeling.f0_tolerance_bins + 1) - \
                            np.max([0, freq_idx - (self.labeling.f0_n_bins - self.labeling.f0_tolerance_bins - 1)])

            # apply regional viterbi to estimate the intonation
            # observation probabilities come from the f0_roll matrix
            observation = contours[start_idx:end_idx, freq_start_idx:freq_end_idx]
            observation = observation / observation.sum(axis=1)[:, None]
            observation[np.isnan(observation.sum(axis=1)), :] = np.ones(freq_end_idx - freq_start_idx) * 1 / (
                    freq_end_idx - freq_start_idx)

            # transition probabilities assure continuity
            transition = self.labeling.f0_transition_matrix[trans_start_idx:trans_end_idx,
                                                            trans_start_idx:trans_end_idx] + 1e-6
            transition = transition / np.sum(transition, axis=1)[:, None]

            path = viterbi_discriminative(observation.T / observation.sum(axis=1), transition) + freq_start_idx

            cents = np.array([self.labeling.f0_label2c(contours[i + start_idx, :], path[i]) for i in range(len(path))])
            bends = cents - self.labeling.midi_centers_c[pitch_midi - self.labeling.midi_centers[0]]
            if to_midi:
                bends = (bends * 4096 / 100).astype(int)
                bends[bends > 8191] = 8191
                bends[bends < -8192] = -8192

                if timing_refinement_range:
                    confidences = np.array([contours[i + start_idx, path[i]] for i in range(len(path))])
                    threshold = np.median(confidences)
                    threshold = (np.median(confidences > threshold) + threshold) / 2  # some magic
                    median_kernel = 2 * (timing_refinement_range // 2) + 1  # some more magic
                    confidences = medfilt(confidences, kernel_size=median_kernel)
                    conf_bool = confidences > threshold
                    onset_idx = np.argmax(conf_bool)
                    offset_idx = len(confidences) - np.argmax(conf_bool[::-1])
                    bends = bends[onset_idx:offset_idx]
                    start_idx = start_idx + onset_idx
                    end_idx = start_idx + offset_idx

                note_events_with_pitch_bends.append((start_idx, end_idx, pitch_midi, amplitude, bends))
            else:
                confidences = np.array([contours[i + start_idx, path[i]] for i in range(len(path))])
                time_idx = np.arange(len(path)) + start_idx
                # f0_hz = self.labeling.f0_c2hz(cents)
                possible_f0s = np.array([time_idx, cents, confidences]).T
                f0_matrix.append(possible_f0s[np.abs(bends) < 100])  # filter out pitch bends that are too large
        if not to_midi:
            return np.vstack(f0_matrix)
        else:
            return note_events_with_pitch_bends

    def midi_pitch_to_contour_bin(self, pitch_midi: int) -> np.array:
        """Convert midi pitch to corresponding index in contour matrix
        Args:
            pitch_midi: pitch in midi
        Returns:
            index in contour matrix
        """
        pitch_hz = librosa.midi_to_hz(pitch_midi)
        return np.argmin(np.abs(self.labeling.f0_centers_hz - pitch_hz))
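For completeness, a hedged sketch of frame-level f0 estimation through the PitchEstimator interface; it assumes the pretrained violin model shipped with this Space (violin.json and violin_model.pt) inherits these methods, and 'example.wav' is an arbitrary placeholder file name:

from json import load
from musc.model import PretrainedModel

model = PretrainedModel(load(open('violin.json')), 'violin_model.pt')
times, f0_hz = model.estimate_pitch('example.wav', batch_size=16, viterbi=True)
for t, f in zip(times[:5], f0_hz[:5]):
    print(f'{t:.3f} s  {f:.1f} Hz')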
musc/postprocessing.py
ADDED
@@ -0,0 +1,533 @@
from typing import List, Tuple
import scipy
import numpy as np


# SPOTIFY

def get_inferred_onsets(onset_roll: np.array, note_roll: np.array, n_diff: int = 2) -> np.array:
    """
    Infer onsets from large changes in note roll matrix amplitudes.
    Modified from https://github.com/spotify/basic-pitch/blob/main/basic_pitch/note_creation.py
    :param onset_roll: Onset activation matrix (n_times, n_freqs).
    :param note_roll: Frame-level note activation matrix (n_times, n_freqs).
    :param n_diff: Differences used to detect onsets.
    :return: The maximum between the predicted onsets and its differences.
    """

    diffs = []
    for n in range(1, n_diff + 1):
        frames_appended = np.concatenate([np.zeros((n, note_roll.shape[1])), note_roll])
        diffs.append(frames_appended[n:, :] - frames_appended[:-n, :])
    frame_diff = np.min(diffs, axis=0)
    frame_diff[frame_diff < 0] = 0
    frame_diff[:n_diff, :] = 0
    frame_diff = np.max(onset_roll) * frame_diff / np.max(frame_diff)  # rescale to have the same max as onsets

    max_onsets_diff = np.max([onset_roll, frame_diff],
                             axis=0)  # use the max of the predicted onsets and the differences

    return max_onsets_diff


def spotify_create_notes(
        note_roll: np.array,
        onset_roll: np.array,
        onset_thresh: float,
        frame_thresh: float,
        min_note_len: int,
        infer_onsets: bool,
        note_low: int,   # self.labeling.midi_centers[0]
        note_high: int,  # self.labeling.midi_centers[-1]
        melodia_trick: bool = True,
        energy_tol: int = 11,
) -> List[Tuple[int, int, int, float]]:
    """Decode raw model output to polyphonic note events
    Modified from https://github.com/spotify/basic-pitch/blob/main/basic_pitch/note_creation.py
    Args:
        note_roll: Frame activation matrix (n_times, n_freqs).
        onset_roll: Onset activation matrix (n_times, n_freqs).
        onset_thresh: Minimum amplitude of an onset activation to be considered an onset.
        frame_thresh: Minimum amplitude of a frame activation for a note to remain "on".
        min_note_len: Minimum allowed note length in frames.
        infer_onsets: If True, add additional onsets when there are large differences in frame amplitudes.
        melodia_trick: Whether to use the melodia trick to better detect notes.
        energy_tol: Drop notes below this energy.
    Returns:
        list of tuples [(start_time_frames, end_time_frames, pitch_midi, amplitude)]
        representing the note events, where amplitude is a number between 0 and 1
    """

    n_frames = note_roll.shape[0]

    # use onsets inferred from frames in addition to the predicted onsets
    if infer_onsets:
        onset_roll = get_inferred_onsets(onset_roll, note_roll)

    peak_thresh_mat = np.zeros(onset_roll.shape)
    peaks = scipy.signal.argrelmax(onset_roll, axis=0)
    peak_thresh_mat[peaks] = onset_roll[peaks]

    onset_idx = np.where(peak_thresh_mat >= onset_thresh)
    onset_time_idx = onset_idx[0][::-1]  # sort to go backwards in time
    onset_freq_idx = onset_idx[1][::-1]  # sort to go backwards in time

    remaining_energy = np.zeros(note_roll.shape)
    remaining_energy[:, :] = note_roll[:, :]

    # loop over onsets
    note_events = []
    for note_start_idx, freq_idx in zip(onset_time_idx, onset_freq_idx):
        # if we're too close to the end of the audio, continue
        if note_start_idx >= n_frames - 1:
            continue

        # find time index at this frequency band where the frames drop below an energy threshold
        i = note_start_idx + 1
        k = 0  # number of frames since energy dropped below threshold
        while i < n_frames - 1 and k < energy_tol:
            if remaining_energy[i, freq_idx] < frame_thresh:
                k += 1
            else:
                k = 0
            i += 1

        i -= k  # go back to frame above threshold

        # if the note is too short, skip it
        if i - note_start_idx <= min_note_len:
            continue

        remaining_energy[note_start_idx:i, freq_idx] = 0
        if freq_idx < note_high:
            remaining_energy[note_start_idx:i, freq_idx + 1] = 0
        if freq_idx > note_low:
            remaining_energy[note_start_idx:i, freq_idx - 1] = 0

        # add the note
        amplitude = np.mean(note_roll[note_start_idx:i, freq_idx])
        note_events.append(
            (
                note_start_idx,
                i,
                freq_idx + note_low,
                amplitude,
            )
        )

    if melodia_trick:
        energy_shape = remaining_energy.shape

        while np.max(remaining_energy) > frame_thresh:
            i_mid, freq_idx = np.unravel_index(np.argmax(remaining_energy), energy_shape)
            remaining_energy[i_mid, freq_idx] = 0

            # forward pass
            i = i_mid + 1
            k = 0
            while i < n_frames - 1 and k < energy_tol:
                if remaining_energy[i, freq_idx] < frame_thresh:
                    k += 1
                else:
                    k = 0

                remaining_energy[i, freq_idx] = 0
                if freq_idx < note_high:
                    remaining_energy[i, freq_idx + 1] = 0
                if freq_idx > note_low:
                    remaining_energy[i, freq_idx - 1] = 0

                i += 1

            i_end = i - 1 - k  # go back to frame above threshold

            # backward pass
            i = i_mid - 1
            k = 0
            while i > 0 and k < energy_tol:
                if remaining_energy[i, freq_idx] < frame_thresh:
                    k += 1
                else:
                    k = 0

                remaining_energy[i, freq_idx] = 0
                if freq_idx < note_high:
                    remaining_energy[i, freq_idx + 1] = 0
                if freq_idx > note_low:
                    remaining_energy[i, freq_idx - 1] = 0

                i -= 1

            i_start = i + 1 + k  # go back to frame above threshold
            assert i_start >= 0, "{}".format(i_start)
            assert i_end < n_frames

            if i_end - i_start <= min_note_len:
                # note is too short, skip it
                continue

            # add the note
            amplitude = np.mean(note_roll[i_start:i_end, freq_idx])
            note_events.append(
                (
                    i_start,
                    i_end,
                    freq_idx + note_low,
                    amplitude,
                )
            )

    return note_events


# TIKTOK


def note_detection_with_onset_offset_regress(frame_output, onset_output,
                                             onset_shift_output, offset_output, offset_shift_output, velocity_output,
                                             frame_threshold):
    """Process prediction matrices to note events information.
    First, detect onsets with onset outputs. Then, detect offsets
    with frame and offset outputs.

    Args:
        frame_output: (frames_num,)
        onset_output: (frames_num,)
        onset_shift_output: (frames_num,)
        offset_output: (frames_num,)
        offset_shift_output: (frames_num,)
        velocity_output: (frames_num,)
        frame_threshold: float
    Returns:
        output_tuples: list of [bgn, fin, onset_shift, offset_shift, normalized_velocity],
        e.g., [
            [1821, 1909, 0.47498, 0.3048533, 0.72119445],
            [1909, 1947, 0.30730522, -0.45764327, 0.64200014],
            ...]
    """
    output_tuples = []
    bgn = None
    frame_disappear = None
    offset_occur = None

    for i in range(onset_output.shape[0]):
        if onset_output[i] == 1:
            """Onset detected"""
            if bgn:
                """Consecutive onsets. E.g., pedal is not released, but two
                consecutive notes being played."""
                fin = max(i - 1, 0)
                output_tuples.append([bgn, fin, onset_shift_output[bgn],
                                      0, velocity_output[bgn]])
                frame_disappear, offset_occur = None, None
            bgn = i

        if bgn and i > bgn:
            """If onset found, then search offset"""
            if frame_output[i] <= frame_threshold and not frame_disappear:
                """Frame disappear detected"""
                frame_disappear = i

            if offset_output[i] == 1 and not offset_occur:
                """Offset detected"""
                offset_occur = i

            if frame_disappear:
                if offset_occur and offset_occur - bgn > frame_disappear - offset_occur:
                    """bgn --------- offset_occur --- frame_disappear"""
                    fin = offset_occur
                else:
                    """bgn --- offset_occur --------- frame_disappear"""
                    fin = frame_disappear
                output_tuples.append([bgn, fin, onset_shift_output[bgn],
                                      offset_shift_output[fin], velocity_output[bgn]])
                bgn, frame_disappear, offset_occur = None, None, None

            if bgn and (i - bgn >= 600 or i == onset_output.shape[0] - 1):
                """Offset not detected"""
                fin = i
                output_tuples.append([bgn, fin, onset_shift_output[bgn],
                                      offset_shift_output[fin], velocity_output[bgn]])
                bgn, frame_disappear, offset_occur = None, None, None

    # Sort pairs by onsets
    output_tuples.sort(key=lambda pair: pair[0])

    return output_tuples


class RegressionPostProcessor(object):
    def __init__(self, frames_per_second, classes_num, onset_threshold,
                 offset_threshold, frame_threshold, pedal_offset_threshold,
                 begin_note):
        """Postprocess the output probabilities of a transcription model to MIDI
        events.

        Args:
            frames_per_second: float
            classes_num: int
            onset_threshold: float
            offset_threshold: float
            frame_threshold: float
            pedal_offset_threshold: float
        """
        self.frames_per_second = frames_per_second
        self.classes_num = classes_num
        self.onset_threshold = onset_threshold
        self.offset_threshold = offset_threshold
        self.frame_threshold = frame_threshold
        self.pedal_offset_threshold = pedal_offset_threshold
        self.begin_note = begin_note
        self.velocity_scale = 128

    def output_dict_to_midi_events(self, output_dict):
        """Main function. Post process model outputs to MIDI events.

        Args:
            output_dict: {
                'reg_onset_output': (segment_frames, classes_num),
                'reg_offset_output': (segment_frames, classes_num),
                'frame_output': (segment_frames, classes_num),
                'velocity_output': (segment_frames, classes_num),
                'reg_pedal_onset_output': (segment_frames, 1),
                'reg_pedal_offset_output': (segment_frames, 1),
                'pedal_frame_output': (segment_frames, 1)}

        Outputs:
            est_note_events: list of dict, e.g. [
                {'onset_time': 39.74, 'offset_time': 39.87, 'midi_note': 27, 'velocity': 83},
                {'onset_time': 11.98, 'offset_time': 12.11, 'midi_note': 33, 'velocity': 88}]

            est_pedal_events: list of dict, e.g. [
                {'onset_time': 0.17, 'offset_time': 0.96},
                {'onset_time': 1.17, 'offset_time': 2.65}]
        """
        output_dict['frame_output'] = output_dict['note']
        output_dict['velocity_output'] = output_dict['note']
        output_dict['reg_onset_output'] = output_dict['onset']
        output_dict['reg_offset_output'] = output_dict['offset']
        # Post process piano note outputs to piano note and pedal events information
        (est_on_off_note_vels, est_pedal_on_offs) = \
            self.output_dict_to_note_pedal_arrays(output_dict)
        """est_on_off_note_vels: (events_num, 4), the four columns are: [onset_time, offset_time, piano_note, velocity],
        est_pedal_on_offs: (pedal_events_num, 2), the two columns are: [onset_time, offset_time]"""

        # Reformat notes to MIDI events
        est_note_events = self.detected_notes_to_events(est_on_off_note_vels)

        if est_pedal_on_offs is None:
            est_pedal_events = None
        else:
            est_pedal_events = self.detected_pedals_to_events(est_pedal_on_offs)

        return est_note_events, est_pedal_events

    def output_dict_to_note_pedal_arrays(self, output_dict):
        """Postprocess the output probabilities of a transcription model to MIDI
        events.

        Args:
            output_dict: dict, {
                'reg_onset_output': (frames_num, classes_num),
                'reg_offset_output': (frames_num, classes_num),
                'frame_output': (frames_num, classes_num),
                'velocity_output': (frames_num, classes_num),
                ...}

        Returns:
            est_on_off_note_vels: (events_num, 4), the 4 columns are onset_time,
                offset_time, piano_note and velocity. E.g. [
                [39.74, 39.87, 27, 0.65],
                [11.98, 12.11, 33, 0.69],
                ...]

            est_pedal_on_offs: (pedal_events_num, 2), the 2 columns are onset_time
                and offset_time. E.g. [
                [0.17, 0.96],
                [1.17, 2.65],
                ...]
        """

        # ------ 1. Process regression outputs to binarized outputs ------
        # For example, onset or offset of [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, 0.05, 0., 0.]
        # will be processed to [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]

        # Calculate binarized onset output from regression output
        (onset_output, onset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_onset_output'],
                threshold=self.onset_threshold, neighbour=2)

        output_dict['onset_output'] = onset_output  # Values are 0 or 1
        output_dict['onset_shift_output'] = onset_shift_output

        # Calculate binarized offset output from regression output
        (offset_output, offset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_offset_output'],
                threshold=self.offset_threshold, neighbour=4)

        output_dict['offset_output'] = offset_output  # Values are 0 or 1
        output_dict['offset_shift_output'] = offset_shift_output

        if 'reg_pedal_onset_output' in output_dict.keys():
            """Pedal onsets are not used in inference. Instead, frame-wise pedal
            predictions are used to detect onsets. We empirically found this is
            more accurate to detect pedal onsets."""
            pass

        if 'reg_pedal_offset_output' in output_dict.keys():
            # Calculate binarized pedal offset output from regression output
            (pedal_offset_output, pedal_offset_shift_output) = \
                self.get_binarized_output_from_regression(
|
385 |
+
reg_output=output_dict['reg_pedal_offset_output'],
|
386 |
+
threshold=self.pedal_offset_threshold, neighbour=4)
|
387 |
+
|
388 |
+
output_dict['pedal_offset_output'] = pedal_offset_output # Values are 0 or 1
|
389 |
+
output_dict['pedal_offset_shift_output'] = pedal_offset_shift_output
|
390 |
+
|
391 |
+
# ------ 2. Process matrices results to event results ------
|
392 |
+
# Detect piano notes from output_dict
|
393 |
+
est_on_off_note_vels = self.output_dict_to_detected_notes(output_dict)
|
394 |
+
|
395 |
+
est_pedal_on_offs = None
|
396 |
+
|
397 |
+
return est_on_off_note_vels, est_pedal_on_offs
|
398 |
+
|
399 |
+
def get_binarized_output_from_regression(self, reg_output, threshold, neighbour):
|
400 |
+
"""Calculate binarized output and shifts of onsets or offsets from the
|
401 |
+
regression results.
|
402 |
+
|
403 |
+
Args:
|
404 |
+
reg_output: (frames_num, classes_num)
|
405 |
+
threshold: float
|
406 |
+
neighbour: int
|
407 |
+
|
408 |
+
Returns:
|
409 |
+
binary_output: (frames_num, classes_num)
|
410 |
+
shift_output: (frames_num, classes_num)
|
411 |
+
"""
|
412 |
+
binary_output = np.zeros_like(reg_output)
|
413 |
+
shift_output = np.zeros_like(reg_output)
|
414 |
+
(frames_num, classes_num) = reg_output.shape
|
415 |
+
|
416 |
+
for k in range(classes_num):
|
417 |
+
x = reg_output[:, k]
|
418 |
+
for n in range(neighbour, frames_num - neighbour):
|
419 |
+
if x[n] > threshold and self.is_monotonic_neighbour(x, n, neighbour):
|
420 |
+
binary_output[n, k] = 1
|
421 |
+
|
422 |
+
"""See Section III-D in [1] for deduction.
|
423 |
+
[1] Q. Kong, et al., High-resolution Piano Transcription
|
424 |
+
with Pedals by Regressing Onsets and Offsets Times, 2020."""
|
425 |
+
if x[n - 1] > x[n + 1]:
|
426 |
+
shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n + 1]) / 2
|
427 |
+
else:
|
428 |
+
shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n - 1]) / 2
|
429 |
+
shift_output[n, k] = shift
|
430 |
+
|
431 |
+
return binary_output, shift_output
|
432 |
+
|
433 |
+
def is_monotonic_neighbour(self, x, n, neighbour):
|
434 |
+
"""Detect if values are monotonic in both side of x[n].
|
435 |
+
|
436 |
+
Args:
|
437 |
+
x: (frames_num,)
|
438 |
+
n: int
|
439 |
+
neighbour: int
|
440 |
+
|
441 |
+
Returns:
|
442 |
+
monotonic: bool
|
443 |
+
"""
|
444 |
+
monotonic = True
|
445 |
+
for i in range(neighbour):
|
446 |
+
if x[n - i] < x[n - i - 1]:
|
447 |
+
monotonic = False
|
448 |
+
if x[n + i] < x[n + i + 1]:
|
449 |
+
monotonic = False
|
450 |
+
|
451 |
+
return monotonic
|
452 |
+
|
453 |
+
def output_dict_to_detected_notes(self, output_dict):
|
454 |
+
"""Postprocess output_dict to piano notes.
|
455 |
+
|
456 |
+
Args:
|
457 |
+
output_dict: dict, e.g. {
|
458 |
+
'onset_output': (frames_num, classes_num),
|
459 |
+
'onset_shift_output': (frames_num, classes_num),
|
460 |
+
'offset_output': (frames_num, classes_num),
|
461 |
+
'offset_shift_output': (frames_num, classes_num),
|
462 |
+
'frame_output': (frames_num, classes_num),
|
463 |
+
'velocity_output': (frames_num, classes_num),
|
464 |
+
...}
|
465 |
+
|
466 |
+
Returns:
|
467 |
+
est_on_off_note_vels: (notes, 4), the four columns are onsets, offsets,
|
468 |
+
MIDI notes and velocities. E.g.,
|
469 |
+
[[39.7375, 39.7500, 27., 0.6638],
|
470 |
+
[11.9824, 12.5000, 33., 0.6892],
|
471 |
+
...]
|
472 |
+
"""
|
473 |
+
|
474 |
+
est_tuples = []
|
475 |
+
est_midi_notes = []
|
476 |
+
classes_num = output_dict['frame_output'].shape[-1]
|
477 |
+
|
478 |
+
for piano_note in range(classes_num):
|
479 |
+
"""Detect piano notes"""
|
480 |
+
est_tuples_per_note = note_detection_with_onset_offset_regress(
|
481 |
+
frame_output=output_dict['frame_output'][:, piano_note],
|
482 |
+
onset_output=output_dict['onset_output'][:, piano_note],
|
483 |
+
onset_shift_output=output_dict['onset_shift_output'][:, piano_note],
|
484 |
+
offset_output=output_dict['offset_output'][:, piano_note],
|
485 |
+
offset_shift_output=output_dict['offset_shift_output'][:, piano_note],
|
486 |
+
velocity_output=output_dict['velocity_output'][:, piano_note],
|
487 |
+
frame_threshold=self.frame_threshold)
|
488 |
+
|
489 |
+
est_tuples += est_tuples_per_note
|
490 |
+
est_midi_notes += [piano_note + self.begin_note] * len(est_tuples_per_note)
|
491 |
+
|
492 |
+
est_tuples = np.array(est_tuples) # (notes, 5)
|
493 |
+
"""(notes, 5), the five columns are onset, offset, onset_shift,
|
494 |
+
offset_shift and normalized_velocity"""
|
495 |
+
|
496 |
+
est_midi_notes = np.array(est_midi_notes) # (notes,)
|
497 |
+
|
498 |
+
onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
|
499 |
+
offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
|
500 |
+
velocities = est_tuples[:, 4]
|
501 |
+
|
502 |
+
est_on_off_note_vels = np.stack((onset_times, offset_times, est_midi_notes, velocities), axis=-1)
|
503 |
+
"""(notes, 3), the three columns are onset_times, offset_times and velocity."""
|
504 |
+
|
505 |
+
est_on_off_note_vels = est_on_off_note_vels.astype(np.float32)
|
506 |
+
|
507 |
+
return est_on_off_note_vels
|
508 |
+
|
509 |
+
def detected_notes_to_events(self, est_on_off_note_vels):
|
510 |
+
"""Reformat detected notes to midi events.
|
511 |
+
|
512 |
+
Args:
|
513 |
+
est_on_off_note_vels: (notes, 4), the four columns are onset_times,
|
514 |
+
offset_times, midi_notes and velocities. E.g.
|
515 |
+
[[32.8376, 35.7700, 0.7932],
|
516 |
+
[37.3712, 39.9300, 0.8058],
|
517 |
+
...]
|
518 |
+
|
519 |
+
Returns:
|
520 |
+
midi_events, list, e.g.,
|
521 |
+
[{'onset_time': 39.7376, 'offset_time': 39.75, 'midi_note': 27, 'velocity': 84},
|
522 |
+
{'onset_time': 11.9824, 'offset_time': 12.50, 'midi_note': 33, 'velocity': 88},
|
523 |
+
...]
|
524 |
+
"""
|
525 |
+
midi_events = []
|
526 |
+
for i in range(est_on_off_note_vels.shape[0]):
|
527 |
+
midi_events.append({
|
528 |
+
'onset_time': est_on_off_note_vels[i][0],
|
529 |
+
'offset_time': est_on_off_note_vels[i][1],
|
530 |
+
'midi_note': int(est_on_off_note_vels[i][2]),
|
531 |
+
'velocity': int(est_on_off_note_vels[i][3] * self.velocity_scale)})
|
532 |
+
|
533 |
+
return midi_events
|
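
A minimal usage sketch for the post-processor added above, not part of the upload itself. It assumes only what musc/postprocessing.py shows: RegressionPostProcessor maps the model's 'note'/'onset'/'offset' activations onto its frame/onset/offset inputs, and get_binarized_output_from_regression keeps local maxima above a threshold (the toy curve below is the one quoted in the inline comment). The shapes, thresholds, and begin_note value are illustrative placeholders, not values fixed by this repository.

import numpy as np
from musc.postprocessing import RegressionPostProcessor

post = RegressionPostProcessor(
    frames_per_second=16000 / 160,   # placeholder: sr / hop_length
    classes_num=54,                  # placeholder: number of note classes
    onset_threshold=0.2,
    offset_threshold=0.2,
    frame_threshold=0.3,
    pedal_offset_threshold=0.5,
    begin_note=54,                   # placeholder: MIDI number of the lowest class (F#3)
)

# Regression-to-binary step on the curve quoted in the comments above:
# only the local maximum (0.40 at index 4) survives, and shift_output
# stores its sub-frame correction.
toy = np.array([[0., 0., .15, .30, .40, .35, .20, .05, 0., 0.]]).T   # (frames, 1)
binary, shift = post.get_binarized_output_from_regression(toy, threshold=0.3, neighbour=2)
print(binary[:, 0])   # 1 only at index 4
print(shift[4, 0])    # ~0.25, i.e. a shift toward the larger neighbour

# Full pipeline on placeholder activations with the keys the class expects.
frames, classes = 1000, 54
rng = np.random.default_rng(0)
output_dict = {
    'note':   rng.random((frames, classes)),   # frame-wise note activations
    'onset':  rng.random((frames, classes)),   # onset regression output
    'offset': rng.random((frames, classes)),   # offset regression output
}
note_events, pedal_events = post.output_dict_to_midi_events(output_dict)
# note_events: [{'onset_time': ..., 'offset_time': ..., 'midi_note': ..., 'velocity': ...}, ...]
# pedal_events is None here because no pedal outputs were supplied.
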
musc/representations.py
ADDED
@@ -0,0 +1,212 @@
1 |
+
from mir_eval import melody
|
2 |
+
import numpy as np
|
3 |
+
from scipy.stats import norm
|
4 |
+
import librosa
|
5 |
+
import pretty_midi
|
6 |
+
from scipy.ndimage import gaussian_filter1d
|
7 |
+
|
8 |
+
|
9 |
+
class PerformanceLabel:
|
10 |
+
"""
|
11 |
+
The dataset labeling class for performance representations. It currently includes onset, note, and fine-grained f0
|
12 |
+
representations. Note min, note max, and f0_bins_per_semitone values are to be set per instrument. The default
|
13 |
+
values are for violin performance analysis. Fretted instruments might not require such f0 resolutions per semitone.
|
14 |
+
"""
|
15 |
+
def __init__(self, note_min='F#3', note_max='C8', f0_bins_per_semitone=9, f0_smooth_std_c=None,
|
16 |
+
onset_smooth_std=0.7, f0_tolerance_c=200):
|
17 |
+
midi_min = pretty_midi.note_name_to_number(note_min)
|
18 |
+
midi_max = pretty_midi.note_name_to_number(note_max)
|
19 |
+
self.midi_centers = np.arange(midi_min, midi_max)
|
20 |
+
self.onset_smooth_std=onset_smooth_std # onset smoothing along time axis (compensate for alignment)
|
21 |
+
|
22 |
+
f0_hz_range = librosa.note_to_hz([note_min, note_max])
|
23 |
+
f0_c_min, f0_c_max = melody.hz2cents(f0_hz_range)
|
24 |
+
self.f0_granularity_c = 100/f0_bins_per_semitone
|
25 |
+
if not f0_smooth_std_c:
|
26 |
+
f0_smooth_std_c = self.f0_granularity_c * 5/4 # Keep the ratio from the CREPE paper (20 cents and 25 cents)
|
27 |
+
self.f0_smooth_std_c = f0_smooth_std_c
|
28 |
+
|
29 |
+
self.f0_centers_c = np.arange(f0_c_min, f0_c_max, self.f0_granularity_c)
|
30 |
+
self.f0_centers_hz = 10 * 2 ** (self.f0_centers_c / 1200)
|
31 |
+
self.f0_n_bins = len(self.f0_centers_c)
|
32 |
+
|
33 |
+
self.pdf_normalizer = norm.pdf(0)
|
34 |
+
|
35 |
+
self.f0_c2hz = lambda c: 10*2**(c/1200)
|
36 |
+
self.f0_hz2c = melody.hz2cents
|
37 |
+
self.midi_centers_c = self.f0_hz2c(librosa.midi_to_hz(self.midi_centers))
|
38 |
+
|
39 |
+
self.f0_tolerance_bins = int(f0_tolerance_c/self.f0_granularity_c)
|
40 |
+
self.f0_transition_matrix = gaussian_filter1d(np.eye(2*self.f0_tolerance_bins + 1), 25/self.f0_granularity_c)
|
41 |
+
|
42 |
+
def f0_c2label(self, pitch_c):
|
43 |
+
"""
|
44 |
+
Convert a single f0 value in cents to a one-hot label vector with smoothing (i.e., create a gaussian blur around
|
45 |
+
the target f0 bin) for regularization and training stability. The blur is controlled by self.f0_smooth_std_c.
|
46 |
+
:param pitch_c: a single pitch value in cents
|
47 |
+
:return: one-hot label vector with frequency blur
|
48 |
+
"""
|
49 |
+
result = norm.pdf((self.f0_centers_c - pitch_c) / self.f0_smooth_std_c).astype(np.float32)
|
50 |
+
result /= self.pdf_normalizer
|
51 |
+
return result
|
52 |
+
|
53 |
+
def f0_label2c(self, salience, center=None):
|
54 |
+
"""
|
55 |
+
Convert the salience predictions to monophonic f0 in cents. Only outputs a single f0 value per frame!
|
56 |
+
:param salience: f0 activations
|
57 |
+
:param center: f0 center bin to calculate the weighted average. Use argmax if empty
|
58 |
+
:return: f0 array per frame (in cents).
|
59 |
+
"""
|
60 |
+
if salience.ndim == 1:
|
61 |
+
if center is None:
|
62 |
+
center = int(np.argmax(salience))
|
63 |
+
start = max(0, center - 4)
|
64 |
+
end = min(len(salience), center + 5)
|
65 |
+
salience = salience[start:end]
|
66 |
+
product_sum = np.sum(salience * self.f0_centers_c[start:end])
|
67 |
+
weight_sum = np.sum(salience)
|
68 |
+
return product_sum / np.clip(weight_sum, 1e-8, None)
|
69 |
+
if salience.ndim == 2:
|
70 |
+
return np.array([self.f0_label2c(salience[i, :]) for i in range(salience.shape[0])])
|
71 |
+
raise Exception("label should be either 1d or 2d ndarray")
|
72 |
+
|
73 |
+
def fill_onset_matrix(self, onsets, window, feature_rate):
|
74 |
+
"""
|
75 |
+
Create a sparse onset matrix from window and onsets (per-semitone). Apply a gaussian smoothing (along time)
|
76 |
+
so that we can tolerate better the alignment problems. This is similar to the frequency smoothing for the f0.
|
77 |
+
The temporal smoothing is controlled by the parameter self.onset_smooth_std
|
78 |
+
:param onsets: A 2d np.array of individual note onsets with their respective time values
|
79 |
+
(Nx2: time in seconds - midi number)
|
80 |
+
:param window: Timestamps for the frame centers of the sparse matrix
|
81 |
+
:param feature_rate: Window timestamps are integer, this is to convert them to seconds
|
82 |
+
:return: onset_roll: A sparse matrix filled with temporally blurred onsets.
|
83 |
+
"""
|
84 |
+
onsets = self.get_window_feats(onsets, window, feature_rate)
|
85 |
+
onset_roll = np.zeros((len(window), len(self.midi_centers)))
|
86 |
+
for onset in onsets:
|
87 |
+
onset, note = onset # it was a pair with time and midi note
|
88 |
+
if self.midi_centers[0] < note < self.midi_centers[-1]: # midi note should be in the range defined
|
89 |
+
note = int(note) - self.midi_centers[0] # find the note index in our range
|
90 |
+
onset = (onset*feature_rate)-window[0] # onset index (as float but in frames, not in seconds!)
|
91 |
+
start = max(0, int(onset) - 3)
|
92 |
+
end = min(len(window) - 1, int(onset) + 3)
|
93 |
+
try:
|
94 |
+
vals = norm.pdf(np.linspace(start - onset, end - onset, end - start + 1) / self.onset_smooth_std)
|
95 |
+
# if you increase 0.7 you smooth the peak
|
96 |
+
# if you decrease it, e.g., 0.1, it becomes too peaky! around 0.5-0.7 seems ok
|
97 |
+
vals /= self.pdf_normalizer
|
98 |
+
onset_roll[start:end + 1, note] += vals
|
99 |
+
except ValueError:
|
100 |
+
print('start',start, 'onset', onset, 'end', end)
|
101 |
+
return onset_roll, onsets
|
102 |
+
|
103 |
+
def fill_note_matrix(self, notes, window, feature_rate):
|
104 |
+
"""
|
105 |
+
Create the note matrix (piano roll) from window timestamps and note values per frame.
|
106 |
+
:param notes: A 2d np.array of individual notes with their active time values Nx2
|
107 |
+
:param window: Timestamps for the frame centers of the output
|
108 |
+
:param feature_rate: Window timestamps are integer, this is to convert them to seconds
|
109 |
+
:return note_roll: The piano roll in the defined range of [note_min, note_max).
|
110 |
+
"""
|
111 |
+
notes = self.get_window_feats(notes, window, feature_rate)
|
112 |
+
|
113 |
+
# take the notes in the midi range defined
|
114 |
+
notes = notes[np.logical_and(notes[:,1]>=self.midi_centers[0], notes[:,1]<=self.midi_centers[-1]),:]
|
115 |
+
|
116 |
+
times = (notes[:,0]*feature_rate - window[0]).astype(int) # in feature samples (fs:self.hop/self.sr)
|
117 |
+
notes = (notes[:,1] - self.midi_centers[0]).astype(int)
|
118 |
+
|
119 |
+
note_roll = np.zeros((len(window), len(self.midi_centers)))
|
120 |
+
note_roll[(times, notes)] = 1
|
121 |
+
return note_roll, notes
|
122 |
+
|
123 |
+
|
124 |
+
def fill_f0_matrix(self, f0s, window, feature_rate):
|
125 |
+
"""
|
126 |
+
Unlike the labels for onsets and notes, f0 label is only relevant for strictly monophonic regions! Thus, this
|
127 |
+
function returns a boolean which represents where to apply the given values.
|
128 |
+
Never back-propagate without the boolean! Empty frames mean that the label is not that reliable.
|
129 |
+
|
130 |
+
:param f0s: A 2d np.array of f0 values with the time they belong to (2xN: time in seconds - f0 in Hz)
|
131 |
+
:param window: Timestamps for the frame centers of the output
|
132 |
+
:param feature_rate: Window timestamps are integer, this is to convert them to seconds
|
133 |
+
|
134 |
+
:return f0_roll: f0 label matrix and
|
135 |
+
f0_hz: f0 values in Hz
|
136 |
+
annotation_bool: A boolean array representing which frames have reliable f0 annotations.
|
137 |
+
"""
|
138 |
+
f0s = self.get_window_feats(f0s, window, feature_rate)
|
139 |
+
f0_cents = np.zeros_like(window, dtype=float)
|
140 |
+
f0s[:,1] = self.f0_hz2c(f0s[:,1]) # convert f0 in hz to cents
|
141 |
+
|
142 |
+
annotation_bool = np.zeros_like(window, dtype=bool)
|
143 |
+
f0_roll = np.zeros((len(window), len(self.f0_centers_c)))
|
144 |
+
times_in_frame = f0s[:, 0]*feature_rate - window[0]
|
145 |
+
for t, f0 in enumerate(f0s):
|
146 |
+
t = times_in_frame[t]
|
147 |
+
if t%1 < 0.25: # only consider it as an annotation if the f0 value is really close to the frame center
|
148 |
+
t = int(np.round(t))
|
149 |
+
f0_roll[t] = self.f0_c2label(f0[1])
|
150 |
+
annotation_bool[t] = True
|
151 |
+
f0_cents[t] = f0[1]
|
152 |
+
|
153 |
+
return f0_roll, f0_cents, annotation_bool
|
154 |
+
|
155 |
+
|
156 |
+
@staticmethod
|
157 |
+
def get_window_feats(time_feature_matrix, window, feature_rate):
|
158 |
+
"""
|
159 |
+
Restrict the feature matrix to the features that are inside the window
|
160 |
+
:param window: Timestamps for the frame centers of the output
|
161 |
+
:param time_feature_matrix: A 2d array of Nx2 per the entire file.
|
162 |
+
:param feature_rate: Window timestamps are integer, this is to convert them to seconds
|
163 |
+
:return: window_features: the features inside the given window
|
164 |
+
"""
|
165 |
+
start = time_feature_matrix[:,0]>(window[0]-0.5)/feature_rate
|
166 |
+
end = time_feature_matrix[:,0]<(window[-1]+0.5)/feature_rate
|
167 |
+
window_features = np.logical_and(start, end)
|
168 |
+
window_features = np.array(time_feature_matrix[window_features,:])
|
169 |
+
return window_features
|
170 |
+
|
171 |
+
def represent_midi(self, midi, feature_rate):
|
172 |
+
"""
|
173 |
+
Represent a midi file as sparse matrices of onsets, offsets, and notes. No f0 is included.
|
174 |
+
:param midi: A midi file (either a path or a pretty_midi.PrettyMIDI object)
|
175 |
+
:param feature_rate: The feature rate in Hz
|
176 |
+
:return: dict {onset, offset, note, time}: Same format with the model's learning and outputs
|
177 |
+
"""
|
178 |
+
def _get_onsets_offsets_frames(midi_content):
|
179 |
+
if isinstance(midi_content, str):
|
180 |
+
midi_content = pretty_midi.PrettyMIDI(midi_content)
|
181 |
+
onsets = []
|
182 |
+
offsets = []
|
183 |
+
frames = []
|
184 |
+
for instrument in midi_content.instruments:
|
185 |
+
for note in instrument.notes:
|
186 |
+
start = int(np.round(note.start * feature_rate))
|
187 |
+
end = int(np.round(note.end * feature_rate))
|
188 |
+
note_times = (np.arange(start, end+0.5)/feature_rate)[:, np.newaxis]
|
189 |
+
note_pitch = np.full_like(note_times, fill_value=note.pitch)
|
190 |
+
onsets.append([note.start, note.pitch])
|
191 |
+
offsets.append([note.end, note.pitch])
|
192 |
+
frames.append(np.hstack([note_times, note_pitch]))
|
193 |
+
onsets = np.vstack(onsets)
|
194 |
+
offsets = np.vstack(offsets)
|
195 |
+
frames = np.vstack(frames)
|
196 |
+
return onsets, offsets, frames, midi_content
|
197 |
+
onset_array, offset_array, frame_array, midi_object = _get_onsets_offsets_frames(midi)
|
198 |
+
window = np.arange(frame_array[0, 0]*feature_rate, frame_array[-1, 0]*feature_rate, dtype=int)
|
199 |
+
onset_roll, _ = self.fill_onset_matrix(onset_array, window, feature_rate)
|
200 |
+
offset_roll, _ = self.fill_onset_matrix(offset_array, window, feature_rate)
|
201 |
+
note_roll, _ = self.fill_note_matrix(frame_array, window, feature_rate)
|
202 |
+
start_anchor = onset_array[onset_array[:, 0]==np.min(onset_array[:, 0])]
|
203 |
+
end_anchor = offset_array[offset_array[:, 0]==np.max(offset_array[:, 0])]
|
204 |
+
return {
|
205 |
+
'midi': midi_object,
|
206 |
+
'note': note_roll,
|
207 |
+
'onset': onset_roll,
|
208 |
+
'offset': offset_roll,
|
209 |
+
'time': window/feature_rate,
|
210 |
+
'start_anchor': start_anchor,
|
211 |
+
'end_anchor': end_anchor
|
212 |
+
}
|
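
A short sketch, assuming only the constructor defaults above, of the f0 labeling round trip in PerformanceLabel: a frequency in Hz is converted to cents, blurred into a soft activation vector by f0_c2label, and decoded back by f0_label2c with a local weighted average around the peak.

import numpy as np
from musc.representations import PerformanceLabel

label = PerformanceLabel()                       # defaults: F#3..C8, 9 f0 bins per semitone
f0_cents = label.f0_hz2c(np.array([440.0]))[0]   # concert A, in cents above the 10 Hz reference
soft = label.f0_c2label(f0_cents)                # gaussian-blurred one-hot vector
decoded = label.f0_label2c(soft)                 # weighted average around the argmax
print(abs(decoded - f0_cents))                   # residual stays well below one bin width
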
musc/synchronizer.py
ADDED
@@ -0,0 +1,299 @@
1 |
+
from .dtw.mrmsdtw import sync_via_mrmsdtw_with_anchors
|
2 |
+
from .dtw.utils import make_path_strictly_monotonic
|
3 |
+
import numpy as np
|
4 |
+
from .transcriber import Transcriber
|
5 |
+
from typing import Dict
|
6 |
+
|
7 |
+
class Synchronizer(Transcriber):
|
8 |
+
def __init__(self, labeling, instrument='Violin', sr=16000, window_size=1024, hop_length=160):
|
9 |
+
super().__init__(labeling, instrument=instrument, sr=sr, window_size=window_size, hop_length=hop_length)
|
10 |
+
def synchronize(self, audio, midi, batch_size=128, include_pitch_bends=True, to_midi=True, debug=False,
|
11 |
+
include_velocity=False, alignment_padding=50, timing_refinement_range_with_f0s=0):
|
12 |
+
"""
|
13 |
+
Synchronize an audio file or mono waveform in numpy or torch with a MIDI file.
|
14 |
+
:param audio: str, pathlib.Path, np.ndarray, or torch.Tensor
|
15 |
+
:param midi: str, pathlib.Path, or pretty_midi.PrettyMIDI
|
16 |
+
:param batch_size: frames to process at once
|
17 |
+
:param include_pitch_bends: whether to include pitch bends in the MIDI file
|
18 |
+
:param to_midi: whether to return a MIDI file or a list of note events (as tuple)
|
19 |
+
:param debug: whether to plot the alignment path and compare the alignment with the predicted notes
|
20 |
+
:param include_velocity: whether to embed the note confidence in place of the velocity in the MIDI file
|
21 |
+
:param alignment_padding: how many frames to pad the audio and MIDI representations with
|
22 |
+
:param timing_refinement_range_with_f0s: how many frames to refine the alignment with the f0 confidence
|
23 |
+
:return: aligned MIDI file as a pretty_midi.PrettyMIDI object
|
24 |
+
|
25 |
+
Args:
|
26 |
+
debug:
|
27 |
+
to_midi:
|
28 |
+
include_pitch_bends:
|
29 |
+
"""
|
30 |
+
|
31 |
+
audio = self.predict(audio, batch_size)
|
32 |
+
notes_and_midi = self.out2sync(audio, midi, include_velocity=include_velocity,
|
33 |
+
alignment_padding=alignment_padding)
|
34 |
+
if notes_and_midi: # it might be none
|
35 |
+
notes, midi = notes_and_midi
|
36 |
+
|
37 |
+
if debug:
|
38 |
+
import matplotlib.pyplot as plt
|
39 |
+
import pandas as pd
|
40 |
+
estimated_notes = self.out2note(audio, postprocessing='spotify', include_pitch_bends=True)
|
41 |
+
est_df = pd.DataFrame(estimated_notes).sort_values(by=0)
|
42 |
+
note_df = pd.DataFrame(notes).sort_values(by=0)
|
43 |
+
|
44 |
+
fig, ax = plt.subplots(figsize=(20, 10))
|
45 |
+
|
46 |
+
for row in notes:
|
47 |
+
t_start = row[0] # sec
|
48 |
+
t_end = row[1] # sec
|
49 |
+
freq = row[2] # Hz
|
50 |
+
ax.hlines(freq, t_start, t_end, color='k', linewidth=3, zorder=2, alpha=0.5)
|
51 |
+
|
52 |
+
for row in estimated_notes:
|
53 |
+
t_start = row[0] # sec
|
54 |
+
t_end = row[1] # sec
|
55 |
+
freq = row[2] # Hz
|
56 |
+
ax.hlines(freq, t_start, t_end, color='r', linewidth=3, zorder=2, alpha=0.5)
|
57 |
+
fig.suptitle('alignment (black) vs. estimated (red)')
|
58 |
+
fig.show()
|
59 |
+
|
60 |
+
if not include_pitch_bends:
|
61 |
+
if to_midi:
|
62 |
+
return midi['midi']
|
63 |
+
else:
|
64 |
+
return notes
|
65 |
+
else:
|
66 |
+
notes = [(np.argmin(np.abs(audio['time']-note[0])),
|
67 |
+
np.argmin(np.abs(audio['time']-note[1])),
|
68 |
+
note[2], note[3]) for note in notes]
|
69 |
+
notes = self.get_pitch_bends(audio["f0"], notes, timing_refinement_range_with_f0s)
|
70 |
+
notes = [
|
71 |
+
(audio['time'][note[0]], audio['time'][note[1]], note[2], note[3], note[4]) for note in
|
72 |
+
notes
|
73 |
+
]
|
74 |
+
if to_midi:
|
75 |
+
return self.note2midi(notes, 120) #int(midi['midi'].estimate_tempo()))
|
76 |
+
else:
|
77 |
+
return notes
|
78 |
+
|
79 |
+
def out2sync_old(self, out: Dict[str, np.array], midi, include_velocity=False, alignment_padding=50, debug=False):
|
80 |
+
"""
|
81 |
+
Synchronizes the output of the model with the MIDI file.
|
82 |
+
Args:
|
83 |
+
out: Model output dictionary
|
84 |
+
midi: Path to the MIDI file or PrettyMIDI object
|
85 |
+
include_velocity: Whether to encode the note confidence in place of velocity
|
86 |
+
alignment_padding: Number of frames to pad the MIDI features with zeros
|
87 |
+
debug: Visualize the alignment
|
88 |
+
|
89 |
+
Returns:
|
90 |
+
note events and the aligned PrettyMIDI object
|
91 |
+
"""
|
92 |
+
midi = self.labeling.represent_midi(midi, self.sr/self.hop_length)
|
93 |
+
|
94 |
+
audio_midi_anchors = self.prepare_for_synchronization(out, midi, feature_rate=self.sr/self.hop_length,
|
95 |
+
pad_length=alignment_padding)
|
96 |
+
if isinstance(audio_midi_anchors, str):
|
97 |
+
print(audio_midi_anchors)
|
98 |
+
return None # the file is corrupted! no possible alignment at all
|
99 |
+
else:
|
100 |
+
audio, midi, anchor_pairs = audio_midi_anchors
|
101 |
+
|
102 |
+
ALPHA = 0.6 # This is the coefficient of onsets, 1 - ALPHA for offsets
|
103 |
+
|
104 |
+
wp = sync_via_mrmsdtw_with_anchors(f_chroma1=audio['note'].T,
|
105 |
+
f_onset1=np.hstack([ALPHA * audio['onset'],
|
106 |
+
(1 - ALPHA) * audio['offset']]).T,
|
107 |
+
f_chroma2=midi['note'].T,
|
108 |
+
f_onset2=np.hstack([ALPHA * midi['onset'],
|
109 |
+
(1 - ALPHA) * midi['offset']]).T,
|
110 |
+
input_feature_rate=self.sr/self.hop_length,
|
111 |
+
step_weights=np.array([1.5, 1.5, 2.0]),
|
112 |
+
threshold_rec=10 ** 6,
|
113 |
+
verbose=debug, normalize_chroma=False,
|
114 |
+
anchor_pairs=anchor_pairs)
|
115 |
+
wp = make_path_strictly_monotonic(wp).astype(int)
|
116 |
+
|
117 |
+
audio_time = np.take(audio['time'], wp[0])
|
118 |
+
midi_time = np.take(midi['time'], wp[1])
|
119 |
+
|
120 |
+
notes = []
|
121 |
+
for instrument in midi['midi'].instruments:
|
122 |
+
for note in instrument.notes:
|
123 |
+
note.start = np.interp(note.start, midi_time, audio_time)
|
124 |
+
note.end = np.interp(note.end, midi_time, audio_time)
|
125 |
+
|
126 |
+
if note.end - note.start <= 0.012: # notes should be at least 12 ms (i.e. 2 frames)
|
127 |
+
note.start = note.start - 0.003
|
128 |
+
note.end = note.start + 0.012
|
129 |
+
|
130 |
+
if include_velocity: # encode the note confidence in place of velocity
|
131 |
+
velocity = np.median(audio['note'][np.argmin(np.abs(audio['time']-note.start)):
|
132 |
+
np.argmin(np.abs(audio['time']-note.end)),
|
133 |
+
note.pitch-self.labeling.midi_centers[0]])
|
134 |
+
|
135 |
+
note.velocity = max(1, velocity*127) # velocity should be at least 1 otherwise midi removes the note
|
136 |
+
else:
|
137 |
+
velocity = note.velocity/127
|
138 |
+
notes.append((note.start, note.end, note.pitch, velocity))
|
139 |
+
return notes, midi
|
140 |
+
|
141 |
+
|
142 |
+
def out2sync(self, out: Dict[str, np.array], midi, include_velocity=False, alignment_padding=50, debug=False):
|
143 |
+
"""
|
144 |
+
Synchronizes the output of the model with the MIDI file.
|
145 |
+
Args:
|
146 |
+
out: Model output dictionary
|
147 |
+
midi: Path to the MIDI file or PrettyMIDI object
|
148 |
+
include_velocity: Whether to encode the note confidence in place of velocity
|
149 |
+
alignment_padding: Number of frames to pad the MIDI features with zeros
|
150 |
+
debug: Visualize the alignment
|
151 |
+
|
152 |
+
Returns:
|
153 |
+
note events and the aligned PrettyMIDI object
|
154 |
+
"""
|
155 |
+
midi = self.labeling.represent_midi(midi, self.sr/self.hop_length)
|
156 |
+
|
157 |
+
audio_midi_anchors = self.prepare_for_synchronization(out, midi, feature_rate=self.sr/self.hop_length,
|
158 |
+
pad_length=alignment_padding)
|
159 |
+
if isinstance(audio_midi_anchors, str):
|
160 |
+
print(audio_midi_anchors)
|
161 |
+
return None # the file is corrupted! no possible alignment at all
|
162 |
+
else:
|
163 |
+
audio, midi, anchor_pairs = audio_midi_anchors
|
164 |
+
|
165 |
+
ALPHA = 0.6 # This is the coefficient of onsets, 1 - ALPHA for offsets
|
166 |
+
|
167 |
+
starts = (np.array(anchor_pairs[0])*self.sr/self.hop_length).astype(int)
|
168 |
+
ends = (np.array(anchor_pairs[1])*self.sr/self.hop_length).astype(int)
|
169 |
+
|
170 |
+
wp = sync_via_mrmsdtw_with_anchors(f_chroma1=audio['note'].T[:, starts[0]:ends[0]],
|
171 |
+
f_onset1=np.hstack([ALPHA * audio['onset'],
|
172 |
+
(1 - ALPHA) * audio['offset']]).T[:, starts[0]:ends[0]],
|
173 |
+
f_chroma2=midi['note'].T[:, starts[1]:ends[1]],
|
174 |
+
f_onset2=np.hstack([ALPHA * midi['onset'],
|
175 |
+
(1 - ALPHA) * midi['offset']]).T[:, starts[1]:ends[1]],
|
176 |
+
input_feature_rate=self.sr/self.hop_length,
|
177 |
+
step_weights=np.array([1.5, 1.5, 2.0]),
|
178 |
+
threshold_rec=10 ** 6,
|
179 |
+
verbose=debug, normalize_chroma=False,
|
180 |
+
anchor_pairs=None)
|
181 |
+
wp = make_path_strictly_monotonic(wp).astype(int)
|
182 |
+
wp[0] += starts[0]
|
183 |
+
wp[1] += starts[1]
|
184 |
+
wp = np.hstack((wp, ends[:,np.newaxis]))
|
185 |
+
|
186 |
+
audio_time = np.take(audio['time'], wp[0])
|
187 |
+
midi_time = np.take(midi['time'], wp[1])
|
188 |
+
|
189 |
+
notes = []
|
190 |
+
for instrument in midi['midi'].instruments:
|
191 |
+
for note in instrument.notes:
|
192 |
+
note.start = np.interp(note.start, midi_time, audio_time)
|
193 |
+
note.end = np.interp(note.end, midi_time, audio_time)
|
194 |
+
|
195 |
+
if note.end - note.start <= 0.012: # notes should be at least 12 ms (i.e. 2 frames)
|
196 |
+
note.start = note.start - 0.003
|
197 |
+
note.end = note.start + 0.012
|
198 |
+
|
199 |
+
if include_velocity: # encode the note confidence in place of velocity
|
200 |
+
velocity = np.median(audio['note'][np.argmin(np.abs(audio['time']-note.start)):
|
201 |
+
np.argmin(np.abs(audio['time']-note.end)),
|
202 |
+
note.pitch-self.labeling.midi_centers[0]])
|
203 |
+
|
204 |
+
note.velocity = max(1, velocity*127) # velocity should be at least 1 otherwise midi removes the note
|
205 |
+
else:
|
206 |
+
velocity = note.velocity/127
|
207 |
+
notes.append((note.start, note.end, note.pitch, velocity))
|
208 |
+
return notes, midi
|
209 |
+
|
210 |
+
@staticmethod
|
211 |
+
def pad_representations(dict_of_representations, pad_length=10):
|
212 |
+
"""
|
213 |
+
Pad the representations so that the DTW does not enforce them to encompass the entire duration.
|
214 |
+
Args:
|
215 |
+
dict_of_representations: audio or midi representations
|
216 |
+
pad_length: how many frames to pad
|
217 |
+
|
218 |
+
Returns:
|
219 |
+
padded representations
|
220 |
+
"""
|
221 |
+
for key, value in dict_of_representations.items():
|
222 |
+
if key == 'time':
|
223 |
+
padded_time = dict_of_representations[key]
|
224 |
+
padded_time = np.concatenate([padded_time[:2*pad_length], padded_time+padded_time[2*pad_length]])
|
225 |
+
dict_of_representations[key] = padded_time - padded_time[pad_length] # this is to ensure that the
|
226 |
+
# first frame times are negative until the real zero time
|
227 |
+
elif key in ['onset', 'offset', 'note']:
|
228 |
+
dict_of_representations[key] = np.pad(value, ((pad_length, pad_length), (0, 0)))
|
229 |
+
elif key in ['start_anchor', 'end_anchor']:
|
230 |
+
anchor_time = dict_of_representations[key][0][0]
|
231 |
+
anchor_time = np.argmin(np.abs(dict_of_representations['time'] - anchor_time))
|
232 |
+
dict_of_representations[key][:,0] = anchor_time
|
233 |
+
dict_of_representations[key] = dict_of_representations[key].astype(int)  # np.int is removed in modern NumPy
|
234 |
+
return dict_of_representations
|
235 |
+
|
236 |
+
def prepare_for_synchronization(self, audio, midi, feature_rate=44100/256, pad_length=100):
|
237 |
+
"""
|
238 |
+
MrMsDTW works better with start and end anchors. This function finds the start and end anchors for audio
|
239 |
+
based on the midi notes. It also pads the MIDI representations since MIDI files most often start with an active
|
240 |
+
note and end with an active note. Thus, the DTW will try to align the active notes to the entire duration of the
|
241 |
+
audio. This is not desirable. Therefore, we pad the MIDI representations with a few frames of silence at the
|
242 |
+
beginning and end of the audio. This way, the DTW will not try to align the active notes to the entire duration.
|
243 |
+
Args:
|
244 |
+
audio:
|
245 |
+
midi:
|
246 |
+
feature_rate:
|
247 |
+
pad_length:
|
248 |
+
|
249 |
+
Returns:
|
250 |
+
|
251 |
+
"""
|
252 |
+
# first pad the MIDI
|
253 |
+
midi = self.pad_representations(midi, pad_length)
|
254 |
+
|
255 |
+
# sometimes f0s are more reliable than the notes. So, we use both the f0s and the notes together to find the
|
256 |
+
# start and end anchors. f0 lookup bins is the number of bins to look around the f0 to assign a note to it.
|
257 |
+
f0_lookup_bins = int(100//(2*self.labeling.f0_granularity_c))
|
258 |
+
|
259 |
+
# find the start anchor for the audio
|
260 |
+
# first decide on which notes to use for the start anchor (take the entire chord where the MIDI file starts)
|
261 |
+
anchor_notes = midi['start_anchor'][:, 1] - self.labeling.midi_centers[0]
|
262 |
+
# now find which f0 bins to look at for the start anchor
|
263 |
+
anchor_f0s = [self.midi_pitch_to_contour_bin(an+self.labeling.midi_centers[0]) for an in anchor_notes]
|
264 |
+
anchor_f0s = np.array([list(range(f0-f0_lookup_bins, f0+f0_lookup_bins+1)) for f0 in anchor_f0s]).reshape(-1)
|
265 |
+
# first start anchor proposals come from the notes
|
266 |
+
anchor_vals = np.any(audio['note'][:, anchor_notes]>0.5, axis=1)
|
267 |
+
# now the f0s
|
268 |
+
anchor_vals_f0 = np.any(audio['f0'][:, anchor_f0s]>0.5, axis=1)
|
269 |
+
# combine the two
|
270 |
+
anchor_vals = np.logical_or(anchor_vals, anchor_vals_f0)
|
271 |
+
if not any(anchor_vals):
|
272 |
+
return 'corrupted' # do not consider the file if we cannot find the start anchor
|
273 |
+
audio_start = np.argmax(anchor_vals)
|
274 |
+
|
275 |
+
# now the end anchor (most string instruments use chords in cadences: in general the end anchor is polyphonic)
|
276 |
+
anchor_notes = midi['end_anchor'][:, 1] - self.labeling.midi_centers[0]
|
277 |
+
anchor_f0s = [self.midi_pitch_to_contour_bin(an+self.labeling.midi_centers[0]) for an in anchor_notes]
|
278 |
+
anchor_f0s = np.array([list(range(f0-f0_lookup_bins, f0+f0_lookup_bins+1)) for f0 in anchor_f0s]).reshape(-1)
|
279 |
+
# the same procedure as above
|
280 |
+
anchor_vals = np.any(audio['note'][::-1, anchor_notes]>0.5, axis=1)
|
281 |
+
anchor_vals_f0 = np.any(audio['f0'][::-1, anchor_f0s]>0.5, axis=1)
|
282 |
+
anchor_vals = np.logical_or(anchor_vals, anchor_vals_f0)
|
283 |
+
if not any(anchor_vals):
|
284 |
+
return 'corrupted' # do not consider the file if we cannot find the end anchor
|
285 |
+
audio_end = audio['note'].shape[0] - np.argmax(anchor_vals)
|
286 |
+
|
287 |
+
if audio_end - audio_start < (midi['end_anchor'][0][0] - midi['start_anchor'][0][0])/10: # no one plays x10 faster
|
288 |
+
return 'corrupted' # do not consider the file if the interval between anchors is too short
|
289 |
+
anchor_pairs = [(audio_start - 5, midi['start_anchor'][0][0] - 5),
|
290 |
+
(audio_end + 5, midi['end_anchor'][0][0] + 5)]
|
291 |
+
|
292 |
+
if anchor_pairs[0][0] < 1:
|
293 |
+
anchor_pairs[0] = (1, midi['start_anchor'][0][0])
|
294 |
+
if anchor_pairs[1][0] > audio['note'].shape[0] - 1:
|
295 |
+
anchor_pairs[1] = (audio['note'].shape[0] - 1, midi['end_anchor'][0][0])
|
296 |
+
|
297 |
+
return audio, midi, [(anchor_pairs[0][0]/feature_rate, anchor_pairs[0][1]/feature_rate),
|
298 |
+
(anchor_pairs[1][0]/feature_rate, anchor_pairs[1][1]/feature_rate)]
|
299 |
+
|
musc/transcriber.py
ADDED
@@ -0,0 +1,163 @@
1 |
+
from collections import defaultdict
|
2 |
+
from typing import DefaultDict, Dict, List, Optional, Tuple
|
3 |
+
import pretty_midi
|
4 |
+
import numpy as np
|
5 |
+
from .postprocessing import RegressionPostProcessor, spotify_create_notes
|
6 |
+
from .pitch_estimator import PitchEstimator
|
7 |
+
|
8 |
+
|
9 |
+
class Transcriber(PitchEstimator):
|
10 |
+
def __init__(self, labeling, instrument='Violin', sr=16000, window_size=1024, hop_length=160):
|
11 |
+
super().__init__(labeling, instrument=instrument, sr=sr, window_size=window_size, hop_length=hop_length)
|
12 |
+
|
13 |
+
def transcribe(self, audio, batch_size=128, postprocessing='spotify', include_pitch_bends=True, to_midi=True,
|
14 |
+
debug=False):
|
15 |
+
"""
|
16 |
+
Transcribe an audio file or mono waveform in numpy or torch into MIDI with pitch bends.
|
17 |
+
:param audio: str, pathlib.Path, np.ndarray, or torch.Tensor
|
18 |
+
:param batch_size: frames to process at once
|
19 |
+
:param postprocessing: note creation method. 'spotify'(default) or 'tiktok'
|
20 |
+
:param include_pitch_bends: whether to include pitch bends in the MIDI file
|
21 |
+
:param to_midi: whether to return a MIDI file or a list of note events (as tuple)
|
22 |
+
:return: transcribed MIDI file as a pretty_midi.PrettyMIDI object
|
23 |
+
"""
|
24 |
+
out = self.predict(audio, batch_size)
|
25 |
+
if debug:
|
26 |
+
import matplotlib.pyplot as plt
|
27 |
+
plt.imshow(out['f0'].T, aspect='auto', origin='lower')
|
28 |
+
plt.show()
|
29 |
+
plt.imshow(out['note'].T, aspect='auto', origin='lower')
|
30 |
+
plt.show()
|
31 |
+
|
32 |
+
plt.imshow(out['onset'].T, aspect='auto', origin='lower')
|
33 |
+
plt.show()
|
34 |
+
|
35 |
+
plt.imshow(out['offset'].T, aspect='auto', origin='lower')
|
36 |
+
plt.show()
|
37 |
+
|
38 |
+
if to_midi:
|
39 |
+
return self.out2midi(out, postprocessing, include_pitch_bends)
|
40 |
+
else:
|
41 |
+
return self.out2note(out, postprocessing, include_pitch_bends)
|
42 |
+
|
43 |
+
|
44 |
+
|
45 |
+
def out2note(self, output: Dict[str, np.array], postprocessing='spotify',
|
46 |
+
include_pitch_bends: bool = True,
|
47 |
+
) -> List[Tuple[float, float, int, float, Optional[List[int]]]]:
|
48 |
+
"""Convert model output to notes
|
49 |
+
"""
|
50 |
+
if postprocessing == 'spotify':
|
51 |
+
estimated_notes = spotify_create_notes(
|
52 |
+
output["note"],
|
53 |
+
output["onset"],
|
54 |
+
note_low=self.labeling.midi_centers[0],
|
55 |
+
note_high=self.labeling.midi_centers[-1],
|
56 |
+
onset_thresh=0.5,
|
57 |
+
frame_thresh=0.3,
|
58 |
+
infer_onsets=True,
|
59 |
+
min_note_len=int(np.round(127.70 / 1000 * (self.sr / self.hop_length))), #127.70
|
60 |
+
melodia_trick=True,
|
61 |
+
)
|
62 |
+
|
63 |
+
if postprocessing == 'rebab':
|
64 |
+
estimated_notes = spotify_create_notes(
|
65 |
+
output["note"],
|
66 |
+
output["onset"],
|
67 |
+
note_low=self.labeling.midi_centers[0],
|
68 |
+
note_high=self.labeling.midi_centers[-1],
|
69 |
+
onset_thresh=0.2,
|
70 |
+
frame_thresh=0.2,
|
71 |
+
infer_onsets=True,
|
72 |
+
min_note_len=int(np.round(127.70 / 1000 * (self.sr / self.hop_length))), #127.70
|
73 |
+
melodia_trick=True,
|
74 |
+
)
|
75 |
+
|
76 |
+
|
77 |
+
elif postprocessing == 'tiktok':
|
78 |
+
postprocessor = RegressionPostProcessor(
|
79 |
+
frames_per_second=self.sr / self.hop_length,
|
80 |
+
classes_num=self.labeling.midi_centers.shape[0],
|
81 |
+
begin_note=self.labeling.midi_centers[0],
|
82 |
+
onset_threshold=0.2,
|
83 |
+
offset_threshold=0.2,
|
84 |
+
frame_threshold=0.3,
|
85 |
+
pedal_offset_threshold=0.5,
|
86 |
+
)
|
87 |
+
tiktok_note_dict, _ = postprocessor.output_dict_to_midi_events(output)
|
88 |
+
estimated_notes = []
|
89 |
+
for list_item in tiktok_note_dict:
|
90 |
+
if list_item['offset_time'] > 0.6 + list_item['onset_time']:
|
91 |
+
estimated_notes.append((int(np.floor(list_item['onset_time']/(output['time'][1]))),
|
92 |
+
int(np.ceil(list_item['offset_time']/(output['time'][1]))),
|
93 |
+
list_item['midi_note'], list_item['velocity']/128))
|
94 |
+
if include_pitch_bends:
|
95 |
+
estimated_notes_with_pitch_bend = self.get_pitch_bends(output["f0"], estimated_notes)
|
96 |
+
else:
|
97 |
+
estimated_notes_with_pitch_bend = [(note[0], note[1], note[2], note[3], None) for note in estimated_notes]
|
98 |
+
|
99 |
+
times_s = output['time']
|
100 |
+
estimated_notes_time_seconds = [
|
101 |
+
(times_s[note[0]], times_s[note[1]], note[2], note[3], note[4]) for note in estimated_notes_with_pitch_bend
|
102 |
+
]
|
103 |
+
|
104 |
+
return estimated_notes_time_seconds
|
105 |
+
|
106 |
+
|
107 |
+
def out2midi(self, output: Dict[str, np.array], postprocessing: str = 'spotify', include_pitch_bends: bool = True,
|
108 |
+
) -> pretty_midi.PrettyMIDI:
|
109 |
+
"""Convert model output to MIDI
|
110 |
+
Args:
|
111 |
+
output: A dictionary with shape
|
112 |
+
{
|
113 |
+
'frame': array of shape (n_times, n_freqs),
|
114 |
+
'onset': array of shape (n_times, n_freqs),
|
115 |
+
'contour': array of shape (n_times, 3*n_freqs)
|
116 |
+
}
|
117 |
+
representing the output of the basic pitch model.
|
118 |
+
postprocessing: spotify or tiktok postprocessing.
|
119 |
+
include_pitch_bends: If True, include pitch bends.
|
120 |
+
Returns:
|
121 |
+
note_events: A list of note event tuples (start_time_s, end_time_s, pitch_midi, amplitude)
|
122 |
+
"""
|
123 |
+
estimated_notes_time_seconds = self.out2note(output, postprocessing, include_pitch_bends)
|
124 |
+
midi_tempo = 120 # todo: infer tempo from the onsets
|
125 |
+
return self.note2midi(estimated_notes_time_seconds, midi_tempo)
|
126 |
+
|
127 |
+
|
128 |
+
def note2midi(
|
129 |
+
self, note_events_with_pitch_bends: List[Tuple[float, float, int, float, Optional[List[int]]]],
|
130 |
+
midi_tempo: float = 120,
|
131 |
+
) -> pretty_midi.PrettyMIDI:
|
132 |
+
"""Create a pretty_midi object from note events
|
133 |
+
:param note_events_with_pitch_bends: list of tuples
|
134 |
+
[(start_time_seconds, end_time_seconds, pitch_midi, amplitude)]
|
135 |
+
:param midi_tempo: #todo: infer tempo from the onsets
|
136 |
+
:return: transcribed MIDI file as a pretty_midi.PrettyMIDI object
|
137 |
+
"""
|
138 |
+
mid = pretty_midi.PrettyMIDI(initial_tempo=midi_tempo)
|
139 |
+
|
140 |
+
program = pretty_midi.instrument_name_to_program(self.instrument)
|
141 |
+
instruments: DefaultDict[int, pretty_midi.Instrument] = defaultdict(
|
142 |
+
lambda: pretty_midi.Instrument(program=program)
|
143 |
+
)
|
144 |
+
for start_time, end_time, note_number, amplitude, pitch_bend in note_events_with_pitch_bends:
|
145 |
+
instrument = instruments[note_number]
|
146 |
+
note = pretty_midi.Note(
|
147 |
+
velocity=int(np.round(127 * amplitude)),
|
148 |
+
pitch=note_number,
|
149 |
+
start=start_time,
|
150 |
+
end=end_time,
|
151 |
+
)
|
152 |
+
instrument.notes.append(note)
|
153 |
+
if not isinstance(pitch_bend, np.ndarray):
|
154 |
+
continue
|
155 |
+
pitch_bend_times = np.linspace(start_time, end_time, len(pitch_bend))
|
156 |
+
|
157 |
+
for pb_time, pb_midi in zip(pitch_bend_times, pitch_bend):
|
158 |
+
instrument.pitch_bends.append(pretty_midi.PitchBend(pb_midi, pb_time))
|
159 |
+
|
160 |
+
mid.instruments.extend(instruments.values())
|
161 |
+
|
162 |
+
return mid
|
163 |
+
|
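
A sketch of the two call paths transcribe exposes, again assuming `transcriber` is an already-built Transcriber (or subclass) carrying trained weights; the file path and the 120 bpm tempo are placeholders.

def transcribe_to_midi(transcriber, audio_path, out_path="performance.mid"):
    # transcriber: an already-built Transcriber (or subclass) with trained weights (assumption)
    note_events = transcriber.transcribe(
        audio_path,
        postprocessing="spotify",   # or "tiktok" to route through RegressionPostProcessor
        include_pitch_bends=True,
        to_midi=False,              # returns (start_s, end_s, midi_pitch, amplitude, pitch_bends) tuples
    )
    midi = transcriber.note2midi(note_events, midi_tempo=120)  # tempo is a placeholder
    midi.write(out_path)
    return note_events
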
requirements.txt
ADDED
@@ -0,0 +1,9 @@
1 |
+
torch
|
2 |
+
mir_eval
|
3 |
+
pretty_midi
|
4 |
+
torchaudio
|
5 |
+
scipy
|
6 |
+
numba
|
7 |
+
librosa
|
8 |
+
matplotlib
|
9 |
+
mido
|
violin.json
ADDED
@@ -0,0 +1,17 @@
1 |
+
{
|
2 |
+
"wiring": "parallel",
|
3 |
+
"sampling_rate": 44100,
|
4 |
+
"pathway_multiscale": 4,
|
5 |
+
"num_pathway_layers": 2,
|
6 |
+
"num_separator_layers": 16,
|
7 |
+
"num_representation_layers": 4,
|
8 |
+
"hop_length": 256,
|
9 |
+
"chunk_size": 512,
|
10 |
+
"minSNR": -32,
|
11 |
+
"maxSNR": 96,
|
12 |
+
"note_low": "F#3",
|
13 |
+
"note_high": "E8",
|
14 |
+
"f0_bins_per_semitone": 10,
|
15 |
+
"f0_smooth_std_c": 12,
|
16 |
+
"onset_smooth_std": 0.7
|
17 |
+
}
|
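
The labeling-related fields of violin.json line up with the PerformanceLabel constructor from musc/representations.py. The mapping below is an assumption for illustration only; the actual wiring of the full config (wiring, pathway, separator, and representation layers) lives in musc/model.py and is not reproduced here.

import json
from musc.representations import PerformanceLabel

with open("violin.json") as fp:
    cfg = json.load(fp)

labeling = PerformanceLabel(
    note_min=cfg["note_low"],                   # "F#3"
    note_max=cfg["note_high"],                  # "E8"
    f0_bins_per_semitone=cfg["f0_bins_per_semitone"],
    f0_smooth_std_c=cfg["f0_smooth_std_c"],
    onset_smooth_std=cfg["onset_smooth_std"],
)
print(len(labeling.midi_centers), labeling.f0_n_bins)   # note and f0 class counts
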
violin_model.pt
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a913356f059be6dc930be41158ac864f7d5511889ef0b2a6b6ba75a4a8732750
|
3 |
+
size 218770231
|