shethjenil committed on
Commit 5cc486d (verified) · 1 Parent(s): c274ee1

Upload folder using huggingface_hub

app.py ADDED
@@ -0,0 +1,42 @@
1
+ import gradio as gr
2
+ from musc.model import PretrainedModel
3
+ from json import load as json_load
4
+ from mido import MidiFile,MidiTrack
5
+ from os import remove as os_remove
6
+ Model = PretrainedModel(json_load(open("violin.json")),"violin_model.pt").to("cpu")
7
+ def merge_violin_tracks(input_midi, output_midi):
8
+ mid = MidiFile(input_midi)
9
+ new_mid = MidiFile(ticks_per_beat=mid.ticks_per_beat)
10
+ new_track = MidiTrack()
11
+ new_mid.tracks.append(new_track)
12
+ events = []
13
+ for track in mid.tracks:
14
+ current_time = 0
15
+ for msg in track:
16
+ current_time += msg.time
17
+ events.append((current_time, msg))
18
+ events.sort(key=lambda x: x[0])
19
+ last_time = 0
20
+ for event_time, msg in events:
21
+ delta_time = event_time - last_time
22
+ new_track.append(msg.copy(time=delta_time))
23
+ last_time = event_time
24
+ for track in mid.tracks:
25
+ for msg in track:
26
+ if msg.type == 'set_tempo':
27
+ new_track.insert(0, msg)
28
+ new_mid.save(output_midi)
29
+
30
+ def transcribe_and_generate_midi(music_file_path, model=Model, batch_size=32):
31
+ model.transcribe(music_file_path, batch_size=batch_size).write("output.midi")
32
+ merge_violin_tracks("output.midi","output.midi")
33
+ os_remove(music_file_path)
34
+ return "output.midi"
35
+
36
+ gr.Interface(
37
+ fn=transcribe_and_generate_midi,
38
+ inputs=gr.Audio(label="Upload your Audio file",type="filepath"),
39
+ outputs=gr.File(label="Download MIDI file"),
40
+ title="Audio2Violin",
41
+ description="Upload an audio file, and it will be transcribed into violin MIDI format."
42
+ ).launch()
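A quick local check of merge_violin_tracks (not part of the commit). This sketch assumes the function above has been copied into the current session, since importing app.py directly would also load the model and launch the interface; the pitches and file names are arbitrary.

from mido import Message, MetaMessage, MidiFile, MidiTrack

# Build a tiny two-track MIDI file to merge (arbitrary notes on separate tracks).
src = MidiFile()
for pitch in (64, 67):
    track = MidiTrack()
    track.append(MetaMessage('set_tempo', tempo=500000, time=0))
    track.append(Message('note_on', note=pitch, velocity=80, time=0))
    track.append(Message('note_off', note=pitch, velocity=0, time=480))
    src.tracks.append(track)
src.save("two_tracks.mid")

# merge_violin_tracks is assumed to be defined exactly as in app.py above.
merge_violin_tracks("two_tracks.mid", "merged.mid")
print(len(MidiFile("merged.mid").tracks))  # expected: 1 (single merged track)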
musc/__init__.py ADDED
File without changes
musc/dtw/__init__.py ADDED
File without changes
musc/dtw/anchor.py ADDED
@@ -0,0 +1,147 @@
1
+ from numba import jit
2
+ import numpy as np
3
+ from typing import Tuple
4
+
5
+
6
+ def project_alignment_on_a_new_feature_rate(alignment: np.ndarray,
7
+ feature_rate_old: int,
8
+ feature_rate_new: int,
9
+ cost_matrix_size_old: tuple = (),
10
+ cost_matrix_size_new: tuple = ()) -> np.ndarray:
11
+ """Projects an alignment computed for a cost matrix on a certain
12
+ feature resolution on a cost matrix having a different feature
13
+ resolution.
14
+
15
+ Parameters
16
+ ----------
17
+ alignment : np.ndarray [shape=(2, N)]
18
+ Alignment matrix
19
+
20
+ feature_rate_old : int
21
+ Feature rate of the old cost matrix
22
+
23
+ feature_rate_new : int
24
+ Feature rate of the new cost matrix
25
+
26
+ cost_matrix_size_old : tuple
27
+ Size of the old cost matrix. Possibly needed to deal with border cases
28
+
29
+ cost_matrix_size_new : tuple
30
+ Size of the new cost matrix. Possibly needed to deal with border cases
31
+
32
+ Returns
33
+ -------
34
+ np.ndarray [shape=(2, N)]
35
+ Anchor sequence for the new cost matrix
36
+ """
37
+ # Project the alignment on the new feature rate
38
+ fac = feature_rate_new / feature_rate_old
39
+ anchors = np.round(alignment * fac) + 1
40
+
41
+ # In case the sizes of the cost matrices are given explicitly and the
42
+ # alignment specifies to align the first and last elements, handle this case
43
+ # separately since this might cause problems in the general projection
44
+ # procedure.
45
+ if cost_matrix_size_old is not None and cost_matrix_size_new is not None:
46
+ if np.array_equal(alignment[:, 0], np.array([0, 0])):
47
+ anchors[:, 0] = np.array([1, 1])
48
+
49
+ if np.array_equal(alignment[:, -1], np.array(cost_matrix_size_old) - 1):
50
+ anchors[:, -1] = np.array(cost_matrix_size_new)
51
+
52
+ return anchors - 1
53
+
54
+
55
+ def derive_anchors_from_projected_alignment(projected_alignment: np.ndarray,
56
+ threshold: int) -> np.ndarray:
57
+ """Derive anchors from a projected alignment such that the area of the rectangle
58
+ defined by two subsequent anchors a1 and a2 is below a given threshold.
59
+
60
+ Parameters
61
+ ----------
62
+ projected_alignment : np.ndarray [shape=(2, N)]
63
+ Projected alignment array
64
+
65
+ threshold : int
66
+ Maximum area of the constraint rectangle
67
+
68
+ Returns
69
+ -------
70
+ anchors_res : np.ndarray [shape=(2, M)]
71
+ Resulting anchor sequence
72
+ """
73
+ L = projected_alignment.shape[1]
74
+
75
+ a1 = np.array(projected_alignment[:, 0], copy=True).reshape(-1, 1)
76
+ a2 = np.array(projected_alignment[:, -1], copy=True).reshape(-1, 1)
77
+
78
+ if __compute_area(a1, a2) <= threshold:
79
+ anchors_res = np.concatenate([a1, a2], axis=1)
80
+ elif L > 2:
81
+ center = int(np.floor(L/2 + 1))
82
+
83
+ a1 = np.array(projected_alignment[:, 0], copy=True).reshape(-1, 1)
84
+ a2 = np.array(projected_alignment[:, center - 1], copy=True).reshape(-1, 1)
85
+ a3 = np.array(projected_alignment[:, -1], copy=True).reshape(-1, 1)
86
+
87
+ if __compute_area(a1, a2) > threshold:
88
+ anchors_1 = derive_anchors_from_projected_alignment(projected_alignment[:, 0:center], threshold)
89
+ else:
90
+ anchors_1 = np.concatenate([a1, a2], axis=1)
91
+
92
+ if __compute_area(a2, a3) > threshold:
93
+ anchors_2 = derive_anchors_from_projected_alignment(projected_alignment[:, center - 1:], threshold)
94
+ else:
95
+ anchors_2 = np.concatenate([a2, a3], axis=1)
96
+
97
+ anchors_res = np.concatenate([anchors_1, anchors_2[:, 1:]], axis=1)
98
+ else:
99
+ if __compute_area(a1, a2) > threshold:
100
+ print('Only two anchor points are given which do not fulfill the constraint.')
101
+ anchors_res = np.concatenate([a1, a2], axis=1)
102
+
103
+ return anchors_res
104
+
105
+
106
+ def derive_neighboring_anchors(warping_path: np.ndarray,
107
+ anchor_indices: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
108
+ """Compute anchor points in the neighborhood of previous anchor points.
109
+
110
+ Parameters
111
+ ----------
112
+ warping_path : np.ndarray [shape=(2, N)]
113
+ Warping path
114
+
115
+ anchor_indices : np.ndarray
116
+ Indices corresponding to the anchor points in the ``warping_path``
117
+
118
+ Returns
119
+ -------
120
+ neighboring_anchors : np.ndarray [shape=(2, N-1)]
121
+ Sequence of neighboring anchors
122
+
123
+ neighboring_anchor_indices : np.ndarray
124
+ Indices into ``warping_path`` corresponding to ``neighboring_anchors``
125
+ """
126
+ L = anchor_indices.shape[0]
127
+ neighboring_anchor_indices = np.zeros(L-1, dtype=int)
128
+ neighboring_anchors = np.zeros((2, L-1), dtype=int)
129
+
130
+ for k in range(1, L):
131
+ i1 = anchor_indices[k-1]
132
+ i2 = anchor_indices[k]
133
+
134
+ neighboring_anchor_indices[k-1] = i1 + np.floor((i2 - i1) / 2)
135
+ neighboring_anchors[:, k-1] = warping_path[:, neighboring_anchor_indices[k - 1]]
136
+
137
+ return neighboring_anchors, neighboring_anchor_indices
138
+
139
+
140
+ @jit(nopython=True)
141
+ def __compute_area(a: tuple,
142
+ b: tuple):
143
+ """Computes the area between two points, given as tuples"""
144
+ return (b[0] - a[0] + 1) * (b[1] - a[1] + 1)
145
+
146
+
147
+
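A small illustration of how the two helpers above fit together (not part of the commit). The alignment, feature rates, and threshold are made-up toy values; the import path follows this repo's musc/dtw layout.

import numpy as np
from musc.dtw.anchor import (derive_anchors_from_projected_alignment,
                             project_alignment_on_a_new_feature_rate)

# A coarse diagonal alignment of two 5-frame sequences at 10 Hz ...
alignment = np.array([[0, 1, 2, 3, 4],
                      [0, 1, 2, 3, 4]])

# ... projected onto a 50 Hz cost matrix of size 21 x 21.
projected = project_alignment_on_a_new_feature_rate(alignment,
                                                    feature_rate_old=10,
                                                    feature_rate_new=50,
                                                    cost_matrix_size_old=(5, 5),
                                                    cost_matrix_size_new=(21, 21))

# Subdivide until each anchor-to-anchor rectangle spans at most 100 cells.
anchors = derive_anchors_from_projected_alignment(projected, threshold=100)
print(projected.astype(int))  # projected path indices at the finer feature rate
print(anchors.astype(int))    # consecutive anchor pairs now span areas of at most 100 cells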
musc/dtw/core.py ADDED
@@ -0,0 +1,205 @@
1
+ import librosa
2
+ from numba import jit
3
+ import numpy as np
4
+
5
+
6
+ @jit(nopython=True, cache=True)
7
+ def __C_to_DE(C: np.ndarray = None,
8
+ dn: np.ndarray = np.array([1, 1, 0], np.int64),
9
+ dm: np.ndarray = np.array([1, 0, 1], np.int64),
10
+ dw: np.ndarray = np.array([1.0, 1.0, 1.0], np.float64),
11
+ sub_sequence: bool = False) -> tuple[np.ndarray, np.ndarray]:
12
+ """This function computes the accumulated cost matrix D and the step index
13
+ matrix E.
14
+
15
+ Parameters
16
+ ----------
17
+ C : np.ndarray (np.float32 / np.float64) [shape=(N, M)]
18
+ Cost matrix
19
+
20
+ dn : np.ndarray (np.int64) [shape=(1, S)]
21
+ Integer array defining valid steps (N direction of C), default: [1, 1, 0]
22
+
23
+ dm : np.ndarray (np.int64) [shape=(1, S)]
24
+ Integer array defining valid steps (M direction of C), default: [1, 0, 1]
25
+
26
+ dw : np.ndarray (np.float64) [shape=(1, S)]
27
+ Double array defining the weight of each step, default: [1.0, 1.0, 1.0]
28
+
29
+ sub_sequence : bool
30
+ Set `True` for SubSequence DTW, default: False
31
+
32
+ Returns
33
+ -------
34
+ D : np.ndarray (np.float64) [shape=(N, M)]
35
+ Accumulated cost matrix of type double
36
+
37
+ E : np.ndarray (np.int64) [shape=(N, M)]
38
+ Step index matrix.
39
+ E[n, m] holds the index of the step taken to determine the value of D[n, m].
40
+ If E[n, m] is zero, no valid step was possible.
41
+ NaNs in the cost matrix are preserved, invalid fields in the cost matrix are NaNs.
42
+ """
43
+ if C is None:
44
+ raise ValueError('C must be a 2D numpy array.')
45
+
46
+ N, M = C.shape
47
+ S = dn.size
48
+
49
+ if S != dm.size or S != dw.size:
50
+ raise ValueError('The parameters dn, dm, and dw must be of equal length.')
51
+
52
+ # calc bounding box size of steps
53
+ sbbn = np.max(dn)
54
+ sbbm = np.max(dm)
55
+
56
+ # initialize E
57
+ E = np.zeros((N, M), np.int64) - 1
58
+
59
+ # initialize extended D matrix
60
+ D = np.ones((sbbn + N, sbbm + M), np.float64) * np.inf
61
+
62
+ if sub_sequence:
63
+ for m in range(M):
64
+ D[sbbn, sbbm + m] = C[0, m]
65
+ else:
66
+ D[sbbn, sbbm] = C[0, 0]
67
+
68
+ # accumulate
69
+ for m in range(sbbm, M + sbbm):
70
+ for n in range(sbbn, N + sbbn):
71
+ for s in range(S):
72
+ cost = D[n - dn[s], m - dm[s]] + C[n - sbbn, m - sbbm] * dw[s]
73
+ if cost < D[n, m]:
74
+ D[n, m] = cost
75
+ E[n - sbbn, m - sbbm] = s
76
+
77
+ D = D[sbbn: N + sbbn, sbbm: M + sbbm]
78
+
79
+ return D, E
80
+
81
+
82
+ @jit(nopython=True, cache=True)
83
+ def __E_to_warping_path(E: np.ndarray,
84
+ dn: np.ndarray = np.array([1, 1, 0], np.int64),
85
+ dm: np.ndarray = np.array([1, 0, 1], np.int64),
86
+ sub_sequence: bool = False,
87
+ end_index: int = -1) -> np.ndarray:
88
+ """This function computes a warping path based on the provided matrix E
89
+ and the allowed steps.
90
+
91
+ Parameters
92
+ ----------
93
+ E : np.ndarray (np.int64) [shape=(N, M)]
94
+ Step index matrix
95
+
96
+ dn : np.ndarray (np.int64) [shape=(1, S)]
97
+ Integer array defining valid steps (N direction of C), default: [1, 1, 0]
98
+
99
+ dm : np.ndarray (np.int64) [shape=(1, S)]
100
+ Integer array defining valid steps (M direction of C), default: [1, 0, 1]
101
+
102
+ sub_sequence : bool
103
+ Set `True` for SubSequence DTW, default: False
104
+
105
+ end_index : int
106
+ In case of SubSequence DTW
107
+
108
+ Returns
109
+ -------
110
+ warping_path : np.ndarray (np.int64) [shape=(2, M)]
111
+ Resulting optimal warping path
112
+ """
113
+ N, M = E.shape
114
+
115
+ if not sub_sequence and end_index == -1:
116
+ end_index = M - 1
117
+
118
+ m = end_index
119
+ n = N - 1
120
+
121
+ warping_path = np.zeros((2, n + m + 1))
122
+
123
+ index = 0
124
+
125
+ def _loop(m, n, index):
126
+ warping_path[:, index] = np.array([n, m])
127
+ step_index = E[n, m]
128
+ m -= dm[step_index]
129
+ n -= dn[step_index]
130
+ index += 1
131
+ return m, n, index
132
+
133
+ if sub_sequence:
134
+ while n > 0:
135
+ m, n, index = _loop(m, n, index)
136
+ else:
137
+ while m > 0 or n > 0:
138
+ m, n, index = _loop(m, n, index)
139
+
140
+ warping_path[:, index] = np.array([n, m])
141
+ warping_path = warping_path[:, index::-1]
142
+
143
+ return warping_path
144
+
145
+
146
+ def compute_warping_path(C: np.ndarray,
147
+ step_sizes: np.ndarray = np.array([[1, 0], [0, 1], [1, 1]], np.int64),
148
+ step_weights: np.ndarray = np.array([1.0, 1.0, 1.0], np.float64),
149
+ implementation: str = 'synctoolbox'):
150
+ """Applies DTW on cost matrix C.
151
+
152
+ Parameters
153
+ ----------
154
+ C : np.ndarray (np.float32 / np.float64) [shape=(N, M)]
155
+ Cost matrix
156
+
157
+ step_sizes : np.ndarray (np.int64) [shape=(2, S)]
158
+ Array of step sizes
159
+
160
+ step_weights : np.ndarray (np.float64) [shape=(2, S)]
161
+ Array of step weights
162
+
163
+ implementation: str
164
+ Choose among ``synctoolbox`` and ``librosa``. (default: ``synctoolbox``)
165
+
166
+ Returns
167
+ -------
168
+ D : np.ndarray (np.float64) [shape=(N, M)]
169
+ Accumulated cost matrix
170
+
171
+ E : np.ndarray (np.int64) [shape=(N, M)]
172
+ Step index matrix
173
+
174
+ wp : np.ndarray (np.int64) [shape=(2, M)]
175
+ Warping path
176
+ """
177
+ if implementation == 'librosa':
178
+ D, wp, E = librosa.sequence.dtw(C=C,
179
+ step_sizes_sigma=step_sizes,
180
+ weights_add=np.array([0, 0, 0]),
181
+ weights_mul=step_weights,
182
+ return_steps=True,
183
+ subseq=False)
184
+ wp = wp[::-1].T
185
+
186
+ elif implementation == 'synctoolbox':
187
+ dn = step_sizes[:, 0]
188
+ dm = step_sizes[:, 1]
189
+
190
+ D, E = __C_to_DE(C,
191
+ dn=dn,
192
+ dm=dm,
193
+ dw=step_weights,
194
+ sub_sequence=False)
195
+
196
+ wp = __E_to_warping_path(E=E,
197
+ dn=dn,
198
+ dm=dm,
199
+ sub_sequence=False)
200
+
201
+ else:
202
+ raise NotImplementedError(f'No implementation found called {implementation}')
203
+
204
+ return D, E, wp
205
+
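A minimal sketch of compute_warping_path on a hand-made cost matrix (not part of the commit). The values are arbitrary and chosen so that the cheapest path is the main diagonal.

import numpy as np
from musc.dtw.core import compute_warping_path

C = np.array([[0.0, 1.0, 2.0],
              [1.0, 0.0, 1.0],
              [2.0, 1.0, 0.0]])

D, E, wp = compute_warping_path(C, implementation='synctoolbox')
print(D[-1, -1])  # accumulated cost of the optimal path (0.0 for this matrix)
print(wp)         # optimal path follows the diagonal: (0, 0), (1, 1), (2, 2)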
musc/dtw/cost.py ADDED
@@ -0,0 +1,79 @@
1
+ import numpy as np
2
+ from sklearn.metrics.pairwise import euclidean_distances
3
+
4
+ #@jit(nopython=True)
5
+ def cosine_distance(f1, f2, cos_meas_max=2.0, cos_meas_min=1.0):
6
+ """For all pairs of vectors f1' and f2' in f1 and f2, computes 1 - (f1.f2),
7
+ where '.' is the dot product, and rescales the results to lie in the
8
+ range [cos_meas_min, cos_meas_max].
9
+ Corresponds to regular cosine distance if f1' and f2' are normalized and
10
+ cos_meas_min==0.0 and cos_meas_max==1.0."""
11
+ return (1 - f1.T @ f2) * (cos_meas_max - cos_meas_min) + cos_meas_min
12
+
13
+
14
+ #@jit(nopython=True)
15
+ def euclidean_distance(f1, f2, l2_meas_max=1.0, l2_meas_min=0.0):
16
+ """Computes euclidean distances between the vectors in f1 and f2, and
17
+ rescales the results to lie in the range [l2_meas_min, l2_meas_max]."""
18
+
19
+ #S1 = np.zeros((f1.shape[1], f2.shape[1]))
20
+ #for n in range(f2.shape[1]):
21
+ # S1[:, n] = np.sqrt(np.sum((f1.T - f2[:, n]) ** 2, axis=1))
22
+ S1 = euclidean_distances(f1.T, f2.T)
23
+
24
+ return S1 * (l2_meas_max - l2_meas_min) + l2_meas_min
25
+
26
+
27
+ def compute_high_res_cost_matrix(f_chroma1: np.ndarray,
28
+ f_chroma2: np.ndarray,
29
+ f_onset1: np.ndarray,
30
+ f_onset2: np.ndarray,
31
+ weights: np.ndarray = np.array([1.0, 1.0]),
32
+ cos_meas_min: float = 1.0,
33
+ cos_meas_max: float = 2.0,
34
+ l2_meas_min: float = 0.0,
35
+ l2_meas_max: float = 1.0):
36
+ """Computes cost matrix of two sequences using two feature matrices
37
+ for each sequence. Cosine distance is used for the chroma sequences and
38
+ euclidean distance is used for the DLNCO sequences.
39
+
40
+ Parameters
41
+ ----------
42
+ f_chroma1 : np.ndarray [shape=(12, N)]
43
+ Chroma feature matrix of the first sequence (assumed to be normalized).
44
+
45
+ f_chroma2 : np.ndarray [shape=(12, M)]
46
+ Chroma feature matrix of the second sequence (assumed to be normalized).
47
+
48
+ f_onset1 : np.ndarray [shape=(12, N)]
49
+ DLNCO feature matrix of the first sequence
50
+
51
+ f_onset2 : np.ndarray [shape=(12, M)]
52
+ DLNCO feature matrix of the second sequence
53
+
54
+ weights : np.ndarray [shape=[2,]]
55
+ Weights array for the high-resolution cost computation.
56
+ weights[0] * cosine_distance + weights[1] * euclidean_distance
57
+
58
+ cos_meas_min : float
59
+ Cosine distances are shifted to be at least ``cos_meas_min``
60
+
61
+ cos_meas_max : float
62
+ Cosine distances are scaled to be at most ``cos_meas_max``
63
+
64
+ l2_meas_min : float
65
+ Euclidean distances are shifted to be at least ``l2_meas_min``
66
+
67
+ l2_meas_max : float
68
+ Euclidean distances are scaled to be at most ``l2_meas_max``
69
+
70
+ Returns
71
+ -------
72
+ C: np.ndarray [shape=(N, M)]
73
+ Cost matrix
74
+ """
75
+ cos_dis = cosine_distance(f_chroma1, f_chroma2, cos_meas_min=cos_meas_min, cos_meas_max=cos_meas_max)
76
+ euc_dis = euclidean_distance(f_onset1, f_onset2, l2_meas_min=l2_meas_min, l2_meas_max=l2_meas_max)
77
+
78
+ return weights[0] * cos_dis + weights[1] * euc_dis
79
+
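A sketch of the combined chroma/onset cost (not part of the commit). Random matrices stand in for normalized chroma and DLNCO onset features; weights=[alpha, 1 - alpha] mirrors how mrmsdtw.py calls this function.

import numpy as np
from musc.dtw.cost import compute_high_res_cost_matrix

rng = np.random.default_rng(0)
f_chroma1 = rng.random((12, 8))
f_chroma2 = rng.random((12, 6))
f_chroma1 /= np.linalg.norm(f_chroma1, axis=0)  # cosine term expects normalized chroma
f_chroma2 /= np.linalg.norm(f_chroma2, axis=0)
f_onset1 = rng.random((12, 8))                  # stand-in for DLNCO onset features
f_onset2 = rng.random((12, 6))

C = compute_high_res_cost_matrix(f_chroma1, f_chroma2, f_onset1, f_onset2,
                                 weights=np.array([0.5, 0.5]))
print(C.shape)  # (8, 6): one combined cost value per frame pair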
musc/dtw/mrmsdtw.py ADDED
@@ -0,0 +1,616 @@
1
+ from numba import jit
2
+ import numpy as np
3
+ import time
4
+ from typing import List, Tuple, Optional
5
+
6
+ from .anchor import derive_anchors_from_projected_alignment, derive_neighboring_anchors, \
7
+ project_alignment_on_a_new_feature_rate
8
+ from .utils import build_path_from_warping_paths, compute_cost_matrices_between_anchors, smooth_downsample_feature, normalize_feature, compute_warping_paths_from_cost_matrices, find_anchor_indices_in_warping_path
9
+ from .visualization import sync_visualize_step1, sync_visualize_step2
10
+
11
+
12
+
13
+ def sync_via_mrmsdtw_with_anchors(f_chroma1: np.ndarray,
14
+ f_chroma2: np.ndarray,
15
+ f_onset1: np.ndarray = None,
16
+ f_onset2: np.ndarray = None,
17
+ input_feature_rate: float = 50,
18
+ step_sizes: np.ndarray = np.array([[1, 0], [0, 1], [1, 1]], np.int32),
19
+ step_weights: np.ndarray = np.array([1.0, 1.0, 1.0], np.float64),
20
+ threshold_rec: int = 10000,
21
+ win_len_smooth: np.ndarray = np.array([201, 101, 21, 1]),
22
+ downsamp_smooth: np.ndarray = np.array([50, 25, 5, 1]),
23
+ verbose: bool = False,
24
+ dtw_implementation: str = 'synctoolbox',
25
+ normalize_chroma: bool = True,
26
+ chroma_norm_ord: int = 2,
27
+ chroma_norm_threshold: float = 0.001,
28
+ visualization_title: str = "MrMsDTW result",
29
+ anchor_pairs: List[Tuple] = None,
30
+ linear_inp_idx: List[int] = [],
31
+ alpha=0.5) -> np.ndarray:
32
+ """Compute memory-restricted multi-scale DTW (MrMsDTW) using chroma and (optionally) onset features.
33
+ MrMsDTW is performed on multiple levels that get progressively finer, with rectangular constraint
34
+ regions defined by the alignment found on the previous, coarser level.
35
+ If onset features are provided, these are used on the finest level in addition to chroma
36
+ to provide higher synchronization accuracy.
37
+
38
+ Parameters
39
+ ----------
40
+ f_chroma1 : np.ndarray [shape=(12, N)]
41
+ Chroma feature matrix of the first sequence
42
+
43
+ f_chroma2 : np.ndarray [shape=(12, M)]
44
+ Chroma feature matrix of the second sequence
45
+
46
+ f_onset1 : np.ndarray [shape=(L, N)]
47
+ Onset feature matrix of the first sequence (optional, default: None)
48
+
49
+ f_onset2 : np.ndarray [shape=(L, M)]
50
+ Onset feature matrix of the second sequence (optional, default: None)
51
+
52
+ input_feature_rate: int
53
+ Input feature rate of the chroma features (default: 50)
54
+
55
+ step_sizes: np.ndarray
56
+ DTW step sizes (default: np.array([[1, 0], [0, 1], [1, 1]]))
57
+
58
+ step_weights: np.ndarray
59
+ DTW step weights (np.array([1.0, 1.0, 1.0]))
60
+
61
+ threshold_rec: int
62
+ Defines the maximum area that is spanned by the rectangle of two
63
+ consecutive elements in the alignment (default: 10000)
64
+
65
+ win_len_smooth : np.ndarray
66
+ Window lengths for chroma feature smoothing (default: np.array([201, 101, 21, 1]))
67
+
68
+ downsamp_smooth : np.ndarray
69
+ Downsampling factors (default: np.array([50, 25, 5, 1]))
70
+
71
+ verbose : bool
72
+ Set `True` for visualization (default: False)
73
+
74
+ dtw_implementation : str
75
+ DTW implementation, librosa or synctoolbox (default: synctoolbox)
76
+
77
+ normalize_chroma : bool
78
+ Set `True` to normalize input chroma features after each downsampling
79
+ and smoothing operation.
80
+
81
+ chroma_norm_ord: int
82
+ Order of chroma normalization, relevant if ``normalize_chroma`` is True.
83
+ (default: 2)
84
+
85
+ chroma_norm_threshold: float
86
+ If the norm falls below threshold for a feature vector, then the
87
+ normalized feature vector is set to be the unit vector. Relevant, if
88
+ ``normalize_chroma`` is True (default: 0.001)
89
+
90
+ visualization_title : str
91
+ Title for the visualization plots. Only relevant if 'verbose' is True
92
+ (default: "MrMsDTW result")
93
+
94
+ anchor_pairs: List[Tuple]
95
+ Anchor pairs given in seconds. Note that
96
+ * (0, 0) and (<audio-len1>, <audio-len2>) are not allowed.
97
+ * Anchors must be monotonically increasing.
98
+
99
+ linear_inp_idx: List[int]
100
+ List of the indices of intervals created by anchor pairs, for which
101
+ MrMsDTW shouldn't be run, e.g., if the interval only involves silence.
102
+
103
+ 0 ap1 ap2 ap3
104
+ | | | |
105
+ | idx0 | idx1 | idx2 | idx3 OR idx-1
106
+ | | | |
107
+
108
+ Note that index -1 corresponds to the last interval, which begins with
109
+ the last anchor pair until the end of the audio files.
110
+
111
+ alpha: float
112
+ Coefficient for the Chroma cost matrix in the finest scale of the MrMsDTW algorithm.
113
+ C = alpha * C_Chroma + (1 - alpha) * C_act (default: 0.5)
114
+
115
+ Returns
116
+ -------
117
+ wp : np.ndarray [shape=(2, T)]
118
+ Resulting warping path which indicates synchronized indices.
119
+ """
120
+ if anchor_pairs is None:
121
+ wp = sync_via_mrmsdtw(f_chroma1=f_chroma1,
122
+ f_chroma2=f_chroma2,
123
+ f_onset1=f_onset1,
124
+ f_onset2=f_onset2,
125
+ input_feature_rate=input_feature_rate,
126
+ step_sizes=step_sizes,
127
+ step_weights=step_weights,
128
+ threshold_rec=threshold_rec,
129
+ win_len_smooth=win_len_smooth,
130
+ downsamp_smooth=downsamp_smooth,
131
+ verbose=verbose,
132
+ dtw_implementation=dtw_implementation,
133
+ normalize_chroma=normalize_chroma,
134
+ chroma_norm_ord=chroma_norm_ord,
135
+ chroma_norm_threshold=chroma_norm_threshold,
136
+ visualization_title=visualization_title,
137
+ alpha=alpha)
138
+ else:
139
+ # constant_intervals = [((0, x1), (0, y1), False),
140
+ # ((x1, x2), (y1, y2), True),
141
+ # ((x2, -1), (y2, -1), False)]
142
+ wp = None
143
+
144
+ if verbose:
145
+ print('Anchor points are given!')
146
+
147
+ __check_anchor_pairs(anchor_pairs, f_chroma1.shape[1], f_chroma2.shape[1], input_feature_rate)
148
+
149
+ # Add ending as the anchor point
150
+ anchor_pairs.append((-1, -1))
151
+
152
+ prev_a1 = 0
153
+ prev_a2 = 0
154
+
155
+ for idx, anchor_pair in enumerate(anchor_pairs):
156
+ cur_a1, cur_a2 = anchor_pair
157
+
158
+ # Split the features
159
+ f_chroma1_split, f_onset1_split, f_chroma2_split, f_onset2_split = __split_features(f_chroma1,
160
+ f_onset1,
161
+ f_chroma2,
162
+ f_onset2,
163
+ cur_a1,
164
+ cur_a2,
165
+ prev_a1,
166
+ prev_a2,
167
+ input_feature_rate)
168
+
169
+ if idx in linear_inp_idx or idx == len(anchor_pairs) - 1 and -1 in linear_inp_idx:
170
+ # Generate a diagonal warping path, if the algorithm is not supposed to be executed.
171
+ # A typical scenario is the silence breaks which are enclosed by two anchor points.
172
+ if verbose:
173
+ print('A diagonal warping path is generated for the interval \n\t Feature sequence 1: %.2f - %.2f'
174
+ '\n\t Feature sequence 2: %.2f - %.2f\n' % (prev_a1, cur_a1, prev_a2, cur_a2))
175
+ wp_cur = __diagonal_warping_path(f_chroma1_split, f_chroma2_split)
176
+
177
+ else:
178
+ if verbose:
179
+ if cur_a1 != -1 and cur_a2 != -1:
180
+ print('MrMsDTW is applied for the interval \n\t Feature sequence 1: %.2f - %.2f'
181
+ '\n\t Feature sequence 2: %.2f - %.2f\n' % (prev_a1, cur_a1, prev_a2, cur_a2))
182
+ else:
183
+ print('MrMsDTW is applied for the interval \n\t Feature sequence 1: %.2f - end'
184
+ '\n\t Feature sequence 2: %.2f - end\n' % (prev_a1, prev_a2))
185
+ wp_cur = sync_via_mrmsdtw(f_chroma1=f_chroma1_split,
186
+ f_chroma2=f_chroma2_split,
187
+ f_onset1=f_onset1_split,
188
+ f_onset2=f_onset2_split,
189
+ input_feature_rate=input_feature_rate,
190
+ step_sizes=step_sizes,
191
+ step_weights=step_weights,
192
+ threshold_rec=threshold_rec,
193
+ win_len_smooth=win_len_smooth,
194
+ downsamp_smooth=downsamp_smooth,
195
+ verbose=verbose,
196
+ dtw_implementation=dtw_implementation,
197
+ normalize_chroma=normalize_chroma,
198
+ chroma_norm_ord=chroma_norm_ord,
199
+ chroma_norm_threshold=chroma_norm_threshold,
200
+ alpha=alpha)
201
+
202
+ if wp is None:
203
+ wp = np.array(wp_cur, copy=True)
204
+
205
+ # Concatenate warping paths
206
+ else:
207
+ wp = np.concatenate([wp, wp_cur + wp[:, -1].reshape(2, 1) + 1], axis=1)
208
+
209
+ prev_a1 = cur_a1
210
+ prev_a2 = cur_a2
211
+
212
+ anchor_pairs.pop()
213
+
214
+ return wp
215
+
216
+
217
+ def sync_via_mrmsdtw(f_chroma1: np.ndarray,
218
+ f_chroma2: np.ndarray,
219
+ f_onset1: np.ndarray = None,
220
+ f_onset2: np.ndarray = None,
221
+ input_feature_rate: float = 50,
222
+ step_sizes: np.ndarray = np.array([[1, 0], [0, 1], [1, 1]], np.int32),
223
+ step_weights: np.ndarray = np.array([1.0, 1.0, 1.0], np.float64),
224
+ threshold_rec: int = 10000,
225
+ win_len_smooth: np.ndarray = np.array([201, 101, 21, 1]),
226
+ downsamp_smooth: np.ndarray = np.array([50, 25, 5, 1]),
227
+ verbose: bool = False,
228
+ dtw_implementation: str = 'synctoolbox',
229
+ normalize_chroma: bool = True,
230
+ chroma_norm_ord: int = 2,
231
+ chroma_norm_threshold: float = 0.001,
232
+ visualization_title: str = "MrMsDTW result",
233
+ alpha=0.5) -> np.ndarray:
234
+ """Compute memory-restricted multi-scale DTW (MrMsDTW) using chroma and (optionally) onset features.
235
+ MrMsDTW is performed on multiple levels that get progressively finer, with rectangular constraint
236
+ regions defined by the alignment found on the previous, coarser level.
237
+ If onset features are provided, these are used on the finest level in addition to chroma
238
+ to provide higher synchronization accuracy.
239
+
240
+ Parameters
241
+ ----------
242
+ f_chroma1 : np.ndarray [shape=(12, N)]
243
+ Chroma feature matrix of the first sequence
244
+
245
+ f_chroma2 : np.ndarray [shape=(12, M)]
246
+ Chroma feature matrix of the second sequence
247
+
248
+ f_onset1 : np.ndarray [shape=(L, N)]
249
+ Onset feature matrix of the first sequence (optional, default: None)
250
+
251
+ f_onset2 : np.ndarray [shape=(L, M)]
252
+ Onset feature matrix of the second sequence (optional, default: None)
253
+
254
+ input_feature_rate: int
255
+ Input feature rate of the chroma features (default: 50)
256
+
257
+ step_sizes: np.ndarray
258
+ DTW step sizes (default: np.array([[1, 0], [0, 1], [1, 1]]))
259
+
260
+ step_weights: np.ndarray
261
+ DTW step weights (np.array([1.0, 1.0, 1.0]))
262
+
263
+ threshold_rec: int
264
+ Defines the maximum area that is spanned by the rectangle of two
265
+ consecutive elements in the alignment (default: 10000)
266
+
267
+ win_len_smooth : np.ndarray
268
+ Window lengths for chroma feature smoothing (default: np.array([201, 101, 21, 1]))
269
+
270
+ downsamp_smooth : np.ndarray
271
+ Downsampling factors (default: np.array([50, 25, 5, 1]))
272
+
273
+ verbose : bool
274
+ Set `True` for visualization (default: False)
275
+
276
+ dtw_implementation : str
277
+ DTW implementation, librosa or synctoolbox (default: synctoolbox)
278
+
279
+ normalize_chroma : bool
280
+ Set `True` to normalize input chroma features after each downsampling
281
+ and smoothing operation.
282
+
283
+ chroma_norm_ord: int
284
+ Order of chroma normalization, relevant if ``normalize_chroma`` is True.
285
+ (default: 2)
286
+
287
+ chroma_norm_threshold: float
288
+ If the norm falls below threshold for a feature vector, then the
289
+ normalized feature vector is set to be the unit vector. Relevant, if
290
+ ``normalize_chroma`` is True (default: 0.001)
291
+
292
+ visualization_title : str
293
+ Title for the visualization plots. Only relevant if 'verbose' is True
294
+ (default: "MrMsDTW result")
295
+
296
+ alpha: float
297
+ Coefficient for the Chroma cost matrix in the finest scale of the MrMsDTW algorithm.
298
+ C = alpha * C_Chroma + (1 - alpha) * C_act (default: 0.5)
299
+
300
+ Returns
301
+ -------
302
+ alignment: np.ndarray [shape=(2, T)]
303
+ Resulting warping path which indicates synchronized indices.
304
+ """
305
+ # If onset features are given as input, high resolution MrMsDTW is activated.
306
+ high_res = False
307
+ if f_onset1 is not None and f_onset2 is not None:
308
+ high_res = True
309
+
310
+ if high_res and (f_chroma1.shape[1] != f_onset1.shape[1] or f_chroma2.shape[1] != f_onset2.shape[1]):
311
+ raise ValueError('Chroma and onset features must be of the same length.')
312
+
313
+ if downsamp_smooth[-1] != 1 or win_len_smooth[-1] != 1:
314
+ raise ValueError('The downsampling factor of the last iteration must be equal to 1, i.e.'
315
+ ' at the last iteration, it is computed at the input feature rate!')
316
+
317
+ num_iterations = win_len_smooth.shape[0]
318
+ cost_matrix_size_old = tuple()
319
+ feature_rate_old = input_feature_rate / downsamp_smooth[0]
320
+ alignment = None
321
+ total_computation_time = 0.0
322
+
323
+ # If the area is less than the threshold_rec, don't apply the multiscale DTW.
324
+ it = (num_iterations - 1) if __compute_area(f_chroma1, f_chroma2) < threshold_rec else 0
325
+
326
+ while it < num_iterations:
327
+ tic1 = time.perf_counter()
328
+
329
+ # Smooth and downsample given raw features
330
+ f_chroma1_cur, _ = smooth_downsample_feature(f_chroma1,
331
+ input_feature_rate=input_feature_rate,
332
+ win_len_smooth=win_len_smooth[it],
333
+ downsamp_smooth=downsamp_smooth[it])
334
+
335
+ f_chroma2_cur, feature_rate_new = smooth_downsample_feature(f_chroma2,
336
+ input_feature_rate=input_feature_rate,
337
+ win_len_smooth=win_len_smooth[it],
338
+ downsamp_smooth=downsamp_smooth[it])
339
+
340
+ if normalize_chroma:
341
+ f_chroma1_cur = normalize_feature(f_chroma1_cur,
342
+ norm_ord=chroma_norm_ord,
343
+ threshold=chroma_norm_threshold)
344
+
345
+ f_chroma2_cur = normalize_feature(f_chroma2_cur,
346
+ norm_ord=chroma_norm_ord,
347
+ threshold=chroma_norm_threshold)
348
+
349
+ # Project path onto new resolution
350
+ cost_matrix_size_new = (f_chroma1_cur.shape[1], f_chroma2_cur.shape[1])
351
+
352
+ if alignment is None:
353
+ # Initialize the alignment with the start and end frames of the feature sequence
354
+ anchors = np.array([[0, f_chroma1_cur.shape[1] - 1], [0, f_chroma2_cur.shape[1] - 1]])
355
+
356
+ else:
357
+ projected_alignment = project_alignment_on_a_new_feature_rate(alignment=alignment,
358
+ feature_rate_old=feature_rate_old,
359
+ feature_rate_new=feature_rate_new,
360
+ cost_matrix_size_old=cost_matrix_size_old,
361
+ cost_matrix_size_new=cost_matrix_size_new)
362
+
363
+ anchors = derive_anchors_from_projected_alignment(projected_alignment=projected_alignment,
364
+ threshold=threshold_rec)
365
+
366
+ # Cost matrix and warping path computation
367
+ if high_res and it == num_iterations - 1:
368
+ # Compute cost considering chroma and pitch onset features and alignment only in the last iteration,
369
+ # where the features are at the finest level.
370
+ cost_matrices_step1 = compute_cost_matrices_between_anchors(f_chroma1=f_chroma1_cur,
371
+ f_chroma2=f_chroma2_cur,
372
+ f_onset1=f_onset1,
373
+ f_onset2=f_onset2,
374
+ anchors=anchors,
375
+ alpha=alpha)
376
+
377
+ else:
378
+ cost_matrices_step1 = compute_cost_matrices_between_anchors(f_chroma1=f_chroma1_cur,
379
+ f_chroma2=f_chroma2_cur,
380
+ anchors=anchors,
381
+ alpha=alpha)
382
+
383
+ wp_list = compute_warping_paths_from_cost_matrices(cost_matrices_step1,
384
+ step_sizes=step_sizes,
385
+ step_weights=step_weights,
386
+ implementation=dtw_implementation)
387
+
388
+ # Concatenate warping paths
389
+ wp = build_path_from_warping_paths(warping_paths=wp_list,
390
+ anchors=anchors)
391
+
392
+ anchors_step1 = None
393
+ wp_step1 = None
394
+ num_rows_step1 = 0
395
+ num_cols_step1 = 0
396
+ ax = None
397
+
398
+ toc1 = time.perf_counter()
399
+ if verbose and cost_matrices_step1 is not None:
400
+ anchors_step1 = np.array(anchors, copy=True)
401
+ wp_step1 = np.array(wp, copy=True)
402
+ num_rows_step1, num_cols_step1 = np.sum(np.array([dtw_mat.shape for dtw_mat in cost_matrices_step1], int),
403
+ axis=0)
404
+ fig, ax = sync_visualize_step1(cost_matrices_step1,
405
+ num_rows_step1,
406
+ num_cols_step1,
407
+ anchors,
408
+ wp)
409
+ tic2 = time.perf_counter()
410
+
411
+ # Compute neighboring anchors and refine alignment using local path between neighboring anchors
412
+ anchor_indices_in_warping_path = find_anchor_indices_in_warping_path(wp, anchors=anchors)
413
+
414
+ # Compute neighboring anchors for refinement
415
+ neighboring_anchors, neighboring_anchor_indices = \
416
+ derive_neighboring_anchors(wp, anchor_indices=anchor_indices_in_warping_path)
417
+
418
+ if neighboring_anchor_indices.shape[0] > 1 \
419
+ and it == num_iterations - 1 and high_res:
420
+ cost_matrices_step2 = compute_cost_matrices_between_anchors(f_chroma1=f_chroma1_cur,
421
+ f_chroma2=f_chroma2_cur,
422
+ f_onset1=f_onset1,
423
+ f_onset2=f_onset2,
424
+ anchors=neighboring_anchors,
425
+ alpha=alpha)
426
+
427
+ else:
428
+ cost_matrices_step2 = compute_cost_matrices_between_anchors(f_chroma1=f_chroma1_cur,
429
+ f_chroma2=f_chroma2_cur,
430
+ anchors=neighboring_anchors,
431
+ alpha=alpha)
432
+
433
+ wp_list_refine = compute_warping_paths_from_cost_matrices(cost_matrices=cost_matrices_step2,
434
+ step_sizes=step_sizes,
435
+ step_weights=step_weights,
436
+ implementation=dtw_implementation)
437
+
438
+ wp = __refine_wp(wp, anchors, wp_list_refine, neighboring_anchors, neighboring_anchor_indices)
439
+
440
+ toc2 = time.perf_counter()
441
+ computation_time_it = toc2 - tic2 + toc1 - tic1
442
+ total_computation_time += computation_time_it
443
+
444
+ alignment = wp
445
+ feature_rate_old = feature_rate_new
446
+ cost_matrix_size_old = cost_matrix_size_new
447
+
448
+ if verbose and cost_matrices_step2 is not None:
449
+ sync_visualize_step2(ax,
450
+ cost_matrices_step2,
451
+ wp,
452
+ wp_step1,
453
+ num_rows_step1,
454
+ num_cols_step1,
455
+ anchors_step1,
456
+ neighboring_anchors,
457
+ plot_title=f"{visualization_title} - Level {it + 1}")
458
+ print('Level {} computation time: {:.2f} seconds'.format(it, computation_time_it))
459
+
460
+ it += 1
461
+
462
+ if verbose:
463
+ print('Computation time of MrMsDTW: {:.2f} seconds'.format(total_computation_time))
464
+
465
+ return alignment
466
+
467
+
468
+ def __diagonal_warping_path(f1: np.ndarray,
469
+ f2: np.ndarray) -> np.ndarray:
470
+ """Generates a diagonal warping path given two feature sequences.
471
+
472
+ Parameters
473
+ ----------
474
+ f1: np.ndarray [shape=(_, N)]
475
+ First feature sequence
476
+
477
+ f2: np.ndarray [shape=(_, M)]
478
+ Second feature sequence
479
+
480
+ Returns
481
+ -------
482
+ np.ndarray: Diagonal warping path [shape=(2, T)]
483
+ """
484
+ max_size = np.maximum(f1.shape[1], f2.shape[1])
485
+ min_size = np.minimum(f1.shape[1], f2.shape[1])
486
+
487
+ if min_size == 1:
488
+ return np.array([max_size - 1, 0]).reshape(-1, 1)
489
+
490
+ elif max_size == f1.shape[1]:
491
+ return np.array([np.round(np.linspace(0, max_size - 1, min_size)), np.linspace(0, min_size - 1, min_size)])
492
+
493
+ else:
494
+ return np.array([np.linspace(0, min_size-1, min_size), np.round(np.linspace(0, max_size - 1, min_size))])
495
+
496
+
497
+ @jit(nopython=True)
498
+ def __compute_area(f1, f2):
499
+ """Computes the area of the cost matrix given two feature sequences
500
+
501
+ Parameters
502
+ ----------
503
+ f1: np.ndarray
504
+ First feature sequence
505
+
506
+ f2: np.ndarray
507
+ Second feature sequence
508
+
509
+ Returns
510
+ -------
511
+ int: Area of the cost matrix
512
+ """
513
+ return f1.shape[1] * f2.shape[1]
514
+
515
+
516
+ def __split_features(f_chroma1: np.ndarray,
517
+ f_onset1: np.ndarray,
518
+ f_chroma2: np.ndarray,
519
+ f_onset2: np.ndarray,
520
+ cur_a1: float,
521
+ cur_a2: float,
522
+ prev_a1: float,
523
+ prev_a2: float,
524
+ feature_rate: int) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray, Optional[np.ndarray]]:
525
+
526
+ if cur_a1 == -1:
527
+ f_chroma1_split = f_chroma1[:, int(prev_a1 * feature_rate):]
528
+ if f_onset1 is not None:
529
+ f_onset1_split = f_onset1[:, int(prev_a1 * feature_rate):]
530
+ else:
531
+ f_onset1_split = None
532
+
533
+ else:
534
+ # Split the features
535
+ f_chroma1_split = f_chroma1[:, int(prev_a1 * feature_rate):int(cur_a1 * feature_rate)]
536
+ if f_onset1 is not None:
537
+ f_onset1_split = f_onset1[:, int(prev_a1 * feature_rate):int(cur_a1 * feature_rate)]
538
+ else:
539
+ f_onset1_split = None
540
+
541
+ if cur_a2 == -1:
542
+ f_chroma2_split = f_chroma2[:, int(prev_a2 * feature_rate):]
543
+ if f_onset2 is not None:
544
+ f_onset2_split = f_onset2[:, int(prev_a2 * feature_rate):]
545
+ else:
546
+ f_onset2_split = None
547
+
548
+ else:
549
+ f_chroma2_split = f_chroma2[:, int(prev_a2 * feature_rate):int(cur_a2 * feature_rate)]
550
+ if f_onset2 is not None:
551
+ f_onset2_split = f_onset2[:, int(prev_a2 * feature_rate):int(cur_a2 * feature_rate)]
552
+ else:
553
+ f_onset2_split = None
554
+
555
+ return f_chroma1_split, f_onset1_split, f_chroma2_split, f_onset2_split
556
+
557
+
558
+ def __refine_wp(wp: np.ndarray,
559
+ anchors: np.ndarray,
560
+ wp_list_refine: List,
561
+ neighboring_anchors: np.ndarray,
562
+ neighboring_anchor_indices: np.ndarray) -> np.ndarray:
563
+ wp_length = wp[:, neighboring_anchor_indices[-1]:].shape[1]
564
+ last_list = wp[:, neighboring_anchor_indices[-1]:] - np.tile(
565
+ wp[:, neighboring_anchor_indices[-1]].reshape(-1, 1), wp_length)
566
+ wp_list_tmp = [wp[:, :neighboring_anchor_indices[0] + 1]] + wp_list_refine + [last_list]
567
+ A_tmp = np.concatenate([anchors[:, 0].reshape(-1, 1), neighboring_anchors, anchors[:, -1].reshape(-1, 1)],
568
+ axis=1)
569
+ wp_res = build_path_from_warping_paths(warping_paths=wp_list_tmp,
570
+ anchors=A_tmp)
571
+
572
+ return wp_res
573
+
574
+
575
+ def __check_anchor_pairs(anchor_pairs: List,
576
+ f_len1: int,
577
+ f_len2: int,
578
+ feature_rate: int):
579
+ """Ensures that the anchors satisfy the conditions
580
+
581
+ Parameters
582
+ ----------
583
+ anchor_pairs: List[Tuple]
584
+ List of anchor pairs
585
+
586
+ f_len1: int
587
+ Length of the first feature sequence
588
+
589
+ f_len2: int
590
+ Length of the second feature sequence
591
+
592
+ feature_rate: int
593
+ Input feature rate of the features
594
+ """
595
+ prev_a1 = 0
596
+ prev_a2 = 0
597
+ for anchor_pair in anchor_pairs:
598
+ a1, a2 = anchor_pair
599
+
600
+ if a1 <= 0 or a2 <= 0:
601
+ raise ValueError('Starting point must be a positive number!')
602
+
603
+ if a1 > f_len1 / feature_rate or a2 > f_len2 / feature_rate:
604
+ raise ValueError('Anchor points cannot be greater than the length of the input audio files!')
605
+
606
+ if a1 == f_len1 and a2 == f_len2:
607
+ raise ValueError('Both anchor points cannot be equal to the length of the audio files.')
608
+
609
+ if a1 == prev_a1 and a2 == prev_a2:
610
+ raise ValueError('Duplicate anchor pairs are not allowed!')
611
+
612
+ if a1 < prev_a1 or a2 < prev_a2:
613
+ raise ValueError('Anchor points must be monotonically increasing.')
614
+
615
+ prev_a1 = a1
616
+ prev_a2 = a2
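An end-to-end sketch of sync_via_mrmsdtw (not part of the commit). Random chroma stands in for real features, and the 50 Hz rate and sequence lengths are arbitrary; in practice the inputs would be chroma (and optionally DLNCO onset) features extracted from two recordings of the same piece.

import numpy as np
from musc.dtw.mrmsdtw import sync_via_mrmsdtw
from musc.dtw.utils import make_path_strictly_monotonic

rng = np.random.default_rng(0)
f_chroma1 = rng.random((12, 500))  # roughly 10 s of features at 50 Hz
f_chroma2 = rng.random((12, 650))  # roughly 13 s of features at 50 Hz

wp = sync_via_mrmsdtw(f_chroma1=f_chroma1,
                      f_chroma2=f_chroma2,
                      input_feature_rate=50)
wp = make_path_strictly_monotonic(wp)  # often applied before interpolating positions
print(wp.shape)        # (2, T): paired frame indices into the two sequences
print(wp[:, [0, -1]])  # starts at (0, 0) and ends at (499, 649)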
musc/dtw/utils.py ADDED
@@ -0,0 +1,377 @@
1
+ import numpy as np
2
+ from typing import List
3
+ from numba import jit
4
+ from scipy import signal
5
+ from typing import Tuple
6
+ from .core import compute_warping_path
7
+ from .cost import cosine_distance,compute_high_res_cost_matrix
8
+
9
+
10
+ def compute_warping_paths_from_cost_matrices(cost_matrices: List,
11
+ step_sizes: np.array = np.array([[1, 0], [0, 1], [1, 1]], int),
12
+ step_weights: np.array = np.array([1.0, 1.0, 1.0], np.float64),
13
+ implementation: str = 'synctoolbox') -> List:
14
+ """Computes a path via DTW on each matrix in cost_matrices
15
+
16
+ Parameters
17
+ ----------
18
+ cost_matrices : list
19
+ List of cost matrices
20
+
21
+ step_sizes : np.ndarray
22
+ DTW step sizes (default: np.array([[1, 0], [0, 1], [1, 1]]))
23
+
24
+ step_weights : np.ndarray
25
+ DTW step weights (default: np.array([1.0, 1.0, 1.0]))
26
+
27
+ implementation : str
28
+ Choose among 'synctoolbox' and 'librosa' (default: 'synctoolbox')
29
+
30
+ Returns
31
+ -------
32
+ wp_list : list
33
+ List of warping paths
34
+ """
35
+ return [compute_warping_path(C=C,
36
+ step_sizes=step_sizes,
37
+ step_weights=step_weights,
38
+ implementation=implementation)[2] for C in cost_matrices]
39
+
40
+
41
+ def compute_cost_matrices_between_anchors(f_chroma1: np.ndarray,
42
+ f_chroma2: np.ndarray,
43
+ anchors: np.ndarray,
44
+ f_onset1: np.ndarray = None,
45
+ f_onset2: np.ndarray = None,
46
+ alpha: float = 0.5) -> List:
47
+ """Computes cost matrices for the given features between subsequent
48
+ pairs of anchor points.
49
+
50
+ Parameters
51
+ ----------
52
+ f_chroma1 : np.ndarray [shape=(12, N)]
53
+ Chroma feature matrix of the first sequence
54
+
55
+ f_chroma2 : np.ndarray [shape=(12, M)]
56
+ Chroma feature matrix of the second sequence
57
+
58
+ anchors : np.ndarray [shape=(2, R)]
59
+ Anchor sequence
60
+
61
+ f_onset1 : np.ndarray [shape=(L, N)]
62
+ Onset feature matrix of the first sequence
63
+
64
+ f_onset2 : np.ndarray [shape=(L, M)]
65
+ Onset feature matrix of the second sequence
66
+
67
+ alpha: float
68
+ Alpha parameter to weight the cost functions.
69
+
70
+ Returns
71
+ -------
72
+ cost_matrices: list
73
+ List containing cost matrices
74
+ """
75
+ high_res = False
76
+ if f_onset1 is not None and f_onset2 is not None:
77
+ high_res = True
78
+
79
+ cost_matrices = list()
80
+ for k in range(anchors.shape[1] - 1):
81
+ a1 = np.array(anchors[:, k].astype(int), copy=True)
82
+ a2 = np.array(anchors[:, k + 1].astype(int), copy=True)
83
+
84
+ if high_res:
85
+ cost_matrices.append(compute_high_res_cost_matrix(f_chroma1[:, a1[0]: a2[0] + 1],
86
+ f_chroma2[:, a1[1]: a2[1] + 1],
87
+ f_onset1[:, a1[0]: a2[0] + 1],
88
+ f_onset2[:, a1[1]: a2[1] + 1],
89
+ weights=np.array([alpha, 1-alpha])))
90
+ else:
91
+ cost_matrices.append(cosine_distance(f_chroma1[:, a1[0]: a2[0] + 1],
92
+ f_chroma2[:, a1[1]: a2[1] + 1]))
93
+ return cost_matrices
94
+
95
+
96
+ def build_path_from_warping_paths(warping_paths: List,
97
+ anchors: np.ndarray = None) -> np.ndarray:
98
+ """The function builds a path from a given list of warping paths
99
+ and the anchors used to obtain these paths. The indices of the original
100
+ warping paths are adapted such that they cross the anchors.
101
+
102
+ Parameters
103
+ ----------
104
+ warping_paths : list
105
+ List of warping paths
106
+
107
+ anchors : np.ndarray [shape=(2, N)]
108
+ Anchor sequence
109
+
110
+ Returns
111
+ -------
112
+ path : np.ndarray [shape=(2, M)]
113
+ Merged path
114
+ """
115
+
116
+ if anchors is None:
117
+ # When no anchor points are given, we can construct them from the
118
+ # subpaths in the wp_list
119
+
120
+ # To do this, we assume that the first path's element is the starting
121
+ # anchor
122
+ anchors = warping_paths[0][:, 0]
123
+
124
+ # Retrieve the last element of each path
125
+ anchors_tmp = np.zeros(len(warping_paths), np.float32)
126
+ for idx, x in enumerate(warping_paths):
127
+ anchors_tmp[idx] = x[:, -1]
128
+
129
+ # Correct indices, such that the indices of the anchors are given on a
130
+ # common path. Each anchor a_l = [Nnew_[l+1];Mnew_[l+1]]
131
+ # Nnew_[l+1] = N_l + N_[l+1] -1
132
+ # Mnew_[l+1] = M_l + M_[l+1] -1
133
+
134
+ anchors_tmp = np.cumsum(anchors_tmp, axis=1)
135
+ anchors_tmp[:, 1:] = anchors_tmp[:, 1:] - [np.arange(1, anchors_tmp.shape[1]),
136
+ np.arange(1, anchors_tmp.shape[1])]
137
+
138
+ anchors = np.concatenate([anchors, anchors_tmp], axis=1)
139
+
140
+ L = len(warping_paths) + 1
141
+ path = None
142
+ wp = None
143
+
144
+ for anchor_idx in range(1, L):
145
+ anchor1 = anchors[:, anchor_idx - 1]
146
+ anchor2 = anchors[:, anchor_idx]
147
+
148
+ wp = np.array(warping_paths[anchor_idx - 1], copy=True)
149
+
150
+ # correct indices in warpingPath
151
+ wp += np.repeat(anchor1.reshape(-1, 1), wp.shape[1], axis=1).astype(wp.dtype)
152
+
153
+ # consistency checks
154
+ assert np.array_equal(wp[:, 0], anchor1), 'First entry of warping path does not coincide with anchor point'
155
+ assert np.array_equal(wp[:, -1], anchor2), 'Last entry of warping path does not coincide with anchor point'
156
+
157
+ if path is None:
158
+ path = np.array(wp[:, :-1], copy=True)
159
+ else:
160
+ path = np.concatenate([path, wp[:, :-1]], axis=1)
161
+
162
+ # append last index of warping path
163
+ path = np.concatenate([path, wp[:, -1].reshape(-1, 1)], axis=1)
164
+
165
+ return path
166
+
167
+
168
+ def find_anchor_indices_in_warping_path(warping_path: np.ndarray,
169
+ anchors: np.ndarray) -> np.ndarray:
170
+ """Compute the indices in the warping path that corresponds
171
+ to the elements in 'anchors'
172
+
173
+ Parameters
174
+ ----------
175
+ warping_path : np.ndarray [shape=(2, N)]
176
+ Warping path
177
+
178
+ anchors : np.ndarray [shape=(2, M)]
179
+ Anchor sequence
180
+
181
+ Returns
182
+ -------
183
+ indices : np.ndarray [shape=(2, M)]
184
+ Anchor indices in the ``warping_path``
185
+ """
186
+ indices = np.zeros(anchors.shape[1])
187
+
188
+ for k in range(anchors.shape[1]):
189
+ a = anchors[:, k]
190
+ indices[k] = np.where((a[0] == warping_path[0, :]) & (a[1] == warping_path[1, :]))[0]
191
+
192
+ return indices
193
+
194
+
195
+ def make_path_strictly_monotonic(P: np.ndarray) -> np.ndarray:
196
+ """Compute strict alignment path from a warping path
197
+
198
+ Wrapper around "compute_strict_alignment_path_mask" from libfmp.
199
+
200
+ Parameters
201
+ ----------
202
+ P: np.ndarray [shape=(2, N)]
203
+ Warping path
204
+
205
+ Returns
206
+ -------
207
+ P_mod: np.ndarray [shape=(2, M)]
208
+ Strict alignment path, M <= N
209
+ """
210
+ P_mod = compute_strict_alignment_path_mask(P.T)
211
+
212
+ return P_mod.T
213
+
214
+ def compute_strict_alignment_path_mask(P):
215
+ """Compute strict alignment path from a warping path
216
+
217
+ Notebook: C3/C3S3_MusicAppTempoCurve.ipynb
218
+
219
+ Args:
220
+ P (list or np.ndarray): Warping path
221
+
222
+ Returns:
223
+ P_mod (list or np.ndarray): Strict alignment path
224
+ """
225
+ P = np.array(P, copy=True)
226
+ N, M = P[-1]
227
+ # Get indices for strict monotonicity
228
+ keep_mask = (P[1:, 0] > P[:-1, 0]) & (P[1:, 1] > P[:-1, 1])
229
+ # Add first index to enforce start boundary condition
230
+ keep_mask = np.concatenate(([True], keep_mask))
231
+ # Remove all indices belonging to the last row or column
232
+ keep_mask[(P[:, 0] == N) | (P[:, 1] == M)] = False
233
+ # Add last index to enforce end boundary condition
234
+ keep_mask[-1] = True
235
+ P_mod = P[keep_mask, :]
236
+
237
+ return P_mod
238
+
239
+
240
+ def evaluate_synchronized_positions(ground_truth_positions: np.ndarray,
241
+ synchronized_positions: np.ndarray,
242
+ tolerances: List = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 250]):
243
+ """Compute standard evaluation measures for evaluating the quality of synchronized (musical) positions.
244
+
245
+ When synchronizing two versions of a piece of music, one can evaluate the quality of the resulting alignment
246
+ by comparing errors at musical positions (e.g. beats or measures) that appear in both versions.
247
+ This function implements two measures: mean absolute error at positions and the percentage of correctly transferred
248
+ measures given a threshold.
249
+
250
+ Parameters
251
+ ----------
252
+ ground_truth_positions: np.ndarray [shape=N]
253
+ Positions (e.g. beat or measure positions) annotated in the target version of a piece of music, in milliseconds.
254
+
255
+ synchronized_positions: np.ndarray [shape=N]
256
+ The same musical positions as in 'ground_truth_positions' obtained by transfer using music synchronization,
257
+ in milliseconds.
258
+
259
+ tolerances: list of integers
260
+ Tolerances (in milliseconds) used for comparing annotated and synchronized positions.
261
+
262
+ Returns
263
+ -------
264
+ mean_absolute_error: float
265
+ Mean absolute error for synchronized positions, in milliseconds.
266
+
267
+ accuracy_at_tolerances: list of floats
268
+ Percentages of correctly transferred measures, for each entry in 'tolerances'.
269
+
270
+ """
271
+ absolute_errors_at_positions = np.abs(synchronized_positions - ground_truth_positions)
272
+
273
+ print('Measure transfer from recording 1 to 2 yielded:')
274
+ mean_absolute_error = np.mean(absolute_errors_at_positions)
275
+ print('\nMean absolute error (MAE): %.2fms (standard deviation: %.2fms)' % (mean_absolute_error,
276
+ np.std(absolute_errors_at_positions)))
277
+ print('\nAccuracy of transferred positions at different tolerances:')
278
+ print('\t\t\tAccuracy')
279
+ print('################################')
280
+ accuracy_at_tolerances = []
281
+ for tolerance in tolerances:
282
+ accuracy = np.mean((absolute_errors_at_positions < tolerance)) * 100.0
283
+ accuracy_at_tolerances.append(accuracy)
284
+ print('Tolerance: {} ms \t{:.2f} %'.format(tolerance, accuracy))
285
+
286
+ return mean_absolute_error, accuracy_at_tolerances
287
+
288
+
289
+ def smooth_downsample_feature(f_feature: np.ndarray,
290
+ input_feature_rate: float,
291
+ win_len_smooth: int = 0,
292
+ downsamp_smooth: int = 1) -> Tuple[np.ndarray, float]:
293
+ """Temporal smoothing and downsampling of a feature sequence
294
+
295
+ Parameters
296
+ ----------
297
+ f_feature : np.ndarray
298
+ Input feature sequence, size dxN
299
+
300
+ input_feature_rate : float
301
+ Input feature rate in Hz
302
+
303
+ win_len_smooth : int
304
+ Smoothing window length. For 0, no smoothing is applied.
305
+
306
+ downsamp_smooth : int
307
+ Downsampling factor. For 1, no downsampling is applied.
308
+
309
+ Returns
310
+ -------
311
+ f_feature_stat : np.ndarray
312
+ Downsampled & smoothed feature.
313
+
314
+ new_feature_rate : float
315
+ New feature rate after downsampling
316
+ """
317
+ if win_len_smooth != 0 or downsamp_smooth != 1:
318
+ # hack to get the same results as on MATLAB
319
+ stat_window = np.hanning(win_len_smooth+2)[1:-1]
320
+ stat_window /= np.sum(stat_window)
321
+
322
+ # upfirdn filters and downsamples each column of f_stat_help
323
+ f_feature_stat = signal.upfirdn(h=stat_window, x=f_feature, up=1, down=downsamp_smooth)
324
+ seg_num = f_feature.shape[1]
325
+ stat_num = int(np.ceil(seg_num / downsamp_smooth))
326
+ cut = int(np.floor((win_len_smooth - 1) / (2 * downsamp_smooth)))
327
+ f_feature_stat = f_feature_stat[:, cut: stat_num + cut]
328
+ else:
329
+ f_feature_stat = f_feature
330
+
331
+ new_feature_rate = input_feature_rate / downsamp_smooth
332
+
333
+ return f_feature_stat, new_feature_rate
334
+
335
+
336
+ @jit(nopython=True)
337
+ def normalize_feature(feature: np.ndarray,
338
+ norm_ord: int,
339
+ threshold: float) -> np.ndarray:
340
+ """Normalizes a feature sequence according to the l^norm_ord norm.
341
+
342
+ Parameters
343
+ ----------
344
+ feature : np.ndarray
345
+ Input feature sequence of size d x N
346
+ d: dimensionality of feature vectors
347
+ N: number of feature vectors (time in frames)
348
+
349
+ norm_ord : int
350
+ Norm degree
351
+
352
+ threshold : float
353
+ If the norm falls below threshold for a feature vector, then the
354
+ normalized feature vector is set to be the normalized unit vector.
355
+
356
+ Returns
357
+ -------
358
+ f_normalized : np.ndarray
359
+ Normalized feature sequence
360
+ """
361
+ # TODO rewrite in vectorized fashion
362
+ d, N = feature.shape
363
+ f_normalized = np.zeros((d, N))
364
+
365
+ # normalize the vectors according to the l^norm_ord norm
366
+ unit_vec = np.ones(d)
367
+ unit_vec = unit_vec / np.linalg.norm(unit_vec, norm_ord)
368
+
369
+ for k in range(N):
370
+ cur_norm = np.linalg.norm(feature[:, k], norm_ord)
371
+
372
+ if cur_norm < threshold:
373
+ f_normalized[:, k] = unit_vec
374
+ else:
375
+ f_normalized[:, k] = feature[:, k] / cur_norm
376
+
377
+ return f_normalized
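A small worked example of evaluate_synchronized_positions (not part of the commit; the positions are invented and given in milliseconds).

import numpy as np
from musc.dtw.utils import evaluate_synchronized_positions

ground_truth = np.array([1000, 2000, 3000, 4000])  # annotated beat positions (ms)
synchronized = np.array([1012, 1985, 3050, 4110])  # positions transferred via the warping path

mae, acc = evaluate_synchronized_positions(ground_truth, synchronized,
                                           tolerances=[25, 50, 100, 150])
print(mae)  # mean absolute error in ms: (12 + 15 + 50 + 110) / 4 = 46.75
print(acc)  # percentage of positions falling within each tolerance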
musc/dtw/visualization.py ADDED
@@ -0,0 +1,216 @@
1
+ import matplotlib
2
+ import matplotlib.cm
3
+ import matplotlib.patches
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ from typing import Tuple, List
7
+
8
+
9
+ def sync_visualize_step1(cost_matrices: List,
10
+ num_rows: int,
11
+ num_cols: int,
12
+ anchors: np.ndarray,
13
+ wp: np.ndarray) -> Tuple[plt.Figure, plt.Axes]:
14
+
15
+ fig, ax = plt.subplots(1, 1, dpi=72)
16
+ ax = __visualize_cost_matrices(ax, cost_matrices)
17
+ __visualize_constraint_rectangles(anchors[[1, 0], :],
18
+ edgecolor='firebrick')
19
+
20
+ __visualize_path_in_matrix(ax=ax,
21
+ wp=wp,
22
+ axisX=np.arange(0, num_rows),
23
+ axisY=np.arange(0, num_cols),
24
+ path_color='firebrick')
25
+
26
+ return fig, ax
27
+
28
+
29
+ def sync_visualize_step2(ax: plt.Axes,
30
+ cost_matrices: list,
31
+ wp_step2: np.ndarray,
32
+ wp_step1: np.ndarray,
33
+ num_rows_step1: int,
34
+ num_cols_step1: int,
35
+ anchors_step1: np.ndarray,
36
+ neighboring_anchors: np.ndarray,
37
+ plot_title: str = ""):
38
+
39
+ offset_x = neighboring_anchors[0, 0] - 1
40
+ offset_y = neighboring_anchors[1, 0] - 1
41
+ ax = __visualize_cost_matrices(ax=ax,
42
+ cost_matrices=cost_matrices,
43
+ offset_x=offset_x,
44
+ offset_y=offset_y)
45
+
46
+ __visualize_constraint_rectangles(anchors_step1[[1, 0], :],
47
+ edgecolor='firebrick')
48
+
49
+ __visualize_path_in_matrix(ax=ax,
50
+ wp=wp_step1,
51
+ axisX=np.arange(0, num_rows_step1),
52
+ axisY=np.arange(0, num_cols_step1),
53
+ path_color='firebrick')
54
+
55
+ __visualize_constraint_rectangles(neighboring_anchors[[1, 0], :] - 1,
56
+ edgecolor='orangered',
57
+ linestyle='--')
58
+
59
+ __visualize_path_in_matrix(ax=ax,
60
+ wp=wp_step2,
61
+ axisX=np.arange(0, num_rows_step1),
62
+ axisY=np.arange(0, num_cols_step1),
63
+ path_color='orangered')
64
+
65
+ ax.set_title(plot_title)
66
+ ax.set_ylabel("Version 1 (frames)")
67
+ ax.set_xlabel("Version 2 (frames)")
68
+
69
+ ax = plt.gca() # get the current axes
70
+ pcm = None
71
+ for pcm in ax.get_children():
72
+ if isinstance(pcm, matplotlib.cm.ScalarMappable):
73
+ break
74
+ plt.colorbar(pcm, ax=ax)
75
+ plt.tight_layout()
76
+ plt.show()
77
+
78
+
79
+ def __size_dtw_matrices(dtw_matrices: List) -> Tuple[List[np.ndarray], List[np.ndarray]]:
80
+ """Gives information about the dimensionality of a DTW matrix
81
+ given in the form of a list of sub-matrices.
82
+
83
+ Parameters
84
+ ----------
85
+ dtw_matrices: list
86
+ The DTW matrix (cost matrix or accumulated cost matrix), given in the form of a list of sub-matrices.
87
+
88
+ Returns
89
+ -------
90
+ axis_x_list: list
91
+ A list containing a horizontal axis for each of the sub matrices
92
+ which specifies the horizontal position of the respective submatrix
93
+ in the overall cost matrix.
94
+
95
+ axis_y_list: list
96
+ A list containing a vertical axis for each of the
97
+ sub matrices which specifies the vertical position of the
98
+ respective submatrix in the overall cost matrix.
99
+
100
+ """
101
+ num_matrices = len(dtw_matrices)
102
+ size_list = [dtw_mat.shape for dtw_mat in dtw_matrices]
103
+
104
+ axis_x_list = list()
105
+ axis_y_list = list()
106
+
107
+ x_acc = 0
108
+ y_acc = 0
109
+
110
+ for i in range(num_matrices):
111
+ curr_size_list = size_list[i]
112
+ axis_x_list.append(np.arange(x_acc, x_acc + curr_size_list[0]))
113
+ axis_y_list.append(np.arange(y_acc, y_acc + curr_size_list[1]))
114
+ x_acc += curr_size_list[0] - 1
115
+ y_acc += curr_size_list[1] - 1
116
+
117
+ return axis_x_list, axis_y_list
118
+
119
+
120
+ def __visualize_cost_matrices(ax: plt.Axes,
121
+ cost_matrices: list = None,
122
+ offset_x: float = 0.0,
123
+ offset_y: float = 0.0) -> plt.Axes:
124
+ """Visualizes cost matrices
125
+
126
+ Parameters
127
+ ----------
128
+ ax : axes
129
+ The Axes instance to plot on
130
+
131
+ cost_matrices : list
132
+ List of DTW cost matrices.
133
+
134
+ offset_x : float
135
+ Offset on the x axis.
136
+
137
+ offset_y : float
138
+ Offset on the y axis.
139
+
140
+ Returns
141
+ -------
142
+ ax: axes
143
+ The Axes instance to plot on
144
+
145
+ """
146
+ x_ax, y_ax = __size_dtw_matrices(dtw_matrices=cost_matrices)
147
+
148
+ for i, cur_cost in enumerate(cost_matrices):
149
+ curr_x_ax = x_ax[i] + offset_x
150
+ curr_y_ax = y_ax[i] + offset_y
151
+ cur_cost = cost_matrices[i]
152
+ ax.imshow(cur_cost, cmap='gray_r', aspect='auto', origin='lower',
153
+ extent=[curr_y_ax[0], curr_y_ax[-1], curr_x_ax[0], curr_x_ax[-1]])
154
+
155
+ return ax
156
+
157
+
158
+ def __visualize_path_in_matrix(ax,
159
+ wp: np.ndarray = None,
160
+ axisX: np.ndarray = None,
161
+ axisY: np.ndarray = None,
162
+ path_color: str = 'r'):
163
+ """Plots a warping path on top of a given matrix. The matrix is
164
+ usually an accumulated cost matrix.
165
+
166
+ Parameters
167
+ ----------
168
+ ax : axes
169
+ The Axes instance to plot on
170
+
171
+ wp : np.ndarray
172
+ Warping path
173
+
174
+ axisX : np.ndarray
175
+ Array of X axis
176
+
177
+ axisY : np.ndarray
178
+ Array of Y axis
179
+
180
+ path_color : str
181
+ Color of the warping path to be plotted. (default: r)
182
+ """
183
+ assert axisX is not None and isinstance(axisX, np.ndarray), 'axisX must be a numpy array!'
184
+ assert axisY is not None and isinstance(axisY, np.ndarray), 'axisY must be a numpy array!'
185
+
186
+ wp = wp.astype(int)
187
+
188
+ ax.plot(axisY[wp[1, :]], axisX[wp[0, :]], '-k', linewidth=5)
189
+ ax.plot(axisY[wp[1, :]], axisX[wp[0, :]], color=path_color, linewidth=3)
190
+
191
+
192
+ def __visualize_constraint_rectangles(anchors: np.ndarray,
193
+ linestyle: str = '-',
194
+ edgecolor: str = 'royalblue',
195
+ linewidth: float = 1.0):
196
+
197
+ for k in range(anchors.shape[1]-1):
198
+ a1 = anchors[:, k]
199
+ a2 = anchors[:, k + 1]
200
+
201
+ # a rectangle is defined by [x y width height]
202
+ x = a1[0]
203
+ y = a1[1]
204
+ w = a2[0] - a1[0] + np.finfo(float).eps
205
+ h = a2[1] - a1[1] + np.finfo(float).eps
206
+
207
+ rect = matplotlib.patches.Rectangle((x, y), w, h,
208
+ linewidth=linewidth,
209
+ edgecolor=edgecolor,
210
+ linestyle=linestyle,
211
+ facecolor='none')
212
+
213
+ plt.gca().add_patch(rect)
214
+
215
+
216
+
musc/model.py ADDED
@@ -0,0 +1,220 @@
1
+ from .pathway import TinyPathway
2
+ from .synchronizer import Synchronizer
3
+ from .representations import PerformanceLabel
4
+ from torchaudio.models.conformer import ConformerLayer
5
+ import torch
6
+ from torch import nn
7
+ import numpy as np
8
+
9
+
10
+ class FourHeads(Synchronizer):
11
+
12
+ def __init__(
13
+ self,
14
+ pathway_multiscale: int = 32,
15
+ num_pathway_layers: int = 2,
16
+ chunk_size: int = 256,
17
+ hop_length: int = 256,
18
+ encoder_dim: int = 256,
19
+ sr: int = 44100,
20
+ num_heads: int = 4,
21
+ ffn_dim: int = 128,
22
+ num_separator_layers: int = 16,
23
+ num_representation_layers: int = 4,
24
+ depthwise_conv_kernel_size: int = 31,
25
+ dropout: float = 0.25,
26
+ use_group_norm: bool = False,
27
+ convolution_first: bool = False,
28
+ labeling=PerformanceLabel(),
29
+ wiring='tiktok'
30
+ ):
31
+ super().__init__(labeling, sr=sr, hop_length=hop_length)
32
+ self.main = TinyPathway(dilation=1, hop=hop_length, localize=True,
33
+ n_layers=num_pathway_layers, chunk_size=chunk_size)
34
+ self.attendant = TinyPathway(dilation=pathway_multiscale, hop=hop_length, localize=False,
35
+ n_layers=num_pathway_layers, chunk_size=chunk_size)
36
+ assert self.main.hop == self.attendant.hop # they should output with the same sample rate
37
+ print('hop in samples:', self.main.hop)
38
+ self.input_window = self.attendant.input_window
39
+
40
+ self.encoder_dim = encoder_dim
41
+ self.dropout = nn.Dropout(dropout)
42
+
43
+ # merge two streams into a conformer input
44
+ self.stream_merger = nn.Sequential(self.dropout,
45
+ nn.Linear(self.main.out_dim + self.attendant.out_dim, self.encoder_dim))
46
+
47
+
48
+
49
+ print('main stream window:', self.main.input_window,
50
+ ', attendant stream window:', self.attendant.input_window,
51
+ ', conformer input dim:', self.encoder_dim)
52
+
53
+ center = ((chunk_size - 1) * self.main.hop) # region labeled with pitch track
54
+ main_overlap = self.main.input_window - center
55
+ main_overlap = [int(np.floor(main_overlap / 2)), int(np.ceil(main_overlap / 2))]
56
+ attendant_overlap = self.attendant.input_window - center
57
+ attendant_overlap = [int(np.floor(attendant_overlap / 2)), int(np.ceil(attendant_overlap / 2))]
58
+ print('main frame overlap:', main_overlap, ', attendant frame overlap:', attendant_overlap)
59
+ main_crop_relative = [attendant_overlap[0] - main_overlap[0], main_overlap[1] - attendant_overlap[1]]
60
+ print('crop for main pathway', main_crop_relative)
61
+ print("Total sequence duration is", self.attendant.input_window, 'samples')
62
+ print('Main stream receptive field for one frame is', (self.main.input_window - center), 'samples')
63
+ print('Attendant stream receptive field for one frame is', (self.attendant.input_window - center), 'samples')
64
+ self.frame_overlap = attendant_overlap
65
+
66
+ self.main_stream_crop = main_crop_relative
67
+ self.max_window_size = self.attendant.input_window
68
+ self.chunk_size = chunk_size
69
+
70
+ self.separator_stream = nn.ModuleList( # source-separation, reinvented
71
+ [
72
+ ConformerLayer(
73
+ input_dim=self.encoder_dim,
74
+ ffn_dim=ffn_dim,
75
+ num_attention_heads=num_heads,
76
+ depthwise_conv_kernel_size=depthwise_conv_kernel_size,
77
+ dropout=dropout,
78
+ use_group_norm=use_group_norm,
79
+ convolution_first=convolution_first,
80
+ )
81
+ for _ in range(num_separator_layers)
82
+ ]
83
+ )
84
+
85
+ self.f0_stream = nn.ModuleList(
86
+ [
87
+ ConformerLayer(
88
+ input_dim=self.encoder_dim,
89
+ ffn_dim=ffn_dim,
90
+ num_attention_heads=num_heads,
91
+ depthwise_conv_kernel_size=depthwise_conv_kernel_size,
92
+ dropout=dropout,
93
+ use_group_norm=use_group_norm,
94
+ convolution_first=convolution_first,
95
+ )
96
+ for _ in range(num_representation_layers)
97
+ ]
98
+ )
99
+ self.f0_head = nn.Linear(self.encoder_dim, len(self.labeling.f0_centers_c))
100
+
101
+ self.note_stream = nn.ModuleList(
102
+ [
103
+ ConformerLayer(
104
+ input_dim=self.encoder_dim,
105
+ ffn_dim=ffn_dim,
106
+ num_attention_heads=num_heads,
107
+ depthwise_conv_kernel_size=depthwise_conv_kernel_size,
108
+ dropout=dropout,
109
+ use_group_norm=use_group_norm,
110
+ convolution_first=convolution_first,
111
+ )
112
+ for _ in range(num_representation_layers)
113
+ ]
114
+ )
115
+ self.note_head = nn.Linear(self.encoder_dim, len(self.labeling.midi_centers))
116
+
117
+ self.onset_stream = nn.ModuleList(
118
+ [
119
+ ConformerLayer(
120
+ input_dim=self.encoder_dim,
121
+ ffn_dim=ffn_dim,
122
+ num_attention_heads=num_heads,
123
+ depthwise_conv_kernel_size=depthwise_conv_kernel_size,
124
+ dropout=dropout,
125
+ use_group_norm=use_group_norm,
126
+ convolution_first=convolution_first,
127
+ )
128
+ for _ in range(num_representation_layers)
129
+ ]
130
+ )
131
+ self.onset_head = nn.Linear(self.encoder_dim, len(self.labeling.midi_centers))
132
+
133
+ self.offset_stream = nn.ModuleList(
134
+ [
135
+ ConformerLayer(
136
+ input_dim=self.encoder_dim,
137
+ ffn_dim=ffn_dim,
138
+ num_attention_heads=num_heads,
139
+ depthwise_conv_kernel_size=depthwise_conv_kernel_size,
140
+ dropout=dropout,
141
+ use_group_norm=use_group_norm,
142
+ convolution_first=convolution_first,
143
+ )
144
+ for _ in range(num_representation_layers)
145
+ ]
146
+ )
147
+ self.offset_head = nn.Linear(self.encoder_dim, len(self.labeling.midi_centers))
148
+
149
+ self.labeling = labeling
150
+ self.double_merger = nn.Sequential(self.dropout, nn.Linear(2 * self.encoder_dim, self.encoder_dim))
151
+ self.triple_merger = nn.Sequential(self.dropout, nn.Linear(3 * self.encoder_dim, self.encoder_dim))
152
+ self.wiring = wiring
153
+
154
+ print('Total parameter count: ', self.count_parameters())
155
+
156
+ def count_parameters(self) -> int:
157
+ """ Count all model parameters """
158
+ return sum([p.numel() for p in self.parameters()])
159
+
160
+ def stream(self, x, representation, key_padding_mask=None):
161
+ for i, layer in enumerate(self.__getattr__('{}_stream'.format(representation))):
162
+ x = layer(x, key_padding_mask)
163
+ return x
164
+
165
+ def head(self, x, representation):
166
+ return self.__getattr__('{}_head'.format(representation))(x)
167
+
168
+ def forward(self, x, key_padding_mask=None):
169
+
170
+ # two auditory streams followed by the separator stream to ensure timbre-awareness
171
+ x_attendant = self.attendant(x)
172
+ x_main = self.main(x[:, self.main_stream_crop[0]:self.main_stream_crop[1]])
173
+ x = self.stream_merger(torch.cat((x_attendant, x_main), -1).squeeze(1))
174
+ x = self.stream(x, 'separator', key_padding_mask)
175
+
176
+ f0 = self.stream(x, 'f0', key_padding_mask) # they say this is a low level feature :)
177
+
178
+ if self.wiring == 'parallel':
179
+ note = self.stream(x, 'note', key_padding_mask)
180
+ onset = self.stream(x, 'onset', key_padding_mask)
181
+ offset = self.stream(x, 'offset', key_padding_mask)
182
+
183
+ elif self.wiring == 'tiktok':
184
+ onset = self.stream(x, 'onset', key_padding_mask)
185
+ offset = self.stream(x, 'offset', key_padding_mask)
186
+ # f0 is disconnected, note relies on separator, onset, and offset
187
+ note = self.stream(self.triple_merger(torch.cat((x, onset, offset), -1)), 'note', key_padding_mask)
188
+
189
+ elif self.wiring == 'tiktok2':
190
+ onset = self.stream(x, 'onset', key_padding_mask)
191
+ offset = self.stream(x, 'offset', key_padding_mask)
192
+ # note is connected to f0, onset, and offset
193
+ note = self.stream(self.triple_merger(torch.cat((f0, onset, offset), -1)), 'note', key_padding_mask)
194
+
195
+ elif self.wiring == 'spotify':
196
+ # note is connected to f0 only
197
+ note = self.stream(f0, 'note', key_padding_mask)
198
+ # here onsets and offsets are higher-level features informed by the separator and note
199
+ onset = self.stream(self.double_merger(torch.cat((x, note), -1)), 'onset', key_padding_mask)
200
+ offset = self.stream(self.double_merger(torch.cat((x, note), -1)), 'offset', key_padding_mask)
201
+
202
+ else:
203
+ # onset and offset are connected to f0 and separator streams
204
+ onset = self.stream(self.double_merger(torch.cat((x, f0), -1)), 'onset', key_padding_mask)
205
+ offset = self.stream(self.double_merger(torch.cat((x, f0), -1)), 'offset', key_padding_mask)
206
+ # note is connected to f0, onset, and offset streams
207
+ note = self.stream(self.triple_merger(torch.cat((f0, onset, offset), -1)), 'note', key_padding_mask)
208
+
209
+
210
+ return {'f0': self.head(f0, 'f0'),
211
+ 'note': self.head(note, 'note'),
212
+ 'onset': self.head(onset, 'onset'),
213
+ 'offset': self.head(offset, 'offset')}
214
+
215
+
216
+ class PretrainedModel(FourHeads):
217
+ def __init__(self,model_json:dict,model:str):
218
+ super().__init__(pathway_multiscale=model_json['pathway_multiscale'],num_pathway_layers=model_json['num_pathway_layers'], wiring=model_json['wiring'],hop_length=model_json['hop_length'], chunk_size=model_json['chunk_size'],labeling=PerformanceLabel(note_min=model_json['note_low'], note_max=model_json['note_high'],f0_bins_per_semitone=model_json['f0_bins_per_semitone'],f0_tolerance_c=200,f0_smooth_std_c=model_json['f0_smooth_std_c'], onset_smooth_std=model_json['onset_smooth_std']), sr=model_json['sampling_rate'])
219
+ self.load_state_dict(torch.load(model, map_location=torch.device('cpu'),weights_only=True))
220
+ self.eval()
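A minimal smoke test of the model defined above (a hypothetical sketch, not part of this upload; it feeds one random chunk of the expected window size through the default configuration):

import torch
model = FourHeads()                          # defaults: wiring='tiktok', chunk_size=256, hop_length=256
model.eval()
x = torch.randn(1, model.max_window_size)    # one chunk of raw audio samples
with torch.no_grad():
    out = model(x)
print({k: v.shape for k, v in out.items()})  # per-frame f0 / note / onset / offset activations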
musc/pathway.py ADDED
@@ -0,0 +1,114 @@
1
+ import numpy as np
2
+ import torch.nn as nn
3
+
4
+
5
+ class ConvBlock(nn.Module):
6
+ def __init__(self, f, w, s, d, in_channels):
7
+ super().__init__()
8
+ p1 = d*(w - 1) // 2
9
+ p2 = d*(w - 1) - p1
10
+ self.pad = nn.ZeroPad2d((0, 0, p1, p2))
11
+
12
+ self.conv2d = nn.Conv2d(in_channels=in_channels, out_channels=f, kernel_size=(w, 1), stride=(s, 1), dilation=(d, 1))
13
+ self.relu = nn.ReLU()
14
+ self.bn = nn.BatchNorm2d(f)
15
+ self.pool = nn.MaxPool2d(kernel_size=(2, 1))
16
+ self.dropout = nn.Dropout(0.25)
17
+
18
+ def forward(self, x):
19
+ x = self.pad(x)
20
+ x = self.conv2d(x)
21
+ x = self.relu(x)
22
+ x = self.bn(x)
23
+ x = self.pool(x)
24
+ x = self.dropout(x)
25
+ return x
26
+
27
+
28
+ class NoPadConvBlock(nn.Module):
29
+ def __init__(self, f, w, s, d, in_channels):
30
+ super().__init__()
31
+
32
+ self.conv2d = nn.Conv2d(in_channels=in_channels, out_channels=f, kernel_size=(w, 1), stride=(s, 1),
33
+ dilation=(d, 1))
34
+ self.relu = nn.ReLU()
35
+ self.bn = nn.BatchNorm2d(f)
36
+ self.pool = nn.MaxPool2d(kernel_size=(2, 1))
37
+ self.dropout = nn.Dropout(0.25)
38
+
39
+ def forward(self, x):
40
+ x = self.conv2d(x)
41
+ x = self.relu(x)
42
+ x = self.bn(x)
43
+ x = self.pool(x)
44
+ x = self.dropout(x)
45
+ return x
46
+
47
+
48
+ class TinyPathway(nn.Module):
49
+ def __init__(self, dilation=1, hop=256, localize=False,
50
+ model_capacity="full", n_layers=6, chunk_size=256):
51
+ super().__init__()
52
+
53
+ capacity_multiplier = {
54
+ 'tiny': 4, 'small': 8, 'medium': 16, 'large': 24, 'full': 32
55
+ }[model_capacity]
56
+ self.layers = [1, 2, 3, 4, 5, 6]
57
+ self.layers = self.layers[:n_layers]
58
+ filters = [n * capacity_multiplier for n in [32, 8, 8, 8, 8, 8]]
59
+ filters = [1] + filters
60
+ widths = [512, 64, 64, 64, 32, 32]
61
+ strides = self.deter_dilations(hop//(4*(2**n_layers)), localize=localize)
62
+ strides[0] = strides[0]*4 # apply 4 times more stride at the first layer
63
+ dilations = self.deter_dilations(dilation)
64
+
65
+ for i in range(len(self.layers)):
66
+ f, w, s, d, in_channel = filters[i + 1], widths[i], strides[i], dilations[i], filters[i]
67
+ self.add_module("conv%d" % i, NoPadConvBlock(f, w, s, d, in_channel))
68
+ self.chunk_size = chunk_size
69
+ self.input_window, self.hop = self.find_input_size_for_pathway()
70
+ self.out_dim = filters[n_layers]
71
+
72
+ def find_input_size_for_pathway(self):
73
+ def find_input_size(output_size, kernel_size, stride, dilation, padding):
74
+ num = (stride*(output_size-1)) + 1
75
+ input_size = num - 2*padding + dilation*(kernel_size-1)
76
+ return input_size
77
+ conv_calc, n = {}, 0
78
+ for i in self.layers:
79
+ layer = self.__getattr__("conv%d" % (i-1))
80
+ for mm in layer.modules():
81
+ if hasattr(mm, 'kernel_size'):
82
+ try:
83
+ d = mm.dilation[0]
84
+ except TypeError:
85
+ d = mm.dilation
86
+ conv_calc[n] = [mm.kernel_size[0], mm.stride[0], 0, d]
87
+ n += 1
88
+ out = self.chunk_size
89
+ hop = 1
90
+ for n in sorted(conv_calc.keys())[::-1]:
91
+ kernel_size_n, stride_n, padding_n, dilation_n = conv_calc[n]
92
+ out = find_input_size(out, kernel_size_n, stride_n, dilation_n, padding_n)
93
+ hop = hop*stride_n
94
+ return out, hop
95
+
96
+ def deter_dilations(self, total_dilation, localize=False):
97
+ n_layers = len(self.layers)
98
+ if localize: # e.g., 32*1023 window and 3 layers -> [1, 1, 32]
99
+ a = [total_dilation] + [1 for _ in range(n_layers-1)]
100
+ else: # e.g., 32*1023 window and 3 layers -> [4, 4, 2]
101
+ total_dilation = int(np.log2(total_dilation))
102
+ a = []
103
+ for layer in range(n_layers):
104
+ this_dilation = int(np.ceil(total_dilation/(n_layers-layer)))
105
+ a.append(2**this_dilation)
106
+ total_dilation = total_dilation - this_dilation
107
+ return a[::-1]
108
+
109
+ def forward(self, x):
110
+ x = x.view(x.shape[0], 1, -1, 1)
111
+ for i in range(len(self.layers)):
112
+ x = self.__getattr__("conv%d" % i)(x)
113
+ x = x.permute(0, 3, 2, 1)
114
+ return x
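For reference, the receptive field and effective hop of a pathway can be inspected directly (a hypothetical snippet, not part of this upload; the arguments mirror the defaults that FourHeads passes for its main stream):

pw = TinyPathway(dilation=1, hop=256, localize=True, n_layers=2, chunk_size=256)
print(pw.input_window, pw.hop, pw.out_dim)  # samples needed per chunk, hop in samples, output feature dimension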
musc/pitch_estimator.py ADDED
@@ -0,0 +1,206 @@
1
+ from torch import nn
2
+ import torch
3
+ import torchaudio
4
+ from typing import List, Optional, Tuple
5
+ import pathlib
6
+ from scipy.signal import medfilt
7
+ import numpy as np
8
+ import librosa
9
+ from librosa.sequence import viterbi_discriminative
10
+ from scipy.ndimage import gaussian_filter1d
11
+ from .postprocessing import spotify_create_notes
12
+
13
+
14
+ class PitchEstimator(nn.Module):
15
+ """
16
+ This is the base class that everything else inherits from. The hierarchy is:
17
+ PitchEstimator -> Transcriber -> Synchronizer -> AutonomousAgent -> The n-Head Music Performance Analysis Models
18
+ PitchEstimator can handle reading the audio, predicting all the features,
19
+ estimating a single frame level f0 using viterbi, or
20
+ MIDI pitch bend creation for the predicted note events when used inside a Transcriber, or
21
+ score-informed f0 estimation when used inside a Synchronizer.
22
+ """
23
+ def __init__(self, labeling, instrument='Violin', sr=16000, window_size=1024, hop_length=160):
24
+ super().__init__()
25
+ self.labeling = labeling
26
+ self.sr = sr
27
+ self.window_size = window_size
28
+ self.hop_length = hop_length
29
+ self.instrument = instrument
30
+ self.f0_bins_per_semitone = int(np.round(100/self.labeling.f0_granularity_c))
31
+
32
+
33
+ def read_audio(self, audio):
34
+ """
35
+ Read and resample an audio file, convert to mono, and unfold into representation frames.
36
+ The time array represents the center of each small frame (5.8 ms hop length). This is different from the chunk-level
37
+ frames: a chunk-level frame covers the entire sequence the model sees, whereas predictions are made at the
38
+ small-frame interval (5.8 ms).
39
+ :param audio: str, pathlib.Path, np.ndarray, or torch.Tensor
40
+ :return: frames: (n_big_frames, frame_length), times: (n_small_frames,)
41
+ """
42
+ if isinstance(audio, str) or isinstance(audio, pathlib.Path):
43
+ audio, sample_rate = torchaudio.load(audio, normalize=True)
44
+ audio = audio.mean(axis=0) # convert to mono
45
+ if sample_rate != self.sr:
46
+ audio = torchaudio.functional.resample(audio, sample_rate, self.sr)
47
+ elif isinstance(audio, np.ndarray):
48
+ audio = torch.from_numpy(audio)
49
+ else:
50
+ assert isinstance(audio, torch.Tensor)
51
+ len_audio = audio.shape[-1]
52
+ n_frames = int(np.ceil((len_audio + sum(self.frame_overlap)) / (self.hop_length * self.chunk_size)))
53
+ audio = nn.functional.pad(audio, (self.frame_overlap[0],
54
+ self.frame_overlap[1] + (n_frames * self.hop_length * self.chunk_size) - len_audio))
55
+ frames = audio.unfold(0, self.max_window_size, self.hop_length*self.chunk_size)
56
+ times = np.arange(0, len_audio, self.hop_length) / self.sr # not tensor, we don't compute anything with it
57
+ return frames, times
58
+
59
+ def predict(self, audio, batch_size):
60
+ frames, times = self.read_audio(audio)
61
+ performance = {'f0': [], 'note': [], 'onset': [], 'offset': []}
62
+ self.eval()
63
+ device = self.main.conv0.conv2d.weight.device
64
+ with torch.no_grad():
65
+ for i in range(0, len(frames), batch_size):
66
+ f = frames[i:min(i + batch_size, len(frames))].to(device)
67
+ f -= (torch.mean(f, axis=1).unsqueeze(-1))
68
+ f /= (torch.std(f, axis=1).unsqueeze(-1))
69
+ out = self.forward(f)
70
+ for key, value in out.items():
71
+ value = torch.sigmoid(value)
72
+ value = torch.nan_to_num(value) # the model outputs nan when the frame is silent (this is an expected behavior due to normalization)
73
+ value = value.view(-1, value.shape[-1])
74
+ value = value.detach().cpu().numpy()
75
+ performance[key].append(value)
76
+ performance = {key: np.concatenate(value, axis=0)[:len(times)] for key, value in performance.items()}
77
+ performance['time'] = times
78
+ return performance
79
+
80
+ def estimate_pitch(self, audio, batch_size, viterbi=False):
81
+ out = self.predict(audio, batch_size)
82
+ f0_hz = self.out2f0(out, viterbi)
83
+ return out['time'], f0_hz
84
+
85
+ def out2f0(self, out, viterbi=False):
86
+ """
87
+ Monophonic f0 estimation from the model output. The viterbi postprocessing is specialized for the violin family.
88
+ """
89
+ salience = out['f0']
90
+ if viterbi == 'constrained':
91
+ assert hasattr(self, 'out2note')
92
+ notes = spotify_create_notes( out["note"], out["onset"], note_low=self.labeling.midi_centers[0],
93
+ note_high=self.labeling.midi_centers[-1], onset_thresh=0.5, frame_thresh=0.3,
94
+ infer_onsets=True, melodia_trick=True,
95
+ min_note_len=int(np.round(127.70 / 1000 * (self.sr / self.hop_length))))
96
+ note_cents = self.get_pitch_bends(salience, notes, to_midi=False, timing_refinement_range=0)
97
+ cents = np.zeros_like(out['time'])
98
+ cents[note_cents[:,0].astype(int)] = note_cents[:,1]
99
+ elif viterbi:
100
+ # transition probabilities inducing continuous pitch
101
+ # big changes are penalized with one order of magnitude
102
+ transition = gaussian_filter1d(np.eye(self.labeling.f0_n_bins), 30) + 99 * gaussian_filter1d(
103
+ np.eye(self.labeling.f0_n_bins), 2)
104
+ transition = transition / np.sum(transition, axis=1)[:, None]
105
+
106
+ p = salience / salience.sum(axis=1)[:, None]
107
+ p[np.isnan(p.sum(axis=1)), :] = np.ones(self.labeling.f0_n_bins) * 1 / self.labeling.f0_n_bins
108
+ path = viterbi_discriminative(p.T, transition)
109
+ cents = np.array([self.labeling.f0_label2c(salience[i, :], path[i]) for i in range(len(path))])
110
+ else:
111
+ cents = self.labeling.f0_label2c(salience, center=None) # use argmax for center
112
+
113
+ f0_hz = self.labeling.f0_c2hz(cents)
114
+ f0_hz[np.isnan(f0_hz)] = 0
115
+ return f0_hz
116
+
117
+ def get_pitch_bends(
118
+ self,
119
+ contours: np.ndarray, note_events: List[Tuple[int, int, int, float]],
120
+ timing_refinement_range: int = 0, to_midi: bool = True,
121
+ ) -> List[Tuple[int, int, int, float, Optional[List[int]]]]:
122
+ """Modified version of an excellent script from Spotify/basic_pitch!! Thank you!!!!
123
+ Given note events and contours, estimate pitch bends per note.
124
+ Pitch bends are represented as a sequence of evenly spaced midi pitch bend control units.
125
+ The time stamps of each pitch bend can be inferred by computing an evenly spaced grid between
126
+ the start and end times of each note event.
127
+ Args:
128
+ contours: Matrix of estimated pitch contours
129
+ note_events: note event tuple
130
+ timing_refinement_range: if > 0, refine onset/offset boundaries with f0 confidence
131
+ to_midi: whether to convert pitch bends to midi pitch bends. If False, return pitch estimates in the format
132
+ [time (index), pitch (Hz), confidence in range [0, 1]].
133
+ Returns:
134
+ note events with pitch bends
135
+ """
136
+
137
+ f0_matrix = [] # [time (index), pitch (Hz), confidence in range [0, 1]]
138
+ note_events_with_pitch_bends = []
139
+ for start_idx, end_idx, pitch_midi, amplitude in note_events:
140
+ if timing_refinement_range:
141
+ start_idx = np.max([0, start_idx - timing_refinement_range])
142
+ end_idx = np.min([contours.shape[0], end_idx + timing_refinement_range])
143
+ freq_idx = int(np.round(self.midi_pitch_to_contour_bin(pitch_midi)))
144
+ freq_start_idx = np.max([freq_idx - self.labeling.f0_tolerance_bins, 0])
145
+ freq_end_idx = np.min([self.labeling.f0_n_bins, freq_idx + self.labeling.f0_tolerance_bins + 1])
146
+
147
+ trans_start_idx = np.max([0, self.labeling.f0_tolerance_bins - freq_idx])
148
+ trans_end_idx = (2 * self.labeling.f0_tolerance_bins + 1) - \
149
+ np.max([0, freq_idx - (self.labeling.f0_n_bins - self.labeling.f0_tolerance_bins - 1)])
150
+
151
+ # apply regional viterbi to estimate the intonation
152
+ # observation probabilities come from the f0_roll matrix
153
+ observation = contours[start_idx:end_idx, freq_start_idx:freq_end_idx]
154
+ observation = observation / observation.sum(axis=1)[:, None]
155
+ observation[np.isnan(observation.sum(axis=1)), :] = np.ones(freq_end_idx - freq_start_idx) * 1 / (
156
+ freq_end_idx - freq_start_idx)
157
+
158
+ # transition probabilities assure continuity
159
+ transition = self.labeling.f0_transition_matrix[trans_start_idx:trans_end_idx,
160
+ trans_start_idx:trans_end_idx] + 1e-6
161
+ transition = transition / np.sum(transition, axis=1)[:, None]
162
+
163
+ path = viterbi_discriminative(observation.T / observation.sum(axis=1), transition) + freq_start_idx
164
+
165
+ cents = np.array([self.labeling.f0_label2c(contours[i + start_idx, :], path[i]) for i in range(len(path))])
166
+ bends = cents - self.labeling.midi_centers_c[pitch_midi - self.labeling.midi_centers[0]]
167
+ if to_midi:
168
+ bends = (bends * 4096 / 100).astype(int)
169
+ bends[bends > 8191] = 8191
170
+ bends[bends < -8192] = -8192
171
+
172
+ if timing_refinement_range:
173
+ confidences = np.array([contours[i + start_idx, path[i]] for i in range(len(path))])
174
+ threshold = np.median(confidences)
175
+ threshold = (np.median(confidences > threshold) + threshold) / 2 # some magic
176
+ median_kernel = 2 * (timing_refinement_range // 2) + 1 # some more magic
177
+ confidences = medfilt(confidences, kernel_size=median_kernel)
178
+ conf_bool = confidences > threshold
179
+ onset_idx = np.argmax(conf_bool)
180
+ offset_idx = len(confidences) - np.argmax(conf_bool[::-1])
181
+ bends = bends[onset_idx:offset_idx]
182
+ start_idx = start_idx + onset_idx
183
+ end_idx = start_idx + offset_idx
184
+
185
+ note_events_with_pitch_bends.append((start_idx, end_idx, pitch_midi, amplitude, bends))
186
+ else:
187
+ confidences = np.array([contours[i + start_idx, path[i]] for i in range(len(path))])
188
+ time_idx = np.arange(len(path)) + start_idx
189
+ # f0_hz = self.labeling.f0_c2hz(cents)
190
+ possible_f0s = np.array([time_idx, cents, confidences]).T
191
+ f0_matrix.append(possible_f0s[np.abs(bends)<100]) # filter out pitch bends that are too large
192
+ if not to_midi:
193
+ return np.vstack(f0_matrix)
194
+ else:
195
+ return note_events_with_pitch_bends
196
+
197
+
198
+ def midi_pitch_to_contour_bin(self, pitch_midi: int) -> np.array:
199
+ """Convert midi pitch to corresponding index in contour matrix
200
+ Args:
201
+ pitch_midi: pitch in midi
202
+ Returns:
203
+ index in contour matrix
204
+ """
205
+ pitch_hz = librosa.midi_to_hz(pitch_midi)
206
+ return np.argmin(np.abs(self.labeling.f0_centers_hz - pitch_hz))
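A typical monophonic-pitch call on the class above might look like this (a hypothetical sketch, not part of this upload; the file path is a placeholder, and model is assumed to be a PitchEstimator subclass such as the pretrained violin model):

times, f0_hz = model.estimate_pitch("performance.wav", batch_size=16, viterbi=True)
# times holds the frame centers in seconds; f0_hz holds the per-frame pitch in Hz (0 where unvoiced)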
musc/postprocessing.py ADDED
@@ -0,0 +1,533 @@
1
+ from typing import List, Tuple
2
+ import scipy
3
+ import numpy as np
4
+
5
+
6
+ # SPOTIFY
7
+
8
+ def get_inferred_onsets(onset_roll: np.array, note_roll: np.array, n_diff: int = 2) -> np.array:
9
+ """
10
+ Infer onsets from large changes in note roll matrix amplitudes.
11
+ Modified from https://github.com/spotify/basic-pitch/blob/main/basic_pitch/note_creation.py
12
+ :param onset_roll: Onset activation matrix (n_times, n_freqs).
13
+ :param note_roll: Frame-level note activation matrix (n_times, n_freqs).
14
+ :param n_diff: Number of successive frame differences used to detect onsets.
15
+ :return: The maximum between the predicted onsets and its differences.
16
+ """
17
+
18
+ diffs = []
19
+ for n in range(1, n_diff + 1):
20
+ frames_appended = np.concatenate([np.zeros((n, note_roll.shape[1])), note_roll])
21
+ diffs.append(frames_appended[n:, :] - frames_appended[:-n, :])
22
+ frame_diff = np.min(diffs, axis=0)
23
+ frame_diff[frame_diff < 0] = 0
24
+ frame_diff[:n_diff, :] = 0
25
+ frame_diff = np.max(onset_roll) * frame_diff / np.max(frame_diff) # rescale to have the same max as onsets
26
+
27
+ max_onsets_diff = np.max([onset_roll, frame_diff],
28
+ axis=0) # use the max of the predicted onsets and the differences
29
+
30
+ return max_onsets_diff
31
+
32
+
33
+
34
+ def spotify_create_notes(
35
+ note_roll: np.array,
36
+ onset_roll: np.array,
37
+ onset_thresh: float,
38
+ frame_thresh: float,
39
+ min_note_len: int,
40
+ infer_onsets: bool,
41
+ note_low : int, #self.labeling.midi_centers[0]
42
+ note_high : int, #self.labeling.midi_centers[-1],
43
+ melodia_trick: bool = True,
44
+ energy_tol: int = 11,
45
+ ) -> List[Tuple[int, int, int, float]]:
46
+ """Decode raw model output to polyphonic note events
47
+ Modified from https://github.com/spotify/basic-pitch/blob/main/basic_pitch/note_creation.py
48
+ Args:
49
+ note_roll: Frame activation matrix (n_times, n_freqs).
50
+ onset_roll: Onset activation matrix (n_times, n_freqs).
51
+ onset_thresh: Minimum amplitude of an onset activation to be considered an onset.
52
+ frame_thresh: Minimum amplitude of a frame activation for a note to remain "on".
53
+ min_note_len: Minimum allowed note length in frames.
54
+ infer_onsets: If True, add additional onsets when there are large differences in frame amplitudes.
55
+ melodia_trick : Whether to use the melodia trick to better detect notes.
56
+ energy_tol: Number of consecutive frames allowed below frame_thresh before a note is ended.
57
+ Returns:
58
+ list of tuples [(start_time_frames, end_time_frames, pitch_midi, amplitude)]
59
+ representing the note events, where amplitude is a number between 0 and 1
60
+ """
61
+
62
+ n_frames = note_roll.shape[0]
63
+
64
+ # use onsets inferred from frames in addition to the predicted onsets
65
+ if infer_onsets:
66
+ onset_roll = get_inferred_onsets(onset_roll, note_roll)
67
+
68
+ peak_thresh_mat = np.zeros(onset_roll.shape)
69
+ peaks = scipy.signal.argrelmax(onset_roll, axis=0)
70
+ peak_thresh_mat[peaks] = onset_roll[peaks]
71
+
72
+ onset_idx = np.where(peak_thresh_mat >= onset_thresh)
73
+ onset_time_idx = onset_idx[0][::-1] # sort to go backwards in time
74
+ onset_freq_idx = onset_idx[1][::-1] # sort to go backwards in time
75
+
76
+ remaining_energy = np.zeros(note_roll.shape)
77
+ remaining_energy[:, :] = note_roll[:, :]
78
+
79
+ # loop over onsets
80
+ note_events = []
81
+ for note_start_idx, freq_idx in zip(onset_time_idx, onset_freq_idx):
82
+ # if we're too close to the end of the audio, continue
83
+ if note_start_idx >= n_frames - 1:
84
+ continue
85
+
86
+ # find time index at this frequency band where the frames drop below an energy threshold
87
+ i = note_start_idx + 1
88
+ k = 0 # number of frames since energy dropped below threshold
89
+ while i < n_frames - 1 and k < energy_tol:
90
+ if remaining_energy[i, freq_idx] < frame_thresh:
91
+ k += 1
92
+ else:
93
+ k = 0
94
+ i += 1
95
+
96
+ i -= k # go back to frame above threshold
97
+
98
+ # if the note is too short, skip it
99
+ if i - note_start_idx <= min_note_len:
100
+ continue
101
+
102
+ remaining_energy[note_start_idx:i, freq_idx] = 0
103
+ if freq_idx < note_high:
104
+ remaining_energy[note_start_idx:i, freq_idx + 1] = 0
105
+ if freq_idx > note_low:
106
+ remaining_energy[note_start_idx:i, freq_idx - 1] = 0
107
+
108
+ # add the note
109
+ amplitude = np.mean(note_roll[note_start_idx:i, freq_idx])
110
+ note_events.append(
111
+ (
112
+ note_start_idx,
113
+ i,
114
+ freq_idx + note_low,
115
+ amplitude,
116
+ )
117
+ )
118
+
119
+ if melodia_trick:
120
+ energy_shape = remaining_energy.shape
121
+
122
+ while np.max(remaining_energy) > frame_thresh:
123
+ i_mid, freq_idx = np.unravel_index(np.argmax(remaining_energy), energy_shape)
124
+ remaining_energy[i_mid, freq_idx] = 0
125
+
126
+ # forward pass
127
+ i = i_mid + 1
128
+ k = 0
129
+ while i < n_frames - 1 and k < energy_tol:
130
+ if remaining_energy[i, freq_idx] < frame_thresh:
131
+ k += 1
132
+ else:
133
+ k = 0
134
+
135
+ remaining_energy[i, freq_idx] = 0
136
+ if freq_idx < note_high:
137
+ remaining_energy[i, freq_idx + 1] = 0
138
+ if freq_idx > note_low:
139
+ remaining_energy[i, freq_idx - 1] = 0
140
+
141
+ i += 1
142
+
143
+ i_end = i - 1 - k # go back to frame above threshold
144
+
145
+ # backward pass
146
+ i = i_mid - 1
147
+ k = 0
148
+ while i > 0 and k < energy_tol:
149
+ if remaining_energy[i, freq_idx] < frame_thresh:
150
+ k += 1
151
+ else:
152
+ k = 0
153
+
154
+ remaining_energy[i, freq_idx] = 0
155
+ if freq_idx < note_high:
156
+ remaining_energy[i, freq_idx + 1] = 0
157
+ if freq_idx > note_low:
158
+ remaining_energy[i, freq_idx - 1] = 0
159
+
160
+ i -= 1
161
+
162
+ i_start = i + 1 + k # go back to frame above threshold
163
+ assert i_start >= 0, "{}".format(i_start)
164
+ assert i_end < n_frames
165
+
166
+ if i_end - i_start <= min_note_len:
167
+ # note is too short, skip it
168
+ continue
169
+
170
+ # add the note
171
+ amplitude = np.mean(note_roll[i_start:i_end, freq_idx])
172
+ note_events.append(
173
+ (
174
+ i_start,
175
+ i_end,
176
+ freq_idx + note_low,
177
+ amplitude,
178
+ )
179
+ )
180
+
181
+ return note_events
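# Hypothetical usage sketch (not part of this upload): PitchEstimator.out2f0 calls this function
# roughly as follows, where out is the dictionary returned by predict() and labeling is a
# PerformanceLabel instance; min_note_len=22 corresponds to about 128 ms at a 44.1 kHz / 256-sample hop.
notes = spotify_create_notes(out['note'], out['onset'],
                             onset_thresh=0.5, frame_thresh=0.3,
                             min_note_len=22, infer_onsets=True,
                             note_low=labeling.midi_centers[0],
                             note_high=labeling.midi_centers[-1])
# each returned tuple is (start_frame, end_frame, midi_pitch, amplitude)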
182
+
183
+
184
+
185
+ # TIKTOK
186
+
187
+
188
+ def note_detection_with_onset_offset_regress(frame_output, onset_output,
189
+ onset_shift_output, offset_output, offset_shift_output, velocity_output,
190
+ frame_threshold):
191
+ """Convert prediction matrices to note event information.
192
+ First, detect onsets with onset outputs. Then, detect offsets
193
+ with frame and offset outputs.
194
+
195
+ Args:
196
+ frame_output: (frames_num,)
197
+ onset_output: (frames_num,)
198
+ onset_shift_output: (frames_num,)
199
+ offset_output: (frames_num,)
200
+ offset_shift_output: (frames_num,)
201
+ velocity_output: (frames_num,)
202
+ frame_threshold: float
203
+ Returns:
204
+ output_tuples: list of [bgn, fin, onset_shift, offset_shift, normalized_velocity],
205
+ e.g., [
206
+ [1821, 1909, 0.47498, 0.3048533, 0.72119445],
207
+ [1909, 1947, 0.30730522, -0.45764327, 0.64200014],
208
+ ...]
209
+ """
210
+ output_tuples = []
211
+ bgn = None
212
+ frame_disappear = None
213
+ offset_occur = None
214
+
215
+ for i in range(onset_output.shape[0]):
216
+ if onset_output[i] == 1:
217
+ """Onset detected"""
218
+ if bgn:
219
+ """Consecutive onsets. E.g., pedal is not released, but two
220
+ consecutive notes being played."""
221
+ fin = max(i - 1, 0)
222
+ output_tuples.append([bgn, fin, onset_shift_output[bgn],
223
+ 0, velocity_output[bgn]])
224
+ frame_disappear, offset_occur = None, None
225
+ bgn = i
226
+
227
+ if bgn and i > bgn:
228
+ """If onset found, then search offset"""
229
+ if frame_output[i] <= frame_threshold and not frame_disappear:
230
+ """Frame disappear detected"""
231
+ frame_disappear = i
232
+
233
+ if offset_output[i] == 1 and not offset_occur:
234
+ """Offset detected"""
235
+ offset_occur = i
236
+
237
+ if frame_disappear:
238
+ if offset_occur and offset_occur - bgn > frame_disappear - offset_occur:
239
+ """bgn --------- offset_occur --- frame_disappear"""
240
+ fin = offset_occur
241
+ else:
242
+ """bgn --- offset_occur --------- frame_disappear"""
243
+ fin = frame_disappear
244
+ output_tuples.append([bgn, fin, onset_shift_output[bgn],
245
+ offset_shift_output[fin], velocity_output[bgn]])
246
+ bgn, frame_disappear, offset_occur = None, None, None
247
+
248
+ if bgn and (i - bgn >= 600 or i == onset_output.shape[0] - 1):
249
+ """Offset not detected"""
250
+ fin = i
251
+ output_tuples.append([bgn, fin, onset_shift_output[bgn],
252
+ offset_shift_output[fin], velocity_output[bgn]])
253
+ bgn, frame_disappear, offset_occur = None, None, None
254
+
255
+ # Sort pairs by onsets
256
+ output_tuples.sort(key=lambda pair: pair[0])
257
+
258
+ return output_tuples
259
+
260
+
261
+ class RegressionPostProcessor(object):
262
+ def __init__(self, frames_per_second, classes_num, onset_threshold,
263
+ offset_threshold, frame_threshold, pedal_offset_threshold,
264
+ begin_note):
265
+ """Postprocess the output probabilities of a transcription model to MIDI
266
+ events.
267
+
268
+ Args:
269
+ frames_per_second: float
270
+ classes_num: int
271
+ onset_threshold: float
272
+ offset_threshold: float
273
+ frame_threshold: float
274
+ pedal_offset_threshold: float
275
+ """
276
+ self.frames_per_second = frames_per_second
277
+ self.classes_num = classes_num
278
+ self.onset_threshold = onset_threshold
279
+ self.offset_threshold = offset_threshold
280
+ self.frame_threshold = frame_threshold
281
+ self.pedal_offset_threshold = pedal_offset_threshold
282
+ self.begin_note = begin_note
283
+ self.velocity_scale = 128
284
+
285
+ def output_dict_to_midi_events(self, output_dict):
286
+ """Main function. Post process model outputs to MIDI events.
287
+
288
+ Args:
289
+ output_dict: {
290
+ 'reg_onset_output': (segment_frames, classes_num),
291
+ 'reg_offset_output': (segment_frames, classes_num),
292
+ 'frame_output': (segment_frames, classes_num),
293
+ 'velocity_output': (segment_frames, classes_num),
294
+ 'reg_pedal_onset_output': (segment_frames, 1),
295
+ 'reg_pedal_offset_output': (segment_frames, 1),
296
+ 'pedal_frame_output': (segment_frames, 1)}
297
+
298
+ Outputs:
299
+ est_note_events: list of dict, e.g. [
300
+ {'onset_time': 39.74, 'offset_time': 39.87, 'midi_note': 27, 'velocity': 83},
301
+ {'onset_time': 11.98, 'offset_time': 12.11, 'midi_note': 33, 'velocity': 88}]
302
+
303
+ est_pedal_events: list of dict, e.g. [
304
+ {'onset_time': 0.17, 'offset_time': 0.96},
305
+ {'onset_time': 1.17, 'offset_time': 2.65}]
306
+ """
307
+ output_dict['frame_output'] = output_dict['note']
308
+ output_dict['velocity_output'] = output_dict['note']
309
+ output_dict['reg_onset_output'] = output_dict['onset']
310
+ output_dict['reg_offset_output'] = output_dict['offset']
311
+ # Post process piano note outputs to piano note and pedal events information
312
+ (est_on_off_note_vels, est_pedal_on_offs) = \
313
+ self.output_dict_to_note_pedal_arrays(output_dict)
314
+ """est_on_off_note_vels: (events_num, 4), the four columns are: [onset_time, offset_time, piano_note, velocity],
315
+ est_pedal_on_offs: (pedal_events_num, 2), the two columns are: [onset_time, offset_time]"""
316
+
317
+ # Reformat notes to MIDI events
318
+ est_note_events = self.detected_notes_to_events(est_on_off_note_vels)
319
+
320
+ if est_pedal_on_offs is None:
321
+ est_pedal_events = None
322
+ else:
323
+ est_pedal_events = self.detected_pedals_to_events(est_pedal_on_offs)
324
+
325
+ return est_note_events, est_pedal_events
326
+
327
+ def output_dict_to_note_pedal_arrays(self, output_dict):
328
+ """Postprocess the output probabilities of a transcription model to MIDI
329
+ events.
330
+
331
+ Args:
332
+ output_dict: dict, {
333
+ 'reg_onset_output': (frames_num, classes_num),
334
+ 'reg_offset_output': (frames_num, classes_num),
335
+ 'frame_output': (frames_num, classes_num),
336
+ 'velocity_output': (frames_num, classes_num),
337
+ ...}
338
+
339
+ Returns:
340
+ est_on_off_note_vels: (events_num, 4), the 4 columns are onset_time,
341
+ offset_time, piano_note and velocity. E.g. [
342
+ [39.74, 39.87, 27, 0.65],
343
+ [11.98, 12.11, 33, 0.69],
344
+ ...]
345
+
346
+ est_pedal_on_offs: (pedal_events_num, 2), the 2 columns are onset_time
347
+ and offset_time. E.g. [
348
+ [0.17, 0.96],
349
+ [1.17, 2.65],
350
+ ...]
351
+ """
352
+
353
+ # ------ 1. Process regression outputs to binarized outputs ------
354
+ # For example, onset or offset of [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, 0.05, 0., 0.]
355
+ # will be processed to [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]
356
+
357
+ # Calculate binarized onset output from regression output
358
+ (onset_output, onset_shift_output) = \
359
+ self.get_binarized_output_from_regression(
360
+ reg_output=output_dict['reg_onset_output'],
361
+ threshold=self.onset_threshold, neighbour=2)
362
+
363
+ output_dict['onset_output'] = onset_output # Values are 0 or 1
364
+ output_dict['onset_shift_output'] = onset_shift_output
365
+
366
+ # Calculate binarized offset output from regression output
367
+ (offset_output, offset_shift_output) = \
368
+ self.get_binarized_output_from_regression(
369
+ reg_output=output_dict['reg_offset_output'],
370
+ threshold=self.offset_threshold, neighbour=4)
371
+
372
+ output_dict['offset_output'] = offset_output # Values are 0 or 1
373
+ output_dict['offset_shift_output'] = offset_shift_output
374
+
375
+ if 'reg_pedal_onset_output' in output_dict.keys():
376
+ """Pedal onsets are not used in inference. Instead, frame-wise pedal
377
+ predictions are used to detect onsets. We empirically found this is
378
+ more accurate to detect pedal onsets."""
379
+ pass
380
+
381
+ if 'reg_pedal_offset_output' in output_dict.keys():
382
+ # Calculate binarized pedal offset output from regression output
383
+ (pedal_offset_output, pedal_offset_shift_output) = \
384
+ self.get_binarized_output_from_regression(
385
+ reg_output=output_dict['reg_pedal_offset_output'],
386
+ threshold=self.pedal_offset_threshold, neighbour=4)
387
+
388
+ output_dict['pedal_offset_output'] = pedal_offset_output # Values are 0 or 1
389
+ output_dict['pedal_offset_shift_output'] = pedal_offset_shift_output
390
+
391
+ # ------ 2. Process matrices results to event results ------
392
+ # Detect piano notes from output_dict
393
+ est_on_off_note_vels = self.output_dict_to_detected_notes(output_dict)
394
+
395
+ est_pedal_on_offs = None
396
+
397
+ return est_on_off_note_vels, est_pedal_on_offs
398
+
399
+ def get_binarized_output_from_regression(self, reg_output, threshold, neighbour):
400
+ """Calculate binarized output and shifts of onsets or offsets from the
401
+ regression results.
402
+
403
+ Args:
404
+ reg_output: (frames_num, classes_num)
405
+ threshold: float
406
+ neighbour: int
407
+
408
+ Returns:
409
+ binary_output: (frames_num, classes_num)
410
+ shift_output: (frames_num, classes_num)
411
+ """
412
+ binary_output = np.zeros_like(reg_output)
413
+ shift_output = np.zeros_like(reg_output)
414
+ (frames_num, classes_num) = reg_output.shape
415
+
416
+ for k in range(classes_num):
417
+ x = reg_output[:, k]
418
+ for n in range(neighbour, frames_num - neighbour):
419
+ if x[n] > threshold and self.is_monotonic_neighbour(x, n, neighbour):
420
+ binary_output[n, k] = 1
421
+
422
+ """See Section III-D in [1] for deduction.
423
+ [1] Q. Kong, et al., High-resolution Piano Transcription
424
+ with Pedals by Regressing Onsets and Offsets Times, 2020."""
425
+ if x[n - 1] > x[n + 1]:
426
+ shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n + 1]) / 2
427
+ else:
428
+ shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n - 1]) / 2
429
+ shift_output[n, k] = shift
430
+
431
+ return binary_output, shift_output
432
+
433
+ def is_monotonic_neighbour(self, x, n, neighbour):
434
+ """Detect if values are monotonic in both side of x[n].
435
+
436
+ Args:
437
+ x: (frames_num,)
438
+ n: int
439
+ neighbour: int
440
+
441
+ Returns:
442
+ monotonic: bool
443
+ """
444
+ monotonic = True
445
+ for i in range(neighbour):
446
+ if x[n - i] < x[n - i - 1]:
447
+ monotonic = False
448
+ if x[n + i] < x[n + i + 1]:
449
+ monotonic = False
450
+
451
+ return monotonic
452
+
453
+ def output_dict_to_detected_notes(self, output_dict):
454
+ """Postprocess output_dict to piano notes.
455
+
456
+ Args:
457
+ output_dict: dict, e.g. {
458
+ 'onset_output': (frames_num, classes_num),
459
+ 'onset_shift_output': (frames_num, classes_num),
460
+ 'offset_output': (frames_num, classes_num),
461
+ 'offset_shift_output': (frames_num, classes_num),
462
+ 'frame_output': (frames_num, classes_num),
463
+ 'onset_output': (frames_num, classes_num),
464
+ ...}
465
+
466
+ Returns:
467
+ est_on_off_note_vels: (notes, 4), the four columns are onsets, offsets,
468
+ MIDI notes and velocities. E.g.,
469
+ [[39.7375, 39.7500, 27., 0.6638],
470
+ [11.9824, 12.5000, 33., 0.6892],
471
+ ...]
472
+ """
473
+
474
+ est_tuples = []
475
+ est_midi_notes = []
476
+ classes_num = output_dict['frame_output'].shape[-1]
477
+
478
+ for piano_note in range(classes_num):
479
+ """Detect piano notes"""
480
+ est_tuples_per_note = note_detection_with_onset_offset_regress(
481
+ frame_output=output_dict['frame_output'][:, piano_note],
482
+ onset_output=output_dict['onset_output'][:, piano_note],
483
+ onset_shift_output=output_dict['onset_shift_output'][:, piano_note],
484
+ offset_output=output_dict['offset_output'][:, piano_note],
485
+ offset_shift_output=output_dict['offset_shift_output'][:, piano_note],
486
+ velocity_output=output_dict['velocity_output'][:, piano_note],
487
+ frame_threshold=self.frame_threshold)
488
+
489
+ est_tuples += est_tuples_per_note
490
+ est_midi_notes += [piano_note + self.begin_note] * len(est_tuples_per_note)
491
+
492
+ est_tuples = np.array(est_tuples) # (notes, 5)
493
+ """(notes, 5), the five columns are onset, offset, onset_shift,
494
+ offset_shift and normalized_velocity"""
495
+
496
+ est_midi_notes = np.array(est_midi_notes) # (notes,)
497
+
498
+ onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
499
+ offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
500
+ velocities = est_tuples[:, 4]
501
+
502
+ est_on_off_note_vels = np.stack((onset_times, offset_times, est_midi_notes, velocities), axis=-1)
503
+ """(notes, 4), the four columns are onset_times, offset_times, midi_notes and velocities."""
504
+
505
+ est_on_off_note_vels = est_on_off_note_vels.astype(np.float32)
506
+
507
+ return est_on_off_note_vels
508
+
509
+ def detected_notes_to_events(self, est_on_off_note_vels):
510
+ """Reformat detected notes to midi events.
511
+
512
+ Args:
513
+ est_on_off_note_vels: (notes, 4), the four columns are onset_times,
514
+ offset_times, midi_notes and velocities. E.g.
515
+ [[32.8376, 35.7700, 0.7932],
516
+ [37.3712, 39.9300, 0.8058],
517
+ ...]
518
+
519
+ Returns:
520
+ midi_events, list, e.g.,
521
+ [{'onset_time': 39.7376, 'offset_time': 39.75, 'midi_note': 27, 'velocity': 84},
522
+ {'onset_time': 11.9824, 'offset_time': 12.50, 'midi_note': 33, 'velocity': 88},
523
+ ...]
524
+ """
525
+ midi_events = []
526
+ for i in range(est_on_off_note_vels.shape[0]):
527
+ midi_events.append({
528
+ 'onset_time': est_on_off_note_vels[i][0],
529
+ 'offset_time': est_on_off_note_vels[i][1],
530
+ 'midi_note': int(est_on_off_note_vels[i][2]),
531
+ 'velocity': int(est_on_off_note_vels[i][3] * self.velocity_scale)})
532
+
533
+ return midi_events
musc/representations.py ADDED
@@ -0,0 +1,212 @@
1
+ from mir_eval import melody
2
+ import numpy as np
3
+ from scipy.stats import norm
4
+ import librosa
5
+ import pretty_midi
6
+ from scipy.ndimage import gaussian_filter1d
7
+
8
+
9
+ class PerformanceLabel:
10
+ """
11
+ The dataset labeling class for performance representations. It currently includes onset, note, and fine-grained f0
12
+ representations. The note_min, note_max, and f0_bins_per_semitone values should be set per instrument. The default
13
+ values are for violin performance analysis. Fretted instruments might not require such a fine f0 resolution per semitone.
14
+ """
15
+ def __init__(self, note_min='F#3', note_max='C8', f0_bins_per_semitone=9, f0_smooth_std_c=None,
16
+ onset_smooth_std=0.7, f0_tolerance_c=200):
17
+ midi_min = pretty_midi.note_name_to_number(note_min)
18
+ midi_max = pretty_midi.note_name_to_number(note_max)
19
+ self.midi_centers = np.arange(midi_min, midi_max)
20
+ self.onset_smooth_std=onset_smooth_std # onset smoothing along time axis (compensate for alignment)
21
+
22
+ f0_hz_range = librosa.note_to_hz([note_min, note_max])
23
+ f0_c_min, f0_c_max = melody.hz2cents(f0_hz_range)
24
+ self.f0_granularity_c = 100/f0_bins_per_semitone
25
+ if not f0_smooth_std_c:
26
+ f0_smooth_std_c = self.f0_granularity_c * 5/4 # Keep the ratio from the CREPE paper (20 cents and 25 cents)
27
+ self.f0_smooth_std_c = f0_smooth_std_c
28
+
29
+ self.f0_centers_c = np.arange(f0_c_min, f0_c_max, self.f0_granularity_c)
30
+ self.f0_centers_hz = 10 * 2 ** (self.f0_centers_c / 1200)
31
+ self.f0_n_bins = len(self.f0_centers_c)
32
+
33
+ self.pdf_normalizer = norm.pdf(0)
34
+
35
+ self.f0_c2hz = lambda c: 10*2**(c/1200)
36
+ self.f0_hz2c = melody.hz2cents
37
+ self.midi_centers_c = self.f0_hz2c(librosa.midi_to_hz(self.midi_centers))
38
+
39
+ self.f0_tolerance_bins = int(f0_tolerance_c/self.f0_granularity_c)
40
+ self.f0_transition_matrix = gaussian_filter1d(np.eye(2*self.f0_tolerance_bins + 1), 25/self.f0_granularity_c)
41
+
42
+ def f0_c2label(self, pitch_c):
43
+ """
44
+ Convert a single f0 value in cents to a one-hot label vector with smoothing (i.e., create a gaussian blur around
45
+ the target f0 bin for regularization and training stability). The blur is controlled by self.f0_smooth_std_c.
46
+ :param pitch_c: a single pitch value in cents
47
+ :return: one-hot label vector with frequency blur
48
+ """
49
+ result = norm.pdf((self.f0_centers_c - pitch_c) / self.f0_smooth_std_c).astype(np.float32)
50
+ result /= self.pdf_normalizer
51
+ return result
52
+
53
+ def f0_label2c(self, salience, center=None):
54
+ """
55
+ Convert the salience predictions to monophonic f0 in cents. Only outputs a single f0 value per frame!
56
+ :param salience: f0 activations
57
+ :param center: f0 center bin to calculate the weighted average. Use argmax if empty
58
+ :return: f0 array per frame (in cents).
59
+ """
60
+ if salience.ndim == 1:
61
+ if center is None:
62
+ center = int(np.argmax(salience))
63
+ start = max(0, center - 4)
64
+ end = min(len(salience), center + 5)
65
+ salience = salience[start:end]
66
+ product_sum = np.sum(salience * self.f0_centers_c[start:end])
67
+ weight_sum = np.sum(salience)
68
+ return product_sum / np.clip(weight_sum, 1e-8, None)
69
+ if salience.ndim == 2:
70
+ return np.array([self.f0_label2c(salience[i, :]) for i in range(salience.shape[0])])
71
+ raise Exception("label should be either 1d or 2d ndarray")
72
+
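# Hypothetical round-trip check for the two f0 label helpers above (not part of this upload):
lab = PerformanceLabel()
a4_cents = lab.f0_hz2c(np.array([440.0]))[0]   # A4, in cents above the 10 Hz reference
target = lab.f0_c2label(a4_cents)              # gaussian bump centered on the closest f0 bin
recovered = lab.f0_label2c(target)             # weighted-average decoding of the label
# recovered should land within about one bin (f0_granularity_c cents) of a4_cents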
73
+ def fill_onset_matrix(self, onsets, window, feature_rate):
74
+ """
75
+ Create a sparse onset matrix from window and onsets (per-semitone). Apply a gaussian smoothing (along time)
76
+ so that we can tolerate better the alignment problems. This is similar to the frequency smoothing for the f0.
77
+ The temporal smoothing is controlled by the parameter self.onset_smooth_std
78
+ :param onsets: A 2d np.array of individual note onsets with their respective time values
79
+ (Nx2: time in seconds - midi number)
80
+ :param window: Timestamps for the frame centers of the sparse matrix
81
+ :param feature_rate: Window timestamps are integer, this is to convert them to seconds
82
+ :return: onset_roll: A sparse matrix filled with temporally blurred onsets.
83
+ """
84
+ onsets = self.get_window_feats(onsets, window, feature_rate)
85
+ onset_roll = np.zeros((len(window), len(self.midi_centers)))
86
+ for onset in onsets:
87
+ onset, note = onset # it was a pair with time and midi note
88
+ if self.midi_centers[0] < note < self.midi_centers[-1]: # midi note should be in the range defined
89
+ note = int(note) - self.midi_centers[0] # find the note index in our range
90
+ onset = (onset*feature_rate)-window[0] # onset index (as float but in frames, not in seconds!)
91
+ start = max(0, int(onset) - 3)
92
+ end = min(len(window) - 1, int(onset) + 3)
93
+ try:
94
+ vals = norm.pdf(np.linspace(start - onset, end - onset, end - start + 1) / self.onset_smooth_std)
95
+ # if you increase 0.7 you smooth the peak
96
+ # if you decrease it, e.g., 0.1, it becomes too peaky! around 0.5-0.7 seems ok
97
+ vals /= self.pdf_normalizer
98
+ onset_roll[start:end + 1, note] += vals
99
+ except ValueError:
100
+ print('start',start, 'onset', onset, 'end', end)
101
+ return onset_roll, onsets
102
+
103
+ def fill_note_matrix(self, notes, window, feature_rate):
104
+ """
105
+ Create the note matrix (piano roll) from window timestamps and note values per frame.
106
+ :param notes: A 2d np.array of individual notes with their active time values Nx2
107
+ :param window: Timestamps for the frame centers of the output
108
+ :param feature_rate: Window timestamps are integer, this is to convert them to seconds
109
+ :return note_roll: The piano roll in the defined range of [note_min, note_max).
110
+ """
111
+ notes = self.get_window_feats(notes, window, feature_rate)
112
+
113
+ # take the notes in the midi range defined
114
+ notes = notes[np.logical_and(notes[:,1]>=self.midi_centers[0], notes[:,1]<=self.midi_centers[-1]),:]
115
+
116
+ times = (notes[:,0]*feature_rate - window[0]).astype(int) # in feature samples (fs:self.hop/self.sr)
117
+ notes = (notes[:,1] - self.midi_centers[0]).astype(int)
118
+
119
+ note_roll = np.zeros((len(window), len(self.midi_centers)))
120
+ note_roll[(times, notes)] = 1
121
+ return note_roll, notes
122
+
123
+
124
+ def fill_f0_matrix(self, f0s, window, feature_rate):
125
+ """
126
+ Unlike the labels for onsets and notes, f0 label is only relevant for strictly monophonic regions! Thus, this
127
+ function returns a boolean which represents where to apply the given values.
128
+ Never back-propagate without the boolean! Empty frames mean that the label is not that reliable.
129
+
130
+ :param f0s: A 2d np.array of f0 values with the time they belong to (2xN: time in seconds - f0 in Hz)
131
+ :param window: Timestamps for the frame centers of the output
132
+ :param feature_rate: Window timestamps are integer, this is to convert them to seconds
133
+
134
+ :return f0_roll: f0 label matrix and
135
+ f0_hz: f0 values in Hz
136
+ annotation_bool: A boolean array representing which frames have reliable f0 annotations.
137
+ """
138
+ f0s = self.get_window_feats(f0s, window, feature_rate)
139
+ f0_cents = np.zeros_like(window, dtype=float)
140
+ f0s[:,1] = self.f0_hz2c(f0s[:,1]) # convert f0 in hz to cents
141
+
142
+ annotation_bool = np.zeros_like(window, dtype=bool)
143
+ f0_roll = np.zeros((len(window), len(self.f0_centers_c)))
144
+ times_in_frame = f0s[:, 0]*feature_rate - window[0]
145
+ for t, f0 in enumerate(f0s):
146
+ t = times_in_frame[t]
147
+ if t%1 < 0.25: # only consider it as an annotation if the f0 value is really close to the frame center
148
+ t = int(np.round(t))
149
+ f0_roll[t] = self.f0_c2label(f0[1])
150
+ annotation_bool[t] = True
151
+ f0_cents[t] = f0[1]
152
+
153
+ return f0_roll, f0_cents, annotation_bool
154
+
155
+
156
+ @staticmethod
157
+ def get_window_feats(time_feature_matrix, window, feature_rate):
158
+ """
159
+ Restrict the feature matrix to the features that are inside the window
160
+ :param window: Timestamps for the frame centers of the output
161
+ :param time_feature_matrix: A 2d array (Nx2) of time-stamped features covering the entire file
162
+ :param feature_rate: Window timestamps are integers; this is used to convert them to seconds
163
+ :return: window_features: the features inside the given window
164
+ """
165
+ start = time_feature_matrix[:,0]>(window[0]-0.5)/feature_rate
166
+ end = time_feature_matrix[:,0]<(window[-1]+0.5)/feature_rate
167
+ window_features = np.logical_and(start, end)
168
+ window_features = np.array(time_feature_matrix[window_features,:])
169
+ return window_features
170
+
171
+ def represent_midi(self, midi, feature_rate):
172
+ """
173
+ Represent a midi file as sparse matrices of onsets, offsets, and notes. No f0 is included.
174
+ :param midi: A midi file (either a path or a pretty_midi.PrettyMIDI object)
175
+ :param feature_rate: The feature rate in Hz
176
+ :return: dict {midi, note, onset, offset, time, start_anchor, end_anchor}: same format as the model's training labels and outputs
177
+ """
178
+ def _get_onsets_offsets_frames(midi_content):
179
+ if isinstance(midi_content, str):
180
+ midi_content = pretty_midi.PrettyMIDI(midi_content)
181
+ onsets = []
182
+ offsets = []
183
+ frames = []
184
+ for instrument in midi_content.instruments:
185
+ for note in instrument.notes:
186
+ start = int(np.round(note.start * feature_rate))
187
+ end = int(np.round(note.end * feature_rate))
188
+ note_times = (np.arange(start, end+0.5)/feature_rate)[:, np.newaxis]
189
+ note_pitch = np.full_like(note_times, fill_value=note.pitch)
190
+ onsets.append([note.start, note.pitch])
191
+ offsets.append([note.end, note.pitch])
192
+ frames.append(np.hstack([note_times, note_pitch]))
193
+ onsets = np.vstack(onsets)
194
+ offsets = np.vstack(offsets)
195
+ frames = np.vstack(frames)
196
+ return onsets, offsets, frames, midi_content
197
+ onset_array, offset_array, frame_array, midi_object = _get_onsets_offsets_frames(midi)
198
+ window = np.arange(frame_array[0, 0]*feature_rate, frame_array[-1, 0]*feature_rate, dtype=int)
199
+ onset_roll, _ = self.fill_onset_matrix(onset_array, window, feature_rate)
200
+ offset_roll, _ = self.fill_onset_matrix(offset_array, window, feature_rate)
201
+ note_roll, _ = self.fill_note_matrix(frame_array, window, feature_rate)
202
+ start_anchor = onset_array[onset_array[:, 0]==np.min(onset_array[:, 0])]
203
+ end_anchor = offset_array[offset_array[:, 0]==np.max(offset_array[:, 0])]
204
+ return {
205
+ 'midi': midi_object,
206
+ 'note': note_roll,
207
+ 'onset': onset_roll,
208
+ 'offset': offset_roll,
209
+ 'time': window/feature_rate,
210
+ 'start_anchor': start_anchor,
211
+ 'end_anchor': end_anchor
212
+ }
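
A hedged usage sketch (not part of the commit) of the label-representation class defined above; the `labeling` instance and the 100 Hz feature rate (sr=16000, hop=160) are assumptions for illustration:

# minimal sketch, assuming `labeling` is an instance of the class above
feature_rate = 16000 / 160                      # frames per second (assumed)
rep = labeling.represent_midi("score.mid", feature_rate)
print(rep["note"].shape)                        # (n_frames, n_midi_centers) piano roll
print(rep["onset"].shape)                       # temporally blurred onset matrix
print(rep["start_anchor"], rep["end_anchor"])   # first/last chord, later used as DTW anchors
print(rep["time"][:3])                          # frame centers in seconds
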
musc/synchronizer.py ADDED
@@ -0,0 +1,299 @@
1
+ from .dtw.mrmsdtw import sync_via_mrmsdtw_with_anchors
2
+ from .dtw.utils import make_path_strictly_monotonic
3
+ import numpy as np
4
+ from .transcriber import Transcriber
5
+ from typing import Dict
6
+
7
+ class Synchronizer(Transcriber):
8
+ def __init__(self, labeling, instrument='Violin', sr=16000, window_size=1024, hop_length=160):
9
+ super().__init__(labeling, instrument=instrument, sr=sr, window_size=window_size, hop_length=hop_length)
10
+ def synchronize(self, audio, midi, batch_size=128, include_pitch_bends=True, to_midi=True, debug=False,
11
+ include_velocity=False, alignment_padding=50, timing_refinement_range_with_f0s=0):
12
+ """
13
+ Synchronize an audio file or mono waveform in numpy or torch with a MIDI file.
14
+ :param audio: str, pathlib.Path, np.ndarray, or torch.Tensor
15
+ :param midi: str, pathlib.Path, or pretty_midi.PrettyMIDI
16
+ :param batch_size: frames to process at once
17
+ :param include_pitch_bends: whether to include pitch bends in the MIDI file
18
+ :param to_midi: whether to return a MIDI file or a list of note events (as tuple)
19
+ :param debug: whether to plot the alignment path and compare the alignment with the predicted notes
20
+ :param include_velocity: whether to embed the note confidence in place of the velocity in the MIDI file
21
+ :param alignment_padding: how many frames of silence to pad the MIDI representations with
22
+ :param timing_refinement_range_with_f0s: how many frames to refine the alignment with the f0 confidence
23
+ :return: aligned MIDI file as a pretty_midi.PrettyMIDI object
24
+
+ """
30
+
31
+ audio = self.predict(audio, batch_size)
32
+ notes_and_midi = self.out2sync(audio, midi, include_velocity=include_velocity,
33
+ alignment_padding=alignment_padding)
34
+ if notes_and_midi: # it might be none
35
+ notes, midi = notes_and_midi
36
+
37
+ if debug:
38
+ import matplotlib.pyplot as plt
39
+ import pandas as pd
40
+ estimated_notes = self.out2note(audio, postprocessing='spotify', include_pitch_bends=True)
41
+ est_df = pd.DataFrame(estimated_notes).sort_values(by=0)
42
+ note_df = pd.DataFrame(notes).sort_values(by=0)
43
+
44
+ fig, ax = plt.subplots(figsize=(20, 10))
45
+
46
+ for row in notes:
47
+ t_start = row[0] # sec
48
+ t_end = row[1] # sec
49
+ freq = row[2] # Hz
50
+ ax.hlines(freq, t_start, t_end, color='k', linewidth=3, zorder=2, alpha=0.5)
51
+
52
+ for row in estimated_notes:
53
+ t_start = row[0] # sec
54
+ t_end = row[1] # sec
55
+ freq = row[2] # Hz
56
+ ax.hlines(freq, t_start, t_end, color='r', linewidth=3, zorder=2, alpha=0.5)
57
+ fig.suptitle('alignment (black) vs. estimated (red)')
58
+ fig.show()
59
+
60
+ if not include_pitch_bends:
61
+ if to_midi:
62
+ return midi['midi']
63
+ else:
64
+ return notes
65
+ else:
66
+ notes = [(np.argmin(np.abs(audio['time']-note[0])),
67
+ np.argmin(np.abs(audio['time']-note[1])),
68
+ note[2], note[3]) for note in notes]
69
+ notes = self.get_pitch_bends(audio["f0"], notes, timing_refinement_range_with_f0s)
70
+ notes = [
71
+ (audio['time'][note[0]], audio['time'][note[1]], note[2], note[3], note[4]) for note in
72
+ notes
73
+ ]
74
+ if to_midi:
75
+ return self.note2midi(notes, 120) # TODO: could use int(midi['midi'].estimate_tempo()) instead of the fixed 120 BPM
76
+ else:
77
+ return notes
78
+
79
+ def out2sync_old(self, out: Dict[str, np.array], midi, include_velocity=False, alignment_padding=50, debug=False):
80
+ """
81
+ Synchronizes the output of the model with the MIDI file.
82
+ Args:
83
+ out: Model output dictionary
84
+ midi: Path to the MIDI file or PrettyMIDI object
85
+ include_velocity: Whether to encode the note confidence in place of velocity
86
+ alignment_padding: Number of frames to pad the MIDI features with zeros
87
+ debug: Visualize the alignment
88
+
89
+ Returns:
90
+ note events and the aligned PrettyMIDI object
91
+ """
92
+ midi = self.labeling.represent_midi(midi, self.sr/self.hop_length)
93
+
94
+ audio_midi_anchors = self.prepare_for_synchronization(out, midi, feature_rate=self.sr/self.hop_length,
95
+ pad_length=alignment_padding)
96
+ if isinstance(audio_midi_anchors, str):
97
+ print(audio_midi_anchors)
98
+ return None # the file is corrupted! no possible alignment at all
99
+ else:
100
+ audio, midi, anchor_pairs = audio_midi_anchors
101
+
102
+ ALPHA = 0.6 # This is the coefficient of onsets, 1 - ALPHA for offsets
103
+
104
+ wp = sync_via_mrmsdtw_with_anchors(f_chroma1=audio['note'].T,
105
+ f_onset1=np.hstack([ALPHA * audio['onset'],
106
+ (1 - ALPHA) * audio['offset']]).T,
107
+ f_chroma2=midi['note'].T,
108
+ f_onset2=np.hstack([ALPHA * midi['onset'],
109
+ (1 - ALPHA) * midi['offset']]).T,
110
+ input_feature_rate=self.sr/self.hop_length,
111
+ step_weights=np.array([1.5, 1.5, 2.0]),
112
+ threshold_rec=10 ** 6,
113
+ verbose=debug, normalize_chroma=False,
114
+ anchor_pairs=anchor_pairs)
115
+ wp = make_path_strictly_monotonic(wp).astype(int)
116
+
117
+ audio_time = np.take(audio['time'], wp[0])
118
+ midi_time = np.take(midi['time'], wp[1])
119
+
120
+ notes = []
121
+ for instrument in midi['midi'].instruments:
122
+ for note in instrument.notes:
123
+ note.start = np.interp(note.start, midi_time, audio_time)
124
+ note.end = np.interp(note.end, midi_time, audio_time)
125
+
126
+ if note.end - note.start <= 0.012: # notes should be at least 12 ms (i.e. 2 frames)
127
+ note.start = note.start - 0.003
128
+ note.end = note.start + 0.012
129
+
130
+ if include_velocity: # encode the note confidence in place of velocity
131
+ velocity = np.median(audio['note'][np.argmin(np.abs(audio['time']-note.start)):
132
+ np.argmin(np.abs(audio['time']-note.end)),
133
+ note.pitch-self.labeling.midi_centers[0]])
134
+
135
+ note.velocity = int(max(1, velocity*127)) # velocity must be at least 1 (and an int), otherwise MIDI drops the note
136
+ else:
137
+ velocity = note.velocity/127
138
+ notes.append((note.start, note.end, note.pitch, velocity))
139
+ return notes, midi
140
+
141
+
142
+ def out2sync(self, out: Dict[str, np.array], midi, include_velocity=False, alignment_padding=50, debug=False):
143
+ """
144
+ Synchronizes the output of the model with the MIDI file.
145
+ Args:
146
+ out: Model output dictionary
147
+ midi: Path to the MIDI file or PrettyMIDI object
148
+ include_velocity: Whether to encode the note confidence in place of velocity
149
+ alignment_padding: Number of frames to pad the MIDI features with zeros
150
+ debug: Visualize the alignment
151
+
152
+ Returns:
153
+ note events and the aligned PrettyMIDI object
154
+ """
155
+ midi = self.labeling.represent_midi(midi, self.sr/self.hop_length)
156
+
157
+ audio_midi_anchors = self.prepare_for_synchronization(out, midi, feature_rate=self.sr/self.hop_length,
158
+ pad_length=alignment_padding)
159
+ if isinstance(audio_midi_anchors, str):
160
+ print(audio_midi_anchors)
161
+ return None # the file is corrupted! no possible alignment at all
162
+ else:
163
+ audio, midi, anchor_pairs = audio_midi_anchors
164
+
165
+ ALPHA = 0.6 # This is the coefficient of onsets, 1 - ALPHA for offsets
166
+
167
+ starts = (np.array(anchor_pairs[0])*self.sr/self.hop_length).astype(int)
168
+ ends = (np.array(anchor_pairs[1])*self.sr/self.hop_length).astype(int)
169
+
170
+ wp = sync_via_mrmsdtw_with_anchors(f_chroma1=audio['note'].T[:, starts[0]:ends[0]],
171
+ f_onset1=np.hstack([ALPHA * audio['onset'],
172
+ (1 - ALPHA) * audio['offset']]).T[:, starts[0]:ends[0]],
173
+ f_chroma2=midi['note'].T[:, starts[1]:ends[1]],
174
+ f_onset2=np.hstack([ALPHA * midi['onset'],
175
+ (1 - ALPHA) * midi['offset']]).T[:, starts[1]:ends[1]],
176
+ input_feature_rate=self.sr/self.hop_length,
177
+ step_weights=np.array([1.5, 1.5, 2.0]),
178
+ threshold_rec=10 ** 6,
179
+ verbose=debug, normalize_chroma=False,
180
+ anchor_pairs=None)
181
+ wp = make_path_strictly_monotonic(wp).astype(int)
182
+ wp[0] += starts[0]
183
+ wp[1] += starts[1]
184
+ wp = np.hstack((wp, ends[:,np.newaxis]))
185
+
186
+ audio_time = np.take(audio['time'], wp[0])
187
+ midi_time = np.take(midi['time'], wp[1])
188
+
189
+ notes = []
190
+ for instrument in midi['midi'].instruments:
191
+ for note in instrument.notes:
192
+ note.start = np.interp(note.start, midi_time, audio_time)
193
+ note.end = np.interp(note.end, midi_time, audio_time)
194
+
195
+ if note.end - note.start <= 0.012: # notes should be at least 12 ms (i.e. 2 frames)
196
+ note.start = note.start - 0.003
197
+ note.end = note.start + 0.012
198
+
199
+ if include_velocity: # encode the note confidence in place of velocity
200
+ velocity = np.median(audio['note'][np.argmin(np.abs(audio['time']-note.start)):
201
+ np.argmin(np.abs(audio['time']-note.end)),
202
+ note.pitch-self.labeling.midi_centers[0]])
203
+
204
+ note.velocity = int(max(1, velocity*127)) # velocity must be at least 1 (and an int), otherwise MIDI drops the note
205
+ else:
206
+ velocity = note.velocity/127
207
+ notes.append((note.start, note.end, note.pitch, velocity))
208
+ return notes, midi
209
+
210
+ @staticmethod
211
+ def pad_representations(dict_of_representations, pad_length=10):
212
+ """
213
+ Pad the representations so that the DTW does not enforce them to encompass the entire duration.
214
+ Args:
215
+ dict_of_representations: audio or midi representations
216
+ pad_length: how many frames to pad
217
+
218
+ Returns:
219
+ padded representations
220
+ """
221
+ for key, value in dict_of_representations.items():
222
+ if key == 'time':
223
+ padded_time = dict_of_representations[key]
224
+ padded_time = np.concatenate([padded_time[:2*pad_length], padded_time+padded_time[2*pad_length]])
225
+ dict_of_representations[key] = padded_time - padded_time[pad_length] # this is to ensure that the
226
+ # first frame times are negative until the real zero time
227
+ elif key in ['onset', 'offset', 'note']:
228
+ dict_of_representations[key] = np.pad(value, ((pad_length, pad_length), (0, 0)))
229
+ elif key in ['start_anchor', 'end_anchor']:
230
+ anchor_time = dict_of_representations[key][0][0]
231
+ anchor_time = np.argmin(np.abs(dict_of_representations['time'] - anchor_time))
232
+ dict_of_representations[key][:,0] = anchor_time
233
+ dict_of_representations[key] = dict_of_representations[key].astype(int) # np.int was removed in NumPy >= 1.24
234
+ return dict_of_representations
235
+
236
+ def prepare_for_synchronization(self, audio, midi, feature_rate=44100/256, pad_length=100):
237
+ """
238
+ MrMsDTW works better with start and end anchors. This function finds the start and end anchors for audio
239
+ based on the midi notes. It also pads the MIDI representations since MIDI files most often start with an active
240
+ note and end with an active note. Thus, the DTW will try to align the active notes to the entire duration of the
241
+ audio. This is not desirable. Therefore, we pad the MIDI representations with a few frames of silence at the
242
+ beginning and end of the MIDI. This way, the DTW will not try to align the active notes to the entire duration.
243
+ Args:
244
+ audio: model output dictionary with note/onset/offset/f0 activations and frame times
245
+ midi: MIDI representation dictionary produced by represent_midi
246
+ feature_rate: frames per second of the representations
247
+ pad_length: number of frames used to pad the MIDI representations
248
+
249
+ Returns:
250
+ (audio, midi, anchor_pairs) on success, or the string 'corrupted' when reliable anchors cannot be found
251
+ """
252
+ # first pad the MIDI
253
+ midi = self.pad_representations(midi, pad_length)
254
+
255
+ # sometimes f0s are more reliable than the notes. So, we use both the f0s and the notes together to find the
256
+ # start and end anchors. f0 lookup bins is the number of bins to look around the f0 to assign a note to it.
257
+ f0_lookup_bins = int(100//(2*self.labeling.f0_granularity_c))
258
+
259
+ # find the start anchor for the audio
260
+ # first decide on which notes to use for the start anchor (take the entire chord where the MIDI file starts)
261
+ anchor_notes = midi['start_anchor'][:, 1] - self.labeling.midi_centers[0]
262
+ # now find which f0 bins to look at for the start anchor
263
+ anchor_f0s = [self.midi_pitch_to_contour_bin(an+self.labeling.midi_centers[0]) for an in anchor_notes]
264
+ anchor_f0s = np.array([list(range(f0-f0_lookup_bins, f0+f0_lookup_bins+1)) for f0 in anchor_f0s]).reshape(-1)
265
+ # first start anchor proposals come from the notes
266
+ anchor_vals = np.any(audio['note'][:, anchor_notes]>0.5, axis=1)
267
+ # now the f0s
268
+ anchor_vals_f0 = np.any(audio['f0'][:, anchor_f0s]>0.5, axis=1)
269
+ # combine the two
270
+ anchor_vals = np.logical_or(anchor_vals, anchor_vals_f0)
271
+ if not any(anchor_vals):
272
+ return 'corrupted' # do not consider the file if we cannot find the start anchor
273
+ audio_start = np.argmax(anchor_vals)
274
+
275
+ # now the end anchor (most string instruments use chords in cadences: in general the end anchor is polyphonic)
276
+ anchor_notes = midi['end_anchor'][:, 1] - self.labeling.midi_centers[0]
277
+ anchor_f0s = [self.midi_pitch_to_contour_bin(an+self.labeling.midi_centers[0]) for an in anchor_notes]
278
+ anchor_f0s = np.array([list(range(f0-f0_lookup_bins, f0+f0_lookup_bins+1)) for f0 in anchor_f0s]).reshape(-1)
279
+ # the same procedure as above
280
+ anchor_vals = np.any(audio['note'][::-1, anchor_notes]>0.5, axis=1)
281
+ anchor_vals_f0 = np.any(audio['f0'][::-1, anchor_f0s]>0.5, axis=1)
282
+ anchor_vals = np.logical_or(anchor_vals, anchor_vals_f0)
283
+ if not any(anchor_vals):
284
+ return 'corrupted' # do not consider the file if we cannot find the end anchor
285
+ audio_end = audio['note'].shape[0] - np.argmax(anchor_vals)
286
+
287
+ if audio_end - audio_start < (midi['end_anchor'][0][0] - midi['start_anchor'][0][0])/10: # no one plays x10 faster
288
+ return 'corrupted' # do not consider the file if the interval between anchors is too short
289
+ anchor_pairs = [(audio_start - 5, midi['start_anchor'][0][0] - 5),
290
+ (audio_end + 5, midi['end_anchor'][0][0] + 5)]
291
+
292
+ if anchor_pairs[0][0] < 1:
293
+ anchor_pairs[0] = (1, midi['start_anchor'][0][0])
294
+ if anchor_pairs[1][0] > audio['note'].shape[0] - 1:
295
+ anchor_pairs[1] = (audio['note'].shape[0] - 1, midi['end_anchor'][0][0])
296
+
297
+ return audio, midi, [(anchor_pairs[0][0]/feature_rate, anchor_pairs[0][1]/feature_rate),
298
+ (anchor_pairs[1][0]/feature_rate, anchor_pairs[1][1]/feature_rate)]
299
+
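
A hedged sketch (not part of the commit) of how the Synchronizer above might be used to align a recording with its score; how `labeling` is constructed and how the checkpoint weights are loaded is assumed to happen elsewhere in the package:

# minimal sketch, assuming `labeling` and the model weights are set up elsewhere
from musc.synchronizer import Synchronizer

sync = Synchronizer(labeling, instrument="Violin", sr=16000, hop_length=160)
aligned = sync.synchronize("performance.wav", "score.mid",
                           batch_size=64, include_pitch_bends=True, to_midi=True)
if aligned is not None:    # synchronize returns None when no reliable anchors are found
    aligned.write("aligned.mid")
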
musc/transcriber.py ADDED
@@ -0,0 +1,163 @@
1
+ from collections import defaultdict
2
+ from typing import DefaultDict, Dict, List, Optional, Tuple
3
+ import pretty_midi
4
+ import numpy as np
5
+ from .postprocessing import RegressionPostProcessor, spotify_create_notes
6
+ from .pitch_estimator import PitchEstimator
7
+
8
+
9
+ class Transcriber(PitchEstimator):
10
+ def __init__(self, labeling, instrument='Violin', sr=16000, window_size=1024, hop_length=160):
11
+ super().__init__(labeling, instrument=instrument, sr=sr, window_size=window_size, hop_length=hop_length)
12
+
13
+ def transcribe(self, audio, batch_size=128, postprocessing='spotify', include_pitch_bends=True, to_midi=True,
14
+ debug=False):
15
+ """
16
+ Transcribe an audio file or mono waveform in numpy or torch into MIDI with pitch bends.
17
+ :param audio: str, pathlib.Path, np.ndarray, or torch.Tensor
18
+ :param batch_size: frames to process at once
19
+ :param postprocessing: note creation method: 'spotify' (default), 'rebab', or 'tiktok'
20
+ :param include_pitch_bends: whether to include pitch bends in the MIDI file
21
+ :param to_midi: whether to return a MIDI file or a list of note events (as tuple)
22
+ :return: transcribed MIDI file as a pretty_midi.PrettyMIDI object
23
+ """
24
+ out = self.predict(audio, batch_size)
25
+ if debug:
26
+ import matplotlib.pyplot as plt
27
+ plt.imshow(out['f0'].T, aspect='auto', origin='lower')
28
+ plt.show()
29
+ plt.imshow(out['note'].T, aspect='auto', origin='lower')
30
+ plt.show()
31
+
32
+ plt.imshow(out['onset'].T, aspect='auto', origin='lower')
33
+ plt.show()
34
+
35
+ plt.imshow(out['offset'].T, aspect='auto', origin='lower')
36
+ plt.show()
37
+
38
+ if to_midi:
39
+ return self.out2midi(out, postprocessing, include_pitch_bends)
40
+ else:
41
+ return self.out2note(out, postprocessing, include_pitch_bends)
42
+
43
+
44
+
45
+ def out2note(self, output: Dict[str, np.array], postprocessing='spotify',
46
+ include_pitch_bends: bool = True,
47
+ ) -> List[Tuple[float, float, int, float, Optional[List[int]]]]:
48
+ """Convert model output to notes
49
+ """
50
+ if postprocessing == 'spotify':
51
+ estimated_notes = spotify_create_notes(
52
+ output["note"],
53
+ output["onset"],
54
+ note_low=self.labeling.midi_centers[0],
55
+ note_high=self.labeling.midi_centers[-1],
56
+ onset_thresh=0.5,
57
+ frame_thresh=0.3,
58
+ infer_onsets=True,
59
+ min_note_len=int(np.round(127.70 / 1000 * (self.sr / self.hop_length))), #127.70
60
+ melodia_trick=True,
61
+ )
62
+
63
+ elif postprocessing == 'rebab':
64
+ estimated_notes = spotify_create_notes(
65
+ output["note"],
66
+ output["onset"],
67
+ note_low=self.labeling.midi_centers[0],
68
+ note_high=self.labeling.midi_centers[-1],
69
+ onset_thresh=0.2,
70
+ frame_thresh=0.2,
71
+ infer_onsets=True,
72
+ min_note_len=int(np.round(127.70 / 1000 * (self.sr / self.hop_length))), #127.70
73
+ melodia_trick=True,
74
+ )
75
+
76
+
77
+ elif postprocessing == 'tiktok':
78
+ postprocessor = RegressionPostProcessor(
79
+ frames_per_second=self.sr / self.hop_length,
80
+ classes_num=self.labeling.midi_centers.shape[0],
81
+ begin_note=self.labeling.midi_centers[0],
82
+ onset_threshold=0.2,
83
+ offset_threshold=0.2,
84
+ frame_threshold=0.3,
85
+ pedal_offset_threshold=0.5,
86
+ )
87
+ tiktok_note_dict, _ = postprocessor.output_dict_to_midi_events(output)
88
+ estimated_notes = []
89
+ for list_item in tiktok_note_dict:
90
+ if list_item['offset_time'] > 0.6 + list_item['onset_time']:
91
+ estimated_notes.append((int(np.floor(list_item['onset_time']/(output['time'][1]))),
92
+ int(np.ceil(list_item['offset_time']/(output['time'][1]))),
93
+ list_item['midi_note'], list_item['velocity']/128))
94
+ if include_pitch_bends:
95
+ estimated_notes_with_pitch_bend = self.get_pitch_bends(output["f0"], estimated_notes)
96
+ else:
97
+ estimated_notes_with_pitch_bend = [(note[0], note[1], note[2], note[3], None) for note in estimated_notes]
98
+
99
+ times_s = output['time']
100
+ estimated_notes_time_seconds = [
101
+ (times_s[note[0]], times_s[note[1]], note[2], note[3], note[4]) for note in estimated_notes_with_pitch_bend
102
+ ]
103
+
104
+ return estimated_notes_time_seconds
105
+
106
+
107
+ def out2midi(self, output: Dict[str, np.array], postprocessing: str = 'spotify', include_pitch_bends: bool = True,
108
+ ) -> pretty_midi.PrettyMIDI:
109
+ """Convert model output to MIDI
110
+ Args:
111
+ output: A dictionary with shape
112
+ {
113
+ 'note': array of shape (n_times, n_notes),
114
+ 'onset': array of shape (n_times, n_notes),
115
+ 'f0': array of shape (n_times, n_f0_bins), 'time': frame times in seconds
116
+ }
117
+ representing the output of the model's predict method.
118
+ postprocessing: spotify or tiktok postprocessing.
119
+ include_pitch_bends: If True, include pitch bends.
120
+ Returns:
121
+ midi: the transcription as a pretty_midi.PrettyMIDI object
122
+ """
123
+ estimated_notes_time_seconds = self.out2note(output, postprocessing, include_pitch_bends)
124
+ midi_tempo = 120 # todo: infer tempo from the onsets
125
+ return self.note2midi(estimated_notes_time_seconds, midi_tempo)
126
+
127
+
128
+ def note2midi(
129
+ self, note_events_with_pitch_bends: List[Tuple[float, float, int, float, Optional[List[int]]]],
130
+ midi_tempo: float = 120,
131
+ ) -> pretty_midi.PrettyMIDI:
132
+ """Create a pretty_midi object from note events
133
+ :param note_events_with_pitch_bends: list of tuples
134
+ [(start_time_seconds, end_time_seconds, pitch_midi, amplitude, pitch_bends or None)]
135
+ :param midi_tempo: tempo in BPM written to the MIDI file # todo: infer tempo from the onsets
136
+ :return: transcribed MIDI file as a pretty_midi.PrettyMIDI object
137
+ """
138
+ mid = pretty_midi.PrettyMIDI(initial_tempo=midi_tempo)
139
+
140
+ program = pretty_midi.instrument_name_to_program(self.instrument)
141
+ instruments: DefaultDict[int, pretty_midi.Instrument] = defaultdict(
142
+ lambda: pretty_midi.Instrument(program=program)
143
+ )
144
+ for start_time, end_time, note_number, amplitude, pitch_bend in note_events_with_pitch_bends:
145
+ instrument = instruments[note_number]
146
+ note = pretty_midi.Note(
147
+ velocity=int(np.round(127 * amplitude)),
148
+ pitch=note_number,
149
+ start=start_time,
150
+ end=end_time,
151
+ )
152
+ instrument.notes.append(note)
153
+ if not isinstance(pitch_bend, np.ndarray):
154
+ continue
155
+ pitch_bend_times = np.linspace(start_time, end_time, len(pitch_bend))
156
+
157
+ for pb_time, pb_midi in zip(pitch_bend_times, pitch_bend):
158
+ instrument.pitch_bends.append(pretty_midi.PitchBend(pb_midi, pb_time))
159
+
160
+ mid.instruments.extend(instruments.values())
161
+
162
+ return mid
163
+
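
A hedged sketch (not part of the commit) showing the Transcriber above returning raw note events instead of a MIDI object; the `transcriber` instance and its weights are assumed to be prepared elsewhere:

# minimal sketch, assuming `transcriber` is a ready-to-use Transcriber instance
notes = transcriber.transcribe("solo_violin.wav", batch_size=64,
                               postprocessing="spotify",
                               include_pitch_bends=False, to_midi=False)
for start_s, end_s, pitch, amplitude, bends in notes[:5]:   # bends is None here
    print(f"{start_s:6.2f}s - {end_s:6.2f}s  MIDI {pitch}  amp {amplitude:.2f}")
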
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ torch
2
+ mir_eval
3
+ pretty_midi
4
+ torchaudio
5
+ scipy
6
+ numba
7
+ librosa
8
+ matplotlib
9
+ mido
violin.json ADDED
@@ -0,0 +1,17 @@
1
+ {
2
+ "wiring": "parallel",
3
+ "sampling_rate": 44100,
4
+ "pathway_multiscale": 4,
5
+ "num_pathway_layers": 2,
6
+ "num_separator_layers": 16,
7
+ "num_representation_layers": 4,
8
+ "hop_length": 256,
9
+ "chunk_size": 512,
10
+ "minSNR": -32,
11
+ "maxSNR": 96,
12
+ "note_low": "F#3",
13
+ "note_high": "E8",
14
+ "f0_bins_per_semitone": 10,
15
+ "f0_smooth_std_c": 12,
16
+ "onset_smooth_std": 0.7
17
+ }
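
A hedged sketch (not part of the commit) of reading the configuration above; the interpretation of the fields follows the labeling and transcriber code in this commit, not separate documentation:

# minimal sketch: inspect violin.json
import json

with open("violin.json") as fh:
    cfg = json.load(fh)

print(cfg["note_low"], "to", cfg["note_high"])       # transcription range F#3-E8
print(cfg["f0_bins_per_semitone"], "bins/semitone")  # f0 grid resolution
print(cfg["sampling_rate"], cfg["hop_length"])       # 44100 Hz audio, 256-sample hop
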
violin_model.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a913356f059be6dc930be41158ac864f7d5511889ef0b2a6b6ba75a4a8732750
3
+ size 218770231
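
violin_model.pt is stored as a Git LFS pointer, so a plain clone without LFS support only yields the three-line stub above. A hedged sketch of one way to fetch the ~219 MB checkpoint; the repo_id is a placeholder, not taken from this commit:

# minimal sketch, assuming a hypothetical Space repo id
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(repo_id="user/Audio2Violin",   # placeholder repo id
                            filename="violin_model.pt",
                            repo_type="space")
print(ckpt_path)
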