import os
import numpy as np
import audioread
import librosa
from mido import MidiFile

from piano_vad import (note_detection_with_onset_offset_regress,
    pedal_detection_with_onset_offset_regress)
import config


def create_folder(fd):
    if not os.path.exists(fd):
        os.makedirs(fd)


def get_filename(path):
    path = os.path.realpath(path)
    na_ext = path.split('/')[-1]
    na = os.path.splitext(na_ext)[0]
    return na


def note_to_freq(piano_note):
    return 2 ** ((piano_note - 39) / 12) * 440


def float32_to_int16(x):
    assert np.max(np.abs(x)) <= 1.
    return (x * 32767.).astype(np.int16)


def int16_to_float32(x):
    return (x / 32767.).astype(np.float32)


def pad_truncate_sequence(x, max_len):
    if len(x) < max_len:
        return np.concatenate((x, np.zeros(max_len - len(x))))
    else:
        return x[0 : max_len]


def read_midi(midi_path):
    """Parse MIDI file.

    Args:
      midi_path: str

    Returns:
      midi_dict: dict, e.g. {
        'midi_event': [
            'program_change channel=0 program=0 time=0',
            'control_change channel=0 control=64 value=127 time=0',
            'control_change channel=0 control=64 value=63 time=236',
            ...],
        'midi_event_time': [0., 0, 0.98307292, ...]}
    """
    midi_file = MidiFile(midi_path)
    ticks_per_beat = midi_file.ticks_per_beat

    assert len(midi_file.tracks) == 2
    """The first track contains tempo and time signature. The second track
    contains piano events."""

    microseconds_per_beat = midi_file.tracks[0][0].tempo
    beats_per_second = 1e6 / microseconds_per_beat
    ticks_per_second = ticks_per_beat * beats_per_second

    message_list = []

    ticks = 0
    time_in_second = []

    for message in midi_file.tracks[1]:
        message_list.append(str(message))
        ticks += message.time
        time_in_second.append(ticks / ticks_per_second)

    midi_dict = {
        'midi_event': np.array(message_list),
        'midi_event_time': np.array(time_in_second)}

    return midi_dict
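
# Usage sketch for read_midi (not from the original file): 'example.mid' is a
# hypothetical path to a two-track, MAESTRO-style MIDI file.
#
#     midi_dict = read_midi('example.mid')
#     print(midi_dict['midi_event'][0:3])
#     print(midi_dict['midi_event_time'][0:3])
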

def write_events_to_midi(start_time, note_events, pedal_events, midi_path):
    """Write out note events to MIDI file.

    Args:
      start_time: float
      note_events: list of dict, e.g. [
        {'midi_note': 51, 'onset_time': 696.63544, 'offset_time': 696.9948, 'velocity': 44},
        {'midi_note': 58, 'onset_time': 696.99585, 'offset_time': 697.18646, 'velocity': 50},
        ...]
      pedal_events: list of dict with 'onset_time' and 'offset_time' keys, or None
      midi_path: str
    """
    from mido import Message, MidiFile, MidiTrack, MetaMessage

    # This configuration is the same as MIDIs in MAESTRO dataset
    ticks_per_beat = 384
    beats_per_second = 2
    ticks_per_second = ticks_per_beat * beats_per_second
    microseconds_per_beat = int(1e6 // beats_per_second)

    midi_file = MidiFile()
    midi_file.ticks_per_beat = ticks_per_beat

    # Track 0
    track0 = MidiTrack()
    track0.append(MetaMessage('set_tempo', tempo=microseconds_per_beat, time=0))
    track0.append(MetaMessage('time_signature', numerator=4, denominator=4, time=0))
    track0.append(MetaMessage('end_of_track', time=1))
    midi_file.tracks.append(track0)

    # Track 1
    track1 = MidiTrack()

    # Message rolls of MIDI
    message_roll = []

    for note_event in note_events:
        # Onset
        message_roll.append({
            'time': note_event['onset_time'],
            'midi_note': note_event['midi_note'],
            'velocity': note_event['velocity']})

        # Offset
        message_roll.append({
            'time': note_event['offset_time'],
            'midi_note': note_event['midi_note'],
            'velocity': 0})

    if pedal_events:
        for pedal_event in pedal_events:
            message_roll.append({'time': pedal_event['onset_time'], 'control_change': 64, 'value': 127})
            message_roll.append({'time': pedal_event['offset_time'], 'control_change': 64, 'value': 0})

    # Sort MIDI messages by time
    message_roll.sort(key=lambda note_event: note_event['time'])

    previous_ticks = 0
    for message in message_roll:
        this_ticks = int((message['time'] - start_time) * ticks_per_second)
        if this_ticks >= 0:
            diff_ticks = this_ticks - previous_ticks
            previous_ticks = this_ticks

            if 'midi_note' in message.keys():
                track1.append(Message('note_on', note=message['midi_note'], velocity=message['velocity'], time=diff_ticks))
            elif 'control_change' in message.keys():
                track1.append(Message('control_change', channel=0, control=message['control_change'], value=message['value'], time=diff_ticks))

    track1.append(MetaMessage('end_of_track', time=1))
    midi_file.tracks.append(track1)

    midi_file.save(midi_path)
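
# Usage sketch for write_events_to_midi (not from the original file): the
# events and the output path 'out.mid' below are made-up illustrative values.
#
#     note_events = [
#         {'midi_note': 60, 'onset_time': 0.5, 'offset_time': 1.0, 'velocity': 80}]
#     pedal_events = [{'onset_time': 0.5, 'offset_time': 2.0}]
#     write_events_to_midi(start_time=0., note_events=note_events,
#         pedal_events=pedal_events, midi_path='out.mid')
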

class RegressionPostProcessor(object):
    def __init__(self, frames_per_second, classes_num, onset_threshold,
        offset_threshold, frame_threshold, pedal_offset_threshold):
        """Postprocess the output probabilities of a transcription model to
        MIDI events.

        Args:
          frames_per_second: int
          classes_num: int
          onset_threshold: float
          offset_threshold: float
          frame_threshold: float
          pedal_offset_threshold: float
        """
        self.frames_per_second = frames_per_second
        self.classes_num = classes_num
        self.onset_threshold = onset_threshold
        self.offset_threshold = offset_threshold
        self.frame_threshold = frame_threshold
        self.pedal_offset_threshold = pedal_offset_threshold
        self.begin_note = config.begin_note
        self.velocity_scale = config.velocity_scale

    def output_dict_to_midi_events(self, output_dict):
        """Main function. Post process model outputs to MIDI events.

        Args:
          output_dict: {
            'reg_onset_output': (segment_frames, classes_num),
            'reg_offset_output': (segment_frames, classes_num),
            'frame_output': (segment_frames, classes_num),
            'velocity_output': (segment_frames, classes_num),
            'reg_pedal_onset_output': (segment_frames, 1),
            'reg_pedal_offset_output': (segment_frames, 1),
            'pedal_frame_output': (segment_frames, 1)}

        Outputs:
          est_note_events: list of dict, e.g. [
            {'onset_time': 39.74, 'offset_time': 39.87, 'midi_note': 27, 'velocity': 83},
            {'onset_time': 11.98, 'offset_time': 12.11, 'midi_note': 33, 'velocity': 88}]

          est_pedal_events: list of dict, e.g. [
            {'onset_time': 0.17, 'offset_time': 0.96},
            {'onset_time': 1.17, 'offset_time': 2.65}]
        """
        # Post process piano note outputs to piano note and pedal events information
        (est_on_off_note_vels, est_pedal_on_offs) = \
            self.output_dict_to_note_pedal_arrays(output_dict)
        """est_on_off_note_vels: (events_num, 4), the four columns are:
        [onset_time, offset_time, piano_note, velocity],
        est_pedal_on_offs: (pedal_events_num, 2), the two columns are:
        [onset_time, offset_time]"""

        # Reformat notes to MIDI events
        est_note_events = self.detected_notes_to_events(est_on_off_note_vels)

        if est_pedal_on_offs is None:
            est_pedal_events = None
        else:
            est_pedal_events = self.detected_pedals_to_events(est_pedal_on_offs)

        return est_note_events, est_pedal_events
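
    # Usage sketch for output_dict_to_midi_events (not from the original file):
    # the threshold values and frames_per_second/classes_num below are
    # illustrative assumptions, and `output_dict` stands for a model's output.
    #
    #     post_processor = RegressionPostProcessor(frames_per_second=100,
    #         classes_num=88, onset_threshold=0.3, offset_threshold=0.3,
    #         frame_threshold=0.1, pedal_offset_threshold=0.2)
    #     (est_note_events, est_pedal_events) = \
    #         post_processor.output_dict_to_midi_events(output_dict)
    #
    # Worked example for get_binarized_output_from_regression (defined below;
    # illustrative, not from the original file). With threshold=0.3,
    # neighbour=2 and the regression curve
    #     x = [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, 0.05, 0., 0.],
    # the peak at frame n=4 passes the threshold and monotonicity checks, so
    # binary_output[4] = 1. Since x[3]=0.30 < x[5]=0.35, the shift is
    #     (x[5] - x[3]) / (x[4] - x[3]) / 2 = (0.35 - 0.30) / (0.40 - 0.30) / 2 = 0.25,
    # i.e. the continuous-time peak is estimated 0.25 frames after frame 4.
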
    def output_dict_to_note_pedal_arrays(self, output_dict):
        """Postprocess the output probabilities of a transcription model to
        MIDI events.

        Args:
          output_dict: dict, {
            'reg_onset_output': (frames_num, classes_num),
            'reg_offset_output': (frames_num, classes_num),
            'frame_output': (frames_num, classes_num),
            'velocity_output': (frames_num, classes_num),
            ...}

        Returns:
          est_on_off_note_vels: (events_num, 4), the 4 columns are onset_time,
            offset_time, piano_note and velocity. E.g. [
             [39.74, 39.87, 27, 0.65],
             [11.98, 12.11, 33, 0.69],
             ...]

          est_pedal_on_offs: (pedal_events_num, 2), the 2 columns are onset_time
            and offset_time. E.g. [
             [0.17, 0.96],
             [1.17, 2.65],
             ...]
        """

        # ------ 1. Process regression outputs to binarized outputs ------
        # For example, onset or offset of [0., 0., 0.15, 0.30, 0.40, 0.35, 0.20, 0.05, 0., 0.]
        # will be processed to [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]

        # Calculate binarized onset output from regression output
        (onset_output, onset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_onset_output'],
                threshold=self.onset_threshold, neighbour=2)

        output_dict['onset_output'] = onset_output  # Values are 0 or 1
        output_dict['onset_shift_output'] = onset_shift_output

        # Calculate binarized offset output from regression output
        (offset_output, offset_shift_output) = \
            self.get_binarized_output_from_regression(
                reg_output=output_dict['reg_offset_output'],
                threshold=self.offset_threshold, neighbour=4)

        output_dict['offset_output'] = offset_output  # Values are 0 or 1
        output_dict['offset_shift_output'] = offset_shift_output

        if 'reg_pedal_onset_output' in output_dict.keys():
            """Pedal onsets are not used in inference. Instead, frame-wise pedal
            predictions are used to detect onsets. We empirically found this is
            more accurate to detect pedal onsets."""
            pass

        if 'reg_pedal_offset_output' in output_dict.keys():
            # Calculate binarized pedal offset output from regression output
            (pedal_offset_output, pedal_offset_shift_output) = \
                self.get_binarized_output_from_regression(
                    reg_output=output_dict['reg_pedal_offset_output'],
                    threshold=self.pedal_offset_threshold, neighbour=4)

            output_dict['pedal_offset_output'] = pedal_offset_output  # Values are 0 or 1
            output_dict['pedal_offset_shift_output'] = pedal_offset_shift_output

        # ------ 2. Process matrices results to event results ------
        # Detect piano notes from output_dict
        est_on_off_note_vels = self.output_dict_to_detected_notes(output_dict)

        if 'reg_pedal_onset_output' in output_dict.keys():
            # Detect piano pedals from output_dict
            est_pedal_on_offs = self.output_dict_to_detected_pedals(output_dict)
        else:
            est_pedal_on_offs = None

        return est_on_off_note_vels, est_pedal_on_offs

    def get_binarized_output_from_regression(self, reg_output, threshold, neighbour):
        """Calculate binarized output and shifts of onsets or offsets from the
        regression results.

        Args:
          reg_output: (frames_num, classes_num)
          threshold: float
          neighbour: int

        Returns:
          binary_output: (frames_num, classes_num)
          shift_output: (frames_num, classes_num)
        """
        binary_output = np.zeros_like(reg_output)
        shift_output = np.zeros_like(reg_output)
        (frames_num, classes_num) = reg_output.shape

        for k in range(classes_num):
            x = reg_output[:, k]
            for n in range(neighbour, frames_num - neighbour):
                if x[n] > threshold and self.is_monotonic_neighbour(x, n, neighbour):
                    binary_output[n, k] = 1

                    """See Section III-D in [1] for deduction.
                    [1] Q. Kong, et al., High-resolution Piano Transcription
                    with Pedals by Regressing Onset and Offset Times, 2020."""
                    if x[n - 1] > x[n + 1]:
                        shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n + 1]) / 2
                    else:
                        shift = (x[n + 1] - x[n - 1]) / (x[n] - x[n - 1]) / 2
                    shift_output[n, k] = shift

        return binary_output, shift_output

    def is_monotonic_neighbour(self, x, n, neighbour):
        """Detect if values are monotonic on both sides of x[n].

        Args:
          x: (frames_num,)
          n: int
          neighbour: int

        Returns:
          monotonic: bool
        """
        monotonic = True
        for i in range(neighbour):
            if x[n - i] < x[n - i - 1]:
                monotonic = False
            if x[n + i] < x[n + i + 1]:
                monotonic = False

        return monotonic

    def output_dict_to_detected_notes(self, output_dict):
        """Postprocess output_dict to piano notes.

        Args:
          output_dict: dict, e.g. {
            'onset_output': (frames_num, classes_num),
            'onset_shift_output': (frames_num, classes_num),
            'offset_output': (frames_num, classes_num),
            'offset_shift_output': (frames_num, classes_num),
            'frame_output': (frames_num, classes_num),
            'velocity_output': (frames_num, classes_num),
            ...}

        Returns:
          est_on_off_note_vels: (notes, 4), the four columns are onsets, offsets,
            MIDI notes and velocities. E.g.,
            [[39.7375, 39.7500, 27., 0.6638],
             [11.9824, 12.5000, 33., 0.6892],
             ...]
        """
        est_tuples = []
        est_midi_notes = []
        classes_num = output_dict['frame_output'].shape[-1]

        for piano_note in range(classes_num):
            """Detect piano notes"""
            est_tuples_per_note = note_detection_with_onset_offset_regress(
                frame_output=output_dict['frame_output'][:, piano_note],
                onset_output=output_dict['onset_output'][:, piano_note],
                onset_shift_output=output_dict['onset_shift_output'][:, piano_note],
                offset_output=output_dict['offset_output'][:, piano_note],
                offset_shift_output=output_dict['offset_shift_output'][:, piano_note],
                velocity_output=output_dict['velocity_output'][:, piano_note],
                frame_threshold=self.frame_threshold)

            est_tuples += est_tuples_per_note
            est_midi_notes += [piano_note + self.begin_note] * len(est_tuples_per_note)

        est_tuples = np.array(est_tuples)   # (notes, 5)
        """(notes, 5), the five columns are onset, offset, onset_shift,
        offset_shift and normalized_velocity"""

        est_midi_notes = np.array(est_midi_notes)   # (notes,)

        if len(est_tuples) == 0:
            return np.array([])
        else:
            onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
            offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
            velocities = est_tuples[:, 4]

            est_on_off_note_vels = np.stack((onset_times, offset_times, est_midi_notes, velocities), axis=-1)
            """(notes, 4), the four columns are onset_times, offset_times,
            MIDI notes and velocities."""

            est_on_off_note_vels = est_on_off_note_vels.astype(np.float32)

            return est_on_off_note_vels

    def output_dict_to_detected_pedals(self, output_dict):
        """Postprocess output_dict to piano pedals.

        Args:
          output_dict: dict, e.g. {
            'pedal_frame_output': (frames_num,),
            'pedal_offset_output': (frames_num,),
            'pedal_offset_shift_output': (frames_num,),
            ...}

        Returns:
          est_on_off: (notes, 2), the two columns are pedal onsets and pedal
            offsets. E.g.,
            [[0.1800, 0.9669],
             [1.1400, 2.6458],
             ...]
        """
        frames_num = output_dict['pedal_frame_output'].shape[0]

        est_tuples = pedal_detection_with_onset_offset_regress(
            frame_output=output_dict['pedal_frame_output'][:, 0],
            offset_output=output_dict['pedal_offset_output'][:, 0],
            offset_shift_output=output_dict['pedal_offset_shift_output'][:, 0],
            frame_threshold=0.5)

        est_tuples = np.array(est_tuples)
        """(notes, 2), the two columns are pedal onsets and pedal offsets"""

        if len(est_tuples) == 0:
            return np.array([])
        else:
            onset_times = (est_tuples[:, 0] + est_tuples[:, 2]) / self.frames_per_second
            offset_times = (est_tuples[:, 1] + est_tuples[:, 3]) / self.frames_per_second
            est_on_off = np.stack((onset_times, offset_times), axis=-1)
            est_on_off = est_on_off.astype(np.float32)
            return est_on_off

    def detected_notes_to_events(self, est_on_off_note_vels):
        """Reformat detected notes to midi events.

        Args:
          est_on_off_note_vels: (notes, 4), the four columns are onset_times,
            offset_times, MIDI notes and normalized velocities. E.g.,
            [[39.7375, 39.7500, 27., 0.6638],
             [11.9824, 12.5000, 33., 0.6892],
             ...]

        Returns:
          midi_events, list, e.g.,
            [{'onset_time': 39.7376, 'offset_time': 39.75, 'midi_note': 27, 'velocity': 84},
             {'onset_time': 11.9824, 'offset_time': 12.50, 'midi_note': 33, 'velocity': 88},
             ...]
        """
        midi_events = []
        for i in range(est_on_off_note_vels.shape[0]):
            midi_events.append({
                'onset_time': est_on_off_note_vels[i][0],
                'offset_time': est_on_off_note_vels[i][1],
                'midi_note': int(est_on_off_note_vels[i][2]),
                'velocity': int(est_on_off_note_vels[i][3] * self.velocity_scale)})

        return midi_events
    def detected_pedals_to_events(self, pedal_on_offs):
        """Reformat detected pedal onsets and offsets to events.

        Args:
          pedal_on_offs: (notes, 2), the two columns are pedal onsets and
            pedal offsets. E.g.,
            [[0.1800, 0.9669],
             [1.1400, 2.6458],
             ...]

        Returns:
          pedal_events: list of dict, e.g.,
            [{'onset_time': 0.1800, 'offset_time': 0.9669},
             {'onset_time': 1.1400, 'offset_time': 2.6458},
             ...]
        """
        pedal_events = []
        for i in range(len(pedal_on_offs)):
            pedal_events.append({
                'onset_time': pedal_on_offs[i, 0],
                'offset_time': pedal_on_offs[i, 1]})

        return pedal_events


def load_audio(path, sr=22050, mono=True, offset=0.0, duration=None,
    dtype=np.float32, res_type='kaiser_best',
    backends=[audioread.ffdec.FFmpegAudioFile]):
    """Load audio. Copied from librosa.core.load() except that the ffmpeg
    backend is always used in this function."""

    y = []
    with audioread.audio_open(os.path.realpath(path), backends=backends) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        s_start = int(np.round(sr_native * offset)) * n_channels

        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + (int(np.round(sr_native * duration)) * n_channels)

        n = 0

        for frame in input_file:
            frame = librosa.util.buf_to_float(frame, n_bytes=2, dtype=dtype)
            n_prev = n
            n = n + len(frame)

            if n < s_start:
                # offset is after the current frame
                # keep reading
                continue

            if s_end < n_prev:
                # we're off the end. stop reading
                break

            if s_end < n:
                # the end is in this frame. crop.
                frame = frame[:s_end - n_prev]

            if n_prev <= s_start <= n:
                # beginning is in this frame
                frame = frame[(s_start - n_prev):]

            # tack on the current frame
            y.append(frame)

    if y:
        y = np.concatenate(y)

        if n_channels > 1:
            y = y.reshape((-1, n_channels)).T
            if mono:
                y = librosa.to_mono(y)

        if sr is not None:
            y = librosa.resample(y, orig_sr=sr_native, target_sr=sr, res_type=res_type)
        else:
            sr = sr_native

    # Final cleanup for dtype and contiguity
    y = np.ascontiguousarray(y, dtype=dtype)

    return (y, sr)
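
# Usage sketch for load_audio (not from the original file): the path and the
# sample rate below are illustrative; in practice the sample rate would
# normally come from config.
#
#     (audio, fs) = load_audio('example.mp3', sr=16000, mono=True)
#     print(audio.shape, fs)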