Spaces:

asigalov61
/

ByteDance-Solo-Piano-Audio-to-MIDI-Transcription

Running on Zero

App Files Files Community

ByteDance-Solo-Piano-Audio-to-MIDI-Transcription / inference.py

asigalov61

Update inference.py

0d3b879 verified 5 months ago

raw

history blame contribute delete

No virus

5.6 kB

	import os
	import numpy as np
	import time
	import librosa
	from pathlib import Path

	import torch

	from utilities import (create_folder, get_filename, RegressionPostProcessor,
	write_events_to_midi)
	from models import Regress_onset_offset_frame_velocity_CRNN, Note_pedal
	from pytorch_utils import move_data_to_device, forward
	import config


	class PianoTranscription(object):
	def __init__(self, model_type='Note_pedal', checkpoint_path=None,
	segment_samples=16000*10, device=torch.device('cuda')):
	"""Class for transcribing piano solo recording.

	Args:
	model_type: str
	checkpoint_path: str
	segment_samples: int
	device: 'cuda' \| 'cpu'
	"""

	checkpoint_path="CRNN_note_F1=0.9677_pedal_F1=0.9186.pth"
	print('Checkpoint path: {}'.format(checkpoint_path))

	print('Using {} for inference.'.format(device))

	self.segment_samples = segment_samples
	self.frames_per_second = config.frames_per_second
	self.classes_num = config.classes_num
	self.onset_threshold = 0.3
	self.offset_threshod = 0.3
	self.frame_threshold = 0.1
	self.pedal_offset_threshold = 0.2

	# Build model
	Model = eval(model_type)
	self.model = Model(frames_per_second=self.frames_per_second,
	classes_num=self.classes_num)

	# Load model
	checkpoint = torch.load(checkpoint_path, map_location=device)
	self.model.load_state_dict(checkpoint['model'], strict=False)

	# Parallel
	if 'cuda' in str(device):
	self.model.to(device)
	print('GPU number: {}'.format(torch.cuda.device_count()))
	self.model = torch.nn.DataParallel(self.model)
	else:
	print('Using CPU.')

	def transcribe(self, audio, midi_path):
	"""Transcribe an audio recording.

	Args:
	audio: (audio_samples,)
	midi_path: str, path to write out the transcribed MIDI.

	Returns:
	transcribed_dict, dict: {'output_dict':, ..., 'est_note_events': ...}

	"""
	audio = audio[None, :] # (1, audio_samples)

	# Pad audio to be evenly divided by segment_samples
	audio_len = audio.shape[1]
	pad_len = int(np.ceil(audio_len / self.segment_samples))\
	* self.segment_samples - audio_len

	audio = np.concatenate((audio, np.zeros((1, pad_len))), axis=1)

	# Enframe to segments
	segments = self.enframe(audio, self.segment_samples)
	"""(N, segment_samples)"""

	# Forward
	output_dict = forward(self.model, segments, batch_size=1)
	"""{'reg_onset_output': (N, segment_frames, classes_num), ...}"""

	# Deframe to original length
	for key in output_dict.keys():
	output_dict[key] = self.deframe(output_dict[key])[0 : audio_len]
	"""output_dict: {
	'reg_onset_output': (N, segment_frames, classes_num),
	'reg_offset_output': (N, segment_frames, classes_num),
	'frame_output': (N, segment_frames, classes_num),
	'velocity_output': (N, segment_frames, classes_num)}"""

	# Post processor
	post_processor = RegressionPostProcessor(self.frames_per_second,
	classes_num=self.classes_num, onset_threshold=self.onset_threshold,
	offset_threshold=self.offset_threshod,
	frame_threshold=self.frame_threshold,
	pedal_offset_threshold=self.pedal_offset_threshold)

	# Post process output_dict to MIDI events
	(est_note_events, est_pedal_events) = \
	post_processor.output_dict_to_midi_events(output_dict)

	# Write MIDI events to file
	if midi_path:
	write_events_to_midi(start_time=0, note_events=est_note_events,
	pedal_events=est_pedal_events, midi_path=midi_path)
	print('Write out to {}'.format(midi_path))

	transcribed_dict = {
	'output_dict': output_dict,
	'est_note_events': est_note_events,
	'est_pedal_events': est_pedal_events}

	return transcribed_dict

	def enframe(self, x, segment_samples):
	"""Enframe long sequence to short segments.

	Args:
	x: (1, audio_samples)
	segment_samples: int

	Returns:
	batch: (N, segment_samples)
	"""
	assert x.shape[1] % segment_samples == 0
	batch = []

	pointer = 0
	while pointer + segment_samples <= x.shape[1]:
	batch.append(x[:, pointer : pointer + segment_samples])
	pointer += segment_samples // 2

	batch = np.concatenate(batch, axis=0)
	return batch

	def deframe(self, x):
	"""Deframe predicted segments to original sequence.

	Args:
	x: (N, segment_frames, classes_num)

	Returns:
	y: (audio_frames, classes_num)
	"""
	if x.shape[0] == 1:
	return x[0]

	else:
	x = x[:, 0 : -1, :]
	"""Remove an extra frame in the end of each segment caused by the
	'center=True' argument when calculating spectrogram."""
	(N, segment_samples, classes_num) = x.shape
	assert segment_samples % 4 == 0

	y = []
	y.append(x[0, 0 : int(segment_samples * 0.75)])
	for i in range(1, N - 1):
	y.append(x[i, int(segment_samples * 0.25) : int(segment_samples * 0.75)])
	y.append(x[-1, int(segment_samples * 0.25) :])
	y = np.concatenate(y, axis=0)
	return y