import argparse
import os
import pathlib
import time

import librosa
import numpy as np
import soundfile
import torch

from bytesep.inference import Separator
from bytesep.models.lightning_modules import get_model_class
from bytesep.utils import read_yaml


def inference(args) -> None:
    r"""Separate all audios in a directory.

    Args:
        config_yaml: str, the config file of a model being trained
        checkpoint_path: str, the path of checkpoint to be loaded
        audios_dir: str, the directory of audios to be separated
        output_dir: str, the directory to write out separated audios
        scale_volume: bool, if True, scale each separated audio to a maximum amplitude of 1

    Returns:
        None
    """
    # Arguments & parameters
    config_yaml = args.config_yaml
    checkpoint_path = args.checkpoint_path
    audios_dir = args.audios_dir
    output_dir = args.output_dir
    scale_volume = args.scale_volume

    device = (
        torch.device('cuda')
        if args.cuda and torch.cuda.is_available()
        else torch.device('cpu')
    )

    configs = read_yaml(config_yaml)
    sample_rate = configs['train']['sample_rate']
    input_channels = configs['train']['channels']
    target_source_types = configs['train']['target_source_types']
    target_sources_num = len(target_source_types)
    model_type = configs['train']['model_type']
    mono = input_channels == 1

    # Inference is performed on 30-second segments.
    segment_samples = int(30 * sample_rate)
    batch_size = 1

    models_contains_inplaceabn = True
    # Need to use torch.distributed if models contain inplace_abn.abn.InPlaceABNSync.
    if models_contains_inplaceabn:
        import torch.distributed as dist

        dist.init_process_group(
            'gloo', init_method='file:///tmp/somefile', rank=0, world_size=1
        )

    print("Using {} for separating ..".format(device))

    # Paths
    os.makedirs(output_dir, exist_ok=True)

    # Get model class.
    Model = get_model_class(model_type)

    # Create model.
    model = Model(input_channels=input_channels, target_sources_num=target_sources_num)

    # Load checkpoint.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint["model"])

    # Move model to device.
    model.to(device)

    # Create separator.
    separator = Separator(
        model=model,
        segment_samples=segment_samples,
        batch_size=batch_size,
        device=device,
    )

    audio_names = sorted(os.listdir(audios_dir))

    for audio_name in audio_names:
        audio_path = os.path.join(audios_dir, audio_name)

        # Load audio.
        audio, _ = librosa.load(audio_path, sr=sample_rate, mono=mono)

        if audio.ndim == 1:
            audio = audio[None, :]

        input_dict = {'waveform': audio}

        # Separate.
        separate_time = time.time()

        sep_wav = separator.separate(input_dict)
        # (channels_num, audio_samples)

        print('Separate time: {:.3f} s'.format(time.time() - separate_time))

        # Write out separated audio.
        if scale_volume:
            sep_wav /= np.max(np.abs(sep_wav))

        # Write to a temporary wav file, then convert it to mp3 with ffmpeg.
        soundfile.write(file='_zz.wav', data=sep_wav.T, samplerate=sample_rate)

        output_path = os.path.join(
            output_dir, '{}.mp3'.format(pathlib.Path(audio_name).stem)
        )
        os.system('ffmpeg -y -loglevel panic -i _zz.wav "{}"'.format(output_path))

        print('Write out to {}'.format(output_path))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Separate all audio files in a directory.")
    parser.add_argument(
        "--config_yaml",
        type=str,
        required=True,
        help="The config file of a model being trained.",
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        required=True,
        help="The path of checkpoint to be loaded.",
    )
    parser.add_argument(
        "--audios_dir",
        type=str,
        required=True,
        help="The directory of audios to be separated.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="The directory to write out separated audios.",
    )
    parser.add_argument(
        '--scale_volume',
        action='store_true',
        default=False,
        help="If set, scale each separated audio to a maximum amplitude of 1.",
    )
    parser.add_argument(
        "--cuda",
        action='store_true',
        default=True,
        help="Use GPU for separation when available (enabled by default).",
    )

    args = parser.parse_args()
    inference(args)
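
# Example invocation. The script filename and all paths below are illustrative
# placeholders, not taken from the original source:
#
#   python3 separate.py \
#       --config_yaml="./config.yaml" \
#       --checkpoint_path="./checkpoint.pth" \
#       --audios_dir="./input_audios" \
#       --output_dir="./separated_audios" \
#       --scale_volume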