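# NOTE(review): the text "Spaces: Sleeping / Sleeping" was captured here along
# with the config; it looks like web-page banner text and is not Gin syntax.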
# Enable Gin's dynamic registration so the modules imported below can be
# referenced by their module path in the bindings that follow.
from __gin__ import dynamic_registration
from gamadhani import src
from gamadhani.src import dataset
from gamadhani.src import model_diffusion
from gamadhani.utils import pitch_to_audio_utils
from gamadhani.utils import utils
import torch
# Macros:
# ==============================================================================
# Shared values; bindings below reference them as %NAME so that related
# settings stay in sync.
# AUDIO_SEQ_LEN: used by dataset.Task.kwargs["audio_len"] and
#   model_diffusion.UNetPitchConditioned.audio_seq_len.
AUDIO_SEQ_LEN = 750
# LR: used by utils.build_warmed_exponential_lr_scheduler.eta_max.
LR = 0.0001
# NFFT / NUM_MELS / SR: used by the pitch_to_audio_utils bindings; NUM_MELS is
#   also the model's inp_dim and SR the model's sr.
NFFT = 1024
NUM_MELS = 192
# SINGER_CONDITIONING: used by dataset.Task.kwargs["return_singer"] and
#   model_diffusion.UNetPitchConditioned.singer_conditioning.
SINGER_CONDITIONING = True
SR = 16000
# Parameters for torch.optim.AdamW:
# ==============================================================================
torch.optim.AdamW.betas = (0.9, 0.99)
# Reuse the LR macro (0.0001) rather than repeating the literal, so the
# optimizer's learning rate cannot drift from the scheduler's eta_max, which is
# also bound to %LR.
torch.optim.AdamW.lr = %LR
# Parameters for utils.build_warmed_exponential_lr_scheduler:
# ==============================================================================
utils.build_warmed_exponential_lr_scheduler.cycle_length = 480000
utils.build_warmed_exponential_lr_scheduler.eta_max = %LR
# NOTE(review): eta_min (0.1) is larger than eta_max (%LR = 0.0001). Presumably
# eta_min is a multiplicative factor rather than an absolute learning rate --
# confirm against utils.build_warmed_exponential_lr_scheduler.
utils.build_warmed_exponential_lr_scheduler.eta_min = 0.1
utils.build_warmed_exponential_lr_scheduler.peak_iteration = 10000
utils.build_warmed_exponential_lr_scheduler.start_factor = 0.01
# Parameters for model_diffusion.UNetPitchConditioned.configure_optimizers:
# ==============================================================================
# Both values are unevaluated references (no trailing "()"), so the configured
# callables themselves are passed in, not the result of calling them.
model_diffusion.UNetPitchConditioned.configure_optimizers.optimizer_cls = @torch.optim.AdamW
model_diffusion.UNetPitchConditioned.configure_optimizers.scheduler_cls = \
    @utils.build_warmed_exponential_lr_scheduler
# Parameters for pitch_to_audio_utils.from_mels:
# ==============================================================================
# All three values come from the shared macros.
pitch_to_audio_utils.from_mels.nfft = %NFFT
pitch_to_audio_utils.from_mels.num_mels = %NUM_MELS
pitch_to_audio_utils.from_mels.sr = %SR
# Parameters for pitch_to_audio_utils.normalized_mels_to_audio:
# ==============================================================================
# n_iter is set here independently of pitch_to_audio_utils.torch_gl.n_iter.
pitch_to_audio_utils.normalized_mels_to_audio.n_iter = 100
pitch_to_audio_utils.normalized_mels_to_audio.nfft = %NFFT
pitch_to_audio_utils.normalized_mels_to_audio.num_mels = %NUM_MELS
pitch_to_audio_utils.normalized_mels_to_audio.sr = %SR
# Parameters for dataset.SequenceDataset:
# ==============================================================================
# "@dataset.Task()" is an evaluated reference: Gin calls dataset.Task with its
# configured bindings and passes the result as `task`.
dataset.SequenceDataset.task = @dataset.Task()
# Parameters for dataset.Task:
# ==============================================================================
# read_fn is an unevaluated reference, so the function itself is passed.
dataset.Task.read_fn = @dataset.load_cached_dataset
# kwargs values come from the shared macros; presumably these are forwarded to
# read_fn -- confirm in dataset.Task.
dataset.Task.kwargs = {"audio_len": %AUDIO_SEQ_LEN,
                       "return_singer": %SINGER_CONDITIONING}
# Parameters for pitch_to_audio_utils.torch_gl:
# ==============================================================================
# n_iter is set here independently of
# pitch_to_audio_utils.normalized_mels_to_audio.n_iter.
pitch_to_audio_utils.torch_gl.n_iter = 200
pitch_to_audio_utils.torch_gl.nfft = %NFFT
pitch_to_audio_utils.torch_gl.sr = %SR
# Parameters for pitch_to_audio_utils.torch_istft:
# ==============================================================================
# Uses the same NFFT macro as the other pitch_to_audio_utils bindings.
pitch_to_audio_utils.torch_istft.nfft = %NFFT
# Parameters for model_diffusion.UNetPitchConditioned:
# ==============================================================================
# audio_seq_len, inp_dim, singer_conditioning and sr are bound to the shared
# macros so the model agrees with the dataset and audio-utility bindings.
model_diffusion.UNetPitchConditioned.audio_seq_len = %AUDIO_SEQ_LEN
# NOTE(review): cfg together with cond_drop_prob presumably enables
# classifier-free guidance with conditioning dropout -- confirm in the model.
model_diffusion.UNetPitchConditioned.cfg = True
model_diffusion.UNetPitchConditioned.cond_drop_prob = 0.2
model_diffusion.UNetPitchConditioned.dropout = 0.3
model_diffusion.UNetPitchConditioned.f0_dim = 128
# features and strides both have three entries; presumably one per UNet level
# -- keep their lengths equal unless the model says otherwise.
model_diffusion.UNetPitchConditioned.features = [512, 640, 1024]
model_diffusion.UNetPitchConditioned.inp_dim = %NUM_MELS
model_diffusion.UNetPitchConditioned.kernel_size = 5
model_diffusion.UNetPitchConditioned.log_samples_every = 10
model_diffusion.UNetPitchConditioned.log_wandb_samples_every = 50
model_diffusion.UNetPitchConditioned.loss_w_padding = True
model_diffusion.UNetPitchConditioned.nonlinearity = 'mish'
model_diffusion.UNetPitchConditioned.norm = False
model_diffusion.UNetPitchConditioned.num_attns = 4
model_diffusion.UNetPitchConditioned.num_convs = 4
model_diffusion.UNetPitchConditioned.num_heads = 8
model_diffusion.UNetPitchConditioned.project_dim = 256
model_diffusion.UNetPitchConditioned.singer_conditioning = %SINGER_CONDITIONING
model_diffusion.UNetPitchConditioned.singer_dim = 128
model_diffusion.UNetPitchConditioned.singer_vocab = 55
model_diffusion.UNetPitchConditioned.sr = %SR
model_diffusion.UNetPitchConditioned.strides = [4, 2, 2]
model_diffusion.UNetPitchConditioned.time_dim = 128