# @package __global__

# This is the training loop solver
# for the base audio-MAGNeT model (text-to-sound)
# on monophonic audio sampled at 16 kHz
# using a similar EnCodec+LM setup to MAGNeT
defaults:
  - audiogen/default
  - /model: lm/audiogen_lm
  - override /dset: audio/default
  - _self_  # listed last so the values set in this file are merged after the entries above
lm_model: transformer_lm_magnet
solver: audio_magnet

autocast: true
autocast_dtype: float16

# EnCodec large trained on mono-channel music audio sampled at 16 kHz
# with a total stride of 320, leading to 50 frames/s
# (16000 samples/s / 320 samples per frame = 50 frames/s).
# rvq.n_q=4, rvq.bins=2048, no quantization dropout
# (transformer_lm card and n_q must be compatible)
compression_model_checkpoint: //reference/bd44a852/checkpoint.th

channels: 1
sample_rate: 16000
deadlock:
  use: true  # deadlock detection
dataset:
  batch_size: 128  # matching AudioGen paper setup (256 * mix_p=0.5 = 128)
  num_workers: 10
  segment_duration: 10
  min_segment_ratio: 1.0
  sample_on_weight: false  # Uniform sampling all the way
  sample_on_duration: false  # Uniform sampling all the way
  external_metadata_source: null
  # sample mixing augmentation at train time
  # NOTE(review): `train` is nested under `dataset` because it repeats the
  # `batch_size` key; nesting was reconstructed from a copy that had lost
  # its indentation - confirm against the upstream config.
  train:
    batch_size: 256  # matching AudioGen paper setup
    aug_p: 0.5  # perform audio mixing 50% of the time
    mix_p: 0.5  # proportion of batch items mixed together
                # important: note that this will reduce the
                # actual batch size used at train time
                # which will be equal to mix_p * batch_size
    mix_snr_low: -5
    mix_snr_high: 5
    mix_min_overlap: 0.5
optim:
  epochs: 100
  optimizer: adamw
  lr: 5e-4
  # NOTE(review): `use`, `updates` and `device` are placed under `ema`;
  # nesting was reconstructed from a copy that had lost its indentation -
  # confirm against the upstream config.
  ema:
    use: true
    updates: 10
    device: cuda
logging:
  log_tensorboard: true
schedule:
  lr_scheduler: inverse_sqrt
  # Settings for the scheduler selected by `lr_scheduler` above.
  inverse_sqrt:
    warmup: 3000
    warmup_init_lr: 0.0
codebooks_pattern:
  modeling: parallel
  # Settings for the pattern selected by `modeling` above.
  parallel:
    empty_initial: -1
transformer_lm:
  card: 2048
  causal: false
  subcodes_context: 5
  compression_model_framerate: 50  # NOTE: Must match the actual frame rate of the used compression model
  segment_duration: 0
  span_len: -1
masking:
  span_len: 3
# NOTE(review): the nesting below (everything under `generate.lm`, with
# `samples` as a child mapping) was reconstructed from a copy that had lost
# its indentation - confirm the depth of `prompted_samples` and `samples`
# against the upstream config.
generate:
  lm:
    max_prompt_len: null
    max_gen_len: null
    remove_prompts: false
    use_sampling: true
    temp: 3.5
    top_k: 0
    top_p: 0.8
    max_cfg_coef: 20.0
    min_cfg_coef: 1.0
    decoding_steps: [20, 10, 10, 10]  # presumably one entry per codebook (n_q=4) - TODO confirm
    anneal_temp: true
    span_scoring: 'max'
    span_arrangement: 'nonoverlap'
    prompted_samples: false
    samples:
      prompted: false
      unprompted: true