# @package __global__

# This is the training loop solver
# for the base audio-MAGNeT model (text-to-sound)
# on monophonic audio sampled at 16 kHz
# using a similar EnCodec+LM setup to MAGNeT
defaults:
  - audiogen/default
  - /model: lm/audiogen_lm
  - override /dset: audio/default
  - _self_

lm_model: transformer_lm_magnet
solver: audio_magnet

autocast: true
autocast_dtype: float16

# EnCodec large trained on mono-channel music audio sampled at 16khz
# with a total stride of 320 leading to 50 frames/s.
# rvq.n_q=4, rvq.bins=2048, no quantization dropout
# (transformer_lm card and n_q must be compatible)
compression_model_checkpoint: //reference/bd44a852/checkpoint.th

channels: 1
sample_rate: 16000

deadlock:
  use: true  # deadlock detection

dataset:
  # matching AudioGen paper setup (256 * mix_p=0.5 = 128)
  batch_size: 128
  num_workers: 10
  segment_duration: 10
  min_segment_ratio: 1.0
  sample_on_weight: false  # Uniform sampling all the way
  sample_on_duration: false  # Uniform sampling all the way
  external_metadata_source: null
  # sample mixing augmentation at train time
  train:
    batch_size: 256  # matching AudioGen paper setup
    aug_p: 0.5  # perform audio mixing 50% of the time
    # proportion of batch items mixed together
    # important: note that this will reduce the
    # actual batch size used at train time
    # which will be equal to mix_p * batch_size
    mix_p: 0.5
    mix_snr_low: -5
    mix_snr_high: 5
    mix_min_overlap: 0.5

optim:
  epochs: 100
  optimizer: adamw
  lr: 5e-4
  ema:
    use: true
    updates: 10
    device: cuda

logging:
  log_tensorboard: true

schedule:
  lr_scheduler: inverse_sqrt
  inverse_sqrt:
    warmup: 3000
    warmup_init_lr: 0.0

codebooks_pattern:
  modeling: parallel
  parallel:
    empty_initial: -1

transformer_lm:
  card: 2048
  causal: false
  subcodes_context: 5
  # NOTE: Must match the actual frame rate of the used compression model
  compression_model_framerate: 50
  segment_duration: 0
  span_len: -1

masking:
  span_len: 3

generate:
  lm:
    max_prompt_len: null
    max_gen_len: null
    remove_prompts: false
    use_sampling: true
    temp: 3.5
    top_k: 0
    top_p: 0.8
    max_cfg_coef: 20.0
    min_cfg_coef: 1.0
    decoding_steps: [20, 10, 10, 10]
    anneal_temp: true
    span_scoring: 'max'
    span_arrangement: 'nonoverlap'
    prompted_samples: false
    # NOTE(review): nesting of `samples` under `lm` is reconstructed from
    # key order in the flattened file - confirm against the solver schema.
    samples:
      prompted: false
      unprompted: true