# Name identifying this model configuration.
model_name: EzAudio-L

# Hyperparameters for the model network. How each key is interpreted is defined
# by the consuming code, which is not part of this file; notes marked
# "presumably" or "NOTE(review)" are assumptions to confirm against that code.
model:
  # Masking options. Booleans are written as lowercase true/false throughout
  # this file.
  mae: true
  mae_prob: 0.25
  # Two-element [low, high] list — presumably the range a mask ratio is drawn
  # from; confirm.
  mask_ratio: [0.25, 1.0]
  mask_span: 10
  # Input/output geometry.
  # NOTE(review): 500 = 10 * autoencoder.latent_sr (50) — presumably the
  # sequence length in latent frames; confirm.
  img_size: 500
  patch_size: 1
  in_chans: 257
  # Same value as autoencoder.dim (128).
  out_chans: 128
  # Kept quoted so the digit-leading value is unambiguously a string.
  input_type: '1d'
  # Backbone width, depth and attention settings.
  embed_dim: 1024
  depth: 24
  num_heads: 16
  mlp_ratio: 4.0
  qkv_bias: false
  qk_scale: null
  qk_norm: layernorm
  norm_layer: layernorm
  act_layer: geglu
  context_norm: true
  use_checkpoint: true
  # Time-step conditioning options.
  time_fusion: 'ada_sola_bias'
  # NOTE(review): these keys say "lora" while time_fusion says "sola" —
  # confirm the names match what the consuming code reads.
  ada_lora_rank: 32
  ada_lora_alpha: 32
  # Conditioning/context options. null leaves the option unset.
  cls_dim: null
  context_dim: 1024
  context_fusion: 'cross'
  context_max_length: null
  # 'none' is a quoted string here, not a YAML null.
  context_pe_method: 'none'
  pe_method: 'none'
  rope_mode: 'shared'
  # Remaining architecture switches.
  use_conv: true
  skip: true
  skip_norm: true

# Audio autoencoder settings.
autoencoder:
  name: stable_vae
  # Same value as model.out_chans (128).
  dim: 128
  # NOTE(review): sr / latent_sr = 24000 / 50 = 480 — presumably audio samples
  # per latent frame, with both rates in Hz; confirm.
  sr: 24000
  latent_sr: 50
  q_first: true
  # NOTE(review): scale 1.0 with shift 0.0 — presumably an identity affine
  # transform on the latents; confirm.
  scale: 1.0
  shift: 0.0

# Text encoder settings.
text_encoder:
  # Looks like a Hugging Face Hub model id — presumably loaded by name; confirm.
  model: google/flan-t5-large
  # Presumably the maximum prompt length in tokens; confirm.
  max_length: 100
  # NOTE(review): presumably the probability of dropping the text condition
  # for classifier-free guidance, not a guidance scale; confirm.
  cfg: 0.1

# Diffusion noise-schedule settings.
# NOTE(review): these key names match Hugging Face diffusers scheduler
# arguments — presumably passed through to a scheduler constructor; confirm.
diff:
  num_train_timesteps: 1000
  # Plain scalars below parse to the same strings as their quoted forms.
  beta_schedule: scaled_linear
  beta_start: 0.00085
  beta_end: 0.012
  prediction_type: v_prediction
  rescale_betas_zero_snr: true
  timestep_spacing: trailing
  clip_sample: false