Spaces:

facebook
/

MelodyFlow

Running on Zero

File size: 3,989 Bytes

9d0d223

# @package __global__

# Hydra defaults list. Entries are composed in the order written; `_self_`
# marks where this file's own keys are merged relative to the others, and
# `override` replaces a `/dset` choice made by an earlier default.
defaults:
  - /solver/default
  - /augmentations/default
  - override /dset: audio/example
  - _self_

solver: watermarking # standard name to load the solver using builders
# `???` is the OmegaConf marker for a mandatory value: it is not set in this
# file and has to be supplied by another config or on the command line.
sample_rate: ???
channels: ???

# all the defaults from compression
# One value per loss term. Most keys share their name with a top-level
# hyperparameter section elsewhere in this file.
# NOTE(review): values are presumably loss weights - confirm in the solver.
losses:
  adv: 4.0
  feat: 4.0
  l1: 0.1
  mel: 0.0
  msspec: 2.0
  sisnr: 0.0
  wm_detection: 1.0  # loss for the first 2 bits; cannot be 0
  wm_mb: 1.0  # loss for the rest of the bits (wm message)
  tf_loudnessratio: 10.0

# Settings for the loss balancer.
# NOTE(review): the meaning of each flag is not visible in this file -
# confirm against the balancer implementation before changing values.
balancer:
  balance_grads: true
  ema_decay: 0.999
  per_batch_item: true
  total_norm: 1.0

# Crop settings. `prob`, `shuffle_prob` and `pad_prob` must together stay at
# or below 1 (see the note on `pad_prob`); the values here sum to 0.8.
# NOTE(review): the unit of `size` and the exact effect of each option are
# not visible in this file - confirm against the code that reads `crop`.
crop:
  prob: 0.4
  shuffle_prob: 0.2
  pad_prob: 0.2  # shuffle_prob + pad_prob + prob <= 1
  size: 0.5
  max_n_windows: 5

# Adversarial training setup. Only `msstftd` is enabled here; this file also
# defines `msd` and `mpd` sections that are not listed.
# NOTE(review): `every` is presumably an update interval - confirm.
adversarial:
  every: 1
  adversaries:
    - msstftd
  adv_loss: hinge
  feat_loss: l1

# Shares its name with the `tf_loudnessratio` entry in `losses`; presumably
# that loss's hyperparameters. `sample_rate` is interpolated from the
# top-level `sample_rate` key.
# NOTE(review): units of `segment` and `overlap` are not stated - confirm.
tf_loudnessratio:
  sample_rate: ${sample_rate}
  segment: 0.5
  overlap: 0.5
  n_bands: 16
  temperature: 1.0

# watermarking: audioseal

# losses hyperparameters
# `l1` and `l2` are empty mappings (no options set here). Note that `losses`
# has an `l1` entry but no `l2` entry in this file.
l1: {}
l2: {}

# Options for the `wm_detection` loss.
# NOTE(review): p_weight / n_weight look like positive / negative class
# weights - confirm against the loss implementation.
wm_detection:
  p_weight: 1
  n_weight: 1

# Options for the `wm_mb` loss (the watermark message bits, per the comment
# on `wm_mb` in `losses`).
wm_mb:
  loss_type: bce # loss between decoded and original
  temperature: 0.1 # decoded is divided by temperature before loss computation

# `spec_range` and `spec_entropy_range` currently hold identical settings;
# keep them in sync deliberately if one is changed. Neither name has an
# entry in `losses` in this file.
# NOTE(review): min/max frequencies are presumably in Hz - confirm.
spec_range:
  n_fft: 2048
  min_frequency: 300.0
  max_frequency: 15000.0
  sample_rate: ${sample_rate}
spec_entropy_range:
  n_fft: 2048
  min_frequency: 300.0
  max_frequency: 15000.0
  sample_rate: ${sample_rate}
# Options for `mrstft`; both factors are one half. There is no `mrstft`
# entry in `losses` in this file.
mrstft:
  factor_sc: 0.5
  factor_mag: 0.5
  normalized: false
# Options for the `mel` loss (its entry in `losses` is 0.0 in this file).
# The hop length is n_fft / 4 and the window length equals n_fft.
mel:
  sample_rate: ${sample_rate}
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  n_mels: 64
  f_min: 64
  f_max: null
  normalized: false
  # Written with an explicit mantissa decimal point so that YAML 1.1 loaders
  # (e.g. plain PyYAML) also resolve it as a float rather than a string.
  floor_level: 1.0e-5
# Options for the `sisnr` loss (its entry in `losses` is 0.0 in this file).
# NOTE(review): `segment` is presumably a duration in seconds - confirm.
sisnr:
  sample_rate: ${sample_rate}
  segment: 5.0
# Options for the `msspec` loss.
# NOTE(review): `range_start` / `range_end` are presumably exponents that
# select the spectrogram scales - confirm against the loss implementation.
msspec:
  sample_rate: ${sample_rate}
  range_start: 6
  range_end: 11
  n_mels: 64
  f_min: 64
  f_max: null
  normalized: true
  alphas: false
  # Written with an explicit mantissa decimal point so that YAML 1.1 loaders
  # (e.g. plain PyYAML) also resolve it as a float rather than a string.
  floor_level: 1.0e-5

# metrics
# `bin` is null, so no ViSQOL install path is configured here; the
# `evaluate.metrics.visqol` switch in this file is also false.
metrics:
  visqol:
    mode: audio
    bin: null # path to visqol install
    model: tcdaudio14_aacvopus_coresv_svrnsim_n.68_g.01_c1.model # visqol v3

# adversaries hyperparameters
# `msstftd` is the adversary listed under `adversarial.adversaries`.
# The three per-resolution lists have five entries each; every hop length
# is n_fft / 4 and every window length equals its n_fft.
msstftd:
  in_channels: 1
  out_channels: 1
  filters: 32
  norm: weight_norm
  n_ffts: [1024, 2048, 512, 256, 128]
  hop_lengths: [256, 512, 128, 64, 32]
  win_lengths: [1024, 2048, 512, 256, 128]
  activation: LeakyReLU
  activation_params:
    negative_slope: 0.3
# Options for the `msd` adversary. It is not listed under
# `adversarial.adversaries` in this file.
# `scale_norms` has three entries; `downsample_scales` and `groups` have
# four entries each.
msd:
  in_channels: 1
  out_channels: 1
  scale_norms: [spectral_norm, weight_norm, weight_norm]
  kernel_sizes: [5, 3]
  filters: 16
  max_filters: 1024
  downsample_scales: [4, 4, 4, 4]
  inner_kernel_sizes: null
  groups: [4, 4, 4, 4]
  strides: null
  paddings: null
  activation: LeakyReLU
  activation_params:
    negative_slope: 0.3
# Options for the `mpd` adversary. It is not listed under
# `adversarial.adversaries` in this file.
mpd:
  in_channels: 1
  out_channels: 1
  periods: [2, 3, 5, 7, 11]
  n_layers: 5
  kernel_size: 5
  stride: 3
  filters: 8
  filter_scales: 4
  max_filters: 1024
  activation: LeakyReLU
  activation_params:
    negative_slope: 0.3
  norm: weight_norm

# data hyperparameters
# Top-level keys are followed by one sub-section per stage (train, valid,
# evaluate, generate). `evaluate` and `generate` repeat `batch_size` and set
# a longer `segment_duration` (10 and 30 versus 1 at the top level).
# NOTE(review): the sub-sections presumably override the top-level values
# for that stage, and `segment_duration` is presumably in seconds - confirm.
dataset:
  batch_size: 16
  num_workers: 10
  segment_duration: 1
  train:
    num_samples: 500000
  valid:
    num_samples: 10000
  evaluate:
    batch_size: 16
    num_samples: 10000
    segment_duration: 10

  generate:
    batch_size: 16
    num_samples: 50
    segment_duration: 30

# solver hyperparameters
# These top-level `evaluate` / `generate` sections are separate from the
# `dataset.evaluate` / `dataset.generate` sections of this file.
# NOTE(review): `every` is presumably counted in epochs - confirm.
evaluate:
  every: 10
  num_workers: 5
  # Per-metric on/off switches: visqol is disabled, sisnr is enabled.
  metrics:
    visqol: false
    sisnr: true
generate:
  every: 10
  num_workers: 5
  audio:
    # Interpolated from the top-level `sample_rate` key.
    sample_rate: ${sample_rate}

# checkpointing schedule
# NOTE(review): `save_every` / `keep_last` are presumably counted in epochs
# and checkpoints respectively; `keep_every_states` is unset (null) here.
# Confirm the exact semantics against the checkpointing code.
checkpoint:
  save_last: true
  save_every: 25
  keep_last: 10
  keep_every_states: null



# optimization hyperparameters
# `optimizer` matches the name of the `adam` sub-section below.
optim:
  epochs: 300
  updates_per_epoch: 2000
  # Written with an explicit mantissa decimal point so that YAML 1.1 loaders
  # (e.g. plain PyYAML) also resolve it as a float rather than a string.
  lr: 5.0e-5
  max_norm: 3.0
  optimizer: adam
  adam:
    betas: [0.5, 0.9]
    weight_decay: 0.0
  ema:
    use: true  # whether to use EMA or not
    updates: 1  # update at every step
    # `device` is not defined in this file; it is interpolated from a key
    # that must come from elsewhere in the composed config.
    device: ${device}  # device for EMA, can be put on GPU if more frequent updates
    decay: 0.99  # EMA decay value, if null, no EMA is used

  
# Learning-rate schedule. `lr_scheduler` matches the name of the `cosine`
# sub-section below.
# NOTE(review): `warmup` is presumably counted in optimizer updates - confirm.
schedule:
  lr_scheduler: cosine
  cosine:
    warmup: 4000
    lr_min_ratio: 0.0
    cycle_length: 1.0