File size: 3,989 Bytes
9d0d223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# @package __global__

# Hydra defaults list: the configs named below are composed with this file.
defaults:
  - /solver/default
  - /augmentations/default
  # Replaces the already-selected option of the `dset` group with audio/example.
  - override /dset: audio/example
  # `_self_` is last, so keys set in this file are merged after the entries above.
  - _self_

solver: watermarking # standard name to load the solver using builders
# `???` is the OmegaConf marker for a mandatory value: nothing is set here, so
# both keys have to be provided by another config or an override.
sample_rate: ???
channels: ???

# all the defaults from compression
# One coefficient per loss name (presumably the weight applied to each term
# by the solver — confirm against the solver code).
losses:
  adv: 4.0
  feat: 4.0
  l1: 0.1
  mel: 0.0
  msspec: 2.0
  sisnr: 0.0
  wm_detection: 1.0  # loss for the first 2 bits; cannot be 0
  wm_mb: 1.0  # loss for the remaining bits (the watermark message)
  tf_loudnessratio: 10.0

# Options for the loss balancer (presumably handed to the balancer
# implementation as-is — confirm in the solver).
balancer:
  balance_grads: true
  ema_decay: 0.999
  per_batch_item: true
  total_norm: 1.0

# Crop settings. The three probabilities below currently sum to 0.8
# (0.4 + 0.2 + 0.2), which satisfies the constraint noted on `pad_prob`.
# NOTE(review): how `size` and `max_n_windows` are interpreted is decided by
# the consumer of this section, which is not visible here.
crop:
  prob: 0.4
  shuffle_prob: 0.2
  pad_prob: 0.2  # shuffle_prob + pad_prob + prob <= 1
  size: 0.5
  max_n_windows: 5

# Adversarial training setup. Each name under `adversaries` presumably selects
# the top-level section of the same name in this file (e.g. `msstftd`) for
# that adversary's hyperparameters — confirm in the builders.
adversarial:
  every: 1
  adversaries:
    - msstftd
  adv_loss: hinge
  feat_loss: l1

# Hyperparameters for the `tf_loudnessratio` loss, whose coefficient is set
# under `losses.tf_loudnessratio`.
# NOTE(review): units of `segment` / `overlap` are not stated here — confirm.
tf_loudnessratio:
  # Interpolation: resolves to the top-level `sample_rate` key.
  sample_rate: ${sample_rate}
  segment: 0.5
  overlap: 0.5
  n_bands: 16
  temperature: 1.0

# watermarking: audioseal

# losses hyperparameters
# Empty mappings: no extra hyperparameters are set for these two losses here.
# Note that `l2` has no coefficient under `losses` in this file, only `l1` does.
l1: {}
l2: {}

# Hyperparameters for the `wm_detection` loss, whose coefficient is set under
# `losses.wm_detection`.
# NOTE(review): `p_weight` / `n_weight` look like positive- and negative-class
# weights — confirm in the loss implementation.
wm_detection:
  p_weight: 1
  n_weight: 1

# Hyperparameters for the `wm_mb` loss, whose coefficient is set under
# `losses.wm_mb`.
wm_mb:
  loss_type: bce # loss computed between the decoded and the original message
  temperature: 0.1 # the decoded output is divided by this before the loss is computed

# Settings for `spec_range`; the values are identical to `spec_entropy_range`.
# NOTE(review): no `spec_range` coefficient appears under `losses` in this
# file, and the frequency bounds are presumably in Hz — confirm both.
spec_range:
  n_fft: 2048
  min_frequency: 300.0
  max_frequency: 15000.0
  # Interpolation: resolves to the top-level `sample_rate` key.
  sample_rate: ${sample_rate}
# Settings for `spec_entropy_range`; the values are identical to `spec_range`.
# NOTE(review): no `spec_entropy_range` coefficient appears under `losses` in
# this file, and the frequency bounds are presumably in Hz — confirm both.
spec_entropy_range:
  n_fft: 2048
  min_frequency: 300.0
  max_frequency: 15000.0
  # Interpolation: resolves to the top-level `sample_rate` key.
  sample_rate: ${sample_rate}
# Settings for `mrstft`. The two factors are equal (0.5 each).
# NOTE(review): no `mrstft` coefficient appears under `losses` in this file.
mrstft:
  factor_sc: 0.5
  factor_mag: 0.5
  normalized: false
# Hyperparameters for the `mel` loss, whose coefficient is set under
# `losses.mel` (0.0 in this file).
mel:
  # Interpolation: resolves to the top-level `sample_rate` key.
  sample_rate: ${sample_rate}
  # The window spans the full FFT size; the hop is a quarter of it.
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  n_mels: 64
  f_min: 64
  f_max: null
  normalized: false
  # Written with a decimal point so that YAML 1.1 loaders (e.g. plain PyYAML),
  # which require one in floats, also read this as a number, not a string.
  floor_level: 1.0e-5
# Hyperparameters for the `sisnr` loss, whose coefficient is set under
# `losses.sisnr` (0.0 in this file).
# NOTE(review): `segment` is presumably a duration in seconds — confirm.
sisnr:
  sample_rate: ${sample_rate}  # resolves to the top-level `sample_rate` key
  segment: 5.0
# Hyperparameters for the `msspec` loss, whose coefficient is set under
# `losses.msspec`.
# NOTE(review): the meaning of `range_start` / `range_end` is defined by the
# loss implementation, which is not visible here.
msspec:
  # Interpolation: resolves to the top-level `sample_rate` key.
  sample_rate: ${sample_rate}
  range_start: 6
  range_end: 11
  n_mels: 64
  f_min: 64
  f_max: null
  normalized: true
  alphas: false
  # Written with a decimal point so that YAML 1.1 loaders (e.g. plain PyYAML),
  # which require one in floats, also read this as a number, not a string.
  floor_level: 1.0e-5

# metrics
# Metric settings. Only ViSQOL is configured here; no install path is set
# (`bin` is null) and the metric is switched off under `evaluate.metrics`.
metrics:
  visqol:
    mode: audio
    bin: null # path to visqol install
    model: tcdaudio14_aacvopus_coresv_svrnsim_n.68_g.01_c1.model # visqol v3

# adversaries hyperparameters
# Hyperparameters for the `msstftd` adversary, the one listed under
# `adversarial.adversaries` in this file.
msstftd:
  in_channels: 1
  out_channels: 1
  filters: 32
  norm: weight_norm
  # Three parallel lists of five entries each: at every position the window
  # length equals the FFT size and the hop length is a quarter of it.
  n_ffts: [1024, 2048, 512, 256, 128]
  hop_lengths: [256, 512, 128, 64, 32]
  win_lengths: [1024, 2048, 512, 256, 128]
  activation: LeakyReLU
  activation_params: { negative_slope: 0.3 }
# Hyperparameters for the `msd` adversary. It is not listed under
# `adversarial.adversaries` in this file.
msd:
  in_channels: 1
  out_channels: 1
  # Three entries; presumably one norm per scale — confirm in the model code.
  scale_norms: [spectral_norm, weight_norm, weight_norm]
  kernel_sizes: [5, 3]
  filters: 16
  max_filters: 1024
  # `downsample_scales` and `groups` both have four entries.
  downsample_scales: [4, 4, 4, 4]
  inner_kernel_sizes: null
  groups: [4, 4, 4, 4]
  strides: null
  paddings: null
  activation: LeakyReLU
  activation_params: { negative_slope: 0.3 }
# Hyperparameters for the `mpd` adversary. It is not listed under
# `adversarial.adversaries` in this file.
mpd:
  in_channels: 1
  out_channels: 1
  # Five periods, all prime numbers.
  periods: [2, 3, 5, 7, 11]
  n_layers: 5
  kernel_size: 5
  stride: 3
  filters: 8
  filter_scales: 4
  max_filters: 1024
  activation: LeakyReLU
  activation_params: { negative_slope: 0.3 }
  norm: weight_norm

# data hyperparameters
# Data loading settings. The keys directly under `dataset` sit next to the
# per-split sections (`train`, `valid`, `evaluate`, `generate`); presumably
# they are shared defaults that a split may override, as `evaluate` and
# `generate` do for `segment_duration` — confirm in the data loader.
# NOTE(review): `segment_duration` is presumably in seconds — confirm.
dataset:
  batch_size: 16
  num_workers: 10
  segment_duration: 1
  train:
    num_samples: 500000
  valid:
    num_samples: 10000
  evaluate:
    batch_size: 16
    num_samples: 10000
    segment_duration: 10

  generate:
    batch_size: 16
    num_samples: 50
    segment_duration: 30

# solver hyperparameters
# Evaluation stage settings.
# NOTE(review): the unit of `every` is not stated in this file (likely
# epochs) — confirm in the solver.
evaluate:
  every: 10
  num_workers: 5
  # Per-metric switches: only `sisnr` is enabled here.
  metrics:
    visqol: false
    sisnr: true
# Generation stage settings.
# NOTE(review): the unit of `every` is not stated in this file (likely
# epochs) — confirm in the solver.
generate:
  every: 10
  num_workers: 5
  audio:
    # Interpolation: resolves to the top-level `sample_rate` key.
    sample_rate: ${sample_rate}

# checkpointing schedule
# Checkpoint saving and retention settings.
# NOTE(review): the unit of `save_every` and the exact meaning of `keep_last`
# and `keep_every_states` are defined by the checkpointing code, which is not
# visible here — confirm before changing them.
checkpoint:
  save_last: true
  save_every: 25
  keep_last: 10
  keep_every_states: null



# optimization hyperparameters
# Optimizer settings. `optimizer` names the optimizer, and the sub-section of
# the same name (`adam`) carries its options.
optim:
  epochs: 300
  updates_per_epoch: 2000
  # Written with a decimal point so that YAML 1.1 loaders (e.g. plain PyYAML),
  # which require one in floats, also read this as a number, not a string.
  lr: 5.0e-5
  max_norm: 3.0
  optimizer: adam
  adam:
    betas: [0.5, 0.9]
    weight_decay: 0.0
  ema:
    use: true  # whether to use EMA or not
    updates: 1  # update at every step
    # `device` is not defined in this file; the interpolation is resolved
    # against the composed config.
    device: ${device}  # device for EMA, can be put on GPU if more frequent updates
    decay: 0.99  # EMA decay value, if null, no EMA is used

  
# Learning-rate schedule. `lr_scheduler` names the scheduler, and the
# sub-section of the same name (`cosine`) carries its options.
# NOTE(review): `warmup` is presumably counted in optimizer steps — confirm.
schedule:
  lr_scheduler: cosine
  cosine:
    warmup: 4000
    lr_min_ratio: 0.0
    cycle_length: 1.0