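# Training config for LCM_audio: latent-consistency-model (LCM) distillation of a
# text-to-audio latent diffusion model over mel-spectrogram latents, initialized
# from the teacher checkpoint at ./ckpt/maa2.ckpt.
#
# A typical launch command for latent-diffusion-style repos (an assumption, as is
# the config filename -- check this repo's README for the exact entry point):
#   python main.py --base configs/lcm_audio.yaml -t --gpus 0,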
model:
  base_learning_rate: 3.0e-06
  target: ldm.models.diffusion.lcm_audio.LCM_audio
  params:
    linear_start: 0.00085
    linear_end: 0.012
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    mel_dim: 20
    mel_length: 312
    channels: 0
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_by_std: true
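    # LCM distillation settings: the 1000-step teacher schedule is distilled
    # against a 50-step DDIM skip schedule; w_min/w_max presumably bound the
    # classifier-free guidance scale sampled during distillation, as in the
    # LCM paper.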
    use_lcm: true
    num_ddim_timesteps: 50
    w_min: 4
    w_max: 12
    ckpt_path: ./ckpt/maa2.ckpt

    use_ema: false
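    # LR schedule: LambdaLinearScheduler warms the LR multiplier from f_start
    # (1e-6) to f_max (1.0) over 10k steps; since f_min equals f_max, it then
    # stays constant for the effectively unbounded cycle length.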
    scheduler_config:
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps:
        - 10000
        cycle_lengths:
        - 10000000000000
        f_start:
        - 1.0e-06
        f_max:
        - 1.0
        f_min:
        - 1.0
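    # Denoiser backbone: a DiT-style transformer with cross-attention to the text
    # embedding. in_channels 20 matches the autoencoder latent width below, and
    # context_dim 1024 presumably matches the CLAP/FLAN embedder's output size.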
    unet_config:
      target: ldm.modules.diffusionmodules.concatDiT.ConcatDiT2MLP
      params:
        in_channels: 20
        context_dim: 1024
        hidden_size: 576
        num_heads: 8
        depth: 4
        max_len: 1000
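    # First stage: a 1-D KL autoencoder compressing 80-bin mel spectrograms into
    # 20-channel latents (z_channels matches mel_dim above). Its single
    # downsampling layer presumably halves the time axis, consistent with
    # mel_length 312 being half of spec_crop_len 624.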
    first_stage_config:
      target: ldm.models.autoencoder1d.AutoencoderKL
      params:
        embed_dim: 20
        monitor: val/rec_loss
        ckpt_path: ./model/AutoencoderKL/epoch=000032.ckpt
        ddconfig:
          double_z: true
          in_channels: 80
          out_ch: 80
          z_channels: 20
          kernel_size: 5
          ch: 384
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 2
          attn_layers:
          - 3
          down_layers:
          - 0
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
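    # Conditioning: a frozen CLAP + FLAN (presumably FLAN-T5) text embedder;
    # captions are encoded once and injected via cross-attention (see
    # conditioning_key above).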
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLAPFLANEmbedder
      params:
        weights_path: ./model/FrozenCLAPFLANEmbedder/CLAP_weights_2022.pth

lightning:
  callbacks:
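    # AudioLogger logs up to max_images samples every batch_frequency steps,
    # rendering mel spectrograms back to waveforms through the BigVGAN vocoder;
    # melvmin/melvmax set the display range for the logged spectrograms.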
    image_logger:
      target: main.AudioLogger
      params:
        sample_rate: 16000
        for_specs: true
        increase_log_steps: false
        batch_frequency: 5000
        max_images: 8
        melvmin: -5
        melvmax: 1.5
        vocoder_cfg:
          target: vocoder.bigvgan.models.VocoderBigVGAN
          params:
            ckpt_vocoder: ./vocoder/logs/bigvnat16k93.5w
  trainer:
    benchmark: true
    gradient_clip_val: 1.0
    replace_sampler_ddp: false
    max_epochs: 100
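  # Checkpointing: monitoring "epoch" with mode max and save_top_k 100 in effect
  # keeps the 100 most recent epoch checkpoints, written every 3 epochs.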
  modelcheckpoint:
    params:
      monitor: epoch
      mode: max
      # every_n_train_steps: 2000
      save_top_k: 100
      every_n_epochs: 3

data:
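  # TSV-driven spectrogram datamodule: spec_dir_path points at the (presumably
  # AudioCaps) caps_struct TSV directory; mel_num 80 matches the autoencoder's
  # mel bins.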
  target: main.SpectrogramDataModuleFromConfig
  params:
    batch_size: 8
    num_workers: 32
    spec_dir_path: 'ldm/data/tsv_dirs/full_data/caps_struct'
    mel_num: 80
    train:
      target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsTrain
      params:
        specs_dataset_cfg:
    validation:
      target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsValidation
      params:
        specs_dataset_cfg:

test_dataset:
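  # Evaluation: AudioCaps test set at 16 kHz; spec_crop_len 624 mel frames
  # corresponds to the 312-frame latent length after the VAE's 2x downsampling.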
  target: ldm.data.tsvdataset.TSVDatasetStruct
  params:
    tsv_path: audiocaps_test_16000_struct.tsv
    spec_crop_len: 624