update

Browse files

Files changed (11) hide show

ckpts/.ipynb_checkpoints/ezaudio-l-checkpoint.yml +60 -0
ckpts/.ipynb_checkpoints/ezaudio-xl-checkpoint.yml +60 -0
ckpts/controlnet/.ipynb_checkpoints/energy_l-checkpoint.yml +79 -0
ckpts/controlnet/energy_l.yml +79 -0
ckpts/controlnet/s3_l_energy.pt +3 -0
ckpts/ezaudio-l.yml +60 -0
ckpts/ezaudio-xl.yml +60 -0
ckpts/s3/ezaudio_s3_l.pt +3 -0
ckpts/s3/ezaudio_s3_xl.pt +3 -0
ckpts/vae/1m.pt +3 -0
ckpts/vae/config.json +122 -0

ckpts/.ipynb_checkpoints/ezaudio-l-checkpoint.yml ADDED Viewed

	@@ -0,0 +1,60 @@

+# EzAudio-L configuration.
+# Top-level sections: model, autoencoder, text_encoder, diff.
+# NOTE(review): this path is under .ipynb_checkpoints and its content
+# mirrors ckpts/ezaudio-l.yml; it looks like a Jupyter autosave copy.
+model_name: EzAudio-L
+model:
+  mae: true
+  mae_prob: 0.25
+  mask_ratio: [0.25, 1.0]
+  mask_span: 10
+  img_size: 500
+  patch_size: 1
+  in_chans: 257
+  out_chans: 128  # same value as autoencoder.dim below
+  input_type: '1d'
+  embed_dim: 1024
+  depth: 24
+  num_heads: 16
+  mlp_ratio: 4.0
+  qkv_bias: false
+  qk_scale: null
+  qk_norm: layernorm
+  norm_layer: layernorm
+  act_layer: geglu
+  context_norm: true
+  use_checkpoint: true
+  time_fusion: 'ada_sola_bias'
+  # NOTE(review): the EzAudio-XL config spells the next two keys
+  # ada_sola_rank / ada_sola_alpha (matching time_fusion above); confirm
+  # which spelling the model code actually reads.
+  ada_lora_rank: 32
+  ada_lora_alpha: 32
+  cls_dim: null
+  context_dim: 1024  # presumably the text encoder hidden size; verify
+  context_fusion: 'cross'
+  context_max_length: null
+  context_pe_method: 'none'
+  pe_method: 'none'
+  rope_mode: 'shared'
+  use_conv: true
+  skip: true
+  skip_norm: true
+autoencoder:
+  name: stable_vae
+  dim: 128
+  sr: 24000
+  latent_sr: 50  # sr / latent_sr = 480 audio samples per latent frame
+  q_first: true
+  scale: 1.0
+  shift: 0.0
+text_encoder:
+  model: google/flan-t5-large
+  max_length: 100
+  cfg: 0.1
+diff:
+  num_train_timesteps: 1000
+  beta_schedule: 'scaled_linear'
+  beta_start: 0.00085
+  beta_end: 0.012
+  prediction_type: 'v_prediction'
+  rescale_betas_zero_snr: true
+  timestep_spacing: 'trailing'
+  clip_sample: false

ckpts/.ipynb_checkpoints/ezaudio-xl-checkpoint.yml ADDED Viewed

	@@ -0,0 +1,60 @@

+# EzAudio-XL configuration.
+# Top-level sections: model, autoencoder, text_encoder, diff.
+# NOTE(review): this path is under .ipynb_checkpoints and its content
+# mirrors ckpts/ezaudio-xl.yml; it looks like a Jupyter autosave copy.
+model_name: EzAudio-XL
+model:
+  mae: true
+  mae_prob: 0.25
+  mask_ratio: [0.25, 1.0]
+  mask_span: 10
+  img_size: 500
+  patch_size: 1
+  in_chans: 257
+  out_chans: 128  # same value as autoencoder.dim below
+  input_type: '1d'
+  embed_dim: 1152
+  depth: 28
+  num_heads: 16
+  mlp_ratio: 4.0
+  qkv_bias: false
+  qk_scale: null
+  qk_norm: layernorm
+  norm_layer: layernorm
+  act_layer: geglu
+  context_norm: true
+  use_checkpoint: true
+  time_fusion: 'ada_sola_bias'
+  # NOTE(review): the EzAudio-L configs spell the next two keys
+  # ada_lora_rank / ada_lora_alpha; confirm which spelling the model
+  # code actually reads.
+  ada_sola_rank: 36
+  ada_sola_alpha: 36
+  cls_dim: null
+  context_dim: 2048  # presumably the text encoder hidden size; verify
+  context_fusion: 'cross'
+  context_max_length: null
+  context_pe_method: 'none'
+  pe_method: 'none'
+  rope_mode: 'shared'
+  use_conv: true
+  skip: true
+  skip_norm: true
+autoencoder:
+  name: stable_vae
+  dim: 128
+  sr: 24000
+  latent_sr: 50  # sr / latent_sr = 480 audio samples per latent frame
+  q_first: true
+  scale: 1.0
+  shift: 0.0
+text_encoder:
+  model: google/flan-t5-xl
+  max_length: 100
+  cfg: 0.1
+diff:
+  num_train_timesteps: 1000
+  beta_schedule: 'scaled_linear'
+  beta_start: 0.00085
+  beta_end: 0.012
+  prediction_type: 'v_prediction'
+  rescale_betas_zero_snr: true
+  timestep_spacing: 'trailing'
+  clip_sample: false

ckpts/controlnet/.ipynb_checkpoints/energy_l-checkpoint.yml ADDED Viewed

	@@ -0,0 +1,79 @@

+# EzAudio-L-Energy configuration (energy-conditioned ControlNet variant).
+# Top-level sections: model, controlnet, conditioner, autoencoder,
+# text_encoder, diff.
+# NOTE(review): this path is under .ipynb_checkpoints and its content
+# mirrors ckpts/controlnet/energy_l.yml; it looks like a Jupyter
+# autosave copy.
+model_name: EzAudio-L-Energy
+model:
+  mae: true
+  mae_prob: 0.25
+  mask_ratio: [0.25, 1.0]
+  mask_span: 10
+  img_size: 500
+  patch_size: 1
+  in_chans: 257
+  out_chans: 128  # same value as autoencoder.dim below
+  input_type: '1d'
+  embed_dim: 1024
+  depth: 24
+  num_heads: 16
+  mlp_ratio: 4.0
+  qkv_bias: false
+  qk_scale: null
+  qk_norm: layernorm
+  norm_layer: layernorm
+  act_layer: geglu
+  context_norm: true
+  use_checkpoint: true
+  time_fusion: 'ada_sola_bias'
+  # NOTE(review): the EzAudio-XL config spells the next two keys
+  # ada_sola_rank / ada_sola_alpha (matching time_fusion above); confirm
+  # which spelling the model code actually reads.
+  ada_lora_rank: 32
+  ada_lora_alpha: 32
+  cls_dim: null
+  context_dim: 1024  # presumably the text encoder hidden size; verify
+  context_fusion: 'cross'
+  context_max_length: null
+  context_pe_method: 'none'
+  pe_method: 'none'
+  rope_mode: 'shared'
+  use_conv: true
+  skip: true
+  skip_norm: true
+controlnet:
+  cond_in: 1
+  cond_blocks: [64, 128]
+  cond_mask: true
+  cond_mask_prob: 0.25
+  cond_mask_ratio: [0.25, 0.50]
+  cond_mask_span: 10
+conditioner:
+  condition_type: energy
+  hop_size: 240
+  window_size: 1920
+  padding: 'reflect'
+  min_db: -60
+  norm: true
+# Other studies usually set q_first to false; this config sets it to true.
+autoencoder:
+  name: stable_vae
+  dim: 128
+  sr: 24000
+  latent_sr: 50  # sr / latent_sr = 480 audio samples per latent frame
+  q_first: true
+  scale: 1.0
+  shift: 0.0
+# A fixed length (presumably max_length below) should be set when using
+# concat mode or distributed training.
+text_encoder:
+  model: google/flan-t5-large
+  max_length: 100
+  cfg: 0.1
+diff:
+  num_train_timesteps: 1000
+  beta_schedule: 'scaled_linear'
+  beta_start: 0.00085
+  beta_end: 0.012
+  prediction_type: 'v_prediction'
+  rescale_betas_zero_snr: true
+  timestep_spacing: 'trailing'
+  clip_sample: false

ckpts/controlnet/energy_l.yml ADDED Viewed

	@@ -0,0 +1,79 @@

+# EzAudio-L-Energy configuration (energy-conditioned ControlNet variant).
+# Top-level sections: model, controlnet, conditioner, autoencoder,
+# text_encoder, diff.
+model_name: EzAudio-L-Energy
+model:
+  mae: true
+  mae_prob: 0.25
+  mask_ratio: [0.25, 1.0]
+  mask_span: 10
+  img_size: 500
+  patch_size: 1
+  in_chans: 257
+  out_chans: 128  # same value as autoencoder.dim below
+  input_type: '1d'
+  embed_dim: 1024
+  depth: 24
+  num_heads: 16
+  mlp_ratio: 4.0
+  qkv_bias: false
+  qk_scale: null
+  qk_norm: layernorm
+  norm_layer: layernorm
+  act_layer: geglu
+  context_norm: true
+  use_checkpoint: true
+  time_fusion: 'ada_sola_bias'
+  # NOTE(review): the EzAudio-XL config spells the next two keys
+  # ada_sola_rank / ada_sola_alpha (matching time_fusion above); confirm
+  # which spelling the model code actually reads.
+  ada_lora_rank: 32
+  ada_lora_alpha: 32
+  cls_dim: null
+  context_dim: 1024  # presumably the text encoder hidden size; verify
+  context_fusion: 'cross'
+  context_max_length: null
+  context_pe_method: 'none'
+  pe_method: 'none'
+  rope_mode: 'shared'
+  use_conv: true
+  skip: true
+  skip_norm: true
+controlnet:
+  cond_in: 1
+  cond_blocks: [64, 128]
+  cond_mask: true
+  cond_mask_prob: 0.25
+  cond_mask_ratio: [0.25, 0.50]
+  cond_mask_span: 10
+conditioner:
+  condition_type: energy
+  hop_size: 240
+  window_size: 1920
+  padding: 'reflect'
+  min_db: -60
+  norm: true
+# Other studies usually set q_first to false; this config sets it to true.
+autoencoder:
+  name: stable_vae
+  dim: 128
+  sr: 24000
+  latent_sr: 50  # sr / latent_sr = 480 audio samples per latent frame
+  q_first: true
+  scale: 1.0
+  shift: 0.0
+# A fixed length (presumably max_length below) should be set when using
+# concat mode or distributed training.
+text_encoder:
+  model: google/flan-t5-large
+  max_length: 100
+  cfg: 0.1
+diff:
+  num_train_timesteps: 1000
+  beta_schedule: 'scaled_linear'
+  beta_start: 0.00085
+  beta_end: 0.012
+  prediction_type: 'v_prediction'
+  rescale_betas_zero_snr: true
+  timestep_spacing: 'trailing'
+  clip_sample: false

ckpts/controlnet/s3_l_energy.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:63fc1b6497570be35bed1833b1402f7a4487cd34acffca527ac8eb44b75427c1
+size 1165074763

ckpts/ezaudio-l.yml ADDED Viewed

	@@ -0,0 +1,60 @@

+# EzAudio-L configuration.
+# Top-level sections: model, autoencoder, text_encoder, diff.
+model_name: EzAudio-L
+model:
+  mae: true
+  mae_prob: 0.25
+  mask_ratio: [0.25, 1.0]
+  mask_span: 10
+  img_size: 500
+  patch_size: 1
+  in_chans: 257
+  out_chans: 128  # same value as autoencoder.dim below
+  input_type: '1d'
+  embed_dim: 1024
+  depth: 24
+  num_heads: 16
+  mlp_ratio: 4.0
+  qkv_bias: false
+  qk_scale: null
+  qk_norm: layernorm
+  norm_layer: layernorm
+  act_layer: geglu
+  context_norm: true
+  use_checkpoint: true
+  time_fusion: 'ada_sola_bias'
+  # NOTE(review): the EzAudio-XL config spells the next two keys
+  # ada_sola_rank / ada_sola_alpha (matching time_fusion above); confirm
+  # which spelling the model code actually reads.
+  ada_lora_rank: 32
+  ada_lora_alpha: 32
+  cls_dim: null
+  context_dim: 1024  # presumably the text encoder hidden size; verify
+  context_fusion: 'cross'
+  context_max_length: null
+  context_pe_method: 'none'
+  pe_method: 'none'
+  rope_mode: 'shared'
+  use_conv: true
+  skip: true
+  skip_norm: true
+autoencoder:
+  name: stable_vae
+  dim: 128
+  sr: 24000
+  latent_sr: 50  # sr / latent_sr = 480 audio samples per latent frame
+  q_first: true
+  scale: 1.0
+  shift: 0.0
+text_encoder:
+  model: google/flan-t5-large
+  max_length: 100
+  cfg: 0.1
+diff:
+  num_train_timesteps: 1000
+  beta_schedule: 'scaled_linear'
+  beta_start: 0.00085
+  beta_end: 0.012
+  prediction_type: 'v_prediction'
+  rescale_betas_zero_snr: true
+  timestep_spacing: 'trailing'
+  clip_sample: false

ckpts/ezaudio-xl.yml ADDED Viewed

	@@ -0,0 +1,60 @@

+# EzAudio-XL configuration.
+# Top-level sections: model, autoencoder, text_encoder, diff.
+model_name: EzAudio-XL
+model:
+  mae: true
+  mae_prob: 0.25
+  mask_ratio: [0.25, 1.0]
+  mask_span: 10
+  img_size: 500
+  patch_size: 1
+  in_chans: 257
+  out_chans: 128  # same value as autoencoder.dim below
+  input_type: '1d'
+  embed_dim: 1152
+  depth: 28
+  num_heads: 16
+  mlp_ratio: 4.0
+  qkv_bias: false
+  qk_scale: null
+  qk_norm: layernorm
+  norm_layer: layernorm
+  act_layer: geglu
+  context_norm: true
+  use_checkpoint: true
+  time_fusion: 'ada_sola_bias'
+  # NOTE(review): the EzAudio-L configs spell the next two keys
+  # ada_lora_rank / ada_lora_alpha; confirm which spelling the model
+  # code actually reads.
+  ada_sola_rank: 36
+  ada_sola_alpha: 36
+  cls_dim: null
+  context_dim: 2048  # presumably the text encoder hidden size; verify
+  context_fusion: 'cross'
+  context_max_length: null
+  context_pe_method: 'none'
+  pe_method: 'none'
+  rope_mode: 'shared'
+  use_conv: true
+  skip: true
+  skip_norm: true
+autoencoder:
+  name: stable_vae
+  dim: 128
+  sr: 24000
+  latent_sr: 50  # sr / latent_sr = 480 audio samples per latent frame
+  q_first: true
+  scale: 1.0
+  shift: 0.0
+text_encoder:
+  model: google/flan-t5-xl
+  max_length: 100
+  cfg: 0.1
+diff:
+  num_train_timesteps: 1000
+  beta_schedule: 'scaled_linear'
+  beta_start: 0.00085
+  beta_end: 0.012
+  prediction_type: 'v_prediction'
+  rescale_betas_zero_snr: true
+  timestep_spacing: 'trailing'
+  clip_sample: false

ckpts/s3/ezaudio_s3_l.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eef9eb97ac411574c44ce3f34ce6ca034f65111923f3fdb4f75dc835db43d563
+size 2387669060

ckpts/s3/ezaudio_s3_xl.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:97f0e5a2a000166dd4696d005b899321ab80ce9cda8bd912708cbc6761ea95a7
+size 3499437603

ckpts/vae/1m.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3cb13e2699fa922ce6a2b3b4f53c270ec64156e0cc3f3e3645e10cdf98b740dc
+size 183037614

ckpts/vae/config.json ADDED Viewed

	@@ -0,0 +1,122 @@

+{
+    "model_type": "autoencoder",
+    "sample_size": 12000,
+    "sample_rate": 24000,
+    "audio_channels": 1,
+    "model": {
+        "encoder": {
+            "type": "oobleck",
+            "config": {
+                "in_channels": 1,
+                "channels": 128,
+                "c_mults": [1, 2, 4, 8],
+                "strides": [2, 4, 6, 10],
+                "latent_dim": 256,
+                "use_snake": true
+            }
+        },
+        "decoder": {
+            "type": "oobleck",
+            "config": {
+                "out_channels": 1,
+                "channels": 128,
+                "c_mults": [1, 2, 4, 8],
+                "strides": [2, 4, 6, 10],
+                "latent_dim": 128,
+                "use_snake": true,
+                "final_tanh": false
+            }
+        },
+        "bottleneck": {
+            "type": "vae"
+        },
+        "latent_dim": 128,
+        "downsampling_ratio": 480,
+        "io_channels": 1
+    },
+    "training": {
+        "learning_rate": 1.5e-4,
+        "warmup_steps": 0,
+        "use_ema": false,
+        "optimizer_configs": {
+            "autoencoder": {
+                "optimizer": {
+                    "type": "AdamW",
+                    "config": {
+                        "betas": [0.8, 0.99],
+                        "lr": 1.5e-4,
+                        "weight_decay": 1e-3
+                    }
+                },
+                "scheduler": {
+                    "type": "InverseLR",
+                    "config": {
+                        "inv_gamma": 200000,
+                        "power": 0.5,
+                        "warmup": 0.999
+                    }
+                }
+            },
+            "discriminator": {
+                "optimizer": {
+                    "type": "AdamW",
+                    "config": {
+                        "betas": [0.8, 0.99],
+                        "lr": 3e-4,
+                        "weight_decay": 1e-3
+                    }
+                },
+                "scheduler": {
+                    "type": "InverseLR",
+                    "config": {
+                        "inv_gamma": 200000,
+                        "power": 0.5,
+                        "warmup": 0.999
+                    }
+                }
+            }
+        },
+        "loss_configs": {
+            "discriminator": {
+                "type": "encodec",
+                "config": {
+                    "filters": 64,
+                    "n_ffts": [1280, 640, 320, 160, 80],
+                    "hop_lengths": [320, 160, 80, 40, 20],
+                    "win_lengths": [1280, 640, 320, 160, 80]
+                },
+                "weights": {
+                    "adversarial": 0.1,
+                    "feature_matching": 5.0
+                }
+            },
+            "spectral": {
+                "type": "mrstft",
+                "config": {
+                    "fft_sizes": [1280, 640, 320, 160, 80, 40, 20],
+                    "hop_sizes": [320, 160, 80, 40, 20, 10, 5],
+                    "win_lengths": [1280, 640, 320, 160, 80, 40, 20],
+                    "perceptual_weighting": true
+                },
+                "weights": {
+                    "mrstft": 1.0
+                }
+            },
+            "time": {
+                "type": "l1",
+                "weights": {
+                    "l1": 0.0
+                }
+            },
+            "bottleneck": {
+                "type": "kl",
+                "weights": {
+                    "kl": 1e-4
+                }
+            }
+        },
+        "demo": {
+            "demo_every": 10000
+        }
+    }
+}