# Hugging Face Space (running on A10G) - file size: 4,514 bytes - revision c673f60
import os
def default_vae_config():
    """Return the default first-stage autoencoder configuration.

    The result is a nested dictionary of the form
    ``{"model": {"params": {"first_stage_config": {...}}}}``. A fresh
    structure is built on every call, so callers may mutate the returned
    value without affecting later calls.
    """
    # Innermost level first: the "ddconfig" section.
    dd_section = {
        "double_z": True,
        "z_channels": 8,
        "resolution": 256,
        "downsample_time": False,
        "in_channels": 1,
        "out_ch": 1,
        "ch": 128,
        "ch_mult": [1, 2, 4],
        "num_res_blocks": 2,
        "attn_resolutions": [],
        "dropout": 0.0,
    }

    # Parameters handed to the class named by "target" below.
    autoencoder_params = {
        "monitor": "val/rec_loss",
        "image_key": "fbank",
        "subband": 1,
        "embed_dim": 8,
        "time_shuffle": 1,
        "ddconfig": dd_section,
    }

    first_stage = {
        "base_learning_rate": 4.5e-05,
        "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL",
        "params": autoencoder_params,
    }

    return {"model": {"params": {"first_stage_config": first_stage}}}
def default_stft_config():
    """Return the default audio preprocessing settings, one entry per sample rate.

    The returned dictionary has the keys ``preprocessing_16k``,
    ``preprocessing_24k``, ``preprocessing_32k`` and ``preprocessing_48k``.
    Each entry holds ``"audio"``, ``"stft"`` and ``"mel"`` sub-dictionaries.
    The entries do not all carry the same set of keys; they are reproduced
    here exactly as configured. A new structure is created on every call.
    """
    presets = {}

    presets["preprocessing_16k"] = {
        "audio": {"sampling_rate": 16000, "max_wav_value": 32768},
        "stft": {"filter_length": 1024, "hop_length": 160, "win_length": 1024},
        "mel": {
            "n_mel_channels": 64,
            "mel_fmin": 0,
            "mel_fmax": 8000,
            "freqm": 0,
            "timem": 0,
            "blur": False,
            "mean": -4.63,
            "std": 2.74,
            "target_length": 1024,
        },
    }

    presets["preprocessing_24k"] = {
        "audio": {"sampling_rate": 24000, "max_wav_value": 32768},
        "stft": {"filter_length": 2048, "hop_length": 240, "win_length": 2048},
        "mel": {
            "n_mel_channels": 64,
            "mel_fmin": 0,
            "mel_fmax": 12000,
            "target_length": 1024,
        },
    }

    presets["preprocessing_32k"] = {
        "audio": {"sampling_rate": 32000, "max_wav_value": 32768},
        "stft": {"filter_length": 2048, "hop_length": 320, "win_length": 2048},
        "mel": {
            "n_mel_channels": 64,
            "mel_fmin": 0,
            "mel_fmax": 16000,
            "target_length": 1024,
        },
    }

    # The 48k entry additionally sets "duration" under "audio" and has no
    # "target_length" under "mel".
    presets["preprocessing_48k"] = {
        "audio": {"sampling_rate": 48000, "max_wav_value": 32768, "duration": 10.00},
        "stft": {"filter_length": 2048, "hop_length": 480, "win_length": 2048},
        "mel": {
            "n_mel_channels": 64,
            "mel_fmin": 20,
            "mel_fmax": 24000,
        },
    }

    return presets
def get_metadata():
    """Return the checkpoint registry keyed by model name.

    Every value is a dictionary with two entries: ``"path"``, the checkpoint
    file name joined onto ``CACHE_DIR`` (a name defined outside this
    function), and ``"url"``, the address associated with that checkpoint.
    ``CACHE_DIR`` is looked up each time the function is called.
    """
    # (model name, checkpoint file name, url) - kept in registry order.
    checkpoints = (
        (
            "audioldm-s-full",
            "audioldm-s-full.ckpt",
            "https://zenodo.org/record/7600541/files/audioldm-s-full?download=1",
        ),
        (
            "audioldm-l-full",
            "audioldm-l-full.ckpt",
            "https://zenodo.org/record/7698295/files/audioldm-full-l.ckpt?download=1",
        ),
        (
            "audioldm-s-full-v2",
            "audioldm-s-full-v2.ckpt",
            "https://zenodo.org/record/7698295/files/audioldm-full-s-v2.ckpt?download=1",
        ),
        (
            "audioldm-m-text-ft",
            "audioldm-m-text-ft.ckpt",
            "https://zenodo.org/record/7813012/files/audioldm-m-text-ft.ckpt?download=1",
        ),
        (
            "audioldm-s-text-ft",
            "audioldm-s-text-ft.ckpt",
            "https://zenodo.org/record/7813012/files/audioldm-s-text-ft.ckpt?download=1",
        ),
        (
            "audioldm-m-full",
            "audioldm-m-full.ckpt",
            "https://zenodo.org/record/7813012/files/audioldm-m-full.ckpt?download=1",
        ),
    )

    return {
        model_name: {
            "path": os.path.join(CACHE_DIR, ckpt_file),
            "url": ckpt_url,
        }
        for model_name, ckpt_file, ckpt_url in checkpoints
    }
|