hilamanor's picture
initial commit
e73da9c
import contextlib
import importlib
from inspect import isfunction
import os
import soundfile as sf
import time
import wave
import urllib.request
import progressbar
# Directory holding downloaded checkpoints. The AUDIOLDM_CACHE_DIR environment
# variable takes precedence; otherwise a folder under the user's home is used.
CACHE_DIR = os.environ.get(
    "AUDIOLDM_CACHE_DIR",
    os.path.join(os.path.expanduser("~"), ".cache/audioldm"),
)
def get_duration(fname):
    """Return the playing time of the WAV file ``fname`` in seconds.

    The value is the frame count divided by the frame rate, both read from
    the file header through the standard ``wave`` module.
    """
    wav_file = wave.open(fname, 'r')
    try:
        n_frames = wav_file.getnframes()
        frame_rate = wav_file.getframerate()
    finally:
        # Mirror contextlib.closing: always release the file handle.
        wav_file.close()
    return n_frames / float(frame_rate)
def get_bit_depth(fname):
    """Return the number of bits per sample of the WAV file ``fname``.

    The sample width reported by the ``wave`` module is in bytes, so it is
    multiplied by eight.
    """
    with contextlib.closing(wave.open(fname, 'r')) as wav_file:
        bytes_per_sample = wav_file.getsampwidth()
    return bytes_per_sample * 8
def get_time():
    """Return the current local time formatted as ``DD_MM_YYYY_HH_MM_SS``."""
    return time.strftime("%d_%m_%Y_%H_%M_%S", time.localtime())
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` into the environment and configures cuDNN
    for reproducible behaviour.

    Args:
        seed: Value handed to every generator; it is stringified for the
            ``PYTHONHASHSEED`` environment variable.
    """
    # Imported lazily so that importing this module does not pull in
    # numpy/torch until seeding is actually requested.
    import os
    import random

    import numpy as np
    import torch

    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible CUDA device, not only the current one; this call is
    # silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different kernels from run
    # to run, which contradicts the deterministic flag above.
    torch.backends.cudnn.benchmark = False
def save_wave(waveform, savepath, name="outwav", samplerate=16000):
    """Write each item of a batch of waveforms to ``savepath`` as a WAV file.

    Files are named ``<stem>_<index>.wav`` where ``<stem>`` is the basename of
    the corresponding entry of ``name`` with a trailing ``.wav`` extension
    removed, and ``<index>`` is the item's position in the batch.

    Args:
        waveform: Array-like batch; ``waveform.shape[0]`` items are written and
            item ``i`` is read as ``waveform[i, 0]``.
            NOTE(review): presumably (batch, channel, samples) - confirm
            against callers.
        savepath: Output directory. It is created when it does not exist.
        name: One base name shared by all items, or a list/tuple holding one
            name per item.
        samplerate: Sample rate passed to ``soundfile.write``; defaults to the
            previously hard-coded 16000.
    """
    batch_size = waveform.shape[0]
    if not isinstance(name, (list, tuple)):
        name = [name] * batch_size

    # An empty savepath means "current directory"; os.makedirs("") would fail.
    if savepath:
        os.makedirs(savepath, exist_ok=True)

    for i in range(batch_size):
        stem = os.path.basename(name[i])
        # Drop only a trailing ".wav" so that stems containing other dots
        # (e.g. "take.2.wav") are not truncated at the first dot.
        root, ext = os.path.splitext(stem)
        if ext.lower() == ".wav":
            stem = root
        path = os.path.join(savepath, "%s_%s.wav" % (stem, i))
        print("Save audio to %s" % path)
        sf.write(path, waveform[i, 0], samplerate=samplerate)
def exists(x):
    """Tell whether ``x`` is anything other than ``None``."""
    return not (x is None)
def default(val, d):
    """Return ``val`` unless it is ``None``, in which case fall back to ``d``.

    When the fallback ``d`` is a plain Python function it is called without
    arguments and its result is returned; any other object is returned as is.
    """
    if not exists(val):
        fallback = d
        if isfunction(fallback):
            return fallback()
        return fallback
    return val
def count_params(model, verbose=False):
    """Return the total number of elements over ``model.parameters()``.

    Args:
        model: Object exposing ``parameters()``, whose items expose ``numel()``.
        verbose: When true, also print the count in millions together with
            the model's class name.
    """
    total_params = 0
    for param in model.parameters():
        total_params += param.numel()

    if verbose:
        print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
    return total_params
def get_obj_from_str(string, reload=False):
    """Resolve a dotted path such as ``"package.module.Name"`` to the object.

    The text after the last dot is the attribute name; everything before it
    is imported as a module.

    Args:
        string: Dotted ``module.attribute`` path.
        reload: When true, the module is imported and reloaded before the
            attribute is looked up.
    """
    module_path, attr_name = string.rsplit(".", 1)
    if reload:
        importlib.reload(importlib.import_module(module_path))
    resolved_module = importlib.import_module(module_path, package=None)
    return getattr(resolved_module, attr_name)
def instantiate_from_config(config):
    """Build the object described by ``config``.

    ``config["target"]`` is a dotted path resolved with ``get_obj_from_str``;
    the resolved callable is invoked with ``config["params"]`` as keyword
    arguments (no arguments when ``"params"`` is absent).

    Returns:
        The constructed object, or ``None`` when ``config`` has no ``target``
        and equals one of the marker strings ``"__is_first_stage__"`` or
        ``"__is_unconditional__"``.

    Raises:
        KeyError: ``config`` has no ``target`` and is not a marker string.
    """
    if "target" in config:
        factory = get_obj_from_str(config["target"])
        kwargs = config.get("params", dict())
        return factory(**kwargs)

    if config == "__is_first_stage__" or config == "__is_unconditional__":
        return None
    raise KeyError("Expected key `target` to instantiate.")
def default_audioldm_config(model_name="audioldm-s-full"):
    """Return a freshly built default AudioLDM configuration dictionary.

    The configuration has four top-level entries: ``wave_file_save_path``,
    ``id``, ``preprocessing`` and ``model``. Two substrings of ``model_name``
    adjust the defaults:

    * ``"-l-"``: UNet ``model_channels`` becomes 256 and ``num_head_channels``
      becomes 64.
    * ``"-m-"``: UNet ``model_channels`` becomes 192 and the conditioning
      stage gains ``amodel = "HTSAT-base"``.

    Any other name yields the unmodified defaults.
    """
    # --- preprocessing ---------------------------------------------------
    mel_settings = {
        "n_mel_channels": 64,
        "mel_fmin": 0,
        "mel_fmax": 8000,
        "freqm": 0,
        "timem": 0,
        "blur": False,
        "mean": -4.63,
        "std": 2.74,
        "target_length": 1024,
    }
    preprocessing = {
        "audio": {"sampling_rate": 16000, "max_wav_value": 32768},
        "stft": {"filter_length": 1024, "hop_length": 160, "win_length": 1024},
        "mel": mel_settings,
    }

    # --- denoising UNet --------------------------------------------------
    unet_params = {
        "image_size": 64,
        "extra_film_condition_dim": 512,
        "extra_film_use_concat": True,
        "in_channels": 8,
        "out_channels": 8,
        "model_channels": 128,
        "attention_resolutions": [8, 4, 2],
        "num_res_blocks": 2,
        "channel_mult": [1, 2, 3, 5],
        "num_head_channels": 32,
        "use_spatial_transformer": True,
    }
    unet_config = {
        "target": "audioldm.latent_diffusion.openaimodel.UNetModel",
        "params": unet_params,
    }

    # --- first stage (autoencoder) ---------------------------------------
    ddconfig = {
        "double_z": True,
        "z_channels": 8,
        "resolution": 256,
        "downsample_time": False,
        "in_channels": 1,
        "out_ch": 1,
        "ch": 128,
        "ch_mult": [1, 2, 4],
        "num_res_blocks": 2,
        "attn_resolutions": [],
        "dropout": 0.0,
    }
    first_stage_config = {
        "base_learning_rate": 4.5e-05,
        "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL",
        "params": {
            "monitor": "val/rec_loss",
            "image_key": "fbank",
            "subband": 1,
            "embed_dim": 8,
            "time_shuffle": 1,
            "ddconfig": ddconfig,
        },
    }

    # --- conditioning stage ----------------------------------------------
    cond_stage_params = {
        "key": "waveform",
        "sampling_rate": 16000,
        "embed_mode": "audio",
        "unconditional_prob": 0.1,
    }
    cond_stage_config = {
        "target": "audioldm.clap.encoders.CLAPAudioEmbeddingClassifierFreev2",
        "params": cond_stage_params,
    }

    # --- model-size specific overrides -----------------------------------
    if "-l-" in model_name:
        unet_params["model_channels"] = 256
        unet_params["num_head_channels"] = 64
    elif "-m-" in model_name:
        unet_params["model_channels"] = 192
        cond_stage_params["amodel"] = "HTSAT-base"  # This model use a larger HTAST

    # --- assemble ---------------------------------------------------------
    model = {
        "device": "cuda",
        "target": "audioldm.pipline.LatentDiffusion",
        "params": {
            "base_learning_rate": 5e-06,
            "linear_start": 0.0015,
            "linear_end": 0.0195,
            "num_timesteps_cond": 1,
            "log_every_t": 200,
            "timesteps": 1000,
            "first_stage_key": "fbank",
            "cond_stage_key": "waveform",
            "latent_t_size": 256,
            "latent_f_size": 16,
            "channels": 8,
            "cond_stage_trainable": True,
            "conditioning_key": "film",
            "monitor": "val/loss_simple_ema",
            "scale_by_std": True,
            "unet_config": unet_config,
            "first_stage_config": first_stage_config,
            "cond_stage_config": cond_stage_config,
        },
    }
    return {
        "wave_file_save_path": "./output",
        "id": {
            "version": "v1",
            "name": "default",
            "root": "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml",
        },
        "preprocessing": preprocessing,
        "model": model,
    }
def get_metadata():
    """Return the checkpoint registry keyed by model name.

    Every entry maps to a dict with two keys: ``"path"``, the local file
    ``<CACHE_DIR>/<model name>.ckpt``, and ``"url"``, the address the
    checkpoint is downloaded from.
    """
    # (model name, download URL) pairs; the order defines the registry order.
    checkpoint_urls = (
        ("audioldm-s-full", "https://zenodo.org/record/7600541/files/audioldm-s-full?download=1"),
        ("audioldm-l-full", "https://zenodo.org/record/7698295/files/audioldm-full-l.ckpt?download=1"),
        ("audioldm-s-full-v2", "https://zenodo.org/record/7698295/files/audioldm-full-s-v2.ckpt?download=1"),
        ("audioldm-m-text-ft", "https://zenodo.org/record/7813012/files/audioldm-m-text-ft.ckpt?download=1"),
        ("audioldm-s-text-ft", "https://zenodo.org/record/7813012/files/audioldm-s-text-ft.ckpt?download=1"),
        ("audioldm-m-full", "https://zenodo.org/record/7813012/files/audioldm-m-full.ckpt?download=1"),
    )

    registry = {}
    for model_name, url in checkpoint_urls:
        registry[model_name] = {
            "path": os.path.join(CACHE_DIR, model_name + ".ckpt"),
            "url": url,
        }
    return registry
class MyProgressBar():
    """Report hook for ``urllib.request.urlretrieve`` that draws a progress bar.

    An instance is called as ``hook(block_num, block_size, total_size)``.
    The bar is created on the first call that reports a usable total size.
    """

    def __init__(self):
        # Created lazily because the total size is only known once the first
        # callback arrives.
        self.pbar = None

    def __call__(self, block_num, block_size, total_size):
        """Advance the bar to ``block_num * block_size`` bytes.

        Args:
            block_num: Number of blocks transferred so far.
            block_size: Size of one block in bytes.
            total_size: Total size in bytes; ``urlretrieve`` may pass ``-1``
                when the server does not report a size.
        """
        # Without a positive total there is nothing to scale a bar against;
        # skip drawing instead of building a bar with an invalid maximum.
        if total_size <= 0:
            return

        if self.pbar is None:
            self.pbar = progressbar.ProgressBar(maxval=total_size)
            self.pbar.start()

        downloaded = block_num * block_size
        if downloaded < total_size:
            self.pbar.update(downloaded)
        else:
            # The last block usually overshoots the total; finish rather than
            # update past the maximum.
            self.pbar.finish()
def download_checkpoint(checkpoint_name="audioldm-s-full"):
    """Download the checkpoint ``checkpoint_name`` unless it is already cached.

    The registry from ``get_metadata()`` supplies the download URL and the
    local path. A download is triggered when the local file is missing or is
    smaller than 2 * 10**9 bytes.

    Args:
        checkpoint_name: Key into the registry returned by ``get_metadata()``.

    Raises:
        KeyError: ``checkpoint_name`` is not a registered model name.
    """
    meta = get_metadata()
    if checkpoint_name not in meta:
        # Fail right away with the list of valid names instead of printing a
        # hint and then hitting a bare KeyError on the lookup below.
        raise KeyError(
            "The model name you provided is not supported. "
            "Please use one of the following: %s" % list(meta)
        )

    ckpt_path = meta[checkpoint_name]["path"]
    ckpt_url = meta[checkpoint_name]["url"]
    ckpt_dir = os.path.dirname(ckpt_path)

    if not os.path.exists(ckpt_path) or os.path.getsize(ckpt_path) < 2*10**9:
        os.makedirs(ckpt_dir, exist_ok=True)
        print(f"Downloading the main structure of {checkpoint_name} into {ckpt_dir}")

        # Download next to the final location and rename afterwards, so an
        # interrupted transfer never leaves a truncated file at ckpt_path that
        # a later run could mistake for a complete checkpoint.
        partial_path = ckpt_path + ".part"
        urllib.request.urlretrieve(ckpt_url, partial_path, MyProgressBar())
        os.replace(partial_path, ckpt_path)

        print(
            "Weights downloaded in: {} Size: {}".format(
                ckpt_path,
                os.path.getsize(ckpt_path),
            )
        )