Spaces:

declare-lab
/

mustango

Sleeping

App Files Files Community

mustango / audioldm /utils.py

deepanway

Upload files

f1069cc 12 months ago

raw

history blame

9.8 kB

	import contextlib
	import importlib

	from inspect import isfunction
	import os
	import soundfile as sf
	import time
	import wave

	import urllib.request
	import progressbar

	# Directory used for cached checkpoint files. The AUDIOLDM_CACHE_DIR
	# environment variable, when set, overrides the default location under the
	# current user's home directory.
	CACHE_DIR = os.environ.get(
	    "AUDIOLDM_CACHE_DIR",
	    os.path.join(os.path.expanduser("~"), ".cache/audioldm"),
	)

	def get_duration(fname):
	    """Return the duration, in seconds, of the WAV file at ``fname``.

	    The value is the frame count divided by the frame rate, both read from
	    the file header via the standard-library ``wave`` module.
	    """
	    with wave.open(fname, "r") as wav_file:
	        n_frames = wav_file.getnframes()
	        frame_rate = wav_file.getframerate()
	    return n_frames / float(frame_rate)

	def get_bit_depth(fname):
	    """Return the number of bits per sample of the WAV file at ``fname``.

	    The sample width reported by the ``wave`` module is in bytes, so it is
	    multiplied by 8.
	    """
	    with wave.open(fname, "r") as wav_file:
	        bytes_per_sample = wav_file.getsampwidth()
	    return bytes_per_sample * 8

	def get_time():
	    """Return the current local time as a ``DD_MM_YYYY_HH_MM_SS`` string."""
	    return time.strftime("%d_%m_%Y_%H_%M_%S", time.localtime())

	def seed_everything(seed):
	    """Seed the Python, NumPy and PyTorch random number generators.

	    Also configures cuDNN for reproducible behaviour.

	    Args:
	        seed: Integer seed applied to every generator.
	    """
	    # Imported lazily so that importing this module does not require
	    # numpy/torch to be loaded.
	    import os
	    import random

	    import numpy as np
	    import torch

	    random.seed(seed)
	    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this setting
	    # influences child processes rather than the already-running interpreter.
	    os.environ["PYTHONHASHSEED"] = str(seed)
	    np.random.seed(seed)
	    torch.manual_seed(seed)
	    # Seed every CUDA device, not only the current one; this is safe to call
	    # when CUDA is unavailable.
	    torch.cuda.manual_seed(seed)
	    torch.cuda.manual_seed_all(seed)
	    torch.backends.cudnn.deterministic = True
	    # The cuDNN autotuner may select different algorithms from run to run,
	    # which defeats the determinism requested above, so it is disabled.
	    torch.backends.cudnn.benchmark = False


	def save_wave(waveform, savepath, name="outwav", samplerate=16000):
	    """Write every item of a batch of waveforms to ``savepath`` as WAV files.

	    Item ``i`` is written to ``<savepath>/<stem>_<i>.wav`` where ``stem`` is
	    the base name of the corresponding entry of ``name`` with a trailing
	    ``.wav`` extension removed.

	    Args:
	        waveform: Batch of audio; ``waveform.shape[0]`` is the number of
	            items and ``waveform[i, 0]`` is the data handed to
	            ``soundfile.write`` for item ``i``.
	        savepath: Output directory. It is created when it does not exist.
	        name: Either a single name shared by all items or a list with one
	            name per item.
	        samplerate: Sample rate written to the files. Defaults to 16000,
	            the value that was previously hard-coded.
	    """
	    batch_size = waveform.shape[0]
	    if not isinstance(name, list):
	        name = [name] * batch_size

	    # An empty savepath means "current directory"; makedirs("") would raise.
	    if savepath:
	        os.makedirs(savepath, exist_ok=True)

	    for i in range(batch_size):
	        stem = os.path.basename(name[i])
	        # Strip only a trailing ".wav" so that names containing other dots
	        # (e.g. "my.song.wav") are not truncated at the first dot.
	        if stem.endswith(".wav"):
	            stem = os.path.splitext(stem)[0]
	        path = os.path.join(savepath, "%s_%s.wav" % (stem, i))
	        print("Save audio to %s" % path)
	        sf.write(path, waveform[i, 0], samplerate=samplerate)


	def exists(x):
	    """Return ``True`` for any value other than ``None``."""
	    if x is None:
	        return False
	    return True


	def default(val, d):
	    """Return ``val`` unless it is ``None``, otherwise fall back to ``d``.

	    When ``d`` is a plain Python function (as reported by
	    ``inspect.isfunction``) it is called with no arguments and its result
	    is returned; any other ``d`` is returned as-is.
	    """
	    if val is not None:
	        return val
	    if isfunction(d):
	        return d()
	    return d


	def count_params(model, verbose=False):
	    """Return the total number of elements over ``model.parameters()``.

	    Args:
	        model: Object exposing ``parameters()`` whose items provide
	            ``numel()``.
	        verbose: When true, also print the count in millions.
	    """
	    total_params = 0
	    for param in model.parameters():
	        total_params += param.numel()
	    if verbose:
	        print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
	    return total_params


	def get_obj_from_str(string, reload=False):
	    """Resolve a dotted path such as ``"pkg.module.Name"`` to the object.

	    Args:
	        string: Dotted path; everything before the last dot is imported as
	            a module and the final component is looked up on it.
	        reload: When true, the module is reloaded before the lookup.

	    Returns:
	        The attribute named by the last path component.
	    """
	    module_path, attr_name = string.rsplit(".", 1)
	    module_obj = importlib.import_module(module_path)
	    if reload:
	        # reload() returns the module object now registered in sys.modules.
	        module_obj = importlib.reload(module_obj)
	    return getattr(module_obj, attr_name)


	def instantiate_from_config(config):
	    """Build the object described by ``config``.

	    ``config["target"]`` is a dotted path resolved with
	    ``get_obj_from_str``; the resulting callable is invoked with
	    ``config["params"]`` (or no arguments when that key is absent).

	    The sentinel values ``"__is_first_stage__"`` and
	    ``"__is_unconditional__"`` yield ``None``.

	    Raises:
	        KeyError: If ``config`` has no ``target`` entry and is not one of
	            the sentinel values.
	    """
	    if "target" in config:
	        factory = get_obj_from_str(config["target"])
	        return factory(**config.get("params", dict()))

	    if config == "__is_first_stage__" or config == "__is_unconditional__":
	        return None
	    raise KeyError("Expected key `target` to instantiate.")


	def default_audioldm_config(model_name="audioldm-s-full"):
	    """Return the default AudioLDM configuration as a nested dict.

	    The base values are adjusted according to markers found in
	    ``model_name``:

	    * ``"-l-"``: UNet ``model_channels`` becomes 256 and
	      ``num_head_channels`` becomes 64.
	    * ``"-m-"``: UNet ``model_channels`` becomes 192 and the conditioning
	      stage gets ``amodel = "HTSAT-base"``.

	    A fresh dict is built on every call.
	    """
	    preprocessing = {
	        "audio": {"sampling_rate": 16000, "max_wav_value": 32768},
	        "stft": {"filter_length": 1024, "hop_length": 160, "win_length": 1024},
	        "mel": {
	            "n_mel_channels": 64,
	            "mel_fmin": 0,
	            "mel_fmax": 8000,
	            "freqm": 0,
	            "timem": 0,
	            "blur": False,
	            "mean": -4.63,
	            "std": 2.74,
	            "target_length": 1024,
	        },
	    }

	    unet_params = {
	        "image_size": 64,
	        "extra_film_condition_dim": 512,
	        "extra_film_use_concat": True,
	        "in_channels": 8,
	        "out_channels": 8,
	        "model_channels": 128,
	        "attention_resolutions": [8, 4, 2],
	        "num_res_blocks": 2,
	        "channel_mult": [1, 2, 3, 5],
	        "num_head_channels": 32,
	        "use_spatial_transformer": True,
	    }

	    first_stage_config = {
	        "base_learning_rate": 4.5e-05,
	        "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL",
	        "params": {
	            "monitor": "val/rec_loss",
	            "image_key": "fbank",
	            "subband": 1,
	            "embed_dim": 8,
	            "time_shuffle": 1,
	            "ddconfig": {
	                "double_z": True,
	                "z_channels": 8,
	                "resolution": 256,
	                "downsample_time": False,
	                "in_channels": 1,
	                "out_ch": 1,
	                "ch": 128,
	                "ch_mult": [1, 2, 4],
	                "num_res_blocks": 2,
	                "attn_resolutions": [],
	                "dropout": 0.0,
	            },
	        },
	    }

	    cond_stage_params = {
	        "key": "waveform",
	        "sampling_rate": 16000,
	        "embed_mode": "audio",
	        "unconditional_prob": 0.1,
	    }

	    # Size-specific overrides, selected by a marker in the model name.
	    if "-l-" in model_name:
	        unet_params["model_channels"] = 256
	        unet_params["num_head_channels"] = 64
	    elif "-m-" in model_name:
	        unet_params["model_channels"] = 192
	        # Per the original note, this variant uses a larger HTSAT.
	        cond_stage_params["amodel"] = "HTSAT-base"

	    return {
	        "wave_file_save_path": "./output",
	        "id": {
	            "version": "v1",
	            "name": "default",
	            "root": "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml",
	        },
	        "preprocessing": preprocessing,
	        "model": {
	            "device": "cuda",
	            "target": "audioldm.pipline.LatentDiffusion",
	            "params": {
	                "base_learning_rate": 5e-06,
	                "linear_start": 0.0015,
	                "linear_end": 0.0195,
	                "num_timesteps_cond": 1,
	                "log_every_t": 200,
	                "timesteps": 1000,
	                "first_stage_key": "fbank",
	                "cond_stage_key": "waveform",
	                "latent_t_size": 256,
	                "latent_f_size": 16,
	                "channels": 8,
	                "cond_stage_trainable": True,
	                "conditioning_key": "film",
	                "monitor": "val/loss_simple_ema",
	                "scale_by_std": True,
	                "unet_config": {
	                    "target": "audioldm.latent_diffusion.openaimodel.UNetModel",
	                    "params": unet_params,
	                },
	                "first_stage_config": first_stage_config,
	                "cond_stage_config": {
	                    "target": "audioldm.clap.encoders.CLAPAudioEmbeddingClassifierFreev2",
	                    "params": cond_stage_params,
	                },
	            },
	        },
	    }

	def get_metadata():
	    """Return download metadata for every supported checkpoint.

	    The result maps a checkpoint name to a dict with two keys:
	    ``"path"`` (``<CACHE_DIR>/<name>.ckpt``) and ``"url"`` (the address the
	    file is fetched from).
	    """
	    # (checkpoint name, download URL) in the order they appear in the result.
	    checkpoint_urls = (
	        (
	            "audioldm-s-full",
	            "https://zenodo.org/record/7600541/files/audioldm-s-full?download=1",
	        ),
	        (
	            "audioldm-l-full",
	            "https://zenodo.org/record/7698295/files/audioldm-full-l.ckpt?download=1",
	        ),
	        (
	            "audioldm-s-full-v2",
	            "https://zenodo.org/record/7698295/files/audioldm-full-s-v2.ckpt?download=1",
	        ),
	        (
	            "audioldm-m-text-ft",
	            "https://zenodo.org/record/7813012/files/audioldm-m-text-ft.ckpt?download=1",
	        ),
	        (
	            "audioldm-s-text-ft",
	            "https://zenodo.org/record/7813012/files/audioldm-s-text-ft.ckpt?download=1",
	        ),
	        (
	            "audioldm-m-full",
	            "https://zenodo.org/record/7813012/files/audioldm-m-full.ckpt?download=1",
	        ),
	    )
	    return {
	        ckpt_name: {
	            "path": os.path.join(CACHE_DIR, ckpt_name + ".ckpt"),
	            "url": ckpt_url,
	        }
	        for ckpt_name, ckpt_url in checkpoint_urls
	    }

	class MyProgressBar():
	    """Callable that drives a ``progressbar.ProgressBar`` during a download.

	    Instances are called with ``(block_num, block_size, total_size)``; this
	    file passes one as the third argument of ``urllib.request.urlretrieve``.
	    """

	    def __init__(self):
	        # The bar is created on the first call, once the total size is known.
	        self.pbar = None

	    def __call__(self, block_num, block_size, total_size):
	        """Advance the bar to ``block_num * block_size`` bytes.

	        The bar is finished once that product reaches ``total_size``.
	        """
	        if not self.pbar:
	            self.pbar = progressbar.ProgressBar(maxval=total_size)
	            self.pbar.start()

	        bytes_so_far = block_num * block_size
	        if bytes_so_far >= total_size:
	            self.pbar.finish()
	            return
	        self.pbar.update(bytes_so_far)

	def download_checkpoint(checkpoint_name="audioldm-s-full"):
	    """Download the named checkpoint into the cache unless already present.

	    The target path and URL come from ``get_metadata()``. A download is
	    performed when the file is missing or smaller than ``2 * 10**9`` bytes,
	    which is treated as a sign of an incomplete earlier download.

	    Args:
	        checkpoint_name: One of the keys returned by ``get_metadata()``.

	    Raises:
	        KeyError: If ``checkpoint_name`` is not a supported model name.
	    """
	    meta = get_metadata()
	    if checkpoint_name not in meta:
	        print("The model name you provided is not supported. Please use one of the following: ", meta.keys())
	        # Fail here with a clear message instead of on the lookup below.
	        raise KeyError(
	            "Unsupported checkpoint name %r; expected one of %s"
	            % (checkpoint_name, sorted(meta))
	        )

	    ckpt_path = meta[checkpoint_name]["path"]
	    ckpt_url = meta[checkpoint_name]["url"]

	    # The previous literal `210*9` (1890 bytes) would accept almost any
	    # truncated file; it is taken to be a garbled `2 * 10**9` and restored.
	    min_complete_size = 2 * 10**9
	    if os.path.exists(ckpt_path) and os.path.getsize(ckpt_path) >= min_complete_size:
	        return

	    ckpt_dir = os.path.dirname(ckpt_path)
	    os.makedirs(ckpt_dir, exist_ok=True)
	    print(f"Downloading the main structure of {checkpoint_name} into {ckpt_dir}")

	    urllib.request.urlretrieve(ckpt_url, ckpt_path, MyProgressBar())
	    print(
	        "Weights downloaded in: {} Size: {}".format(
	            ckpt_path,
	            os.path.getsize(ckpt_path),
	        )
	    )