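# Spaces: Sleeping Sleeping
# NOTE(review): the line above looks like page-status text captured together
# with this file rather than configuration; kept as a comment — confirm and remove.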
# @package __global__

# Conditioning configuration: a single `description` conditioner backed by a
# CLAP model, routed through the fuser's `cross` list.
# NOTE(review): nesting was reconstructed from the key grouping after the
# original indentation was lost — confirm against the upstream file.

classifier_free_guidance:
  training_dropout: 0.3
  inference_coef: 3.0

# No per-attribute dropout is configured for either group.
attribute_dropout:
  text: {}
  wav: {}

# `description` is the only condition listed, and it appears under `cross`;
# every other fusing list is empty.
fuser:
  cross_attention_pos_emb: false
  cross_attention_pos_emb_scale: 1
  sum: []
  prepend: []
  cross: [description]
  input_interpolate: []

conditioners:
  description:
    model: clap  # selects the `clap` sub-section below
    clap:
      checkpoint: //reference/clap/music_audioset_epoch_15_esc_90.14.pt
      model_arch: 'HTSAT-base'
      enable_fusion: false
      sample_rate: 48000
      max_audio_length: 10  # presumably seconds — TODO confirm
      audio_stride: 1  # presumably seconds — TODO confirm
      dim: 512
      attribute: description
      normalize: true
      quantize: true  # use RVQ quantization
      n_q: 12
      bins: 1024
      kmeans_iters: 50
      text_p: 0.  # probability of using text embed at train time
      cache_path: null

dataset:
  joint_embed_attributes: [description]
  train:
    merge_text_p: 0.25
    drop_desc_p: 0.5
    drop_other_p: 0.5