# Conditioning configuration: classifier-free guidance, attribute dropout,
# condition fusing, and a CLAP-based `description` conditioner.
#
# NOTE(review): this file was recovered from a copy whose indentation had
# been stripped and whose lines carried stray `|` table-cell markers. The
# nesting below is reconstructed from the key names; confirm it against the
# schema of the code that loads this config. Any header comment the original
# file may have carried was not recoverable and is not reproduced here.

classifier_free_guidance:
  training_dropout: 0.3
  inference_coef: 3.0

# Both groups are empty mappings: no per-attribute dropout is configured.
attribute_dropout:
  text: {}
  wav: {}

fuser:
  cross_attention_pos_emb: false
  cross_attention_pos_emb_scale: 1
  # Presumably each list names the conditions handled by that fusing method
  # (TODO confirm). Only `cross` is non-empty; its entry matches the
  # `description` key under `conditioners`.
  sum: []
  prepend: []
  cross: [description]
  input_interpolate: []

conditioners:
  description:
    # Presumably selects the sibling `clap` section as the active settings;
    # verify against the conditioner builder.
    model: clap
    clap:
      checkpoint: //reference/clap/music_audioset_epoch_15_esc_90.14.pt
      model_arch: 'HTSAT-base'
      enable_fusion: false
      sample_rate: 48000
      # NOTE(review): unit of max_audio_length / audio_stride is not stated
      # in this file (seconds assumed) — confirm.
      max_audio_length: 10
      audio_stride: 1
      dim: 512
      attribute: description
      normalize: true
      quantize: true
      # n_q, bins and kmeans_iters look like quantizer settings that only
      # matter while `quantize` is true — confirm.
      n_q: 12
      bins: 1024
      kmeans_iters: 50
      # Presumably a probability (name ends in `_p`); written as 0.0 rather
      # than the original `0.` — same float value.
      text_p: 0.0
      cache_path: null

dataset:
  joint_embed_attributes: [description]
  train:
    # Presumably probabilities in [0, 1] — TODO confirm.
    merge_text_p: 0.25
    drop_desc_p: 0.5
    drop_other_p: 0.5