---
# Flat hyperparameter mapping for a `pretrain` run.
# Keys are grouped by apparent purpose; the grouping is editorial only and the
# consumer is assumed to look keys up by name, not by position.

# Run identity
framework: pretrain
tid: base
# null leaves the seed unset; how the consumer treats that is not shown here.
seed: null

# Dataset selection
data_dir: ../../dataset
train_data: all
caption_type: meta_tag_caption_sim
# NOTE(review): presumably a token-length cap for the text side; confirm.
max_length: 128

# Audio loading and spectrogram front-end
audio_loader: ffmpeg
sr: 22050
duration: 10
n_fft: 1024
# NOTE(review): fractional value, so presumably seconds rather than samples;
# confirm against the feature extractor.
hop_size: 0.01
n_mels: 128

# Model architecture
audio_arch: resnet
text_arch: roberta-base
n_heads: 8
width: 64
audio_dim: 768
text_dim: 768
mlp_dim: 128
temperature: 0.1

# Optimization and schedule
batch_size: 768
lr: 5.0e-05
min_lr: 1.0e-09
# NOTE(review): presumably toggles a cosine learning-rate schedule; confirm.
cos: true
warmup_steps: 5000
total_steps: 32768
start_steps: 0
epochs: 19
start_epoch: 0

# Runtime and logging
workers: 4
world_size: 1
gpu: 0
print_freq: 10