---
# Configuration for a run with `framework: pretrain`.
# One key per line: a block mapping cannot carry several `key: value` pairs on
# a single line, so the collapsed one-line form does not load as a mapping.
# NOTE(review): the consumer of this file is not visible here; notes on what a
# key means are inferred from its name and must be confirmed against the loader.

# Run / dataset selection
framework: pretrain
data_dir: ../../dataset  # relative path; base directory depends on the loader - TODO confirm
train_data: all
caption_type: meta_tag_caption_sim
workers: 4

# Optimisation schedule
total_steps: 32768
start_steps: 0
batch_size: 768
world_size: 1
# Keep the decimal point and the signed exponent in the two rates below: some
# YAML 1.1 loaders read shorter forms such as `5e-5` as a string, not a float.
lr: 5.0e-05
min_lr: 1.0e-09
seed: null  # explicit null; how a missing seed is handled is up to the consumer
print_freq: 10
cos: true  # presumably toggles a cosine schedule - TODO confirm

# Audio front end
n_fft: 1024
hop_size: 0.01  # NOTE(review): fractional, so presumably seconds rather than samples - confirm
sr: 22050
duration: 10  # presumably seconds - TODO confirm
max_length: 128  # presumably a text/token length limit - TODO confirm
audio_loader: ffmpeg

# Model
audio_arch: resnet
text_arch: roberta-base
n_heads: 8
width: 64
n_mels: 128
audio_dim: 768
text_dim: 768
mlp_dim: 128
temperature: 0.1

# Run bookkeeping
tid: base
gpu: 0
# NOTE(review): both a step budget (total_steps / start_steps) and an epoch
# budget (epochs / start_epoch) are set; confirm which one the trainer honours.
epochs: 19
start_epoch: 0
warmup_steps: 5000