model:
  arch: mm_gpt4
  model_type: pretrain_vicuna
  freeze_imagebind: True
  freeze_qformer: False
  max_txt_len: 160
  end_sym: "###"
  low_resource: False
  prompt_path: "prompts/alignment.txt"
  prompt_template: '###Human: {} ###Assistant: '
  ckpt: ['checkpoints/bubogpt_7b.pth',
         # 'checkpoints/mmgpt2_stage1_audio.pth',
         # 'checkpoints/mmgpt2_stage2_mm_5k.pth',
         ]
  with_bind_head: False
  use_blip_vision: True
  joiner_cfg:
    # NOTE: uncomment below to share qformer across modalities
    # share_key: vision
    vision:
      feat_dim: 1408
      post_dims: [768,]
      num_query_token: 32
      freeze_qformer: True
    audio:
      feat_dim: 768

datasets:
  default:  # Double check
    vis_processor:
      eval:
        name: "imagebind_vision_eval"
        image_size: 224
    text_processor:
      eval:
        name: "imagebind_caption"
    audio_processor:
      eval:
        name: "imagebind_audio_eval"
        use_global: True
        clip_duration: 5
        clips_per_video: 6

run:
  task: image_text_pretrain
  evaluate: True
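
# Usage sketch (assumption): configs in this style are typically loaded with
# OmegaConf, which MiniGPT-4/LAVIS-family repos build on. The file path below
# is a hypothetical example, not a path confirmed by this repo; the field
# accesses mirror the keys defined above.
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("eval_configs/bubogpt_eval.yaml")  # hypothetical path
#   print(cfg.model.arch)      # -> mm_gpt4
#   print(cfg.model.ckpt)      # -> ['checkpoints/bubogpt_7b.pth']
#   print(cfg.run.evaluate)    # -> True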