model:
  arch: mm_gpt4
  model_type: pretrain_vicuna
  freeze_imagebind: True
  freeze_qformer: False
  max_txt_len: 160
  end_sym: "###"
  low_resource: False
  prompt_path: "prompts/alignment.txt"
  prompt_template: '###Human: {} ###Assistant: '
  ckpt: ['checkpoints/bubogpt_7b.pth',
         # 'checkpoints/mmgpt2_stage1_audio.pth',
         # 'checkpoints/mmgpt2_stage2_mm_5k.pth',
         ]
  with_bind_head: False
  use_blip_vision: True
  joiner_cfg:
    # NOTE: uncomment below to share qformer across modalities
    # share_key: vision
    vision:
      feat_dim: 1408
      post_dims: [768,]
      num_query_token: 32
      freeze_qformer: True
    audio:
      feat_dim: 768

datasets:
  default:  # Double check
    vis_processor:
      eval:
        name: "imagebind_vision_eval"
        image_size: 224
    text_processor:
      eval:
        name: "imagebind_caption"
    audio_processor:
      eval:
        name: "imagebind_audio_eval"
        use_global: True
        clip_duration: 5
        clips_per_video: 6

run:
  task: image_text_pretrain
  evaluate: True
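
# Usage sketch (assumption): configs in this style are typically loaded with
# OmegaConf, which MiniGPT-4/LAVIS-family repos build on. The file path below
# is a hypothetical example, not a path confirmed by this repo; the field
# accesses mirror the keys defined above.
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("eval_configs/bubogpt_eval.yaml")  # hypothetical path
#   print(cfg.model.arch)      # -> mm_gpt4
#   print(cfg.model.ckpt)      # -> ['checkpoints/bubogpt_7b.pth']
#   print(cfg.run.evaluate)    # -> True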