model:
  arch: mm_gpt4
  model_type: pretrain_vicuna
  freeze_imagebind: True
  freeze_qformer: False
  max_txt_len: 160
  end_sym: "###"
  low_resource: False
  prompt_path: "prompts/alignment.txt"
  prompt_template: '###Human: {} ###Assistant: '
  ckpt: [
    "bubogpt/output/mmgpt4_stage2_mm_blipvision_13b/20230701204/checkpoint_4.pth",
  ]
  with_bind_head: False
  use_blip_vision: True
  proj_model: "checkpoints/prerained_minigpt4_13b.pth"
  llama_model: "/mnt/bn/bykang/chixma/data/pretrained_models/vicuna-13b-v0/"
  joiner_cfg:
    # NOTE: uncomment below to share qformer across modalities
    # share_key: vision
    vision:
      feat_dim: 1408
      post_dims: [768,]
      num_query_token: 32
      freeze_qformer: True
    audio:
      feat_dim: 768

datasets:
  default:  # Double check
    vis_processor:
      eval:
        name: "imagebind_vision_eval"
        image_size: 224
    text_processor:
      eval:
        name: "imagebind_caption"
    audio_processor:
      eval:
        name: "imagebind_audio_eval"
        # d2c18
        # clip_duration: 2
        # clips_per_video: 18
        # d5c6
        use_global: True
        clip_duration: 5
        clips_per_video: 6

run:
  task: image_text_pretrain
  evaluate: True
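
# Usage sketch (assumption, not part of the original config): configs like this
# are typically consumed by a MiniGPT-4/LAVIS-style runner that takes the YAML
# path via a --cfg-path argument, for example:
#
#   python evaluate.py --cfg-path eval_configs/mmgpt4_eval.yaml
#
# The entry-point script name and config filename above are hypothetical and
# may differ in the actual repository; paths under model (ckpt, proj_model,
# llama_model) must also be adjusted to local checkpoint locations.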