# File size: 1,066 Bytes
# e4bd7f9
# Model section: architecture/model-type selection, text and prompt settings,
# checkpoint list, and per-modality joiner settings.
model:
  arch: mm_gpt4
  model_type: pretrain_vicuna
  freeze_imagebind: true
  freeze_qformer: false
  max_txt_len: 160
  # Must stay quoted: a bare ### would be read as a YAML comment.
  end_sym: '###'
  low_resource: false
  prompt_path: 'prompts/alignment.txt'
  # The trailing space after "Assistant:" is part of the value; keep the quotes.
  prompt_template: "###Human: {} ###Assistant: "
  # Checkpoint paths; one entry is active, the commented entries are disabled.
  ckpt:
    - 'checkpoints/bubogpt_7b.pth'
    # - 'checkpoints/mmgpt2_stage1_audio.pth'
    # - 'checkpoints/mmgpt2_stage2_mm_5k.pth'
  with_bind_head: false
  use_blip_vision: true
  joiner_cfg:
    # NOTE: uncomment below to share qformer across modalities
    # share_key: vision
    vision:
      feat_dim: 1408
      post_dims: [768]
      num_query_token: 32
      # NOTE(review): differs from the top-level freeze_qformer (false);
      # confirm which value the consumer applies to the vision branch.
      freeze_qformer: true
    audio:
      feat_dim: 768


# Dataset section: eval-time vision, text, and audio processor settings for
# the `default` entry.
datasets:
  default:  # Double check
    vis_processor:
      eval:
        name: 'imagebind_vision_eval'
        image_size: 224
    text_processor:
      eval:
        name: 'imagebind_caption'
    audio_processor:
      eval:
        name: 'imagebind_audio_eval'
        use_global: true
        # NOTE(review): unit is not stated here (presumably seconds) - confirm.
        clip_duration: 5
        clips_per_video: 6
# Run section: task name and the evaluation flag.
run:
  task: image_text_pretrain
  evaluate: true