# Configuration with three top-level sections:
#   model    - architecture, frozen components, and checkpoint locations
#   datasets - per-dataset visual/text processor settings
#   run      - task selection
#
# All checkpoint paths below are relative.
# NOTE(review): presumably resolved against the process working directory;
# confirm against the loader.

model:
  arch: video_llama
  model_type: pretrain_vicuna
  freeze_vit: True
  freeze_qformer: True
  max_txt_len: 512
  # Must stay quoted: an unquoted `###` would be read as a YAML comment,
  # leaving the value null.
  end_sym: "###"
  low_resource: False

  frozen_llama_proj: False

  # Checkpoint locations.
  llama_model: "Video-LLaMA-2-7B-Finetuned/llama-2-7b-chat-hf"
  imagebind_ckpt_path: "Video-LLaMA-2-7B-Finetuned"
  # NOTE(review): going by the file names, `ckpt` looks like the
  # vision-language branch and `ckpt_2` the audio-language branch; confirm.
  ckpt: "Video-LLaMA-2-7B-Finetuned/VL_LLaMA_2_7B_Finetuned.pth"
  ckpt_2: "Video-LLaMA-2-7B-Finetuned/AL_LLaMA_2_7B_Finetuned.pth"

  equip_audio_branch: True  # whether to equip the audio branch
  fusion_head_layers: 2
  max_frame_pos: 32
  fusion_header_type: "seqTransf"

datasets:
  webvid:
    # Visual processor settings, registered under the `train` key.
    vis_processor:
      train:
        name: "alpro_video_eval"
        n_frms: 8
        image_size: 224
    # Text processor settings, registered under the `train` key.
    text_processor:
      train:
        name: "blip_caption"

run:
  task: video_text_pretrain