from tasks.train.instruction_data import *
# ========================= data ==========================
# train_corpus = "videochat2_instruction"
train_corpus = "videochat2_instruction_full"
train_file = "${available_corpus[${train_corpus}]}" # for lazy evaluation
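# "${...}" placeholders are template strings resolved by the repo's config
# loader at parse time, so train_file picks up whichever entry of
# available_corpus (imported above) train_corpus names.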
test_file = dict()
test_types = []
num_workers = 8
save_steps = 10000
ckpt_steps = 1000
stop_key = None
deepspeed = False
# ========================= input ==========================
num_frames = 16
num_frames_test = 1
batch_size = 1
gradient_accumulation_steps = 16
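# Effective per-GPU batch size = batch_size * gradient_accumulation_steps
# = 1 * 16 = 16 samples per optimizer step (times the number of GPUs overall).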
max_txt_l = 512
max_train_steps = None
pre_text = False
inputs = dict(
    image_res=336,
    video_input=dict(
        num_frames="${num_frames}",
        sample_type="rand",
        num_frames_test="${num_frames_test}",
        sample_type_test="middle",
        random_aug=False,
    ),
    max_txt_l=dict(image="${max_txt_l}", video="${max_txt_l}"),
    batch_size=dict(image="${batch_size}", video="${batch_size}"),
    batch_size_test=dict(image="${batch_size}", video="${batch_size}"),
)
# ========================= model ==========================
model = dict(
    repo_id="llava-hf/llava-v1.6-vicuna-7b-hf",
    pretrained_path=None,
    load_from_origin=False,
    origin_vision="",
    origin_llm="",
    vision_encoder=dict(
        name="vit_l14",  # the dataset uses this to pick the mean/std normalization of the pretrained vision tower
    ),
    torch_dtype='bfloat16',
    freeze_projector=False,
    freeze_lm=True,
    freeze_vision_tower=True,
    lora_target_modules=["q_proj", "v_proj"],  # for llama/mistral/gemma
    use_lora=True,
    lora_r=128,
    lora_alpha=32,
    lora_dropout=0.05,
    num_frames="${num_frames}",
    pooling_method='avg',
    use_pooling=True,
    frame_shape=(24, 24),
    pooling_shape=(16, 8, 8),
)
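# Shape notes: frame_shape=(24, 24) matches a ViT-L/14 tower at image_res=336
# (336 / 14 = 24 patches per side, i.e. 576 tokens per frame); pooling_shape=(16, 8, 8)
# average-pools (pooling_method='avg') each of the 16 frames down to 8x8 tokens,
# giving 16 * 8 * 8 = 1024 visual tokens per video fed to the LLM.
# LoRA note: with lora_r=128 and lora_alpha=32, the LoRA update is scaled by
# alpha / r = 0.25 under the standard LoRA scaling convention.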
preprocess = dict(
    system="",
    mm_alone=True,
    random_shuffle=True,
    add_second_msg=True,
    roles=['USER:', 'ASSISTANT:'],
    end_signal=(' ', ''),
    begin_signal='',
    dataset_image_placeholder='',
    dataset_video_placeholder='',
    image_token_index=32000,
    max_txt_l="${max_txt_l}",
    ignore_index=-100,  # default ignore_index of torch.nn.CrossEntropyLoss
    center_pad=False,
    longest_edge=762,
    shortest_edge=336,
    clip_transform=False,
    num_frames="${num_frames}",
)
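# Presumably (assumption from the field names): inputs are resized so the short
# side is shortest_edge=336 while the long side is capped at longest_edge=762;
# center_pad=False leaves non-square inputs unpadded.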
optimizer = dict(
    opt="adamW",
    lr=2e-5,
    opt_betas=[0.9, 0.999],  # default
    weight_decay=0.02,
    max_grad_norm=-1,  # set a positive float to enable gradient clipping; -1 disables it
    # use a different lr for some modules, e.g., larger lr for new modules
    different_lr=dict(enable=False, module_names=[], lr=1e-3),
)
# scheduler = dict(sched="cosine", epochs=3, min_lr_multi=0.25, warmup_epochs=0.6)
scheduler = dict(
    is_videochat2_custom=False,
    sched="cosine",
    epochs=2,
    warmup_ratio=0.2,
    min_lr_multi=0.25,
)
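# Assuming the usual semantics of these fields: warmup_ratio=0.2 spends the
# first 20% of training steps on warmup, and the cosine schedule then decays
# the learning rate from lr=2e-5 to min_lr_multi * lr = 0.25 * 2e-5 = 5e-6.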
evaluate = False
deep_fusion = False
evaluation = dict(
    eval_frame_ensemble="concat",  # [concat, max, mean, lse]
    eval_x_only=False,
    k_test=128,
    eval_offload=True,  # offload gpu tensors to cpu to save memory
)
fp16 = True
gradient_checkpointing = True
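# Note: torch_dtype='bfloat16' above sets the dtype the weights are loaded in;
# fp16 here presumably toggles the trainer's mixed-precision autocast
# (assumption based on the flag name).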
# ========================= wandb ==========================
wandb = dict(
    enable=False,
    entity="user",  # username or team name to store the runs, see https://docs.wandb.ai/ref/python/init
    project="videochat2",  # set up in your command line
)
dist_url = "env://"
device = "cuda"
mode = "it"
# ========================= others ==========================
output_dir = None # output dir
resume = False # if True, load optimizer and scheduler states as well
debug = False
log_freq = 5
metric_window_size = 10  # window size for metric
seed = 42
report_to='tensorboard'
save_latest = True
auto_resume = True
pretrained_path = ""  # path to pretrained model weights; used for resume only?