- attn_projector=mlp, per_device_train_batch_size=16, run_name=baseline
- attn_projector=mlp, per_device_train_batch_size=2, run_name=bs2
- attn_projector=mlp, per_device_train_batch_size=2, run_name=bs2_liger, student_model_use_liger=True
- attn_projector=mlp, per_device_train_batch_size=4, run_name=bs4
- attn_projector=mlp, per_device_train_batch_size=8, run_name=bs8
- attn_weight=0.0, per_device_train_batch_size=4, run_name=bs4_NO_liger_baseline, student_model_use_liger=False
- attn_weight=0.0, per_device_train_batch_size=4, run_name=bs4_NO_liger_baseline, student_model_use_liger=True
- attn_weight=0.0, per_device_train_batch_size=4, run_name=logits_bs4_liger_torch_compile, student_model_use_liger=True, torch_compile=True
- attn_weight=0.0, per_device_train_batch_size=4, run_name=logits_bs4_torch_compile, student_model_use_liger=False, torch_compile=True
- attn_weight=0.0, per_device_train_batch_size=4, run_name=logits_only_bs4_liger, student_model_use_liger=True