# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
# using a Llama3 8B model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download meta-llama/Meta-Llama-3-8B --output-dir ./model --hf-token <HF_TOKEN>
#
# To launch on 2 devices, run the following command from root:
# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora
#
# You can add specific overrides through the command line. For example,
# to override the checkpointer directory while launching training,
# you can run:
# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single-device LoRA finetuning, please use 8B_lora_single_device.yaml
# or 8B_qlora_single_device.yaml.
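#
# Any field in this file can be overridden the same way, using dotted keys for
# nested fields. For example (the values below are placeholders, not recommendations):
# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora batch_size=2 optimizer.lr=1e-4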

# Tokenizer
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: ./model/original/tokenizer.model

# Model Arguments
model:
  _component_: torchtune.models.llama3.lora_llama3_8b
  lora_attn_modules: ['q_proj', 'v_proj']
  apply_lora_to_mlp: False
  apply_lora_to_output: False
  lora_rank: 8
  lora_alpha: 16
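  # LoRA adds trainable low-rank adapters to the q_proj/v_proj attention
  # projections only; the adapter update is scaled by lora_alpha / lora_rank
  # (here 16 / 8 = 2). Increase lora_rank (and lora_alpha) or set
  # apply_lora_to_mlp: True for more trainable capacity at the cost of memory.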

checkpointer:
  _component_: torchtune.utils.FullModelMetaCheckpointer
  checkpoint_dir: ./model/original/
  checkpoint_files: [
    consolidated.00.pth
  ]
  recipe_checkpoint: null
  output_dir: ./finetuned_model/
  model_type: LLAMA3
resume_from_checkpoint: False
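# Checkpoints are written to checkpointer.output_dir at the end of each epoch.
# To resume an interrupted run, set resume_from_checkpoint: True and point
# recipe_checkpoint at the recipe state file saved by the previous run (this
# assumes the default checkpointing behavior of the torchtune LoRA recipes).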

# Dataset and Sampler
# The dataset config below is equivalent to building:
# InstructDataset(
#     tokenizer=tokenizer,
#     source="grammarly/coedit",
#     template=GrammarErrorCorrectionTemplate,
#     column_map={"sentence": "src", "output": "tgt"},
#     train_on_input=False,
#     split="train",
# )
dataset:
  _component_: torchtune.datasets.instruct_dataset
  source: grammarly/coedit
  template: GrammarErrorCorrectionTemplate
  column_map: {"sentence": "src", "output": "tgt"}
  train_on_input: False
  split: train
seed: 123
shuffle: True
batch_size: 4

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW
  weight_decay: 0.01
  lr: 3e-4
lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 100
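  # The learning rate warms up linearly for num_warmup_steps and then follows
  # a cosine decay; the total number of training steps is supplied by the
  # recipe based on dataset size, batch size, gradient accumulation, and epochs.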
loss:
  _component_: torch.nn.CrossEntropyLoss

# Training
epochs: 2
max_steps_per_epoch: null
gradient_accumulation_steps: 32
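# With batch_size 4, gradient_accumulation_steps 32, and the 2-device launch
# shown above, each optimizer step processes 4 * 32 * 2 = 256 sequences;
# lower gradient_accumulation_steps for more frequent optimizer updates.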

# Logging
output_dir: ./lora_finetune_output
metric_logger:
  _component_: torchtune.utils.metric_logging.WandBLogger
  project: torchtune
  group: llama3-grammarly
log_every_n_steps: null

# Environment
device: cuda
dtype: bf16
enable_activation_checkpointing: False
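# If you run out of GPU memory, set enable_activation_checkpointing: True to
# trade extra compute for lower activation memory, or reduce batch_size.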