# promises detector fine tuning for llama2-7B
from datasets import load_dataset
from transformers import TrainingArguments, BitsAndBytesConfig
from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import LoraConfig
from trl import SFTTrainer

tune_data = load_dataset("csv", data_files='Political_Promises_Fine_Tuning.csv')

optim = "paged_adamw_32bit"
learning_rate = 3e-4
weight_decay = .01
lr_scheduler_type = 'cosine'
warmup_ratio = .03  # proportion of training steps used for learning-rate warmup

# on a 16GB GPU the params are:
gradient_accumulation_steps = 4
bf16 = True
gradient_checkpointing = True

# Label smoothing
# Label 0 is transformed to label_smoothing_factor / num_labels
# Label 1 is transformed to 1 + label_smoothing_factor * (-1 + 1 / num_labels)
label_smoothing_factor = .1

# NEFTune stands for Noisy Embedding Instruction Fine-Tuning: uniform noise in [-1, 1],
# scaled by neftune_noise_alpha / sqrt(seq_len * hidden_dim), is added to the embeddings
neftune_noise_alpha = 5

# batch size (the bigger, the faster training runs, but the more memory it uses and
# the more prone it is to poor generalization/overfitting)
# Tip: reduce max_seq_length to support a larger batch size
# 8 is the value recommended in the book; accelerate's find_executable_batch_size is a
# decorator that wraps a training function and halves the batch size on OOM, so it
# cannot be called here to return a number directly
per_device_train_batch_size = 8
per_device_eval_batch_size = 8
max_grad_norm = 2
group_by_length = True
max_train_epochs = 3

# peft = parameter-efficient fine-tuning
# LoRA:
r = 64
lora_alpha = 8
lora_dropout = 0.1

# Working with reduced precision with bitsandbytes
use_4bit = True
bnb_4bit_compute_dtype = 'float16'
bnb_4bit_quant_type = 'nf4'
use_nested_quant = False

# trl = Transformer Reinforcement Learning, for supervised fine-tuning
max_seq_length = 128
# Packing places multiple short instructions in the same input sequence
packing = True

train_params = TrainingArguments(
    optim=optim,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    gradient_accumulation_steps=gradient_accumulation_steps,
    bf16=bf16,
    gradient_checkpointing=gradient_checkpointing,
    label_smoothing_factor=label_smoothing_factor,
    neftune_noise_alpha=neftune_noise_alpha,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    max_grad_norm=max_grad_norm,
    group_by_length=group_by_length,
    num_train_epochs=max_train_epochs,
    output_dir='./model_outputs',
    save_steps=50,
    logging_steps=10
)

quantize_params = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
)

lora_params = LoraConfig(
    r=r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type='CAUSAL_LM'
)

model = LlamaForCausalLM.from_pretrained(
    pretrained_model_name_or_path='meta-llama/Llama-2-7b',
    quantization_config=quantize_params,
    device_map='auto'
)
tokenizer = LlamaTokenizer.from_pretrained('meta-llama/Llama-2-7b')

sft = SFTTrainer(
    model=model,
    args=train_params,
    train_dataset=tune_data['train'],
    tokenizer=tokenizer,
    peft_config=lora_params,
    max_seq_length=max_seq_length,
    dataset_text_field='text',
    packing=packing
)
sft.train()
sft.model.save_pretrained('/path/to/llama-2-it')
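
# ---------------------------------------------------------------------------
# Optional follow-up: loading the saved LoRA adapter for inference.
# This is a minimal sketch, not part of the training run above; it assumes the
# adapter was saved to '/path/to/llama-2-it' by save_pretrained(), that the same
# base checkpoint and quantization settings are available, and that the prompt
# format below roughly matches the 'text' field used during fine-tuning.
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig

base = LlamaForCausalLM.from_pretrained(
    'meta-llama/Llama-2-7b',
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype='float16',
        bnb_4bit_quant_type='nf4',
    ),
    device_map='auto'
)
tokenizer = LlamaTokenizer.from_pretrained('meta-llama/Llama-2-7b')

# Attach the fine-tuned LoRA weights on top of the frozen, quantized base model
model = PeftModel.from_pretrained(base, '/path/to/llama-2-it')
model.eval()

# Hypothetical prompt; the real prompt template depends on the fine-tuning data
prompt = "Statement: We will cut taxes for every household next year.\nIs this a political promise?"
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))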