import torch, os, wandb, uuid, json
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, TrainerCallback
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from accelerate import Accelerator
from accelerate.utils import set_seed
from datasets import load_dataset, DatasetDict, Dataset, load_from_disk
from functools import partial
set_seed(42)
accelerator = Accelerator()
run_id = str(uuid.uuid4())
modelpath="microsoft/phi-2"
dataset_name="teknium/OpenHermes-2.5"
lr=0.00002
bs=10 # batch size
bs_eval=16 # batch size for evals
ga_steps=4 # gradient acc. steps
epochs=1
max_length=1024
output_dir=f"out_{run_id}"
# Load model
model = AutoModelForCausalLM.from_pretrained(
modelpath,
device_map={"": accelerator.process_index},
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_quant_type="nf4",
),
torch_dtype=torch.bfloat16,
# does not work yet
# attn_implementation="flash_attention_2",
)
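# Optional sanity check (not in the original script): report how much memory the
# 4-bit base model occupies on this rank before the adapters are attached.
print(f"rank {accelerator.process_index}: base model uses ~{model.get_memory_footprint()/1e9:.1f} GB")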
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False) # fast tokenizer sometimes ignores the added tokens
# Add <|im_start|> and <PAD> as regular tokens; <|im_end|> is added below as the eos token
tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
model.config.eos_token_id = tokenizer.eos_token_id
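# Sanity check (not in the original script): phi-2's embedding matrix is padded to
# 51,200 rows while its tokenizer ships with roughly 50,300 entries, so the tokens
# added above should fit without resizing. If this assertion ever fails, call
# model.resize_token_embeddings(len(tokenizer)) before attaching the adapters.
assert len(tokenizer) <= model.get_input_embeddings().num_embeddings, \
    "tokenizer outgrew the embedding matrix - resize_token_embeddings() is required"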
# Add adapters to model
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
lora_config = LoraConfig(
r=32,
lora_alpha=32,
target_modules = [ "q_proj", "k_proj", "v_proj", "dense" ],
modules_to_save = ["lm_head", "embed_tokens"],
lora_dropout=0.1,
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.config.use_cache = False
# Print stats
if accelerator.is_main_process:
    model.print_trainable_parameters()
# Load dataset
with accelerator.main_process_first():
    dataset = load_dataset(dataset_name)
    dataset = dataset["train"].train_test_split(test_size=0.1)
# Format (chatML) and tokenize dataset
templates= {
"system": "<|im_start|>system\n{msg}<|im_end|>",
"human": "<|im_start|>user\n{msg}<|im_end|>",
"gpt": "<|im_start|>assistant\n{msg}<|im_end|>",
}
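# Example of the rendered format (illustration only):
#   templates["human"].format(msg="Hi!") -> "<|im_start|>user\nHi!<|im_end|>"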
IGNORE_INDEX=-100
def tokenize(input, max_length):
    input_ids, attention_mask, labels = [], [], []

    for msg in input["conversations"]:
        msg_role = msg["from"]
        msg_content = msg["value"]
        isHuman = msg_role == "human"

        # skip turns with an unknown role instead of returning None, which would crash dataset.map()
        if msg_role not in templates:
            continue

        msg_chatml = templates[msg_role].format(msg=msg_content)
        msg_tokenized = tokenizer(msg_chatml, truncation=False, add_special_tokens=False)

        input_ids += msg_tokenized["input_ids"]
        attention_mask += msg_tokenized["attention_mask"]
        # mask the user turns so the loss is only computed on system/assistant tokens
        labels += ([IGNORE_INDEX] * len(msg_tokenized["input_ids"]) if isHuman else msg_tokenized["input_ids"])

    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }
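# Quick self-check (not in the original script): human tokens are masked with
# IGNORE_INDEX, assistant tokens are kept as labels.
if accelerator.is_main_process:
    _demo = tokenize({"conversations": [
        {"from": "human", "value": "Hi!"},
        {"from": "gpt",   "value": "Hello!"},
    ]}, max_length=max_length)
    assert _demo["labels"][0] == IGNORE_INDEX and _demo["labels"][-1] != IGNORE_INDEX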
dataset_tokenized = dataset.map(
    partial(tokenize, max_length=max_length),
    batched=False,
    # num_proc=os.cpu_count()//accelerator.num_processes,
    num_proc=os.cpu_count(),    # tokenize in parallel across all CPU cores
    remove_columns=dataset["train"].column_names,    # the raw columns are no longer needed, we have tokens from here on
)
# collate function - transforms a list of dicts [ {input_ids: [123, ..]}, .. ]
# into a single padded batch dict { input_ids: [..], labels: [..], attention_mask: [..] }
def collate(elements):
    tokens = [e["input_ids"] for e in elements]
    tokens_maxlen = max([len(t) for t in tokens])

    for sample in elements:
        input_ids = sample["input_ids"]
        labels = sample["labels"]
        attention_mask = sample["attention_mask"]

        # right-pad every sample to the longest sequence in the batch
        pad_len = tokens_maxlen - len(input_ids)
        input_ids.extend(pad_len * [tokenizer.pad_token_id])
        labels.extend(pad_len * [IGNORE_INDEX])
        attention_mask.extend(pad_len * [0])

    batch = {
        "input_ids": torch.tensor([e["input_ids"] for e in elements]),
        "labels": torch.tensor([e["labels"] for e in elements]),
        "attention_mask": torch.tensor([e["attention_mask"] for e in elements]),
    }
    return batch
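# Quick check of the collator (illustration, not in the original script): two
# samples of different lengths are right-padded to the same length.
if accelerator.is_main_process:
    _batch = collate([
        {"input_ids": [1, 2, 3], "attention_mask": [1, 1, 1], "labels": [1, 2, 3]},
        {"input_ids": [4],       "attention_mask": [1],       "labels": [4]},
    ])
    assert _batch["input_ids"].shape == (2, 3)
    assert _batch["attention_mask"][1].tolist() == [1, 0, 0]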
steps_per_epoch=len(dataset_tokenized["train"])//(accelerator.num_processes*bs*ga_steps)
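# Worked example with hypothetical numbers: ~900k training samples on 2 GPUs with
# bs=10 and ga_steps=4 give 900_000 // (2*10*4) = 11_250 optimizer steps per epoch.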
args = TrainingArguments(
output_dir=output_dir,
per_device_train_batch_size=bs,
per_device_eval_batch_size=bs_eval,
evaluation_strategy="steps",
logging_steps=1,
    eval_steps=steps_per_epoch//3,    # evaluate three times per epoch
    save_steps=steps_per_epoch//3,    # save three times per epoch
gradient_accumulation_steps=ga_steps,
num_train_epochs=epochs,
lr_scheduler_type="constant",
optim="paged_adamw_32bit", # val_loss will go nan with paged_adamw_8bit
learning_rate=lr,
group_by_length=False,
bf16=True,
ddp_find_unused_parameters=False,
)
trainer = Trainer(
model=model,
tokenizer=tokenizer,
args=args,
data_collator=collate,
train_dataset=dataset_tokenized["train"],
eval_dataset=dataset_tokenized["test"],
)
if accelerator.is_main_process:
    run = wandb.init(
        project="phi2-teknium1",
        name=modelpath+"_"+dataset_name+f"_bs-{bs}_LR-{lr}_GPUs-{accelerator.num_processes}_maxlen-{max_length}_{run_id}",
        config={
            "model_name": modelpath,
            "run_id": run_id,
            "dataset": dataset_name,
            "output_dir": output_dir,
            "lr": lr,
            "max_length": max_length,
            "train_batch_size": bs,
            "validation_batch_size": bs_eval,
            "ga_steps": ga_steps,
            "lora_config": lora_config,
            "training_args": args,
            "GPUs": accelerator.num_processes,
        }
    )
    run.log_code()
trainer.train()
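# Not part of the original script: one way to persist the trained adapter together
# with the tokenizer (which carries the added ChatML tokens) for later reloading.
trainer.save_model(f"{output_dir}/final")    # saves the PEFT adapter incl. modules_to_save
if accelerator.is_main_process:
    tokenizer.save_pretrained(f"{output_dir}/final")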