from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
import os
from uuid import uuid4
import numpy as np
import pandas as pd
import subprocess
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import evaluate
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
### Define functions
def max_token_len(dataset):
    """Return the longest tokenized length of the 'text' column in a dataset split."""
    max_seq_length = 0
    for row in dataset:
        tokens = len(tokenizer(row['text'])['input_ids'])
        if tokens > max_seq_length:
            max_seq_length = tokens
    return max_seq_length
### Set up models and datasets, training parameters
# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
model_name = 'mistralai/Mistral-7B-v0.1'
# model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_max_length = tokenizer.model_max_length
print("Model Max Length:", model_max_length)
# dataset = load_dataset("imdb", split="train")
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset = load_dataset(dataset_name)
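# Note: the steps below assume this dataset exposes 'train' and 'validation'
# splits, each with a 'text' column.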
# Write dataset files into data directory
data_directory = './fine_tune_data/'
# Create the data directory if it doesn't exist
os.makedirs(data_directory, exist_ok=True)
# Write the train data to a CSV file
train_data='train_data'
train_filename = os.path.join(data_directory, train_data)
dataset['train'].to_pandas().to_csv(train_filename+'.csv', columns=['text'], index=False)
max_token_length_train=max_token_len(dataset['train'])
print('Max token length train: '+str(max_token_length_train))
# Write the validation data to a CSV file
validation_data='validation_data'
validation_filename = os.path.join(data_directory, validation_data)
dataset['validation'].to_pandas().to_csv(validation_filename+'.csv', columns=['text'], index=False)
max_token_length_validation=max_token_len(dataset['validation'])
print('Max token length validation: '+str(max_token_length_validation))
max_token_length=max(max_token_length_train,max_token_length_validation)
# max_token_length=max_token_length_train
if max_token_length > model_max_length:
    raise ValueError(
        f"Maximum token length in the data ({max_token_length}) exceeds the model limit ({model_max_length})."
    )
block_size=2*max_token_length
print('Block size: '+str(block_size))
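# block_size doubles the longest observed example, presumably to leave headroom;
# it is reused below as the truncation length when tokenizing.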
# Define project parameters
username='ai-aerospace'
project_name='./llms/'+'ams_data_train-100_'+str(uuid4())
repo_name='ams-data-train-100-'+str(uuid4())
model_params={
    "project_name": project_name,
    "model_name": model_name,
    "repo_id": username+'/'+repo_name,
    "train_data": train_data,
    "validation_data": validation_data,
    "data_directory": data_directory,
    "block_size": block_size,
    "model_max_length": max_token_length,
    "logging_steps": -1,
    "evaluation_strategy": "epoch",
    "save_total_limit": 1,
    "save_strategy": "epoch",
    "mixed_precision": "fp16",
    "lr": 0.00003,
    "epochs": 3,
    "batch_size": 2,
    "warmup_ratio": 0.1,
    "gradient_accumulation": 1,
    "optimizer": "adamw_torch",
    "scheduler": "linear",
    "weight_decay": 0,
    "max_grad_norm": 1,
    "seed": 42,
    "quantization": "int4",
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05
}
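# Export every parameter as a (stringified) environment variable so external
# tooling, e.g. an autotrain-style launcher, can pick them up if needed; the
# TrainingArguments below read from the model_params dict directly.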
for key, value in model_params.items():
os.environ[key] = str(value)
print(model_params)
args_custom=transformers.TrainingArguments(
    per_device_train_batch_size=model_params['batch_size'],
    per_device_eval_batch_size=model_params['batch_size'],
    gradient_accumulation_steps=model_params['gradient_accumulation'],
    warmup_ratio=model_params['warmup_ratio'],
    num_train_epochs=model_params['epochs'],
    learning_rate=model_params['lr'],
    fp16=True,
    logging_steps=model_params['logging_steps'],
    save_total_limit=model_params['save_total_limit'],
    evaluation_strategy=model_params['evaluation_strategy'],
    metric_for_best_model="f1",
    output_dir='model_outputs',
    logging_dir='model_outputs',
    optim=model_params['optimizer'],
    max_grad_norm=model_params['max_grad_norm'],
    weight_decay=model_params['weight_decay'],
    lr_scheduler_type=model_params['scheduler']
)
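# Note: metric_for_best_model="f1" refers to the "f1" key produced by the
# compute_metrics function defined further below; it only influences checkpoint
# selection when load_best_model_at_end (or an early-stopping callback) is
# enabled, which args_custom does not set.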
### Args from medium article
args_medium=transformers.TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=12276,
    learning_rate=2e-4,
    fp16=True,
    eval_steps=1000,
    logging_steps=1000,
    save_steps=1000,
    evaluation_strategy="steps",
    do_eval=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    output_dir='model_outputs',
    logging_dir='model_outputs',
    remove_unused_columns=False,
    report_to='wandb'  # enable logging to W&B
)
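# args_medium mirrors the settings from the article linked below and is kept for
# reference only; the Trainer at the bottom of the script uses args_custom.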
###
### Load model and peft config, calculate trainable parameters
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True
)
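# load_in_4bit requires the bitsandbytes package; together with the LoRA adapter
# below this amounts to a QLoRA-style setup. Depending on the installed
# transformers version, 4-bit loading may instead need to be requested through a
# BitsAndBytesConfig passed as quantization_config.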
peft_config = LoraConfig(
    r=model_params['lora_r'],
    lora_alpha=model_params['lora_alpha'],
    lora_dropout=model_params['lora_dropout']
)
lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()
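# print_trainable_parameters reports the LoRA adapter parameter count relative to
# the frozen base model. LoraConfig above leaves target_modules and task_type
# unset, so peft falls back to its per-architecture defaults (typically the
# attention projections) when it recognizes the model; explicitly setting
# task_type=TaskType.CAUSAL_LM is common for decoder-only models.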
### Train model
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    results = {}
    results.update(f1_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(recall_metric.compute(predictions=predictions, references=labels, average="macro"))
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))
    results.update(precision_metric.compute(predictions=predictions, references=labels, average="macro"))
    return results
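# Caveat: for a causal LM, eval_pred.predictions has shape (batch, seq_len, vocab)
# and labels has shape (batch, seq_len), so the argmax yields token-level
# predictions (offset by one position, since logits at position i predict token
# i+1), and materializing full logits over a 32k vocabulary can exhaust memory.
# Passing a preprocess_logits_for_metrics function to the Trainer is a common way
# to shrink what gets stored if token-level metrics are kept.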
# See https://towardsdatascience.com/fine-tune-your-llm-without-maxing-out-your-gpu-db2278603d78 for details
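# The raw splits only contain a 'text' column, while transformers.Trainer expects
# tokenized features plus a collator. A minimal sketch under that assumption (the
# tokenize_fn / tokenized_dataset / data_collator names are ad hoc, and eos is
# reused as the pad token because the Mistral tokenizer ships without one):
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(examples):
    return tokenizer(examples['text'], truncation=True, max_length=block_size)

tokenized_dataset = dataset.map(
    tokenize_fn, batched=True, remove_columns=dataset['train'].column_names
)
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)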
trainer = transformers.Trainer(
    model=lora_model,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    args=args_custom
)
trainer.train()