import os

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import SFTTrainer

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# Base model to fine-tune (local path or Hugging Face Hub id).
model_name = "mistral-hermes"

torch.cuda.empty_cache()

# Name under which the trained LoRA adapter will be saved.
new_model_name = "mistral-mfs-reference"

# Output directory for checkpoints and TensorBoard logs.
output_dir = "./mistral-mfs-reference"
tb_log_dir = "./mistral-mfs-reference/logs"

# Training schedule.
max_steps = 500
num_train_epochs = 1           # max_steps takes precedence when both are set
per_device_train_batch_size = 4
per_device_eval_batch_size = 1
gradient_accumulation_steps = 4
learning_rate = 2e-5
lr_scheduler_type = "linear"
warmup_ratio = 0.03
max_grad_norm = 0.3
weight_decay = 0.001
max_seq_length = 4096
save_steps = 1000
logging_steps = 1
local_rank = -1                # -1: no torch.distributed launcher

# LoRA hyperparameters.
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

# Group samples of similar length into the same batch to reduce padding.
group_by_length = True

# 4-bit quantization (QLoRA) settings.
use_4bit = True
use_nested_quant = False             # double quantization
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"

# Mixed precision.
fp16 = True
bf16 = False

# Pack several short samples into one sequence (disabled here).
packing = False

# Trade compute for memory during the backward pass.
gradient_checkpointing = True

# Paged 32-bit AdamW optimizer, suited to quantized training.
optim = "paged_adamw_32bit"

# Load the whole model onto GPU 0.
device_map = {"": 0}

report_to = "tensorboard"
# LoRA configuration: adapt only the attention query and value projections.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    inference_mode=False,
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)

# Tokenizer of the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
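# Note (addition, not in the original script): Mistral-style tokenizers often
# ship without a pad token, which batched training needs. A common workaround,
# applied here only if the tokenizer really lacks one, is to reuse the EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"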
# Each sample's "conversation" field already contains the full prompt;
# append the EOS token so the model learns where a completion ends.
def format_alpaca(sample):
    prompt = f"{sample['conversation']}"
    return prompt


def template_dataset(sample):
    sample["text"] = f"{format_alpaca(sample)}{tokenizer.eos_token}"
    return sample


# Load the training corpus from a local JSON file.
data_files = {"train": "references_mfs_corpus.json"}
dataset = load_dataset("json", data_files=data_files, split="train")

# Shuffle, then map every sample to a single "text" column.
dataset = dataset.shuffle(seed=42)
dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))

print(dataset[40])
# Map the dtype string onto the corresponding torch dtype.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# 4-bit NF4 quantization config for QLoRA.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Ampere GPUs (compute capability 8.0) and newer support bfloat16.
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: you can accelerate training by setting bf16 = True")
        print("=" * 80)

# Load the base model in 4-bit on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)

# Disable the KV cache (incompatible with gradient checkpointing during training)
# and silence the tensor-parallelism heuristic used by some checkpoints.
model.config.use_cache = False
model.config.pretraining_tp = 1
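# Optional preparation of the quantized model for k-bit training (casts layer
# norms to fp32, enables input gradients for checkpointing). Recent TRL
# versions do this automatically when a peft_config is passed to SFTTrainer,
# so treat this explicit call as a hedged extra step, not part of the original
# script; drop it if your TRL version already handles it.
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(
    model, use_gradient_checkpointing=gradient_checkpointing
)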
torch.cuda.empty_cache()

# Hugging Face training arguments assembled from the variables defined above.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    logging_dir=tb_log_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to=report_to,
)
# Supervised fine-tuning: SFTTrainer attaches the LoRA adapter to the
# quantized base model and trains on the "text" column.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)
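# Optional sanity check (an addition to the original script): once the adapter
# is attached, only a small fraction of the weights should be trainable.
if hasattr(trainer.model, "print_trainable_parameters"):
    trainer.model.print_trainable_parameters()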
trainer.train()

# Save the trained LoRA adapter (unwrap the DataParallel/DDP wrapper if present).
model_to_save = trainer.model.module if hasattr(trainer.model, "module") else trainer.model
model_to_save.save_pretrained(new_model_name)

torch.cuda.empty_cache()

from peft import AutoPeftModelForCausalLM
# Reload the adapter, merge the LoRA weights into the base model, and save a
# standalone model that no longer needs PEFT at inference time.
model = AutoPeftModelForCausalLM.from_pretrained(
    new_model_name, device_map="auto", torch_dtype=torch.bfloat16
)
model = model.merge_and_unload()

output_merged_dir = os.path.join(new_model_name, new_model_name)
model.save_pretrained(output_merged_dir, safe_serialization=True)
tokenizer.save_pretrained(output_merged_dir)
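# Quick generation sanity check (an addition to the original script; the prompt
# below is only a placeholder, adapt it to your corpus). The KV cache is
# re-enabled for inference since it was disabled for training.
model.config.use_cache = True
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generator("Bonjour,", max_new_tokens=50)[0]["generated_text"])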