# %%
# Fine-tune a LoRA-adapted sequence classifier on the puzzle/answer dataset.
#
# Pipeline: load the CSV -> map string labels to class ids -> split into
# train/validation -> tokenize -> wrap the base model with LoRA adapters ->
# train with the Hugging Face Trainer -> save the adapter and tokenizer ->
# report validation metrics.
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np
from tqdm import tqdm

# Load the dataset
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map labels to expected responses.
# "It doesn't matter" and "Unimportant" deliberately share class id 2.
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,
    "Incorrect questioning": 3,
    "Correct answers": 4,
}

# Number of distinct classes, derived from the mapping so the classifier
# head cannot drift out of sync with the label set (currently 5).
num_labels = len(set(label_mapping.values()))

# Apply label mapping; labels missing from the mapping become NaN.
dataset['label'] = dataset['label'].map(label_mapping)

# Handle NaN values: drop rows where label is NaN. The explicit copy avoids
# pandas' SettingWithCopyWarning on the column assignments below.
dataset = dataset.dropna(subset=['label']).copy()

# Ensure labels are integers (the NaNs introduced by map() made them floats).
dataset['label'] = dataset['label'].astype(int)

# Combine "text" and "puzzle" columns. Missing values are replaced with an
# empty string first: NaN + str yields NaN, which the tokenizer cannot encode.
dataset['combined_text'] = (
    dataset['text'].fillna("").astype(str)
    + " "
    + dataset['puzzle'].fillna("").astype(str)
)

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the dataframes to datasets. preserve_index=False keeps the shuffled
# pandas index from being carried along as an extra column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

# Load the tokenizer and model
model_name = "meta-llama/Meta-Llama-3-8B"  # Replace with the actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Add a padding token if it's not already present. The existing EOS token is
# reused, so no new vocabulary entry is created and the embedding matrix does
# not need to be resized.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Ensure the padding token is set correctly in the model configuration
model.config.pad_token_id = tokenizer.pad_token_id


# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of examples from their ``combined_text`` field.

    Args:
        examples: A batch mapping as passed by ``Dataset.map(batched=True)``;
            must contain the key ``'combined_text'``.

    Returns:
        The tokenizer output, truncated and padded to exactly 128 tokens.
    """
    return tokenizer(
        examples['combined_text'],
        truncation=True,
        padding='max_length',
        max_length=128,
    )


train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=4)  # Use multiprocessing
val_dataset = val_dataset.map(tokenize_function, batched=True, num_proc=4)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=8,  # Increase batch size if memory allows
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.001,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=2,  # Adjust based on memory constraints
    dataloader_num_workers=4,
    logging_steps=100,
    save_total_limit=2,
)

# Load the metric implementations once, instead of on every evaluation pass.
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute weighted precision/recall/F1 and accuracy for one evaluation.

    Args:
        eval_pred: A ``(logits, labels)`` pair as supplied by the Trainer.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1-score"``
        and ``"accuracy"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit.
    predictions = np.argmax(logits, axis=-1)

    precision = precision_metric.compute(
        predictions=predictions, references=labels, average="weighted"
    )["precision"]
    recall = recall_metric.compute(
        predictions=predictions, references=labels, average="weighted"
    )["recall"]
    f1 = f1_metric.compute(
        predictions=predictions, references=labels, average="weighted"
    )["f1"]
    accuracy = accuracy_metric.compute(
        predictions=predictions, references=labels
    )["accuracy"]

    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model with progress bar
trainer.train()

# Save the model
model.save_pretrained('trained_llama_model')
tokenizer.save_pretrained('trained_llama_model')

# Evaluate the model with progress bar
eval_results = trainer.evaluate()
print(eval_results)

# %%