from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np

# Load the dataset
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map labels to integer class ids
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,  # Assuming "Unimportant" is synonymous with "It doesn't matter"
    "Incorrect questioning": 3,
    "Correct answers": 4
}

# Apply label mapping
dataset['label'] = dataset['label'].map(label_mapping)

# Drop rows whose label did not match any key above (map() leaves them as NaN)
dataset = dataset.dropna(subset=['label'])

# Ensure labels are integers
dataset['label'] = dataset['label'].astype(int)

# Combine puzzle, truth, and text into a single input string per row
dataset['combined_text'] = (
    "==========================================\n"
    "puzzle: " + dataset['puzzle'] + "\n"
    "==========================================\n"
    "truth: " + dataset['truth'] + "\n"
    "==========================================\n"
    "text: " + dataset['text']
)

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the dataframes to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load the tokenizer and model
model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
# Make sure the classification model knows the padding token id;
# without it, batched training can fail with a "no padding token" error
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenize the data. Note: max_length=128 will truncate most
# puzzle + truth + text combinations; raise it if your inputs are long.
def tokenize_function(examples):
    return tokenizer(examples['combined_text'], truncation=True, padding='max_length', max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the LoRA configuration; with TaskType.SEQ_CLS, PEFT also keeps the
# classification head trainable (it is added to modules_to_save)
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training arguments. Gemma was trained in bfloat16, so bf16=True is
# generally safer than fp16=True on Ampere or newer GPUs.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.001,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
    # Non-reentrant checkpointing so gradients still flow to the LoRA
    # layers when the base model is frozen (requires transformers >= 4.35)
    gradient_checkpointing_kwargs={"use_reentrant": False},
    gradient_accumulation_steps=4,
    dataloader_num_workers=4,
    logging_steps=100,
    save_total_limit=2,
)

# Load the metrics once instead of on every evaluation call
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='macro')["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average='macro')["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')["f1"]
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    return {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save the model; for a PEFT model this writes the LoRA adapter weights
# (plus the classification head from modules_to_save), not the full base model
model.save_pretrained('trained_gemma_model')
tokenizer.save_pretrained('trained_gemma_model')

# Evaluate the model
trainer.evaluate()