# Hugging Face Hub file-viewer residue (page chrome, not part of the script):
# nicoleathy's picture | Upload 2 files | 9b7e8b1 verified | raw | history blame | 4.23 kB
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np
# Load the dataset
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map labels to expected responses.
# The integer ids are used as class indices for the 5-way classification head,
# so they must stay in the range 0..4.
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,  # Assuming "unimportant" is synonymous with "It doesn't matter"
    "Incorrect questioning": 3,
    "Correct answers": 4,
}

# Apply label mapping: any value that is missing or not a key of
# label_mapping becomes NaN here.
encoded_labels = dataset['label'].map(label_mapping)
unmapped_rows = encoded_labels.isna()

# Rows without a usable label are still dropped (as before), but the raw
# values being discarded are reported so that a typo or an unexpected label
# spelling in the CSV does not silently shrink the training set.
if unmapped_rows.any():
    dropped_counts = (
        dataset.loc[unmapped_rows, 'label'].value_counts(dropna=False).to_dict()
    )
    print(
        f"Dropping {int(unmapped_rows.sum())} rows with unmapped labels: "
        f"{dropped_counts}"
    )

# Take an explicit copy of the kept rows so that the column assignments that
# follow write to an independent frame (avoids pandas' SettingWithCopyWarning).
dataset = dataset.loc[~unmapped_rows].copy()

# Ensure labels are integers (the NaN rows are gone, so the cast is safe).
dataset['label'] = encoded_labels.loc[~unmapped_rows].astype(int)
# Format puzzle, truth, text into rows.
# A missing cell in any of the three columns would turn the whole concatenated
# value into NaN, which is not valid tokenizer input; non-string cells would
# fail the concatenation outright. Normalise every column to plain strings
# first (missing -> empty string).
puzzle_text, truth_text, question_text = (
    dataset[column].fillna('').astype(str)
    for column in ('puzzle', 'truth', 'text')
)

# Visual separator placed before each of the three sections.
section_rule = "==========================================\n"

dataset['combined_text'] = (
    section_rule + "puzzle: " + puzzle_text + "\n"
    + section_rule + "truth: " + truth_text + "\n"
    + section_rule + "text: " + question_text
)
# Split the dataset into training and validation sets (80/20, fixed seed so
# the split is reproducible).
# NOTE(review): the split is not stratified by label; if the classes are
# imbalanced, consider stratify=dataset['label'] -- confirm every class has
# at least two rows before enabling it.
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the dataframes to datasets.
# After the shuffled split the pandas index is no longer a plain range, and
# Dataset.from_pandas would otherwise store it as an extra
# `__index_level_0__` column; preserve_index=False keeps only the real columns.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
# Load the tokenizer and model
model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Size the classification head from the label ids actually in use instead of
# repeating the count by hand: class ids must lie in [0, num_labels), so the
# head needs max(id) + 1 outputs (5 with the current label_mapping).
num_labels = max(label_mapping.values()) + 1
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)
# Tokenize the data
def tokenize_function(examples):
    """Tokenize the ``combined_text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded rows have the same length.

    Args:
        examples: A batch (mapping of column name to list of values) that
            contains a ``combined_text`` column.

    Returns:
        The tokenizer output for the batch.
    """
    # NOTE(review): combined_text is puzzle + truth + text; if the first two
    # are long, 128 tokens may cut off the trailing "text" part -- confirm
    # against the token-length distribution of the data.
    texts = examples['combined_text']
    encoded = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    return encoded
# Run the tokenizer over both splits in batches (training split first).
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# Set the format for PyTorch: expose only the model inputs and the label,
# returned as torch tensors.
model_input_columns = ['input_ids', 'attention_mask', 'label']
for split in (train_dataset, val_dataset):
    split.set_format(type='torch', columns=model_input_columns)
# Define LoRA configuration: rank-16 adapters on the attention query/value
# projections, for a sequence-classification task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)

# Training is configured with gradient_checkpointing=True while the base
# weights are frozen by LoRA. With checkpointing, a layer whose inputs do not
# require grad may produce no gradients for the adapter weights inside it;
# making the embedding output require grad lets gradients reach the adapters.
model.enable_input_require_grads()

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()
# Training arguments, grouped by concern and merged into one TrainingArguments.

# Optimiser and learning-rate schedule.
optimizer_settings = {
    "learning_rate": 1e-4,
    "lr_scheduler_type": "linear",
    "warmup_ratio": 0.1,
    "max_grad_norm": 0.3,
    "weight_decay": 0.001,
}

# Batch sizes and run length. With 4 accumulation steps the effective
# per-device training batch is 4 * 4 = 16 examples per optimiser step.
batching_settings = {
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "num_train_epochs": 3,
    "dataloader_num_workers": 4,
}

# Evaluate and checkpoint once per epoch; keep at most two checkpoints and
# restore the best one when training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
checkpoint_settings = {
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "save_total_limit": 2,
}

# Memory/precision trade-offs.
# NOTE(review): fp16 mixed precision is requested here; Gemma 2 checkpoints
# are usually run in bfloat16 -- confirm fp16 is numerically stable for this
# model on the target hardware.
memory_settings = {
    "fp16": True,
    "gradient_checkpointing": True,
}

# Logging destination and cadence.
logging_settings = {
    "report_to": "wandb",
    "logging_steps": 100,
}

training_args = TrainingArguments(
    output_dir='./results',
    **optimizer_settings,
    **batching_settings,
    **checkpoint_settings,
    **memory_settings,
    **logging_settings,
)
# Metric objects are loaded lazily and kept here so that each one is loaded
# once per process rather than on every evaluation pass.
_LOADED_METRICS = {}


def _get_metric(name):
    """Return the `evaluate` metric called *name*, loading it only once."""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = evaluate.load(name)
    return _LOADED_METRICS[name]


def compute_metrics(eval_pred):
    """Compute macro precision/recall/F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score per
            class along the last axis; ``labels`` holds the reference class
            ids.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1-score"``
        (all macro-averaged) and ``"accuracy"``.
    """
    logits, labels = eval_pred
    # Predictions can arrive as a tuple when a model emits several output
    # tensors; in that case the first entry is taken to be the class scores.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    scores = {}
    # (metric to load / key in its result, key reported to the Trainer)
    for metric_name, reported_key in (
        ("precision", "precision"),
        ("recall", "recall"),
        ("f1", "f1-score"),
    ):
        result = _get_metric(metric_name).compute(
            predictions=predictions,
            references=labels,
            average='macro',
        )
        scores[reported_key] = result[metric_name]

    scores['accuracy'] = _get_metric("accuracy").compute(
        predictions=predictions,
        references=labels,
    )["accuracy"]
    return scores
# Initialize the Trainer with the LoRA-wrapped model, both tokenized splits
# and the metric callback used at every evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the model and its tokenizer side by side so the directory can be
# loaded on its own later.
model.save_pretrained('trained_gemma_model')
tokenizer.save_pretrained('trained_gemma_model')

# Evaluate the model on the validation split. The returned metrics dict was
# previously discarded; print it so the final scores are visible in the
# script's output.
eval_metrics = trainer.evaluate()
print(eval_metrics)