nicoleathy committed on
Commit
9b7e8b1
·
verified ·
1 Parent(s): f11a0ff

Upload 2 files

Browse files
Files changed (2) hide show
  1. competition/gemma.py +135 -0
  2. competition/llama.py +150 -0
competition/gemma.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
2
+ from datasets import Dataset
3
+ import pandas as pd
4
+ from sklearn.model_selection import train_test_split
5
+ from peft import get_peft_model, LoraConfig, TaskType
6
+ import evaluate
7
+ import numpy as np
8
+
9
# Load the dataset
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map the textual answer labels onto integer class ids.
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,  # Assuming "unimportant" is synonymous with "It doesn't matter"
    "Incorrect questioning": 3,
    "Correct answers": 4
}
dataset['label'] = dataset['label'].map(label_mapping)

# Labels that are missing or absent from label_mapping became NaN above; drop
# those rows. The explicit .copy() gives an independent frame so that the
# column assignments below never write into a view of the original
# (avoids pandas' SettingWithCopyWarning).
dataset = dataset.dropna(subset=['label']).copy()

# Ensure labels are integers
dataset['label'] = dataset['label'].astype(int)

# A NaN in any text column would make the whole concatenation NaN, and the
# tokenizer cannot process a missing value. Treat missing text as empty.
puzzle_text = dataset['puzzle'].fillna('').astype(str)
truth_text = dataset['truth'].fillna('').astype(str)
question_text = dataset['text'].fillna('').astype(str)

# Format puzzle, truth, text into one delimited string per row
separator = "==========================================\n"
dataset['combined_text'] = (
    separator + "puzzle: " + puzzle_text + "\n"
    + separator + "truth: " + truth_text + "\n"
    + separator + "text: " + question_text
)

# Split the dataset into training and validation sets (80/20, fixed seed)
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the dataframes to datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load the tokenizer and model; num_labels matches the 5 distinct class ids
# in label_mapping (0-4).
model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
53
+
54
# Tokenize the data
def tokenize_function(examples, max_length=128):
    """Tokenize the ``combined_text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.

    Args:
        examples: Batch mapping that contains a ``'combined_text'`` entry.
        max_length: Fixed token length of every encoded sequence. Defaults
            to 128, the value previously hard-coded here.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    # NOTE(review): combined_text places the puzzle and truth before the
    # question text, so a short max_length may truncate the question away.
    # Confirm against the token lengths of the data.
    return tokenizer(
        examples['combined_text'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )
57
+
58
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch: expose only the columns listed here as tensors.
model_columns = ['input_ids', 'attention_mask', 'label']
train_dataset.set_format(type='torch', columns=model_columns)
val_dataset.set_format(type='torch', columns=model_columns)

# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none"
)

# gradient_checkpointing=True is requested in the training arguments below,
# while LoRA freezes the base weights (embeddings included). Make the
# embedding outputs require grad so that gradients can propagate through the
# checkpointed layers back to the adapter weights.
model.enable_input_require_grads()

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.001,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,
    dataloader_num_workers=4,
    logging_steps=100,
    save_total_limit=2,
)
101
+
102
# Metric objects are loaded on first use and reused afterwards, so that
# evaluate.load() is not repeated on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the ``evaluate`` metric called ``name``, loading it only once."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute macro precision/recall/F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``"precision"``, ``"recall"``, ``"f1-score"`` and
        ``"accuracy"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    scores = {}
    # (key in the returned dict, name of the evaluate metric / its result key)
    for result_key, metric_name in (
        ("precision", "precision"),
        ("recall", "recall"),
        ("f1-score", "f1"),
    ):
        scores[result_key] = _get_metric(metric_name).compute(
            predictions=predictions, references=labels, average='macro'
        )[metric_name]

    scores['accuracy'] = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    return scores
117
+
118
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save the model and tokenizer side by side in one directory
model.save_pretrained('trained_gemma_model')
tokenizer.save_pretrained('trained_gemma_model')

# Evaluate the model and show the metrics on stdout; previously the returned
# metrics were discarded. This mirrors what llama.py does.
eval_results = trainer.evaluate()
print(eval_results)
competition/llama.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
2
+ from datasets import Dataset
3
+ import pandas as pd
4
+ from sklearn.model_selection import train_test_split
5
+ from peft import get_peft_model, LoraConfig, TaskType
6
+ import evaluate
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+
10
# Load the dataset
file_path = 'train_en.csv'
dataset = pd.read_csv(file_path)

# Map the textual answer labels onto integer class ids.
label_mapping = {
    "Yes": 0,
    "No": 1,
    "It doesn't matter": 2,
    "Unimportant": 2,
    "Incorrect questioning": 3,
    "Correct answers": 4
}
dataset['label'] = dataset['label'].map(label_mapping)

# Labels that are missing or absent from label_mapping became NaN above; drop
# those rows. The explicit .copy() gives an independent frame so that the
# column assignments below never write into a view of the original
# (avoids pandas' SettingWithCopyWarning).
dataset = dataset.dropna(subset=['label']).copy()

# Ensure labels are integers
dataset['label'] = dataset['label'].astype(int)

# A NaN in any text column would make the whole concatenation NaN, and the
# tokenizer cannot process a missing value. Treat missing text as empty.
puzzle_text = dataset['puzzle'].fillna('').astype(str)
truth_text = dataset['truth'].fillna('').astype(str)
question_text = dataset['text'].fillna('').astype(str)

# Format puzzle, truth, text into one delimited string per row
separator = "==========================================\n"
dataset['combined_text'] = (
    separator + "puzzle: " + puzzle_text + "\n"
    + separator + "truth: " + truth_text + "\n"
    + separator + "text: " + question_text
)

# Split the dataset into training and validation sets (80/20, fixed seed)
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the dataframes to datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load the tokenizer and model; num_labels matches the 5 distinct class ids
# in label_mapping (0-4).
model_name = "meta-llama/Meta-Llama-3-8B"  # Replace with the actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

# Add a padding token if it's not already present: reuse the EOS token.
# NOTE(review): the indentation of this section was lost in the diff view;
# all three statements are assumed to belong to the `if` — confirm.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))
    tokenizer.pad_token = tokenizer.eos_token  # Set the padding token explicitly

# Ensure the padding token is set correctly in the model configuration
model.config.pad_token_id = tokenizer.pad_token_id
63
+
64
# Tokenize the data
def tokenize_function(examples, max_length=128):
    """Tokenize the ``combined_text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.

    Args:
        examples: Batch mapping that contains a ``'combined_text'`` entry.
        max_length: Fixed token length of every encoded sequence. Defaults
            to 128, the value previously hard-coded here.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    # NOTE(review): combined_text places the puzzle and truth before the
    # question text, so a short max_length may truncate the question away.
    # Confirm against the token lengths of the data.
    return tokenizer(
        examples['combined_text'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )
67
+
68
train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=4)  # Use multiprocessing
val_dataset = val_dataset.map(tokenize_function, batched=True, num_proc=4)

# Set the format for PyTorch: expose only the columns listed here as tensors.
model_columns = ['input_ids', 'attention_mask', 'label']
train_dataset.set_format(type='torch', columns=model_columns)
val_dataset.set_format(type='torch', columns=model_columns)

# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none"
)

# gradient_checkpointing=True is requested in the training arguments below,
# while LoRA freezes the base weights (embeddings included). Make the
# embedding outputs require grad so that gradients can propagate through the
# checkpointed layers back to the adapter weights.
model.enable_input_require_grads()

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=8,  # Increase batch size if memory allows
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.001,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=2,  # Adjust based on memory constraints
    dataloader_num_workers=4,
    logging_steps=100,
    save_total_limit=2,
)
111
+
112
# Metric objects are loaded on first use and reused afterwards, so that
# evaluate.load() is not repeated on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the ``evaluate`` metric called ``name``, loading it only once."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute weighted precision/recall/F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``"precision"``, ``"recall"``, ``"f1-score"`` and
        ``"accuracy"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    scores = {}
    # (key in the returned dict, name of the evaluate metric / its result key)
    for result_key, metric_name in (
        ("precision", "precision"),
        ("recall", "recall"),
        ("f1-score", "f1"),
    ):
        scores[result_key] = _get_metric(metric_name).compute(
            predictions=predictions, references=labels, average="weighted"
        )[metric_name]

    scores['accuracy'] = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    return scores
127
+
128
# Wire the LoRA-wrapped model, the training arguments, both dataset splits
# and the metric callback into a Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Run the fine-tuning loop.
trainer.train()

# Persist the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained('trained_llama_model')

# Run a final evaluation on the validation split and show the metrics.
eval_results = trainer.evaluate()
print(eval_results)
147
+
148
+
149
+
150
+