Spaces:

inflaton-ai
/

logical-reasoning

Build error

App Files Files Community

nicoleathy committed on Jul 18, 2024

Commit

9b7e8b1

verified ·

1 Parent(s): f11a0ff

Upload 2 files

Browse files

Files changed (2) hide show

competition/gemma.py +135 -0
competition/llama.py +150 -0

competition/gemma.py ADDED Viewed

	@@ -0,0 +1,135 @@

+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+from datasets import Dataset
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from peft import get_peft_model, LoraConfig, TaskType
+import evaluate
+import numpy as np
+# Load the dataset
+file_path = 'train_en.csv'
+dataset = pd.read_csv(file_path)
+# Map labels to expected responses
+label_mapping = {
+    "Yes": 0,
+    "No": 1,
+    "It doesn't matter": 2,
+    "Unimportant": 2,  # Assuming "unimportant" is synonymous with "It doesn't matter"
+    "Incorrect questioning": 3,
+    "Correct answers": 4
+}
+# Apply label mapping
+dataset['label'] = dataset['label'].map(label_mapping)
+# Handle NaN values: Drop rows where label is NaN
+dataset = dataset.dropna(subset=['label'])
+# Ensure labels are integers
+dataset['label'] = dataset['label'].astype(int)
+# Format puzzle, truth, text into rows
+dataset['combined_text'] = (
+    "==========================================\n"
+    "puzzle: " + dataset['puzzle'] + "\n"
+    "==========================================\n"
+    "truth: " + dataset['truth'] + "\n"
+    "==========================================\n"
+    "text: " + dataset['text']
+)
+# Split the dataset into training and validation sets
+train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)
+# Convert the dataframes to datasets
+train_dataset = Dataset.from_pandas(train_df)
+val_dataset = Dataset.from_pandas(val_df)
+# Load the tokenizer and model
+model_name = "google/gemma-2-9b"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
+# Tokenize the data
+def tokenize_function(examples):
+    return tokenizer(examples['combined_text'], truncation=True, padding='max_length', max_length=128)
+train_dataset = train_dataset.map(tokenize_function, batched=True)
+val_dataset = val_dataset.map(tokenize_function, batched=True)
+# Set the format for PyTorch
+train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+# Define LoRA configuration
+lora_config = LoraConfig(
+    task_type=TaskType.SEQ_CLS,
+    r=16,
+    lora_alpha=16,
+    target_modules=["q_proj", "v_proj"],
+    lora_dropout=0.05,
+    bias="none"
+)
+# Apply LoRA to the model
+model = get_peft_model(model, lora_config)
+model.print_trainable_parameters()
+# Training arguments
+training_args = TrainingArguments(
+    output_dir='./results',
+    learning_rate=1e-4,
+    lr_scheduler_type="linear",
+    warmup_ratio=0.1,
+    max_grad_norm=0.3,
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=4,
+    num_train_epochs=3,
+    weight_decay=0.001,
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    load_best_model_at_end=True,
+    report_to="wandb",
+    fp16=True,
+    gradient_checkpointing=True,
+    gradient_accumulation_steps=4,
+    dataloader_num_workers=4,
+    logging_steps=100,
+    save_total_limit=2,
+)
+def compute_metrics(eval_pred):
+    precision_metric = evaluate.load("precision")
+    recall_metric = evaluate.load("recall")
+    f1_metric = evaluate.load("f1")
+    accuracy_metric = evaluate.load("accuracy")
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    precision = precision_metric.compute(predictions=predictions, references=labels, average='macro')["precision"]
+    recall = recall_metric.compute(predictions=predictions, references=labels, average='macro')["recall"]
+    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')["f1"]
+    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
+    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}
+# Initialize the Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,
+    compute_metrics=compute_metrics
+)
+# Train the model
+trainer.train()
+# Save the model
+model.save_pretrained('trained_gemma_model')
+tokenizer.save_pretrained('trained_gemma_model')
+# Evaluate the model
+trainer.evaluate()

competition/llama.py ADDED Viewed

	@@ -0,0 +1,150 @@

+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+from datasets import Dataset
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from peft import get_peft_model, LoraConfig, TaskType
+import evaluate
+import numpy as np
+from tqdm import tqdm
+# Load the dataset
+file_path = 'train_en.csv'
+dataset = pd.read_csv(file_path)
+# Map labels to expected responses
+label_mapping = {
+    "Yes": 0,
+    "No": 1,
+    "It doesn't matter": 2,
+    "Unimportant": 2,
+    "Incorrect questioning": 3,
+    "Correct answers": 4
+}
+# Apply label mapping
+dataset['label'] = dataset['label'].map(label_mapping)
+# Handle NaN values: Drop rows where label is NaN
+dataset = dataset.dropna(subset=['label'])
+# Ensure labels are integers
+dataset['label'] = dataset['label'].astype(int)
+# Format puzzle, truth, text into rows
+dataset['combined_text'] = (
+    "==========================================\n"
+    "puzzle: " + dataset['puzzle'] + "\n"
+    "==========================================\n"
+    "truth: " + dataset['truth'] + "\n"
+    "==========================================\n"
+    "text: " + dataset['text']
+)
+# Split the dataset into training and validation sets
+train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)
+# Convert the dataframes to datasets
+train_dataset = Dataset.from_pandas(train_df)
+val_dataset = Dataset.from_pandas(val_df)
+# Load the tokenizer and model
+model_name = "meta-llama/Meta-Llama-3-8B"  # Replace with the actual model name
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
+# Add a padding token if it's not already present
+if tokenizer.pad_token is None:
+    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
+    model.resize_token_embeddings(len(tokenizer))
+    tokenizer.pad_token = tokenizer.eos_token  # Set the padding token explicitly
+# Ensure the padding token is set correctly in the model configuration
+model.config.pad_token_id = tokenizer.pad_token_id
+# Tokenize the data
+def tokenize_function(examples):
+    return tokenizer(examples['combined_text'], truncation=True, padding='max_length', max_length=128)
+train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=4)  # Use multiprocessing
+val_dataset = val_dataset.map(tokenize_function, batched=True, num_proc=4)
+# Set the format for PyTorch
+train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+# Define LoRA configuration
+lora_config = LoraConfig(
+    task_type=TaskType.SEQ_CLS,
+    r=16,
+    lora_alpha=16,
+    target_modules=["q_proj", "v_proj"],
+    lora_dropout=0.05,
+    bias="none"
+)
+# Apply LoRA to the model
+model = get_peft_model(model, lora_config)
+model.print_trainable_parameters()
+# Training arguments
+training_args = TrainingArguments(
+    output_dir='./results',
+    learning_rate=1e-4,
+    lr_scheduler_type="linear",
+    warmup_ratio=0.1,
+    max_grad_norm=0.3,
+    per_device_train_batch_size=8,  # Increase batch size if memory allows
+    per_device_eval_batch_size=8,
+    num_train_epochs=3,
+    weight_decay=0.001,
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    load_best_model_at_end=True,
+    report_to="wandb",
+    fp16=True,
+    gradient_checkpointing=True,
+    gradient_accumulation_steps=2,  # Adjust based on memory constraints
+    dataloader_num_workers=4,
+    logging_steps=100,
+    save_total_limit=2,
+)
+def compute_metrics(eval_pred):
+    precision_metric = evaluate.load("precision")
+    recall_metric = evaluate.load("recall")
+    f1_metric = evaluate.load("f1")
+    accuracy_metric = evaluate.load("accuracy")
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")["precision"]
+    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")["recall"]
+    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]
+    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
+    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}
+# Initialize the Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,
+    compute_metrics=compute_metrics
+)
+# Train the model with progress bar
+trainer.train()
+# Save the model
+model.save_pretrained('trained_llama_model')
+tokenizer.save_pretrained('trained_llama_model')
+# Evaluate the model with progress bar
+eval_results = trainer.evaluate()
+print(eval_results)