# Source: MLR-Copilot / workspaces / imdb / backup / train.py_2024-07-31_20-40-01
# Hugging Face file-view metadata: uploaded by Lim0011 ("Upload 251 files"),
# commit 85e3d20 (verified), raw file size 2.72 kB.
from datasets import load_dataset
import torch
import pandas as pd
from transformers import DistilBertTokenizer
from transformers import DistilBertForSequenceClassification
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import Trainer, TrainingArguments
def tokenize_function(examples):
    """Tokenize the ``'text'`` entries of a batch for DistilBERT.

    Relies on the module-level ``tokenizer`` that the ``__main__`` section
    creates before this function is handed to ``Dataset.map``.

    Args:
        examples: A mapping with a ``'text'`` entry; the script calls this
            through ``map(..., batched=True)``.

    Returns:
        Whatever ``tokenizer`` returns for those texts, with every sequence
        padded or truncated to exactly 512 tokens.
    """
    texts = examples['text']
    # Fixed-length encoding: pad short reviews and cut long ones at 512 tokens.
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(texts, **encoding_options)
def summarize_predictions(logits, label_ids):
    """Turn raw classifier logits into a submission table and an accuracy.

    Args:
        logits: Array-like of shape ``(num_examples, num_classes)`` holding
            the unnormalised class scores.
        label_ids: Array-like of shape ``(num_examples,)`` holding the true
            class index of every example.

    Returns:
        A ``(submission, accuracy)`` tuple. ``submission`` is a DataFrame with
        one row per example and one integer-named column per class containing
        the softmax probabilities. ``accuracy`` is the fraction of examples
        whose most probable class equals the label.
    """
    probs = torch.softmax(torch.as_tensor(logits, dtype=torch.float32), dim=1)
    labels = torch.as_tensor(label_ids)
    num_correct = (probs.argmax(dim=1) == labels).sum().item()
    submission = pd.DataFrame(probs.numpy(), columns=list(range(probs.shape[1])))
    return submission, num_correct / len(labels)


if __name__ == "__main__":
    imdb = load_dataset("imdb")

    # Preprocess data. ``tokenizer`` must be a module-level name because
    # ``tokenize_function`` looks it up as a global.
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    tokenized_imdb = imdb.map(tokenize_function, batched=True)
    tokenized_imdb.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

    # Define model
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

    # Set up TrainingArguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    # Create a Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_imdb['train'],
        eval_dataset=tokenized_imdb['test'],
    )

    # Start training
    trainer.train()

    # Evaluate on the test set with a single batched pass. The former
    # per-example loop fed raw strings to the model (a template placeholder)
    # and failed after training had finished; the Trainer output already
    # carries both the logits and the reference labels.
    predictions = trainer.predict(tokenized_imdb['test'])
    submission, accuracy = summarize_predictions(
        predictions.predictions, predictions.label_ids
    )
    print("Accuracy: ", accuracy)

    # Save per-class probabilities once, in the layout the script's final
    # write used: integer class columns plus an 'idx' index column.
    submission.to_csv('submission.csv', index_label='idx')
    print('Predictions saved to submission.csv')