import pandas as pd
import torch
from datasets import load_dataset
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    Trainer,
    TrainingArguments,
)


def tokenize_function(examples):
    # Tokenize raw review text, padding/truncating to a fixed length of 512 tokens
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)


if __name__ == "__main__":
    imdb = load_dataset("imdb")

    # Preprocess data
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    tokenized_imdb = imdb.map(tokenize_function, batched=True)
    tokenized_imdb.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

    # Define model
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

    # Set up TrainingArguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    # Create a Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_imdb['train'],
        eval_dataset=tokenized_imdb['test'],
    )

    # Start training
    trainer.train()

    # Use the Trainer's predict method to get predictions on the test set
    predictions = trainer.predict(tokenized_imdb['test'])

    # Extract the logits and convert them to probabilities using softmax
    logits = predictions.predictions
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=1).numpy()

    # Evaluate the model and print accuracy on the test set
    labels = torch.tensor(predictions.label_ids)
    preds = torch.tensor(probs).argmax(dim=1)
    acc = (preds == labels).float().mean().item()
    print("Accuracy: ", acc)

    # Save the per-class probabilities to submission.csv
    submission = pd.DataFrame(probs, columns=['negative', 'positive'])
    submission.to_csv('submission.csv', index_label='idx')
    print('Predictions saved to submission.csv')