# sentiment-analysis / finetune.py
import torch
from torch.optim import AdamW  # transformers' AdamW is deprecated; use the torch optimizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
model_name = "distilbert-base-uncased"

# Read the training data: one text column plus six binary label columns.
df = pd.read_csv('train.csv')
train_texts = df["comment_text"].values
train_labels = df[df.columns[2:]].values

# Hold out 20% of the data for validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)
# Dataset that tokenizes each text on the fly and pairs it with its label vector.
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __getitem__(self, idx):
        # Tokenize a single example, padding/truncating to the model's max length.
        encodings = tokenizer(self.texts[idx], truncation=True, padding="max_length")
        item = {key: torch.tensor(val) for key, val in encodings.items()}
        # Multi-label targets must be floats for the BCE-with-logits loss.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

    def __len__(self):
        return len(self.labels)
# Fast tokenizer matching the checkpoint; TextDataset.__getitem__ uses it as a global.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# Wrap the raw arrays in the Dataset class defined above.
train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)
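# Sanity check (an addition, illustrative only): each item should carry a
# 6-dimensional float label vector alongside the token tensors.
sample = train_dataset[0]
assert sample['labels'].shape == (6,) and sample['labels'].dtype == torch.float32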
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Multi-label head: the six label columns are independent, so the model scores
# each with a sigmoid and BCE-with-logits loss rather than a softmax over classes.
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=6, problem_type="multi_label_classification")
model.to(device)
model.train()
# Training hyperparameters.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)
# Fine-tuning loop: one epoch of standard supervised training.
for epoch in range(1):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # The model computes BCEWithLogitsLoss internally when labels are passed.
        loss = outputs.loss
        loss.backward()
        optim.step()

model.eval()
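# The validation split is otherwise unused; a minimal evaluation sketch
# (an addition, not part of the original script) reporting mean validation loss:
val_loader = DataLoader(val_dataset, batch_size=16)
val_loss = 0.0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        val_loss += outputs.loss.item()
print(f"mean validation loss: {val_loss / len(val_loader):.4f}")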
# Save the fine-tuned weights and the tokenizer so they can be reloaded later.
model.save_pretrained("sentiment_custom_model")
tokenizer.save_pretrained("sentiment_tokenizer")
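# Usage sketch (an addition, not part of the original script): reload the saved
# artifacts and score a new comment. For multi-label classification the logits
# go through a sigmoid; 0.5 is an assumed decision threshold, not tuned here.
loaded_tokenizer = DistilBertTokenizerFast.from_pretrained("sentiment_tokenizer")
loaded_model = DistilBertForSequenceClassification.from_pretrained("sentiment_custom_model")
loaded_model.eval()
with torch.no_grad():
    enc = loaded_tokenizer("You are a wonderful person!", return_tensors="pt", truncation=True)
    probs = torch.sigmoid(loaded_model(**enc).logits)
    preds = (probs > 0.5).int()  # one 0/1 prediction per label column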