import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd

from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Train on GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
# Six output labels, one per toxicity category; the custom loss further down
# trains them as independent binary decisions (multilabel).
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6).to(device)
max_len = 200
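
# Alternative sketch (an assumption, needs a transformers release that supports
# problem_type): passing problem_type="multi_label_classification" makes the
# model compute BCEWithLogitsLoss itself (labels must then be float tensors),
# which would make the custom MultilabelTrainer below optional:
# model = BertForSequenceClassification.from_pretrained(
#     model_name, num_labels=6, problem_type="multi_label_classification"
# ).to(device)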

training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)
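
# Assumption: adding evaluation_strategy="epoch" to the arguments above would
# make the Trainer evaluate on eval_dataset at the end of every epoch.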


class TokenizerDataset(Dataset):
    """Thin wrapper exposing a list of raw strings as a torch Dataset."""

    def __init__(self, strings):
        self.strings = strings

    def __getitem__(self, idx):
        return self.strings[idx]

    def __len__(self):
        return len(self.strings)


train_data = pd.read_csv("data/train.csv")
print(train_data)  # quick sanity check of the loaded frame

train_text = train_data["comment_text"]
train_labels = train_data[["toxic", "severe_toxic",
                           "obscene", "threat",
                           "insult", "identity_hate"]]

test_text = pd.read_csv("data/test.csv")["comment_text"]
test_labels = pd.read_csv("data/test_labels.csv")[["toxic", "severe_toxic",
                                                   "obscene", "threat",
                                                   "insult", "identity_hate"]]

# Plain Python lists are enough for the Dataset classes below.
train_text = train_text.values.tolist()
train_labels = train_labels.values.tolist()
test_text = test_text.values.tolist()
test_labels = test_labels.values.tolist()


class TweetDataset(Dataset):
    """Tokenizes one comment per access and pairs it with its six labels."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        self.tok = tokenizer

    def __getitem__(self, idx):
        # Tokenize lazily; self.encodings is a TokenizerDataset, so indexing it
        # yields the raw comment string.
        encoding = self.tok(self.encodings[idx], truncation=True,
                            padding="max_length", max_length=max_len)
        item = {key: torch.tensor(val) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
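
# Optional speed-up (a sketch, not used below): __getitem__ above re-tokenizes a
# comment on every access; tokenizing the whole corpus once up front avoids that:
# train_encodings = tokenizer(train_text, truncation=True,
#                             padding="max_length", max_length=max_len)
# Item i would then be {k: torch.tensor(v[i]) for k, v in train_encodings.items()}.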


train_strings = TokenizerDataset(train_text)
test_strings = TokenizerDataset(test_text)

# Note: Trainer builds its own DataLoaders from the datasets handed to it, so
# these two loaders are not used by the training run below.
train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=True)

train_dataset = TweetDataset(train_strings, train_labels)
test_dataset = TweetDataset(test_strings, test_labels)

# Sanity check: texts and labels must line up one-to-one.
print(len(train_dataset.labels))
print(len(train_strings))


class MultilabelTrainer(Trainer):
    """Swaps the default cross-entropy for BCE so the six labels are predicted
    independently (multilabel rather than multiclass)."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs absorbs extra arguments (e.g. num_items_in_batch) that newer
        # Trainer releases pass to compute_loss.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss


trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()
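
# A minimal inference sketch (the sample comment is made up): multilabel
# predictions come from an element-wise sigmoid over the six logits rather than
# a softmax, here with an assumed 0.5 decision threshold.
model.eval()
with torch.no_grad():
    enc = tokenizer("example comment to score", truncation=True,
                    padding="max_length", max_length=max_len,
                    return_tensors="pt").to(device)
    probs = torch.sigmoid(model(**enc).logits)
    print((probs > 0.5).int())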