In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd

from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

2023-04-23 21:39:14.489766: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
# num_labels=6: one output logit per label column selected in the data cell
# (toxic, severe_toxic, obscene, threat, insult, identity_hate).
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)
# Use the first GPU when one is present, otherwise stay on the CPU instead of
# crashing on a hard-coded "cuda:0".
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Every comment is padded / truncated to this many tokens.
max_len = 200

training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# dataset class that inherits from torch.utils.data.Dataset

 
class TokenizerDataset(Dataset):
    """Minimal ``Dataset`` view over an indexable collection of raw strings."""

    def __init__(self, strings):
        """Keep a reference to ``strings``; the collection is not copied."""
        self.strings = strings

    def __len__(self):
        """Return how many entries ``strings`` holds."""
        size = len(self.strings)
        return size

    def __getitem__(self, idx):
        """Return the entry of ``strings`` stored at position ``idx``."""
        sample = self.strings[idx]
        return sample
 

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [4]:
# The six binary target columns, listed once and reused for train and test.
label_columns = [
    "toxic", "severe_toxic",
    "obscene", "threat",
    "insult", "identity_hate",
]

train_data = pd.read_csv("data/train.csv")
print(train_data)

# data preprocessing: convert every column selection to plain Python lists
# (texts -> list of str, labels -> list of per-row label lists).
train_text = train_data["comment_text"].values.tolist()
train_labels = train_data[label_columns].values.tolist()

# NOTE(review): the test labels are used exactly as stored in the CSV; confirm
# they only contain 0/1 before treating them as targets for a BCE loss.
test_text = pd.read_csv("data/test.csv")["comment_text"].values.tolist()
test_labels = pd.read_csv("data/test_labels.csv")[label_columns].values.tolist()


 id comment_text \
0 0000997932d777bf Explanation\nWhy the edits made under my usern... 
1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 
2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 
3 0001b41b1c6bb37e "\nMore\nI can't make any real suggestions on ... 
4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 
... ... ... 
159566 ffe987279560d7ff ":::::And for the second time of asking, when ... 
159567 ffea4adeee384e90 You should be ashamed of yourself \n\nThat is ... 
159568 ffee36eab5c267c9 Spitzer \n\nUmm, theres no actual article for ... 
159569 fff125370e4aaaf3 And it looks like it was actually you who put ... 
159570 fff46fc426af1f9a "\nAnd ... I really don't think you understand... 

 toxic severe_toxic obscene threat insult identity_hate 
0 0 0 0 0 0 0 
1 0 0 0 0 0 0 
2 0 0 0 0 0 0 
3 0 0 0 0 0 0 
4 0 0 0 0 0 0 
... ... ... ... ... ... ... 
159566 0 0 0 0 0 0 
159567 0 0 0 0 0 0 
159568 0 0 0 0 0 0 
159569 0 0 0 0 0 0 
1

In [10]:
# prepare tokenizer and dataset

class TweetDataset(Dataset):
    """Dataset that tokenizes one raw string per item and pairs it with its labels.

    Args:
        encodings: Object exposing a ``strings`` sequence of raw texts
            (for example a ``TokenizerDataset``).
        labels: Sequence with one label row per text.
        tok: Optional tokenizer callable; defaults to the notebook-level
            ``tokenizer``.
        max_length: Optional padded / truncated length; defaults to the
            notebook-level ``max_len`` at the time an item is fetched.
    """

    def __init__(self, encodings, labels, tok=None, max_length=None):
        self.encodings = encodings
        self.labels = labels
        self.tok = tokenizer if tok is None else tok
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return a dict of CPU tensors for sample ``idx``, including ``'labels'``."""
        limit = max_len if self.max_length is None else self.max_length
        # Tensors are left on the CPU: the Trainer / training loop moves whole
        # batches to the target device, and CUDA tensors created here would
        # break DataLoader worker processes and memory pinning.
        encoding = self.tok(
            self.encodings.strings[idx],
            truncation=True,
            padding="max_length",
            max_length=limit,
        )
        item = {key: torch.tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

# Tokenizes on the fly; currently mirrors TweetDataset and is kept as a
# separate name so existing references keep working.
class TweetDataset2(Dataset):
    """Dataset that tokenizes one raw string per item and pairs it with its labels.

    Args:
        encodings: Object exposing a ``strings`` sequence of raw texts
            (for example a ``TokenizerDataset``).
        labels: Sequence with one label row per text.
        tok: Optional tokenizer callable; defaults to the notebook-level
            ``tokenizer``.
        max_length: Optional padded / truncated length; defaults to the
            notebook-level ``max_len`` at the time an item is fetched.
    """

    def __init__(self, encodings, labels, tok=None, max_length=None):
        self.encodings = encodings
        self.labels = labels
        self.tok = tokenizer if tok is None else tok
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return a dict of CPU tensors for sample ``idx``, including ``'labels'``."""
        limit = max_len if self.max_length is None else self.max_length
        # No per-item debug printing and no device transfer here: batches are
        # moved to the target device by the training loop, not by the dataset.
        encoding = self.tok(
            self.encodings.strings[idx],
            truncation=True,
            padding="max_length",
            max_length=limit,
        )
        item = {key: torch.tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)




# Raw-text views of the two splits and batch loaders over them.
train_strings = TokenizerDataset(train_text)
test_strings = TokenizerDataset(test_text)

train_dataloader = DataLoader(train_strings, batch_size=16, shuffle=True)
# The test loader is not shuffled so its batches stay aligned with
# `test_labels`, which is kept as a separate, position-indexed list.
test_dataloader = DataLoader(test_strings, batch_size=16, shuffle=False)

# Tokenize each split once, padding / truncating every text to `max_len`.
# `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
# The encodings stay on the CPU: the Trainer transfers each batch to the
# model's device, and CUDA tensors inside a dataset cannot be pinned or shared
# with DataLoader workers.
train_encodings = tokenizer(
    train_text,
    max_length=max_len,
    padding="max_length",
    truncation=True,
    return_token_type_ids=False,
    return_tensors="pt",
)
test_encodings = tokenizer(
    test_text,
    max_length=max_len,
    padding="max_length",
    truncation=True,
    return_token_type_ids=False,
    return_tensors="pt",
)

# train_encodings = tokenizer(train_text, truncation=True, padding=True)
# test_encodings = tokenizer(test_text, truncation=True, padding=True)

In [15]:
# no tokenizer
class TweetDataset3(Dataset):
    """Dataset over pre-computed encodings; no tokenization happens per item.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a per-sample
            indexable container such as a 2-D tensor or a list of rows.
        labels: Sequence with one label row per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        # Not used by this class; kept so the attribute remains available.
        self.tok = tokenizer

    def __getitem__(self, idx):
        """Return the fields of sample ``idx`` plus its ``'labels'`` tensor."""
        # Select row `idx` of every field. Without the index each item would
        # carry the encodings of the entire dataset.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)



# Wrap the pre-computed encodings and their label rows for each split.
train_dataset = TweetDataset3(encodings=train_encodings, labels=train_labels)
test_dataset = TweetDataset3(encodings=test_encodings, labels=test_labels)

# Sanity check: the label count and the raw-text count are printed on
# separate lines so they can be compared by eye.
print(len(train_dataset.labels), len(train_strings), sep="\n")


class MultilabelTrainer(Trainer):
    """Trainer whose loss treats every label as an independent binary target."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute a BCE-with-logits loss over all labels of the batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                remaining entries are passed to ``model``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass this keyword; not used by this loss.

        Returns:
            The scalar loss, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Read the label count from the trainer's own model reference, as the
        # `model` argument may be a wrapper around it.
        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(
            logits.view(-1, num_labels),
            labels.float().view(-1, num_labels),
        )
        return (loss, outputs) if return_outputs else loss


# training
# Collect the constructor arguments first, then build the trainer from them.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
}
trainer = MultilabelTrainer(**trainer_kwargs)

159571
159571


In [None]:
# Run the training loop of `trainer`; as the last expression of the cell, its
# return value is displayed by the notebook.
trainer.train()