import torch
import pandas as pd
from transformers import BartTokenizer, BartForSequenceClassification

# Use a GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
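# Optional: fix the seed so batch shuffling and dropout are deterministic
# across runs.
torch.manual_seed(0)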

# Jokes and non-jokes live in separate '|'-delimited CSV files with a header
# row; read both and combine them into a single training set.
joke_data = pd.read_csv('jokes.csv', sep='|', names=["joke", "label"], skiprows=1)
noJoke_data = pd.read_csv('not_jokes.csv', sep='|', names=["joke", "label"], skiprows=1)
frames = [joke_data, noJoke_data]
train_data = pd.concat(frames)

test_data = pd.read_csv('test_jokes.csv', sep='|', names=["joke", "label"], skiprows=1)
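# Optional sanity check: inspect the class balance of the two splits.
print(train_data['label'].value_counts())
print(test_data['label'].value_counts())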

numCategories = 2

# Pretrained BART with a freshly initialized two-way classification head
# (joke vs. not-joke).
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForSequenceClassification.from_pretrained('facebook/bart-large', num_labels=numCategories)
model = model.to(device)

# One-hot encode the labels. With float one-hot targets, transformers treats
# this as a multi-label problem (BCEWithLogitsLoss under the hood), which
# still behaves sensibly for this binary task.
one_hot_train = pd.get_dummies(train_data['label'])
one_hot_test = pd.get_dummies(test_data['label'])

# Tokenize with padding and truncation so every example fits the model's
# maximum input length and batches have a uniform shape.
inputs_train = tokenizer(list(train_data['joke']), return_tensors='pt', padding=True, truncation=True)
labels_train = torch.tensor(one_hot_train.values, dtype=torch.float32)
dataset_train = torch.utils.data.TensorDataset(inputs_train['input_ids'], inputs_train['attention_mask'], labels_train)

inputs_test = tokenizer(list(test_data['joke']), return_tensors='pt', padding=True, truncation=True)
labels_test = torch.tensor(one_hot_test.values, dtype=torch.float32)
dataset_test = torch.utils.data.TensorDataset(inputs_test['input_ids'], inputs_test['attention_mask'], labels_test)
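
# Optional sanity check: round-trip the first training example through the
# tokenizer to confirm the encoding looks reasonable (assumes train_data has
# at least one row).
print("Decoded sample:", tokenizer.decode(inputs_train['input_ids'][0], skip_special_tokens=True))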

# Hyperparameters: a small learning rate is typical when fine-tuning a large
# pretrained transformer.
epochs = 10
batch_size = 32
learning_rate = 1e-5

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
data_loader_train = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=batch_size)

print(f"\nTraining on {len(train_data)} examples\n")
print("Num. Parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))

for epoch in range(epochs):

    # Training pass: dropout on, gradients tracked.
    model.train()
    avg_loss = 0.0
    for step, batch in enumerate(data_loader_train):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        # Passing labels makes the model compute the loss internally.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        avg_loss += loss.item()
        if step % 100 == 0:
            print(f"Step {step}/{len(data_loader_train)} Loss {loss.item():.4f} Avg Train Loss {avg_loss / (step + 1):.4f}")
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    avg_loss = avg_loss / len(data_loader_train)

    print(f"Epoch {epoch+1} Avg Train Loss {avg_loss:.4f}")

    # Evaluation pass on the held-out test set: dropout off, no gradients.
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in data_loader_test:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            predicted = torch.argmax(outputs.logits, dim=1)
            total += labels.size(0)
            correct += (predicted == torch.argmax(labels, dim=1)).sum().item()
    print(f"Test Accuracy {100*correct/total:.2f}%\n")

# Persist the fine-tuned weights for later reuse.
model.save_pretrained('fine-tuned-bart-jokes')
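# Saving the tokenizer alongside the model lets the pair be reloaded together.
# What follows is a minimal inference sketch under that assumption:
# 'example_text' is a made-up input, and the predicted index follows the
# sorted column order pd.get_dummies produced during training.
tokenizer.save_pretrained('fine-tuned-bart-jokes')

example_text = "Why did the chicken cross the road? To get to the other side."
loaded_tokenizer = BartTokenizer.from_pretrained('fine-tuned-bart-jokes')
loaded_model = BartForSequenceClassification.from_pretrained('fine-tuned-bart-jokes').to(device)
loaded_model.eval()
with torch.no_grad():
    encoded = loaded_tokenizer(example_text, return_tensors='pt').to(device)
    logits = loaded_model(**encoded).logits
print("Predicted label index:", torch.argmax(logits, dim=1).item())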