In [None]:
import pandas as pd
import torch
from simpletransformers.classification import MultiLabelClassificationModel
from sklearn.model_selection import train_test_split
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers_interpret import MultiLabelClassificationExplainer

In [None]:
# Ensure a GPU is visible to PyTorch, the framework the models below run on.
# Probing through TensorFlow instead would make TensorFlow initialise the
# device and, by default, reserve most of its memory before training starts.
# Displays the device name, or an empty string when no CUDA device is available.
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else ""
gpu_name

In [None]:
# Install required libraries
!pip install simpletransformers
!pip install transformers
!pip install transformers_interpret

In [None]:
# If using Google Colab, mount drive
try:
  from google.colab import drive
  drive.mount("/content/gdrive")
  %cd "/content/gdrive/MyDrive/persplain"
except:
  print("Notebook is not being run from Google Colab")
  %cd ..

In [None]:
# Collapse the five per-label columns (o, c, e, a, n) into a single `labels`
# column holding one 5-tuple per row, then discard the original columns.
label_cols = ["o", "c", "e", "a", "n"]
docs = pd.read_csv('data/docs.csv', sep=',')
docs['labels'] = list(zip(*(docs[col].tolist() for col in label_cols)))
docs = docs.drop(columns=label_cols)

In [None]:
# Split train and test data (80% train / 20% eval).
# The fixed seed keeps the split identical across kernel restarts, so eval_df
# remains the same held-out set when a previously trained checkpoint is loaded.
SPLIT_SEED = 42
train_df, eval_df = train_test_split(docs, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
# Define early stopping rules.
# These are ordinary simpletransformers model options, so they are merged
# directly into the args dict below; nested under a key of their own they
# would be stored but never read by the trainer.
early_stopping = {
    "early_stopping_metric": "eval_loss",
    "early_stopping_metric_minimize": True,
    "early_stopping_patience": 3,
}

# Initialize the model
model_simple = MultiLabelClassificationModel(
    'roberta',
    'roberta-base',
    num_labels=5,
    args={
        # 2 samples per step, accumulated over 16 steps before each update
        'train_batch_size': 2,
        'gradient_accumulation_steps': 16,
        'learning_rate': 3e-5,
        # Upper bound only; early stopping is expected to end training sooner
        'num_train_epochs': 99,
        'max_seq_length': 512,
        'output_dir': './models/ov/outputs/',
        # Early stopping is only checked when evaluation runs during training.
        # NOTE(review): evaluation cadence is left at the library defaults;
        # tune `evaluate_during_training_steps` (and consider
        # `early_stopping_consider_epochs`) for this dataset size -- confirm.
        'evaluate_during_training': True,
        **early_stopping,
    },
)

# Train the model; eval_df supplies the eval_loss monitored by early stopping
model_simple.train_model(
    train_df,
    eval_df=eval_df,
    args={
        'overwrite_output_dir': True,
        'use_early_stopping': True,
        # Smallest eval_loss change that counts as an improvement
        'early_stopping_delta': 0.01,
    },
)

In [None]:
# When skipping training, restore the "simple" fine-tuned model from its saved
# checkpoint instead. It is loaded on the CPU (use_cuda=False).
simple_checkpoint = "models/ov/outputs/checkpoint-1909-epoch-23"
model_simple = MultiLabelClassificationModel("roberta", simple_checkpoint, use_cuda=False)

In [None]:
# Restore the same checkpoint as plain Hugging Face objects (model + tokenizer),
# which is the form the explainer is built from.
model_path = "models/ov/outputs/checkpoint-1909-epoch-23"
model = RobertaForSequenceClassification.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

In [None]:
# Move the model to the GPU when one is available, otherwise interpreting it
# will be too slow. Without CUDA the model stays on the CPU instead of raising,
# and the value displayed below is False so the slow path is visible.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
next(model.parameters()).is_cuda

In [None]:
# Build the attribution explainer around the loaded model/tokenizer pair, then
# display whether the explainer ended up on the GPU.
cls_explainer = MultiLabelClassificationExplainer(model=model, tokenizer=tokenizer)
cls_explainer.device.type == "cuda"

In [None]:
# Predict personality from text and gather the explanation for one example.
text = "I like to talk to people and work hard"

# The classifier receives the sentence as a one-item list
prediction = model_simple.predict([text])

# The explainer receives the raw string and returns the word attributions
word_attributions = cls_explainer(text)

print(prediction)

In [None]:
# Visualise explanation
# NOTE(review): presumably renders the attributions from the most recent
# cls_explainer(...) call, so the prediction cell must run first -- confirm.
cls_explainer.visualize()