import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd
import random

classifiers = ['toxic', 'severe_toxic', 'obscene',
               'threat', 'insult', 'identity_hate']


def reset_scores():
    # (Re)initialise the global table that holds one row of scores per comment
    global scores_df
    scores_df = pd.DataFrame(columns=['Comment'] + classifiers)


def get_score(model_base, text):
    # Map the selected pretrained base to its locally fine-tuned checkpoint
    if model_base == "bert-base-cased":
        model_dir = "./bert/_bert_model"
    elif model_base == "distilbert-base-cased":
        model_dir = "./distilbert/_distilbert_model"
    else:
        model_dir = "./roberta/_roberta_model"
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_base)
    inputs = tokenizer.encode_plus(
        text, max_length=512, truncation=True, padding=True, return_tensors='pt')
    outputs = model(**inputs)
    # Multi-label task: apply an independent sigmoid to each of the six logits
    predictions = torch.sigmoid(outputs.logits)
    return predictions


# Ask user for input, return scores
st.title("Toxic Comment Classifier")
st.write("John Makely")
st.write("The following models are fine-tuned on the jigsaw-toxic-comment-classification dataset")
st.write("Please be patient and give the queries and tables time to load (max 2 minutes)")

# Drop down menu for model selection, default is roberta
model_base = st.selectbox("Select a pretrained model",
                          ["roberta-base", "bert-base-cased", "distilbert-base-cased"])

text_input = st.text_input("Enter text for toxicity classification",
                           "I hope you die")
st.write("After hitting Submit, classification scores will be displayed for the provided text")
submit_btn = st.button("Submit")


if submit_btn and text_input:
    result = get_score(model_base, text_input)

    df = pd.DataFrame([result[0].tolist()], columns=classifiers)
    df = df.round(2)  # Round the values to 2 decimal places
    # Format the values as percentages
    df = df.applymap(lambda x: '{:.0%}'.format(x))

    st.table(df)

# Read the test dataset
test_df = pd.read_csv(
    "./jigsaw-toxic-comment-classification-challenge/test.csv")

# Select 3 random comments from the test dataset
sample_df = test_df.sample(n=3)

# Create an empty DataFrame to store the scores
reset_scores()

# Calculate the scores for each comment and add them to the DataFrame
for index, row in sample_df.iterrows():
    result = get_score(model_base, row['comment_text'])
    scores = result[0].tolist()
    scores_df.loc[len(scores_df)] = [row['comment_text']] + scores

# Round the values to 2 decimal places
scores_df = scores_df.round(2)


st.subheader("Toxicity Scores for Random Comments")
st.write("The following table shows a random sample of comments from the jigsaw dataset along with their respective scores")
st.write("Please be patient as it may take some time for the scores to be passed through the model")
# Button to refresh the random sample; pressing it triggers a Streamlit rerun,
# which draws and scores a new set of comments above, so the table below updates
if st.button("Refresh Random Comments"):
    st.success("New comments have been loaded!")
st.table(scores_df)
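

# A minimal optional sketch (not wired into the app above): get_score reloads the
# model and tokenizer from disk on every call, i.e. on each Streamlit rerun and for
# every sampled comment. Assuming the same directory layout used in get_score, the
# loading step could be cached with st.cache_resource so each model/tokenizer pair
# is built only once per session.
@st.cache_resource
def load_model_and_tokenizer(model_base, model_dir):
    # Streamlit caches the return value, so repeated calls with the same arguments
    # reuse the already-loaded objects instead of reading them from disk again.
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_base)
    return model, tokenizer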