import gradio as gr
import re
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from keybert import KeyBERT
from ferret import Benchmark

name = "karalif/myTestModel"
model = AutoModelForSequenceClassification.from_pretrained(name)
tokenizer = AutoTokenizer.from_pretrained(name, normalization=True)

bench = Benchmark(model, tokenizer)
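# ferret's Benchmark wraps the model and tokenizer so that bench.explain(text, target=i)
# can run its configured explainers (including Partition SHAP, the one used below)
# against output head i and return one Explanation object per explainer.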
#text = "hvað er maðurinn eiginlega að pæla ég fatta ekki??????????"

def get_prediction(text):
    encoding = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=200)
    encoding = {k: v.to(model.device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**encoding)

    logits = outputs.logits
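    # Multi-label head: squash each logit independently with a sigmoid
    # rather than taking a softmax across the four labels.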
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu()).numpy()

    kw_model = KeyBERT()
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words='english', use_maxsum=True, nr_candidates=20, top_n=5)
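    # keywords is a list of (keyword, score) tuples; MaxSum selects a diverse top 5
    # from the 20 candidates most similar to the input text.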

    response = ""
    labels = ['Politeness', 'Toxicity', 'Sentiment', 'Formality']
    colors = ['#b8e994', '#f8d7da', '#fff3cd', '#bee5eb']  # Corresponding colors for labels

    for i, label in enumerate(labels):
        response += f"<span style='background-color:{colors[i]}; color:black;'>{label}</span>: {probs[i]*100:.1f}%<br>"
    
    influential_keywords = "INFLUENTIAL KEYWORDS:<br>"
    for keyword, score in keywords:
        influential_keywords += f"{keyword} (Score: {score:.2f})<br>"
    
    return response, keywords, influential_keywords
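
# Illustrative call (values are hypothetical and depend entirely on the model):
#   html, keywords, keyword_html = get_prediction("Sæl og blessuð Kristín!")
#   html         -> one colour-coded percentage line per label
#   keywords     -> e.g. [("kristín", 0.62), ...]
#   keyword_html -> the same keywords as an HTML snippet with their scores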

def replace_encoding(tokens):
    # Map byte-level BPE artifacts back to the Icelandic characters they encode
    # ('Ġ' marks a leading space) and drop the first/last special tokens.
    return [token.replace('Ġ', ' ')
                 .replace('Ã°', 'ð')
                 .replace('Ã©', 'é')
                 .replace('Ã¦', 'æ')
                 .replace('Ã½', 'ý')
                 .replace('Ã¡', 'á')
                 .replace('Ãº', 'ú')
                 .replace('ÃŃ', 'í')
                 .replace('Ã¶', 'ö')
                 .replace('Ã¾', 'þ')
                 .replace('Ãģ', 'Á')
                 .replace('Ãį', 'Í')
                 .replace('Ãĵ', 'Ó')
                 .replace('ÃĨ', 'Æ')
                 .replace('ÃIJ', 'Ð')
                 .replace('Ãĸ', 'Ö')
                 .replace('Ãī', 'É')
                 .replace('Ãļ', 'Ú')
            for token in tokens[1:-1]]
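
# For example, the raw token 'Ãºt' (byte-level encoding of "út") comes back as 'út'.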

def predict(text):

    explanations_formality = bench.explain(text, target=0)
    explanations_sentiment = bench.explain(text, target=1)
    explanations_politeness = bench.explain(text, target=2)
    explanations_toxicity = bench.explain(text, target=3)
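    # The target indices above are assumed to follow the order of the labels in the
    # model's output layer; each call returns one Explanation object per explainer.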
    
    greeting_pattern = r"^(Halló|Hæ|Sæl|Góðan dag|Kær kveðja|Daginn|Kvöldið|Ágætis|Elsku)"
    
    prediction_output, keywords, influential_keywords = get_prediction(text)
    greeting_feedback = ""
    
    modified_input = text
    for keyword, _ in keywords:
        modified_input = modified_input.replace(keyword, f"<span style='color:green;'>{keyword}</span>")
    
    #if not re.match(greeting_pattern, text, re.IGNORECASE):
    #    greeting_feedback = "OTHER FEEDBACK:<br>Heilsaðu dóninn þinn<br>"

    response = f"INPUT:<br>{modified_input}<br><br>MY PREDICTION:<br>{prediction_output}<br>{influential_keywords}<br>{greeting_feedback}"

    # Influential words
    explanation_lists = [explanations_toxicity, explanations_formality, explanations_sentiment, explanations_politeness]
    labels = ['Toxicity', 'Formality', 'Sentiment', 'Politeness']

    response += "<br>MOST INFLUENTIAL WORDS FOR EACH LABEL:<br>"
    for i, explanations in enumerate(explanation_lists):
        label = labels[i]
        for explanation in explanations:
            if explanation.explainer == 'Partition SHAP':
                tokens = replace_encoding(explanation.tokens)
                token_score_pairs = zip(tokens, explanation.scores)
                formatted_output = ' '.join([f"{token} ({score:.2f})" for token, score in token_score_pairs])
                response += f"{label}: {formatted_output}<br>"

    #response += "<br>TOP 2 MOST INFLUENTIAL WORDS FOR EACH LABEL:<br>"
    #for i, explanations in enumerate(explanation_lists):
    #    label = labels[i]
    #    response += f"{label}:<br>"
    #    for explanation in explanations:
    #        if explanation.explainer == 'Partition SHAP':
    #            sorted_scores = sorted(enumerate(explanation.scores), key=lambda x: abs(x[1]), reverse=True)[:2]
    #            tokens = replace_encoding(explanation.tokens)
    #            tokens = [tokens[idx] for idx, _ in sorted_scores]
    #            formatted_output = ' '.join(tokens)
    #            response += f"{formatted_output}<br>"

    return response

description_html = """
    <center>
        <img src='http://www.ru.is/media/HR_logo_vinstri_transparent.png' width='250' height='auto'>
    </center>
    """

demo = gr.Interface(
    fn=predict,
    inputs=gr.TextArea(label="Enter text here:"),
    outputs=gr.HTML(label="Leiðrétt"),
    description=description_html,
    examples=[
        ["Sæl og blessuð Kristín, hvað er að frella af þér gamla??"],
    ],
    theme=gr.themes.Default(primary_hue="red", secondary_hue="pink")
)

demo.launch()