File size: 10,017 Bytes
03287bc
 
 
 
b0fc967
03287bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#Import the libraries we know we'll need for the Generator.
import pandas as pd, spacy, nltk, numpy as np, re
from spacy.matcher import Matcher
#!python -m spacy download en_core_web_lg #Run this once if the en_core_web_lg model loaded below is not installed yet.
nlp = spacy.load("en_core_web_lg")
import altair as alt
import streamlit as st
from annotated_text import annotated_text as ant

#Import the libraries to support the model and predictions.
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import lime
import torch
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer

# Module-level setup shared by the functions below: the LIME explainer plus the
# tokenizer, model and pipeline used for sentiment prediction.

# Label names handed to the LIME explainer: index 0 is 'negative', index 1 is 'positive'.
# NOTE(review): presumably this matches the column order of predictor()'s output - confirm against the model config.
class_names = ['negative', 'positive']
explainer = LimeTextExplainer(class_names=class_names)
# The tokenizer and the model are loaded from the same pretrained checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
# eval_pred() indexes the pipeline output as preds[0][0] and preds[0][1], i.e. it expects one
# entry per label for each input, which is why return_all_scores=True is requested here.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True) 

def predictor(texts):
    '''Return the model's class probabilities for a batch of texts.

    This is the classifier function passed to the LIME explainer in
    critical_words(), which requests 2000 perturbed samples per explanation.

    Args:
        texts: The text(s) to score, in any form the module-level tokenizer
            accepts (LIME is expected to pass a list of strings).

    Returns:
        A NumPy array of softmax probabilities computed over dim 1 of the
        model's logits (one row per input text).
    '''
    encoded = tokenizer(texts, return_tensors="pt", padding=True)
    # Inference only: without no_grad() every forward pass keeps its autograd
    # graph alive, which wastes memory on the large batches LIME generates.
    with torch.no_grad():
        logits = model(**encoded).logits
    probas = F.softmax(logits, dim=1).detach().numpy()
    return probas

@st.experimental_singleton
def critical_words(document, options=False):
    '''Collect the words of a document that are candidates for substitution.

    Candidates come from two passes: one over the spaCy noun chunks (named
    entities, compound nouns, nouns and adjectives) and one over the individual
    tokens (adjectival complements / conjuncts and personal pronouns).

    Args:
        document: A spaCy Doc, or raw text that is parsed with the module-level
            `nlp` pipeline.
        options: When truthy, LIME is run on the document text and only words
            that appear among the LIME features are kept. When falsy, every
            part-of-speech candidate is kept.

    Returns:
        `pos_options` (a list of strings) when `options` is falsy; otherwise a
        tuple `(pos_options, lime_results)` where `lime_results` is a DataFrame
        with the columns "Word" and "Weight".
    '''
    if not isinstance(document, spacy.tokens.doc.Doc):
        document = nlp(document)
    # Normalise once so that every check below agrees on whether LIME filtering is on.
    use_lime = bool(options)
    chunks = list(document.noun_chunks)
    pos_options = []
    lime_options = []

    #Identify what the model cares about.
    if use_lime:
        #Run Lime Setup code
        exp = explainer.explain_instance(document.text, predictor, num_features=15, num_samples=2000)
        lime_results = exp.as_list()
        lime_options = [feature[0] for feature in lime_results]
        lime_results = pd.DataFrame(lime_results, columns=["Word","Weight"])

    def model_cares_about(words):
        #True when LIME filtering is off, or when every word is one of the LIME features.
        return (not use_lime) or all(word in lime_options for word in words)

    #Identify what we care about "parts of speech"
    for chunk in chunks:
        #The use of chunk[-1] is due to testing that it appears to always match the root
        root = chunk[-1]
        if root.ent_type_:
            cur_values = []
            if (len(chunk) > 1) and (chunk[-2].dep_ == "compound"):
                #creates the compound element of the noun
                compound = [x.text for x in chunk if x.dep_ == "compound"]
                print(f"This is the contents of {compound} and it is {all(elem in lime_options for elem in compound)} that all elements are present in {lime_options}.") #for QA
                #checks to see all elements in the compound are important to the model or use the compound if not checking importance.
                #(The check must look at `compound`; `cur_values` is still empty at this point.)
                if model_cares_about(compound):
                    #creates a span for the entirety of the compound noun and adds it to the list.
                    span = -1 * (1 + len(compound))
                    pos_options.append(chunk[span:].text)
                    #The adjectives of the chunk are queued here and filtered/added just below.
                    cur_values += [token.text for token in chunk if token.pos_ == "ADJ"]
                else:
                    print(f"The elmenents in {compound} could not be added to the final list because they are not all relevant to the model.")
            else:
                cur_values = [token.text for token in chunk if (token.ent_type_) or (token.pos_ == "ADJ")]
            if model_cares_about(cur_values):
                pos_options.extend(cur_values)
                print(f"From {chunk.text}, {cur_values} added to pos_options due to entity recognition.") #for QA
        elif len(chunk) >= 1:
            cur_values = [token.text for token in chunk if token.pos_ in ["NOUN","ADJ"]]
            if model_cares_about(cur_values):
                pos_options.extend(cur_values)
                print(f"From {chunk.text}, {cur_values} added to pos_options due to wildcard.") #for QA
        else:
            print(f"No options added for \'{chunk.text}\' ")

    # Second pass: pick up personal pronouns and adjectival complements that are not part of a noun chunk.
    for token in document:
        if token.text in pos_options:
            continue
        if use_lime and (token.text not in lime_options):
            continue
        if (token.pos_ == "ADJ") and (token.dep_ in ["acomp","conj"]):
            pos_options.append(token.text)
        #morph.get() returns a list of values, so membership (not equality with a string) is the correct test.
        elif (token.pos_ == "PRON") and ("Prs" in token.morph.get("PronType")):
            pos_options.append(token.text)

    if use_lime:
        return pos_options, lime_results
    else:
        return pos_options
    
# Return the Viz of elements critical to LIME.
def lime_viz(df):
    '''Build the layered Altair chart that shows the LIME weight of each word.

    Args:
        df: A DataFrame with the columns "Word" and "Weight", or any data that
            `pd.DataFrame(df, columns=["Word","Weight"])` can turn into one
            (for example the list of (word, weight) pairs produced by LIME).

    Returns:
        A configured `alt.LayerChart` made of a bar layer and a text layer
        that share the same encodings.
    '''
    if not isinstance(df, pd.DataFrame):
        df = pd.DataFrame(df, columns=["Word","Weight"])
    # The previous version also created an `alt.selection_single(...)` object here that was
    # never attached to the chart; it has been dropped because it had no effect on the output.
    viz = alt.Chart(df).encode(
        alt.X('Weight:Q', scale=alt.Scale(domain=(-1, 1))),
        alt.Y('Word:N', sort='x', axis=None),
        # Threshold scale split at 0 so that negative and positive weights get different colours.
        color=alt.Color("Weight", scale=alt.Scale(scheme='blueorange', domain=[0], type="threshold", range='diverging'), legend=None),
        tooltip = ("Word","Weight")
    ).mark_bar().properties(title ="Importance of individual words")

    # The y axis is hidden (axis=None), so each word is drawn as a text mark instead.
    text = viz.mark_text(
        fill="black",
        align='right',
        baseline='middle'
    ).encode(
        text='Word:N'
    )
    limeplot = alt.LayerChart(layer=[viz,text], width = 300).configure_axis(grid=False).configure_view(strokeWidth=0)
    return limeplot

# Evaluate Predictions using the model and pipe.
def eval_pred(text, return_all = False):
    '''Score a text with the module-level pipeline and return a signed prediction.

    The first label entry returned by the pipeline has its score negated and
    the second is kept as is, so the result is negative when the first label
    wins and positive when the second label wins.
    NOTE(review): this assumes the pipeline lists the negative label first - confirm.

    Args:
        text: The text handed to `pipe`.
        return_all: When truthy, also return the winning label.

    Returns:
        The signed score, or a `(signed score, label)` tuple when `return_all`
        is truthy. If neither score is strictly larger than the other the
        score is 0 and the label is an empty string.
    '''
    label_scores = pipe(text)[0]

    first_entry = label_scores[0]
    signed_negative = -1 * first_entry['score']
    negative_label = first_entry['label']

    second_entry = label_scores[1]
    positive = second_entry['score']
    positive_label = second_entry['label']

    if positive > abs(signed_negative):
        outcome = (positive, positive_label)
    elif abs(signed_negative) > positive:
        outcome = (signed_negative, negative_label)
    else:
        # Neither side wins: fall back to the neutral defaults.
        outcome = (0, '')

    return outcome if return_all else outcome[0]

def construct_nlexp(text,sentiment,probability):
    '''Render the natural-language explanation of a prediction as annotated text.

    Args:
        text: Not used by this function; kept so existing callers keep working.
        sentiment: The predicted label, either "NEGATIVE" or "POSITIVE".
        probability: The (possibly signed) score of the prediction; its
            absolute value is shown as a percentage rounded to 2 decimals.

    Returns:
        Whatever the `annotated_text` call returns.
        NOTE(review): that call presumably renders straight into the Streamlit
        page, in which case the return value carries no content - confirm.

    Raises:
        ValueError: If `sentiment` is not one of the two supported labels.
            Previously this case failed with an UnboundLocalError.
    '''
    # (annotation for the label, annotation for the probability, highlight colour)
    styles = {
        "NEGATIVE": ("-", "neg", "#FFA44F"),
        "POSITIVE": ("+", "pos", "#50A9FF"),
    }
    if not isinstance(sentiment, str) or sentiment not in styles:
        raise ValueError(f"Unsupported sentiment label {sentiment!r}; expected 'NEGATIVE' or 'POSITIVE'.")
    label_tag, prob_tag, color = styles[sentiment]
    prob = str(np.round(100 * abs(probability),2))
    color_sent = ant('The model predicts the sentiment of the sentence you provided is ', (sentiment, label_tag, color), ' with a probability of ', (prob, prob_tag, color),"%.")
    return color_sent

def get_min_max(df, seed):
    '''Pick the alternatives with the highest and the lowest similarity score.

    Scores of .9999 and above are ignored when looking for the highest value
    and scores of .0001 and below are ignored when looking for the lowest one.
    When several rows share the selected score, the first of them is used.
    `seed` is accepted but not used.

    text2 = Most Similar
    text3 = Least Similar'''
    scores = df['similarity']

    highest = scores[scores < .9999].max()
    text2 = df.loc[scores == highest, 'text'].iloc[0]

    lowest = scores[scores > .0001].min()
    text3 = df.loc[scores == lowest, 'text'].iloc[0]

    return text2, text3

# Inspired by https://stackoverflow.com/questions/17758023/return-rows-in-a-dataframe-closest-to-a-user-defined-number/17758115#17758115
def abs_dif(df,seed):
    '''Pick the alternatives whose prediction is closest to and farthest from the seed's prediction.

    The seed row is looked up through the 'Words' column; every other row is
    ranked by the absolute difference between its 'pred' value and the seed's.

    text2 = Nearest Prediction
    text3 = Farthest Prediction'''
    seed_pred = df[df['Words'] == seed].pred.iloc[0]
    # reset_index() makes the labels equal to the positions, which the lookups below rely on.
    others = df[df['Words'] != seed].reset_index()

    # Row positions ordered by increasing distance from the seed's prediction.
    ranking = (others.pred - seed_pred).abs().argsort()
    closest = others.pred[ranking[:1]]
    farthest = others.pred[ranking[-1:]]

    text2 = others.text.iloc[closest.index[0]]
    text3 = others.text.iloc[farthest.index[0]]
    return text2, text3

#@st.experimental_singleton #I've enabled this to prevent it from triggering every time the code runs... which could get very messy
def sampled_alts(df, seed, fixed=False):
    '''Choose the two displayed counterfactuals by random sampling instead of by similarity or absolute difference.

    Two rows are drawn from the rows whose 'Words' value differs from the seed.
    With `fixed` set, the draw uses the constant random state 2052; otherwise
    no random state is passed.

    Both samples are random.'''
    candidates = df[df['Words'] != seed]
    # random_state=None is the default of DataFrame.sample, i.e. an unseeded draw.
    picked = candidates.sample(n=2, random_state=2052 if fixed else None)
    texts = picked.text
    return texts.iloc[0], texts.iloc[1]

def gen_cf_country(df,_document,selection):
    '''Generate the counterfactual sentences and their scores for a selected word.

    Four columns are added to `df` in place (and `df` is returned):
    'text' (the document text with `selection` replaced by the row's word),
    'pred' (eval_pred of that text), 'seed' ('seed' for the row that equals
    `selection`, 'alternative' otherwise) and 'similarity' (spaCy similarity
    between `selection` and the row's word).

    Args:
        df: DataFrame with a 'Words' column of replacement words.
        _document: Object exposing the original sentence as `.text`
            (presumably a spaCy Doc - confirm against the caller).
        selection: The word in the document that gets replaced.

    Returns:
        The same DataFrame with the four columns added.
    '''
    # Escape the selection so characters such as '.' or '(' are matched literally.
    pattern = re.compile(r'\b' + re.escape(selection) + r'\b')
    original_text = _document.text
    # A function replacement inserts the word verbatim instead of treating it as a regex template.
    df['text'] = df.Words.apply(lambda word: pattern.sub(lambda _match: word, original_text))
    df['pred'] = df.text.apply(eval_pred)
    df['seed'] = df.Words.apply(lambda x: 'seed' if x == selection else 'alternative')
    # Parse the selection once instead of once per row.
    selection_doc = nlp(selection)
    df['similarity'] = df.Words.apply(lambda x: selection_doc.similarity(nlp(x)))
    return df


def gen_cf_profession(df,_document,selection):
    '''Generate the counterfactual sentences and their scores for a selected profession.

    Only the rows that share the 'Major' value of `selection` are kept. For
    those rows four columns are computed: 'text' (the document text with
    `selection` replaced by the row's word), 'pred' (eval_pred of that text),
    'seed' ('seed' for the row that equals `selection`, 'alternative'
    otherwise) and 'similarity' (spaCy similarity between `selection` and the
    row's word).

    Args:
        df: DataFrame with the columns 'Words' and 'Major'.
        _document: Object exposing the original sentence as `.text`
            (presumably a spaCy Doc - confirm against the caller).
        selection: The word in the document that gets replaced; it must be
            present in the 'Words' column.

    Returns:
        A new DataFrame restricted to the matching 'Major' rows, with the four
        columns added.

    Raises:
        IndexError: If `selection` does not appear in the 'Words' column.
    '''
    category = df.loc[df['Words'] == selection, 'Major'].iloc[0]
    # Work on an explicit copy so the column assignments below do not write into a slice of the caller's frame.
    df = df[df.Major == category].copy()
    # Escape the selection so characters such as '.' or '(' are matched literally.
    pattern = re.compile(r'\b' + re.escape(selection) + r'\b')
    original_text = _document.text
    # A function replacement inserts the word verbatim instead of treating it as a regex template.
    df['text'] = df.Words.apply(lambda word: pattern.sub(lambda _match: word, original_text))
    df['pred'] = df.text.apply(eval_pred)
    df['seed'] = df.Words.apply(lambda x: 'seed' if x == selection else 'alternative')
    # Parse the selection once instead of once per row.
    selection_doc = nlp(selection)
    df['similarity'] = df.Words.apply(lambda x: selection_doc.similarity(nlp(x)))
    return df