Nathan Butters committed on
Commit
e087e63
·
1 Parent(s): d1a4408

fixed bugs in main scripts

Browse files
Files changed (5) hide show
  1. .DS_Store +0 -0
  2. NLselector.py +37 -13
  3. WNgen.py +3 -2
  4. app.py +27 -24
  5. backup-app.py +343 -0
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
NLselector.py CHANGED
@@ -1,7 +1,6 @@
1
  #Import the libraries we know we'll need for the Generator.
2
  import pandas as pd, spacy, nltk, numpy as np, re
3
  from spacy.matcher import Matcher
4
- #!python -m spacy download en_core_web_md #Not sure if we need this so I'm going to keep it just in case
5
  nlp = spacy.load("en_core_web_lg")
6
  import altair as alt
7
  import streamlit as st
@@ -14,6 +13,9 @@ import torch
14
  import torch.nn.functional as F
15
  from lime.lime_text import LimeTextExplainer
16
 
 
 
 
17
  class_names = ['negative', 'positive']
18
  explainer = LimeTextExplainer(class_names=class_names)
19
  tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
@@ -27,6 +29,10 @@ def predictor(texts):
27
 
28
  @st.experimental_singleton
29
  def critical_words(document, options=False):
 
 
 
 
30
  if type(document) is not spacy.tokens.doc.Doc:
31
  document = nlp(document)
32
  chunks = list(document.noun_chunks)
@@ -43,6 +49,31 @@ def critical_words(document, options=False):
43
  lime_results = pd.DataFrame(lime_results, columns=["Word","Weight"])
44
 
45
  #Identify what we care about "parts of speech"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  for chunk in chunks:
47
  #The use of chunk[-1] is due to testing that it appears to always match the root
48
  root = chunk[-1]
@@ -58,7 +89,7 @@ def critical_words(document, options=False):
58
  #creates a span for the entirety of the compound noun and adds it to the list.
59
  span = -1 * (1 + len(compound))
60
  pos_options.append(chunk[span:].text)
61
- cur_values + [token.text for token in chunk if token.pos_ == "ADJ"]
62
  else:
63
  print(f"The elmenents in {compound} could not be added to the final list because they are not all relevant to the model.")
64
  else:
@@ -67,21 +98,14 @@ def critical_words(document, options=False):
67
  pos_options.extend(cur_values)
68
  print(f"From {chunk.text}, {cur_values} added to pos_options due to entity recognition.") #for QA
69
  elif len(chunk) >= 1:
70
- cur_values = [token.text for token in chunk if token.pos_ in ["NOUN","ADJ"]]
71
  if (all(elem in lime_options for elem in cur_values) and (options is True)) or ((options is False)):
72
  pos_options.extend(cur_values)
73
  print(f"From {chunk.text}, {cur_values} added to pos_options due to wildcard.") #for QA
74
  else:
75
  print(f"No options added for \'{chunk.text}\' ")
76
- # Here I am going to try to pick up pronouns, which are people, and Adjectival Complements.
77
- for token in document:
78
- if (token.text not in pos_options) and ((token.text in lime_options) or (options == False)):
79
- #print(f"executed {token.text} with {token.pos_} and {token.dep_}") #QA
80
- if (token.pos_ == "ADJ") and (token.dep_ in ["acomp","conj"]):
81
- pos_options.append(token.text)
82
- elif (token.pos_ == "PRON") and (len(token.morph) !=0):
83
- if (token.morph.get("PronType") == "Prs"):
84
- pos_options.append(token.text)
85
 
86
  if options:
87
  return pos_options, lime_results
@@ -157,6 +181,7 @@ def abs_dif(df,seed):
157
 
158
  text2 = Nearest Prediction
159
  text3 = Farthest Prediction'''
 
160
  target = df[df['Words'] == seed].pred.iloc[0]
161
  sub_df = df[df['Words'] != seed].reset_index()
162
  nearest_prediction = sub_df.pred[(sub_df.pred-target).abs().argsort()[:1]]
@@ -186,7 +211,6 @@ def gen_cf_country(df,_document,selection):
186
  df['similarity'] = df.Words.apply(lambda x: nlp(selection).similarity(nlp(x)))
187
  return df
188
 
189
-
190
  def gen_cf_profession(df,_document,selection):
191
  category = df.loc[df['Words'] == selection, 'Major'].iloc[0]
192
  df = df[df.Major == category]
 
1
  #Import the libraries we know we'll need for the Generator.
2
  import pandas as pd, spacy, nltk, numpy as np, re
3
  from spacy.matcher import Matcher
 
4
  nlp = spacy.load("en_core_web_lg")
5
  import altair as alt
6
  import streamlit as st
 
13
  import torch.nn.functional as F
14
  from lime.lime_text import LimeTextExplainer
15
 
16
+ #Import WNgen.py
17
+ from WNgen import *
18
+
19
  class_names = ['negative', 'positive']
20
  explainer = LimeTextExplainer(class_names=class_names)
21
  tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
 
29
 
30
  @st.experimental_singleton
31
  def critical_words(document, options=False):
32
+ '''This function is meant to select the critical part of a sentence. Critical, in this context means
33
+ the part of the sentence that is either: A) a NOUN or PROPN from the correct entity group, B) a NOUN,
34
+ C) a NOUN + ADJ combination, or D) ADJ and PROPN used to modify other NOUN tokens.
35
+ It also checks this against what the model thinks is important if the user defines "options" as "LIME" or True.'''
36
  if type(document) is not spacy.tokens.doc.Doc:
37
  document = nlp(document)
38
  chunks = list(document.noun_chunks)
 
49
  lime_results = pd.DataFrame(lime_results, columns=["Word","Weight"])
50
 
51
  #Identify what we care about "parts of speech"
52
+
53
+ # Here I am going to try to pick up pronouns, which are people, and Adjectival Complements.
54
+ for token in document:
55
+ if (token.text not in pos_options) and ((token.text in lime_options) or (options == False)):
56
+ #print(f"executed {token.text} with {token.pos_} and {token.dep_}") #QA
57
+ if (token.pos_ in ["ADJ","PROPN"]) and (token.dep_ in ["compound", "amod"]) and (document[token.i - 1].dep_ in ["compound", "amod"]):
58
+ compound = document[token.i - 1: token.i +1].text
59
+ pos_options.append(compound)
60
+ print(f'Added {compound} based on "amod" and "compound" adjectives.')
61
+ elif (token.pos_ in ["NOUN"]) and (token.dep_ in ["compound", "amod", "conj"]) and (document[token.i - 1].dep_ in ["compound"]):
62
+ compound = document[token.i - 1: token.i +1].text
63
+ pos_options.append(compound)
64
+ print(f'Added {compound} based on "amod" and "compound" and "conj" nouns.')
65
+ elif (token.pos_ == "PROPN") and (token.dep_ in ["prep","amod"]):
66
+ pos_options.append(token.text)
67
+ print(f"Added '{token.text}' based on their adjectival state.")
68
+ elif (token.pos_ == "ADJ") and (token.dep_ in ["acomp","conj","amod"]):
69
+ pos_options.append(token.text)
70
+ print(f"Added '{token.text}' based on their adjectival state.")
71
+ elif (token.pos_ == "PRON") and (len(token.morph) !=0):
72
+ if (token.morph.get("PronType") == "Prs"):
73
+ pos_options.append(token.text)
74
+ print(f"Added '{token.text}' because it's a human pronoun.")
75
+
76
+ #Noun Chunks parsing
77
  for chunk in chunks:
78
  #The use of chunk[-1] is due to testing that it appears to always match the root
79
  root = chunk[-1]
 
89
  #creates a span for the entirety of the compound noun and adds it to the list.
90
  span = -1 * (1 + len(compound))
91
  pos_options.append(chunk[span:].text)
92
+ cur_values + [token.text for token in chunk if token.pos_ in ["ADJ","NOUN","PROPN"]]
93
  else:
94
  print(f"The elmenents in {compound} could not be added to the final list because they are not all relevant to the model.")
95
  else:
 
98
  pos_options.extend(cur_values)
99
  print(f"From {chunk.text}, {cur_values} added to pos_options due to entity recognition.") #for QA
100
  elif len(chunk) >= 1:
101
+ cur_values = [token.text for token in chunk if token.pos_ in ["NOUN","ADJ","PROPN"]]
102
  if (all(elem in lime_options for elem in cur_values) and (options is True)) or ((options is False)):
103
  pos_options.extend(cur_values)
104
  print(f"From {chunk.text}, {cur_values} added to pos_options due to wildcard.") #for QA
105
  else:
106
  print(f"No options added for \'{chunk.text}\' ")
107
+
108
+ pos_options = list(set(pos_options))
 
 
 
 
 
 
 
109
 
110
  if options:
111
  return pos_options, lime_results
 
181
 
182
  text2 = Nearest Prediction
183
  text3 = Farthest Prediction'''
184
+ seed = process_text(seed)
185
  target = df[df['Words'] == seed].pred.iloc[0]
186
  sub_df = df[df['Words'] != seed].reset_index()
187
  nearest_prediction = sub_df.pred[(sub_df.pred-target).abs().argsort()[:1]]
 
211
  df['similarity'] = df.Words.apply(lambda x: nlp(selection).similarity(nlp(x)))
212
  return df
213
 
 
214
  def gen_cf_profession(df,_document,selection):
215
  category = df.loc[df['Words'] == selection, 'Major'].iloc[0]
216
  df = df[df.Major == category]
WNgen.py CHANGED
@@ -12,8 +12,8 @@ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst
12
  model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
13
  pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
14
 
15
- '''#If an error is thrown that the corpus "omw-1.4" isn't discoverable you can use this code. (https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed)
16
- try:
17
  _create_unverified_https_context = ssl._create_unverified_context
18
  except AttributeError:
19
  pass
@@ -234,6 +234,7 @@ def get_parallel(word, seed_definition, QA=False):
234
  hypos = list(set([syn for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.66]))[:25]
235
  elif len(hypos) >= 10:
236
  hypos = list(set([syn for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.8]))[:25]
 
237
  if QA:
238
  print(hypers)
239
  print(hypos)
 
12
  model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
13
  pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
14
 
15
+ #If an error is thrown that the corpus "omw-1.4" isn't discoverable you can use this code. (https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed)
16
+ '''try:
17
  _create_unverified_https_context = ssl._create_unverified_context
18
  except AttributeError:
19
  pass
 
234
  hypos = list(set([syn for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.66]))[:25]
235
  elif len(hypos) >= 10:
236
  hypos = list(set([syn for syn in new_hypos if cleaned.similarity(nlp(process_text(syn.definition()))) >=.8]))[:25]
237
+
238
  if QA:
239
  print(hypers)
240
  print(hypos)
app.py CHANGED
@@ -1,8 +1,7 @@
1
  #Import the libraries we know we'll need for the Generator.
2
- import pandas as pd, spacy, nltk, numpy as np
3
  from spacy.matcher import Matcher
4
  nlp = spacy.load("en_core_web_lg")
5
- from nltk.corpus import wordnet
6
 
7
  #Import the libraries to support the model and predictions.
8
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
@@ -35,29 +34,30 @@ def prepare_model():
35
 
36
  @st.experimental_singleton
37
  def prepare_lists():
38
- nltk.download('omw-1.4')
39
- nltk.download('wordnet')
40
  countries = pd.read_csv("Assets/Countries/combined-countries.csv")
41
  professions = pd.read_csv("Assets/Professions/soc-professions-2018.csv")
42
- word_lists = [list(countries.Words),list(professions.Words)]
43
  return countries, professions, word_lists
44
 
45
  #Provide all the functions necessary to run the app
46
  #get definitions for control flow in Streamlit
47
  def get_def(word, POS=False):
48
  pos_options = ['NOUN','VERB','ADJ','ADV']
49
- m_word = word.replace(" ", "_")
50
  if POS in pos_options:
51
  seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS))]
52
  else:
53
  seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word)]
54
- seed_definition = col1.selectbox("Which definition is most relevant?", seed_definitions, key= "WN_definition")
55
- if col1.button("Choose Definition"):
56
- col1.write("You've chosen a definition.")
57
- st.session_state.definition = seed_definition
58
- return seed_definition
 
 
 
59
  else:
60
- col1.write("Please choose a definition.")
61
 
62
  ###Start coding the actual app###
63
  st.set_page_config(layout="wide", page_title="Natural Language Counterfactuals (NLC)")
@@ -124,7 +124,8 @@ if layout == 'MultiNLC':
124
 
125
  #Allow the user to pick an option to generate counterfactuals from.
126
  option = st.radio('Which word would you like to use to generate alternatives?', options, key = "option")
127
- if (any(option in sublist for sublist in word_lists)):
 
128
  st.write(f'You selected {option}. It matches a list.')
129
  elif option:
130
  st.write(f'You selected {option}. It does not match a list.')
@@ -133,10 +134,10 @@ if layout == 'MultiNLC':
133
  st.write('Awaiting your selection.')
134
 
135
  if st.button('Generate Alternatives'):
136
- if option in list(countries.Words):
137
  cf_df = gen_cf_country(countries, doc, option)
138
  st.success('Alternatives created.')
139
- elif option in list(professions.Words):
140
  cf_df = gen_cf_profession(professions, doc, option)
141
  st.success('Alternatives created.')
142
  else:
@@ -198,7 +199,8 @@ if layout == 'MultiNLC + Lime':
198
 
199
  #Allow the user to pick an option to generate counterfactuals from.
200
  option = st.radio('Which word would you like to use to generate alternatives?', options, key = "option")
201
- if (any(option in sublist for sublist in word_lists)):
 
202
  st.write(f'You selected {option}. It matches a list.')
203
  elif option:
204
  st.write(f'You selected {option}. It does not match a list.')
@@ -207,10 +209,10 @@ if layout == 'MultiNLC + Lime':
207
  st.write('Awaiting your selection.')
208
 
209
  if st.button('Generate Alternatives'):
210
- if option in list(countries.Words):
211
  cf_df = gen_cf_country(countries, doc, option)
212
  st.success('Alternatives created.')
213
- elif option in list(professions.Words):
214
  cf_df = gen_cf_profession(professions, doc, option)
215
  st.success('Alternatives created.')
216
  else:
@@ -278,7 +280,8 @@ if layout == 'VizNLC':
278
 
279
  #Allow the user to pick an option to generate counterfactuals from.
280
  option = st.radio('Which word would you like to use to generate alternatives?', options, key = "option")
281
- if (any(option in sublist for sublist in word_lists)):
 
282
  st.write(f'You selected {option}. It matches a list.')
283
  elif option:
284
  st.write(f'You selected {option}. It does not match a list.')
@@ -287,10 +290,10 @@ if layout == 'VizNLC':
287
  st.write('Awaiting your selection.')
288
 
289
  if st.button('Generate Alternatives'):
290
- if option in list(countries.Words):
291
  cf_df = gen_cf_country(countries, doc, option)
292
  st.success('Alternatives created.')
293
- elif option in list(professions.Words):
294
  cf_df = gen_cf_profession(professions, doc, option)
295
  st.success('Alternatives created.')
296
  else:
@@ -303,15 +306,15 @@ if layout == 'VizNLC':
303
  if alt_choice == "Similarity":
304
  text2, text3 = get_min_max(cf_df, option)
305
  col2.caption(f"This sentence is 'similar' to {option}.")
306
- col3.caption(f"This graph represents the {len(cf_df)} alternatives to {option}.")
307
  elif alt_choice == "Sampling (Random)":
308
  text2, text3 = sampled_alts(cf_df, option)
309
  col2.caption(f"This sentence is a random sample from the alternatives.")
310
- col3.caption(f"This graph represents the {len(cf_df)} alternatives to {option}.")
311
  elif alt_choice == "Sampling (Fixed)":
312
  text2, text3 = sampled_alts(cf_df, option, fixed=True)
313
  col2.caption(f"This sentence is a fixed sample of the alternatives.")
314
- col3.caption(f"This graph represents the {len(cf_df)} alternatives to {option}.")
315
  elif alt_choice == "Probability":
316
  text2, text3 = abs_dif(cf_df, option)
317
  col2.caption(f"This sentence is the closest prediction in the model.")
 
1
  #Import the libraries we know we'll need for the Generator.
2
+ import pandas as pd, spacy, nltk, numpy as np, re
3
  from spacy.matcher import Matcher
4
  nlp = spacy.load("en_core_web_lg")
 
5
 
6
  #Import the libraries to support the model and predictions.
7
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
 
34
 
35
  @st.experimental_singleton
36
  def prepare_lists():
 
 
37
  countries = pd.read_csv("Assets/Countries/combined-countries.csv")
38
  professions = pd.read_csv("Assets/Professions/soc-professions-2018.csv")
39
+ word_lists = [list(countries.Words.apply(lambda x: x.lower())),list(professions.Words)]
40
  return countries, professions, word_lists
41
 
42
  #Provide all the functions necessary to run the app
43
  #get definitions for control flow in Streamlit
44
  def get_def(word, POS=False):
45
  pos_options = ['NOUN','VERB','ADJ','ADV']
46
+ m_word = re.sub("(\W\s|\s)","_",word)
47
  if POS in pos_options:
48
  seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS))]
49
  else:
50
  seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word)]
51
+ if len(seed_definitions) > 0:
52
+ seed_definition = col1.selectbox("Which definition is most relevant?", seed_definitions, key= "WN_definition")
53
+ if col1.button("Choose Definition"):
54
+ col1.write("You've chosen a definition.")
55
+ st.session_state.definition = seed_definition
56
+ return seed_definition
57
+ else:
58
+ col1.write("Please choose a definition.")
59
  else:
60
+ col1.error("The word you've chosen does not have a definition within WordNet.")
61
 
62
  ###Start coding the actual app###
63
  st.set_page_config(layout="wide", page_title="Natural Language Counterfactuals (NLC)")
 
124
 
125
  #Allow the user to pick an option to generate counterfactuals from.
126
  option = st.radio('Which word would you like to use to generate alternatives?', options, key = "option")
127
+ lc_option = option.lower()
128
+ if (any(lc_option in sublist for sublist in word_lists)):
129
  st.write(f'You selected {option}. It matches a list.')
130
  elif option:
131
  st.write(f'You selected {option}. It does not match a list.')
 
134
  st.write('Awaiting your selection.')
135
 
136
  if st.button('Generate Alternatives'):
137
+ if lc_option in word_lists[0]:
138
  cf_df = gen_cf_country(countries, doc, option)
139
  st.success('Alternatives created.')
140
+ elif lc_option in word_lists[1]:
141
  cf_df = gen_cf_profession(professions, doc, option)
142
  st.success('Alternatives created.')
143
  else:
 
199
 
200
  #Allow the user to pick an option to generate counterfactuals from.
201
  option = st.radio('Which word would you like to use to generate alternatives?', options, key = "option")
202
+ lc_option = option.lower()
203
+ if (any(lc_option in sublist for sublist in word_lists)):
204
  st.write(f'You selected {option}. It matches a list.')
205
  elif option:
206
  st.write(f'You selected {option}. It does not match a list.')
 
209
  st.write('Awaiting your selection.')
210
 
211
  if st.button('Generate Alternatives'):
212
+ if lc_option in word_lists[0]:
213
  cf_df = gen_cf_country(countries, doc, option)
214
  st.success('Alternatives created.')
215
+ elif lc_option in word_lists[1]:
216
  cf_df = gen_cf_profession(professions, doc, option)
217
  st.success('Alternatives created.')
218
  else:
 
280
 
281
  #Allow the user to pick an option to generate counterfactuals from.
282
  option = st.radio('Which word would you like to use to generate alternatives?', options, key = "option")
283
+ lc_option = option.lower()
284
+ if (any(lc_option in sublist for sublist in word_lists)):
285
  st.write(f'You selected {option}. It matches a list.')
286
  elif option:
287
  st.write(f'You selected {option}. It does not match a list.')
 
290
  st.write('Awaiting your selection.')
291
 
292
  if st.button('Generate Alternatives'):
293
+ if lc_option in word_lists[0]:
294
  cf_df = gen_cf_country(countries, doc, option)
295
  st.success('Alternatives created.')
296
+ elif lc_option in word_lists[1]:
297
  cf_df = gen_cf_profession(professions, doc, option)
298
  st.success('Alternatives created.')
299
  else:
 
306
  if alt_choice == "Similarity":
307
  text2, text3 = get_min_max(cf_df, option)
308
  col2.caption(f"This sentence is 'similar' to {option}.")
309
+ col3.caption(f"This sentence is 'not similar' to {option}.")
310
  elif alt_choice == "Sampling (Random)":
311
  text2, text3 = sampled_alts(cf_df, option)
312
  col2.caption(f"This sentence is a random sample from the alternatives.")
313
+ col3.caption(f"This sentence is a random sample from the alternatives.")
314
  elif alt_choice == "Sampling (Fixed)":
315
  text2, text3 = sampled_alts(cf_df, option, fixed=True)
316
  col2.caption(f"This sentence is a fixed sample of the alternatives.")
317
+ col3.caption(f"This sentence is a fixed sample of the alternatives.")
318
  elif alt_choice == "Probability":
319
  text2, text3 = abs_dif(cf_df, option)
320
  col2.caption(f"This sentence is the closest prediction in the model.")
backup-app.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Import the libraries we know we'll need for the Generator.
2
+ import pandas as pd, spacy, nltk, numpy as np
3
+ from spacy.matcher import Matcher
4
+ nlp = spacy.load("en_core_web_lg")
5
+ from nltk.corpus import wordnet
6
+
7
+ #Import the libraries to support the model and predictions.
8
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
9
+ import lime
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from lime.lime_text import LimeTextExplainer
13
+
14
+ #Import the libraries for human interaction and visualization.
15
+ import altair as alt
16
+ import streamlit as st
17
+ from annotated_text import annotated_text as ant
18
+
19
+ #Import functions needed to build dataframes of keywords from WordNet
20
+ from WNgen import *
21
+ from NLselector import *
22
+
23
+ @st.experimental_singleton
24
+ def set_up_explainer():
25
+ class_names = ['negative', 'positive']
26
+ explainer = LimeTextExplainer(class_names=class_names)
27
+ return explainer
28
+
29
+ @st.experimental_singleton
30
+ def prepare_model():
31
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
32
+ model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
33
+ pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
34
+ return tokenizer, model, pipe
35
+
36
+ @st.experimental_singleton
37
+ def prepare_lists():
38
+ nltk.download('omw-1.4')
39
+ nltk.download('wordnet')
40
+ countries = pd.read_csv("Assets/Countries/combined-countries.csv")
41
+ professions = pd.read_csv("Assets/Professions/soc-professions-2018.csv")
42
+ word_lists = [list(countries.Words),list(professions.Words)]
43
+ return countries, professions, word_lists
44
+
45
+ #Provide all the functions necessary to run the app
46
+ #get definitions for control flow in Streamlit
47
+ def get_def(word, POS=False):
48
+ pos_options = ['NOUN','VERB','ADJ','ADV']
49
+ m_word = word.replace(" ", "_")
50
+ if POS in pos_options:
51
+ seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word, pos=getattr(wordnet, POS))]
52
+ else:
53
+ seed_definitions = [syn.definition() for syn in wordnet.synsets(m_word)]
54
+ seed_definition = col1.selectbox("Which definition is most relevant?", seed_definitions, key= "WN_definition")
55
+ if col1.button("Choose Definition"):
56
+ col1.write("You've chosen a definition.")
57
+ st.session_state.definition = seed_definition
58
+ return seed_definition
59
+ else:
60
+ col1.write("Please choose a definition.")
61
+
62
+ ###Start coding the actual app###
63
+ st.set_page_config(layout="wide", page_title="Natural Language Counterfactuals (NLC)")
64
+ layouts = ['Natural Language Explanation', 'Lime Explanation', 'MultiNLC', 'MultiNLC + Lime', 'VizNLC']
65
+ alternatives = ['Similarity', 'Sampling (Random)', 'Sampling (Fixed)', 'Probability']
66
+ alt_choice = "Similarity"
67
+
68
+ #Content in the Sidebar.
69
+ st.sidebar.info('This is an interface for exploring how different interfaces for exploring natural language explanations (NLE) may appear to people. It is intended to allow individuals to provide feedback on specific versions, as well as to compare what one offers over others for the same inputs.')
70
+ layout = st.sidebar.selectbox("Select a layout to explore.", layouts)
71
+ alt_choice = st.sidebar.selectbox("Choose the way you want to display alternatives.", alternatives) #Commented out until we decide this is useful functionality.
72
+
73
+ #Set up the Main Area Layout
74
+ st.title('Natural Language Counterfactuals (NLC) Prototype')
75
+ st.subheader(f'Current Layout: {layout}')
76
+ text = st.text_input('Provide a sentence you want to evaluate.', placeholder = "I like you. I love you.", key="input")
77
+
78
+ #Prepare the model, data, and Lime. Set starting variables.
79
+ tokenizer, model, pipe = prepare_model()
80
+ countries, professions, word_lists = prepare_lists()
81
+ explainer = set_up_explainer()
82
+ text2 = ""
83
+ text3 = ""
84
+ cf_df = pd.DataFrame()
85
+ if 'definition' not in st.session_state:
86
+ st.session_state.definition = "<(^_')>"
87
+
88
+ #Outline the various user interfaces we have built.
89
+
90
+ col1, col2, col3 = st.columns(3)
91
+ if layout == 'Natural Language Explanation':
92
+ with col1:
93
+ if st.session_state.input != "":
94
+ st.caption("This is the sentence you provided.")
95
+ st.write(text)
96
+ probability, sentiment = eval_pred(text, return_all=True)
97
+ nat_lang_explanation = construct_nlexp(text,sentiment,probability)
98
+
99
+ if layout == 'Lime Explanation':
100
+ with col1:
101
+ #Use spaCy to make the sentence into a doc so we can do NLP.
102
+ doc = nlp(st.session_state.input)
103
+ #Evaluate the provided sentence for sentiment and probability.
104
+ if st.session_state.input != "":
105
+ st.caption("This is the sentence you provided.")
106
+ st.write(text)
107
+ probability, sentiment = eval_pred(text, return_all=True)
108
+ options, lime = critical_words(st.session_state.input,options=True)
109
+ nat_lang_explanation = construct_nlexp(text,sentiment,probability)
110
+ st.write(" ")
111
+ st.altair_chart(lime_viz(lime))
112
+
113
+ if layout == 'MultiNLC':
114
+ with col1:
115
+ #Use spaCy to make the sentence into a doc so we can do NLP.
116
+ doc = nlp(st.session_state.input)
117
+ #Evaluate the provided sentence for sentiment and probability.
118
+ if st.session_state.input != "":
119
+ st.caption("This is the sentence you provided.")
120
+ st.write(text)
121
+ probability, sentiment = eval_pred(text, return_all=True)
122
+ options, lime = critical_words(st.session_state.input,options=True)
123
+ nat_lang_explanation = construct_nlexp(text,sentiment,probability)
124
+
125
+ #Allow the user to pick an option to generate counterfactuals from.
126
+ option = st.radio('Which word would you like to use to generate alternatives?', options, key = "option")
127
+ if (any(option in sublist for sublist in word_lists)):
128
+ st.write(f'You selected {option}. It matches a list.')
129
+ elif option:
130
+ st.write(f'You selected {option}. It does not match a list.')
131
+ definition = get_def(option)
132
+ else:
133
+ st.write('Awaiting your selection.')
134
+
135
+ if st.button('Generate Alternatives'):
136
+ if option in list(countries.Words):
137
+ cf_df = gen_cf_country(countries, doc, option)
138
+ st.success('Alternatives created.')
139
+ elif option in list(professions.Words):
140
+ cf_df = gen_cf_profession(professions, doc, option)
141
+ st.success('Alternatives created.')
142
+ else:
143
+ with st.sidebar:
144
+ ant("Generating alternatives for",(option,"opt","#E0FBFB"), "with a definition of: ",(st.session_state.definition,"def","#E0FBFB"),".")
145
+ cf_df = cf_from_wordnet_df(option,text,seed_definition=st.session_state.definition)
146
+ st.success('Alternatives created.')
147
+
148
+ if len(cf_df) != 0:
149
+ if alt_choice == "Similarity":
150
+ text2, text3 = get_min_max(cf_df, option)
151
+ col2.caption(f"This sentence is 'similar' to {option}.")
152
+ col3.caption(f"This sentence is 'not similar' to {option}.")
153
+ elif alt_choice == "Sampling (Random)":
154
+ text2, text3 = sampled_alts(cf_df, option)
155
+ col2.caption(f"This sentence is a random sample from the alternatives.")
156
+ col3.caption(f"This sentence is a random sample from the alternatives.")
157
+ elif alt_choice == "Sampling (Fixed)":
158
+ text2, text3 = sampled_alts(cf_df, option, fixed=True)
159
+ col2.caption(f"This sentence is a fixed sample of the alternatives.")
160
+ col3.caption(f"This sentence is a fixed sample of the alternatives.")
161
+ elif alt_choice == "Probability":
162
+ text2, text3 = abs_dif(cf_df, option)
163
+ col2.caption(f"This sentence is the closest prediction in the model.")
164
+ col3.caption(f"This sentence is the farthest prediction in the model.")
165
+ with st.sidebar:
166
+ st.info(f"Alternatives generated: {len(cf_df)}")
167
+
168
+ with col2:
169
+ if text2 != "":
170
+ sim2 = cf_df.loc[cf_df['text'] == text2, 'similarity'].iloc[0]
171
+ st.write(text2)
172
+ probability2, sentiment2 = eval_pred(text2, return_all=True)
173
+ nat_lang_explanation = construct_nlexp(text2,sentiment2,probability2)
174
+ #st.info(f" Similarity Score: {np.round(sim2, 2)}, Num Checked: {len(cf_df)}") #for QA purposes
175
+
176
+ with col3:
177
+ if text3 != "":
178
+ sim3 = cf_df.loc[cf_df['text'] == text3, 'similarity'].iloc[0]
179
+ st.write(text3)
180
+ probability3, sentiment3 = eval_pred(text3, return_all=True)
181
+ nat_lang_explanation = construct_nlexp(text3,sentiment3,probability3)
182
+ #st.info(f"Similarity Score: {np.round(sim3, 2)}, Num Checked: {len(cf_df)}") #for QA purposes
183
+
184
if layout == 'MultiNLC + Lime':
    # MultiNLC + Lime layout: col1 shows the sentence the user provided with
    # its LIME chart and lets them pick a word to vary; col2 and col3 each
    # show one generated alternative sentence with its own LIME chart.
    # NOTE(review): this block was recovered from a flattened diff view, so the
    # nesting below is reconstructed from statement order -- confirm it against
    # the original file before merging.
    with col1:
        # Use spaCy to make the sentence into a doc so we can do NLP.
        doc = nlp(st.session_state.input)
        # Evaluate the provided sentence for sentiment and probability.
        if st.session_state.input != "":
            st.caption("This is the sentence you provided.")
            st.write(text)
            probability, sentiment = eval_pred(text, return_all=True)
            options, lime = critical_words(st.session_state.input, options=True)
            # The return value is never read in this layout; the call is kept
            # because it presumably renders the explanation itself -- confirm.
            construct_nlexp(text, sentiment, probability)
            st.write(" ")
            st.altair_chart(lime_viz(lime))

            # Allow the user to pick an option to generate counterfactuals from.
            option = st.radio(
                'Which word would you like to use to generate alternatives?',
                options,
                key="option",
            )
            if any(option in sublist for sublist in word_lists):
                st.write(f'You selected {option}. It matches a list.')
            elif option:
                st.write(f'You selected {option}. It does not match a list.')
                # Result is not bound: the code below reads the definition from
                # st.session_state.definition, so get_def presumably stores it
                # there -- verify against get_def.
                get_def(option)
            else:
                st.write('Awaiting your selection.')

            if st.button('Generate Alternatives'):
                if option in list(countries.Words):
                    cf_df = gen_cf_country(countries, doc, option)
                elif option in list(professions.Words):
                    cf_df = gen_cf_profession(professions, doc, option)
                else:
                    with st.sidebar:
                        ant("Generating alternatives for", (option, "opt", "#E0FBFB"), "with a definition of: ", (st.session_state.definition, "def", "#E0FBFB"), ".")
                    cf_df = cf_from_wordnet_df(option, text, seed_definition=st.session_state.definition)
                # Every generator branch reported success, so report it once.
                st.success('Alternatives created.')

                if len(cf_df) != 0:
                    # Display mode -> (pair selector, col2 caption, col3 caption).
                    # Selectors are wrapped in lambdas so only the chosen one runs.
                    alt_modes = {
                        "Similarity": (
                            lambda: get_min_max(cf_df, option),
                            f"This sentence is 'similar' to {option}.",
                            f"This sentence is 'not similar' to {option}.",
                        ),
                        "Sampling (Random)": (
                            lambda: sampled_alts(cf_df, option),
                            "This sentence is a random sample from the alternatives.",
                            "This sentence is a random sample from the alternatives.",
                        ),
                        "Sampling (Fixed)": (
                            lambda: sampled_alts(cf_df, option, fixed=True),
                            "This sentence is a fixed sample of the alternatives.",
                            "This sentence is a fixed sample of the alternatives.",
                        ),
                        "Probability": (
                            lambda: abs_dif(cf_df, option),
                            "This sentence is the closest prediction in the model.",
                            "This sentence is the farthest prediction in the model.",
                        ),
                    }
                    # An unrecognised alt_choice leaves text2/text3 untouched,
                    # exactly as the original if/elif chain did.
                    if alt_choice in alt_modes:
                        pick_pair, caption2, caption3 = alt_modes[alt_choice]
                        text2, text3 = pick_pair()
                        col2.caption(caption2)
                        col3.caption(caption3)
                    with st.sidebar:
                        st.info(f"Alternatives generated: {len(cf_df)}")

    # col2 and col3 render their alternative identically: the sentence, its
    # prediction/explanation, then a LIME chart. The former sim2/sim3 lookups
    # were removed: their values were never used here, and `.iloc[0]` on the
    # filtered frame raises IndexError when no row matches the sentence.
    for column, alternative in ((col2, text2), (col3, text3)):
        with column:
            if alternative != "":
                st.write(alternative)
                alt_probability, alt_sentiment = eval_pred(alternative, return_all=True)
                construct_nlexp(alternative, alt_sentiment, alt_probability)
                explanation = explainer.explain_instance(alternative, predictor, num_features=15, num_samples=2000)
                st.write(" ")
                st.altair_chart(lime_viz(explanation.as_list()))
263
+
264
if layout == 'VizNLC':
    # VizNLC layout: col1 shows the sentence the user provided with its LIME
    # chart and lets them pick a word to vary; col2 shows one generated
    # alternative with its LIME chart; col3 plots every generated alternative.
    # NOTE(review): this block was recovered from a flattened diff view, so the
    # nesting below is reconstructed from statement order -- confirm it against
    # the original file before merging.
    with col1:
        # Use spaCy to make the sentence into a doc so we can do NLP.
        doc = nlp(st.session_state.input)
        # Evaluate the provided sentence for sentiment and probability.
        if st.session_state.input != "":
            st.caption("This is the sentence you provided.")
            st.write(text)
            probability, sentiment = eval_pred(text, return_all=True)
            options, lime = critical_words(st.session_state.input, options=True)
            # The return value is never read in this layout; the call is kept
            # because it presumably renders the explanation itself -- confirm.
            construct_nlexp(text, sentiment, probability)
            st.write(" ")
            st.altair_chart(lime_viz(lime))

            # Allow the user to pick an option to generate counterfactuals from.
            option = st.radio(
                'Which word would you like to use to generate alternatives?',
                options,
                key="option",
            )
            if any(option in sublist for sublist in word_lists):
                st.write(f'You selected {option}. It matches a list.')
            elif option:
                st.write(f'You selected {option}. It does not match a list.')
                # Result is not bound: the code below reads the definition from
                # st.session_state.definition, so get_def presumably stores it
                # there -- verify against get_def.
                get_def(option)
            else:
                st.write('Awaiting your selection.')

            if st.button('Generate Alternatives'):
                if option in list(countries.Words):
                    cf_df = gen_cf_country(countries, doc, option)
                elif option in list(professions.Words):
                    cf_df = gen_cf_profession(professions, doc, option)
                else:
                    with st.sidebar:
                        ant("Generating alternatives for", (option, "opt", "#E0FBFB"), "with a definition of: ", (st.session_state.definition, "def", "#E0FBFB"), ".")
                    cf_df = cf_from_wordnet_df(option, text, seed_definition=st.session_state.definition)
                # Every generator branch reported success, so report it once.
                st.success('Alternatives created.')

                if len(cf_df) != 0:
                    # Display mode -> (pair selector, col2 caption). Selectors
                    # are wrapped in lambdas so only the chosen one runs.
                    alt_modes = {
                        "Similarity": (
                            lambda: get_min_max(cf_df, option),
                            f"This sentence is 'similar' to {option}.",
                        ),
                        "Sampling (Random)": (
                            lambda: sampled_alts(cf_df, option),
                            "This sentence is a random sample from the alternatives.",
                        ),
                        "Sampling (Fixed)": (
                            lambda: sampled_alts(cf_df, option, fixed=True),
                            "This sentence is a fixed sample of the alternatives.",
                        ),
                        "Probability": (
                            lambda: abs_dif(cf_df, option),
                            "This sentence is the closest prediction in the model.",
                        ),
                    }
                    # An unrecognised alt_choice leaves text2/text3 untouched and
                    # writes no captions, exactly as the original chain did.
                    if alt_choice in alt_modes:
                        pick_pair, caption2 = alt_modes[alt_choice]
                        # text3 is still assigned (the selectors return a pair)
                        # even though this layout only displays text2.
                        text2, text3 = pick_pair()
                        col2.caption(caption2)
                        # The col3 caption was identical for every mode.
                        col3.caption(f"This graph represents the {len(cf_df)} alternatives to {option}.")
                    with st.sidebar:
                        st.info(f"Alternatives generated: {len(cf_df)}")

    with col2:
        if text2 != "":
            # The former sim2 lookup was removed: its value was never used
            # here, and `.iloc[0]` on the filtered frame raises IndexError when
            # no row matches the sentence.
            st.write(text2)
            alt_probability, alt_sentiment = eval_pred(text2, return_all=True)
            construct_nlexp(text2, alt_sentiment, alt_probability)
            explanation = explainer.explain_instance(text2, predictor, num_features=15, num_samples=2000)
            st.write(" ")
            st.altair_chart(lime_viz(explanation.as_list()))

    with col3:
        if not cf_df.empty:
            # Scatter of all alternatives: similarity on x, prediction on y,
            # coloured by category and sized by the `seed` column.
            # NOTE(review): selection_single/add_selection are deprecated in
            # Altair 5 in favour of selection_point/add_params; kept as-is
            # because the Altair version this app pins is not visible here.
            single_nearest = alt.selection_single(on='mouseover', nearest=True)
            full = alt.Chart(cf_df).encode(
                alt.X('similarity:Q', scale=alt.Scale(zero=False)),
                alt.Y('pred:Q'),
                color=alt.Color('Categories:N', legend=alt.Legend(title="Color of Categories")),
                size=alt.Size('seed:O'),
                tooltip=['Categories', 'text', 'pred'],
            ).mark_circle(opacity=.5).properties(width=450, height=450).add_selection(single_nearest)
            st.altair_chart(full)