Nathan Butters committed on
Commit
bdb6cd4
·
1 Parent(s): 9d6f821

abs_diff attempt 0

Browse files
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.ipynb_checkpoints/NLselector-checkpoint.py CHANGED
@@ -1,7 +1,6 @@
1
  #Import the libraries we know we'll need for the Generator.
2
  import pandas as pd, spacy, nltk, numpy as np, re
3
  from spacy.matcher import Matcher
4
- #!python -m spacy download en_core_web_md #Not sure if we need this so I'm going to keep it just in case
5
  nlp = spacy.load("en_core_web_lg")
6
  import altair as alt
7
  import streamlit as st
@@ -14,6 +13,9 @@ import torch
14
  import torch.nn.functional as F
15
  from lime.lime_text import LimeTextExplainer
16
 
 
 
 
17
  class_names = ['negative', 'positive']
18
  explainer = LimeTextExplainer(class_names=class_names)
19
  tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
@@ -27,6 +29,10 @@ def predictor(texts):
27
 
28
  @st.experimental_singleton
29
  def critical_words(document, options=False):
 
 
 
 
30
  if type(document) is not spacy.tokens.doc.Doc:
31
  document = nlp(document)
32
  chunks = list(document.noun_chunks)
@@ -43,6 +49,31 @@ def critical_words(document, options=False):
43
  lime_results = pd.DataFrame(lime_results, columns=["Word","Weight"])
44
 
45
  #Identify what we care about "parts of speech"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  for chunk in chunks:
47
  #The use of chunk[-1] is due to testing that it appears to always match the root
48
  root = chunk[-1]
@@ -58,7 +89,7 @@ def critical_words(document, options=False):
58
  #creates a span for the entirety of the compound noun and adds it to the list.
59
  span = -1 * (1 + len(compound))
60
  pos_options.append(chunk[span:].text)
61
- cur_values + [token.text for token in chunk if token.pos_ == "ADJ"]
62
  else:
63
  print(f"The elmenents in {compound} could not be added to the final list because they are not all relevant to the model.")
64
  else:
@@ -67,21 +98,14 @@ def critical_words(document, options=False):
67
  pos_options.extend(cur_values)
68
  print(f"From {chunk.text}, {cur_values} added to pos_options due to entity recognition.") #for QA
69
  elif len(chunk) >= 1:
70
- cur_values = [token.text for token in chunk if token.pos_ in ["NOUN","ADJ"]]
71
  if (all(elem in lime_options for elem in cur_values) and (options is True)) or ((options is False)):
72
  pos_options.extend(cur_values)
73
  print(f"From {chunk.text}, {cur_values} added to pos_options due to wildcard.") #for QA
74
  else:
75
  print(f"No options added for \'{chunk.text}\' ")
76
- # Here I am going to try to pick up pronouns, which are people, and Adjectival Compliments.
77
- for token in document:
78
- if (token.text not in pos_options) and ((token.text in lime_options) or (options == False)):
79
- #print(f"executed {token.text} with {token.pos_} and {token.dep_}") #QA
80
- if (token.pos_ == "ADJ") and (token.dep_ in ["acomp","conj"]):
81
- pos_options.append(token.text)
82
- elif (token.pos_ == "PRON") and (len(token.morph) !=0):
83
- if (token.morph.get("PronType") == "Prs"):
84
- pos_options.append(token.text)
85
 
86
  if options:
87
  return pos_options, lime_results
@@ -157,6 +181,7 @@ def abs_dif(df,seed):
157
 
158
  text2 = Nearest Prediction
159
  text3 = Farthest Prediction'''
 
160
  target = df[df['Words'] == seed].pred.iloc[0]
161
  sub_df = df[df['Words'] != seed].reset_index()
162
  nearest_prediction = sub_df.pred[(sub_df.pred-target).abs().argsort()[:1]]
@@ -186,7 +211,6 @@ def gen_cf_country(df,_document,selection):
186
  df['similarity'] = df.Words.apply(lambda x: nlp(selection).similarity(nlp(x)))
187
  return df
188
 
189
-
190
  def gen_cf_profession(df,_document,selection):
191
  category = df.loc[df['Words'] == selection, 'Major'].iloc[0]
192
  df = df[df.Major == category]
 
1
  #Import the libraries we know we'll need for the Generator.
2
  import pandas as pd, spacy, nltk, numpy as np, re
3
  from spacy.matcher import Matcher
 
4
  nlp = spacy.load("en_core_web_lg")
5
  import altair as alt
6
  import streamlit as st
 
13
  import torch.nn.functional as F
14
  from lime.lime_text import LimeTextExplainer
15
 
16
+ #Import WNgen.py
17
+ from WNgen import *
18
+
19
  class_names = ['negative', 'positive']
20
  explainer = LimeTextExplainer(class_names=class_names)
21
  tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
 
29
 
30
  @st.experimental_singleton
31
  def critical_words(document, options=False):
32
+ '''This function is meant to select the critical part of a sentence. Critical, in this context means
33
+ the part of the sentence that is either: A) a NOUN or PROPN from the correct entity group, B) a NOUN,
34
+ C) a NOUN + ADJ combination, or D) ADJ and PROPN used to modify other NOUN tokens.
35
+ It also checks this against what the model thinks is important if the user defines "options" as "LIME" or True.'''
36
  if type(document) is not spacy.tokens.doc.Doc:
37
  document = nlp(document)
38
  chunks = list(document.noun_chunks)
 
49
  lime_results = pd.DataFrame(lime_results, columns=["Word","Weight"])
50
 
51
  #Identify what we care about "parts of speech"
52
+
53
+ # Here I am going to try to pick up pronouns, which are people, and Adjectival Compliments.
54
+ for token in document:
55
+ if (token.text not in pos_options) and ((token.text in lime_options) or (options == False)):
56
+ #print(f"executed {token.text} with {token.pos_} and {token.dep_}") #QA
57
+ if (token.pos_ in ["ADJ","PROPN"]) and (token.dep_ in ["compound", "amod"]) and (document[token.i - 1].dep_ in ["compound", "amod"]):
58
+ compound = document[token.i - 1: token.i +1].text
59
+ pos_options.append(compound)
60
+ print(f'Added {compound} based on "amod" and "compound" adjectives.')
61
+ elif (token.pos_ in ["NOUN"]) and (token.dep_ in ["compound", "amod", "conj"]) and (document[token.i - 1].dep_ in ["compound"]):
62
+ compound = document[token.i - 1: token.i +1].text
63
+ pos_options.append(compound)
64
+ print(f'Added {compound} based on "amod" and "compound" and "conj" nouns.')
65
+ elif (token.pos_ == "PROPN") and (token.dep_ in ["prep","amod"]):
66
+ pos_options.append(token.text)
67
+ print(f"Added '{token.text}' based on their adjectival state.")
68
+ elif (token.pos_ == "ADJ") and (token.dep_ in ["acomp","conj","amod"]):
69
+ pos_options.append(token.text)
70
+ print(f"Added '{token.text}' based on their adjectival state.")
71
+ elif (token.pos_ == "PRON") and (len(token.morph) !=0):
72
+ if (token.morph.get("PronType") == "Prs"):
73
+ pos_options.append(token.text)
74
+ print(f"Added '{token.text}' because it's a human pronoun.")
75
+
76
+ #Noun Chunks parsing
77
  for chunk in chunks:
78
  #The use of chunk[-1] is due to testing that it appears to always match the root
79
  root = chunk[-1]
 
89
  #creates a span for the entirety of the compound noun and adds it to the list.
90
  span = -1 * (1 + len(compound))
91
  pos_options.append(chunk[span:].text)
92
+ cur_values + [token.text for token in chunk if token.pos_ in ["ADJ","NOUN","PROPN"]]
93
  else:
94
  print(f"The elmenents in {compound} could not be added to the final list because they are not all relevant to the model.")
95
  else:
 
98
  pos_options.extend(cur_values)
99
  print(f"From {chunk.text}, {cur_values} added to pos_options due to entity recognition.") #for QA
100
  elif len(chunk) >= 1:
101
+ cur_values = [token.text for token in chunk if token.pos_ in ["NOUN","ADJ","PROPN"]]
102
  if (all(elem in lime_options for elem in cur_values) and (options is True)) or ((options is False)):
103
  pos_options.extend(cur_values)
104
  print(f"From {chunk.text}, {cur_values} added to pos_options due to wildcard.") #for QA
105
  else:
106
  print(f"No options added for \'{chunk.text}\' ")
107
+
108
+ pos_options = list(set(pos_options))
 
 
 
 
 
 
 
109
 
110
  if options:
111
  return pos_options, lime_results
 
181
 
182
  text2 = Nearest Prediction
183
  text3 = Farthest Prediction'''
184
+ #seed = process_text(seed)
185
  target = df[df['Words'] == seed].pred.iloc[0]
186
  sub_df = df[df['Words'] != seed].reset_index()
187
  nearest_prediction = sub_df.pred[(sub_df.pred-target).abs().argsort()[:1]]
 
211
  df['similarity'] = df.Words.apply(lambda x: nlp(selection).similarity(nlp(x)))
212
  return df
213
 
 
214
  def gen_cf_profession(df,_document,selection):
215
  category = df.loc[df['Words'] == selection, 'Major'].iloc[0]
216
  df = df[df.Major == category]
Assets/.DS_Store CHANGED
Binary files a/Assets/.DS_Store and b/Assets/.DS_Store differ
 
NLselector.py CHANGED
@@ -181,7 +181,7 @@ def abs_dif(df,seed):
181
 
182
  text2 = Nearest Prediction
183
  text3 = Farthest Prediction'''
184
- seed = process_text(seed)
185
  target = df[df['Words'] == seed].pred.iloc[0]
186
  sub_df = df[df['Words'] != seed].reset_index()
187
  nearest_prediction = sub_df.pred[(sub_df.pred-target).abs().argsort()[:1]]
 
181
 
182
  text2 = Nearest Prediction
183
  text3 = Farthest Prediction'''
184
+ #seed = process_text(seed)
185
  target = df[df['Words'] == seed].pred.iloc[0]
186
  sub_df = df[df['Words'] != seed].reset_index()
187
  nearest_prediction = sub_df.pred[(sub_df.pred-target).abs().argsort()[:1]]