smhavens committed on
Commit
f544dbd
·
1 Parent(s): 862ca42

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +272 -267
app.py CHANGED
@@ -1,268 +1,273 @@
1
- import gradio as gr
2
- import math
3
- import spacy
4
- from datasets import load_dataset
5
- from sentence_transformers import SentenceTransformer
6
- from sentence_transformers import InputExample
7
- from sentence_transformers import losses
8
- from sentence_transformers import util
9
- from transformers import pipeline
10
- from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
11
- from transformers import TrainingArguments, Trainer
12
- import torch
13
- import torch.nn.functional as F
14
- from torch.utils.data import DataLoader
15
- import numpy as np
16
- import evaluate
17
- import nltk
18
- from nltk.corpus import stopwords
19
- import subprocess
20
- import sys
21
- import random
22
-
23
- # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
24
- subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
25
- # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
26
- model_base = "bert-analogies"
27
- nltk.download('stopwords')
28
- nlp = spacy.load("en_core_web_sm")
29
- stops = stopwords.words("english")
30
- ROMAN_CONSTANTS = (
31
- ( "", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX" ),
32
- ( "", "X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC" ),
33
- ( "", "C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM" ),
34
- ( "", "M", "MM", "MMM", "", "", "-", "", "", "" ),
35
- ( "", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix" ),
36
- ( "", "x", "xx", "xxx", "xl", "l", "lx", "lxx", "lxxx", "xc" ),
37
- ( "", "c", "cc", "ccc", "cd", "d", "dc", "dcc", "dccc", "cm" ),
38
- ( "", "m", "mm", "mmm", "", "", "-", "", "", "" ),
39
- )
40
-
41
- # answer = "Pizza"
42
- guesses = []
43
- return_guesses = []
44
- answer = "Moon"
45
- word1 = "Black"
46
- word2 = "White"
47
- word3 = "Sun"
48
-
49
-
50
- #Mean Pooling - Take attention mask into account for correct averaging
51
- def mean_pooling(model_output, attention_mask):
52
- token_embeddings = model_output['token_embeddings'] #First element of model_output contains all token embeddings
53
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
54
- return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
55
-
56
-
57
- def normalize(comment, lowercase, remove_stopwords):
58
- if lowercase:
59
- comment = comment.lower()
60
- comment = nlp(comment)
61
- lemmatized = list()
62
- for word in comment:
63
- lemma = word.lemma_.strip()
64
- if lemma:
65
- if not remove_stopwords or (remove_stopwords and lemma not in stops):
66
- lemmatized.append(lemma)
67
- return " ".join(lemmatized)
68
-
69
-
70
- # def tokenize_function(examples):
71
- # return tokenizer(examples["text"])
72
-
73
-
74
- def compute_metrics(eval_pred):
75
- logits, labels = eval_pred
76
- predictions = np.argmax(logits, axis=-1)
77
- metric = evaluate.load("accuracy")
78
- return metric.compute(predictions=predictions, references=labels)
79
-
80
-
81
- def get_model():
82
- global model_base
83
- model = SentenceTransformer(model_base)
84
- gpu_available = torch.cuda.is_available()
85
- device = torch.device("cuda" if gpu_available else "cpu")
86
- model = model.to(device)
87
- return model
88
-
89
-
90
- def cosine_scores(model, sentence):
91
- global word1
92
- global word2
93
- global word3
94
- # sentence1 = f"{word1} is to {word2} as"
95
- embeddings1 = model.encode(sentence, convert_to_tensor=True)
96
-
97
- def embeddings(model, sentences):
98
- gpu_available = torch.cuda.is_available()
99
- device = torch.device("cuda" if gpu_available else "cpu")
100
- # device = torch.device('cuda:0')
101
- embeddings = model.encode(sentences)
102
- global word1
103
- global word2
104
- global word3
105
- global model_base
106
-
107
- # Load model from HuggingFace Hub
108
- tokenizer = AutoTokenizer.from_pretrained(model_base)
109
- encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
110
- # token_ids = tokenizer.encode(sentences, return_tensors='pt')
111
- # blank_id = tokenizer.mask_token_id
112
- # blank_id_idx = torch.where(encoded_input["input_ids"] == blank_id)[1]
113
-
114
- encoded_input["input_ids"] = encoded_input["input_ids"].to(device)
115
- encoded_input["attention_mask"] = encoded_input["attention_mask"].to(device)
116
- encoded_input['token_type_ids'] = encoded_input['token_type_ids'].to(device)
117
-
118
- encoded_input['input'] = {'input_ids':encoded_input['input_ids'], 'attention_mask':encoded_input['attention_mask']}
119
-
120
- del encoded_input['input_ids']
121
- del encoded_input['token_type_ids']
122
- del encoded_input['attention_mask']
123
-
124
- with torch.no_grad():
125
- # output = model(encoded_input)
126
- print(encoded_input)
127
- model_output = model(**encoded_input)
128
- # output = model(encoded_input_topk)
129
-
130
- unmasker = pipeline('fill-mask', model=model_base)
131
- guesses = unmasker(sentences)
132
- print(guesses)
133
-
134
- # Perform pooling
135
- sentence_embeddings = mean_pooling(model_output, encoded_input['input']["attention_mask"])
136
-
137
- # Normalize embeddings
138
- sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
139
-
140
- potential_words = []
141
- for guess in guesses:
142
- temp_word = guess['token_str']
143
- if temp_word[0].isalpha() and temp_word not in stops and temp_word not in ROMAN_CONSTANTS:
144
- potential_words.append(guess['token_str'])
145
-
146
- rand_index = random.randint(0, len(potential_words) - 1)
147
- print("THE LENGTH OF POTENTIAL WORDS FOR", sentences, "IS", len(potential_words), "AND THE RANDOM INDEX CHOSEN IS", rand_index)
148
- chosen_word = potential_words[rand_index]
149
-
150
- return chosen_word
151
-
152
-
153
- def random_word():
154
- global model_base
155
- with open(model_base + '/vocab.txt', 'r') as file:
156
- line = ""
157
- content = file.readlines()
158
- length = len(content)
159
- while line == "":
160
- rand_line = random.randrange(0, length)
161
-
162
- if content[rand_line][0].isalpha() and content[rand_line][:-1] not in stops and content[rand_line][:-1] not in ROMAN_CONSTANTS:
163
- line = content[rand_line]
164
- else:
165
- print(f"{content[rand_line]} is not alpha or is a stop word")
166
- # for num, aline in enumerate(file, 1997):
167
- # if random.randrange(num) and aline.isalpha():
168
- # continue
169
- # # elif not aline.isalpha():
170
-
171
- # line = aline
172
- print(line)
173
- return line[:-1]
174
-
175
-
176
- def generate_prompt(model):
177
- global word1
178
- global word2
179
- global word3
180
- global answer
181
- word1 = random_word()
182
- # word2 = random_word()
183
- word2 = embeddings(model, f"{word1} is to [MASK].")
184
- word3 = random_word()
185
- sentence = f"{word1} is to {word2} as {word3} is to [MASK]."
186
- print(sentence)
187
- answer = embeddings(model, sentence)
188
- print("ANSWER IS", answer)
189
- return f"# {word1} is to {word2} as {word3} is to ___."
190
- # cosine_scores(model, sentence)
191
-
192
-
193
- def greet(name):
194
- return "Hello " + name + "!!"
195
-
196
- def check_answer(guess:str):
197
- global guesses
198
- global answer
199
- global return_guesses
200
- global word1
201
- global word2
202
- global word3
203
-
204
- model = get_model()
205
- output = ""
206
- protected_guess = guess
207
- sentence = f"{word1} is to {word2} as [MASK] is to {guess}."
208
-
209
- other_word = embeddings(model, sentence)
210
- guesses.append(guess)
211
-
212
-
213
-
214
- for guess in return_guesses:
215
- output += ("- " + guess + "<br>")
216
-
217
- # output = output[:-1]
218
- prompt = f"{word1} is to {word2} as {word3} is to ___."
219
- # print("IS", protected_guess, "EQUAL TO", answer, ":", protected_guess.lower() == answer.lower())
220
-
221
- if protected_guess.lower() == answer.lower():
222
- return_guesses.append(f"{protected_guess}: {word1} is to {word2} as {word3} is to {protected_guess}.")
223
- output += f"<span style='color:green'>- {return_guesses[-1]}</span><br>"
224
- new_prompt = generate_prompt(model)
225
- return new_prompt, "Correct!", output
226
- else:
227
- return_guess = f"{protected_guess}: {word1} is to {word2} as {other_word} is to {protected_guess}."
228
- return_guesses.append(return_guess)
229
- output += ("- " + return_guess + " <br>")
230
- return prompt, "Try again!", output
231
-
232
- def main():
233
- global word1
234
- global word2
235
- global word3
236
- global answer
237
- # answer = "Moon"
238
- global guesses
239
-
240
-
241
- # num_rows, data_type, value, example, embeddings = training()
242
- # sent_embeddings = embeddings()
243
- model = get_model()
244
- generate_prompt(model)
245
-
246
- prompt = f"{word1} is to {word2} as {word3} is to ____"
247
- print(prompt)
248
- print("TESTING EMBEDDINGS")
249
- with gr.Blocks() as iface:
250
- mark_question = gr.Markdown(prompt)
251
- with gr.Tab("Guess"):
252
- text_input = gr.Textbox()
253
- text_output = gr.Textbox()
254
- text_button = gr.Button("Submit")
255
- with gr.Accordion("Open for previous guesses"):
256
- text_guesses = gr.Markdown()
257
- # with gr.Tab("Testing"):
258
- # gr.Markdown(f"""The Embeddings are {sent_embeddings}.""")
259
- text_button.click(check_answer, inputs=[text_input], outputs=[mark_question, text_output, text_guesses])
260
- # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
261
- iface.launch()
262
-
263
-
264
-
265
-
266
-
267
- if __name__ == "__main__":
 
 
 
 
 
268
  main()
 
1
+ import gradio as gr
2
+ import math
3
+ import spacy
4
+ from datasets import load_dataset
5
+ from sentence_transformers import SentenceTransformer
6
+ from sentence_transformers import InputExample
7
+ from sentence_transformers import losses
8
+ from sentence_transformers import util
9
+ from transformers import pipeline
10
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
11
+ from transformers import TrainingArguments, Trainer
12
+ import torch
13
+ import torch.nn.functional as F
14
+ from torch.utils.data import DataLoader
15
+ import numpy as np
16
+ import evaluate
17
+ import nltk
18
+ from nltk.corpus import stopwords
19
+ import subprocess
20
+ import sys
21
+ import random
22
+
23
# Install the spaCy English model at start-up; this is the script equivalent of
# the notebook command
# !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Name of the model; it is passed to SentenceTransformer, AutoTokenizer and the
# fill-mask pipeline, and is also used as the directory holding vocab.txt.
model_base = "bert-analogies"
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")
stops = stopwords.words("english")

# Roman numerals that should never be offered as puzzle words.
# NOTE: this is a tuple of ROWS (units, tens, hundreds, thousands; upper case
# first, then lower case), not a flat collection of strings. A plain
# `word in ROMAN_CONSTANTS` compares the word against whole rows and is
# therefore always False; membership checks have to look inside the rows.
ROMAN_CONSTANTS = (
    ( "", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX" ),
    ( "", "X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC" ),
    ( "", "C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM" ),
    ( "", "M", "MM", "MMM", "", "", "-", "", "", "" ),
    ( "", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix" ),
    ( "", "x", "xx", "xxx", "xl", "l", "lx", "lxx", "lxxx", "xc" ),
    ( "", "c", "cc", "ccc", "cd", "d", "dc", "dcc", "dccc", "cm" ),
    ( "", "m", "mm", "mmm", "", "", "-", "", "", "" ),
)

# Mutable game state shared by the functions below.
# answer = "Pizza"
guesses = []          # raw guesses submitted so far
return_guesses = []   # HTML/Markdown lines shown in the "previous guesses" panel
answer = "Moon"
word1 = "Black"
word2 = "White"
word3 = "Sun"

# Example analogies prepended to the fill-mask query for the second word.
# Every entry ends with a single space so the next word can be appended
# directly (two entries used to lack it, gluing "as" onto the following word).
base_prompts = ["Sun is to Moon as ", "Black is to White as ", "Atom is to Element as ",
                "Athens is to Greece as ", "Cat is to Dog as ", "Robin is to Bird as ",
                "Hunger is to Ambition as "]
51
+
52
+
53
# Mean Pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the mask.

    Args:
        model_output: Either a mapping with a ``'token_embeddings'`` entry, or
            an indexable output whose first element holds the token
            embeddings.
        attention_mask: Mask tensor; it is unsqueezed on the last axis and
            expanded to the shape of the token embeddings.

    Returns:
        The masked sum of the embeddings over dimension 1 divided by the mask
        sum over dimension 1 (clamped to at least 1e-9 so an all-zero mask
        does not divide by zero).
    """
    try:
        token_embeddings = model_output['token_embeddings']
    except (KeyError, TypeError):
        # Tuples/lists reject string indices (TypeError) and mappings without
        # the key raise KeyError: fall back to the first element.
        token_embeddings = model_output[0]

    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts
58
+
59
+
60
def normalize(comment, lowercase, remove_stopwords):
    """Lemmatize ``comment`` with spaCy and join the lemmas with spaces.

    Args:
        comment: Text to process.
        lowercase: When truthy, the text is lower-cased before parsing.
        remove_stopwords: When truthy, lemmas found in ``stops`` are dropped.

    Returns:
        The surviving, whitespace-stripped lemmas joined by single spaces.
    """
    text = comment.lower() if lowercase else comment
    stripped_lemmas = (token.lemma_.strip() for token in nlp(text))
    return " ".join(
        lemma
        for lemma in stripped_lemmas
        if lemma and not (remove_stopwords and lemma in stops)
    )
71
+
72
+
73
+ # def tokenize_function(examples):
74
+ # return tokenizer(examples["text"])
75
+
76
+
77
def compute_metrics(eval_pred):
    """Score predictions with the ``accuracy`` metric from ``evaluate``.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        (taken over the last axis of ``logits``) against ``labels``.
    """
    logits, references = eval_pred
    predicted_labels = np.argmax(logits, axis=-1)
    accuracy = evaluate.load("accuracy")
    return accuracy.compute(predictions=predicted_labels, references=references)
82
+
83
+
84
def get_model():
    """Load the SentenceTransformer named by ``model_base`` onto the best device.

    Returns:
        The model moved to ``cuda`` when CUDA is available, otherwise to ``cpu``.
    """
    sentence_model = SentenceTransformer(model_base)
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return sentence_model.to(target)
91
+
92
+
93
def cosine_scores(model, sentence):
    """Encode ``sentence`` with ``model`` and return the embedding tensor.

    Despite its name, this function does not compute any cosine similarity
    yet. Previously the encoded tensor was assigned to a local and thrown
    away, so the function always returned ``None``; it is now returned so a
    caller can score it.

    Args:
        model: Object exposing ``encode(sentence, convert_to_tensor=True)``.
        sentence: Text to encode.

    Returns:
        Whatever ``model.encode`` returns for ``sentence``.
    """
    # sentence1 = f"{word1} is to {word2} as"
    return model.encode(sentence, convert_to_tensor=True)
99
+
100
# Fill-mask pipelines keyed by model name, so the model is loaded from disk
# once instead of on every call.
_FILL_MASK_PIPELINES = {}


def _fill_mask_pipeline(model_name):
    """Return the cached ``fill-mask`` pipeline for ``model_name``, creating it on first use."""
    if model_name not in _FILL_MASK_PIPELINES:
        _FILL_MASK_PIPELINES[model_name] = pipeline('fill-mask', model=model_name)
    return _FILL_MASK_PIPELINES[model_name]


def _is_candidate_word(word, roman_numerals):
    """Return True if ``word`` starts with a letter and is neither a stop word nor a Roman numeral."""
    return (
        bool(word)
        and word[0].isalpha()
        and word not in stops
        and word not in roman_numerals
    )


def embeddings(model, sentences):
    """Pick a random plausible filler for the ``[MASK]`` token in ``sentences``.

    The fill-mask pipeline built from ``model_base`` proposes tokens; tokens
    that do not start with a letter, stop words and Roman numerals are
    discarded, and one of the remaining tokens is chosen at random.

    Changes compared with the previous implementation:
      * The Roman-numeral filter now works: ``ROMAN_CONSTANTS`` is a tuple of
        rows, so testing a string against it directly never matched.
      * If the filter rejects every proposal, the unfiltered proposals are
        used instead of crashing on an empty random range.
      * The sentence encoding, forward pass, pooling and normalisation were
        removed because their results were never used.
      * The pipeline is cached instead of being rebuilt on every call.
      * Input with several masks (nested pipeline output) uses the proposals
        for the first mask instead of raising ``TypeError``.

    Args:
        model: Unused; kept so existing callers keep working.
        sentences: Text containing a ``[MASK]`` token.

    Returns:
        The chosen token string.

    Raises:
        ValueError: If the pipeline returns no usable token at all.
    """
    predictions = _fill_mask_pipeline(model_base)(sentences)
    print(predictions)

    # With more than one mask the pipeline returns one list per mask; keep the
    # proposals for the first one.
    if predictions and isinstance(predictions[0], list):
        predictions = predictions[0]

    proposed = [prediction['token_str'].strip() for prediction in predictions]
    roman_numerals = {numeral for row in ROMAN_CONSTANTS for numeral in row}

    potential_words = [word for word in proposed if _is_candidate_word(word, roman_numerals)]
    if not potential_words:
        # Everything was filtered out: a weak word is better than a crash.
        potential_words = [word for word in proposed if word]
    if not potential_words:
        raise ValueError(f"fill-mask produced no usable token for {sentences!r}")

    rand_index = random.randrange(len(potential_words))
    print("THE LENGTH OF POTENTIAL WORDS FOR", sentences, "IS", len(potential_words), "AND THE RANDOM INDEX CHOSEN IS", rand_index)
    return potential_words[rand_index]
154
+
155
+
156
def random_word():
    """Return a random usable word from ``<model_base>/vocab.txt``.

    A vocabulary line is usable when it starts with a letter and is neither a
    stop word nor a Roman numeral. All usable lines are equally likely.

    Changes compared with the previous implementation:
      * The Roman-numeral filter now works: ``ROMAN_CONSTANTS`` is a tuple of
        rows, so testing a string against it directly never matched.
      * Line endings are stripped explicitly instead of chopping the last
        character, which corrupted a final line without a newline.
      * An unusable vocabulary raises instead of looping forever.
      * The file is read as UTF-8 regardless of the platform default.

    Returns:
        The chosen word, without its line ending.

    Raises:
        ValueError: If the vocabulary contains no usable word.
    """
    roman_numerals = {numeral for row in ROMAN_CONSTANTS for numeral in row}
    stop_words = set(stops)  # set lookup keeps the scan over the vocabulary cheap

    with open(model_base + '/vocab.txt', 'r', encoding='utf-8') as file:
        vocabulary = [line.rstrip('\r\n') for line in file]

    candidates = [
        word
        for word in vocabulary
        if word
        and word[0].isalpha()
        and word not in stop_words
        and word not in roman_numerals
    ]
    if not candidates:
        raise ValueError(f"no usable word found in {model_base}/vocab.txt")

    chosen = random.choice(candidates)
    print(chosen)
    return chosen
177
+
178
+
179
def generate_prompt(model):
    """Create a new analogy puzzle and store it in the module-level game state.

    ``word1`` and ``word3`` are drawn with ``random_word``. ``word2`` and
    ``answer`` are the fill-mask choices returned by ``embeddings`` for,
    respectively, "<example analogy> <word1> is to [MASK]." and
    "<word1> is to <word2> as <word3> is to [MASK].".

    The example analogy is trimmed and re-joined with exactly one space, so an
    entry of ``base_prompts`` without a trailing space can no longer be glued
    onto ``word1``.

    Args:
        model: Forwarded unchanged to ``embeddings``.

    Returns:
        The puzzle as a Markdown heading with the answer replaced by ``___``.
    """
    global word1, word2, word3, answer

    word1 = random_word()
    # word2 = random_word()
    lead_in = random.choice(base_prompts).rstrip()
    word2 = embeddings(model, f"{lead_in} {word1} is to [MASK].")
    word3 = random_word()

    sentence = f"{word1} is to {word2} as {word3} is to [MASK]."
    print(sentence)
    answer = embeddings(model, sentence)
    print("ANSWER IS", answer)
    return f"# {word1} is to {word2} as {word3} is to ___."
196
+
197
+
198
def greet(name):
    """Return the greeting ``"Hello <name>!!"``.

    ``name`` is formatted with an f-string, so values that are not ``str``
    are accepted too instead of raising ``TypeError`` on concatenation.
    """
    return f"Hello {name}!!"
200
+
201
def check_answer(guess:str):
    """Evaluate a submitted guess and build the three values shown in the UI.

    Changes compared with the previous implementation:
      * A correct guess is wrapped in the green ``<span>`` once; it used to be
        wrapped twice, duplicating the bullet and the line break.
      * The unchanged prompt is returned as a Markdown heading (``# ``), as
        ``generate_prompt`` does, so a wrong guess no longer changes the
        heading style.
      * The guess is stripped before comparison and HTML-escaped before being
        placed in the history, because it is user-supplied text.
      * A blank guess is rejected without being recorded.
      * The fill-mask lookup for the "other word" only runs for wrong guesses,
        the only case where its result is displayed.

    Args:
        guess: Text entered by the player.

    Returns:
        A ``(prompt, verdict, history)`` tuple: the prompt to display (a new
        one after a correct guess), the verdict message, and the accumulated
        guess history as HTML/Markdown.
    """
    import html  # only needed here; escapes the player's text for display

    submitted = (guess or "").strip()
    history = "".join(return_guesses)
    prompt = f"# {word1} is to {word2} as {word3} is to ___."

    if not submitted:
        return prompt, "Please enter a guess.", history

    guesses.append(submitted)
    shown = html.escape(submitted)
    # print("IS", submitted, "EQUAL TO", answer, ":", submitted.lower() == answer.lower())

    if submitted.lower() == answer.lower():
        entry = f"<span style='color:green'>- {shown}: {word1} is to {word2} as {word3} is to {shown}.</span><br>"
        return_guesses.append(entry)
        # generate_prompt replaces word1/word2/word3/answer, so the history
        # entry above had to be built first.
        new_prompt = generate_prompt(get_model())
        return new_prompt, "Correct!", history + entry

    # Wrong guess: show which word the model would pair with it instead.
    model = get_model()
    other_word = embeddings(model, f"{word1} is to {word2} as [MASK] is to {submitted}.")
    entry = f"- {shown}: {word1} is to {word2} as {other_word} is to {shown}.<br>"
    return_guesses.append(entry)
    return prompt, "Try again!", history + entry
236
+
237
def main():
    """Create the first puzzle, build the Gradio interface and launch it.

    The initial prompt is now the string returned by ``generate_prompt``
    rather than a separately formatted copy, so the first puzzle is displayed
    in exactly the same format as the ones produced after a correct guess.
    """
    # num_rows, data_type, value, example, embeddings = training()
    # sent_embeddings = embeddings()
    model = get_model()
    prompt = generate_prompt(model)
    print(prompt)
    print("TESTING EMBEDDINGS")

    # NOTE(review): the nesting of the UI components below is reconstructed
    # (tab and accordion as siblings inside the Blocks context) - confirm it
    # matches the intended layout.
    with gr.Blocks() as iface:
        mark_question = gr.Markdown(prompt)
        with gr.Tab("Guess"):
            text_input = gr.Textbox()
            text_output = gr.Textbox()
            text_button = gr.Button("Submit")
        with gr.Accordion("Open for previous guesses"):
            text_guesses = gr.Markdown()
        # with gr.Tab("Testing"):
        #     gr.Markdown(f"""The Embeddings are {sent_embeddings}.""")
        text_button.click(check_answer, inputs=[text_input], outputs=[mark_question, text_output, text_guesses])
    # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
    iface.launch()


if __name__ == "__main__":
    main()