Gabriela Nicole Gonzalez Saez committed on
Commit
c2ad8fd
1 Parent(s): 0b60561
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ index/en-es_metadata_ref.pkl filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,900 @@
+ import gradio as gr
+ from time import time
+
+ from bertviz import model_view, head_view
+ from bertviz_gradio import head_view_mod
+
+ import faiss
+ import torch
+ import os
+ # import nltk
+ import argparse
+ import random
+ import numpy as np
+ import pandas as pd
+
+ from argparse import Namespace
+ from tqdm.notebook import tqdm
+ from torch.utils.data import DataLoader
+ from functools import partial
+
+ from transformers import AutoTokenizer, MarianTokenizer, AutoModel, AutoModelForSeq2SeqLM, MarianMTModel
+
+ model_es = "Helsinki-NLP/opus-mt-en-es"
+ model_fr = "Helsinki-NLP/opus-mt-en-fr"
+ model_zh = "Helsinki-NLP/opus-mt-en-zh"
+ model_sw = "Helsinki-NLP/opus-mt-en-sw"
+
+ tokenizer_es = AutoTokenizer.from_pretrained(model_es)
+ tokenizer_fr = AutoTokenizer.from_pretrained(model_fr)
+ tokenizer_zh = AutoTokenizer.from_pretrained(model_zh)
+ tokenizer_sw = AutoTokenizer.from_pretrained(model_sw)
+
+ model_tr_es = MarianMTModel.from_pretrained(model_es)
+ model_tr_fr = MarianMTModel.from_pretrained(model_fr)
+ model_tr_zh = MarianMTModel.from_pretrained(model_zh)
+ model_tr_sw = MarianMTModel.from_pretrained(model_sw)
+
+ from faiss import write_index, read_index
+ import pickle
+
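+ # load_index restores, for one language pair, the metadata pickle and the four
+ # prebuilt FAISS indices ((input|output) x (tokens|words)) from the index/ folder.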
+ def load_index(model):
+     with open('index/' + model + '_metadata_ref.pkl', 'rb') as f:
+         loaded_dict = pickle.load(f)
+     for type in ['tokens', 'words']:
+         for kind in ['input', 'output']:
+             ## load the prebuilt index file
+             name = 'index/' + model + "_" + kind + "_" + type + ".index"
+             loaded_dict[kind][type][1] = read_index(name)
+             # write_index(metadata_all[kind][type][1], name)
+     return loaded_dict
+
+
+ dict_models = {
+     'en-es': model_es,
+     'en-fr': model_fr,
+     'en-zh': model_zh,
+     'en-sw': model_sw,
+ }
+
+ dict_models_tr = {
+     'en-es': model_tr_es,
+     'en-fr': model_tr_fr,
+     'en-zh': model_tr_zh,
+     'en-sw': model_tr_sw,
+ }
+
+ dict_tokenizer_tr = {
+     'en-es': tokenizer_es,
+     'en-fr': tokenizer_fr,
+     'en-zh': tokenizer_zh,
+     'en-sw': tokenizer_sw,
+ }
+
+ dict_reference_faiss = {
+     'en-es': load_index('en-es'),
+ }
+
+ saliency_examples = [
+     "Peace of Mind: Protection for consumers.",
+     "The sustainable development goals report: towards a rescue plan for people and planet",
+     "We will leave no stone unturned to hold those responsible to account.",
+     "The clock is now ticking on our work to finalise the remaining key legislative proposals presented by this Commission to ensure that citizens and businesses can reap the benefits of our policy actions.",
+     "Pumpkins, squash and gourds, fresh or chilled, excluding courgettes",
+     "The labour market participation of mothers with infants has even deteriorated over the past two decades, often impacting their career and incomes for years.",
+ ]
+
+ contrastive_examples = [
+     ["Peace of Mind: Protection for consumers.",
+      "Paz mental: protección de los consumidores",
+      "Paz de la mente: protección de los consumidores"],
+     ["the slaughterer has finished his work.",
+      "l'abatteur a terminé son travail.",
+      "l'abatteuse a terminé son travail."],
+     ['A fundamental shift is needed - in commitment, solidarity, financing and action - to put the world on a better path.',
+      '需要在承诺、团结、筹资和行动方面进行根本转变,使世界走上更美好的道路。',
+      '我们需要从根本上转变承诺、团结、资助和行动,使世界走上更美好的道路。'],
+ ]
+
+ # Load challenge set examples
+ df_challenge_set = pd.read_csv("challenge_sets.csv")
+ arr_challenge_set = df_challenge_set.values
+ arr_challenge_set = [[x[2], x[3], x[4], x[5]] for x in arr_challenge_set]
+
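+ # get_k_prob_tokens walks the first returned beam and, at each generation step,
+ # collects the top-k candidate token ids, their probabilities (exp of the
+ # log-scores) and their decoded texts, for the top-k token visualization.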
+ def get_k_prob_tokens(transition_scores, result, model, k_values=5):
+     tokenizer_tr = dict_tokenizer_tr[model]
+     gen_sequences = result.sequences[:, 1:]
+
+     result_output = []
+
+     # First beam only...
+     bs = 0
+     for tok, score, i_step in zip(gen_sequences[bs], transition_scores[bs], range(len(gen_sequences[bs]))):
+         beam_i = result.beam_indices[0][i_step]
+         if beam_i < 0:
+             beam_i = bs
+         bs_alt = [tokenizer_tr.decode(tok) for tok in result.scores[i_step][beam_i].topk(k_values).indices]
+         bs_alt_scores = np.exp(result.scores[i_step][beam_i].topk(k_values).values)
+         result_output.append([np.array(result.scores[i_step][beam_i].topk(k_values).indices), np.array(bs_alt_scores), bs_alt])
+
+     return result_output
+
+
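+ # split_token_from_sequences converts the beam-search sequences into the flat
+ # parent/child node list consumed by the beam-search tree visualization: each node
+ # id encodes the decoded-token path from the 'bos' root down to the current step.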
+ def split_token_from_sequences(sequences, model) -> dict:
+     n_sentences = len(sequences)
+
+     gen_sequences_texts = []
+     for bs in range(n_sentences):
+         # decode per token, skipping the initial decoder start token
+         seq_bs = []
+         for token in sequences[:, 1:][bs]:
+             seq_bs.append(dict_tokenizer_tr[model].decode(token, skip_special_tokens=True))
+         gen_sequences_texts.append(seq_bs)
+
+     score = 0
+     # the root of the tree is 'bos'
+     text = 'bos'
+     new_id = text + '--1'
+     dict_parent = [{'id': new_id, 'parentId': None, 'text': text, 'name': 'bos', 'prob': score}]
+     id_dict_pos = {}
+     step_i = 0
+     cont = True
+     words_by_step = []
+
+     while cont:
+         # collect the word of every beam at step_i
+         cont = False
+         step_words = []
+         for beam in range(n_sentences):
+             app_text = '<empty_word>'
+             if step_i < len(gen_sequences_texts[beam]):
+                 app_text = gen_sequences_texts[beam][step_i]
+                 cont = True
+             step_words.append(app_text)
+         words_by_step.append(step_words)
+         print(words_by_step)
+
+         for i_bs, step_w in enumerate(step_words):
+             if not step_w in ['<empty_word>', '<pad>']:
+                 # the node id is the full word path of this beam up to step_i;
+                 # the parent id is the same path one step shorter
+                 new_id = "-".join([str(words_by_step[i][i_bs]) + '-' + str(i) for i in range(step_i + 1)])
+                 parent_id = "-".join([words_by_step[i][i_bs] + '-' + str(i) for i in range(step_i)])
+                 if step_i == 0:
+                     parent_id = 'bos--1'
+                 if not (new_id in id_dict_pos):
+                     dict_parent.append({'id': new_id, 'parentId': parent_id, 'text': step_w, 'name': step_w, 'prob': score})
+                     id_dict_pos[new_id] = len(dict_parent) - 1
+
+         step_i += 1
+     return dict_parent
+
+
+ ## Tokenization
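+ # compute_tokenization renders the source and target token lists as HTML spans,
+ # cycling through four background colours so subword boundaries stay visible.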
+ def compute_tokenization(inputs, targets, w1, model):
+     colors = ['tok-first-color', 'tok-second-color', 'tok-third-color', 'tok-fourth-color']
+     len_colors = len(colors)
+     inputs = inputs.input_ids
+     html_tokens = ""
+     i = 0
+     for sentence in inputs:
+         html_tokens += "<p>"
+         tokens = [dict_tokenizer_tr[model].decode(tok) for tok in sentence]
+         for token in tokens:
+             token = token.replace("<", "&lt;")  # escape '<' so special tokens like <pad> render as text
+             html_tokens += "<span class='" + colors[i % len_colors] + "'>" + token + " </span>"
+             i += 1
+         html_tokens += "</p>"
+     i = 0
+     html_tokens_tgt = ""
+     html_tokens_tgt += "<p>"
+     tokens = [dict_tokenizer_tr[model].decode(tok) for tok in targets]
+     for token in tokens:
+         token = token.replace("<", "&lt;")  # escape '<' so special tokens like <pad> render as text
+         html_tokens_tgt += "<span class='" + colors[i % len_colors] + "'>" + token + " </span>"
+         i += 1
+     html_tokens_tgt += "</p>"
+     return html_tokens, html_tokens_tgt
+
+
+ def create_vocab_multiple(embeddings_list, model):
+     """Build a token-level vocabulary from per-sentence embeddings.
+
+     Args:
+         embeddings_list (list): per-sentence dicts with 'tokens' and 'embeddings'
+
+     Returns:
+         Dict: vocabulary of tokens' embeddings, plus the per-sentence token id lists
+     """
+     print("START VOCAB CREATION MULTIPLE \n \n ")
+     vocab = {}  ## token id -> entry with embedding
+     sentence_tokens_text_list = []
+     for embeddings in embeddings_list:
+         tokens_id = embeddings['tokens']  # [[token ids] x n_sentences]
+         for sent_i, sentence in enumerate(tokens_id):
+             sentence_tokens = []
+             for tok_i, token in enumerate(sentence):
+                 sentence_tokens.append(token)
+                 if not (token in vocab):
+                     vocab[token] = {
+                         'token': token,
+                         'count': 1,
+                         'text': dict_tokenizer_tr[model].decode([token]),
+                         'embed': embeddings['embeddings'][sent_i][tok_i]}
+                 else:
+                     vocab[token]['count'] = vocab[token]['count'] + 1
+             sentence_tokens_text_list.append(sentence_tokens)
+     print("END VOCAB CREATION MULTIPLE \n \n ")
+     return vocab, sentence_tokens_text_list
+
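+ # vocab_words_all_prefix groups SentencePiece tokens back into words: a token
+ # carrying the '▁' prefix opens a new word, special tokens stand alone, and a
+ # word's embedding is the mean of its token embeddings.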
+ def vocab_words_all_prefix(token_embeddings, model, suffix="@@", prefix='▁'):
+     vocab = {}
+     sentence_words_text_list = []
+     if prefix:
+         n_prefix = len(prefix)
+     for input_sentences in token_embeddings:
+         for sent_i, sentence in enumerate(input_sentences['tokens']):
+             words_text_list = []
+             word = ''
+             tokens_ids = []
+             embeddings = []
+             ids_to_tokens = dict_tokenizer_tr[model].convert_ids_to_tokens(sentence)
+
+             to_save = False
+             for tok_i, token_text in enumerate(ids_to_tokens):
+                 token_id = sentence[tok_i]
+                 if token_text[:n_prefix] == prefix:
+                     # first save the previous word
+                     if to_save:
+                         vocab[word] = {
+                             'word': word,
+                             'text': word,
+                             'count': 1,
+                             'tokens_ids': tokens_ids,
+                             'embed': np.mean(np.array(embeddings), 0).tolist()
+                         }
+                         words_text_list.append(word)
+                     # the prefix marks the start of a new word
+                     tokens_ids = [token_id]
+                     embeddings = [input_sentences['embeddings'][sent_i][tok_i]]
+                     word = token_text[n_prefix:]
+                     to_save = True
+                 else:
+                     if token_text in dict_tokenizer_tr[model].special_tokens_map.values():
+                         if to_save:
+                             vocab[word] = {
+                                 'word': word,
+                                 'text': word,
+                                 'count': 1,
+                                 'tokens_ids': tokens_ids,
+                                 'embed': np.mean(np.array(embeddings), 0).tolist()
+                             }
+                             words_text_list.append(word)
+                         # a special token is a single-token element with no continuation
+                         tokens_ids = [token_id]
+                         embeddings = [input_sentences['embeddings'][sent_i][tok_i]]
+                         vocab[token_text] = {
+                             'word': token_text,
+                             'count': 1,
+                             'text': token_text,
+                             'tokens_ids': tokens_ids,
+                             'embed': np.mean(np.array(embeddings), 0).tolist()
+                         }
+                         words_text_list.append(token_text)
+                         to_save = False
+                     else:
+                         # a continuation; we do not yet know whether it is final, so we do not save here
+                         to_save = True
+                         word += token_text
+                         tokens_ids.append(token_id)
+                         embeddings.append(input_sentences['embeddings'][sent_i][tok_i])
+             if to_save:
+                 # save the last word of the sentence
+                 if not (word in vocab):
+                     vocab[word] = {
+                         'word': word,
+                         'count': 1,
+                         'text': word,
+                         'tokens_ids': tokens_ids,
+                         'embed': np.mean(np.array(embeddings), 0).tolist()
+                     }
+                     words_text_list.append(word)
+                 else:
+                     vocab[word]['count'] = vocab[word]['count'] + 1
+             sentence_words_text_list.append(words_text_list)
+
+     return vocab, sentence_words_text_list
+
+
+ def search_query_vocab(index, vocab_queries, topk=10, limited_search=[]):
+     """The embedded queries are a word-level vocabulary (embds_input_voc).
+
+     Args:
+         index: FAISS index to query
+         vocab_queries (dict): vocabulary entries of the form
+             { 'word': word,
+               'count': 1,
+               'text': text,
+               'tokens_ids': tokens_ids,
+               'embed': embedding }
+         topk (int, optional): number of similar items to retrieve. Defaults to 10.
+
+     Returns:
+         Distance matrix D, indices matrix I, and metadata mapping query positions to words.
+     """
+     nb_q_embds = []  ## ordered embeddings list
+     metadata = {}
+     qi_pos = 0
+     for key, token_values in vocab_queries.items():
+         metadata[qi_pos] = {'word': token_values['word'], 'tokens': token_values['tokens_ids'], 'text': token_values['text']}
+         qi_pos += 1
+         nb_q_embds.append(token_values['embed'])
+
+     xq = np.array(nb_q_embds).astype('float32')  # elements to query
+
+     D, I = index.search(xq, topk)
+
+     return D, I, metadata
+
+
+ def search_query_vocab_token(index, vocab_queries, topk=10, limited_search=[]):
+     """Same as search_query_vocab, but for a token-level vocabulary (embds_input_voc).
+
+     Returns:
+         Distance matrix D, indices matrix I, and metadata mapping query positions to tokens.
+     """
+     nb_q_embds = []  ## ordered embeddings list
+     metadata = {}
+     qi_pos = 0
+     for key, token_values in vocab_queries.items():
+         metadata[qi_pos] = {'token': token_values['token'], 'text': token_values['text']}
+         qi_pos += 1
+         nb_q_embds.append(token_values['embed'])
+
+     xq = np.array(nb_q_embds).astype('float32')  # elements to query
+
+     D, I = index.search(xq, topk)
+
+     return D, I, metadata
+
+
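+ # build_search queries the preloaded FAISS indices with the vocabularies built
+ # from the current input/output, returning, per token and per word, the top-k
+ # nearest reference items together with their distances.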
407
+ def build_search(query_embeddings, model,type="input"):
408
+ metadata_all = dict_reference_faiss[model]
409
+
410
+ # ## biuld vocab for index
411
+ vocab_queries, sentence_tokens_list = create_vocab_multiple(query_embeddings, model)
412
+ words_vocab_queries, sentence_words_list = vocab_words_all_prefix(query_embeddings, model, sufix="@@",prefix="▁")
413
+
414
+ index_vor_tokens = metadata_all[type]['tokens'][1]
415
+ md_tokens = metadata_all[type]['tokens'][2]
416
+ D, I, meta = search_query_vocab_token(index_vor_tokens, vocab_queries)
417
+
418
+ qi_pos = 0
419
+ similar_tokens = {}
420
+ # similar_tokens = []
421
+ for dist, ind in zip(D,I):
422
+ try:
423
+ # similar_tokens.append({
424
+ similar_tokens[str(meta[qi_pos]['token'])] = {
425
+ 'token': meta[qi_pos]['token'],
426
+ 'text': meta[qi_pos]['text'],
427
+ # 'text': dict_tokenizer_tr[model].decode(meta[qi_pos]['token'])
428
+ # 'text': meta[qi_pos]['text'],
429
+ "similar_topk": [md_tokens[i_index]['token'] for i_index in ind if (i_index != -1) ],
430
+ "distance": [dist[i] for (i, i_index) in enumerate(ind) if (i_index != -1)],
431
+ }
432
+ # )
433
+ except:
434
+ print("\n ERROR ", qi_pos, dist, ind)
435
+ qi_pos += 1
436
+
437
+
438
+ index_vor_words = metadata_all[type]['words'][1]
439
+ md_words = metadata_all[type]['words'][2]
440
+
441
+ Dw, Iw, metaw = search_query_vocab(index_vor_words, words_vocab_queries)
442
+ # D, I, meta, vocab_words, sentence_words_list = result_input['words']# [2] # D ; I ; meta
443
+ qi_pos = 0
444
+ # similar_words = []
445
+ similar_words = {}
446
+ for dist, ind in zip(Dw,Iw):
447
+ try:
448
+ # similar_words.append({
449
+ similar_words[str(metaw[qi_pos]['word']) ] = {
450
+ 'word': metaw[qi_pos]['word'],
451
+ 'text': metaw[qi_pos]['word'],
452
+ "similar_topk": [md_words[i_index]['word'] for i_index in ind if (i_index != -1) ],
453
+ "distance": [dist[i] for (i, i_index) in enumerate(ind) if (i_index != -1)],
454
+ }
455
+ # )
456
+ except:
457
+ print("\n ERROR ", qi_pos, dist, ind)
458
+ qi_pos += 1
459
+
460
+
461
+ return {'tokens': {'D': D, 'I': I, 'meta': meta, 'vocab_queries': vocab_queries, 'similar':similar_tokens, 'sentence_key_list': sentence_tokens_list},
462
+ 'words': {'D':Dw,'I': Iw, 'meta': metaw, 'vocab_queries':words_vocab_queries, 'sentence_key_list': sentence_words_list, 'similar': similar_words}
463
+ }
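+ # embds_input_projection_vocab flattens a vocabulary into parallel id/text/embedding
+ # lists and projects the embeddings to 2-D with t-SNE; on failure it falls back to
+ # zero coordinates so the front-end still receives a well-formed payload.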
+ from sklearn.manifold import TSNE
+
+ def embds_input_projection_vocab(vocab, key="token"):
+     t0 = time()
+
+     nb_ids = []  ## ordered ids list
+     nb_embds = []  ## ordered embeddings list
+     nb_text = []  ## ordered texts list
+     tsne_error = []
+     for _, token_values in vocab.items():
+         tsne_error.append([0, 0])
+         nb_ids.append(token_values[key])
+         nb_text.append(token_values['text'])
+         nb_embds.append(token_values['embed'])
+
+     X = np.array(nb_embds).astype('float32')  # elements to project
+     try:
+         tsne = TSNE(random_state=0, n_iter=1000)
+         tsne_results = tsne.fit_transform(X)
+         tsne_results = np.c_[tsne_results, nb_ids, nb_text, range(len(nb_ids))]  ## zip array: [[TSNE-X, TSNE-Y, key, text, position], ...]
+     except Exception:
+         tsne_results = np.c_[tsne_error, nb_ids, nb_text, range(len(nb_ids))]  ## fallback with zero coordinates
+
+     t1 = time()
+     print("t-SNE: %.2g sec" % (t1 - t0))
+
+     return tsne_results.tolist()
+
+
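+ # filtered_projection merges the query vocabulary with the reference embeddings of
+ # all retrieved neighbours before projecting, so queries and their nearest
+ # reference items are laid out in the same 2-D space.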
+ def filtered_projection(similar_key, vocab, model, type="input", key="word"):
+     metadata_all = dict_reference_faiss[model]
+     vocab_proj = vocab.copy()
+     ## t-SNE projection of the queries plus their similar reference items
+     source_words_voc_similar = set()
+     for key_i in similar_key:
+         words_set = similar_key[key_i]
+         source_words_voc_similar.update(words_set['similar_topk'])
+
+     source_embeddings_filtered = {key_value: metadata_all[type][key][0][key_value] for key_value in source_words_voc_similar}
+     vocab_proj.update(source_embeddings_filtered)
+     try:
+         result_TSNE = embds_input_projection_vocab(vocab_proj, key=key[:-1])  ## singular => without 's'
+         dict_projected_embds_all = {str(embds[2]): [embds[0], embds[1], embds[2], embds[3], embds[4]] for embds in result_TSNE}
+     except Exception:
+         print('TSNE error', type, key)
+         dict_projected_embds_all = {}
+
+     return dict_projected_embds_all
+
+
+ def get_bertvis_data(input_text, lg_model):
+     tokenizer_tr = dict_tokenizer_tr[lg_model]
+     model_tr = dict_models_tr[lg_model]
+
+     input_ids = tokenizer_tr(input_text, return_tensors="pt", padding=False)
+     result_att = model_tr.generate(**input_ids,
+                                    num_beams=4,
+                                    num_return_sequences=4,
+                                    return_dict_in_generate=True,
+                                    output_attentions=True,
+                                    output_scores=True,
+                                    )
+
+     tgt_text = tokenizer_tr.decode(result_att.sequences[0], skip_special_tokens=True)
+
+     outputs = model_tr(input_ids=input_ids.input_ids,
+                        decoder_input_ids=result_att.sequences[:1],
+                        output_attentions=True,
+                        )
+     html_attentions = head_view_mod(
+         encoder_attention=outputs.encoder_attentions,
+         cross_attention=outputs.cross_attentions,
+         decoder_attention=outputs.decoder_attentions,
+         encoder_tokens=tokenizer_tr.convert_ids_to_tokens(input_ids.input_ids[0]),
+         decoder_tokens=tokenizer_tr.convert_ids_to_tokens(result_att.sequences[0]),
+         html_action='gradio'
+     )
+     return html_attentions, tgt_text, result_att, outputs
+
+
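+ # translation_model is the per-sentence pipeline: beam-search translation, beam
+ # tree and top-k token probabilities, tokenization HTML, the three BertViz
+ # attention views (encoder, decoder, cross) and the raw input/target embeddings.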
+ def translation_model(w1, model):
+     # translate and collect internal values and visualizations
+     inputs = dict_tokenizer_tr[model](w1, return_tensors="pt", padding=True)
+
+     num_ret_seq = 4
+     translated = dict_models_tr[model].generate(**inputs,
+                                                 num_beams=4,
+                                                 num_return_sequences=num_ret_seq,
+                                                 return_dict_in_generate=True,
+                                                 output_attentions=True,
+                                                 output_hidden_states=True,
+                                                 output_scores=True,)
+
+     beam_dict = split_token_from_sequences(translated.sequences, model)
+
+     tgt_text = dict_tokenizer_tr[model].decode(translated.sequences[0], skip_special_tokens=True)
+
+     ## Attentions
+     outputs = dict_models_tr[model](input_ids=inputs.input_ids,
+                                     decoder_input_ids=translated.sequences[:1],
+                                     output_attentions=True,
+                                     )
+     encoder_tokens = dict_tokenizer_tr[model].convert_ids_to_tokens(inputs.input_ids[0])
+     decoder_tokens = dict_tokenizer_tr[model].convert_ids_to_tokens(translated.sequences[0])
+
+     html_attentions_enc = head_view_mod(
+         encoder_attention=outputs.encoder_attentions,
+         encoder_tokens=encoder_tokens,
+         decoder_tokens=decoder_tokens,
+         html_action='gradio'
+     )
+
+     html_attentions_dec = head_view_mod(
+         decoder_attention=outputs.decoder_attentions,
+         encoder_tokens=encoder_tokens,
+         decoder_tokens=decoder_tokens,
+         html_action='gradio'
+     )
+
+     html_attentions_cross = head_view_mod(
+         cross_attention=outputs.cross_attentions,
+         encoder_tokens=encoder_tokens,
+         decoder_tokens=decoder_tokens,
+         html_action='gradio'
+     )
+
+     # tokenization
+     html_in, html_out = compute_tokenization(inputs, translated.sequences[0], w1, model)
+
+     transition_scores = dict_models_tr[model].compute_transition_scores(
+         translated.sequences, translated.scores, translated.beam_indices, normalize_logits=True
+     )
+     prob_tokens = get_k_prob_tokens(transition_scores, translated, model, k_values=10)
+
+     input_embeddings = dict_models_tr[model].get_encoder().embed_tokens(inputs.input_ids)
+     target_embeddings = dict_models_tr[model].get_decoder().embed_tokens(translated.sequences)
+
+     return [tgt_text,
+             [beam_dict, prob_tokens, html_in, html_out, translated, inputs.input_ids, input_embeddings, target_embeddings],
+             [html_attentions_enc['params'], html_attentions_enc['html2'].data],
+             [html_attentions_dec['params'], html_attentions_dec['html2'].data],
+             [html_attentions_cross['params'], html_attentions_cross['html2'].data]]
+
+
+ html = """
+ <html>
+ <script async src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>
+
+ <style>
+ .tok-first-color {
+     background: #e0ffcd;
+ }
+
+ .tok-second-color {
+     background: #fdffcd;
+ }
+
+ .tok-third-color {
+     background: #ffebbb;
+ }
+
+ .tok-fourth-color {
+     background: #ffcab0;
+ }
+ </style>
+ <body>
+
+ <p id="demo"></p>
+ <p id="viz"></p>
+
+ <p id="demo2"></p>
+ <h4> Exploring top-k probable tokens </h4>
+ <div id="d3_text_grid">... top 10 tokens generated at each step ...</div>
+
+ <h4> Exploring the Beam Search sequence generation</h4>
+ <div id="d3_beam_search">... top 4 generated sequences using Beam Search ...</div>
+
+ </body>
+ </html>
+ """
+
+ html_tok = """
+ <div id="d3_tok">... tokenization visualization ...</div>
+ """
+
+ html_embd = """
+ <div id="d3_embd">... token embeddings visualization ...</div>
+ <div id="select_div">
+     <select id="select_type" class="form-select" aria-label="select example" hidden>
+         <option selected value="words">Words</option>
+         <option value="tokens">Tokens</option>
+     </select>
+ </div>
+ <div class="row">
+     <div class="col-9">
+         <div id="d3_graph_input_words" class="d3_graph words"></div>
+     </div>
+     <div class="col-3">
+         <div id="similar_input_words" class=""></div>
+     </div>
+ </div>
+ <div id="d3_graph_input_tokens" class="d3_graph tokens"></div>
+ <div id="similar_input_tokens" class=""></div>
+ """
+
+ html_tok_target = """
+ <div id="d3_tok_target">... tokenization visualization ...</div>
+ """
+
+ html_embd_target = """
+ <div id="d3_embd_target">... token embeddings visualization ...</div>
+ <div id="d3_graph_output_words" class="d3_graph words"></div>
+ <div id="d3_graph_output_tokens" class="d3_graph tokens"></div>
+ <div id="similar_output_words" class=""></div>
+ <div id="similar_output_tokens" class=""></div>
+ """
+
+ html_att_enc = """
+ <div id="d3_att_enc">... Encoder self-attention only -- last layer and mean across heads ... Always read from left to right</div>
+ <div id="bertviz_enc"></div>
+ """
+
+ html_att_cross = """
+ <div id="d3_att_cross">... Encoder-decoder cross-attention only -- last layer and mean across heads ...</div>
+ """
+
+ html_att_dec = """
+ <div id="d3_att_dec">... Decoder self-attention only -- last layer and mean across heads ...</div>
+ """
+
+
+ def sentence_maker2(w1, j2):
+     print(w1, j2)
+     return "in sentence22..."
+
+
+ def first_function(w1, model):
+     # translate every line of the source text and collect the internal values
+     sentences = w1.split("\n")
+     all_sentences = []
+     translated_text = ''
+     input_embeddings = []
+     output_embeddings = []
+     for sentence in sentences:
+         params = translation_model(sentence, model)
+         all_sentences.append(params)
+         translated_text += params[0] + ' \n'
+         input_embeddings.append({
+             'embeddings': params[1][6].detach(),  # encoder input embeddings, used to build the vocabulary
+             'tokens': params[1][5].tolist(),      # inputs.input_ids -- one translation = one sentence
+         })
+         output_embeddings.append({
+             'embeddings': params[1][7].detach(),  # decoder input embeddings
+             'tokens': params[1][4].sequences.tolist(),  # generated sequences
+         })
+
+     ## Query the FAISS reference indices, preloaded per language pair
+     ## (dict_reference_faiss[model] holds the reference metadata)
+     result_search = {}
+     result_search['input'] = build_search(input_embeddings, model, type='input')
+     result_search['output'] = build_search(output_embeddings, model, type='output')
+
+     json_out = {'input': {'tokens': {}, 'words': {}}, 'output': {'tokens': {}, 'words': {}}}
+     dict_projected = {}
+     for type in ['input', 'output']:
+         dict_projected[type] = {}
+         for key in ['tokens', 'words']:
+             similar_key = result_search[type][key]['similar']
+             vocab = result_search[type][key]['vocab_queries']
+             dict_projected[type][key] = filtered_projection(similar_key, vocab, model, type=type, key=key)
+             json_out[type][key]['similar_queries'] = similar_key
+             json_out[type][key]['tnse'] = dict_projected[type][key]
+             json_out[type][key]['key_text_list'] = result_search[type][key]['sentence_key_list']
+
+     # attention views of the last translated sentence
+     html_att_enc = params[2][1]
+     html_att_dec = params[3][1]
+     html_att_cross = params[4][1]
+
+     params = [params[0], params[1], json_out, params[2][0], params[3][0], params[4][0]]
+
+     return [translated_text, params, html_att_enc, html_att_dec, html_att_cross]
+
+
+ def second_function(w1, j2):
+     # hands the JSON payload over to the JS side
+     print("second_function -- after the js", w1, j2)
+     return "transition to second js function finished."
+
+
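+ # Gradio UI. The JS callbacks (testFn_out / testFn_out_json, expected to be
+ # defined in plotsjs.js) receive the JSON payload produced by first_function and
+ # render the D3/BertViz visualizations into the HTML anchors declared above.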
+ with gr.Blocks(js="plotsjs.js") as demo:
+     gr.Markdown(
+         """
+         # MAKE NMT Workshop \t `Literacy task`
+         """)
+
+     gr.Markdown(
+         """
+         ### Translation
+         """)
+     gr.Markdown(
+         """
+         1. Select the language pair for the translation
+         """)
+     radio_c = gr.Radio(choices=['en-zh', 'en-es', 'en-fr', 'en-sw'], value="en-es", label='', container=False)
+     gr.Markdown(
+         """
+         2. Source text to translate
+         """)
+     in_text = gr.Textbox(label="source text")
+     with gr.Accordion("Optional: Challenge selection:", open=False):
+         gr.Markdown(
+             """
+             ### Select an example from the challenge set listed below
+             """)
+         challenge_ex = gr.Textbox(label="Challenge", interactive=False)
+         category_minor = gr.Textbox(label="category_minor", interactive=False)
+         category_major = gr.Textbox(label="category_major", interactive=False)
+
+         with gr.Accordion("Examples:"):
+             gr.Examples(arr_challenge_set, [in_text, challenge_ex, category_minor, category_major], label="")
+
+     btn = gr.Button("Translate")
+
+     with gr.Accordion("3. Review the source tokenization:", open=False):
+         input_tokenisation = gr.HTML(html_tok)
+
+     with gr.Accordion("4. Review similar source tokens in the embedding space:", open=False):
+         input_embd = gr.HTML(html_embd)
+
+     with gr.Accordion("5. Review the attention between the source tokens:", open=False):
+         gr.Markdown(
+             """
+             `Bertviz`
+             """)
+         input_att_enc = gr.HTML(html_att_enc)
+         enc_html = gr.HTML()
+
+     gr.Markdown(
+         """
+         ### Text is translated into the target language
+         """)
+     out_text = gr.Textbox(label="target text")
+
+     with gr.Accordion("1. Review the target tokenization:", open=False):
+         target_tokenisation = gr.HTML(html_tok_target)
+
+     with gr.Accordion("2. Review similar target tokens in the embedding space:", open=False):
+         target_embd = gr.HTML(html_embd_target)
+
+     with gr.Accordion("3. Review the attention between the target and source tokens:", open=False):
+         gr.Markdown(
+             """
+             `Bertviz - cross attention`
+             """)
+         input_att_cross = gr.HTML(html_att_cross)
+         cross_html = gr.HTML()
+
+     with gr.Accordion("4. Review the attention between the target tokens:", open=False):
+         gr.Markdown(
+             """
+             `Bertviz - decoder attention`
+             """)
+         input_att_dec = gr.HTML(html_att_dec)
+         dec_html = gr.HTML()
+
+     with gr.Accordion("6. Review the alternative translation tokens:", open=False):
+         gr.Markdown(
+             """
+             Generation process: `top-k - beam search`
+             """)
+         input_mic = gr.HTML(html)
+
+     out_text2 = gr.Textbox(visible=False)
+     var2 = gr.JSON(visible=False)
+
+     btn.click(first_function, [in_text, radio_c], [out_text, var2, enc_html, dec_html, cross_html], js="(in_text,radio_c) => testFn_out(in_text,radio_c)")  # returns the output components
+     out_text.change(second_function, [out_text, var2], out_text2, js="(out_text,var2) => testFn_out_json(var2)")
+
+     # run script function on load,
+     # demo.load(None,None,None,js="plotsjs.js")
+
+ if __name__ == "__main__":
+     demo.launch()
bertviz_gradio.py ADDED
@@ -0,0 +1,251 @@
+
+ import json
+ import os
+ import uuid
+
+ from IPython.core.display import display, HTML, Javascript
+
+ from bertviz.util import format_special_chars, format_attention, num_layers
+
+ print("UP TO DATE")
+
+ def head_view_mod(
+         attention=None,
+         tokens=None,
+         sentence_b_start=None,
+         prettify_tokens=True,
+         layer=None,
+         heads=None,
+         encoder_attention=None,
+         decoder_attention=None,
+         cross_attention=None,
+         encoder_tokens=None,
+         decoder_tokens=None,
+         include_layers=None,
+         html_action='view',
+ ):
+     """Render head view
+
+     Args:
+         For self-attention models:
+             attention: list of ``torch.FloatTensor`` (one for each layer) of shape
+                 ``(batch_size(must be 1), num_heads, sequence_length, sequence_length)``
+             tokens: list of tokens
+             sentence_b_start: index of first wordpiece in sentence B if input text is sentence pair (optional)
+         For encoder-decoder models:
+             encoder_attention: list of ``torch.FloatTensor`` (one for each layer) of shape
+                 ``(batch_size(must be 1), num_heads, encoder_sequence_length, encoder_sequence_length)``
+             decoder_attention: list of ``torch.FloatTensor`` (one for each layer) of shape
+                 ``(batch_size(must be 1), num_heads, decoder_sequence_length, decoder_sequence_length)``
+             cross_attention: list of ``torch.FloatTensor`` (one for each layer) of shape
+                 ``(batch_size(must be 1), num_heads, decoder_sequence_length, encoder_sequence_length)``
+             encoder_tokens: list of tokens for encoder input
+             decoder_tokens: list of tokens for decoder input
+         For all models:
+             prettify_tokens: indicates whether to remove special characters in wordpieces, e.g. Ġ
+             layer: index (zero-based) of initial selected layer in visualization. Defaults to layer 0.
+             heads: indices (zero-based) of initial selected heads in visualization. Defaults to all heads.
+             include_layers: indices (zero-based) of layers to include in visualization. Defaults to all layers.
+                 Note: filtering layers may improve responsiveness of the visualization for long inputs.
+             html_action: specifies the action to be performed with the generated HTML object
+                 - 'view' (default): displays the generated HTML representation as a notebook cell output
+                 - 'return': returns an HTML object containing the generated view for further processing or custom visualization
+                 - 'gradio': returns a dict with the HTML fragments and the JS params, for embedding in the Gradio app
+     """
+
+     attn_data = []
+     if attention is not None:
+         if tokens is None:
+             raise ValueError("'tokens' is required")
+         if encoder_attention is not None or decoder_attention is not None or cross_attention is not None \
+                 or encoder_tokens is not None or decoder_tokens is not None:
+             raise ValueError("If you specify 'attention' you may not specify any encoder-decoder arguments. This"
+                              " argument is only for self-attention models.")
+         if include_layers is None:
+             include_layers = list(range(num_layers(attention)))
+         attention = format_attention(attention, include_layers)
+         if sentence_b_start is None:
+             attn_data.append(
+                 {
+                     'name': None,
+                     'attn': attention.tolist(),
+                     'left_text': tokens,
+                     'right_text': tokens
+                 }
+             )
+         else:
+             slice_a = slice(0, sentence_b_start)  # Positions corresponding to sentence A in input
+             slice_b = slice(sentence_b_start, len(tokens))  # Positions corresponding to sentence B in input
+             attn_data.append(
+                 {
+                     'name': 'All',
+                     'attn': attention.tolist(),
+                     'left_text': tokens,
+                     'right_text': tokens
+                 }
+             )
+             attn_data.append(
+                 {
+                     'name': 'Sentence A -> Sentence A',
+                     'attn': attention[:, :, slice_a, slice_a].tolist(),
+                     'left_text': tokens[slice_a],
+                     'right_text': tokens[slice_a]
+                 }
+             )
+             attn_data.append(
+                 {
+                     'name': 'Sentence B -> Sentence B',
+                     'attn': attention[:, :, slice_b, slice_b].tolist(),
+                     'left_text': tokens[slice_b],
+                     'right_text': tokens[slice_b]
+                 }
+             )
+             attn_data.append(
+                 {
+                     'name': 'Sentence A -> Sentence B',
+                     'attn': attention[:, :, slice_a, slice_b].tolist(),
+                     'left_text': tokens[slice_a],
+                     'right_text': tokens[slice_b]
+                 }
+             )
+             attn_data.append(
+                 {
+                     'name': 'Sentence B -> Sentence A',
+                     'attn': attention[:, :, slice_b, slice_a].tolist(),
+                     'left_text': tokens[slice_b],
+                     'right_text': tokens[slice_a]
+                 }
+             )
+     elif encoder_attention is not None or decoder_attention is not None or cross_attention is not None:
+         if encoder_attention is not None:
+             if encoder_tokens is None:
+                 raise ValueError("'encoder_tokens' required if 'encoder_attention' is not None")
+             if include_layers is None:
+                 include_layers = list(range(num_layers(encoder_attention)))
+             encoder_attention = format_attention(encoder_attention, include_layers)
+             attn_data.append(
+                 {
+                     'name': 'Encoder',
+                     'attn': encoder_attention.tolist(),
+                     'left_text': encoder_tokens,
+                     'right_text': encoder_tokens
+                 }
+             )
+         if decoder_attention is not None:
+             if decoder_tokens is None:
+                 raise ValueError("'decoder_tokens' required if 'decoder_attention' is not None")
+             if include_layers is None:
+                 include_layers = list(range(num_layers(decoder_attention)))
+             decoder_attention = format_attention(decoder_attention, include_layers)
+             attn_data.append(
+                 {
+                     'name': 'Decoder',
+                     'attn': decoder_attention.tolist(),
+                     'left_text': decoder_tokens,
+                     'right_text': decoder_tokens
+                 }
+             )
+         if cross_attention is not None:
+             if encoder_tokens is None:
+                 raise ValueError("'encoder_tokens' required if 'cross_attention' is not None")
+             if decoder_tokens is None:
+                 raise ValueError("'decoder_tokens' required if 'cross_attention' is not None")
+             if include_layers is None:
+                 include_layers = list(range(num_layers(cross_attention)))
+             cross_attention = format_attention(cross_attention, include_layers)
+             attn_data.append(
+                 {
+                     'name': 'Cross',
+                     'attn': cross_attention.tolist(),
+                     'left_text': decoder_tokens,
+                     'right_text': encoder_tokens
+                 }
+             )
+     else:
+         raise ValueError("You must specify at least one attention argument.")
+
+     if layer is not None and layer not in include_layers:
+         raise ValueError(f"Layer {layer} is not in include_layers: {include_layers}")
+
+     # Generate unique div id to enable multiple visualizations in one notebook
+     vis_id = 'bertviz-%s' % (uuid.uuid4().hex)
+
+     # Compose html
+     if len(attn_data) > 1:
+         options = '\n'.join(
+             f'<option value="{i}">{attn_data[i]["name"]}</option>'
+             for i, d in enumerate(attn_data)
+         )
+         select_html = f'Attention: <select id="filter">{options}</select>'
+     else:
+         select_html = ""
+     vis_html = f"""
+         <div id="{vis_id}" style="font-family:'Helvetica Neue', Helvetica, Arial, sans-serif;">
+             <span style="user-select:none">
+                 Layer: <select id="layer"></select>
+                 {select_html}
+             </span>
+             <div id='vis'></div>
+         </div>
+     """
+
+     for d in attn_data:
+         attn_seq_len_left = len(d['attn'][0][0])
+         if attn_seq_len_left != len(d['left_text']):
+             raise ValueError(
+                 f"Attention has {attn_seq_len_left} positions, while number of tokens is {len(d['left_text'])} "
+                 f"for tokens: {' '.join(d['left_text'])}"
+             )
+         attn_seq_len_right = len(d['attn'][0][0][0])
+         if attn_seq_len_right != len(d['right_text']):
+             raise ValueError(
+                 f"Attention has {attn_seq_len_right} positions, while number of tokens is {len(d['right_text'])} "
+                 f"for tokens: {' '.join(d['right_text'])}"
+             )
+         if prettify_tokens:
+             d['left_text'] = format_special_chars(d['left_text'])
+             d['right_text'] = format_special_chars(d['right_text'])
+     params = {
+         'attention': attn_data,
+         'default_filter': "0",
+         'root_div_id': vis_id,
+         'layer': layer,
+         'heads': heads,
+         'include_layers': include_layers,
+     }
+
+     # require.js must be imported for Colab or JupyterLab:
+
+     if html_action == 'gradio':
+         html1 = HTML('<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>')
+         html2 = HTML(vis_html)
+
+         return {'html1': html1, 'html2': html2, 'params': params}
+
+     if html_action == 'view':
+         display(HTML('<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>'))
+         display(HTML(vis_html))
+         __location__ = os.path.realpath(
+             os.path.join(os.getcwd(), os.path.dirname(__file__)))
+         vis_js = open(os.path.join(__location__, 'head_view.js')).read().replace("PYTHON_PARAMS", json.dumps(params))
+         display(Javascript(vis_js))
+
+     elif html_action == 'return':
+         html1 = HTML('<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>')
+
+         html2 = HTML(vis_html)
+
+         __location__ = os.path.realpath(
+             os.path.join(os.getcwd(), os.path.dirname(__file__)))
+         vis_js = open(os.path.join(__location__, 'head_view.js')).read().replace("PYTHON_PARAMS", json.dumps(params))
+         html3 = Javascript(vis_js)
+         script = '\n<script type="text/javascript">\n' + html3.data + '\n</script>\n'
+
+         head_html = HTML(html1.data + html2.data + script)
+         return head_html
+
+     else:
+         raise ValueError("'html_action' parameter must be 'view', 'return' or 'gradio'")
challenge_sets.csv ADDED
@@ -0,0 +1,109 @@
1
+ name,Lang.,Source sentence,Challenge,category_minor,category_major,Interesting?
2
+ Isabel challenge set,EN,The repeated calls from his mother [should] have alerted us.,Is subject-verb agrement correct? (Possible interference from distractors between the subject's head and the verb).,"S-V agreement, across distractors",Morpho-Syntactic,
3
+ Isabel challenge set,EN,The sudden noise in the upper rooms [should] have alerted us.,Is subject-verb agrement correct? (Possible interference from distractors between the subject's head and the verb).,"S-V agreement, across distractors",Morpho-Syntactic,
4
+ Isabel challenge set,EN,Their repeated failures to report the problem [should] have alerted us.,Is subject-verb agrement correct? (Possible interference from distractors between the subject's head and the verb).,"S-V agreement, across distractors",Morpho-Syntactic,
5
+ Isabel challenge set,EN,She asked her brother not to be [arrogant].,Does the flagged adjective agree correctly with its subject? (Subject-control versus object-control verbs).,"S-V agreement, through control verbs",Morpho-Syntactic,
6
+ Isabel challenge set,EN,She promised her brother not to be [arrogant].,Does the flagged adjective agree correctly with its subject? (Subject-control versus object-control verbs).,"S-V agreement, through control verbs",Morpho-Syntactic,
7
+ Isabel challenge set,EN,She promised her doctor to remain [active] after retiring.,Does the flagged adjective agree correctly with its subject? (Subject-control versus object-control verbs).,"S-V agreement, through control verbs",Morpho-Syntactic,
8
+ Isabel challenge set,EN,My mother promised my father to be more [prudent] on the road.,Does the flagged adjective agree correctly with its subject? (Subject-control versus object-control verbs).,"S-V agreement, through control verbs",Morpho-Syntactic,
9
+ Isabel challenge set,EN,The woman was very [tall] and extremely [strong].,Do the marked verbs/adjective agree correctly with their subject? (Agreement distribution over coordinated predicates),"S-V agreement, coordinated targets",Morpho-Syntactic,
10
+ Isabel challenge set,EN,Their politicians were more [ignorant] than [stupid].,Do the marked verbs/adjective agree correctly with their subject? (Agreement distribution over coordinated predicates),"S-V agreement, coordinated targets",Morpho-Syntactic,
11
+ Isabel challenge set,EN,We [shouted] an insult and [left] abruptly.,Do the marked verbs/adjective agree correctly with their subject? (Agreement distribution over coordinated predicates),"S-V agreement, coordinated targets",Morpho-Syntactic,
12
+ Isabel challenge set,EN,The cat and the dog [should] be [watched].,Do the marked verbs/adjective agree correctly with their subject? (Masculine singular ET masculine singular yields masculine plural).,"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
13
+ Isabel challenge set,EN,My father and my brother [will] be [happy] tomorrow.,Do the marked verbs/adjective agree correctly with their subject? (Masculine singular ET masculine singular yields masculine plural).,"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
14
+ Isabel challenge set,EN,My book and my pencil [could] be [stolen].,Do the marked verbs/adjective agree correctly with their subject? (Masculine singular ET masculine singular yields masculine plural).,"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
15
+ Isabel challenge set,EN,The cow and the hen [must] be [fed].,Do the marked verbs/adjectives agree correctly with their subject? (Feminine singular ET feminine singular yields feminine plural).,"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
16
+ Isabel challenge set,EN,My mother and my sister [will be happy] tomorrow.,Do the marked verbs/adjectives agree correctly with their subject? (Feminine singular ET feminine singular yields feminine plural).,"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
17
+ Isabel challenge set,EN,My shoes and my socks [will] be [found].,Do the marked verbs/adjectives agree correctly with their subject? (Feminine singular ET feminine singular yields feminine plural).,"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
18
+ Isabel challenge set,EN,The dog and the cow [are] [nervous].,Do the marked verbs/adjectives agree correctly with their subject? (Masculine singular ET feminine singular yields masculine plural.),"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
19
+ Isabel challenge set,EN,My father and my mother will be happy tomorrow.,Do the marked verbs/adjectives agree correctly with their subject? (Masculine singular ET feminine singular yields masculine plural.),"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
20
+ Isabel challenge set,EN,My refrigerator and my kitchen table [were] [stolen].,Do the marked verbs/adjectives agree correctly with their subject? (Masculine singular ET feminine singular yields masculine plural.),"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
21
+ Isabel challenge set,EN,Paul and I [could] easily be [convinced] to join you.,Do the marked verbs/adjectives agree correctly with their subject? (Smallest coordinated grammatical person wins.),"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
22
+ Isabel challenge set,EN,You and he [could] be [surprised] by her findings.,Do the marked verbs/adjectives agree correctly with their subject? (Smallest coordinated grammatical person wins.),"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
23
+ Isabel challenge set,EN,We and they [are] on different courses.,Do the marked verbs/adjectives agree correctly with their subject? (Smallest coordinated grammatical person wins.),"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
24
+ Isabel challenge set,EN,The woman who [saw] a mouse in the corridor is charming.,Are the agreement marks of the flagged participles the correct ones? (Past participle placed after auxiliary AVOIR agrees with verb object iff object precedes auxiliary. Otherwise participle is in masculine singular form).,"S-V agreement, past participles",Morpho-Syntactic,
25
+ Isabel challenge set,EN,The woman that your brother [saw] in the corridor is charming.,Are the agreement marks of the flagged participles the correct ones? (Past participle placed after auxiliary AVOIR agrees with verb object iff object precedes auxiliary. Otherwise participle is in masculine singular form).,"S-V agreement, past participles",Morpho-Syntactic,
26
+ Isabel challenge set,EN,The house that John has [visited] is crumbling.,Are the agreement marks of the flagged participles the correct ones? (Past participle placed after auxiliary AVOIR agrees with verb object iff object precedes auxiliary. Otherwise participle is in masculine singular form).,"S-V agreement, past participles",Morpho-Syntactic,
27
+ Isabel challenge set,EN,John sold the car that he had [won] in a lottery.,Are the agreement marks of the flagged participles the correct ones? (Past participle placed after auxiliary AVOIR agrees with verb object iff object precedes auxiliary. Otherwise participle is in masculine singular form).,"S-V agreement, past participles",Morpho-Syntactic,
28
+ Isabel challenge set,EN,He will come provided that you [come] too.,"Is the flagged verb in the correct mood? (Certain triggering verbs, adjectives or subordinate conjunctions, induce the subjunctive mood in the subordinate clause that they govern).",Subjunctive mood,Morpho-Syntactic,
29
+ Isabel challenge set,EN,It is unfortunate that he is not [coming] either.,"Is the flagged verb in the correct mood? (Certain triggering verbs, adjectives or subordinate conjunctions, induce the subjunctive mood in the subordinate clause that they govern).",Subjunctive mood,Morpho-Syntactic,
30
+ Isabel challenge set,EN,I requested that families not [be] separated.,"Is the flagged verb in the correct mood? (Certain triggering verbs, adjectives or subordinate conjunctions, induce the subjunctive mood in the subordinate clause that they govern).",Subjunctive mood,Morpho-Syntactic,
31
+ Isabel challenge set,EN,[Mary] sorely misses [Jim].,Are the experiencer and the object of the ``missing'' situation correctly preserved in the French translation? (Argument switch).,Argument switch,Lexico-Syntactic,
32
+ Isabel challenge set,EN,[My sister] is really missing [New York.],Are the experiencer and the object of the ``missing'' situation correctly preserved in the French translation? (Argument switch).,Argument switch,Lexico-Syntactic,
33
+ Isabel challenge set,EN,What [he] misses most is [his dog].,Are the experiencer and the object of the ``missing'' situation correctly preserved in the French translation? (Argument switch).,Argument switch,Lexico-Syntactic,
34
+ Isabel challenge set,EN,John gave [his wonderful wife] a nice present.,Are ``gift'' and ``recipient'' arguments correctly rendered in French? (English double-object constructions),Double-object verbs,Lexico-Syntactic,
35
+ Isabel challenge set,EN,John told [the kids] a nice story.,Are ``gift'' and ``recipient'' arguments correctly rendered in French? (English double-object constructions),Double-object verbs,Lexico-Syntactic,
36
+ Isabel challenge set,EN,John sent [his mother] a nice postcard.,Are ``gift'' and ``recipient'' arguments correctly rendered in French? (English double-object constructions),Double-object verbs,Lexico-Syntactic,
37
+ Isabel challenge set,EN,John [failed to] see the relevance of this point.,Is the meaning of ``fail to'' correctly rendered in the French translation?,Fail to,Lexico-Syntactic,
38
+ Isabel challenge set,EN,He failed to respond.,Is the meaning of ``fail to'' correctly rendered in the French translation?,Fail to,Lexico-Syntactic,
39
+ Isabel challenge set,EN,Those who fail to comply with this requirement will be penalized.,Is the meaning of ``fail to'' correctly rendered in the French translation?,Fail to,Lexico-Syntactic,
40
+ Isabel challenge set,EN,John would like to [swim across] the river.,Is the movement action expressed in the English source correctly rendered in French? (Manner-of-movement verbs with path argument may need to be rephrased in French).,Manner-of-movement verbs,Lexico-Syntactic,
41
+ Isabel challenge set,EN,They [ran into] the room.,Is the movement action expressed in the English source correctly rendered in French? (Manner-of-movement verbs with path argument may need to be rephrased in French).,Manner-of-movement verbs,Lexico-Syntactic,
42
+ Isabel challenge set,EN,The man [ran out of] the park.,Is the movement action expressed in the English source correctly rendered in French? (Manner-of-movement verbs with path argument may need to be rephrased in French).,Manner-of-movement verbs,Lexico-Syntactic,
43
+ Isabel challenge set,EN,John [guitared his way] to San Francisco.,Hard example featuring spontaneous noun-to-verb derivation (``nonce verb'').,Manner-of-movement verbs,Lexico-Syntactic,
44
+ Isabel challenge set,EN,Paul [knows] that this is a fact.,Is the French verb for ``know'' correctly chosen? (Choice between ``savoir''/``connaître'' depends on syntactic nature of its object),Overlapping subcat frames,Lexico-Syntactic,
45
+ Isabel challenge set,EN,Paul [knows] this story.,Is the French verb for ``know'' correctly chosen? (Choice between ``savoir''/``connaître'' depends on syntactic nature of its object),Overlapping subcat frames,Lexico-Syntactic,
46
+ Isabel challenge set,EN,Paul [knows] this story is hard to believe.,Is the French verb for ``know'' correctly chosen? (Choice between ``savoir''/``connaître'' depends on syntactic nature of its object),Overlapping subcat frames,Lexico-Syntactic,
47
+ Isabel challenge set,EN,He [knows] my sister will not take it.,Is the French verb for ``know'' correctly chosen? (Choice between ``savoir''/``connaître'' depends on syntactic nature of its object),Overlapping subcat frames,Lexico-Syntactic,
48
+ Isabel challenge set,EN,My sister [knows] your son is reliable.,Is the French verb for ``know'' correctly chosen? (Choice between ``savoir''/``connaître'' depends on syntactic nature of its object),Overlapping subcat frames,Lexico-Syntactic,
49
+ Isabel challenge set,EN,John believes [Bill to be dishonest].,Is the English ``NP to VP'' complement correctly rendered in the French translation? (Sometimes one needs to translate this structure as a finite clause).,NP to VP,Lexico-Syntactic,
50
+ Isabel challenge set,EN,He liked [his father to tell him stories].,Is the English ``NP to VP'' complement correctly rendered in the French translation? (Sometimes one needs to translate this structure as a finite clause).,NP to VP,Lexico-Syntactic,
51
+ Isabel challenge set,EN,She wanted [her mother to let her go].,Is the English ``NP to VP'' complement correctly rendered in the French translation? (Sometimes one needs to translate this structure as a finite clause).,NP to VP,Lexico-Syntactic,
52
+ Isabel challenge set,EN,John [cooked] a big chicken.,Is the English verb correctly rendered in the French translation? (Agentive use of some French verbs require embedding under ``faire'').,Factitives,Lexico-Syntactic,
53
+ Isabel challenge set,EN,John [melted] a lot of ice.,Is the English verb correctly rendered in the French translation? (Agentive use of some French verbs require embedding under ``faire'').,Factitives,Lexico-Syntactic,
54
+ Isabel challenge set,EN,She likes to [grow] flowers.,Is the English verb correctly rendered in the French translation? (Agentive use of some French verbs require embedding under ``faire'').,Factitives,Lexico-Syntactic,
55
+ Isabel challenge set,EN,Use the meat knife.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
56
+ Isabel challenge set,EN,Use the butter knife.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
57
+ Isabel challenge set,EN,Use the steak knife.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
58
+ Isabel challenge set,EN,Clean the water filter.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
59
+ Isabel challenge set,EN,Clean the juice filter.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
60
+ Isabel challenge set,EN,Clean the tea filter.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
61
+ Isabel challenge set,EN,Clean the cloth filter.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
62
+ Isabel challenge set,EN,Clean the metal filter.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
63
+ Isabel challenge set,EN,Clean the paper filter.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
64
+ Isabel challenge set,EN,Stop [beating around the bush].,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Common idioms,Lexico-Syntactic,
65
+ Isabel challenge set,EN,You are [putting the cart before the horse].,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Common idioms,Lexico-Syntactic,
66
+ Isabel challenge set,EN,His comment proved to be [the straw that broke the camel's back].,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Common idioms,Lexico-Syntactic,
67
+ Isabel challenge set,EN,His argument really [hit the nail on the head].,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Common idioms,Lexico-Syntactic,
68
+ Isabel challenge set,EN,It's [no use crying over spilt milk].,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Common idioms,Lexico-Syntactic,
69
+ Isabel challenge set,EN,It is [no use crying over spilt milk].,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Common idioms,Lexico-Syntactic,
70
+ Isabel challenge set,EN,The cart has been put before the horse.,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Syntactically flexible idioms,Lexico-Syntactic,
71
+ Isabel challenge set,EN,"With this argument, [the nail has been hit on the head].",Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Syntactically flexible idioms,Lexico-Syntactic,
72
+ Isabel challenge set,EN,[Have the kids] ever watched that movie?,Is the English question correctly rendered as a French question?,Yes-no question syntax,Syntactic,
73
+ Isabel challenge set,EN,[Hasn't your boss denied you] a promotion?,Is the English question correctly rendered as a French question?,Yes-no question syntax,Syntactic,
74
+ Isabel challenge set,EN,[Shouldn't I attend] this meeting?,Is the English question correctly rendered as a French question?,Yes-no question syntax,Syntactic,
75
+ Isabel challenge set,EN,"Mary looked really happy tonight, [didn't she]?",Is the English ``tag question'' element correctly rendered in the translation?,Tag questions,Syntactic,
76
+ Isabel challenge set,EN,"We should not do that again, [should we]?",Is the English ``tag question'' element correctly rendered in the translation?,Tag questions,Syntactic,
77
+ Isabel challenge set,EN,"She was perfect tonight, [was she not]?",Is the English ``tag question'' element correctly rendered in the translation?,Tag questions,Syntactic,
78
+ Isabel challenge set,EN,The guy [that] she is going out [with] is handsome.,Is the dangling preposition of the English sentence correctly placed in the French translation?,WH-MVT and stranded preps,Syntactic,
79
+ Isabel challenge set,EN,[Whom] is she going out [with] these days?,Is the dangling preposition of the English sentence correctly placed in the French translation?,WH-MVT and stranded preps,Syntactic,
80
+ Isabel challenge set,EN,The girl [that] he has been talking [about] is smart.,Is the dangling preposition of the English sentence correctly placed in the French translation?,WH-MVT and stranded preps,Syntactic,
81
+ Isabel challenge set,EN,[Who] was he talking [to] when you left?,Is the dangling preposition of the English sentence correctly placed in the French translation?,WH-MVT and stranded preps,Syntactic,
82
+ Isabel challenge set,EN,The city [that] he is arriving [from] is dangerous.,Is the dangling preposition of the English sentence correctly placed in the French translation?,WH-MVT and stranded preps,Syntactic,
83
+ Isabel challenge set,EN,[Where] is he arriving [from]?,Is the dangling preposition of the English sentence correctly placed in the French translation?,WH-MVT and stranded preps,Syntactic,
84
+ Isabel challenge set,EN,Rarely [did the dog] run.,Is the adverb-triggered subject-verb inversion in the English sentence correctly rendered in the French translation?,Adverb-triggered inversion,Syntactic,
85
+ Isabel challenge set,EN,Never before [had she been] so unhappy.,Is the adverb-triggered subject-verb inversion in the English sentence correctly rendered in the French translation?,Adverb-triggered inversion,Syntactic,
86
+ Isabel challenge set,EN,Nowhere [were the birds] so colorful.,Is the adverb-triggered subject-verb inversion in the English sentence correctly rendered in the French translation?,Adverb-triggered inversion,Syntactic,
87
+ Isabel challenge set,EN,Soup [is eaten] with a large spoon.,Is the generic statement made in the English sentence correctly and naturally rendered in the French translation?,Middle voice,Syntactic,
88
+ Isabel challenge set,EN,Masonry [is cut] using a diamond blade.,Is the generic statement made in the English sentence correctly and naturally rendered in the French translation?,Middle voice,Syntactic,
89
+ Isabel challenge set,EN,Champagne [is drunk] in a glass called a flûte.,Is the generic statement made in the English sentence correctly and naturally rendered in the French translation?,Middle voice,Syntactic,
90
+ Isabel challenge set,EN,"[Should] Paul leave, I would be sad.",Fronted ``should'' is interpreted as a conditional subordinator. It is normally translated as ``si'' with imperfect tense.,Fronted ``should'',Syntactic,
91
+ Isabel challenge set,EN,"Should he become president, she would be promoted immediately.",Fronted ``should'' is interpreted as a conditional subordinator. It is normally translated as ``si'' with imperfect tense.,Fronted ``should'',Syntactic,
92
+ Isabel challenge set,EN,"[Should] he fall, he would get up again immediately.",Fronted ``should'' is interpreted as a conditional subordinator. It is normally translated as ``si'' with imperfect tense.,Fronted ``should'',Syntactic,
93
+ Isabel challenge set,EN,She had a lot of money but he did not have [any].,Are the English pronouns correctly rendered in the French translations?,Clitic pronouns,Syntactic,
94
+ Isabel challenge set,EN,He did not talk [to them] very often.,Are the English pronouns correctly rendered in the French translations?,Clitic pronouns,Syntactic,
95
+ Isabel challenge set,EN,The men are watching [each other].,Are the English pronouns correctly rendered in the French translations?,Clitic pronouns,Syntactic,
96
+ Isabel challenge set,EN,He gave [it] to the man.,Are the English pronouns correctly rendered in the French translations?,Clitic pronouns,Syntactic,
97
+ Isabel challenge set,EN,He did not give [it] to [her].,Are the English pronouns correctly rendered in the French translations?,Clitic pronouns,Syntactic,
98
+ Isabel challenge set,EN,The [first four] men were exhausted.,Is the relative order of the ordinals and numerals correct in the French translation?,Ordinal placement,Syntactic,
99
+ Isabel challenge set,EN,The [last three] candidates were eliminated.,Is the relative order of the ordinals and numerals correct in the French translation?,Ordinal placement,Syntactic,
100
+ Isabel challenge set,EN,The [other two] guys left without paying.,Is the relative order of the ordinals and numerals correct in the French translation?,Ordinal placement,Syntactic,
101
+ Isabel challenge set,EN,He washed [his] hands.,Is the French translation correct and natural both in: a) its use of a particular determiner on the body part noun; and b) the presence or absence of a reflexive pronoun before the verb?,Inalienable possession,Syntactic,
102
+ Isabel challenge set,EN,I brushed [my] teeth.,Is the French translation correct and natural both in: a) its use of a particular determiner on the body part noun; and b) the presence or absence of a reflexive pronoun before the verb?,Inalienable possession,Syntactic,
103
+ Isabel challenge set,EN,You brushed [your] teeth.,Is the French translation correct and natural both in: a) its use of a particular determiner on the body part noun; and b) the presence or absence of a reflexive pronoun before the verb?,Inalienable possession,Syntactic,
104
+ Isabel challenge set,EN,I raised [my] hand.,Is the French translation correct and natural both in: a) its use of a particular determiner on the body part noun; and b) the presence or absence of a reflexive pronoun before the verb?,Inalienable possession,Syntactic,
105
+ Isabel challenge set,EN,He turned [his] head.,Is the French translation correct and natural both in: a) its use of a particular determiner on the body part noun; and b) the presence or absence of a reflexive pronoun before the verb?,Inalienable possession,Syntactic,
106
+ Isabel challenge set,EN,He raised his eyes to heaven.,Is the French translation correct and natural both in: a) its use of a particular determiner on the body part noun; and b) the presence or absence of a reflexive pronoun before the verb?,Inalienable possession,Syntactic,
107
+ Isabel challenge set,EN,The strangers [] the woman saw were working.,Is the English zero relative pronoun correctly translated as a non-zero one in the French translation?,Zero REL PRO,Syntactic,
108
+ Isabel challenge set,EN,The man [] your sister hates is evil.,Is the English zero relative pronoun correctly translated as a non-zero one in the French translation?,Zero REL PRO,Syntactic,
109
+ Isabel challenge set,EN,The girl [] my friend was talking about is gone.,Is the English zero relative pronoun correctly translated as a non-zero one in the French translation?,Zero REL PRO,Syntactic,
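Each row above follows a fixed comma-separated layout: source set, source language, an English sentence whose tested span is marked in square brackets, the evaluation question about the French output, the linguistic phenomenon, and its broad category. As a minimal sketch of how such rows could be parsed (the file name and column names below are assumptions; this hunk contains no loader for the rows), one could do:

    # Sketch only: column names and the file path are assumptions, not part of the commit.
    import pandas as pd

    cols = ["source", "lang", "sentence", "question", "phenomenon", "category", "extra"]
    df = pd.read_csv("challenge_set.csv", names=cols)  # hypothetical path, header-less CSV

    # Extract the [bracketed] focus spans that mark the phenomenon under test.
    df["focus"] = df["sentence"].str.findall(r"\[([^\]]*)\]")
    print(df.groupby("category")["phenomenon"].nunique())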
index/en-es_input_tokens.index ADDED
Binary file (245 kB).
 
index/en-es_input_words.index ADDED
Binary file (206 kB).
 
index/en-es_metadata_ref.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bc8b47dc5db98c9cee79be4647d6853756838632ea2f42c1a4c377e948fd8a3
3
+ size 7599198
index/en-es_output_tokens.index ADDED
Binary file (331 kB).
 
index/en-es_output_words.index ADDED
Binary file (259 kB).
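The four .index files added here are FAISS indexes over the tokens and words of the en-es inputs and outputs, and the LFS-tracked en-es_metadata_ref.pkl carries the metadata they refer to. A hedged sketch of loading such a pair (faiss-cpu is pinned in requirements.txt below; the query step is left commented out because the embedding dimension and dtype must match the index):

    # Sketch only: assumes the .pkl rows correspond to entries in the indexes.
    import pickle
    import faiss

    index = faiss.read_index("index/en-es_input_tokens.index")
    with open("index/en-es_metadata_ref.pkl", "rb") as f:
        metadata = pickle.load(f)

    print(index.ntotal, "vectors of dimension", index.d)
    # query: float32 array of shape (1, index.d)
    # distances, ids = index.search(query, k=5)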
 
plotsjs.js ADDED
@@ -0,0 +1,990 @@
1
+
2
+
3
+ async () => {
4
+ // set testFn() on globalThis, so your HTML onclick handlers can access it
5
+
6
+
7
+ globalThis.testFn = () => {
8
+ document.getElementById('demo').innerHTML = "Hello?"
9
+ };
10
+
11
+ const d37 = await import("https://cdn.jsdelivr.net/npm/d3@7/+esm");
12
+ const d3 = await import("https://cdn.jsdelivr.net/npm/d3@5/+esm");
13
+ const $ = await import("https://cdn.jsdelivr.net/npm/jquery@3.7.1/dist/jquery.min.js");
14
+ globalThis.$ = $;
15
+
16
+ globalThis.d3 = d3;
17
+
18
+ globalThis.d3Fn = () => {
19
+ d3.select('#viz').append('svg')
20
+ .append('rect')
21
+ .attr('width', 50)
22
+ .attr('height', 50)
23
+ .attr('fill', 'black')
24
+ .on('mouseover', function(){d3.select(this).attr('fill', 'red')})
25
+ .on('mouseout', function(){d3.select(this).attr('fill', 'black')});
26
+
27
+ };
28
+
29
+ globalThis.testFn_out = (val,radio_c) => {
30
+ // document.getElementById('demo').innerHTML = val
31
+ console.log(val);
32
+ // globalThis.d3Fn();
33
+ return([val,radio_c]);
34
+ };
35
+
36
+
37
+ globalThis.testFn_out_json = (data) => {
38
+ console.log(data);
39
+ var $ = jQuery;
40
+
41
+ data_beam = data[1][0];
42
+ data_probs = data[1][1];
43
+ data_html_inputs = data[1][2];
44
+ data_html_target = data[1][3];
45
+ data_embds = data[2];
46
+
47
+ attViz(data[3]);
48
+ attViz(data[4]);
49
+ attViz(data[5]);
50
+
51
+
52
+ console.log(data_beam);
53
+ const idMapping = data_beam.reduce((acc, el, i) => {
54
+ acc[el.id] = i;
55
+ return acc;
56
+ }, {});
57
+
58
+ let root;
59
+ data_beam.forEach(el => {
60
+ // Handle the root element
61
+ if (el.parentId === null) {
62
+ root = el;
63
+ return;
64
+ }
65
+ // Use our mapping to locate the parent element in our data_beam array
66
+ const parentEl = data_beam[idMapping[el.parentId]];
67
+ // Add our current el to its parent's `children` array
68
+ parentEl.children = [...(parentEl.children || []), el];
69
+ });
70
+
71
+
72
+ // console.log(Tree(root));
73
+ // document.getElementById('d3_beam_search').innerHTML = Tree(root)
74
+ d3.select('#d3_beam_search').html("");
75
+ d3.select('#d3_beam_search').append(function(){return Tree(root);});
76
+
77
+ //probabilities;
78
+ //
79
+ d3.select('#d3_text_grid').html("");
80
+ d3.select('#d3_text_grid').append(function(){return TextGrid(data_probs);});
81
+ // $('#d3_text_grid').html(TextGrid(data)) ;
82
+
83
+ //tokenization;
84
+ d3.select('#d3_tok').html(data_html_inputs);
85
+ d3.select('#d3_tok_target').html(data_html_target);
86
+
87
+ //embeddings
88
+ d3.select("#d3_embeds_source").html("here");
89
+ // words or token visualization ?
90
+ console.log(d3.select("#select_type").node().value);
91
+ d3.select("#select_type").attr("hidden", null);
92
+ d3.select("#select_type").on("change", change);
93
+ change();
94
+ // tokens
95
+ // network plots;
96
+ ['input', 'output'].forEach(text_type => {
97
+ ['tokens', 'words'].forEach(text_key => {
98
+ // console.log(type, key, data[0][text_type]);
99
+ data_i = data_embds[text_type][text_key];
100
+ embeddings_network([], data_i['tnse'], data_i['similar_queries'], text_type + "_" + text_key);
101
+ });
102
+ });
103
+
104
+ // $('#d3_beam_search').html(Tree(root)) ;
105
+
106
+ return(['string', {}])
107
+
108
+ }
109
+
110
+ function change() {
111
+ show_type = d3.select("#select_type").node().value;
112
+ // hide all
113
+ d3.selectAll(".d3_embed").attr("hidden",'');
114
+ d3.selectAll(".d3_graph").attr("hidden", '');
115
+ // show current type;
116
+ d3.select("#d3_embeds_input_" + show_type).attr("hidden", null);
117
+ d3.select("#d3_embeds_output_" + show_type).attr("hidden", null);
118
+ d3.select("#d3_graph_input_" + show_type).attr("hidden", null);
119
+ d3.select("#d3_graph_output_" + show_type).attr("hidden", null);
120
+ }
121
+
122
+ function embeddings_network(tokens_text, dict_projected_embds, similar_vocab_queries, type="source", ){
123
+ // tokens_text : not used;
124
+ // dict_projected_embds = tnse
125
+ console.log("Each token is a node; distance if in similar list", type );
126
+ console.log(tokens_text, dict_projected_embds, similar_vocab_queries);
127
+ // similar_vocab_queries_target[key]['similar_topk']
128
+
129
+ var nodes_tokens = {}
130
+ var nodeHash = {};
131
+ var nodes = []; // [{id: , label: }]
132
+ var edges = []; // [{source: , target: weight: }]
133
+ var edges_ids = []; // [{source: , target: weight: }]
134
+
135
+ // similar_vocab_queries {key: {similar_topk : [], distance : []}}
136
+ console.log('similar_vocab_queries', similar_vocab_queries);
137
+ prev_node = '';
138
+ for ([sent_token, value] of Object.entries(similar_vocab_queries)) {
139
+ // console.log('dict_projected_embds',sent_token, parseInt(sent_token), value, dict_projected_embds);
140
+ // sent_token = parseInt(sent_token); // Object.entries yields string keys;
141
+ token_text = dict_projected_embds[sent_token][3]
142
+ if (!nodeHash[sent_token]) {
143
+ nodeHash[sent_token] = {id: sent_token, label: token_text, type: 'sentence', type_i: 0};
144
+ nodes.push(nodeHash[sent_token]);
145
+ }
146
+ sim_tokens = value['similar_topk']
147
+ dist_tokens = value['distance']
148
+
149
+ for (let index = 0; index < sim_tokens.length; index++) {
150
+ const sim = sim_tokens[index];
151
+ const dist = dist_tokens[index];
152
+
153
+ token_text_sim = dict_projected_embds[sim][3]
154
+ if (!nodeHash[sim]) {
155
+ nodeHash[sim] = {id: sim, label: token_text_sim, type:'similar', type_i: 1};
156
+ nodes.push(nodeHash[sim]);
157
+ }
158
+ edges.push({source: nodeHash[sent_token], target: nodeHash[sim], weight: dist});
159
+ edges_ids.push({source: sent_token, target: sim, weight: dist});
160
+ }
161
+
162
+ if (prev_node != '' ) {
163
+ edges.push({source: nodeHash[prev_node], target:nodeHash[sent_token], weight: 1});
164
+ edges_ids.push({source: prev_node, target: sent_token, weight: 1});
165
+ }
166
+ prev_node = sent_token;
167
+
168
+ }
169
+ console.log("TYPE", type, edges, nodes, edges_ids, similar_vocab_queries)
170
+ // d3.select('#d3_graph_input_tokens').html(networkPlot({nodes: nodes, links:edges}, similar_vocab_queries, div_type=type) );
171
+ // type +"_"+key
172
+ d3.select('#d3_graph_'+type).html("");
173
+ d3.select('#d3_graph_'+type).append(function(){return networkPlot({nodes: nodes, links: edges}, similar_vocab_queries, dict_projected_embds, type);});
174
+
175
+ // $('#d3_embeds_network_target').html(networkPlot({nodes: nodes, links:edges}));
176
+ // $('#d3_embeds_network_'+type).html(networkPlot({nodes: nodes, links: edges}));
177
+ }
178
+
179
+ function networkPlot(data, similar_vocab_queries,dict_proj, div_type="source", {
180
+ width = 400, // outer width, in pixels
181
+ height , // outer height, in pixels
182
+ r = 3, // radius of nodes
183
+ padding = 1, // horizontal padding for first and last column
184
+ // text = d => d[2],
185
+ } = {}){
186
+ // data_dict = data;
187
+ data = data// [div_type];
188
+ similar_vocab_queries = similar_vocab_queries// [div_type];
189
+ console.log("data, similar_vocab_queries, div_type");
190
+ console.log(data, similar_vocab_queries, div_type);
191
+
192
+ // Create the SVG container.
193
+ var margin = {top: 10, right: 10, bottom: 30, left: 50 },
194
+ width = width //- margin.left - margin.right,
195
+ height = 400 //- margin.top - margin.bottom;
196
+
197
+ width_box = width + margin.left + margin.right;
198
+ height_box = height + margin.top + margin.bottom
199
+ totalWidth = width*2;
200
+
201
+
202
+ var svg = d37.create("svg")
203
+ .attr("width", width + margin.left + margin.right)
204
+ .attr("height", height + margin.top + margin.bottom)
205
+
206
+ // Initialize the links
207
+ var link = svg
208
+ .selectAll("line")
209
+ .data(data.links)
210
+ .enter()
211
+ .append("line")
212
+ .style("fill", d => d.weight == 1 ? "#dfd5d5" : "#000000") // , "#69b3a2" : "#69b3a2")
213
+ .style("stroke", "#aaa")
214
+
215
+
216
+
217
+ var text = svg
218
+ .selectAll("text")
219
+ .data(data.nodes)
220
+ .enter()
221
+ .append("text")
222
+ .style("text-anchor", "middle")
223
+ .attr("y", 15)
224
+ .attr("class", d => 'text_token-'+ dict_proj[d.id][4] + div_type)
225
+ .attr("div-type", div_type)
226
+ // .attr("class", d => 'text_token-'+ d.index)
227
+ .text(function (d) {return d.label} )
228
+ // .on('mouseover', function(d) { (d.type_i == 0) ? highlight_mouseover_text : console.log(0)} )
229
+ // .on('mouseover', function(d) { (d.type_i == 0) ? highlight_mouseout_text : '' } )
230
+ // .on('mouseout', highlight_mouseout_text )
231
+ // .join('text')
232
+ // .text(function(d) {
233
+ // return d.id
234
+ // })
235
+
236
+ // Initialize the nodes
237
+ var node = svg
238
+ .selectAll("circle")
239
+ .data(data.nodes)
240
+ .enter()
241
+ .append("circle")
242
+ .attr("r", 6)
243
+ // .attr("class", d => 'node_token-'+ d.id)
244
+ .attr("class", d => 'node_token-'+ dict_proj[d.id][4] + div_type)
245
+ .attr("div-type", div_type)
246
+ .style("fill", d => d.type_i ? "#e85252" : "#6689c6") // , "#69b3a2" : "#69b3a2")
247
+ .on('mouseover', highlight_mouseover )
248
+ // .on('mouseover', function(d) { return (d.type_i == 0) ? highlight_mouseover : console.log(0)} )
249
+ .on('mouseout',highlight_mouseout )
250
+ .on('click', change_legend )
251
+ // .on('click', show_similar_tokens )
252
+
253
+
254
+
255
+ // List the forces we want to apply to the network
256
+ var simulation = d37.forceSimulation(data.nodes) // Force algorithm is applied to data.nodes
257
+ .force("link", d37.forceLink() // This force provides links between nodes
258
+ .id(function(d) { return d.id; }) // This provide the id of a node
259
+ .links(data.links) // and this the list of links
260
+ )
261
+ .force("charge", d37.forceManyBody(-400)) // This adds repulsion between nodes. Play with the -400 for the repulsion strength
262
+ .force("center", d37.forceCenter(width / 2, height / 2)) // This force attracts nodes to the center of the svg area
263
+ // .force("collision", d3.forceCollide())
264
+ .on("end", ticked);
265
+
266
+ // This function is run at each iteration of the force algorithm, updating the nodes position.
267
+ function ticked() {
268
+ link
269
+ .attr("x1", function(d) { return d.source.x; })
270
+ .attr("y1", function(d) { return d.source.y; })
271
+ .attr("x2", function(d) { return d.target.x; })
272
+ .attr("y2", function(d) { return d.target.y; });
273
+
274
+ node
275
+ .attr("cx", function (d) { return d.x+3; })
276
+ .attr("cy", function(d) { return d.y-3; });
277
+
278
+ text
279
+ .attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; })
280
+ }
281
+
282
+ function highlight_mouseover(d,i) {
283
+ console.log("highlight_mouseover", d,i, d37.select(this).attr("div-type"));
284
+ if (i.type_i == 0 ){
285
+ token_id = i.id
286
+ similar_ids = similar_vocab_queries[token_id]['similar_topk'];
287
+ d37.select(this).transition()
288
+ .duration('50')
289
+ .style('opacity', '1')
290
+ .attr("r", 12)
291
+ type = d37.select(this).attr("div-type")
292
+ similar_ids.forEach(similar_token => {
293
+ node_id_name = dict_proj[similar_token][4]
294
+ d37.selectAll('.node_token-'+ node_id_name + type).attr("r",12 ).style('opacity', '1')//.raise()
295
+ // d3.selectAll('.text_token-'+ node_id_name).raise()
296
+ });
297
+ }
298
+ }
299
+
300
+
301
+ function highlight_mouseout(d,i) {
302
+ if (i.type_i == 0 ){
303
+ token_id = i.id
304
+ console.log("similar_vocab_queries", similar_vocab_queries, "this type:", d37.select(this).attr("div-type"));
305
+ similar_ids = similar_vocab_queries[token_id]['similar_topk'];
306
+ // clean_sentences();
307
+ d37.select(this).transition()
308
+ .duration('50')
309
+ .style('opacity', '.7')
310
+ .attr("r", 6)
311
+ type = d37.select(this).attr("div-type")
312
+ similar_ids.forEach(similar_token => {
313
+ node_id_name = dict_proj[similar_token][4]
314
+ d37.selectAll('.node_token-' + node_id_name + type).attr("r",6 ).style('opacity', '.7')
315
+ d37.selectAll("circle").raise()
316
+ });
317
+ }
318
+ }
319
+
320
+ function change_legend(d,i,j) {
321
+ console.log(d,i,dict_proj);
322
+ if (i['id'] in dict_proj){
323
+ // show_sentences(dict_proj[i[2]], i[2]);
324
+
325
+ show_similar_tokens(i['id'], '#similar_'+type);
326
+
327
+ console.log(dict_proj[i['id']]);
328
+ }
329
+ else{console.log("no sentence")};
330
+ }
331
+
332
+ function show_similar_tokens(token, div_name_similar='#similar_input_tokens') {
333
+ d37.select(div_name_similar).html("");
334
+ console.log("token", token);
335
+ console.log("similar_vocab_queries[token]", similar_vocab_queries[token]);
336
+ token_data = similar_vocab_queries[token];
337
+ console.log(token, token_data);
338
+ var decForm = d37.format(".3f");
339
+
340
+ d37.select(div_name_similar)
341
+ .selectAll().append("p")
342
+ .data(token_data['similar_topk'])
343
+ .enter()
344
+ .append("p").append('text')
345
+ // .attr('class_data', sent_id)
346
+ .attr('class_id', d => d)
347
+ .style("background", d=> {if (d == token) return "yellow"} )
348
+ // .text( d => d + " \n ");
349
+ .text((d,i) => do_text(d,i) );
350
+
351
+ function do_text(d,i){
352
+ console.log("do_text d,i" );
353
+ console.log(d,i);
354
+ console.log("data_dict[d], data_dict");
355
+ console.log(dict_proj[d], dict_proj);
356
+ return dict_proj[d][3] + " " + decForm(token_data['distance'][i]) + " ";
357
+ }
358
+
359
+
360
+ }
361
+
362
+
363
+ return svg.node();
364
+
365
+ };
366
+
367
+
368
+
369
+ // Copyright 2021 Observable, Inc.
370
+ // Released under the ISC license.
371
+ // https://observablehq.com/@d3/tree
372
+ function Tree(data, { // data is either tabular (array of objects) or hierarchy (nested objects)
373
+ path, // as an alternative to id and parentId, returns an array identifier, imputing internal nodes
374
+ id = Array.isArray(data) ? d => d.id : null, // if tabular data, given a d in data, returns a unique identifier (string)
375
+ parentId = Array.isArray(data) ? d => d.parentId : null, // if tabular data, given a node d, returns its parent’s identifier
376
+ children, // if hierarchical data, given a d in data, returns its children
377
+ tree = d3.tree, // layout algorithm (typically d3.tree or d3.cluster)
378
+ sort, // how to sort nodes prior to layout (e.g., (a, b) => d3.descending(a.height, b.height))
379
+ label = d => d.name, // given a node d, returns the display name
380
+ title = d => d.name, // given a node d, returns its hover text
381
+ link , // given a node d, its link (if any)
382
+ linkTarget = "_blank", // the target attribute for links (if any)
383
+ width = 800, // outer width, in pixels
384
+ height, // outer height, in pixels
385
+ r = 3, // radius of nodes
386
+ padding = 1, // horizontal padding for first and last column
387
+ fill = "#999", // fill for nodes
388
+ fillOpacity, // fill opacity for nodes
389
+ stroke = "#555", // stroke for links
390
+ strokeWidth = 2, // stroke width for links
391
+ strokeOpacity = 0.4, // stroke opacity for links
392
+ strokeLinejoin, // stroke line join for links
393
+ strokeLinecap, // stroke line cap for links
394
+ halo = "#fff", // color of label halo
395
+ haloWidth = 3, // padding around the labels
396
+ curve = d37.curveBumpX, // curve for the link
397
+ } = {}) {
398
+
399
+ // If id and parentId options are specified, or the path option, use d3.stratify
400
+ // to convert tabular data to a hierarchy; otherwise we assume that the data is
401
+ // specified as an object {children} with nested objects (a.k.a. the “flare.json”
402
+ // format), and use d3.hierarchy.
403
+ const root = path != null ? d3.stratify().path(path)(data)
404
+ : id != null || parentId != null ? d3.stratify().id(id).parentId(parentId)(data)
405
+ : d3.hierarchy(data, children);
406
+
407
+ // Sort the nodes.
408
+ if (sort != null) root.sort(sort);
409
+
410
+ // Compute labels and titles.
411
+ const descendants = root.descendants();
412
+ const L = label == null ? null : descendants.map(d => label(d.data, d));
413
+
414
+ // Compute the layout.
415
+ const descWidth = 10;
416
+ // console.log('descendants', descendants);
417
+ const realWidth = descWidth * descendants.length
418
+ const totalWidth = (realWidth > width) ? realWidth : width;
419
+
420
+ const dx = 25;
421
+ const dy = totalWidth / (root.height + padding);
422
+ tree().nodeSize([dx, dy])(root);
423
+
424
+ // Center the tree.
425
+ let x0 = Infinity;
426
+ let x1 = -x0;
427
+ root.each(d => {
428
+ if (d.x > x1) x1 = d.x;
429
+ if (d.x < x0) x0 = d.x;
430
+ });
431
+
432
+ // Compute the default height.
433
+ if (height === undefined) height = x1 - x0 + dx * 2;
434
+
435
+
436
+
437
+ // Use the required curve
438
+ if (typeof curve !== "function") throw new Error(`Unsupported curve`);
439
+
440
+ const parent = d3.create("div");
441
+
442
+ const body = parent.append("div")
443
+ .style("overflow-x", "scroll")
444
+ .style("-webkit-overflow-scrolling", "touch");
445
+
446
+ const svg = body.append("svg")
447
+ .attr("viewBox", [-dy * padding / 2, x0 - dx, totalWidth, height])
448
+ .attr("width", totalWidth)
449
+ .attr("height", height)
450
+ .attr("style", "max-width: 100%; height: auto; height: intrinsic;")
451
+ .attr("font-family", "sans-serif")
452
+ .attr("font-size", 12);
453
+
454
+ svg.append("g")
455
+ .attr("fill", "none")
456
+ .attr("stroke", stroke)
457
+ .attr("stroke-opacity", strokeOpacity)
458
+ .attr("stroke-linecap", strokeLinecap)
459
+ .attr("stroke-linejoin", strokeLinejoin)
460
+ .attr("stroke-width", strokeWidth)
461
+ .selectAll("path")
462
+ .data(root.links())
463
+ .join("path")
464
+ // .attr("stroke", d => d.prob > 0.5 ? 'red' : 'blue' )
465
+ // .attr("fill", "red")
466
+ .attr("d", d37.link(curve)
467
+ .x(d => d.y)
468
+ .y(d => d.x));
469
+
470
+ const node = svg.append("g")
471
+ .selectAll("a")
472
+ .data(root.descendants())
473
+ .join("a")
474
+ .attr("xlink:href", link == null ? null : d => link(d.data, d))
475
+ .attr("target", link == null ? null : linkTarget)
476
+ .attr("transform", d => `translate(${d.y},${d.x})`);
477
+
478
+ node.append("circle")
479
+ .attr("fill", d => d.children ? stroke : fill)
480
+ .attr("r", r);
481
+
482
+ title = d => (d.name + " " + d.prob);
483
+
484
+ if (title != null) node.append("title")
485
+ .text(d => title(d.data, d));
486
+
487
+ if (L) node.append("text")
488
+ .attr("dy", "0.32em")
489
+ .attr("x", d => d.children ? -6 : 6)
490
+ .attr("text-anchor", d => d.children ? "end" : "start")
491
+ .attr("paint-order", "stroke")
492
+ .attr("stroke", 'white')
493
+ .attr("fill", d => d.data.prob == 1 ? ('red') : ('black') )
494
+ .attr("stroke-width", haloWidth)
495
+ .text((d, i) => L[i]);
496
+ body.node().scrollBy(totalWidth, 0);
497
+ return svg.node();
498
+ }
499
+
500
+ function TextGrid(data, div_name, {
501
+ width = 640, // outer width, in pixels
502
+ height , // outer height, in pixels
503
+ r = 3, // radius of nodes
504
+ padding = 1, // horizontal padding for first and last column
505
+ // text = d => d[2],
506
+ } = {}){
507
+ // console.log("TextGrid", data);
508
+
509
+ // Compute the layout.
510
+ const dx = 10;
511
+ const dy = 10; //width / (root.height + padding);
512
+
513
+ const marginTop = 20;
514
+ const marginRight = 20;
515
+ const marginBottom = 30;
516
+ const marginLeft = 30;
517
+
518
+ // Center the tree.
519
+ let x0 = Infinity;
520
+ let x1 = -x0;
521
+ topk = 10;
522
+ word_length = 20;
523
+ const rectWidth = 60;
524
+ const rectTotal = 70;
525
+
526
+ wval = 0
527
+
528
+ const realWidth = rectTotal * data.length
529
+ const totalWidth = (realWidth > width) ? realWidth : width;
530
+ // root.each(d => {
531
+ // if (d.x > x1) x1 = d.x;
532
+ // if (d.x < x0) x0 = d.x;
533
+ // });
534
+
535
+ // Compute the default height.
536
+ // if (height === undefined) height = x1 - x0 + dx * 2;
537
+ if (height === undefined) height = topk * word_length + 10;
538
+
539
+ const parent = d3.create("div");
540
+
541
+ // parent.append("svg")
542
+ // .attr("width", width)
543
+ // .attr("height", height)
544
+ // .style("position", "absolute")
545
+ // .style("pointer-events", "none")
546
+ // .style("z-index", 1);
547
+
548
+
549
+ // const svg = d3.create("svg")
550
+ // // svg = parent.append("svg")
551
+ // .attr("viewBox", [-dy * padding / 2, x0 - dx, width, height])
552
+ // .attr("width", width)
553
+ // .attr("height", height)
554
+ // .attr("style", "max-width: 100%; height: auto; height: intrinsic;")
555
+ // .attr("font-family", "sans-serif")
556
+ // .attr("font-size", 10);
557
+
558
+ // div.data([1, 2, 4, 8, 16, 32], d => d);
559
+ // div.enter().append("div").text(d => d);
560
+
561
+ const body = parent.append("div")
562
+ .style("overflow-x", "scroll")
563
+ .style("-webkit-overflow-scrolling", "touch");
564
+
565
+ const svg = body.append("svg")
566
+ .attr("width", totalWidth)
567
+ .attr("height", height)
568
+ .style("display", "block")
569
+ .attr("font-family", "sans-serif")
570
+ .attr("font-size", 10);
571
+
572
+
573
+ data.forEach(words_list => {
574
+ // console.log(wval, words_list);
575
+ words = words_list[2]; // {'t': words_list[2], 'p': words_list[1]};
576
+ scores = words_list[1];
577
+ words_score = words.map( (x,i) => {return {t: x, p: scores[i]}})
578
+ // console.log(words_score);
579
+ // svg.selectAll("text").enter()
580
+ // .data(words)
581
+ // .join("text")
582
+ // .text((d,i) => (d))
583
+ // .attr("x", wval)
584
+ // .attr("y", ((d,i) => (20 + i*20)))
585
+
586
+ var probs = svg.selectAll("text").enter()
587
+ .data(words_score).join('g');
588
+
589
+
590
+
591
+ probs.append("rect")
592
+ // .data(words)
593
+ .attr("x", wval)
594
+ .attr("y", ((d,i) => ( 10+ i*20)))
595
+ .attr('width', rectWidth)
596
+ .attr('height', 15)
597
+ .attr("color", 'gray')
598
+ .attr("fill", "gray")
599
+ // .attr("fill-opacity", "0.2")
600
+ .attr("fill-opacity", (d) => (d.p))
601
+ .attr("stroke-opacity", 0.8)
602
+ .append("svg:title")
603
+ .text(function(d){return d.t+":"+d.p;});
604
+
605
+
606
+ probs.append("text")
607
+ // .data(words)
608
+ .text((d,i) => (d.t))
609
+ .attr("x", wval)
610
+ .attr("y", ((d,i) => (20 + i*20)))
611
+ // .attr("fill", 'white')
612
+ .attr("font-weight", 700);
613
+
614
+ wval = wval + rectTotal;
615
+ });
616
+
617
+
618
+ body.node().scrollBy(totalWidth, 0);
619
+ // return svg.node();
620
+ return parent.node();
621
+ }
622
+
623
+
624
+ function attViz(PYTHON_PARAMS) {
625
+ var $ = jQuery;
626
+ const params = PYTHON_PARAMS; // HACK: PYTHON_PARAMS is a template marker that is replaced by actual params.
627
+ const TEXT_SIZE = 15;
628
+ const BOXWIDTH = 110;
629
+ const BOXHEIGHT = 22.5;
630
+ const MATRIX_WIDTH = 115;
631
+ const CHECKBOX_SIZE = 20;
632
+ const TEXT_TOP = 30;
633
+
634
+ console.log("d3 version in ffuntions", d3.version)
635
+ let headColors;
636
+ try {
637
+ headColors = d3.scaleOrdinal(d3.schemeCategory10);
638
+ } catch (err) {
639
+ console.log('Older d3 version')
640
+ headColors = d3.scale.category10();
641
+ }
642
+ let config = {};
643
+ // globalThis.
644
+ initialize();
645
+ renderVis();
646
+
647
+ function initialize() {
648
+ // globalThis.initialize = () => {
649
+
650
+ console.log("init")
651
+ config.attention = params['attention'];
652
+ config.filter = params['default_filter'];
653
+ config.rootDivId = params['root_div_id'];
654
+ config.nLayers = config.attention[config.filter]['attn'].length;
655
+ config.nHeads = config.attention[config.filter]['attn'][0].length;
656
+ config.layers = params['include_layers']
657
+
658
+ if (params['heads']) {
659
+ config.headVis = new Array(config.nHeads).fill(false);
660
+ params['heads'].forEach(x => config.headVis[x] = true);
661
+ } else {
662
+ config.headVis = new Array(config.nHeads).fill(true);
663
+ }
664
+ config.initialTextLength = config.attention[config.filter].right_text.length;
665
+ config.layer_seq = (params['layer'] == null ? 0 : config.layers.findIndex(layer => params['layer'] === layer));
666
+ config.layer = config.layers[config.layer_seq]
667
+
668
+ // '#' + temp1.root_div_id+ ' #layer'
669
+ $('#' + config.rootDivId+ ' #layer').empty();
670
+
671
+ let layerEl = $('#' + config.rootDivId+ ' #layer');
672
+ console.log(layerEl)
673
+ for (const layer of config.layers) {
674
+ layerEl.append($("<option />").val(layer).text(layer));
675
+ }
676
+ layerEl.val(config.layer).change();
677
+ layerEl.on('change', function (e) {
678
+ config.layer = +e.currentTarget.value;
679
+ config.layer_seq = config.layers.findIndex(layer => config.layer === layer);
680
+ renderVis();
681
+ });
682
+
683
+ $('#'+config.rootDivId+' #filter').on('change', function (e) {
684
+ // $(`#${config.rootDivId} #filter`).on('change', function (e) {
685
+
686
+ config.filter = e.currentTarget.value;
687
+ renderVis();
688
+ });
689
+ }
690
+
691
+ function renderVis() {
692
+
693
+ // Load parameters
694
+ const attnData = config.attention[config.filter];
695
+ const leftText = attnData.left_text;
696
+ const rightText = attnData.right_text;
697
+
698
+ // Select attention for given layer
699
+ const layerAttention = attnData.attn[config.layer_seq];
700
+
701
+ // Clear vis
702
+ $('#'+config.rootDivId+' #vis').empty();
703
+
704
+ // Determine size of visualization
705
+ const height = Math.max(leftText.length, rightText.length) * BOXHEIGHT + TEXT_TOP;
706
+ const svg = d3.select('#'+ config.rootDivId +' #vis')
707
+ .append('svg')
708
+ .attr("width", "100%")
709
+ .attr("height", height + "px");
710
+
711
+ // Display tokens on left and right side of visualization
712
+ renderText(svg, leftText, true, layerAttention, 0);
713
+ renderText(svg, rightText, false, layerAttention, MATRIX_WIDTH + BOXWIDTH);
714
+
715
+ // Render attention arcs
716
+ renderAttention(svg, layerAttention);
717
+
718
+ // Draw squares at top of visualization, one for each head
719
+ drawCheckboxes(0, svg, layerAttention);
720
+ }
721
+
722
+ function renderText(svg, text, isLeft, attention, leftPos) {
723
+
724
+ const textContainer = svg.append("svg:g")
725
+ .attr("id", isLeft ? "left" : "right");
726
+
727
+ // Add attention highlights superimposed over words
728
+ textContainer.append("g")
729
+ .classed("attentionBoxes", true)
730
+ .selectAll("g")
731
+ .data(attention)
732
+ .enter()
733
+ .append("g")
734
+ .attr("head-index", (d, i) => i)
735
+ .selectAll("rect")
736
+ .data(d => isLeft ? d : transpose(d)) // if right text, transpose attention to get right-to-left weights
737
+ .enter()
738
+ .append("rect")
739
+ .attr("x", function () {
740
+ var headIndex = +this.parentNode.getAttribute("head-index");
741
+ return leftPos + boxOffsets(headIndex);
742
+ })
743
+ .attr("y", (+1) * BOXHEIGHT)
744
+ .attr("width", BOXWIDTH / activeHeads())
745
+ .attr("height", BOXHEIGHT)
746
+ .attr("fill", function () {
747
+ return headColors(+this.parentNode.getAttribute("head-index"))
748
+ })
749
+ .style("opacity", 0.0);
750
+
751
+ const tokenContainer = textContainer.append("g").selectAll("g")
752
+ .data(text)
753
+ .enter()
754
+ .append("g");
755
+
756
+ // Add gray background that appears when hovering over text
757
+ tokenContainer.append("rect")
758
+ .classed("background", true)
759
+ .style("opacity", 0.0)
760
+ .attr("fill", "lightgray")
761
+ .attr("x", leftPos)
762
+ .attr("y", (d, i) => TEXT_TOP + i * BOXHEIGHT)
763
+ .attr("width", BOXWIDTH)
764
+ .attr("height", BOXHEIGHT);
765
+
766
+ // Add token text
767
+ const textEl = tokenContainer.append("text")
768
+ .text(d => d)
769
+ .attr("font-size", TEXT_SIZE + "px")
770
+ .style("cursor", "default")
771
+ .style("-webkit-user-select", "none")
772
+ .attr("x", leftPos)
773
+ .attr("y", (d, i) => TEXT_TOP + i * BOXHEIGHT);
774
+
775
+ if (isLeft) {
776
+ textEl.style("text-anchor", "end")
777
+ .attr("dx", BOXWIDTH - 0.5 * TEXT_SIZE)
778
+ .attr("dy", TEXT_SIZE);
779
+ } else {
780
+ textEl.style("text-anchor", "start")
781
+ .attr("dx", +0.5 * TEXT_SIZE)
782
+ .attr("dy", TEXT_SIZE);
783
+ }
784
+
785
+ tokenContainer.on("mouseover", function (d, index) {
786
+
787
+ // Show gray background for moused-over token
788
+ textContainer.selectAll(".background")
789
+ .style("opacity", (d, i) => i === index ? 1.0 : 0.0)
790
+
791
+ // Reset visibility attribute for any previously highlighted attention arcs
792
+ svg.select("#attention")
793
+ .selectAll("line[visibility='visible']")
794
+ .attr("visibility", null)
795
+
796
+ // Hide group containing attention arcs
797
+ svg.select("#attention").attr("visibility", "hidden");
798
+
799
+ // Set to visible appropriate attention arcs to be highlighted
800
+ if (isLeft) {
801
+ svg.select("#attention").selectAll("line[left-token-index='" + index + "']").attr("visibility", "visible");
802
+ } else {
803
+ svg.select("#attention").selectAll("line[right-token-index='" + index + "']").attr("visibility", "visible");
804
+ }
805
+
806
+ // Update color boxes superimposed over tokens
807
+ const id = isLeft ? "right" : "left";
808
+ const leftPos = isLeft ? MATRIX_WIDTH + BOXWIDTH : 0;
809
+ svg.select("#" + id)
810
+ .selectAll(".attentionBoxes")
811
+ .selectAll("g")
812
+ .attr("head-index", (d, i) => i)
813
+ .selectAll("rect")
814
+ .attr("x", function () {
815
+ const headIndex = +this.parentNode.getAttribute("head-index");
816
+ return leftPos + boxOffsets(headIndex);
817
+ })
818
+ .attr("y", (d, i) => TEXT_TOP + i * BOXHEIGHT)
819
+ .attr("width", BOXWIDTH / activeHeads())
820
+ .attr("height", BOXHEIGHT)
821
+ .style("opacity", function (d) {
822
+ const headIndex = +this.parentNode.getAttribute("head-index");
823
+ if (config.headVis[headIndex])
824
+ if (d) {
825
+ return d[index];
826
+ } else {
827
+ return 0.0;
828
+ }
829
+ else
830
+ return 0.0;
831
+ });
832
+ });
833
+
834
+ textContainer.on("mouseleave", function () {
835
+
836
+ // Unhighlight selected token
837
+ d3.select(this).selectAll(".background")
838
+ .style("opacity", 0.0);
839
+
840
+ // Reset visibility attributes for previously selected lines
841
+ svg.select("#attention")
842
+ .selectAll("line[visibility='visible']")
843
+ .attr("visibility", null) ;
844
+ svg.select("#attention").attr("visibility", "visible");
845
+
846
+ // Reset highlights superimposed over tokens
847
+ svg.selectAll(".attentionBoxes")
848
+ .selectAll("g")
849
+ .selectAll("rect")
850
+ .style("opacity", 0.0);
851
+ });
852
+ }
853
+
854
+ function renderAttention(svg, attention) {
855
+
856
+ // Remove previous dom elements
857
+ svg.select("#attention").remove();
858
+
859
+ // Add new elements
860
+ svg.append("g")
861
+ .attr("id", "attention") // Container for all attention arcs
862
+ .selectAll(".headAttention")
863
+ .data(attention)
864
+ .enter()
865
+ .append("g")
866
+ .classed("headAttention", true) // Group attention arcs by head
867
+ .attr("head-index", (d, i) => i)
868
+ .selectAll(".tokenAttention")
869
+ .data(d => d)
870
+ .enter()
871
+ .append("g")
872
+ .classed("tokenAttention", true) // Group attention arcs by left token
873
+ .attr("left-token-index", (d, i) => i)
874
+ .selectAll("line")
875
+ .data(d => d)
876
+ .enter()
877
+ .append("line")
878
+ .attr("x1", BOXWIDTH)
879
+ .attr("y1", function () {
880
+ const leftTokenIndex = +this.parentNode.getAttribute("left-token-index")
881
+ return TEXT_TOP + leftTokenIndex * BOXHEIGHT + (BOXHEIGHT / 2)
882
+ })
883
+ .attr("x2", BOXWIDTH + MATRIX_WIDTH)
884
+ .attr("y2", (d, rightTokenIndex) => TEXT_TOP + rightTokenIndex * BOXHEIGHT + (BOXHEIGHT / 2))
885
+ .attr("stroke-width", 2)
886
+ .attr("stroke", function () {
887
+ const headIndex = +this.parentNode.parentNode.getAttribute("head-index");
888
+ return headColors(headIndex)
889
+ })
890
+ .attr("left-token-index", function () {
891
+ return +this.parentNode.getAttribute("left-token-index")
892
+ })
893
+ .attr("right-token-index", (d, i) => i)
894
+ ;
895
+ updateAttention(svg)
896
+ }
897
+
898
+ function updateAttention(svg) {
899
+ svg.select("#attention")
900
+ .selectAll("line")
901
+ .attr("stroke-opacity", function (d) {
902
+ const headIndex = +this.parentNode.parentNode.getAttribute("head-index");
903
+ // If head is selected
904
+ if (config.headVis[headIndex]) {
905
+ // Set opacity to attention weight divided by number of active heads
906
+ return d / activeHeads()
907
+ } else {
908
+ return 0.0;
909
+ }
910
+ })
911
+ }
912
+
913
+ function boxOffsets(i) {
914
+ const numHeadsAbove = config.headVis.reduce(
915
+ function (acc, val, cur) {
916
+ return val && cur < i ? acc + 1 : acc;
917
+ }, 0);
918
+ return numHeadsAbove * (BOXWIDTH / activeHeads());
919
+ }
920
+
921
+ function activeHeads() {
922
+ return config.headVis.reduce(function (acc, val) {
923
+ return val ? acc + 1 : acc;
924
+ }, 0);
925
+ }
926
+
927
+ function drawCheckboxes(top, svg) {
928
+ const checkboxContainer = svg.append("g");
929
+ const checkbox = checkboxContainer.selectAll("rect")
930
+ .data(config.headVis)
931
+ .enter()
932
+ .append("rect")
933
+ .attr("fill", (d, i) => headColors(i))
934
+ .attr("x", (d, i) => i * CHECKBOX_SIZE)
935
+ .attr("y", top)
936
+ .attr("width", CHECKBOX_SIZE)
937
+ .attr("height", CHECKBOX_SIZE);
938
+
939
+ function updateCheckboxes() {
940
+ checkboxContainer.selectAll("rect")
941
+ .data(config.headVis)
942
+ .attr("fill", (d, i) => d ? headColors(i): lighten(headColors(i)));
943
+ }
944
+
945
+ updateCheckboxes();
946
+
947
+ checkbox.on("click", function (d, i) {
948
+ if (config.headVis[i] && activeHeads() === 1) return;
949
+ config.headVis[i] = !config.headVis[i];
950
+ updateCheckboxes();
951
+ updateAttention(svg);
952
+ });
953
+
954
+ checkbox.on("dblclick", function (d, i) {
955
+ // If we double click on the only active head then reset
956
+ if (config.headVis[i] && activeHeads() === 1) {
957
+ config.headVis = new Array(config.nHeads).fill(true);
958
+ } else {
959
+ config.headVis = new Array(config.nHeads).fill(false);
960
+ config.headVis[i] = true;
961
+ }
962
+ updateCheckboxes();
963
+ updateAttention(svg);
964
+ });
965
+ }
966
+
967
+ function lighten(color) {
968
+ const c = d3.hsl(color);
969
+ const increment = (1 - c.l) * 0.6;
970
+ c.l += increment;
971
+ c.s -= increment;
972
+ return c;
973
+ }
974
+
975
+ function transpose(mat) {
976
+ return mat[0].map(function (col, i) {
977
+ return mat.map(function (row) {
978
+ return row[i];
979
+ });
980
+ });
981
+ }
982
+
983
+ }
984
+
985
+
986
+
987
+
988
+
989
+ }
990
+
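testFn_out_json(data) above is the JavaScript entry point called from the Gradio app: it expects beam-search nodes at data[1][0], next-token probabilities at data[1][1], tokenization HTML at data[1][2] and data[1][3], embedding projections at data[2], and three bertviz attention payloads at data[3] through data[5], returning a value back to Gradio when done. A hedged Python sketch of a packing helper matching that shape (the name pack_payload and all field contents are illustrative, not the app's real computation):

    # Illustrative only: shapes a payload the way testFn_out_json(data) indexes it.
    def pack_payload(text, beam_tree, probs, html_in, html_out,
                     embeddings, att_enc, att_cross, att_dec):
        # embeddings is expected as
        # {"input"|"output": {"tokens"|"words": {"tnse": ..., "similar_queries": ...}}}
        return [
            text,                                   # data[0] (unused by plotsjs.js)
            [beam_tree, probs, html_in, html_out],  # data[1][0..3]
            embeddings,                             # data[2]
            att_enc,                                # data[3]: params for attViz()
            att_cross,                              # data[4]
            att_dec,                                # data[5]
        ]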
requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ bertviz
2
+ jupyter
3
+ scikit-learn
4
+ faiss-cpu