Gabriela Nicole Gonzalez Saez committed
Commit c2ad8fd • 1 Parent(s): 0b60561
init
Browse files
- .gitattributes +1 -0
- app.py +900 -0
- bertviz_gradio.py +251 -0
- challenge_sets.csv +109 -0
- index/en-es_input_tokens.index +0 -0
- index/en-es_input_words.index +0 -0
- index/en-es_metadata_ref.pkl +3 -0
- index/en-es_output_tokens.index +0 -0
- index/en-es_output_words.index +0 -0
- plotsjs.js +990 -0
- requirements.txt +4 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+index/en-es_metadata_ref.pkl filter=lfs diff=lfs merge=lfs -text
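The added line keeps the pickled FAISS metadata in Git LFS, matching the existing patterns above; entries of this form are typically generated with `git lfs track "index/en-es_metadata_ref.pkl"`.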
app.py
ADDED
@@ -0,0 +1,900 @@
import gradio as gr
from time import time

from bertviz import model_view, head_view
from bertviz_gradio import head_view_mod

import faiss
import torch
import os
# import nltk
import argparse
import random
import numpy as np
import pandas as pd

from argparse import Namespace
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from functools import partial

from transformers import AutoTokenizer, MarianTokenizer, AutoModel, AutoModelForSeq2SeqLM, MarianMTModel

model_es = "Helsinki-NLP/opus-mt-en-es"
model_fr = "Helsinki-NLP/opus-mt-en-fr"
model_zh = "Helsinki-NLP/opus-mt-en-zh"
model_sw = "Helsinki-NLP/opus-mt-en-sw"

tokenizer_es = AutoTokenizer.from_pretrained(model_es)
tokenizer_fr = AutoTokenizer.from_pretrained(model_fr)
tokenizer_zh = AutoTokenizer.from_pretrained(model_zh)
tokenizer_sw = AutoTokenizer.from_pretrained(model_sw)

model_tr_es = MarianMTModel.from_pretrained(model_es)
model_tr_fr = MarianMTModel.from_pretrained(model_fr)
model_tr_zh = MarianMTModel.from_pretrained(model_zh)
model_tr_sw = MarianMTModel.from_pretrained(model_sw)

from faiss import write_index, read_index
import pickle


def load_index(model):
    with open('index/' + model + '_metadata_ref.pkl', 'rb') as f:
        loaded_dict = pickle.load(f)
    for type in ['tokens', 'words']:
        for kind in ['input', 'output']:
            ## load the prebuilt index file for this (kind, type) pair
            name = 'index/' + model + "_" + kind + "_" + type + ".index"
            loaded_dict[kind][type][1] = read_index(name)
            # write_index(metadata_all[kind][type][1], name)
    return loaded_dict
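
# Sketch (assumes the index files shipped in this commit are on disk): for
# 'en-es', load_index reads index/en-es_metadata_ref.pkl plus the four FAISS
# indices index/en-es_{input,output}_{tokens,words}.index, and places each
# loaded faiss index in slot [1] of the corresponding metadata entry, e.g.:
#   metadata = load_index('en-es')
#   tokens_index = metadata['input']['tokens'][1]  # faiss index over source tokens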

dict_models = {
    'en-es': model_es,
    'en-fr': model_fr,
    'en-zh': model_zh,
    'en-sw': model_sw,
}

dict_models_tr = {
    'en-es': model_tr_es,
    'en-fr': model_tr_fr,
    'en-zh': model_tr_zh,
    'en-sw': model_tr_sw,
}

dict_tokenizer_tr = {
    'en-es': tokenizer_es,
    'en-fr': tokenizer_fr,
    'en-zh': tokenizer_zh,
    'en-sw': tokenizer_sw,
}

dict_reference_faiss = {
    'en-es': load_index('en-es'),
}

saliency_examples = [
    "Peace of Mind: Protection for consumers.",
    "The sustainable development goals report: towards a rescue plan for people and planet",
    "We will leave no stone unturned to hold those responsible to account.",
    "The clock is now ticking on our work to finalise the remaining key legislative proposals presented by this Commission to ensure that citizens and businesses can reap the benefits of our policy actions.",
    "Pumpkins, squash and gourds, fresh or chilled, excluding courgettes",
    "The labour market participation of mothers with infants has even deteriorated over the past two decades, often impacting their career and incomes for years.",
]

contrastive_examples = [
    ["Peace of Mind: Protection for consumers.",
     "Paz mental: protección de los consumidores",
     "Paz de la mente: protección de los consumidores"],
    ["the slaughterer has finished his work.",
     "l'abatteur a terminé son travail.",
     "l'abatteuse a terminé son travail."],
    ["A fundamental shift is needed - in commitment, solidarity, financing and action - to put the world on a better path.",
     "需要在承诺、团结、筹资和行动方面进行根本转变,使世界走上更美好的道路。",
     "我们需要从根本上转变承诺、团结、资助和行动,使世界走上更美好的道路。"],
]


# Load challenge set examples
df_challenge_set = pd.read_csv("challenge_sets.csv")
arr_challenge_set = df_challenge_set.values
arr_challenge_set = [[x[2], x[3], x[4], x[5]] for x in arr_challenge_set]


def get_k_prob_tokens(transition_scores, result, model, k_values=5):
    tokenizer_tr = dict_tokenizer_tr[model]
    gen_sequences = result.sequences[:, 1:]

    result_output = []

    # First beam only...
    bs = 0
    for tok, score, i_step in zip(gen_sequences[bs], transition_scores[bs], range(len(gen_sequences[bs]))):
        beam_i = result.beam_indices[0][i_step]
        if beam_i < 0:
            beam_i = bs
        bs_alt = [tokenizer_tr.decode(tok) for tok in result.scores[i_step][beam_i].topk(k_values).indices]
        bs_alt_scores = np.exp(result.scores[i_step][beam_i].topk(k_values).values)
        result_output.append([np.array(result.scores[i_step][beam_i].topk(k_values).indices), np.array(bs_alt_scores), bs_alt])

    return result_output
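
# Sketch: each entry of result_output is [top-k token ids, top-k probabilities,
# top-k decoded strings] for one decoding step of the first beam, so
# result_output[t][2][0] is the most probable token at step t and
# result_output[t][1] holds its probabilities (exp of the log-scores).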


def split_token_from_sequences(sequences, model) -> dict:
    n_sentences = len(sequences)

    gen_sequences_texts = []
    for bs in range(n_sentences):
        # gen_sequences_texts.append(dict_tokenizer_tr[model].decode(sequences[:, 1:][bs], skip_special_tokens=True).split(' '))
        #### decode per token.
        seq_bs = []
        for token in sequences[:, 1:][bs]:
            seq_bs.append(dict_tokenizer_tr[model].decode(token, skip_special_tokens=True))
        gen_sequences_texts.append(seq_bs)

    score = 0
    # the root of the tree is bos
    text = 'bos'
    new_id = text + '--1'
    dict_parent = [{'id': new_id, 'parentId': None, 'text': text, 'name': 'bos', 'prob': score}]
    id_dict_pos = {}
    step_i = 0
    cont = True
    words_by_step = []

    while cont:
        # append to dict_parent for all beams of step_i
        cont = False
        step_words = []
        for beam in range(n_sentences):
            app_text = '<empty_word>'
            if step_i < len(gen_sequences_texts[beam]):
                app_text = gen_sequences_texts[beam][step_i]
                cont = True
            step_words.append(app_text)
        words_by_step.append(step_words)
        print(words_by_step)

        for i_bs, step_w in enumerate(step_words):
            if not step_w in ['<empty_word>', '<pad>']:
                # node id = the full word path up to this step;
                # parent id = the path up to the previous step.
                new_id = "-".join([str(words_by_step[i][i_bs]) + '-' + str(i) for i in range(step_i + 1)])
                parent_id = "-".join([words_by_step[i][i_bs] + '-' + str(i) for i in range(step_i)])
                next_word_flag = 1
                if step_i == 0:
                    parent_id = 'bos--1'
                else:
                    next_word_flag = len(gen_sequences_texts[i_bs][step_i]) > step_i  ## not used when step_i = 0
                # both branches below are currently identical: add the node once per unique id
                if next_word_flag:
                    if not (new_id in id_dict_pos):
                        dict_parent.append({'id': new_id, 'parentId': parent_id, 'text': step_w, 'name': step_w, 'prob': score})
                        id_dict_pos[new_id] = len(dict_parent) - 1
                else:
                    if not (new_id in id_dict_pos):
                        dict_parent.append({'id': new_id, 'parentId': parent_id, 'text': step_w, 'name': step_w, 'prob': score})
                        id_dict_pos[new_id] = len(dict_parent) - 1

        step_i += 1
    return dict_parent
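
# Sketch: dict_parent is a flat list of tree rows, one per unique beam prefix,
# e.g. (hypothetical tokens) {'id': 'The-0-▁cat-1', 'parentId': 'The-0',
# 'text': '▁cat', 'name': '▁cat', 'prob': 0}, rooted at id 'bos--1';
# presumably plotsjs.js renders this as the beam-search tree.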


## Tokenization
def compute_tokenization(inputs, targets, w1, model):
    colors = ['tok-first-color', 'tok-second-color', 'tok-third-color', 'tok-fourth-color']
    len_colors = len(colors)
    inputs = inputs.input_ids
    html_tokens = ""
    i = 0
    for sentence in inputs:
        html_tokens += "<p>"
        tokens = [dict_tokenizer_tr[model].decode(tok) for tok in sentence]
        for token in tokens:
            token = token.replace("<", "<'")  # escape tags so special tokens render as text
            html_tokens += "<span class='" + colors[i % len_colors] + "'>" + token + " </span>"
            i += 1
        html_tokens += "</p>"
    i = 0
    html_tokens_tgt = ""
    html_tokens_tgt += "<p>"
    tokens = [dict_tokenizer_tr[model].decode(tok) for tok in targets]
    for token in tokens:
        token = token.replace("<", "<'")
        html_tokens_tgt += "<span class='" + colors[i % len_colors] + "'>" + token + " </span>"
        i += 1
    html_tokens_tgt += "</p>"
    return html_tokens, html_tokens_tgt


def create_vocab_multiple(embeddings_list, model):
    """Build a token vocabulary with counts and embeddings.

    Args:
        embeddings_list (list): per-sentence dicts with 'tokens' and 'embeddings'

    Returns:
        Dict: vocabulary of tokens' embeddings, plus the per-sentence token lists
    """
    print("START VOCAB CREATION MULTIPLE \n \n ")
    vocab = {}  ## add embedds.
    sentence_tokens_text_list = []
    for embeddings in embeddings_list:
        tokens_id = embeddings['tokens']  # [[tokens_id] x n_sentences]
        for sent_i, sentence in enumerate(tokens_id):
            sentence_tokens = []
            for tok_i, token in enumerate(sentence):
                sentence_tokens.append(token)
                if not (token in vocab):
                    vocab[token] = {
                        'token': token,
                        'count': 1,
                        'text': dict_tokenizer_tr[model].decode([token]),
                        'embed': embeddings['embeddings'][sent_i][tok_i]}
                else:
                    vocab[token]['count'] = vocab[token]['count'] + 1
            sentence_tokens_text_list.append(sentence_tokens)
    print("END VOCAB CREATION MULTIPLE \n \n ")
    return vocab, sentence_tokens_text_list

def vocab_words_all_prefix(token_embeddings, model, sufix="@@", prefix='▁'):
    vocab = {}
    sentence_words_text_list = []
    if prefix:
        n_prefix = len(prefix)
    for input_sentences in token_embeddings:
        for sent_i, sentence in enumerate(input_sentences['tokens']):
            words_text_list = []
            word = ''
            tokens_ids = []
            embeddings = []
            ids_to_tokens = dict_tokenizer_tr[model].convert_ids_to_tokens(sentence)

            to_save = False
            for tok_i, token_text in enumerate(ids_to_tokens):
                token_id = sentence[tok_i]
                if token_text[:n_prefix] == prefix:
                    # first we save the previous word
                    if to_save:
                        vocab[word] = {
                            'word': word,
                            'text': word,
                            'count': 1,
                            'tokens_ids': tokens_ids,
                            'embed': np.mean(np.array(embeddings), 0).tolist()
                        }
                        words_text_list.append(word)
                    # a new word starts at the prefix marker
                    tokens_ids = [token_id]
                    embeddings = [input_sentences['embeddings'][sent_i][tok_i]]
                    word = token_text[n_prefix:]
                    to_save = True
                else:
                    if token_text in dict_tokenizer_tr[model].special_tokens_map.values():
                        if to_save:
                            vocab[word] = {
                                'word': word,
                                'text': word,
                                'count': 1,
                                'tokens_ids': tokens_ids,
                                'embed': np.mean(np.array(embeddings), 0).tolist()
                            }
                            words_text_list.append(word)
                        # a special token is a single-token word, with no continuation
                        tokens_ids = [token_id]
                        embeddings = [input_sentences['embeddings'][sent_i][tok_i]]
                        vocab[token_text] = {
                            'word': token_text,
                            'count': 1,
                            'text': word,
                            'tokens_ids': tokens_ids,
                            'embed': np.mean(np.array(embeddings), 0).tolist()
                        }
                        words_text_list.append(token_text)
                        to_save = False
                    else:
                        # a continuation; we do not know yet if it is final, so do not save here
                        to_save = True
                        word += token_text
                        tokens_ids.append(token_id)
                        embeddings.append(input_sentences['embeddings'][sent_i][tok_i])
            if to_save:
                # save the last word of the sentence
                if not (word in vocab):
                    vocab[word] = {
                        'word': word,
                        'count': 1,
                        'text': word,
                        'tokens_ids': tokens_ids,
                        'embed': np.mean(np.array(embeddings), 0).tolist()
                    }
                    words_text_list.append(word)
                else:
                    vocab[word]['count'] = vocab[word]['count'] + 1
            sentence_words_text_list.append(words_text_list)

    return vocab, sentence_words_text_list

def search_query_vocab(index, vocab_queries, topk=10, limited_search=[]):
    """Query a FAISS index with a vocabulary of words (embds_input_voc format).

    Args:
        index: faiss index
        vocab_queries: word vocabulary, one entry per word:
            {'word': word, 'count': 1, 'text': word,
             'tokens_ids': tokens_ids, 'embed': mean_embedding}
        topk (int, optional): nb of similar entries to retrieve. Defaults to 10.

    Returns:
        Distance matrix D, indices matrix I, and metadata mapping each query
        position back to its word, token ids and text.
    """
    nb_q_embds = []  ## ordered embeddings list
    metadata = {}
    qi_pos = 0
    for key, token_values in vocab_queries.items():
        metadata[qi_pos] = {'word': token_values['word'], 'tokens': token_values['tokens_ids'], 'text': token_values['text']}
        qi_pos += 1
        nb_q_embds.append(token_values['embed'])

    xq = np.array(nb_q_embds).astype('float32')  # elements to query

    D, I = index.search(xq, topk)

    return D, I, metadata


def search_query_vocab_token(index, vocab_queries, topk=10, limited_search=[]):
    """Same as search_query_vocab, but for a token vocabulary.

    Returns:
        Distance matrix D, indices matrix I, and metadata mapping each query
        position back to its token id and text.
    """
    nb_q_embds = []  ## ordered embeddings list
    metadata = {}
    qi_pos = 0
    for key, token_values in vocab_queries.items():
        metadata[qi_pos] = {'token': token_values['token'], 'text': token_values['text']}
        qi_pos += 1
        nb_q_embds.append(token_values['embed'])

    xq = np.array(nb_q_embds).astype('float32')  # elements to query

    D, I = index.search(xq, topk)

    return D, I, metadata


def build_search(query_embeddings, model, type="input"):
    metadata_all = dict_reference_faiss[model]

    ## build vocabularies for the queries
    vocab_queries, sentence_tokens_list = create_vocab_multiple(query_embeddings, model)
    words_vocab_queries, sentence_words_list = vocab_words_all_prefix(query_embeddings, model, sufix="@@", prefix="▁")

    index_vor_tokens = metadata_all[type]['tokens'][1]
    md_tokens = metadata_all[type]['tokens'][2]
    D, I, meta = search_query_vocab_token(index_vor_tokens, vocab_queries)

    qi_pos = 0
    similar_tokens = {}
    for dist, ind in zip(D, I):
        try:
            similar_tokens[str(meta[qi_pos]['token'])] = {
                'token': meta[qi_pos]['token'],
                'text': meta[qi_pos]['text'],
                "similar_topk": [md_tokens[i_index]['token'] for i_index in ind if (i_index != -1)],
                "distance": [dist[i] for (i, i_index) in enumerate(ind) if (i_index != -1)],
            }
        except:
            print("\n ERROR ", qi_pos, dist, ind)
        qi_pos += 1

    index_vor_words = metadata_all[type]['words'][1]
    md_words = metadata_all[type]['words'][2]

    Dw, Iw, metaw = search_query_vocab(index_vor_words, words_vocab_queries)
    qi_pos = 0
    similar_words = {}
    for dist, ind in zip(Dw, Iw):
        try:
            similar_words[str(metaw[qi_pos]['word'])] = {
                'word': metaw[qi_pos]['word'],
                'text': metaw[qi_pos]['word'],
                "similar_topk": [md_words[i_index]['word'] for i_index in ind if (i_index != -1)],
                "distance": [dist[i] for (i, i_index) in enumerate(ind) if (i_index != -1)],
            }
        except:
            print("\n ERROR ", qi_pos, dist, ind)
        qi_pos += 1

    return {'tokens': {'D': D, 'I': I, 'meta': meta, 'vocab_queries': vocab_queries, 'similar': similar_tokens, 'sentence_key_list': sentence_tokens_list},
            'words': {'D': Dw, 'I': Iw, 'meta': metaw, 'vocab_queries': words_vocab_queries, 'sentence_key_list': sentence_words_list, 'similar': similar_words}}
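
# Sketch: faiss index.search returns D (distances; squared L2 for a flat L2
# index) and I (neighbor ids), both of shape (n_queries, topk); ids of -1 mark
# missing neighbors, which is why the loops above skip i_index == -1.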

from sklearn.manifold import TSNE


def embds_input_projection_vocab(vocab, key="token"):
    t0 = time()

    nb_ids = []  ## ordered ids list
    nb_embds = []  ## ordered embeddings list
    nb_text = []  ## ordered texts list
    tsne_error = []
    for _, token_values in vocab.items():
        tsne_error.append([0, 0])
        nb_ids.append(token_values[key])
        nb_text.append(token_values['text'])
        nb_embds.append(token_values['embed'])

    X = np.array(nb_embds).astype('float32')  # elements to project
    try:
        tsne = TSNE(random_state=0, n_iter=1000)
        tsne_results = tsne.fit_transform(X)
        tsne_results = np.c_[tsne_results, nb_ids, nb_text, range(len(nb_ids))]  ## zip array: [[tsne_x, tsne_y, id, text, position], ...]
    except:
        tsne_results = np.c_[tsne_error, nb_ids, nb_text, range(len(nb_ids))]  ## fall back to zeroed coordinates

    t1 = time()
    print("t-SNE: %.2g sec" % (t1 - t0))

    return tsne_results.tolist()


def filtered_projection(similar_key, vocab, model, type="input", key="word"):
    metadata_all = dict_reference_faiss[model]
    vocab_proj = vocab.copy()
    ## t-SNE projection of the query vocabulary plus its retrieved neighbors
    source_words_voc_similar = set()
    for key_i in similar_key:
        words_set = similar_key[key_i]
        source_words_voc_similar.update(words_set['similar_topk'])

    source_embeddings_filtered = {key_value: metadata_all[type][key][0][key_value] for key_value in source_words_voc_similar}
    vocab_proj.update(source_embeddings_filtered)
    try:
        result_TSNE = embds_input_projection_vocab(vocab_proj, key=key[:-1])  ## singular => without 's'
        dict_projected_embds_all = {str(embds[2]): [embds[0], embds[1], embds[2], embds[3], embds[4]] for embds in result_TSNE}
    except:
        print('TSNE error', type, key)
        dict_projected_embds_all = {}

    return dict_projected_embds_all

def get_bertvis_data(input_text, lg_model):
    tokenizer_tr = dict_tokenizer_tr[lg_model]
    model_tr = dict_models_tr[lg_model]

    input_ids = tokenizer_tr(input_text, return_tensors="pt", padding=False)
    result_att = model_tr.generate(**input_ids,
                                   num_beams=4,
                                   num_return_sequences=4,
                                   return_dict_in_generate=True,
                                   output_attentions=True,
                                   output_scores=True,
                                   )

    tgt_text = tokenizer_tr.decode(result_att.sequences[0], skip_special_tokens=True)

    outputs = model_tr(input_ids=input_ids.input_ids,
                       decoder_input_ids=result_att.sequences[:1],
                       output_attentions=True,
                       )
    html_attentions = head_view_mod(
        encoder_attention=outputs.encoder_attentions,
        cross_attention=outputs.cross_attentions,
        decoder_attention=outputs.decoder_attentions,
        encoder_tokens=tokenizer_tr.convert_ids_to_tokens(input_ids.input_ids[0]),
        decoder_tokens=tokenizer_tr.convert_ids_to_tokens(result_att.sequences[0]),
        html_action='gradio'
    )
    return html_attentions, tgt_text, result_att, outputs


def translation_model(w1, model):
    # translate and collect internal values and visualizations
    inputs = dict_tokenizer_tr[model](w1, return_tensors="pt", padding=True)

    num_ret_seq = 4
    translated = dict_models_tr[model].generate(**inputs,
                                                num_beams=4,
                                                num_return_sequences=num_ret_seq,
                                                return_dict_in_generate=True,
                                                output_attentions=True,
                                                output_hidden_states=True,
                                                output_scores=True,)

    beam_dict = split_token_from_sequences(translated.sequences, model)

    tgt_text = dict_tokenizer_tr[model].decode(translated.sequences[0], skip_special_tokens=True)
    ## Attentions
    outputs = dict_models_tr[model](input_ids=inputs.input_ids,
                                    decoder_input_ids=translated.sequences[:1],
                                    output_attentions=True,
                                    )
    encoder_tokens = dict_tokenizer_tr[model].convert_ids_to_tokens(inputs.input_ids[0])
    decoder_tokens = dict_tokenizer_tr[model].convert_ids_to_tokens(translated.sequences[0])

    html_attentions_enc = head_view_mod(
        encoder_attention=outputs.encoder_attentions,
        encoder_tokens=encoder_tokens,
        decoder_tokens=decoder_tokens,
        html_action='gradio'
    )

    html_attentions_dec = head_view_mod(
        decoder_attention=outputs.decoder_attentions,
        encoder_tokens=encoder_tokens,
        decoder_tokens=decoder_tokens,
        html_action='gradio'
    )

    html_attentions_cross = head_view_mod(
        cross_attention=outputs.cross_attentions,
        encoder_tokens=encoder_tokens,
        decoder_tokens=decoder_tokens,
        html_action='gradio'
    )

    # tokenization
    html_in, html_out = compute_tokenization(inputs, translated.sequences[0], w1, model)

    transition_scores = dict_models_tr[model].compute_transition_scores(
        translated.sequences, translated.scores, translated.beam_indices, normalize_logits=True
    )
    prob_tokens = get_k_prob_tokens(transition_scores, translated, model, k_values=10)

    input_embeddings = dict_models_tr[model].get_encoder().embed_tokens(inputs.input_ids)
    target_embeddings = dict_models_tr[model].get_decoder().embed_tokens(translated.sequences)

    return [tgt_text,
            [beam_dict, prob_tokens, html_in, html_out, translated, inputs.input_ids, input_embeddings, target_embeddings],
            [html_attentions_enc['params'], html_attentions_enc['html2'].data],
            [html_attentions_dec['params'], html_attentions_dec['html2'].data],
            [html_attentions_cross['params'], html_attentions_cross['html2'].data]]


html = """
<html>
<script async src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>

<style>
.tok-first-color {
    background: #e0ffcd;
}

.tok-second-color {
    background: #fdffcd;
}

.tok-third-color {
    background: #ffebbb;
}

.tok-fourth-color {
    background: #ffcab0;
}
</style>
<body>

<p id="demo"></p>
<p id="viz"></p>

<p id="demo2"></p>
<h4> Exploring top-k probable tokens </h4>
<div id="d3_text_grid">... top 10 tokens generated at each step ...</div>

<h4> Exploring the Beam Search sequence generation</h4>
<div id="d3_beam_search">... top 4 generated sequences using Beam Search ...</div>

</body>
</html>
"""

html_tok = """
<div id="d3_tok">... tokenization visualization ...</div>
"""

html_embd = """
<div id="d3_embd">... token embeddings visualization ...</div>
<div id="select_div">
    <select id="select_type" class="form-select" aria-label="select example" hidden>
        <option selected value="words">Words</option>
        <option value="tokens">Tokens</option>
    </select>
</div>
<div class="row">
    <div class="col-9">
        <div id="d3_graph_input_words" class="d3_graph words"></div>
    </div>
    <div class="col-3">
        <div id="similar_input_words" class=""></div>
    </div>
</div>
<div id="d3_graph_input_tokens" class="d3_graph tokens"></div>
<div id="similar_input_tokens" class=""></div>
"""

html_tok_target = """
<div id="d3_tok_target">... tokenization visualization ...</div>
"""

html_embd_target = """
<div id="d3_embd_target">... token embeddings visualization ...</div>
<div id="d3_graph_output_words" class="d3_graph words"></div>
<div id="d3_graph_output_tokens" class="d3_graph tokens"></div>
<div id="similar_output_words" class=""></div>
<div id="similar_output_tokens" class=""></div>
"""

html_att_enc = """
<div id="d3_att_enc">... Encoder self attention only -- last layer and mean across heads ... Always read from left to right</div>
<div id="bertviz_enc"></div>
"""

html_att_cross = """
<div id="d3_att_cross">... Encoder-decoder cross attention only -- last layer and mean across heads ...</div>
"""

html_att_dec = """
<div id="d3_att_dec">... decoder self attention only -- last layer and mean across heads ...</div>
"""


def sentence_maker2(w1, j2):
    print(w1, j2)
    return "in sentence22..."


def first_function(w1, model):
    global metadata_all
    # translate and get internal values
    sentences = w1.split("\n")
    all_sentences = []
    translated_text = ''
    input_embeddings = []
    output_embeddings = []
    for sentence in sentences:
        params = translation_model(sentence, model)
        all_sentences.append(params)
        translated_text += params[0] + ' \n'
        input_embeddings.append({
            'embeddings': params[1][6].detach(),  ## build a vocabulary from this set of embeddings
            'tokens': params[1][5].tolist(),  # one translation = one sentence
        })
        output_embeddings.append({
            'embeddings': params[1][7].detach(),
            'tokens': params[1][4].sequences.tolist(),
        })

    ## Search the FAISS indices preloaded for this model
    ## (dict_reference_faiss[model] holds the reference metadata per language)
    result_search = {}
    result_search['input'] = build_search(input_embeddings, model, type='input')
    result_search['output'] = build_search(output_embeddings, model, type='output')

    json_out = {'input': {'tokens': {}, 'words': {}}, 'output': {'tokens': {}, 'words': {}}}
    dict_projected = {}
    for type in ['input', 'output']:
        dict_projected[type] = {}
        for key in ['tokens', 'words']:
            similar_key = result_search[type][key]['similar']
            vocab = result_search[type][key]['vocab_queries']
            dict_projected[type][key] = filtered_projection(similar_key, vocab, model, type=type, key=key)
            json_out[type][key]['similar_queries'] = similar_key
            json_out[type][key]['tnse'] = dict_projected[type][key]
            json_out[type][key]['key_text_list'] = result_search[type][key]['sentence_key_list']

    ## bertviz payloads (params + rendered html) from the last translated sentence
    html_att_enc = params[2][1]
    html_att_dec = params[3][1]
    html_att_cross = params[4][1]

    params = [params[0], params[1], json_out, params[2][0], params[3][0], params[4][0]]

    return [translated_text, params, html_att_enc, html_att_dec, html_att_cross]


def second_function(w1, j2):
    # the data is transferred to the JS side as json
    print("second_function -- after the js", w1, j2)
    return "transition to second js function finished."


with gr.Blocks(js="plotsjs.js") as demo:
    gr.Markdown(
        """
        # MAKE NMT Workshop \t `Literacy task`
        """)

    gr.Markdown(
        """
        ### Translation
        """)

    gr.Markdown(
        """
        1. Select the language pair for the translation
        """)
    radio_c = gr.Radio(choices=['en-zh', 'en-es', 'en-fr', 'en-sw'], value="en-es", label='', container=False)
    gr.Markdown(
        """
        2. Source text to translate
        """)
    in_text = gr.Textbox(label="source text")
    with gr.Accordion("Optional: Challenge selection:", open=False):
        gr.Markdown(
            """
            ### Select an example from the challenge set listed below
            """)
        challenge_ex = gr.Textbox(label="Challenge", interactive=False)
        category_minor = gr.Textbox(label="category_minor", interactive=False)
        category_major = gr.Textbox(label="category_major", interactive=False)

    with gr.Accordion("Examples:"):
        gr.Examples(arr_challenge_set, [in_text, challenge_ex, category_minor, category_major], label="")

    btn = gr.Button("Translate")

    with gr.Accordion("3. Review the source tokenization:", open=False):
        input_tokenisation = gr.HTML(html_tok)

    with gr.Accordion("4. Review similar source tokens in the embedding space:", open=False):
        input_embd = gr.HTML(html_embd)

    with gr.Accordion("5. Review the attention between the source tokens:", open=False):
        gr.Markdown(
            """
            `Bertviz`
            """)
        input_embd = gr.HTML(html_att_enc)
        enc_html = gr.HTML()

    gr.Markdown(
        """
        ### Text is translated into Target Language
        """)
    out_text = gr.Textbox(label="target text")

    with gr.Accordion("1. Review the target tokenization:", open=False):
        target_tokenisation = gr.HTML(html_tok_target)

    with gr.Accordion("2. Review similar target tokens in the embedding space:", open=False):
        target_embd = gr.HTML(html_embd_target)

    with gr.Accordion("3. Review the attention between the target and source tokens:", open=False):
        gr.Markdown(
            """
            `Bertviz - cross attention`
            """)
        input_embd = gr.HTML(html_att_cross)
        cross_html = gr.HTML()

    with gr.Accordion("4. Review the attention between the target tokens:", open=False):
        gr.Markdown(
            """
            `Bertviz - decoder attention`
            """)
        input_embd = gr.HTML(html_att_dec)
        dec_html = gr.HTML()

    with gr.Accordion("6. Review the alternative translation tokens:", open=False):
        gr.Markdown(
            """
            Generation process: `top-k - beam search`
            """)
        input_mic = gr.HTML(html)

    out_text2 = gr.Textbox(visible=False)
    var2 = gr.JSON(visible=False)

    btn.click(first_function, [in_text, radio_c], [out_text, var2, enc_html, dec_html, cross_html], js="(in_text,radio_c) => testFn_out(in_text,radio_c)")  # should return an output comp.
    out_text.change(second_function, [out_text, var2], out_text2, js="(out_text,var2) => testFn_out_json(var2)")

    # run script function on load:
    # demo.load(None, None, None, js="plotsjs.js")

if __name__ == "__main__":
    demo.launch()
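
For orientation, a minimal sketch of how the pieces above compose (assuming the Marian models download and the en-es FAISS index files from this commit are present; all names are defined in app.py itself):

    tgt_text, internals, enc_view, dec_view, cross_view = translation_model(
        saliency_examples[0], 'en-es')
    print(tgt_text)                                      # best beam, decoded
    beam_dict, prob_tokens = internals[0], internals[1]  # beam tree + top-k per step

first_function wraps this per input line and adds the FAISS/t-SNE payload (json_out) that plotsjs.js renders in the browser.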
bertviz_gradio.py
ADDED
@@ -0,0 +1,251 @@
import json
import os
import uuid

from IPython.core.display import display, HTML, Javascript

from bertviz.util import format_special_chars, format_attention, num_layers

print("UP TO DATE")


def head_view_mod(
        attention=None,
        tokens=None,
        sentence_b_start=None,
        prettify_tokens=True,
        layer=None,
        heads=None,
        encoder_attention=None,
        decoder_attention=None,
        cross_attention=None,
        encoder_tokens=None,
        decoder_tokens=None,
        include_layers=None,
        html_action='view',
        patest="something"
):
    """Render head view

    Args:
        For self-attention models:
            attention: list of ``torch.FloatTensor`` (one for each layer) of shape
                ``(batch_size(must be 1), num_heads, sequence_length, sequence_length)``
            tokens: list of tokens
            sentence_b_start: index of first wordpiece in sentence B if input text is sentence pair (optional)
        For encoder-decoder models:
            encoder_attention: list of ``torch.FloatTensor`` (one for each layer) of shape
                ``(batch_size(must be 1), num_heads, encoder_sequence_length, encoder_sequence_length)``
            decoder_attention: list of ``torch.FloatTensor`` (one for each layer) of shape
                ``(batch_size(must be 1), num_heads, decoder_sequence_length, decoder_sequence_length)``
            cross_attention: list of ``torch.FloatTensor`` (one for each layer) of shape
                ``(batch_size(must be 1), num_heads, decoder_sequence_length, encoder_sequence_length)``
            encoder_tokens: list of tokens for encoder input
            decoder_tokens: list of tokens for decoder input
        For all models:
            prettify_tokens: indicates whether to remove special characters in wordpieces, e.g. Ġ
            layer: index (zero-based) of initial selected layer in visualization. Defaults to layer 0.
            heads: Indices (zero-based) of initial selected heads in visualization. Defaults to all heads.
            include_layers: Indices (zero-based) of layers to include in visualization. Defaults to all layers.
                Note: filtering layers may improve responsiveness of the visualization for long inputs.
            html_action: Specifies the action to be performed with the generated HTML object
                - 'view' (default): Displays the generated HTML representation as a notebook cell output
                - 'return': Returns an HTML object containing the generated view for further processing or custom visualization
                - 'gradio': Returns the HTML snippets and JS params as a dict, for embedding in a Gradio app
    """

    attn_data = []
    if attention is not None:
        if tokens is None:
            raise ValueError("'tokens' is required")
        if encoder_attention is not None or decoder_attention is not None or cross_attention is not None \
                or encoder_tokens is not None or decoder_tokens is not None:
            raise ValueError("If you specify 'attention' you may not specify any encoder-decoder arguments. This"
                             " argument is only for self-attention models.")
        if include_layers is None:
            include_layers = list(range(num_layers(attention)))
        attention = format_attention(attention, include_layers)
        if sentence_b_start is None:
            attn_data.append(
                {
                    'name': None,
                    'attn': attention.tolist(),
                    'left_text': tokens,
                    'right_text': tokens
                }
            )
        else:
            slice_a = slice(0, sentence_b_start)  # Positions corresponding to sentence A in input
            slice_b = slice(sentence_b_start, len(tokens))  # Positions corresponding to sentence B in input
            attn_data.append(
                {
                    'name': 'All',
                    'attn': attention.tolist(),
                    'left_text': tokens,
                    'right_text': tokens
                }
            )
            attn_data.append(
                {
                    'name': 'Sentence A -> Sentence A',
                    'attn': attention[:, :, slice_a, slice_a].tolist(),
                    'left_text': tokens[slice_a],
                    'right_text': tokens[slice_a]
                }
            )
            attn_data.append(
                {
                    'name': 'Sentence B -> Sentence B',
                    'attn': attention[:, :, slice_b, slice_b].tolist(),
                    'left_text': tokens[slice_b],
                    'right_text': tokens[slice_b]
                }
            )
            attn_data.append(
                {
                    'name': 'Sentence A -> Sentence B',
                    'attn': attention[:, :, slice_a, slice_b].tolist(),
                    'left_text': tokens[slice_a],
                    'right_text': tokens[slice_b]
                }
            )
            attn_data.append(
                {
                    'name': 'Sentence B -> Sentence A',
                    'attn': attention[:, :, slice_b, slice_a].tolist(),
                    'left_text': tokens[slice_b],
                    'right_text': tokens[slice_a]
                }
            )
    elif encoder_attention is not None or decoder_attention is not None or cross_attention is not None:
        if encoder_attention is not None:
            if encoder_tokens is None:
                raise ValueError("'encoder_tokens' required if 'encoder_attention' is not None")
            if include_layers is None:
                include_layers = list(range(num_layers(encoder_attention)))
            encoder_attention = format_attention(encoder_attention, include_layers)
            attn_data.append(
                {
                    'name': 'Encoder',
                    'attn': encoder_attention.tolist(),
                    'left_text': encoder_tokens,
                    'right_text': encoder_tokens
                }
            )
        if decoder_attention is not None:
            if decoder_tokens is None:
                raise ValueError("'decoder_tokens' required if 'decoder_attention' is not None")
            if include_layers is None:
                include_layers = list(range(num_layers(decoder_attention)))
            decoder_attention = format_attention(decoder_attention, include_layers)
            attn_data.append(
                {
                    'name': 'Decoder',
                    'attn': decoder_attention.tolist(),
                    'left_text': decoder_tokens,
                    'right_text': decoder_tokens
                }
            )
        if cross_attention is not None:
            if encoder_tokens is None:
                raise ValueError("'encoder_tokens' required if 'cross_attention' is not None")
            if decoder_tokens is None:
                raise ValueError("'decoder_tokens' required if 'cross_attention' is not None")
            if include_layers is None:
                include_layers = list(range(num_layers(cross_attention)))
            cross_attention = format_attention(cross_attention, include_layers)
            attn_data.append(
                {
                    'name': 'Cross',
                    'attn': cross_attention.tolist(),
                    'left_text': decoder_tokens,
                    'right_text': encoder_tokens
                }
            )
    else:
        raise ValueError("You must specify at least one attention argument.")

    if layer is not None and layer not in include_layers:
        raise ValueError(f"Layer {layer} is not in include_layers: {include_layers}")

    # Generate unique div id to enable multiple visualizations in one notebook
    vis_id = 'bertviz-%s' % (uuid.uuid4().hex)

    # Compose html
    if len(attn_data) > 1:
        options = '\n'.join(
            f'<option value="{i}">{attn_data[i]["name"]}</option>'
            for i, d in enumerate(attn_data)
        )
        select_html = f'Attention: <select id="filter">{options}</select>'
    else:
        select_html = ""
    vis_html = f"""
        <div id="{vis_id}" style="font-family:'Helvetica Neue', Helvetica, Arial, sans-serif;">
            <span style="user-select:none">
                Layer: <select id="layer"></select>
                {select_html}
            </span>
            <div id='vis'></div>
        </div>
    """

    for d in attn_data:
        attn_seq_len_left = len(d['attn'][0][0])
        if attn_seq_len_left != len(d['left_text']):
            raise ValueError(
                f"Attention has {attn_seq_len_left} positions, while number of tokens is {len(d['left_text'])} "
                f"for tokens: {' '.join(d['left_text'])}"
            )
        attn_seq_len_right = len(d['attn'][0][0][0])
        if attn_seq_len_right != len(d['right_text']):
            raise ValueError(
                f"Attention has {attn_seq_len_right} positions, while number of tokens is {len(d['right_text'])} "
                f"for tokens: {' '.join(d['right_text'])}"
            )
        if prettify_tokens:
            d['left_text'] = format_special_chars(d['left_text'])
            d['right_text'] = format_special_chars(d['right_text'])
    params = {
        'attention': attn_data,
        'default_filter': "0",
        'root_div_id': vis_id,
        'layer': layer,
        'heads': heads,
        'include_layers': include_layers,
        'test': 'test'
    }

    # require.js must be imported for Colab or JupyterLab:
    if html_action == 'gradio':
        html1 = HTML('<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>')
        html2 = HTML(vis_html)

        return {'html1': html1, 'html2': html2, 'params': params}

    if html_action == 'view':
        display(HTML('<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>'))
        display(HTML(vis_html))
        __location__ = os.path.realpath(
            os.path.join(os.getcwd(), os.path.dirname(__file__)))
        vis_js = open(os.path.join(__location__, 'head_view.js')).read().replace("PYTHON_PARAMS", json.dumps(params))
        display(Javascript(vis_js))

    elif html_action == 'return':
        html1 = HTML('<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>')
        html2 = HTML(vis_html)

        __location__ = os.path.realpath(
            os.path.join(os.getcwd(), os.path.dirname(__file__)))
        vis_js = open(os.path.join(__location__, 'head_view.js')).read().replace("PYTHON_PARAMS", json.dumps(params))
        html3 = Javascript(vis_js)
        script = '\n<script type="text/javascript">\n' + html3.data + '\n</script>\n'

        head_html = HTML(html1.data + html2.data + script)
        return head_html

    else:
        raise ValueError("'html_action' parameter must be 'view', 'return' or 'gradio'")
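
Compared to bertviz's stock head_view, the 'gradio' branch added above skips the JavaScript injection and hands back the raw pieces for app.py to assemble. A minimal sketch of consuming it (names as defined in this file; `outputs`, `enc_toks` and `dec_toks` stand for any Hugging Face encoder-decoder forward pass run with output_attentions=True and its token lists):

    out = head_view_mod(encoder_attention=outputs.encoder_attentions,
                        encoder_tokens=enc_toks, decoder_tokens=dec_toks,
                        html_action='gradio')
    out['html2'].data  # the <div> skeleton to place in a gr.HTML component
    out['params']      # the JSON-serializable payload that head_view.js expects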
challenge_sets.csv
ADDED
@@ -0,0 +1,109 @@
+name,Lang.,Source sentence,Challenge,category_minor,category_major,Interesting?
+Isabel challenge set,EN,The repeated calls from his mother [should] have alerted us.,Is subject-verb agreement correct? (Possible interference from distractors between the subject's head and the verb).,"S-V agreement, across distractors",Morpho-Syntactic,
+Isabel challenge set,EN,The sudden noise in the upper rooms [should] have alerted us.,Is subject-verb agreement correct? (Possible interference from distractors between the subject's head and the verb).,"S-V agreement, across distractors",Morpho-Syntactic,
+Isabel challenge set,EN,Their repeated failures to report the problem [should] have alerted us.,Is subject-verb agreement correct? (Possible interference from distractors between the subject's head and the verb).,"S-V agreement, across distractors",Morpho-Syntactic,
+Isabel challenge set,EN,She asked her brother not to be [arrogant].,Does the flagged adjective agree correctly with its subject? (Subject-control versus object-control verbs).,"S-V agreement, through control verbs",Morpho-Syntactic,
+Isabel challenge set,EN,She promised her brother not to be [arrogant].,Does the flagged adjective agree correctly with its subject? (Subject-control versus object-control verbs).,"S-V agreement, through control verbs",Morpho-Syntactic,
+Isabel challenge set,EN,She promised her doctor to remain [active] after retiring.,Does the flagged adjective agree correctly with its subject? (Subject-control versus object-control verbs).,"S-V agreement, through control verbs",Morpho-Syntactic,
+Isabel challenge set,EN,My mother promised my father to be more [prudent] on the road.,Does the flagged adjective agree correctly with its subject? (Subject-control versus object-control verbs).,"S-V agreement, through control verbs",Morpho-Syntactic,
+Isabel challenge set,EN,The woman was very [tall] and extremely [strong].,Do the marked verbs/adjectives agree correctly with their subject? (Agreement distribution over coordinated predicates),"S-V agreement, coordinated targets",Morpho-Syntactic,
+Isabel challenge set,EN,Their politicians were more [ignorant] than [stupid].,Do the marked verbs/adjectives agree correctly with their subject? (Agreement distribution over coordinated predicates),"S-V agreement, coordinated targets",Morpho-Syntactic,
+Isabel challenge set,EN,We [shouted] an insult and [left] abruptly.,Do the marked verbs/adjectives agree correctly with their subject? (Agreement distribution over coordinated predicates),"S-V agreement, coordinated targets",Morpho-Syntactic,
+Isabel challenge set,EN,The cat and the dog [should] be [watched].,Do the marked verbs/adjectives agree correctly with their subject? (Masculine singular ET masculine singular yields masculine plural).,"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
+Isabel challenge set,EN,My father and my brother [will] be [happy] tomorrow.,Do the marked verbs/adjectives agree correctly with their subject? (Masculine singular ET masculine singular yields masculine plural).,"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
+Isabel challenge set,EN,My book and my pencil [could] be [stolen].,Do the marked verbs/adjectives agree correctly with their subject? (Masculine singular ET masculine singular yields masculine plural).,"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
+Isabel challenge set,EN,The cow and the hen [must] be [fed].,Do the marked verbs/adjectives agree correctly with their subject? (Feminine singular ET feminine singular yields feminine plural).,"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
+Isabel challenge set,EN,My mother and my sister [will be happy] tomorrow.,Do the marked verbs/adjectives agree correctly with their subject? (Feminine singular ET feminine singular yields feminine plural).,"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
+Isabel challenge set,EN,My shoes and my socks [will] be [found].,Do the marked verbs/adjectives agree correctly with their subject? (Feminine singular ET feminine singular yields feminine plural).,"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
+Isabel challenge set,EN,The dog and the cow [are] [nervous].,Do the marked verbs/adjectives agree correctly with their subject? (Masculine singular ET feminine singular yields masculine plural.),"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
+Isabel challenge set,EN,My father and my mother will be happy tomorrow.,Do the marked verbs/adjectives agree correctly with their subject? (Masculine singular ET feminine singular yields masculine plural.),"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
+Isabel challenge set,EN,My refrigerator and my kitchen table [were] [stolen].,Do the marked verbs/adjectives agree correctly with their subject? (Masculine singular ET feminine singular yields masculine plural.),"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
+Isabel challenge set,EN,Paul and I [could] easily be [convinced] to join you.,Do the marked verbs/adjectives agree correctly with their subject? (Smallest coordinated grammatical person wins.),"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
+Isabel challenge set,EN,You and he [could] be [surprised] by her findings.,Do the marked verbs/adjectives agree correctly with their subject? (Smallest coordinated grammatical person wins.),"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
+Isabel challenge set,EN,We and they [are] on different courses.,Do the marked verbs/adjectives agree correctly with their subject? (Smallest coordinated grammatical person wins.),"S-V agreement, feature calculus on coordinated source",Morpho-Syntactic,
+Isabel challenge set,EN,The woman who [saw] a mouse in the corridor is charming.,Are the agreement marks of the flagged participles the correct ones? (Past participle placed after auxiliary AVOIR agrees with verb object iff object precedes auxiliary. Otherwise participle is in masculine singular form).,"S-V agreement, past participles",Morpho-Syntactic,
+Isabel challenge set,EN,The woman that your brother [saw] in the corridor is charming.,Are the agreement marks of the flagged participles the correct ones? (Past participle placed after auxiliary AVOIR agrees with verb object iff object precedes auxiliary. Otherwise participle is in masculine singular form).,"S-V agreement, past participles",Morpho-Syntactic,
+Isabel challenge set,EN,The house that John has [visited] is crumbling.,Are the agreement marks of the flagged participles the correct ones? (Past participle placed after auxiliary AVOIR agrees with verb object iff object precedes auxiliary. Otherwise participle is in masculine singular form).,"S-V agreement, past participles",Morpho-Syntactic,
+Isabel challenge set,EN,John sold the car that he had [won] in a lottery.,Are the agreement marks of the flagged participles the correct ones? (Past participle placed after auxiliary AVOIR agrees with verb object iff object precedes auxiliary. Otherwise participle is in masculine singular form).,"S-V agreement, past participles",Morpho-Syntactic,
+Isabel challenge set,EN,He will come provided that you [come] too.,"Is the flagged verb in the correct mood? (Certain triggering verbs, adjectives or subordinate conjunctions induce the subjunctive mood in the subordinate clause that they govern).",Subjunctive mood,Morpho-Syntactic,
+Isabel challenge set,EN,It is unfortunate that he is not [coming] either.,"Is the flagged verb in the correct mood? (Certain triggering verbs, adjectives or subordinate conjunctions induce the subjunctive mood in the subordinate clause that they govern).",Subjunctive mood,Morpho-Syntactic,
+Isabel challenge set,EN,I requested that families not [be] separated.,"Is the flagged verb in the correct mood? (Certain triggering verbs, adjectives or subordinate conjunctions induce the subjunctive mood in the subordinate clause that they govern).",Subjunctive mood,Morpho-Syntactic,
+Isabel challenge set,EN,[Mary] sorely misses [Jim].,Are the experiencer and the object of the ``missing'' situation correctly preserved in the French translation? (Argument switch).,Argument switch,Lexico-Syntactic,
+Isabel challenge set,EN,[My sister] is really missing [New York.],Are the experiencer and the object of the ``missing'' situation correctly preserved in the French translation? (Argument switch).,Argument switch,Lexico-Syntactic,
+Isabel challenge set,EN,What [he] misses most is [his dog].,Are the experiencer and the object of the ``missing'' situation correctly preserved in the French translation? (Argument switch).,Argument switch,Lexico-Syntactic,
+Isabel challenge set,EN,John gave [his wonderful wife] a nice present.,Are ``gift'' and ``recipient'' arguments correctly rendered in French? (English double-object constructions),Double-object verbs,Lexico-Syntactic,
+Isabel challenge set,EN,John told [the kids] a nice story.,Are ``gift'' and ``recipient'' arguments correctly rendered in French? (English double-object constructions),Double-object verbs,Lexico-Syntactic,
+Isabel challenge set,EN,John sent [his mother] a nice postcard.,Are ``gift'' and ``recipient'' arguments correctly rendered in French? (English double-object constructions),Double-object verbs,Lexico-Syntactic,
+Isabel challenge set,EN,John [failed to] see the relevance of this point.,Is the meaning of ``fail to'' correctly rendered in the French translation?,Fail to,Lexico-Syntactic,
+Isabel challenge set,EN,He failed to respond.,Is the meaning of ``fail to'' correctly rendered in the French translation?,Fail to,Lexico-Syntactic,
+Isabel challenge set,EN,Those who fail to comply with this requirement will be penalized.,Is the meaning of ``fail to'' correctly rendered in the French translation?,Fail to,Lexico-Syntactic,
+Isabel challenge set,EN,John would like to [swim across] the river.,Is the movement action expressed in the English source correctly rendered in French? (Manner-of-movement verbs with path argument may need to be rephrased in French).,Manner-of-movement verbs,Lexico-Syntactic,
+Isabel challenge set,EN,They [ran into] the room.,Is the movement action expressed in the English source correctly rendered in French? (Manner-of-movement verbs with path argument may need to be rephrased in French).,Manner-of-movement verbs,Lexico-Syntactic,
+Isabel challenge set,EN,The man [ran out of] the park.,Is the movement action expressed in the English source correctly rendered in French? (Manner-of-movement verbs with path argument may need to be rephrased in French).,Manner-of-movement verbs,Lexico-Syntactic,
+Isabel challenge set,EN,John [guitared his way] to San Francisco.,Hard example featuring spontaneous noun-to-verb derivation (``nonce verb'').,Manner-of-movement verbs,Lexico-Syntactic,
+Isabel challenge set,EN,Paul [knows] that this is a fact.,Is the French verb for ``know'' correctly chosen? (Choice between ``savoir''/``connaître'' depends on syntactic nature of its object),Overlapping subcat frames,Lexico-Syntactic,
+Isabel challenge set,EN,Paul [knows] this story.,Is the French verb for ``know'' correctly chosen? (Choice between ``savoir''/``connaître'' depends on syntactic nature of its object),Overlapping subcat frames,Lexico-Syntactic,
+Isabel challenge set,EN,Paul [knows] this story is hard to believe.,Is the French verb for ``know'' correctly chosen? (Choice between ``savoir''/``connaître'' depends on syntactic nature of its object),Overlapping subcat frames,Lexico-Syntactic,
+Isabel challenge set,EN,He [knows] my sister will not take it.,Is the French verb for ``know'' correctly chosen? (Choice between ``savoir''/``connaître'' depends on syntactic nature of its object),Overlapping subcat frames,Lexico-Syntactic,
+Isabel challenge set,EN,My sister [knows] your son is reliable.,Is the French verb for ``know'' correctly chosen? (Choice between ``savoir''/``connaître'' depends on syntactic nature of its object),Overlapping subcat frames,Lexico-Syntactic,
+Isabel challenge set,EN,John believes [Bill to be dishonest].,Is the English ``NP to VP'' complement correctly rendered in the French translation? (Sometimes one needs to translate this structure as a finite clause).,NP to VP,Lexico-Syntactic,
+Isabel challenge set,EN,He liked [his father to tell him stories].,Is the English ``NP to VP'' complement correctly rendered in the French translation? (Sometimes one needs to translate this structure as a finite clause).,NP to VP,Lexico-Syntactic,
+Isabel challenge set,EN,She wanted [her mother to let her go].,Is the English ``NP to VP'' complement correctly rendered in the French translation? (Sometimes one needs to translate this structure as a finite clause).,NP to VP,Lexico-Syntactic,
+Isabel challenge set,EN,John [cooked] a big chicken.,Is the English verb correctly rendered in the French translation? (Agentive use of some French verbs requires embedding under ``faire'').,Factitives,Lexico-Syntactic,
+Isabel challenge set,EN,John [melted] a lot of ice.,Is the English verb correctly rendered in the French translation? (Agentive use of some French verbs requires embedding under ``faire'').,Factitives,Lexico-Syntactic,
+Isabel challenge set,EN,She likes to [grow] flowers.,Is the English verb correctly rendered in the French translation? (Agentive use of some French verbs requires embedding under ``faire'').,Factitives,Lexico-Syntactic,
+Isabel challenge set,EN,Use the meat knife.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
+Isabel challenge set,EN,Use the butter knife.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
+Isabel challenge set,EN,Use the steak knife.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
+Isabel challenge set,EN,Clean the water filter.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
+Isabel challenge set,EN,Clean the juice filter.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
+Isabel challenge set,EN,Clean the tea filter.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
+Isabel challenge set,EN,Clean the cloth filter.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
+Isabel challenge set,EN,Clean the metal filter.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
+Isabel challenge set,EN,Clean the paper filter.,Is the English nominal compound rendered with the right preposition in the French translation?,Noun Compounds,Lexico-Syntactic,
+Isabel challenge set,EN,Stop [beating around the bush].,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Common idioms,Lexico-Syntactic,
+Isabel challenge set,EN,You are [putting the cart before the horse].,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Common idioms,Lexico-Syntactic,
+Isabel challenge set,EN,His comment proved to be [the straw that broke the camel's back].,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Common idioms,Lexico-Syntactic,
+Isabel challenge set,EN,His argument really [hit the nail on the head].,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Common idioms,Lexico-Syntactic,
+Isabel challenge set,EN,It's [no use crying over spilt milk].,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Common idioms,Lexico-Syntactic,
+Isabel challenge set,EN,It is [no use crying over spilt milk].,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Common idioms,Lexico-Syntactic,
+Isabel challenge set,EN,The cart has been put before the horse.,Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Syntactically flexible idioms,Lexico-Syntactic,
+Isabel challenge set,EN,"With this argument, [the nail has been hit on the head].",Is the English idiomatic expression correctly rendered with a suitable French idiomatic expression?,Syntactically flexible idioms,Lexico-Syntactic,
+Isabel challenge set,EN,[Have the kids] ever watched that movie?,Is the English question correctly rendered as a French question?,Yes-no question syntax,Syntactic,
+Isabel challenge set,EN,[Hasn't your boss denied you] a promotion?,Is the English question correctly rendered as a French question?,Yes-no question syntax,Syntactic,
+Isabel challenge set,EN,[Shouldn't I attend] this meeting?,Is the English question correctly rendered as a French question?,Yes-no question syntax,Syntactic,
+Isabel challenge set,EN,"Mary looked really happy tonight, [didn't she]?",Is the English ``tag question'' element correctly rendered in the translation?,Tag questions,Syntactic,
+Isabel challenge set,EN,"We should not do that again, [should we]?",Is the English ``tag question'' element correctly rendered in the translation?,Tag questions,Syntactic,
+Isabel challenge set,EN,"She was perfect tonight, [was she not]?",Is the English ``tag question'' element correctly rendered in the translation?,Tag questions,Syntactic,
+Isabel challenge set,EN,The guy [that] she is going out [with] is handsome.,Is the dangling preposition of the English sentence correctly placed in the French translation?,WH-MVT and stranded preps,Syntactic,
+Isabel challenge set,EN,[Whom] is she going out [with] these days?,Is the dangling preposition of the English sentence correctly placed in the French translation?,WH-MVT and stranded preps,Syntactic,
+Isabel challenge set,EN,The girl [that] he has been talking [about] is smart.,Is the dangling preposition of the English sentence correctly placed in the French translation?,WH-MVT and stranded preps,Syntactic,
+Isabel challenge set,EN,[Who] was he talking [to] when you left?,Is the dangling preposition of the English sentence correctly placed in the French translation?,WH-MVT and stranded preps,Syntactic,
+Isabel challenge set,EN,The city [that] he is arriving [from] is dangerous.,Is the dangling preposition of the English sentence correctly placed in the French translation?,WH-MVT and stranded preps,Syntactic,
+Isabel challenge set,EN,[Where] is he arriving [from]?,Is the dangling preposition of the English sentence correctly placed in the French translation?,WH-MVT and stranded preps,Syntactic,
+Isabel challenge set,EN,Rarely [did the dog] run.,Is the adverb-triggered subject-verb inversion in the English sentence correctly rendered in the French translation?,Adverb-triggered inversion,Syntactic,
+Isabel challenge set,EN,Never before [had she been] so unhappy.,Is the adverb-triggered subject-verb inversion in the English sentence correctly rendered in the French translation?,Adverb-triggered inversion,Syntactic,
+Isabel challenge set,EN,Nowhere [were the birds] so colorful.,Is the adverb-triggered subject-verb inversion in the English sentence correctly rendered in the French translation?,Adverb-triggered inversion,Syntactic,
+Isabel challenge set,EN,Soup [is eaten] with a large spoon.,Is the generic statement made in the English sentence correctly and naturally rendered in the French translation?,Middle voice,Syntactic,
+Isabel challenge set,EN,Masonry [is cut] using a diamond blade.,Is the generic statement made in the English sentence correctly and naturally rendered in the French translation?,Middle voice,Syntactic,
+Isabel challenge set,EN,Champagne [is drunk] in a glass called a flûte.,Is the generic statement made in the English sentence correctly and naturally rendered in the French translation?,Middle voice,Syntactic,
+Isabel challenge set,EN,"[Should] Paul leave, I would be sad.",Fronted ``should'' is interpreted as a conditional subordinator. It is normally translated as ``si'' with imperfect tense.,Fronted ``should'',Syntactic,
+Isabel challenge set,EN,"Should he become president, she would be promoted immediately.",Fronted ``should'' is interpreted as a conditional subordinator. It is normally translated as ``si'' with imperfect tense.,Fronted ``should'',Syntactic,
+Isabel challenge set,EN,"[Should] he fall, he would get up again immediately.",Fronted ``should'' is interpreted as a conditional subordinator. It is normally translated as ``si'' with imperfect tense.,Fronted ``should'',Syntactic,
+Isabel challenge set,EN,She had a lot of money but he did not have [any].,Are the English pronouns correctly rendered in the French translations?,Clitic pronouns,Syntactic,
+Isabel challenge set,EN,He did not talk [to them] very often.,Are the English pronouns correctly rendered in the French translations?,Clitic pronouns,Syntactic,
+Isabel challenge set,EN,The men are watching [each other].,Are the English pronouns correctly rendered in the French translations?,Clitic pronouns,Syntactic,
+Isabel challenge set,EN,He gave [it] to the man.,Are the English pronouns correctly rendered in the French translations?,Clitic pronouns,Syntactic,
+Isabel challenge set,EN,He did not give [it] to [her].,Are the English pronouns correctly rendered in the French translations?,Clitic pronouns,Syntactic,
+Isabel challenge set,EN,The [first four] men were exhausted.,Is the relative order of the ordinals and numerals correct in the French translation?,Ordinal placement,Syntactic,
+Isabel challenge set,EN,The [last three] candidates were eliminated.,Is the relative order of the ordinals and numerals correct in the French translation?,Ordinal placement,Syntactic,
+Isabel challenge set,EN,The [other two] guys left without paying.,Is the relative order of the ordinals and numerals correct in the French translation?,Ordinal placement,Syntactic,
+Isabel challenge set,EN,He washed [his] hands.,Is the French translation correct and natural both in: a) its use of a particular determiner on the body part noun; and b) the presence or absence of a reflexive pronoun before the verb?,Inalienable possession,Syntactic,
+Isabel challenge set,EN,I brushed [my] teeth.,Is the French translation correct and natural both in: a) its use of a particular determiner on the body part noun; and b) the presence or absence of a reflexive pronoun before the verb?,Inalienable possession,Syntactic,
+Isabel challenge set,EN,You brushed [your] teeth.,Is the French translation correct and natural both in: a) its use of a particular determiner on the body part noun; and b) the presence or absence of a reflexive pronoun before the verb?,Inalienable possession,Syntactic,
+Isabel challenge set,EN,I raised [my] hand.,Is the French translation correct and natural both in: a) its use of a particular determiner on the body part noun; and b) the presence or absence of a reflexive pronoun before the verb?,Inalienable possession,Syntactic,
+Isabel challenge set,EN,He turned [his] head.,Is the French translation correct and natural both in: a) its use of a particular determiner on the body part noun; and b) the presence or absence of a reflexive pronoun before the verb?,Inalienable possession,Syntactic,
+Isabel challenge set,EN,He raised his eyes to heaven.,Is the French translation correct and natural both in: a) its use of a particular determiner on the body part noun; and b) the presence or absence of a reflexive pronoun before the verb?,Inalienable possession,Syntactic,
+Isabel challenge set,EN,The strangers [] the woman saw were working.,Is the English zero relative pronoun correctly translated as a non-zero one in the French translation?,Zero REL PRO,Syntactic,
+Isabel challenge set,EN,The man [] your sister hates is evil.,Is the English zero relative pronoun correctly translated as a non-zero one in the French translation?,Zero REL PRO,Syntactic,
+Isabel challenge set,EN,The girl [] my friend was talking about is gone.,Is the English zero relative pronoun correctly translated as a non-zero one in the French translation?,Zero REL PRO,Syntactic,
index/en-es_input_tokens.index
ADDED
Binary file (245 kB)

index/en-es_input_words.index
ADDED
Binary file (206 kB)

index/en-es_metadata_ref.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4bc8b47dc5db98c9cee79be4647d6853756838632ea2f42c1a4c377e948fd8a3
+size 7599198

index/en-es_output_tokens.index
ADDED
Binary file (331 kB)

index/en-es_output_words.index
ADDED
Binary file (259 kB)
plotsjs.js
ADDED
@@ -0,0 +1,990 @@
+
+
+async () => {
+    // set testFn() function on globalThis, so your html onclick can access it
+
+    globalThis.testFn = () => {
+        document.getElementById('demo').innerHTML = "Hello?"
+    };
+
+    const d37 = await import("https://cdn.jsdelivr.net/npm/d3@7/+esm");
+    const d3 = await import("https://cdn.jsdelivr.net/npm/d3@5/+esm");
+    const $ = await import("https://cdn.jsdelivr.net/npm/jquery@3.7.1/dist/jquery.min.js");
+    globalThis.$ = $;
+
+    globalThis.d3 = d3;
+
+    globalThis.d3Fn = () => {
+        d3.select('#viz').append('svg')
+            .append('rect')
+            .attr('width', 50)
+            .attr('height', 50)
+            .attr('fill', 'black')
+            .on('mouseover', function(){d3.select(this).attr('fill', 'red')})
+            .on('mouseout', function(){d3.select(this).attr('fill', 'black')});
+    };
+
+    globalThis.testFn_out = (val, radio_c) => {
+        // document.getElementById('demo').innerHTML = val
+        console.log(val);
+        // globalThis.d3Fn();
+        return([val, radio_c]);
+    };
+
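+    // testFn_out_json: main callback for the visualization payload. The layout
+    // (inferred from the assignments below) is: data[1][0] beam-search tree,
+    // data[1][1] per-step token probabilities, data[1][2]/data[1][3] tokenized
+    // input/target HTML, data[2] embedding projections, and data[3..5] the
+    // attention payloads rendered via attViz.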
+    globalThis.testFn_out_json = (data) => {
+        console.log(data);
+        var $ = jQuery;
+
+        data_beam = data[1][0];
+        data_probs = data[1][1];
+        data_html_inputs = data[1][2];
+        data_html_target = data[1][3];
+        data_embds = data[2];
+
+        attViz(data[3]);
+        attViz(data[4]);
+        attViz(data[5]);
+
+        console.log(data_beam);
+        const idMapping = data_beam.reduce((acc, el, i) => {
+            acc[el.id] = i;
+            return acc;
+        }, {});
+
+        let root;
+        data_beam.forEach(el => {
+            // Handle the root element
+            if (el.parentId === null) {
+                root = el;
+                return;
+            }
+            // Use our mapping to locate the parent element in our data_beam array
+            const parentEl = data_beam[idMapping[el.parentId]];
+            // Add our current el to its parent's `children` array
+            parentEl.children = [...(parentEl.children || []), el];
+        });
+
+        // console.log(Tree(root));
+        // document.getElementById('d3_beam_search').innerHTML = Tree(root)
+        d3.select('#d3_beam_search').html("");
+        d3.select('#d3_beam_search').append(function(){ return Tree(root); });
+
+        // probabilities
+        d3.select('#d3_text_grid').html("");
+        d3.select('#d3_text_grid').append(function(){ return TextGrid(data_probs); });
+        // $('#d3_text_grid').html(TextGrid(data));
+
+        // tokenization
+        d3.select('#d3_tok').html(data_html_inputs);
+        d3.select('#d3_tok_target').html(data_html_target);
+
+        // embeddings
+        d3.select("#d3_embeds_source").html("here");
+        // words or token visualization?
+        console.log(d3.select("#select_type").node().value);
+        d3.select("#select_type").attr("hidden", null);
+        d3.select("#select_type").on("change", change);
+        change();
+        // tokens
+        // network plots
+        ['input', 'output'].forEach(text_type => {
+            ['tokens', 'words'].forEach(text_key => {
+                // console.log(type, key, data[0][text_type]);
+                data_i = data_embds[text_type][text_key];
+                embeddings_network([], data_i['tnse'], data_i['similar_queries'], type=text_type + "_" + text_key);
+            });
+        });
+
+        // $('#d3_beam_search').html(Tree(root));
+
+        return(['string', {}]);
+    }
+
+    function change() {
+        show_type = d3.select("#select_type").node().value;
+        // hide all
+        d3.selectAll(".d3_embed").attr("hidden", '');
+        d3.selectAll(".d3_graph").attr("hidden", '');
+        // show current type
+        d3.select("#d3_embeds_input_" + show_type).attr("hidden", null);
+        d3.select("#d3_embeds_output_" + show_type).attr("hidden", null);
+        d3.select("#d3_graph_input_" + show_type).attr("hidden", null);
+        d3.select("#d3_graph_output_" + show_type).attr("hidden", null);
+    }
+
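+    // embeddings_network: turns the nearest-neighbour queries of each token
+    // into a graph (blue 'sentence' nodes for the tokens themselves, red
+    // 'similar' nodes for their neighbours, weight-1 edges chaining
+    // consecutive tokens) and renders it into the '#d3_graph_<type>' container.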
+    function embeddings_network(tokens_text, dict_projected_embds, similar_vocab_queries, type="source") {
+        // tokens_text: not used
+        // dict_projected_embds = tnse
+        console.log("Each token is a node; distance if in similar list", type);
+        console.log(tokens_text, dict_projected_embds, similar_vocab_queries);
+        // similar_vocab_queries_target[key]['similar_topk']
+
+        var nodes_tokens = {};
+        var nodeHash = {};
+        var nodes = [];     // [{id: , label: }]
+        var edges = [];     // [{source: , target: , weight: }]
+        var edges_ids = []; // [{source: , target: , weight: }]
+
+        // similar_vocab_queries {key: {similar_topk: [], distance: []}}
+        console.log('similar_vocab_queries', similar_vocab_queries);
+        prev_node = '';
+        for ([sent_token, value] of Object.entries(similar_vocab_queries)) {
+            // console.log('dict_projected_embds', sent_token, parseInt(sent_token), value, dict_projected_embds);
+            // sent_token = parseInt(sent_token); // Object.entries assumes key:string;
+            token_text = dict_projected_embds[sent_token][3]
+            if (!nodeHash[sent_token]) {
+                nodeHash[sent_token] = {id: sent_token, label: token_text, type: 'sentence', type_i: 0};
+                nodes.push(nodeHash[sent_token]);
+            }
+            sim_tokens = value['similar_topk']
+            dist_tokens = value['distance']
+
+            for (let index = 0; index < sim_tokens.length; index++) {
+                const sim = sim_tokens[index];
+                const dist = dist_tokens[index];
+
+                token_text_sim = dict_projected_embds[sim][3]
+                if (!nodeHash[sim]) {
+                    nodeHash[sim] = {id: sim, label: token_text_sim, type: 'similar', type_i: 1};
+                    nodes.push(nodeHash[sim]);
+                }
+                edges.push({source: nodeHash[sent_token], target: nodeHash[sim], weight: dist});
+                edges_ids.push({source: sent_token, target: sim, weight: dist});
+            }
+
+            if (prev_node != '') {
+                edges.push({source: nodeHash[prev_node], target: nodeHash[sent_token], weight: 1});
+                edges_ids.push({source: prev_node, target: sent_token, weight: 1});
+            }
+            prev_node = sent_token;
+        }
+        console.log("TYPE", type, edges, nodes, edges_ids, similar_vocab_queries)
+        // d3.select('#d3_graph_input_tokens').html(networkPlot({nodes: nodes, links: edges}, similar_vocab_queries, div_type=type));
+        // type + "_" + key
+        d3.select('#d3_graph_' + type).html("");
+        d3.select('#d3_graph_' + type).append(function(){ return networkPlot({nodes: nodes, links: edges}, similar_vocab_queries, dict_projected_embds, div_type=type); });
+
+        // $('#d3_embeds_network_target').html(networkPlot({nodes: nodes, links: edges}));
+        // $('#d3_embeds_network_' + type).html(networkPlot({nodes: nodes, links: edges}));
+    }
+
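+    // networkPlot: force-directed rendering (d3 v7) of the graph built above.
+    // Hovering a token node enlarges it and its neighbours; clicking lists the
+    // neighbours with their distances in the '#similar_' + type panel via
+    // show_similar_tokens.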
+    function networkPlot(data, similar_vocab_queries, dict_proj, div_type="source", {
+        width = 400, // outer width, in pixels
+        height,      // outer height, in pixels
+        r = 3,       // radius of nodes
+        padding = 1, // horizontal padding for first and last column
+        // text = d => d[2],
+    } = {}) {
+        // data_dict = data;
+        data = data; // [div_type];
+        similar_vocab_queries = similar_vocab_queries; // [div_type];
+        console.log("data, similar_vocab_queries, div_type");
+        console.log(data, similar_vocab_queries, div_type);
+
+        // Create the SVG container.
+        var margin = {top: 10, right: 10, bottom: 30, left: 50},
+            width = width  // - margin.left - margin.right,
+            height = 400   // - margin.top - margin.bottom;
+
+        width_box = width + margin.left + margin.right;
+        height_box = height + margin.top + margin.bottom;
+        totalWidth = width * 2;
+
+        var svg = d37.create("svg")
+            .attr("width", width + margin.left + margin.right)
+            .attr("height", height + margin.top + margin.bottom)
+
+        // Initialize the links
+        var link = svg
+            .selectAll("line")
+            .data(data.links)
+            .enter()
+            .append("line")
+            .style("fill", d => d.weight == 1 ? "#dfd5d5" : "#000000") // , "#69b3a2" : "#69b3a2")
+            .style("stroke", "#aaa")
+
+        var text = svg
+            .selectAll("text")
+            .data(data.nodes)
+            .enter()
+            .append("text")
+            .style("text-anchor", "middle")
+            .attr("y", 15)
+            .attr("class", d => 'text_token-' + dict_proj[d.id][4] + div_type)
+            .attr("div-type", div_type)
+            // .attr("class", d => 'text_token-' + d.index)
+            .text(function (d) { return d.label })
+            // .on('mouseover', function(d) { (d.type_i == 0) ? highlight_mouseover_text : console.log(0)} )
+            // .on('mouseover', function(d) { (d.type_i == 0) ? highlight_mouseout_text : '' } )
+            // .on('mouseout', highlight_mouseout_text )
+            // .join('text')
+            // .text(function(d) {
+            //     return d.id
+            // })
+
+        // Initialize the nodes
+        var node = svg
+            .selectAll("circle")
+            .data(data.nodes)
+            .enter()
+            .append("circle")
+            .attr("r", 6)
+            // .attr("class", d => 'node_token-' + d.id)
+            .attr("class", d => 'node_token-' + dict_proj[d.id][4] + div_type)
+            .attr("div-type", div_type)
+            .style("fill", d => d.type_i ? "#e85252" : "#6689c6") // , "#69b3a2" : "#69b3a2")
+            .on('mouseover', highlight_mouseover)
+            // .on('mouseover', function(d) { return (d.type_i == 0) ? highlight_mouseover : console.log(0)} )
+            .on('mouseout', highlight_mouseout)
+            .on('click', change_legend)
+            // .on('click', show_similar_tokens)
+
+        // List the forces we want to apply on the network
+        var simulation = d37.forceSimulation(data.nodes)             // Force algorithm is applied to data.nodes
+            .force("link", d37.forceLink()                           // This force provides links between nodes
+                .id(function(d) { return d.id; })                    // This provides the id of a node
+                .links(data.links)                                   // and this the list of links
+            )
+            .force("charge", d37.forceManyBody(-400))                // This adds repulsion between nodes. Play with the -400 for the repulsion strength
+            .force("center", d37.forceCenter(width / 2, height / 2)) // This force attracts nodes to the center of the svg area
+            // .force("collision", d3.forceCollide())
+            .on("end", ticked);
+
+        // This function is run at each iteration of the force algorithm, updating the nodes position.
+        function ticked() {
+            link
+                .attr("x1", function(d) { return d.source.x; })
+                .attr("y1", function(d) { return d.source.y; })
+                .attr("x2", function(d) { return d.target.x; })
+                .attr("y2", function(d) { return d.target.y; });
+
+            node
+                .attr("cx", function(d) { return d.x + 3; })
+                .attr("cy", function(d) { return d.y - 3; });
+
+            text
+                .attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; })
+        }
+
+        function highlight_mouseover(d, i) {
+            console.log("highlight_mouseover", d, i, d37.select(this).attr("div-type"));
+            if (i.type_i == 0) {
+                token_id = i.id
+                similar_ids = similar_vocab_queries[token_id]['similar_topk'];
+                d37.select(this).transition()
+                    .duration('50')
+                    .style('opacity', '1')
+                    .attr("r", 12)
+                type = d37.select(this).attr("div-type")
+                similar_ids.forEach(similar_token => {
+                    node_id_name = dict_proj[similar_token][4]
+                    d37.selectAll('.node_token-' + node_id_name + type).attr("r", 12).style('opacity', '1') // .raise()
+                    // d3.selectAll('.text_token-' + node_id_name).raise()
+                });
+            }
+        }
+
+        function highlight_mouseout(d, i) {
+            if (i.type_i == 0) {
+                token_id = i.id
+                console.log("similar_vocab_queries", similar_vocab_queries, "this type:", d37.select(this).attr("div-type"));
+                similar_ids = similar_vocab_queries[token_id]['similar_topk'];
+                // clean_sentences();
+                d37.select(this).transition()
+                    .duration('50')
+                    .style('opacity', '.7')
+                    .attr("r", 6)
+                type = d37.select(this).attr("div-type")
+                similar_ids.forEach(similar_token => {
+                    node_id_name = dict_proj[similar_token][4]
+                    d37.selectAll('.node_token-' + node_id_name + type).attr("r", 6).style('opacity', '.7')
+                    d37.selectAll("circle").raise()
+                });
+            }
+        }
+
+        function change_legend(d, i, j) {
+            console.log(d, i, dict_proj);
+            if (i['id'] in dict_proj) {
+                // show_sentences(dict_proj[i[2]], i[2]);
+                show_similar_tokens(i['id'], '#similar_' + type);
+                console.log(dict_proj[i['id']]);
+            }
+            else { console.log("no sentence") };
+        }
+
+        function show_similar_tokens(token, div_name_similar='#similar_input_tokens') {
+            d37.select(div_name_similar).html("");
+            console.log("token", token);
+            console.log("similar_vocab_queries[token]", similar_vocab_queries[token]);
+            token_data = similar_vocab_queries[token];
+            console.log(token, token_data);
+            var decForm = d37.format(".3f");
+
+            d37.select(div_name_similar)
+                .selectAll().append("p")
+                .data(token_data['similar_topk'])
+                .enter()
+                .append("p").append('text')
+                // .attr('class_data', sent_id)
+                .attr('class_id', d => d)
+                .style("background", d => { if (d == token) return "yellow" })
+                // .text(d => d + " \n ");
+                .text((d, i) => do_text(d, i));
+
+            function do_text(d, i) {
+                console.log("do_text d,i");
+                console.log(d, i);
+                console.log("data_dict[d], data_dict");
+                console.log(dict_proj[d], dict_proj);
+                return dict_proj[d][3] + " " + decForm(token_data['distance'][i]) + " ";
+            }
+        }
+
+        return svg.node();
+    };
+
+
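+    // Tree (below) is the Observable tree component used earlier to draw the
+    // beam-search hypothesis tree; node titles show name + prob, and labels
+    // with prob == 1 (presumably the selected hypothesis) are drawn in red.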
+    // Copyright 2021 Observable, Inc.
+    // Released under the ISC license.
+    // https://observablehq.com/@d3/tree
+    function Tree(data, { // data is either tabular (array of objects) or hierarchy (nested objects)
+        path, // as an alternative to id and parentId, returns an array identifier, imputing internal nodes
+        id = Array.isArray(data) ? d => d.id : null, // if tabular data, given a d in data, returns a unique identifier (string)
+        parentId = Array.isArray(data) ? d => d.parentId : null, // if tabular data, given a node d, returns its parent's identifier
+        children, // if hierarchical data, given a d in data, returns its children
+        tree = d3.tree, // layout algorithm (typically d3.tree or d3.cluster)
+        sort, // how to sort nodes prior to layout (e.g., (a, b) => d3.descending(a.height, b.height))
+        label = d => d.name, // given a node d, returns the display name
+        title = d => d.name, // given a node d, returns its hover text
+        link, // given a node d, its link (if any)
+        linkTarget = "_blank", // the target attribute for links (if any)
+        width = 800, // outer width, in pixels
+        height, // outer height, in pixels
+        r = 3, // radius of nodes
+        padding = 1, // horizontal padding for first and last column
+        fill = "#999", // fill for nodes
+        fillOpacity, // fill opacity for nodes
+        stroke = "#555", // stroke for links
+        strokeWidth = 2, // stroke width for links
+        strokeOpacity = 0.4, // stroke opacity for links
+        strokeLinejoin, // stroke line join for links
+        strokeLinecap, // stroke line cap for links
+        halo = "#fff", // color of label halo
+        haloWidth = 3, // padding around the labels
+        curve = d37.curveBumpX, // curve for the link
+    } = {}) {
+
+        // If id and parentId options are specified, or the path option, use d3.stratify
+        // to convert tabular data to a hierarchy; otherwise we assume that the data is
+        // specified as an object {children} with nested objects (a.k.a. the "flare.json"
+        // format), and use d3.hierarchy.
+        const root = path != null ? d3.stratify().path(path)(data)
+            : id != null || parentId != null ? d3.stratify().id(id).parentId(parentId)(data)
+            : d3.hierarchy(data, children);
+
+        // Sort the nodes.
+        if (sort != null) root.sort(sort);
+
+        // Compute labels and titles.
+        const descendants = root.descendants();
+        const L = label == null ? null : descendants.map(d => label(d.data, d));
+
+        // Compute the layout.
+        const descWidth = 10;
+        // console.log('descendants', descendants);
+        const realWidth = descWidth * descendants.length
+        const totalWidth = (realWidth > width) ? realWidth : width;
+
+        const dx = 25;
+        const dy = totalWidth / (root.height + padding);
+        tree().nodeSize([dx, dy])(root);
+
+        // Center the tree.
+        let x0 = Infinity;
+        let x1 = -x0;
+        root.each(d => {
+            if (d.x > x1) x1 = d.x;
+            if (d.x < x0) x0 = d.x;
+        });
+
+        // Compute the default height.
+        if (height === undefined) height = x1 - x0 + dx * 2;
+
+        // Use the required curve
+        if (typeof curve !== "function") throw new Error(`Unsupported curve`);
+
+        const parent = d3.create("div");
+
+        const body = parent.append("div")
+            .style("overflow-x", "scroll")
+            .style("-webkit-overflow-scrolling", "touch");
+
+        const svg = body.append("svg")
+            .attr("viewBox", [-dy * padding / 2, x0 - dx, totalWidth, height])
+            .attr("width", totalWidth)
+            .attr("height", height)
+            .attr("style", "max-width: 100%; height: auto; height: intrinsic;")
+            .attr("font-family", "sans-serif")
+            .attr("font-size", 12);
+
+        svg.append("g")
+            .attr("fill", "none")
+            .attr("stroke", stroke)
+            .attr("stroke-opacity", strokeOpacity)
+            .attr("stroke-linecap", strokeLinecap)
+            .attr("stroke-linejoin", strokeLinejoin)
+            .attr("stroke-width", strokeWidth)
+            .selectAll("path")
+            .data(root.links())
+            .join("path")
+            // .attr("stroke", d => d.prob > 0.5 ? 'red' : 'blue')
+            // .attr("fill", "red")
+            .attr("d", d37.link(curve)
+                .x(d => d.y)
+                .y(d => d.x));
+
+        const node = svg.append("g")
+            .selectAll("a")
+            .data(root.descendants())
+            .join("a")
+            .attr("xlink:href", link == null ? null : d => link(d.data, d))
+            .attr("target", link == null ? null : linkTarget)
+            .attr("transform", d => `translate(${d.y},${d.x})`);
+
+        node.append("circle")
+            .attr("fill", d => d.children ? stroke : fill)
+            .attr("r", r);
+
+        title = d => (d.name + (d.prob));
+
+        if (title != null) node.append("title")
+            .text(d => title(d.data, d));
+
+        if (L) node.append("text")
+            .attr("dy", "0.32em")
+            .attr("x", d => d.children ? -6 : 6)
+            .attr("text-anchor", d => d.children ? "end" : "start")
+            .attr("paint-order", "stroke")
+            .attr("stroke", 'white')
+            .attr("fill", d => d.data.prob == 1 ? ('red') : ('black'))
+            .attr("stroke-width", haloWidth)
+            .text((d, i) => L[i]);
+        body.node().scrollBy(totalWidth, 0);
+        return svg.node();
+    }
+
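+    // TextGrid: one column per decoding step, one cell per top-k candidate
+    // token; the gray fill opacity of each cell encodes the token probability
+    // and the hover title shows "token:prob". The grid scrolls horizontally.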
+    function TextGrid(data, div_name, {
+        width = 640, // outer width, in pixels
+        height,      // outer height, in pixels
+        r = 3,       // radius of nodes
+        padding = 1, // horizontal padding for first and last column
+        // text = d => d[2],
+    } = {}) {
+        // console.log("TextGrid", data);
+
+        // Compute the layout.
+        const dx = 10;
+        const dy = 10; // width / (root.height + padding);
+
+        const marginTop = 20;
+        const marginRight = 20;
+        const marginBottom = 30;
+        const marginLeft = 30;
+
+        // Center the tree.
+        let x0 = Infinity;
+        let x1 = -x0;
+        topk = 10;
+        word_length = 20;
+        const rectWidth = 60;
+        const rectTotal = 70;
+
+        wval = 0
+
+        const realWidth = rectTotal * data.length
+        const totalWidth = (realWidth > width) ? realWidth : width;
+        // root.each(d => {
+        //     if (d.x > x1) x1 = d.x;
+        //     if (d.x < x0) x0 = d.x;
+        // });
+
+        // Compute the default height.
+        // if (height === undefined) height = x1 - x0 + dx * 2;
+        if (height === undefined) height = topk * word_length + 10;
+
+        const parent = d3.create("div");
+
+        // parent.append("svg")
+        //     .attr("width", width)
+        //     .attr("height", height)
+        //     .style("position", "absolute")
+        //     .style("pointer-events", "none")
+        //     .style("z-index", 1);
+
+        // const svg = d3.create("svg")
+        //     // svg = parent.append("svg")
+        //     .attr("viewBox", [-dy * padding / 2, x0 - dx, width, height])
+        //     .attr("width", width)
+        //     .attr("height", height)
+        //     .attr("style", "max-width: 100%; height: auto; height: intrinsic;")
+        //     .attr("font-family", "sans-serif")
+        //     .attr("font-size", 10);
+
+        // div.data([1, 2, 4, 8, 16, 32], d => d);
+        // div.enter().append("div").text(d => d);
+
+        const body = parent.append("div")
+            .style("overflow-x", "scroll")
+            .style("-webkit-overflow-scrolling", "touch");
+
+        const svg = body.append("svg")
+            .attr("width", totalWidth)
+            .attr("height", height)
+            .style("display", "block")
+            .attr("font-family", "sans-serif")
+            .attr("font-size", 10);
+
+        data.forEach(words_list => {
+            // console.log(wval, words_list);
+            words = words_list[2]; // {'t': words_list[2], 'p': words_list[1]};
+            scores = words_list[1];
+            words_score = words.map((x, i) => { return {t: x, p: scores[i]} })
+            // console.log(words_score);
+            // svg.selectAll("text").enter()
+            //     .data(words)
+            //     .join("text")
+            //     .text((d,i) => (d))
+            //     .attr("x", wval)
+            //     .attr("y", ((d,i) => (20 + i*20)))
+
+            var probs = svg.selectAll("text").enter()
+                .data(words_score).join('g');
+
+            probs.append("rect")
+                // .data(words)
+                .attr("x", wval)
+                .attr("y", ((d, i) => (10 + i * 20)))
+                .attr('width', rectWidth)
+                .attr('height', 15)
+                .attr("color", 'gray')
+                .attr("fill", "gray")
+                // .attr("fill-opacity", "0.2")
+                .attr("fill-opacity", (d) => (d.p))
+                .attr("stroke-opacity", 0.8)
+                .append("svg:title")
+                .text(function(d){ return d.t + ":" + d.p; });
+
+            probs.append("text")
+                // .data(words)
+                .text((d, i) => (d.t))
+                .attr("x", wval)
+                .attr("y", ((d, i) => (20 + i * 20)))
+                // .attr("fill", 'white')
+                .attr("font-weight", 700);
+
+            wval = wval + rectTotal;
+        });
+
+        body.node().scrollBy(totalWidth, 0);
+        // return svg.node();
+        return parent.node();
+    }
+
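+    // attViz: head-view attention visualization (it appears to follow the
+    // BertViz head_view layout): left/right token columns with attention arcs
+    // between them, a layer dropdown, a filter dropdown and one colored square
+    // per head; hovering a token shows only its arcs and shades the opposite
+    // column by attention weight.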
624 |
+
function attViz(PYTHON_PARAMS) {
|
625 |
+
var $ = jQuery;
|
626 |
+
const params = PYTHON_PARAMS; // HACK: PYTHON_PARAMS is a template marker that is replaced by actual params.
|
627 |
+
const TEXT_SIZE = 15;
|
628 |
+
const BOXWIDTH = 110;
|
629 |
+
const BOXHEIGHT = 22.5;
|
630 |
+
const MATRIX_WIDTH = 115;
|
631 |
+
const CHECKBOX_SIZE = 20;
|
632 |
+
const TEXT_TOP = 30;
|
633 |
+
|
634 |
+
console.log("d3 version in ffuntions", d3.version)
|
635 |
+
let headColors;
|
636 |
+
try {
|
637 |
+
headColors = d3.scaleOrdinal(d3.schemeCategory10);
|
638 |
+
} catch (err) {
|
639 |
+
console.log('Older d3 version')
|
640 |
+
headColors = d3.scale.category10();
|
641 |
+
}
|
642 |
+
let config = {};
|
643 |
+
// globalThis.
|
644 |
+
initialize();
|
645 |
+
renderVis();
|
646 |
+
|
647 |
+
function initialize() {
|
648 |
+
// globalThis.initialize = () => {
|
649 |
+
|
650 |
+
console.log("init")
|
651 |
+
config.attention = params['attention'];
|
652 |
+
config.filter = params['default_filter'];
|
653 |
+
config.rootDivId = params['root_div_id'];
|
654 |
+
config.nLayers = config.attention[config.filter]['attn'].length;
|
655 |
+
config.nHeads = config.attention[config.filter]['attn'][0].length;
|
656 |
+
config.layers = params['include_layers']
|
657 |
+
|
658 |
+
if (params['heads']) {
|
659 |
+
config.headVis = new Array(config.nHeads).fill(false);
|
660 |
+
params['heads'].forEach(x => config.headVis[x] = true);
|
661 |
+
} else {
|
662 |
+
config.headVis = new Array(config.nHeads).fill(true);
|
663 |
+
}
|
664 |
+
config.initialTextLength = config.attention[config.filter].right_text.length;
|
665 |
+
config.layer_seq = (params['layer'] == null ? 0 : config.layers.findIndex(layer => params['layer'] === layer));
|
666 |
+
config.layer = config.layers[config.layer_seq]
|
667 |
+
|
668 |
+
// '#' + temp1.root_div_id+ ' #layer'
|
669 |
+
$('#' + config.rootDivId+ ' #layer').empty();
|
670 |
+
|
671 |
+
let layerEl = $('#' + config.rootDivId+ ' #layer');
|
672 |
+
console.log(layerEl)
|
673 |
+
for (const layer of config.layers) {
|
674 |
+
layerEl.append($("<option />").val(layer).text(layer));
|
675 |
+
}
|
676 |
+
layerEl.val(config.layer).change();
|
677 |
+
layerEl.on('change', function (e) {
|
678 |
+
config.layer = +e.currentTarget.value;
|
679 |
+
config.layer_seq = config.layers.findIndex(layer => config.layer === layer);
|
680 |
+
renderVis();
|
681 |
+
});
|
682 |
+
|
683 |
+
$('#'+config.rootDivId+' #filter').on('change', function (e) {
|
684 |
+
// $(`#${config.rootDivId} #filter`).on('change', function (e) {
|
685 |
+
|
686 |
+
config.filter = e.currentTarget.value;
|
687 |
+
renderVis();
|
688 |
+
});
|
689 |
+
}
|
690 |
+
|
691 |
+
function renderVis() {
|
692 |
+
|
693 |
+
// Load parameters
|
694 |
+
const attnData = config.attention[config.filter];
|
695 |
+
const leftText = attnData.left_text;
|
696 |
+
const rightText = attnData.right_text;
|
697 |
+
|
698 |
+
// Select attention for given layer
|
699 |
+
const layerAttention = attnData.attn[config.layer_seq];
|
700 |
+
|
701 |
+
// Clear vis
|
702 |
+
$('#'+config.rootDivId+' #vis').empty();
|
703 |
+
|
704 |
+
// Determine size of visualization
|
705 |
+
const height = Math.max(leftText.length, rightText.length) * BOXHEIGHT + TEXT_TOP;
|
706 |
+
const svg = d3.select('#'+ config.rootDivId +' #vis')
|
707 |
+
.append('svg')
|
708 |
+
.attr("width", "100%")
|
709 |
+
.attr("height", height + "px");
|
710 |
+
|
711 |
+
// Display tokens on left and right side of visualization
|
712 |
+
renderText(svg, leftText, true, layerAttention, 0);
|
713 |
+
renderText(svg, rightText, false, layerAttention, MATRIX_WIDTH + BOXWIDTH);
|
714 |
+
|
715 |
+
// Render attention arcs
|
716 |
+
renderAttention(svg, layerAttention);
|
717 |
+
|
718 |
+
// Draw squares at top of visualization, one for each head
|
719 |
+
drawCheckboxes(0, svg, layerAttention);
|
720 |
+
}
|
721 |
+
|
function renderText(svg, text, isLeft, attention, leftPos) {

    const textContainer = svg.append("svg:g")
        .attr("id", isLeft ? "left" : "right");

    // Add attention highlights superimposed over words
    textContainer.append("g")
        .classed("attentionBoxes", true)
        .selectAll("g")
        .data(attention)
        .enter()
        .append("g")
        .attr("head-index", (d, i) => i)
        .selectAll("rect")
        .data(d => isLeft ? d : transpose(d)) // for right text, transpose attention to get right-to-left weights
        .enter()
        .append("rect")
        .attr("x", function () {
            const headIndex = +this.parentNode.getAttribute("head-index");
            return leftPos + boxOffsets(headIndex);
        })
        .attr("y", BOXHEIGHT)
        .attr("width", BOXWIDTH / activeHeads())
        .attr("height", BOXHEIGHT)
        .attr("fill", function () {
            return headColors(+this.parentNode.getAttribute("head-index"));
        })
        .style("opacity", 0.0);

    const tokenContainer = textContainer.append("g").selectAll("g")
        .data(text)
        .enter()
        .append("g");

    // Add gray background that appears when hovering over text
    tokenContainer.append("rect")
        .classed("background", true)
        .style("opacity", 0.0)
        .attr("fill", "lightgray")
        .attr("x", leftPos)
        .attr("y", (d, i) => TEXT_TOP + i * BOXHEIGHT)
        .attr("width", BOXWIDTH)
        .attr("height", BOXHEIGHT);

    // Add token text
    const textEl = tokenContainer.append("text")
        .text(d => d)
        .attr("font-size", TEXT_SIZE + "px")
        .style("cursor", "default")
        .style("-webkit-user-select", "none")
        .attr("x", leftPos)
        .attr("y", (d, i) => TEXT_TOP + i * BOXHEIGHT);

    if (isLeft) {
        textEl.style("text-anchor", "end")
            .attr("dx", BOXWIDTH - 0.5 * TEXT_SIZE)
            .attr("dy", TEXT_SIZE);
    } else {
        textEl.style("text-anchor", "start")
            .attr("dx", +0.5 * TEXT_SIZE)
            .attr("dy", TEXT_SIZE);
    }

    tokenContainer.on("mouseover", function (d, index) {

        // Show gray background for the moused-over token
        textContainer.selectAll(".background")
            .style("opacity", (d, i) => i === index ? 1.0 : 0.0);

        // Reset visibility attribute for any previously highlighted attention arcs
        svg.select("#attention")
            .selectAll("line[visibility='visible']")
            .attr("visibility", null);

        // Hide group containing attention arcs
        svg.select("#attention").attr("visibility", "hidden");

        // Make visible only the attention arcs for the hovered token
        if (isLeft) {
            svg.select("#attention").selectAll("line[left-token-index='" + index + "']").attr("visibility", "visible");
        } else {
            svg.select("#attention").selectAll("line[right-token-index='" + index + "']").attr("visibility", "visible");
        }

        // Update color boxes superimposed over tokens on the opposite side
        const id = isLeft ? "right" : "left";
        const sidePos = isLeft ? MATRIX_WIDTH + BOXWIDTH : 0;
        svg.select("#" + id)
            .selectAll(".attentionBoxes")
            .selectAll("g")
            .attr("head-index", (d, i) => i)
            .selectAll("rect")
            .attr("x", function () {
                const headIndex = +this.parentNode.getAttribute("head-index");
                return sidePos + boxOffsets(headIndex);
            })
            .attr("y", (d, i) => TEXT_TOP + i * BOXHEIGHT)
            .attr("width", BOXWIDTH / activeHeads())
            .attr("height", BOXHEIGHT)
            .style("opacity", function (d) {
                const headIndex = +this.parentNode.getAttribute("head-index");
                if (config.headVis[headIndex] && d) {
                    return d[index];
                }
                return 0.0;
            });
    });

    textContainer.on("mouseleave", function () {

        // Unhighlight the selected token
        d3.select(this).selectAll(".background")
            .style("opacity", 0.0);

        // Reset visibility attributes for previously selected lines
        svg.select("#attention")
            .selectAll("line[visibility='visible']")
            .attr("visibility", null);
        svg.select("#attention").attr("visibility", "visible");

        // Reset highlights superimposed over tokens
        svg.selectAll(".attentionBoxes")
            .selectAll("g")
            .selectAll("rect")
            .style("opacity", 0.0);
    });
}

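// Hypothetical worked example, not part of the original file: on hover, each
// highlight rect reads the weight linking its own token to the hovered token.
// With the head matrix below, hovering left token 1 sets the highlight
// opacities over the right-hand tokens to [0.2, 0.8]; hovering a right token
// reads a column of the same matrix via transpose().
const exampleHeadMatrix = [[0.9, 0.1], [0.2, 0.8]];                        // [left][right]
const exampleHoveredLeftIndex = 1;
const exampleRightOpacities = exampleHeadMatrix[exampleHoveredLeftIndex];  // [0.2, 0.8]
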
function renderAttention(svg, attention) {

    // Remove previous DOM elements
    svg.select("#attention").remove();

    // Add new elements
    svg.append("g")
        .attr("id", "attention") // Container for all attention arcs
        .selectAll(".headAttention")
        .data(attention)
        .enter()
        .append("g")
        .classed("headAttention", true) // Group attention arcs by head
        .attr("head-index", (d, i) => i)
        .selectAll(".tokenAttention")
        .data(d => d)
        .enter()
        .append("g")
        .classed("tokenAttention", true) // Group attention arcs by left token
        .attr("left-token-index", (d, i) => i)
        .selectAll("line")
        .data(d => d)
        .enter()
        .append("line")
        .attr("x1", BOXWIDTH)
        .attr("y1", function () {
            const leftTokenIndex = +this.parentNode.getAttribute("left-token-index");
            return TEXT_TOP + leftTokenIndex * BOXHEIGHT + (BOXHEIGHT / 2);
        })
        .attr("x2", BOXWIDTH + MATRIX_WIDTH)
        .attr("y2", (d, rightTokenIndex) => TEXT_TOP + rightTokenIndex * BOXHEIGHT + (BOXHEIGHT / 2))
        .attr("stroke-width", 2)
        .attr("stroke", function () {
            const headIndex = +this.parentNode.parentNode.getAttribute("head-index");
            return headColors(headIndex);
        })
        .attr("left-token-index", function () {
            return +this.parentNode.getAttribute("left-token-index");
        })
        .attr("right-token-index", (d, i) => i);

    updateAttention(svg);
}

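// Hypothetical worked example, not part of the original file: the arc
// endpoints computed above, with made-up layout constants (the real BOXWIDTH
// etc. are defined earlier in this file).
const exBoxWidth = 100, exMatrixWidth = 200, exTextTop = 30, exBoxHeight = 25;
const exLeftIdx = 0, exRightIdx = 1;
const exX1 = exBoxWidth;                                             // 100: right edge of left-hand text
const exY1 = exTextTop + exLeftIdx * exBoxHeight + exBoxHeight / 2;  // 42.5: centre of left token row
const exX2 = exBoxWidth + exMatrixWidth;                             // 300: left edge of right-hand text
const exY2 = exTextTop + exRightIdx * exBoxHeight + exBoxHeight / 2; // 67.5: centre of right token row
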
function updateAttention(svg) {
    svg.select("#attention")
        .selectAll("line")
        .attr("stroke-opacity", function (d) {
            const headIndex = +this.parentNode.parentNode.getAttribute("head-index");
            // If the head is selected, set opacity to the attention weight
            // divided by the number of active heads; otherwise hide the arc
            if (config.headVis[headIndex]) {
                return d / activeHeads();
            } else {
                return 0.0;
            }
        });
}

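// Hypothetical worked example, not part of the original file: dividing each
// weight by activeHeads() keeps overlapping arcs readable. With 4 visible
// heads and a weight of 0.8, the arc is drawn at opacity 0.2.
const exampleArcOpacity = 0.8 / 4;   // 0.2
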
function boxOffsets(i) {
    // Count the visible heads with an index lower than i
    const numHeadsAbove = config.headVis.reduce(
        function (acc, val, idx) {
            return val && idx < i ? acc + 1 : acc;
        }, 0);
    return numHeadsAbove * (BOXWIDTH / activeHeads());
}

function activeHeads() {
    return config.headVis.reduce(function (acc, val) {
        return val ? acc + 1 : acc;
    }, 0);
}

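// Hypothetical worked example, not part of the original file: with
// headVis = [true, false, true, true] and a made-up BOXWIDTH of 100,
// activeHeads() is 3, each visible head's stripe is 100 / 3 px wide, and
// boxOffsets(2) counts the single visible head before index 2, giving an
// offset of 1 * (100 / 3) px.
const exHeadVis = [true, false, true, true];
const exActive = exHeadVis.filter(Boolean).length;   // 3
const exOffsetForHead2 = exHeadVis.slice(0, 2).filter(Boolean).length * (100 / exActive);
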
function drawCheckboxes(top, svg) {
    const checkboxContainer = svg.append("g");
    const checkbox = checkboxContainer.selectAll("rect")
        .data(config.headVis)
        .enter()
        .append("rect")
        .attr("fill", (d, i) => headColors(i))
        .attr("x", (d, i) => i * CHECKBOX_SIZE)
        .attr("y", top)
        .attr("width", CHECKBOX_SIZE)
        .attr("height", CHECKBOX_SIZE);

    function updateCheckboxes() {
        checkboxContainer.selectAll("rect")
            .data(config.headVis)
            .attr("fill", (d, i) => d ? headColors(i) : lighten(headColors(i)));
    }

    updateCheckboxes();

    checkbox.on("click", function (d, i) {
        if (config.headVis[i] && activeHeads() === 1) return;
        config.headVis[i] = !config.headVis[i];
        updateCheckboxes();
        updateAttention(svg);
    });

    checkbox.on("dblclick", function (d, i) {
        // If we double-click on the only active head, reset all heads to visible
        if (config.headVis[i] && activeHeads() === 1) {
            config.headVis = new Array(config.nHeads).fill(true);
        } else {
            config.headVis = new Array(config.nHeads).fill(false);
            config.headVis[i] = true;
        }
        updateCheckboxes();
        updateAttention(svg);
    });
}

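// Illustrative sketch, not part of the original file: the single-click rule
// above restated as a pure function (the real handler mutates config.headVis
// in place). The last visible head can never be hidden.
function exampleToggleHead(headVis, i) {
    const active = headVis.filter(Boolean).length;
    if (headVis[i] && active === 1) return headVis;   // refuse to hide the last head
    const next = headVis.slice();
    next[i] = !next[i];
    return next;
}
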
function lighten(color) {
    const c = d3.hsl(color);
    const increment = (1 - c.l) * 0.6;
    c.l += increment;
    c.s -= increment;
    return c;
}

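// Illustrative call, not part of the original file: lighten() moves a colour
// 60% of the way to full lightness and desaturates it by the same amount,
// which is how deselected head checkboxes are faded above. The input colour
// is a made-up example.
const exampleFaded = lighten('#1f77b4');   // a d3.hsl colour with higher l, lower s
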
function transpose(mat) {
    return mat[0].map(function (col, i) {
        return mat.map(function (row) {
            return row[i];
        });
    });
}

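// Illustrative call, not part of the original file: transpose() flips a
// [left][right] attention matrix into [right][left], which renderText uses
// when laying out the right-hand tokens.
const exampleTransposed = transpose([[0.9, 0.1], [0.2, 0.8]]);   // [[0.9, 0.2], [0.1, 0.8]]
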
}

}

requirements.txt
ADDED
@@ -0,0 +1,4 @@
bertviz
jupyter
scikit-learn
faiss-cpu