Visualizing more direct information upfront; leaving interactive parts as the next step.
app.py
CHANGED
@@ -43,7 +43,7 @@ def get_similar_paper(
     name, papers = get_text_from_author_id(author_id_input)
 
     # Compute Doc-level affinity scores for the Papers
-    print('computing scores...')
+    print('computing document scores...')
     # TODO detect duplicate papers?
     titles, abstracts, doc_scores = compute_document_score(
         doc_model,
@@ -72,40 +72,77 @@ def get_similar_paper(
     start = time.time()
     input_sentences = sent_tokenize(abstract_text_input)
     num_sents = len(input_sentences)
+
+    summary_info = dict() # elements to visualize upfront
     for aa, (tt, ab, ds) in enumerate(zip(titles, abstracts, doc_scores)):
         # Compute sent-level and phrase-level affinity scores for each papers
-        sent_ids, sent_scores, info = get_highlight_info(
+        sent_ids, sent_scores, info, top_pairs_info = get_highlight_info(
             sent_model,
             abstract_text_input,
             ab,
             K=2
         )
-
-        word_scores = dict()
 
-        #
+        # get scores for each word in the format for Gradio Interpretation component
+        word_scores = dict()
         for i in range(num_sents):
             word_scores[str(i)] = {
                 "original": ab,
                 "interpretation": list(zip(info['all_words'], info[i]['scores']))
-            }
+            }
 
         tmp[display_title[aa]] = {
             'title': tt,
             'abstract': ab,
             'doc_score': ds,
             'source_sentences': input_sentences,
-            'highlight': word_scores
+            'highlight': word_scores,
+            'top_pairs': top_pairs_info
         }
-
+
+    # TODO better ways of saving intermediate results? user identifiers per session?
+    pickle.dump(tmp, open('info.pkl', 'wb'))
     end = time.time()
-    print('done in [%0.2f] seconds'%(end - start))
+    print('done in [%0.2f] seconds'%(end - start))
+
+    # set up elements to show
+    out = [
+        gr.update(choices=display_title, interactive=True, visible=False), # set of papers (radio)
+        gr.update(choices=input_sentences, interactive=True, visible=False) # submission sentences
+    ]
 
+    # set up elements to visualize upfront
+    top_papers_show = 3 # number of top papers to show upfront
+    top_num_info_show = 2 # number of sentence pairs from each paper to show upfront
+    summary_out = []
+    for i in range(top_papers_show):
+        out_tmp = [
+            gr.update(value=titles[i], visible=True),
+            gr.update(value=doc_scores[i], visible=True)
+        ]
+        tp = tmp[display_title[i]]['top_pairs']
+        for j in range(top_num_info_show):
+            out_tmp += [
+                gr.update(value=tp[j]['score'], visible=True),
+                tp[j]['query']['original'],
+                tp[j]['query'],
+                tp[j]['candidate']['original'],
+                tp[j]['candidate']
+            ]
+        summary_out += out_tmp
+
+    # add updates to the show more button
+    out = out + summary_out + [gr.update(visible=True)] # show more button
+    assert(len(out) == (top_num_info_show * 5 + 2) * top_papers_show + 3)
+
+    return tuple(out)
+
+def show_more():
     return (
-        gr.update(
-        gr.update(
-        gr.update(visible=True),
-        gr.update(visible=True),
+        gr.update(visible=True), # set of papers
+        gr.update(visible=True), # submission sentences
+        gr.update(visible=True), # title row
+        gr.update(visible=True), # abstract row
     )
 
 def update_name(author_id_input):
@@ -147,6 +184,7 @@ with gr.Blocks() as demo:
     # Text description about the app and disclaimer
     ### TEXT Description
     # TODO add instruction video link
+    # TODO udpate instruction based on new changes
     gr.Markdown(
         """
 # Paper Matching Helper
@@ -186,9 +224,93 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
     author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
     with gr.Row():
         compute_btn = gr.Button('What Makes This a Good Match?')
+
+
+    ### OVERVIEW
+    # Paper title, score, and top-ranking sentence pairs -- two sentence pairs per paper, three papers
+    # TODO blockfy similar components together and simplify
+    ## ONE BLOCK OF INFO FOR A SINGLE PAPER
+    ## PAPER1
+    with gr.Row():
+        with gr.Column(scale=3):
+            paper_title1 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
+        with gr.Column(scale=1):
+            affinity1 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
+    with gr.Row() as rel1_1:
+        with gr.Column(scale=1):
+            sent_pair_score1_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
+        with gr.Column(scale=4):
+            sent_pair_source1_1 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_source1_1_hl = gr.components.Interpretation(sent_pair_source1_1)
+        with gr.Column(scale=4):
+            sent_pair_candidate1_1 = gr.Textbox(label='Sentence from Paper', visible=False)
+            sent_pair_candidate1_1_hl = gr.components.Interpretation(sent_pair_candidate1_1)
+    with gr.Row() as rel1_2:
+        with gr.Column(scale=1):
+            sent_pair_score1_2 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
+        with gr.Column(scale=4):
+            sent_pair_source1_2 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_source1_2_hl = gr.components.Interpretation(sent_pair_source1_2)
+        with gr.Column(scale=4):
+            sent_pair_candidate1_2 = gr.Textbox(label='Sentence from Paper', visible=False)
+            sent_pair_candidate1_2_hl = gr.components.Interpretation(sent_pair_candidate1_2)
+
+    ## PAPER 2
+    with gr.Row():
+        with gr.Column(scale=3):
+            paper_title2 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
+        with gr.Column(scale=1):
+            affinity2 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
+    with gr.Row() as rel2_1:
+        with gr.Column(scale=1):
+            sent_pair_score2_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
+        with gr.Column(scale=4):
+            sent_pair_source2_1 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_source2_1_hl = gr.components.Interpretation(sent_pair_source2_1)
+        with gr.Column(scale=4):
+            sent_pair_candidate2_1 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_candidate2_1_hl = gr.components.Interpretation(sent_pair_candidate2_1)
+    with gr.Row() as rel2_2:
+        with gr.Column(scale=1):
+            sent_pair_score2_2 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
+        with gr.Column(scale=4):
+            sent_pair_source2_2 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_source2_2_hl = gr.components.Interpretation(sent_pair_source2_2)
+        with gr.Column(scale=4):
+            sent_pair_candidate2_2 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_candidate2_2_hl = gr.components.Interpretation(sent_pair_candidate2_2)
+
+    ## PAPER 3
+    with gr.Row():
+        with gr.Column(scale=3):
+            paper_title3 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
+        with gr.Column(scale=1):
+            affinity3 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
+    with gr.Row() as rel3_1:
+        with gr.Column(scale=1):
+            sent_pair_score3_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
+        with gr.Column(scale=4):
+            sent_pair_source3_1 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_source3_1_hl = gr.components.Interpretation(sent_pair_source3_1)
+        with gr.Column(scale=4):
+            sent_pair_candidate3_1 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_candidate3_1_hl = gr.components.Interpretation(sent_pair_candidate3_1)
+    with gr.Row() as rel3_2:
+        with gr.Column(scale=1):
+            sent_pair_score3_2 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
+        with gr.Column(scale=4):
+            sent_pair_source3_2 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_source3_2_hl = gr.components.Interpretation(sent_pair_source3_2)
+        with gr.Column(scale=4):
+            sent_pair_candidate3_2 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_candidate3_2_hl = gr.components.Interpretation(sent_pair_candidate3_2)
+
+    ## Show more button
+    with gr.Row():
+        see_more_rel_btn = gr.Button('See more relevant parts from papers', visible=False)
 
     ### PAPER INFORMATION
-
+
     # show multiple papers in radio check box to select from
     with gr.Row():
         selected_papers_radio = gr.Radio(
@@ -205,9 +327,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
             affinity= gr.Number(label='Affinity', interactive=False, value=0)
     with gr.Row():
         paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
-
-    ## TODO consider adding more direct information feeding to the users before giving them options for interactions.
-
+
     ### RELEVANT PARTS (HIGHLIGHTS)
     with gr.Row():
         with gr.Column(scale=2): # text from submission
@@ -221,7 +341,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
 
     ### EVENT LISTENERS
 
-    # retrieve similar papers
+    # retrieve similar papers and show top results
    compute_btn.click(
        fn=get_similar_paper,
        inputs=[
@@ -229,13 +349,60 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
            pdf_file_input,
            author_id_input
        ],
+        outputs=[
+            selected_papers_radio,
+            source_sentences,
+            paper_title1, # paper info
+            affinity1,
+            sent_pair_score1_1,
+            sent_pair_source1_1,
+            sent_pair_source1_1_hl,
+            sent_pair_candidate1_1,
+            sent_pair_candidate1_1_hl,
+            sent_pair_score1_2,
+            sent_pair_source1_2,
+            sent_pair_source1_2_hl,
+            sent_pair_candidate1_2,
+            sent_pair_candidate1_2_hl,
+            paper_title2,
+            affinity2,
+            sent_pair_score2_1,
+            sent_pair_source2_1,
+            sent_pair_source2_1_hl,
+            sent_pair_candidate2_1,
+            sent_pair_candidate2_1_hl,
+            sent_pair_score2_2,
+            sent_pair_source2_2,
+            sent_pair_source2_2_hl,
+            sent_pair_candidate2_2,
+            sent_pair_candidate2_2_hl,
+            paper_title3,
+            affinity3,
+            sent_pair_score3_1,
+            sent_pair_source3_1,
+            sent_pair_source3_1_hl,
+            sent_pair_candidate3_1,
+            sent_pair_candidate3_1_hl,
+            sent_pair_score3_2,
+            sent_pair_source3_2,
+            sent_pair_source3_2_hl,
+            sent_pair_candidate3_2,
+            sent_pair_candidate3_2_hl,
+            see_more_rel_btn
+        ]
+    )
+
+    # Get more info (move to more interactive portion)
+    see_more_rel_btn.click(
+        fn=show_more,
+        inputs=None,
        outputs=[
            selected_papers_radio,
            source_sentences,
            title_row,
            paper_abstract
        ]
-    )
+    )
 
    # change highlight based on selected sentences from submission
    source_sentences.change(
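Note that the flat `out` list built at the end of `get_similar_paper` must line up, position by position, with the `outputs=[...]` list wired to `compute_btn.click`, which is what the `assert` guards. A minimal sketch of that bookkeeping, using the constants from the diff above (`components_per_pair` is a name introduced here for illustration, not from the app):

# Sanity check for the output wiring (my own sketch, not code from the app)
top_papers_show = 3       # papers summarized upfront
top_num_info_show = 2     # sentence pairs shown per paper
components_per_pair = 5   # relevance score, source textbox, source highlight, candidate textbox, candidate highlight
per_paper = top_num_info_show * components_per_pair + 2   # plus title and affinity
total = per_paper * top_papers_show + 2 + 1               # plus the radio/sentence updates and the see-more button
print(total)  # 39, i.e. (top_num_info_show * 5 + 2) * top_papers_show + 3, matching the assert in get_similar_paper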
score.py
CHANGED
@@ -6,7 +6,6 @@ import numpy as np
 import tqdm
 
 def compute_sentencewise_scores(model, query_sents, candidate_sents):
-    # TODO make this more general for different types of models
     # list of sentences from query and candidate
     q_v, c_v = get_embedding(model, query_sents, candidate_sents)
 
@@ -74,8 +73,10 @@ def get_match_phrase(w1, w2, method='pos'):
     pos2 = pos_tag(w2)
     for i, (w, p) in enumerate(pos2):
         if w.lower() in w1 and p in include:
+            j = w1.index(w.lower())
             mask2[i] = 1
-
+            mask1[j] = 1
+    return mask1, mask2
 
 def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
     """
@@ -102,12 +103,12 @@ def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scor
             sent_range = (sent_start_id[sid], sent_start_id[sid+1])
             is_selected_sent[sent_range[0]:sent_range[1]] = 1
             word_scores[sent_range[0]:sent_range[1]] = sscore
-            is_selected_phrase[sent_range[0]:sent_range[1]] = \
+            _, is_selected_phrase[sent_range[0]:sent_range[1]] = \
                 get_match_phrase(query_words, all_words[sent_range[0]:sent_range[1]])
         else:
            is_selected_sent[sent_start_id[sid]:] = 1
            word_scores[sent_start_id[sid]:] = sscore
-            is_selected_phrase[sent_start_id[sid]:] = \
+            _, is_selected_phrase[sent_start_id[sid]:] = \
                get_match_phrase(query_words, all_words[sent_start_id[sid]:])
 
    # update selected phrase scores (-1 meaning a different color in gradio)
@@ -135,7 +136,42 @@ def get_highlight_info(model, text1, text2, K=None):
     words2, all_words2, sent_start_id2 = get_words(sent2)
     info = mark_words(sent1, words2, all_words2, sent_start_id2, sent_ids, sent_scores)
 
-
+    # get top sentence pairs from the query and candidate (score, index_pair)
+    top_pair_num = 5
+    top_pairs = []
+    ii = np.unravel_index(np.argsort(np.array(sent_scores).ravel())[-top_pair_num:], sent_scores.shape)
+    for i, j in zip(ii[0][::-1], ii[1][::-1]):
+        score = sent_scores[i,j]
+        index_pair = (i, sent_ids[i,j].item())
+        top_pairs.append((score, index_pair)) # list of (score, (sent_id_query, sent_id_candidate))
+
+    # convert top_pairs to corresponding highlights format for GRadio Interpretation component
+    top_pairs_info = dict()
+    count = 0
+    for s, (sidq, sidc) in top_pairs:
+        q_sent = sent1[sidq]
+        c_sent = sent2[sidc]
+        q_words = word_tokenize(q_sent)
+        c_words = word_tokenize(c_sent)
+        mask1, mask2 = get_match_phrase(q_words, c_words)
+        mask1 *= -1 # mark matching phrases as blue
+        mask2 *= -1
+        assert(len(mask1) == len(q_words) and len(mask2) == len(c_words))
+        top_pairs_info[count] = {
+            'query': {
+                'original': q_sent,
+                'interpretation': list(zip(q_words, mask1))
+            },
+            'candidate': {
+                'original': c_sent,
+                'interpretation': list(zip(c_words, mask2))
+            },
+            'score': s,
+            'sent_idx': (sidq, sidc)
+        }
+        count += 1
+
+    return sent_ids, sent_scores, info, top_pairs_info
 
 ### Document-level operations
 
@@ -194,4 +230,4 @@ def compute_document_score(doc_model, tokenizer, query, papers, batch=5):
     abstracts_sorted = [abstracts[x] for x in idx_sorted]
     scores_sorted = [scores[x] for x in idx_sorted]
 
-    return titles_sorted, abstracts_sorted, scores_sorted
+    return titles_sorted, abstracts_sorted, scores_sorted
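For reference, each entry of `top_pairs_info` built in `get_highlight_info` pairs one submission sentence with one candidate sentence and carries per-word weights in the `{'original': ..., 'interpretation': [(word, weight), ...]}` shape that the `gr.components.Interpretation` components in app.py consume. A minimal sketch with made-up sentences and values (not model output); the weights here are the phrase-match masks, so they are 0 or -1:

# Example of the top_pairs_info entry format (illustration only, fabricated data)
example_pair = {
    'query': {
        'original': "We study paper matching",
        'interpretation': [("We", 0), ("study", 0), ("paper", -1), ("matching", -1)],
    },
    'candidate': {
        'original': "This paper studies reviewer matching",
        'interpretation': [("This", 0), ("paper", -1), ("studies", 0), ("reviewer", 0), ("matching", -1)],
    },
    'score': 0.83,        # sentence-level affinity for this pair (example value)
    'sent_idx': (0, 1),   # (sentence index in submission, sentence index in candidate abstract)
}
print(example_pair['query']['interpretation'])  # word/weight pairs rendered as highlights by Gradio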
|