jskim committed on
Commit
091bb76
1 Parent(s): 0532283

visualizing more direct information upfront. leaving interactive parts as the next step.

Browse files
Files changed (2) hide show
  1. app.py +186 -19
  2. score.py +42 -6
app.py CHANGED
@@ -43,7 +43,7 @@ def get_similar_paper(
43
  name, papers = get_text_from_author_id(author_id_input)
44
 
45
  # Compute Doc-level affinity scores for the Papers
46
- print('computing scores...')
47
  # TODO detect duplicate papers?
48
  titles, abstracts, doc_scores = compute_document_score(
49
  doc_model,
@@ -72,40 +72,77 @@ def get_similar_paper(
72
  start = time.time()
73
  input_sentences = sent_tokenize(abstract_text_input)
74
  num_sents = len(input_sentences)
 
 
75
  for aa, (tt, ab, ds) in enumerate(zip(titles, abstracts, doc_scores)):
76
  # Compute sent-level and phrase-level affinity scores for each papers
77
- sent_ids, sent_scores, info = get_highlight_info(
78
  sent_model,
79
  abstract_text_input,
80
  ab,
81
  K=2
82
  )
83
-
84
- word_scores = dict()
85
 
86
- # different highlights for each input sentence
 
87
  for i in range(num_sents):
88
  word_scores[str(i)] = {
89
  "original": ab,
90
  "interpretation": list(zip(info['all_words'], info[i]['scores']))
91
- } # format to feed to for Gradio Interpretation component
92
 
93
  tmp[display_title[aa]] = {
94
  'title': tt,
95
  'abstract': ab,
96
  'doc_score': ds,
97
  'source_sentences': input_sentences,
98
- 'highlight': word_scores
 
99
  }
100
- pickle.dump(tmp, open('info.pkl', 'wb')) # TODO better ways of saving intermediate results?
 
 
101
  end = time.time()
102
- print('done in [%0.2f] seconds'%(end - start))
 
 
 
 
 
 
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  return (
105
- gr.update(choices=display_title, interactive=True, visible=True), # set of papers
106
- gr.update(choices=input_sentences, interactive=True, visible=True), # submission sentences
107
- gr.update(visible=True), # title row
108
- gr.update(visible=True), # abstract row
109
  )
110
 
111
  def update_name(author_id_input):
@@ -147,6 +184,7 @@ with gr.Blocks() as demo:
147
  # Text description about the app and disclaimer
148
  ### TEXT Description
149
  # TODO add instruction video link
 
150
  gr.Markdown(
151
  """
152
  # Paper Matching Helper
@@ -186,9 +224,93 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
186
  author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
187
  with gr.Row():
188
  compute_btn = gr.Button('What Makes This a Good Match?')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  ### PAPER INFORMATION
191
-
192
  # show multiple papers in radio check box to select from
193
  with gr.Row():
194
  selected_papers_radio = gr.Radio(
@@ -205,9 +327,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
205
  affinity= gr.Number(label='Affinity', interactive=False, value=0)
206
  with gr.Row():
207
  paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
208
-
209
- ## TODO consider adding more direct information feeding to the users before giving them options for interactions.
210
-
211
  ### RELEVANT PARTS (HIGHLIGHTS)
212
  with gr.Row():
213
  with gr.Column(scale=2): # text from submission
@@ -221,7 +341,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
221
 
222
  ### EVENT LISTENERS
223
 
224
- # retrieve similar papers
225
  compute_btn.click(
226
  fn=get_similar_paper,
227
  inputs=[
@@ -229,13 +349,60 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
229
  pdf_file_input,
230
  author_id_input
231
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  outputs=[
233
  selected_papers_radio,
234
  source_sentences,
235
  title_row,
236
  paper_abstract
237
  ]
238
- )
239
 
240
  # change highlight based on selected sentences from submission
241
  source_sentences.change(
 
43
  name, papers = get_text_from_author_id(author_id_input)
44
 
45
  # Compute Doc-level affinity scores for the Papers
46
+ print('computing document scores...')
47
  # TODO detect duplicate papers?
48
  titles, abstracts, doc_scores = compute_document_score(
49
  doc_model,
 
72
  start = time.time()
73
  input_sentences = sent_tokenize(abstract_text_input)
74
  num_sents = len(input_sentences)
75
+
76
+ summary_info = dict() # elements to visualize upfront
77
  for aa, (tt, ab, ds) in enumerate(zip(titles, abstracts, doc_scores)):
78
  # Compute sent-level and phrase-level affinity scores for each papers
79
+ sent_ids, sent_scores, info, top_pairs_info = get_highlight_info(
80
  sent_model,
81
  abstract_text_input,
82
  ab,
83
  K=2
84
  )
 
 
85
 
86
+ # get scores for each word in the format for Gradio Interpretation component
87
+ word_scores = dict()
88
  for i in range(num_sents):
89
  word_scores[str(i)] = {
90
  "original": ab,
91
  "interpretation": list(zip(info['all_words'], info[i]['scores']))
92
+ }
93
 
94
  tmp[display_title[aa]] = {
95
  'title': tt,
96
  'abstract': ab,
97
  'doc_score': ds,
98
  'source_sentences': input_sentences,
99
+ 'highlight': word_scores,
100
+ 'top_pairs': top_pairs_info
101
  }
102
+
103
+ # TODO better ways of saving intermediate results? user identifiers per session?
104
+ pickle.dump(tmp, open('info.pkl', 'wb'))
105
  end = time.time()
106
+ print('done in [%0.2f] seconds'%(end - start))
107
+
108
+ # set up elements to show
109
+ out = [
110
+ gr.update(choices=display_title, interactive=True, visible=False), # set of papers (radio)
111
+ gr.update(choices=input_sentences, interactive=True, visible=False) # submission sentences
112
+ ]
113
 
114
+ # set up elements to visualize upfront
115
+ top_papers_show = 3 # number of top papers to show upfront
116
+ top_num_info_show = 2 # number of sentence pairs from each paper to show upfront
117
+ summary_out = []
118
+ for i in range(top_papers_show):
119
+ out_tmp = [
120
+ gr.update(value=titles[i], visible=True),
121
+ gr.update(value=doc_scores[i], visible=True)
122
+ ]
123
+ tp = tmp[display_title[i]]['top_pairs']
124
+ for j in range(top_num_info_show):
125
+ out_tmp += [
126
+ gr.update(value=tp[j]['score'], visible=True),
127
+ tp[j]['query']['original'],
128
+ tp[j]['query'],
129
+ tp[j]['candidate']['original'],
130
+ tp[j]['candidate']
131
+ ]
132
+ summary_out += out_tmp
133
+
134
+ # add updates to the show more button
135
+ out = out + summary_out + [gr.update(visible=True)] # show more button
136
+ assert(len(out) == (top_num_info_show * 5 + 2) * top_papers_show + 3)
137
+
138
+ return tuple(out)
139
+
140
+ def show_more():
141
  return (
142
+ gr.update(visible=True), # set of papers
143
+ gr.update(visible=True), # submission sentences
144
+ gr.update(visible=True), # title row
145
+ gr.update(visible=True), # abstract row
146
  )
147
 
148
  def update_name(author_id_input):
 
184
  # Text description about the app and disclaimer
185
  ### TEXT Description
186
  # TODO add instruction video link
187
+ # TODO update instruction based on new changes
188
  gr.Markdown(
189
  """
190
  # Paper Matching Helper
 
224
  author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
225
  with gr.Row():
226
  compute_btn = gr.Button('What Makes This a Good Match?')
227
+
228
+
229
+ ### OVERVIEW
230
+ # Paper title, score, and top-ranking sentence pairs -- two sentence pairs per paper, three papers
231
+ # TODO blockfy similar components together and simplify
232
+ ## ONE BLOCK OF INFO FOR A SINGLE PAPER
233
+ ## PAPER1
234
+ with gr.Row():
235
+ with gr.Column(scale=3):
236
+ paper_title1 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
237
+ with gr.Column(scale=1):
238
+ affinity1 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
239
+ with gr.Row() as rel1_1:
240
+ with gr.Column(scale=1):
241
+ sent_pair_score1_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
242
+ with gr.Column(scale=4):
243
+ sent_pair_source1_1 = gr.Textbox(label='Sentence from Submission', visible=False)
244
+ sent_pair_source1_1_hl = gr.components.Interpretation(sent_pair_source1_1)
245
+ with gr.Column(scale=4):
246
+ sent_pair_candidate1_1 = gr.Textbox(label='Sentence from Paper', visible=False)
247
+ sent_pair_candidate1_1_hl = gr.components.Interpretation(sent_pair_candidate1_1)
248
+ with gr.Row() as rel1_2:
249
+ with gr.Column(scale=1):
250
+ sent_pair_score1_2 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
251
+ with gr.Column(scale=4):
252
+ sent_pair_source1_2 = gr.Textbox(label='Sentence from Submission', visible=False)
253
+ sent_pair_source1_2_hl = gr.components.Interpretation(sent_pair_source1_2)
254
+ with gr.Column(scale=4):
255
+ sent_pair_candidate1_2 = gr.Textbox(label='Sentence from Paper', visible=False)
256
+ sent_pair_candidate1_2_hl = gr.components.Interpretation(sent_pair_candidate1_2)
257
+
258
+ ## PAPER 2
259
+ with gr.Row():
260
+ with gr.Column(scale=3):
261
+ paper_title2 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
262
+ with gr.Column(scale=1):
263
+ affinity2 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
264
+ with gr.Row() as rel2_1:
265
+ with gr.Column(scale=1):
266
+ sent_pair_score2_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
267
+ with gr.Column(scale=4):
268
+ sent_pair_source2_1 = gr.Textbox(label='Sentence from Submission', visible=False)
269
+ sent_pair_source2_1_hl = gr.components.Interpretation(sent_pair_source2_1)
270
+ with gr.Column(scale=4):
271
+ sent_pair_candidate2_1 = gr.Textbox(label='Sentence from Submission', visible=False)
272
+ sent_pair_candidate2_1_hl = gr.components.Interpretation(sent_pair_candidate2_1)
273
+ with gr.Row() as rel2_2:
274
+ with gr.Column(scale=1):
275
+ sent_pair_score2_2 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
276
+ with gr.Column(scale=4):
277
+ sent_pair_source2_2 = gr.Textbox(label='Sentence from Submission', visible=False)
278
+ sent_pair_source2_2_hl = gr.components.Interpretation(sent_pair_source2_2)
279
+ with gr.Column(scale=4):
280
+ sent_pair_candidate2_2 = gr.Textbox(label='Sentence from Submission', visible=False)
281
+ sent_pair_candidate2_2_hl = gr.components.Interpretation(sent_pair_candidate2_2)
282
+
283
+ ## PAPER 3
284
+ with gr.Row():
285
+ with gr.Column(scale=3):
286
+ paper_title3 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
287
+ with gr.Column(scale=1):
288
+ affinity3 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
289
+ with gr.Row() as rel3_1:
290
+ with gr.Column(scale=1):
291
+ sent_pair_score3_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
292
+ with gr.Column(scale=4):
293
+ sent_pair_source3_1 = gr.Textbox(label='Sentence from Submission', visible=False)
294
+ sent_pair_source3_1_hl = gr.components.Interpretation(sent_pair_source3_1)
295
+ with gr.Column(scale=4):
296
+ sent_pair_candidate3_1 = gr.Textbox(label='Sentence from Submission', visible=False)
297
+ sent_pair_candidate3_1_hl = gr.components.Interpretation(sent_pair_candidate3_1)
298
+ with gr.Row() as rel3_2:
299
+ with gr.Column(scale=1):
300
+ sent_pair_score3_2 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
301
+ with gr.Column(scale=4):
302
+ sent_pair_source3_2 = gr.Textbox(label='Sentence from Submission', visible=False)
303
+ sent_pair_source3_2_hl = gr.components.Interpretation(sent_pair_source3_2)
304
+ with gr.Column(scale=4):
305
+ sent_pair_candidate3_2 = gr.Textbox(label='Sentence from Submission', visible=False)
306
+ sent_pair_candidate3_2_hl = gr.components.Interpretation(sent_pair_candidate3_2)
307
+
308
+ ## Show more button
309
+ with gr.Row():
310
+ see_more_rel_btn = gr.Button('See more relevant parts from papers', visible=False)
311
 
312
  ### PAPER INFORMATION
313
+
314
  # show multiple papers in radio check box to select from
315
  with gr.Row():
316
  selected_papers_radio = gr.Radio(
 
327
  affinity= gr.Number(label='Affinity', interactive=False, value=0)
328
  with gr.Row():
329
  paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
330
+
 
 
331
  ### RELEVANT PARTS (HIGHLIGHTS)
332
  with gr.Row():
333
  with gr.Column(scale=2): # text from submission
 
341
 
342
  ### EVENT LISTENERS
343
 
344
+ # retrieve similar papers and show top results
345
  compute_btn.click(
346
  fn=get_similar_paper,
347
  inputs=[
 
349
  pdf_file_input,
350
  author_id_input
351
  ],
352
+ outputs=[
353
+ selected_papers_radio,
354
+ source_sentences,
355
+ paper_title1, # paper info
356
+ affinity1,
357
+ sent_pair_score1_1,
358
+ sent_pair_source1_1,
359
+ sent_pair_source1_1_hl,
360
+ sent_pair_candidate1_1,
361
+ sent_pair_candidate1_1_hl,
362
+ sent_pair_score1_2,
363
+ sent_pair_source1_2,
364
+ sent_pair_source1_2_hl,
365
+ sent_pair_candidate1_2,
366
+ sent_pair_candidate1_2_hl,
367
+ paper_title2,
368
+ affinity2,
369
+ sent_pair_score2_1,
370
+ sent_pair_source2_1,
371
+ sent_pair_source2_1_hl,
372
+ sent_pair_candidate2_1,
373
+ sent_pair_candidate2_1_hl,
374
+ sent_pair_score2_2,
375
+ sent_pair_source2_2,
376
+ sent_pair_source2_2_hl,
377
+ sent_pair_candidate2_2,
378
+ sent_pair_candidate2_2_hl,
379
+ paper_title3,
380
+ affinity3,
381
+ sent_pair_score3_1,
382
+ sent_pair_source3_1,
383
+ sent_pair_source3_1_hl,
384
+ sent_pair_candidate3_1,
385
+ sent_pair_candidate3_1_hl,
386
+ sent_pair_score3_2,
387
+ sent_pair_source3_2,
388
+ sent_pair_source3_2_hl,
389
+ sent_pair_candidate3_2,
390
+ sent_pair_candidate3_2_hl,
391
+ see_more_rel_btn
392
+ ]
393
+ )
394
+
395
+ # Get more info (move to more interactive portion)
396
+ see_more_rel_btn.click(
397
+ fn=show_more,
398
+ inputs=None,
399
  outputs=[
400
  selected_papers_radio,
401
  source_sentences,
402
  title_row,
403
  paper_abstract
404
  ]
405
+ )
406
 
407
  # change highlight based on selected sentences from submission
408
  source_sentences.change(
score.py CHANGED
@@ -6,7 +6,6 @@ import numpy as np
6
  import tqdm
7
 
8
  def compute_sentencewise_scores(model, query_sents, candidate_sents):
9
- # TODO make this more general for different types of models
10
  # list of sentences from query and candidate
11
  q_v, c_v = get_embedding(model, query_sents, candidate_sents)
12
 
@@ -74,8 +73,10 @@ def get_match_phrase(w1, w2, method='pos'):
74
  pos2 = pos_tag(w2)
75
  for i, (w, p) in enumerate(pos2):
76
  if w.lower() in w1 and p in include:
 
77
  mask2[i] = 1
78
- return mask2
 
79
 
80
  def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
81
  """
@@ -102,12 +103,12 @@ def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scor
102
  sent_range = (sent_start_id[sid], sent_start_id[sid+1])
103
  is_selected_sent[sent_range[0]:sent_range[1]] = 1
104
  word_scores[sent_range[0]:sent_range[1]] = sscore
105
- is_selected_phrase[sent_range[0]:sent_range[1]] = \
106
  get_match_phrase(query_words, all_words[sent_range[0]:sent_range[1]])
107
  else:
108
  is_selected_sent[sent_start_id[sid]:] = 1
109
  word_scores[sent_start_id[sid]:] = sscore
110
- is_selected_phrase[sent_start_id[sid]:] = \
111
  get_match_phrase(query_words, all_words[sent_start_id[sid]:])
112
 
113
  # update selected phrase scores (-1 meaning a different color in gradio)
@@ -135,7 +136,42 @@ def get_highlight_info(model, text1, text2, K=None):
135
  words2, all_words2, sent_start_id2 = get_words(sent2)
136
  info = mark_words(sent1, words2, all_words2, sent_start_id2, sent_ids, sent_scores)
137
 
138
- return sent_ids, sent_scores, info
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
  ### Document-level operations
141
 
@@ -194,4 +230,4 @@ def compute_document_score(doc_model, tokenizer, query, papers, batch=5):
194
  abstracts_sorted = [abstracts[x] for x in idx_sorted]
195
  scores_sorted = [scores[x] for x in idx_sorted]
196
 
197
- return titles_sorted, abstracts_sorted, scores_sorted
 
6
  import tqdm
7
 
8
  def compute_sentencewise_scores(model, query_sents, candidate_sents):
 
9
  # list of sentences from query and candidate
10
  q_v, c_v = get_embedding(model, query_sents, candidate_sents)
11
 
 
73
  pos2 = pos_tag(w2)
74
  for i, (w, p) in enumerate(pos2):
75
  if w.lower() in w1 and p in include:
76
+ j = w1.index(w.lower())
77
  mask2[i] = 1
78
+ mask1[j] = 1
79
+ return mask1, mask2
80
 
81
  def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
82
  """
 
103
  sent_range = (sent_start_id[sid], sent_start_id[sid+1])
104
  is_selected_sent[sent_range[0]:sent_range[1]] = 1
105
  word_scores[sent_range[0]:sent_range[1]] = sscore
106
+ _, is_selected_phrase[sent_range[0]:sent_range[1]] = \
107
  get_match_phrase(query_words, all_words[sent_range[0]:sent_range[1]])
108
  else:
109
  is_selected_sent[sent_start_id[sid]:] = 1
110
  word_scores[sent_start_id[sid]:] = sscore
111
+ _, is_selected_phrase[sent_start_id[sid]:] = \
112
  get_match_phrase(query_words, all_words[sent_start_id[sid]:])
113
 
114
  # update selected phrase scores (-1 meaning a different color in gradio)
 
136
  words2, all_words2, sent_start_id2 = get_words(sent2)
137
  info = mark_words(sent1, words2, all_words2, sent_start_id2, sent_ids, sent_scores)
138
 
139
+ # get top sentence pairs from the query and candidate (score, index_pair)
140
+ top_pair_num = 5
141
+ top_pairs = []
142
+ ii = np.unravel_index(np.argsort(np.array(sent_scores).ravel())[-top_pair_num:], sent_scores.shape)
143
+ for i, j in zip(ii[0][::-1], ii[1][::-1]):
144
+ score = sent_scores[i,j]
145
+ index_pair = (i, sent_ids[i,j].item())
146
+ top_pairs.append((score, index_pair)) # list of (score, (sent_id_query, sent_id_candidate))
147
+
148
+ # convert top_pairs to corresponding highlights format for Gradio Interpretation component
149
+ top_pairs_info = dict()
150
+ count = 0
151
+ for s, (sidq, sidc) in top_pairs:
152
+ q_sent = sent1[sidq]
153
+ c_sent = sent2[sidc]
154
+ q_words = word_tokenize(q_sent)
155
+ c_words = word_tokenize(c_sent)
156
+ mask1, mask2 = get_match_phrase(q_words, c_words)
157
+ mask1 *= -1 # mark matching phrases as blue
158
+ mask2 *= -1
159
+ assert(len(mask1) == len(q_words) and len(mask2) == len(c_words))
160
+ top_pairs_info[count] = {
161
+ 'query': {
162
+ 'original': q_sent,
163
+ 'interpretation': list(zip(q_words, mask1))
164
+ },
165
+ 'candidate': {
166
+ 'original': c_sent,
167
+ 'interpretation': list(zip(c_words, mask2))
168
+ },
169
+ 'score': s,
170
+ 'sent_idx': (sidq, sidc)
171
+ }
172
+ count += 1
173
+
174
+ return sent_ids, sent_scores, info, top_pairs_info
175
 
176
  ### Document-level operations
177
 
 
230
  abstracts_sorted = [abstracts[x] for x in idx_sorted]
231
  scores_sorted = [scores[x] for x in idx_sorted]
232
 
233
+ return titles_sorted, abstracts_sorted, scores_sorted