Spaces:

jskim
/

paper-matching

Runtime error

App Files Files Community

jskim committed on Mar 10, 2023

Commit

81ca652

1 Parent(s): b5a0311

pdf input removed. now retrieving urls for reviewer papers.

Browse files

Files changed (3) hide show

app.py +27 -26
input_format.py +1 -1
score.py +5 -2

app.py CHANGED Viewed

@@ -25,7 +25,6 @@ sent_model.to(device)
 def get_similar_paper(
     abstract_text_input,
-    pdf_file_input,
     author_id_input,
     results={} # variable will be updated and returned
 ):
@@ -34,19 +33,13 @@ def get_similar_paper(
     start = time.time()
     input_sentences = sent_tokenize(abstract_text_input)
-    # TODO handle pdf file input
-    if pdf_file_input is not None:
-        name = None
-        papers = []
-        raise ValueError('Use submission abstract instead.')
-    else:
-        # Get author papers from id
-        name, papers = get_text_from_author_id(author_id_input)
     # Compute Doc-level affinity scores for the Papers
     print('computing document scores...')
     # TODO detect duplicate papers?
-    titles, abstracts, doc_scores = compute_document_score(
         doc_model,
         tokenizer,
         abstract_text_input,
@@ -57,6 +50,7 @@ def get_similar_paper(
     results = {
         'titles': titles,
         'abstracts': abstracts,
         'doc_scores': doc_scores
     }
@@ -64,6 +58,7 @@ def get_similar_paper(
     titles = titles[:num_papers_show]
     abstracts = abstracts[:num_papers_show]
     doc_scores = doc_scores[:num_papers_show]
     display_title = ['[ %0.3f ] %s'%(s, t) for t, s in zip(titles, doc_scores)]
     end = time.time()
@@ -117,14 +112,15 @@ def get_similar_paper(
     top_num_info_show = 2 # number of sentence pairs from each paper to show upfront
     summary_out = []
     for i in range(top_papers_show):
         out_tmp = [
             gr.update(value=titles[i], visible=True),
-            gr.update(value=doc_scores[i], visible=True)
         ]
         tp = results[display_title[i]]['top_pairs']
         for j in range(top_num_info_show):
             out_tmp += [
-                gr.update(value=tp[j]['score'], visible=True),
                 tp[j]['query']['original'],
                 tp[j]['query'],
                 tp[j]['candidate']['original'],
@@ -133,7 +129,7 @@ def get_similar_paper(
         summary_out += out_tmp
     # add updates to the show more button
-    out = out + summary_out + [gr.update(visible=True)] # show more button
     assert(len(out) == (top_num_info_show * 5 + 2) * top_papers_show + 3)
     # add the search results to pass on to the Gradio State variable
@@ -196,7 +192,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
 - Once the name is confirmed, press the `What Makes This a Good Match?` button.
 - Based on the input information, the tool will first search for similar papers from the reviewer's previous publications using [Semantic Scholar API](https://www.semanticscholar.org/product/api).
 ##### Relevant Parts from Top Papers
-- You will be shown three most relevant papers from the reviewer with high **affinity scores** (ranging from 0 - 1) computed using text representations from a [language model](https://github.com/allenai/specter/tree/master/specter).
 - For each of the papers, we present relevant pieces of information from the submission and the paper: two pairs of (sentence relevance score, sentence from the submission abstract, sentence from the paper abstract)
 - **<span style="color:black;background-color:#5296D5;">Blue highlights</span>** indicate phrases that are included in both sentences.
 ##### More Relevant Parts
@@ -217,8 +213,6 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
     with gr.Row() as input_row:
         with gr.Column():
             abstract_text_input = gr.Textbox(label='Submission Abstract')
-        with gr.Column():
-            pdf_file_input = gr.File(label='OR upload a submission PDF File')
         with gr.Column():
             with gr.Row():
                 author_id_input = gr.Textbox(label='Reviewer Link or ID (Semantic Scholar)')
@@ -226,13 +220,13 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
                 name = gr.Textbox(label='Confirm Reviewer Name', interactive=False)
                 author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
     with gr.Row():
-        compute_btn = gr.Button('What Makes This a Good Match?')
     ### OVERVIEW
     # Paper title, score, and top-ranking sentence pairs -- two sentence pairs per paper, three papers
-    # TODO blockfy similar components together and simplify
     ## ONE BLOCK OF INFO FOR A SINGLE PAPER
     ## PAPER1
     with gr.Row():
         with gr.Column(scale=3):
             paper_title1 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
@@ -256,13 +250,17 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
         with gr.Column(scale=4):
             sent_pair_candidate1_2 = gr.Textbox(label='Sentence from Paper', visible=False)
             sent_pair_candidate1_2_hl = gr.components.Interpretation(sent_pair_candidate1_2)
     ## PAPER 2
     with gr.Row():
         with gr.Column(scale=3):
             paper_title2 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
         with gr.Column(scale=1):
-            affinity2 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
     with gr.Row() as rel2_1:
         with gr.Column(scale=1):
             sent_pair_score2_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
@@ -270,7 +268,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
             sent_pair_source2_1 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_source2_1_hl = gr.components.Interpretation(sent_pair_source2_1)
         with gr.Column(scale=4):
-            sent_pair_candidate2_1 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_candidate2_1_hl = gr.components.Interpretation(sent_pair_candidate2_1)
     with gr.Row() as rel2_2:
         with gr.Column(scale=1):
@@ -279,9 +277,13 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
             sent_pair_source2_2 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_source2_2_hl = gr.components.Interpretation(sent_pair_source2_2)
         with gr.Column(scale=4):
-            sent_pair_candidate2_2 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_candidate2_2_hl = gr.components.Interpretation(sent_pair_candidate2_2)
     ## PAPER 3
     with gr.Row():
         with gr.Column(scale=3):
@@ -295,7 +297,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
             sent_pair_source3_1 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_source3_1_hl = gr.components.Interpretation(sent_pair_source3_1)
         with gr.Column(scale=4):
-            sent_pair_candidate3_1 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_candidate3_1_hl = gr.components.Interpretation(sent_pair_candidate3_1)
     with gr.Row() as rel3_2:
         with gr.Column(scale=1):
@@ -304,7 +306,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
             sent_pair_source3_2 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_source3_2_hl = gr.components.Interpretation(sent_pair_source3_2)
         with gr.Column(scale=4):
-            sent_pair_candidate3_2 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_candidate3_2_hl = gr.components.Interpretation(sent_pair_candidate3_2)
     ## Show more button
@@ -348,7 +350,6 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
         fn=get_similar_paper,
         inputs=[
             abstract_text_input,
-            pdf_file_input,
             author_id_input,
             info
         ],
@@ -437,7 +438,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
     gr.Markdown(
         """
         ---------
-        **Disclaimer.** This tool and its output should not serve as the sole justification for confirming a match for the submission. It is intended as a supplementary tool that the user may use at their discretion; the correctness of the output of the tool is not guaranteed. This may be improved by updating the internal models used to compute the affinity scores and sentence relevance, which may require additional research independently. The tool does not compromise the privacy of the reviewers as it relies only on their publicly-available information (e.g., names and list of previously published papers).
         """
     )

 def get_similar_paper(
     abstract_text_input,
     author_id_input,
     results={} # variable will be updated and returned
 ):
     start = time.time()
     input_sentences = sent_tokenize(abstract_text_input)
+    # Get author papers from id
+    name, papers = get_text_from_author_id(author_id_input)
     # Compute Doc-level affinity scores for the Papers
     print('computing document scores...')
     # TODO detect duplicate papers?
+    titles, abstracts, paper_urls, doc_scores = compute_document_score(
         doc_model,
         tokenizer,
         abstract_text_input,
     results = {
         'titles': titles,
         'abstracts': abstracts,
+        'urls': paper_urls,
         'doc_scores': doc_scores
     }
     titles = titles[:num_papers_show]
     abstracts = abstracts[:num_papers_show]
     doc_scores = doc_scores[:num_papers_show]
+    paper_urls = paper_urls[:num_papers_show]
     display_title = ['[ %0.3f ] %s'%(s, t) for t, s in zip(titles, doc_scores)]
     end = time.time()
     top_num_info_show = 2 # number of sentence pairs from each paper to show upfront
     summary_out = []
     for i in range(top_papers_show):
+        # TODO keep score precision consistent
         out_tmp = [
             gr.update(value=titles[i], visible=True),
+            gr.update(value=round(doc_scores[i],3), visible=True) # document affinity
         ]
         tp = results[display_title[i]]['top_pairs']
         for j in range(top_num_info_show):
             out_tmp += [
+                gr.update(value=round(tp[j]['score'],3), visible=True), # sentence relevance
                 tp[j]['query']['original'],
                 tp[j]['query'],
                 tp[j]['candidate']['original'],
         summary_out += out_tmp
     # add updates to the show more button
+    out = out + summary_out + [gr.update(visible=True)] # make show more button visible
     assert(len(out) == (top_num_info_show * 5 + 2) * top_papers_show + 3)
     # add the search results to pass on to the Gradio State variable
 - Once the name is confirmed, press the `What Makes This a Good Match?` button.
 - Based on the input information, the tool will first search for similar papers from the reviewer's previous publications using [Semantic Scholar API](https://www.semanticscholar.org/product/api).
 ##### Relevant Parts from Top Papers
+- You will be shown three most relevant papers from the reviewer with high **affinity scores** (ranging from 0 to 1) computed using text representations from a [language model](https://github.com/allenai/specter/tree/master/specter).
 - For each of the papers, we present relevant pieces of information from the submission and the paper: two pairs of (sentence relevance score, sentence from the submission abstract, sentence from the paper abstract)
 - **<span style="color:black;background-color:#5296D5;">Blue highlights</span>** indicate phrases that are included in both sentences.
 ##### More Relevant Parts
     with gr.Row() as input_row:
         with gr.Column():
             abstract_text_input = gr.Textbox(label='Submission Abstract')
         with gr.Column():
             with gr.Row():
                 author_id_input = gr.Textbox(label='Reviewer Link or ID (Semantic Scholar)')
                 name = gr.Textbox(label='Confirm Reviewer Name', interactive=False)
                 author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
     with gr.Row():
+        compute_btn = gr.Button('What Makes This a Good Match?') # TODO indicate the progress when pressed
     ### OVERVIEW
     # Paper title, score, and top-ranking sentence pairs -- two sentence pairs per paper, three papers
     ## ONE BLOCK OF INFO FOR A SINGLE PAPER
     ## PAPER1
+    # TODO link to the paper
     with gr.Row():
         with gr.Column(scale=3):
             paper_title1 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
         with gr.Column(scale=4):
             sent_pair_candidate1_2 = gr.Textbox(label='Sentence from Paper', visible=False)
             sent_pair_candidate1_2_hl = gr.components.Interpretation(sent_pair_candidate1_2)
+    # TODO demarcate the entries
+    gr.Markdown(
+        """---"""
+    )
     ## PAPER 2
     with gr.Row():
         with gr.Column(scale=3):
             paper_title2 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
         with gr.Column(scale=1):
+            affinity2 = gr.Number(label='Affinity', interactive=False, value=0., visible=False)
     with gr.Row() as rel2_1:
         with gr.Column(scale=1):
             sent_pair_score2_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
             sent_pair_source2_1 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_source2_1_hl = gr.components.Interpretation(sent_pair_source2_1)
         with gr.Column(scale=4):
+            sent_pair_candidate2_1 = gr.Textbox(label='Sentence from Paper', visible=False)
             sent_pair_candidate2_1_hl = gr.components.Interpretation(sent_pair_candidate2_1)
     with gr.Row() as rel2_2:
         with gr.Column(scale=1):
             sent_pair_source2_2 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_source2_2_hl = gr.components.Interpretation(sent_pair_source2_2)
         with gr.Column(scale=4):
+            sent_pair_candidate2_2 = gr.Textbox(label='Sentence from Paper', visible=False)
             sent_pair_candidate2_2_hl = gr.components.Interpretation(sent_pair_candidate2_2)
+    gr.Markdown(
+        """---"""
+    )
     ## PAPER 3
     with gr.Row():
         with gr.Column(scale=3):
             sent_pair_source3_1 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_source3_1_hl = gr.components.Interpretation(sent_pair_source3_1)
         with gr.Column(scale=4):
+            sent_pair_candidate3_1 = gr.Textbox(label='Sentence from Paper', visible=False)
             sent_pair_candidate3_1_hl = gr.components.Interpretation(sent_pair_candidate3_1)
     with gr.Row() as rel3_2:
         with gr.Column(scale=1):
             sent_pair_source3_2 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_source3_2_hl = gr.components.Interpretation(sent_pair_source3_2)
         with gr.Column(scale=4):
+            sent_pair_candidate3_2 = gr.Textbox(label='Sentence from Paper', visible=False)
             sent_pair_candidate3_2_hl = gr.components.Interpretation(sent_pair_candidate3_2)
     ## Show more button
         fn=get_similar_paper,
         inputs=[
             abstract_text_input,
             author_id_input,
             info
         ],
     gr.Markdown(
         """
         ---------
+        **Disclaimer.** This tool and its output should not serve as the sole justification for confirming a match for the submission. It is intended as a supplementary tool that the users may use at their discretion; the correctness of the output of the tool is not guaranteed. This may be improved by updating the internal models used to compute the affinity scores and sentence relevance, which may require additional research independently. The tool does not compromise the privacy of the reviewers as it relies only on their publicly-available information (e.g., names and list of previously published papers).
         """
     )

input_format.py CHANGED Viewed

@@ -73,7 +73,7 @@ def get_text_from_author_id(author_id, max_count=100):
     if 'http' in aid: # handle semantic scholar url input
         aid = aid.split('/')
         aid = aid[aid.index('author')+2]
-    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract"%aid
     r = requests.get(url)
     if r.status_code == 404:
         raise ValueError('Author link not found.')

     if 'http' in aid: # handle semantic scholar url input
         aid = aid.split('/')
         aid = aid[aid.index('author')+2]
+    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract,papers.url"%aid
     r = requests.get(url)
     if r.status_code == 404:
         raise ValueError('Author link not found.')

score.py CHANGED Viewed

@@ -141,7 +141,7 @@ def get_highlight_info(model, text1, text2, K=None):
     top_pairs = []
     ii = np.unravel_index(np.argsort(np.array(sent_scores).ravel())[-top_pair_num:], sent_scores.shape)
     for i, j in zip(ii[0][::-1], ii[1][::-1]):
-        score = sent_scores[i,j]
         index_pair = (i, sent_ids[i,j].item())
         top_pairs.append((score, index_pair)) # list of (score, (sent_id_query, sent_id_candidate))
@@ -218,10 +218,12 @@ def compute_document_score(doc_model, tokenizer, query, papers, batch=5):
     scores = []
     titles = []
     abstracts = []
     for p in papers:
         if p['title'] is not None and p['abstract'] is not None:
             titles.append(p['title'])
             abstracts.append(p['abstract'])
     scores = predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=batch)
     assert(len(scores) == len(abstracts))
     idx_sorted = np.argsort(scores)[::-1]
@@ -229,5 +231,6 @@ def compute_document_score(doc_model, tokenizer, query, papers, batch=5):
     titles_sorted = [titles[x] for x in idx_sorted]
     abstracts_sorted = [abstracts[x] for x in idx_sorted]
     scores_sorted = [scores[x] for x in idx_sorted]
-    return titles_sorted, abstracts_sorted, scores_sorted

     top_pairs = []
     ii = np.unravel_index(np.argsort(np.array(sent_scores).ravel())[-top_pair_num:], sent_scores.shape)
     for i, j in zip(ii[0][::-1], ii[1][::-1]):
+        score = sent_scores[i,j].item()
         index_pair = (i, sent_ids[i,j].item())
         top_pairs.append((score, index_pair)) # list of (score, (sent_id_query, sent_id_candidate))
     scores = []
     titles = []
     abstracts = []
+    urls = []
     for p in papers:
         if p['title'] is not None and p['abstract'] is not None:
             titles.append(p['title'])
             abstracts.append(p['abstract'])
+            urls.append(p['url'])
     scores = predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=batch)
     assert(len(scores) == len(abstracts))
     idx_sorted = np.argsort(scores)[::-1]
     titles_sorted = [titles[x] for x in idx_sorted]
     abstracts_sorted = [abstracts[x] for x in idx_sorted]
     scores_sorted = [scores[x] for x in idx_sorted]
+    urls_sorted = [urls[x] for x in idx_sorted]
+    return titles_sorted, abstracts_sorted, urls_sorted, scores_sorted