jskim committed
Commit 81ca652
1 Parent(s): b5a0311

PDF input removed. Now retrieving URLs for reviewer papers.

Files changed (3)
  1. app.py +27 -26
  2. input_format.py +1 -1
  3. score.py +5 -2
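
Taken together, the three diffs below make one data-flow change: `app.py` drops the PDF upload path, `input_format.py` asks Semantic Scholar for each paper's URL, and `score.py` carries those URLs through the ranking. A minimal sketch of the resulting flow, assuming the signatures shown in the diffs; `doc_model`, `tokenizer`, and `num_papers_show` are defined elsewhere in `app.py`, and the wrapper name `rank_reviewer_papers` is ours, for illustration only:

```python
def rank_reviewer_papers(abstract_text_input, author_id_input, num_papers_show=10):
    # Reviewer name and publication list come from the Semantic Scholar author ID/URL.
    name, papers = get_text_from_author_id(author_id_input)

    # Document-level affinity scores now come back alongside per-paper URLs.
    titles, abstracts, paper_urls, doc_scores = compute_document_score(
        doc_model, tokenizer, abstract_text_input, papers
    )

    results = {
        'titles': titles,
        'abstracts': abstracts,
        'urls': paper_urls,  # newly propagated by this commit
        'doc_scores': doc_scores,
    }

    # Trim all parallel lists together so index i keeps referring to the same paper.
    titles = titles[:num_papers_show]
    abstracts = abstracts[:num_papers_show]
    doc_scores = [round(s, 3) for s in doc_scores[:num_papers_show]]
    paper_urls = paper_urls[:num_papers_show]

    return name, results, titles, abstracts, doc_scores, paper_urls
```

Keeping `paper_urls` trimmed in lockstep with `titles` and `doc_scores` is also what makes the `# TODO link to the paper` note in `app.py` easy to act on later.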
app.py CHANGED
@@ -25,7 +25,6 @@ sent_model.to(device)

 def get_similar_paper(
     abstract_text_input,
-    pdf_file_input,
     author_id_input,
     results={} # variable will be updated and returned
     ):
@@ -34,19 +33,13 @@ def get_similar_paper(
     start = time.time()
     input_sentences = sent_tokenize(abstract_text_input)

-    # TODO handle pdf file input
-    if pdf_file_input is not None:
-        name = None
-        papers = []
-        raise ValueError('Use submission abstract instead.')
-    else:
-        # Get author papers from id
-        name, papers = get_text_from_author_id(author_id_input)
+    # Get author papers from id
+    name, papers = get_text_from_author_id(author_id_input)

     # Compute Doc-level affinity scores for the Papers
     print('computing document scores...')
     # TODO detect duplicate papers?
-    titles, abstracts, doc_scores = compute_document_score(
+    titles, abstracts, paper_urls, doc_scores = compute_document_score(
         doc_model,
         tokenizer,
         abstract_text_input,
@@ -57,6 +50,7 @@ def get_similar_paper(
     results = {
         'titles': titles,
         'abstracts': abstracts,
+        'urls': paper_urls,
         'doc_scores': doc_scores
     }

@@ -64,6 +58,7 @@ def get_similar_paper(
     titles = titles[:num_papers_show]
     abstracts = abstracts[:num_papers_show]
     doc_scores = doc_scores[:num_papers_show]
+    paper_urls = paper_urls[:num_papers_show]

     display_title = ['[ %0.3f ] %s'%(s, t) for t, s in zip(titles, doc_scores)]
     end = time.time()
@@ -117,14 +112,15 @@ def get_similar_paper(
     top_num_info_show = 2 # number of sentence pairs from each paper to show upfront
     summary_out = []
     for i in range(top_papers_show):
+        # TODO keep score precision consistent
         out_tmp = [
             gr.update(value=titles[i], visible=True),
-            gr.update(value=doc_scores[i], visible=True)
+            gr.update(value=round(doc_scores[i],3), visible=True) # document affinity
         ]
         tp = results[display_title[i]]['top_pairs']
         for j in range(top_num_info_show):
             out_tmp += [
-                gr.update(value=tp[j]['score'], visible=True),
+                gr.update(value=round(tp[j]['score'],3), visible=True), # sentence relevance
                 tp[j]['query']['original'],
                 tp[j]['query'],
                 tp[j]['candidate']['original'],
@@ -133,7 +129,7 @@ def get_similar_paper(
         summary_out += out_tmp

     # add updates to the show more button
-    out = out + summary_out + [gr.update(visible=True)] # show more button
+    out = out + summary_out + [gr.update(visible=True)] # make show more button visible
     assert(len(out) == (top_num_info_show * 5 + 2) * top_papers_show + 3)

     # add the search results to pass on to the Gradio State varaible
@@ -196,7 +192,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
 - Once the name is confirmed, press the `What Makes This a Good Match?` button.
 - Based on the input information, the tool will first search for similar papers from the reviewer's previous publications using [Semantic Scholar API](https://www.semanticscholar.org/product/api).
 ##### Relevant Parts from Top Papers
-- You will be shown three most relevant papers from the reviewer with high **affinity scores** (ranging from 0 - 1) computed using text representations from a [language model](https://github.com/allenai/specter/tree/master/specter).
+- You will be shown three most relevant papers from the reviewer with high **affinity scores** (ranging from 0 to 1) computed using text representations from a [language model](https://github.com/allenai/specter/tree/master/specter).
 - For each of the paper, we present relevant pieces of information from the submission and the paper: two pairs of (sentence relevance score, sentence from the submission abstract, sentnece from the paper abstract)
 - **<span style="color:black;background-color:#5296D5;">Blue highlights</span>** inidicate phrases that are included in both sentences.
 ##### More Relevant Parts
@@ -217,8 +213,6 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
     with gr.Row() as input_row:
         with gr.Column():
             abstract_text_input = gr.Textbox(label='Submission Abstract')
-        with gr.Column():
-            pdf_file_input = gr.File(label='OR upload a submission PDF File')
         with gr.Column():
             with gr.Row():
                 author_id_input = gr.Textbox(label='Reviewer Link or ID (Semantic Scholar)')
@@ -226,13 +220,13 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
             name = gr.Textbox(label='Confirm Reviewer Name', interactive=False)
             author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
         with gr.Row():
-            compute_btn = gr.Button('What Makes This a Good Match?')
+            compute_btn = gr.Button('What Makes This a Good Match?') # TODO indicate the progress when pressed

     ### OVERVIEW
     # Paper title, score, and top-ranking sentence pairs -- two sentence pairs per paper, three papers
-    # TODO blockfy similar components together and simplify
     ## ONE BLOCK OF INFO FOR A SINGLE PAPER
     ## PAPER1
+    # TODO link to the paper
     with gr.Row():
         with gr.Column(scale=3):
             paper_title1 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
@@ -256,13 +250,17 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
         with gr.Column(scale=4):
             sent_pair_candidate1_2 = gr.Textbox(label='Sentence from Paper', visible=False)
             sent_pair_candidate1_2_hl = gr.components.Interpretation(sent_pair_candidate1_2)
+    # TODO demarcate the entries
+    gr.Markdown(
+        """---"""
+    )

     ## PAPER 2
     with gr.Row():
         with gr.Column(scale=3):
             paper_title2 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
         with gr.Column(scale=1):
-            affinity2 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
+            affinity2 = gr.Number(label='Affinity', interactive=False, value=0., visible=False)
     with gr.Row() as rel2_1:
         with gr.Column(scale=1):
             sent_pair_score2_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
@@ -270,7 +268,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
             sent_pair_source2_1 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_source2_1_hl = gr.components.Interpretation(sent_pair_source2_1)
         with gr.Column(scale=4):
-            sent_pair_candidate2_1 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_candidate2_1 = gr.Textbox(label='Sentence from Paper', visible=False)
             sent_pair_candidate2_1_hl = gr.components.Interpretation(sent_pair_candidate2_1)
     with gr.Row() as rel2_2:
         with gr.Column(scale=1):
@@ -279,9 +277,13 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
             sent_pair_source2_2 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_source2_2_hl = gr.components.Interpretation(sent_pair_source2_2)
         with gr.Column(scale=4):
-            sent_pair_candidate2_2 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_candidate2_2 = gr.Textbox(label='Sentence from Paper', visible=False)
             sent_pair_candidate2_2_hl = gr.components.Interpretation(sent_pair_candidate2_2)
-
+
+    gr.Markdown(
+        """---"""
+    )
+
     ## PAPER 3
     with gr.Row():
         with gr.Column(scale=3):
@@ -295,7 +297,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
             sent_pair_source3_1 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_source3_1_hl = gr.components.Interpretation(sent_pair_source3_1)
         with gr.Column(scale=4):
-            sent_pair_candidate3_1 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_candidate3_1 = gr.Textbox(label='Sentence from Paper', visible=False)
             sent_pair_candidate3_1_hl = gr.components.Interpretation(sent_pair_candidate3_1)
     with gr.Row() as rel3_2:
         with gr.Column(scale=1):
@@ -304,7 +306,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
             sent_pair_source3_2 = gr.Textbox(label='Sentence from Submission', visible=False)
             sent_pair_source3_2_hl = gr.components.Interpretation(sent_pair_source3_2)
         with gr.Column(scale=4):
-            sent_pair_candidate3_2 = gr.Textbox(label='Sentence from Submission', visible=False)
+            sent_pair_candidate3_2 = gr.Textbox(label='Sentence from Paper', visible=False)
             sent_pair_candidate3_2_hl = gr.components.Interpretation(sent_pair_candidate3_2)

     ## Show more button
@@ -348,7 +350,6 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
         fn=get_similar_paper,
         inputs=[
             abstract_text_input,
-            pdf_file_input,
             author_id_input,
             info
         ],
@@ -437,7 +438,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
     gr.Markdown(
         """
         ---------
-        **Disclaimer.** This tool and its output should not serve as the sole justification for confirming a match for the submission. It is intended as a supplementary tool that the user may use at their discretion; the correctness of the output of the tool is not guaranteed. This may be improved by updating the internal models used to compute the affinity scores and sentence relevance, which may require additional research independently. The tool does not compromise the privacy of the reviewers as it relies only on their publicly-available information (e.g., names and list of previously published papers).
+        **Disclaimer.** This tool and its output should not serve as the sole justification for confirming a match for the submission. It is intended as a supplementary tool that the users may use at their discretion; the correctness of the output of the tool is not guaranteed. This may be improved by updating the internal models used to compute the affinity scores and sentence relevance, which may require additional research independently. The tool does not compromise the privacy of the reviewers as it relies only on their publicly-available information (e.g., names and list of previously published papers).
         """
     )
 
input_format.py CHANGED
@@ -73,7 +73,7 @@ def get_text_from_author_id(author_id, max_count=100):
     if 'http' in aid: # handle semantic scholar url input
         aid = aid.split('/')
         aid = aid[aid.index('author')+2]
-    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract"%aid
+    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract,papers.url"%aid
     r = requests.get(url)
     if r.status_code == 404:
         raise ValueError('Author link not found.')
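
The only change in `input_format.py` is the extra `papers.url` field in the Graph API request, so every returned paper carries a link to its Semantic Scholar page. A hedged sketch of how that request and response could be consumed; the response shape follows the public Semantic Scholar Graph API, and the helper name `fetch_author_papers` and the simplified error handling are ours:

```python
import requests

def fetch_author_papers(author_id):
    # Same endpoint and fields as get_text_from_author_id after this commit.
    fields = "url,name,paperCount,papers,papers.title,papers.abstract,papers.url"
    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=%s" % (author_id, fields)
    r = requests.get(url)
    if r.status_code == 404:
        raise ValueError('Author link not found.')
    data = r.json()
    # Each paper entry now includes its Semantic Scholar URL next to title and abstract.
    papers = [
        {'title': p.get('title'), 'abstract': p.get('abstract'), 'url': p.get('url')}
        for p in data.get('papers', [])
    ]
    return data.get('name'), papers
```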
score.py CHANGED
@@ -141,7 +141,7 @@ def get_highlight_info(model, text1, text2, K=None):
     top_pairs = []
     ii = np.unravel_index(np.argsort(np.array(sent_scores).ravel())[-top_pair_num:], sent_scores.shape)
     for i, j in zip(ii[0][::-1], ii[1][::-1]):
-        score = sent_scores[i,j]
+        score = sent_scores[i,j].item()
         index_pair = (i, sent_ids[i,j].item())
         top_pairs.append((score, index_pair)) # list of (score, (sent_id_query, sent_id_candidate))

@@ -218,10 +218,12 @@ def compute_document_score(doc_model, tokenizer, query, papers, batch=5):
     scores = []
     titles = []
     abstracts = []
+    urls = []
     for p in papers:
         if p['title'] is not None and p['abstract'] is not None:
             titles.append(p['title'])
             abstracts.append(p['abstract'])
+            urls.append(p['url'])
     scores = predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=batch)
     assert(len(scores) == len(abstracts))
     idx_sorted = np.argsort(scores)[::-1]
@@ -229,5 +231,6 @@ def compute_document_score(doc_model, tokenizer, query, papers, batch=5):
     titles_sorted = [titles[x] for x in idx_sorted]
     abstracts_sorted = [abstracts[x] for x in idx_sorted]
     scores_sorted = [scores[x] for x in idx_sorted]
+    urls_sorted = [urls[x] for x in idx_sorted]

-    return titles_sorted, abstracts_sorted, scores_sorted
+    return titles_sorted, abstracts_sorted, urls_sorted, scores_sorted
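
Two small details in `score.py` are easy to miss: `.item()` turns a tensor/NumPy scalar into a plain Python float so it rounds and serializes cleanly in the UI, and the new `urls` list is reordered with the same `argsort` permutation as the titles, abstracts, and scores so all four lists stay aligned. A toy illustration of that pattern (the data values are made up):

```python
import numpy as np

scores = [0.62, 0.91, 0.48]
titles = ['paper A', 'paper B', 'paper C']
urls = ['https://example.org/a', 'https://example.org/b', 'https://example.org/c']  # placeholder URLs

idx_sorted = np.argsort(scores)[::-1]  # highest affinity first
titles_sorted = [titles[i] for i in idx_sorted]
urls_sorted = [urls[i] for i in idx_sorted]
scores_sorted = [np.asarray(scores[i]).item() for i in idx_sorted]  # .item() -> plain Python float

print(titles_sorted)               # ['paper B', 'paper A', 'paper C']
print(urls_sorted[0])              # https://example.org/b
print(round(scores_sorted[0], 3))  # 0.91
```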