jskim committed on
Commit
5ee7598
1 Parent(s): e7933f3

adding flexibility to use different models for sentence-level info.

Browse files
Files changed (3) hide show
  1. app.py +14 -9
  2. input_format.py +1 -1
  3. score.py +45 -14
app.py CHANGED
@@ -14,14 +14,18 @@ from score import *
14
  # load document scoring model
15
  #torch.cuda.is_available = lambda : False # uncomment to test with CPU only
16
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
17
- pretrained_model = 'allenai/specter'
 
18
  tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
19
  doc_model = AutoModel.from_pretrained(pretrained_model)
20
  doc_model.to(device)
21
 
22
  # load sentence model
23
- sent_model = SentenceTransformer('sentence-transformers/gtr-t5-base')
24
- sent_model.to(device)
 
 
 
25
 
26
  def get_similar_paper(
27
  title_input,
@@ -84,6 +88,7 @@ def get_similar_paper(
84
  # Compute sent-level and phrase-level affinity scores for each papers
85
  sent_ids, sent_scores, info, top_pairs_info = get_highlight_info(
86
  sent_model,
 
87
  abstract_text_input,
88
  ab,
89
  K=2 # top two sentences from the candidate
@@ -256,21 +261,21 @@ with gr.Blocks(css='style.css') as demo:
256
 
257
  # General instruction
258
  general_instruction = """
259
- # R2P2: Reviewer TO Paper in Peer review
260
 
261
  #### Who is it for?
262
  It is for meta-reviewers, area chairs, program chairs, or anyone who oversees the submission-reviewer matching process in peer review for academic conferences, journals, and grants.
263
 
264
- <center><img src="file/tool.gif" width="70%" alt="general workflow"></center>
265
 
266
  #### How does it help?
267
  A typical meta-reviewer workflow lacks supportive information on **what makes the pre-selected candidate reviewers a good fit** for the submission. Only affinity scores between the reviewer and the paper are shown, without additional detail.
268
 
269
- R2P2 provides more information about each reviewer. It searches for the most relevant papers among the reviewer's previous publications and highlights relevant parts within them.
270
  """
271
  # TODO add instruction video link
272
  # More details (video, addendum)
273
- more_details_instruction = """Check out <a href="", target="_blank">this video</a> for a quick demo of what R2P2 is and how it can help. You can find more details <a href="file/details.html", target="_blank">here</a> about R2P2, along with our privacy policy and disclaimer."""
274
 
275
  gr.Markdown(general_instruction)
276
  gr.HTML(more_details_instruction)
@@ -298,7 +303,7 @@ R2P2 provides more information about each reviewer. It searches for the most rel
298
  examples=[[example_title, example_submission, example_reviewer]],
299
  inputs=[title_input, abstract_text_input, author_id_input],
300
  cache_examples=False,
301
- label="Click to try out the example input."
302
  )
303
 
304
  with gr.Row():
@@ -417,7 +422,7 @@ R2P2 provides more information about each reviewer. It searches for the most rel
417
 
418
  ---
419
  """
420
-
421
  # show multiple papers in radio check box to select from
422
  paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
423
  with gr.Row():
 
14
  # load document scoring model
15
  #torch.cuda.is_available = lambda : False # uncomment to test with CPU only
16
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
17
+ #pretrained_model = 'allenai/specter'
18
+ pretrained_model = 'allenai/specter2'
19
  tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
20
  doc_model = AutoModel.from_pretrained(pretrained_model)
21
  doc_model.to(device)
22
 
23
  # load sentence model
24
+ sent_model = doc_model # have the same model for document and sentence level
25
+
26
+ # OR specify different model for sentence level
27
+ # sent_model = SentenceTransformer('sentence-transformers/gtr-t5-base')
28
+ # sent_model.to(device)
29
 
30
  def get_similar_paper(
31
  title_input,
 
88
  # Compute sent-level and phrase-level affinity scores for each papers
89
  sent_ids, sent_scores, info, top_pairs_info = get_highlight_info(
90
  sent_model,
91
+ tokenizer,
92
  abstract_text_input,
93
  ab,
94
  K=2 # top two sentences from the candidate
 
261
 
262
  # General instruction
263
  general_instruction = """
264
+ # R2P2: An Assistance Tool for Reviewer-Paper Matching in Peer Review
265
 
266
  #### Who is it for?
267
  It is for meta-reviewers, area chairs, program chairs, or anyone who oversees the submission-reviewer matching process in peer review for academic conferences, journals, and grants.
268
 
269
+ <center><img src="file/tool-img.jpeg" width="70%" alt="general workflow"></center>
270
 
271
  #### How does it help?
272
  A typical meta-reviewer workflow lacks supportive information on **what makes the pre-selected candidate reviewers a good fit** for the submission. Only affinity scores between the reviewer and the paper are shown, without additional detail.
273
 
274
+ R2P2 provides more information about each reviewer. It searches for the **most relevant papers** among the reviewer's previous publications and **highlights relevant parts** within them.
275
  """
276
  # TODO add instruction video link
277
  # More details (video, addendum)
278
+ more_details_instruction = """Check out <a href="", target="_blank">this video</a> for a quick demo of what R2P2 is and how it can help. You can find more details <a href="file/details.html", target="_blank">here</a>, along with our privacy policy and disclaimer."""
279
 
280
  gr.Markdown(general_instruction)
281
  gr.HTML(more_details_instruction)
 
303
  examples=[[example_title, example_submission, example_reviewer]],
304
  inputs=[title_input, abstract_text_input, author_id_input],
305
  cache_examples=False,
306
+ label="Try out the following example input."
307
  )
308
 
309
  with gr.Row():
 
422
 
423
  ---
424
  """
425
+ # TODO allow users to change the number of highlights to show?
426
  # show multiple papers in radio check box to select from
427
  paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
428
  with gr.Row():
input_format.py CHANGED
@@ -66,7 +66,7 @@ def download_pdf(url, file_name):
66
  ## Input formatting for the given author (reviewer)
67
  # Extracting text from a link
68
 
69
- def get_text_from_author_id(author_id, max_count=100):
70
  if author_id is None:
71
  raise ValueError('Input valid author ID')
72
  aid = str(author_id)
 
66
  ## Input formatting for the given author (reviewer)
67
  # Extracting text from a link
68
 
69
+ def get_text_from_author_id(author_id, max_count=150):
70
  if author_id is None:
71
  raise ValueError('Input valid author ID')
72
  aid = str(author_id)
score.py CHANGED
@@ -1,20 +1,39 @@
1
- from sentence_transformers import util
 
2
  from nltk.tokenize import sent_tokenize
3
  from nltk import word_tokenize, pos_tag
4
  import torch
5
  import numpy as np
6
  import tqdm
7
 
8
- def compute_sentencewise_scores(model, query_sents, candidate_sents):
9
- # list of sentences from query and candidate
10
- q_v, c_v = get_embedding(model, query_sents, candidate_sents)
11
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  return util.cos_sim(q_v, c_v)
13
 
14
  def get_embedding(model, query_sents, candidate_sents):
15
  q_v = model.encode(query_sents)
16
  c_v = model.encode(candidate_sents)
17
-
18
  return q_v, c_v
19
 
20
  def get_top_k(score_mat, K=3):
@@ -72,10 +91,10 @@ def get_match_phrase(w1, w2, method='pos'):
72
  pos1 = pos_tag(w1)
73
  pos2 = pos_tag(w2)
74
  for i, (w, p) in enumerate(pos2):
75
- if w.lower() in w1 and p in include:
76
- j = w1.index(w.lower())
77
- mask2[i] = 1
78
- mask1[j] = 1
79
  return mask1, mask2
80
 
81
  def remove_spaces(words, attrs):
@@ -90,14 +109,14 @@ def remove_spaces(words, attrs):
90
  idx, single_q, double_q = 0, 0, 0
91
  while idx < len(words):
92
  # stick to the word that appears right before
93
- if words[idx] in [',', '.', '%', ')', ':', '?', ';', "'s"]:
94
  ww = word_out.pop()
95
  aa = attr_out.pop()
96
  word_out.append(ww + words[idx])
97
  attr_out.append(aa)
98
  idx += 1
99
  # stick to the word that appears right after
100
- elif words[idx] in ["("]:
101
  word_out.append(words[idx] + words[idx+1])
102
  attr_out.append(attrs[idx+1])
103
  idx += 2
@@ -141,6 +160,18 @@ def remove_spaces(words, attrs):
141
  word_out.append(words[idx] + words[idx+1])
142
  attr_out.append(attrs[idx+1])
143
  idx += 2
 
 
 
 
 
 
 
 
 
 
 
 
144
  else:
145
  word_out.append(words[idx])
146
  attr_out.append(attrs[idx])
@@ -193,7 +224,7 @@ def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scor
193
 
194
  return output
195
 
196
- def get_highlight_info(model, text1, text2, K=None):
197
  """
198
  Get highlight information from two texts
199
  """
@@ -201,7 +232,7 @@ def get_highlight_info(model, text1, text2, K=None):
201
  sent2 = sent_tokenize(text2) # candidate
202
  if K is None: # if K is not set, select based on the length of the candidate
203
  K = int(len(sent2) / 3)
204
- score_mat = compute_sentencewise_scores(model, sent1, sent2)
205
 
206
  sent_ids, sent_scores = get_top_k(score_mat, K=K)
207
  words2, all_words2, sent_start_id2 = get_words(sent2)
 
1
+ from sentence_transformers import util, SentenceTransformer
2
+ from transformers import BertModel
3
  from nltk.tokenize import sent_tokenize
4
  from nltk import word_tokenize, pos_tag
5
  import torch
6
  import numpy as np
7
  import tqdm
8
 
9
def compute_sentencewise_scores(model, query_sents, candidate_sents, tokenizer=None):
    """Compute pairwise cosine similarities between query and candidate sentences.

    Args:
        model: Either a ``SentenceTransformer`` instance or a ``transformers``
            ``BertModel`` instance. Any other type is rejected.
        query_sents: List of sentences from the query text.
        candidate_sents: List of sentences from the candidate text.
        tokenizer: Tokenizer matching ``model``. Required when ``model`` is a
            ``BertModel``; ignored for ``SentenceTransformer`` models.

    Returns:
        The result of ``util.cos_sim(q_v, c_v)``, where ``q_v`` holds one
        embedding per query sentence and ``c_v`` one per candidate sentence.

    Raises:
        ValueError: If the model type is not supported, or if a ``BertModel``
            is given without a tokenizer.
    """
    if isinstance(model, SentenceTransformer):
        # SentenceTransformer-style model: it handles tokenization itself
        q_v, c_v = get_embedding(model, query_sents, candidate_sents)
    elif isinstance(model, BertModel):
        # BERT-style model from the transformers library
        if tokenizer is None:
            raise ValueError('tokenizer is required for a BERT-style model')
        # Encode query and candidate sentences in one padded batch
        inputs = tokenizer(
            query_sents + candidate_sents,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512
        )
        inputs = inputs.to(model.device)
        # Inference only: skip building the autograd graph
        with torch.no_grad():
            result = model(**inputs)
        # Use the hidden state at position 0 of each sequence as the
        # sentence embedding (the [CLS] token for BERT-style tokenizers)
        embeddings = result.last_hidden_state[:, 0, :].detach().cpu().numpy()
        n_query = len(query_sents)
        q_v = embeddings[:n_query]
        c_v = embeddings[n_query:]
    else:
        raise ValueError('model not supported at the time')
    # Sanity checks: same embedding width, one row per input sentence
    assert(q_v.shape[1] == c_v.shape[1])
    assert(q_v.shape[0] == len(query_sents))
    assert(c_v.shape[0] == len(candidate_sents))
    return util.cos_sim(q_v, c_v)
33
 
34
def get_embedding(model, query_sents, candidate_sents):
    """Encode the query and candidate sentences with ``model.encode``.

    Returns a ``(query_vectors, candidate_vectors)`` pair; each element is
    whatever ``model.encode`` returns for the corresponding sentence list.
    The query sentences are encoded first, then the candidates.
    """
    query_vecs, candidate_vecs = (
        model.encode(sentences) for sentences in (query_sents, candidate_sents)
    )
    return query_vecs, candidate_vecs
38
 
39
  def get_top_k(score_mat, K=3):
 
91
  pos1 = pos_tag(w1)
92
  pos2 = pos_tag(w2)
93
  for i, (w, p) in enumerate(pos2):
94
+ for j, (w_, p_) in enumerate(pos1):
95
+ if w.lower() == w_.lower() and p in include:
96
+ mask2[i] = 1
97
+ mask1[j] = 1
98
  return mask1, mask2
99
 
100
  def remove_spaces(words, attrs):
 
109
  idx, single_q, double_q = 0, 0, 0
110
  while idx < len(words):
111
  # stick to the word that appears right before
112
+ if words[idx] in [',', '.', '%', ')', ':', '?', ';', "'s", '”', "''"]:
113
  ww = word_out.pop()
114
  aa = attr_out.pop()
115
  word_out.append(ww + words[idx])
116
  attr_out.append(aa)
117
  idx += 1
118
  # stick to the word that appears right after
119
+ elif words[idx] in ["(", '“']:
120
  word_out.append(words[idx] + words[idx+1])
121
  attr_out.append(attrs[idx+1])
122
  idx += 2
 
160
  word_out.append(words[idx] + words[idx+1])
161
  attr_out.append(attrs[idx+1])
162
  idx += 2
163
+ elif words[idx] == '``':
164
+ # this is opening quote: stick to the word after, but change to real double quote
165
+ word_out.append('"' + words[idx+1])
166
+ attr_out.append(attrs[idx+1])
167
+ idx += 2
168
+ elif words[idx] == "''":
169
+ # this is closing quote: stick to word before, but change to real double quote
170
+ ww = word_out.pop()
171
+ aa = attr_out.pop()
172
+ word_out.append(ww + '"')
173
+ attr_out.append(aa)
174
+ idx += 1
175
  else:
176
  word_out.append(words[idx])
177
  attr_out.append(attrs[idx])
 
224
 
225
  return output
226
 
227
+ def get_highlight_info(model, tokenizer, text1, text2, K=None):
228
  """
229
  Get highlight information from two texts
230
  """
 
232
  sent2 = sent_tokenize(text2) # candidate
233
  if K is None: # if K is not set, select based on the length of the candidate
234
  K = int(len(sent2) / 3)
235
+ score_mat = compute_sentencewise_scores(model, sent1, sent2, tokenizer=tokenizer)
236
 
237
  sent_ids, sent_scores = get_top_k(score_mat, K=K)
238
  words2, all_words2, sent_start_id2 = get_words(sent2)