adding flexibility to use different models for sentence-level info.
Files changed:
- app.py +14 -9
- input_format.py +1 -1
- score.py +45 -14
app.py
CHANGED
@@ -14,14 +14,18 @@ from score import *
 # load document scoring model
 #torch.cuda.is_available = lambda : False # uncomment to test with CPU only
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-pretrained_model = 'allenai/specter'
+#pretrained_model = 'allenai/specter'
+pretrained_model = 'allenai/specter2'
 tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
 doc_model = AutoModel.from_pretrained(pretrained_model)
 doc_model.to(device)
 
 # load sentence model
-sent_model =
-
+sent_model = doc_model # have the same model for document and sentence level
+
+# OR specify different model for sentence level
+# sent_model = SentenceTransformer('sentence-transformers/gtr-t5-base')
+# sent_model.to(device)
 
 def get_similar_paper(
     title_input,
@@ -84,6 +88,7 @@ def get_similar_paper(
     # Compute sent-level and phrase-level affinity scores for each papers
     sent_ids, sent_scores, info, top_pairs_info = get_highlight_info(
         sent_model,
+        tokenizer,
         abstract_text_input,
         ab,
         K=2 # top two sentences from the candidate
@@ -256,21 +261,21 @@ with gr.Blocks(css='style.css') as demo:
 
     # General instruction
     general_instruction = """
-# R2P2:
+# R2P2: An Assistance Tool for Reviewer-Paper Matching in Peer Review
 
 #### Who is it for?
 It is for meta-reviewers, area chairs, program chairs, or anyone who oversees the submission-reviewer matching process in peer review for academic conferences, journals, and grants.
 
-<center><img src="file/tool.
+<center><img src="file/tool-img.jpeg" width="70%" alt="general workflow"></center>
 
 #### How does it help?
 A typical meta-reviewer workflow lacks supportive information on **what makes the pre-selected candidate reviewers a good fit** for the submission. Only affinity scores between the reviewer and the paper are shown, without additional detail.
 
-R2P2 provides more information about each reviewer. It searches for the most relevant papers among the reviewer's previous publications and highlights relevant parts within them.
+R2P2 provides more information about each reviewer. It searches for the **most relevant papers** among the reviewer's previous publications and **highlights relevant parts** within them.
 """
     # TODO add instruction video link
     # More details (video, addendum)
-    more_details_instruction = """Check out <a href="", target="_blank">this video</a> for a quick demo of what R2P2 is and how it can help. You can find more details <a href="file/details.html", target="_blank">here</a
+    more_details_instruction = """Check out <a href="", target="_blank">this video</a> for a quick demo of what R2P2 is and how it can help. You can find more details <a href="file/details.html", target="_blank">here</a>, along with our privacy policy and disclaimer."""
 
     gr.Markdown(general_instruction)
     gr.HTML(more_details_instruction)
@@ -298,7 +303,7 @@ R2P2 provides more information about each reviewer. It searches for the most rel
         examples=[[example_title, example_submission, example_reviewer]],
         inputs=[title_input, abstract_text_input, author_id_input],
         cache_examples=False,
-        label="
+        label="Try out the following example input."
     )
 
     with gr.Row():
@@ -417,7 +422,7 @@ R2P2 provides more information about each reviewer. It searches for the most rel
 
 ---
 """
-
+    # TODO allow users to change the number of highlights to show?
     # show multiple papers in radio check box to select from
     paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
    with gr.Row():
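The app.py change above defaults to reusing the document model for sentence-level scoring, with a dedicated SentenceTransformer left as a commented-out alternative. Below is a minimal sketch (not part of the commit) of how either option would plug into the updated get_highlight_info call; the abstracts are made up and the model names follow the diff.

# Sketch only; abstracts are made up, model names are the ones shown in the diff.
import torch
from transformers import AutoTokenizer, AutoModel
from score import get_highlight_info

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

pretrained_model = 'allenai/specter2'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
doc_model = AutoModel.from_pretrained(pretrained_model).to(device)

# Option A (the new default): reuse the document model at the sentence level
sent_model = doc_model

# Option B (the commented-out alternative): a dedicated sentence encoder
# from sentence_transformers import SentenceTransformer
# sent_model = SentenceTransformer('sentence-transformers/gtr-t5-base')
# sent_model.to(device)

# The tokenizer is now passed through; score.py uses it only in the BERT-style branch.
submission_abstract = "We study automated reviewer assignment for peer review."       # made up
candidate_abstract = "This paper proposes an algorithm for paper-reviewer matching."  # made up
sent_ids, sent_scores, info, top_pairs_info = get_highlight_info(
    sent_model, tokenizer, submission_abstract, candidate_abstract, K=2
)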
input_format.py
CHANGED
@@ -66,7 +66,7 @@ def download_pdf(url, file_name):
 ## Input formatting for the given author (reviewer)
 # Extracting text from a link
 
-def get_text_from_author_id(author_id, max_count=
+def get_text_from_author_id(author_id, max_count=150):
     if author_id is None:
         raise ValueError('Input valid author ID')
     aid = str(author_id)
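The only change to input_format.py sets the default paper cap to 150. A hedged usage sketch follows; the author ID is made up and nothing is assumed about the return value beyond binding it to a name.

# Sketch only; the author ID is hypothetical and the return structure is not shown in this diff.
from input_format import get_text_from_author_id

reviewer_data = get_text_from_author_id('2109704343')                        # new default: max_count=150
reviewer_data_small = get_text_from_author_id('2109704343', max_count=20)    # smaller cap for quick tests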
score.py
CHANGED
@@ -1,20 +1,39 @@
-from sentence_transformers import util
+from sentence_transformers import util, SentenceTransformer
+from transformers import BertModel
 from nltk.tokenize import sent_tokenize
 from nltk import word_tokenize, pos_tag
 import torch
 import numpy as np
 import tqdm
 
-def compute_sentencewise_scores(model, query_sents, candidate_sents):
-
-
-
+def compute_sentencewise_scores(model, query_sents, candidate_sents, tokenizer=None):
+    if isinstance(model, SentenceTransformer):
+        # if the model is using SentenceTrasformer style
+        q_v, c_v = get_embedding(model, query_sents, candidate_sents)
+    elif isinstance(model, BertModel):
+        # if the model is BERT-style model using transformers library
+        inputs = tokenizer(
+            query_sents + candidate_sents,
+            padding=True,
+            truncation=True,
+            return_tensors="pt",
+            max_length=512
+        )
+        inputs.to(model.device)
+        result = model(**inputs)
+        embeddings = result.last_hidden_state[:, 0, :].detach().cpu().numpy()
+        q_v = embeddings[:len(query_sents)]
+        c_v = embeddings[len(query_sents):]
+    else:
+        raise ValueError('model not supported at the time')
+    assert(q_v.shape[1] == c_v.shape[1])
+    assert(q_v.shape[0] == len(query_sents))
+    assert(c_v.shape[0] == len(candidate_sents))
     return util.cos_sim(q_v, c_v)
 
 def get_embedding(model, query_sents, candidate_sents):
     q_v = model.encode(query_sents)
     c_v = model.encode(candidate_sents)
-
     return q_v, c_v
 
 def get_top_k(score_mat, K=3):
@@ -72,10 +91,10 @@ def get_match_phrase(w1, w2, method='pos'):
     pos1 = pos_tag(w1)
     pos2 = pos_tag(w2)
     for i, (w, p) in enumerate(pos2):
-
-
-
-
+        for j, (w_, p_) in enumerate(pos1):
+            if w.lower() == w_.lower() and p in include:
+                mask2[i] = 1
+                mask1[j] = 1
     return mask1, mask2
 
 def remove_spaces(words, attrs):
@@ -90,14 +109,14 @@ def remove_spaces(words, attrs):
     idx, single_q, double_q = 0, 0, 0
     while idx < len(words):
         # stick to the word that appears right before
-        if words[idx] in [',', '.', '%', ')', ':', '?', ';', "'s"]:
+        if words[idx] in [',', '.', '%', ')', ':', '?', ';', "'s", '”', "''"]:
            ww = word_out.pop()
            aa = attr_out.pop()
            word_out.append(ww + words[idx])
            attr_out.append(aa)
            idx += 1
        # stick to the word that appears right after
-        elif words[idx] in ["("]:
+        elif words[idx] in ["(", '“']:
            word_out.append(words[idx] + words[idx+1])
            attr_out.append(attrs[idx+1])
            idx += 2
@@ -141,6 +160,18 @@ def remove_spaces(words, attrs):
            word_out.append(words[idx] + words[idx+1])
            attr_out.append(attrs[idx+1])
            idx += 2
+        elif words[idx] == '``':
+            # this is opening quote: stick to the word after, but change to real double quote
+            word_out.append('"' + words[idx+1])
+            attr_out.append(attrs[idx+1])
+            idx += 2
+        elif words[idx] == "''":
+            # this is closing quote: stick to word before, but change to real double quote
+            ww = word_out.pop()
+            aa = attr_out.pop()
+            word_out.append(ww + '"')
+            attr_out.append(aa)
+            idx += 1
        else:
            word_out.append(words[idx])
            attr_out.append(attrs[idx])
@@ -193,7 +224,7 @@ def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scor
 
     return output
 
-def get_highlight_info(model, text1, text2, K=None):
+def get_highlight_info(model, tokenizer, text1, text2, K=None):
     """
     Get highlight information from two texts
     """
@@ -201,7 +232,7 @@ def get_highlight_info(model, text1, text2, K=None):
     sent2 = sent_tokenize(text2) # candidate
     if K is None: # if K is not set, select based on the length of the candidate
         K = int(len(sent2) / 3)
-    score_mat = compute_sentencewise_scores(model, sent1, sent2)
+    score_mat = compute_sentencewise_scores(model, sent1, sent2, tokenizer=tokenizer)
 
     sent_ids, sent_scores = get_top_k(score_mat, K=K)
     words2, all_words2, sent_start_id2 = get_words(sent2)
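The score.py rewrite makes compute_sentencewise_scores dispatch on the model type: SentenceTransformer models go through get_embedding(), while BERT-style transformers models are tokenized and embedded via the [CLS] token. A short sketch exercising both branches, using made-up sentences and the model names already present in the diff:

# Sketch only, not part of the commit; sentences are made up.
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from score import compute_sentencewise_scores

query = ["We propose a method for reviewer-paper matching."]
candidate = ["This work studies assignment algorithms for peer review.",
             "Experiments are run on conference submission data."]

# BERT-style branch: the tokenizer is required and the [CLS] embedding is used
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2')
bert_model = AutoModel.from_pretrained('allenai/specter2')
scores_bert = compute_sentencewise_scores(bert_model, query, candidate, tokenizer=tokenizer)

# SentenceTransformer branch: model.encode() is used and the tokenizer argument is ignored
st_model = SentenceTransformer('sentence-transformers/gtr-t5-base')
scores_st = compute_sentencewise_scores(st_model, query, candidate)

# Both return a (num_query_sents, num_candidate_sents) cosine-similarity matrix
print(scores_bert.shape, scores_st.shape)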