jskim committed on
Commit 6004e76
1 Parent(s): 77829a1

Adding a knob to control the number of highlights. Replacing the main model with specter2, and using specter2 for the sentence-level highlights as well.

Files changed (4)
  1. app.py +103 -38
  2. details.html +2 -2
  3. input_format.py +1 -16
  4. score.py +52 -40
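
As context for the model swap described in the commit message, here is a minimal sketch of reusing one SPECTER2-style encoder for both the document-level and the sentence-level scores, mirroring the `sent_model = doc_model` aliasing in `app.py`. The `allenai/specter2_base` checkpoint name and the mean-pooling step are illustrative assumptions, not necessarily the app's exact setup.

```python
# Sketch: one SPECTER2-style encoder reused for document- and sentence-level scoring.
# Checkpoint name and pooling are illustrative assumptions, not the app's exact code.
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
model = AutoModel.from_pretrained('allenai/specter2_base')
model.eval()

def embed(texts):
    # Tokenize a batch of texts and mean-pool the last hidden states.
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        out = model(**batch).last_hidden_state          # (B, T, H)
    mask = batch['attention_mask'].unsqueeze(-1)        # (B, T, 1)
    return (out * mask).sum(1) / mask.sum(1)            # (B, H)

# The same embed() can score whole abstracts (affinity) or individual sentences.
submission = "We propose a method for matching reviewers to papers."
candidate = "This paper studies automatic reviewer assignment."
sim = torch.nn.functional.cosine_similarity(embed([submission]), embed([candidate]))
print(float(sim))  # document-level affinity score
```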
app.py CHANGED
@@ -24,15 +24,14 @@ doc_model.to(device)
 sent_model = doc_model # have the same model for document and sentence level
 
 # OR specify different model for sentence level
-# sent_model = SentenceTransformer('sentence-transformers/gtr-t5-base')
-# sent_model.to(device)
+#sent_model = SentenceTransformer('sentence-transformers/gtr-t5-base')
+#sent_model.to(device)
 
 def get_similar_paper(
     title_input,
     abstract_text_input,
     author_id_input,
     results={}, # this state variable will be updated and returned
-    #progress=gr.Progress()
 ):
     progress = gr.Progress()
     num_papers_show = 10 # number of top papers to show from the reviewer
@@ -82,7 +81,7 @@ def get_similar_paper(
     print('obtaining highlights..')
     start = time.time()
     input_sentences = sent_tokenize(abstract_text_input)
-    num_sents = len(input_sentences)
+    num_input_sents = len(input_sentences)
 
     for aa, (tt, ab, ds, url) in enumerate(zip(titles, abstracts, doc_scores, paper_urls)):
         # Compute sent-level and phrase-level affinity scores for each papers
@@ -91,21 +90,26 @@
             tokenizer,
             abstract_text_input,
             ab,
-            K=2 # top two sentences from the candidate
+            K=None, # top two sentences from the candidate
+            top_pair_num=3, # top five sentence pairs to show upfront
         )
+        num_cand_sents = sent_ids.shape[1]
 
         # get scores for each word in the format for Gradio Interpretation component
         word_scores = dict()
-        for i in range(num_sents):
-            ww, ss = remove_spaces(info['all_words'], info[i]['scores'])
-            word_scores[str(i)] = {
-                "original": ab,
-                "interpretation": list(zip(ww, ss))
-            }
+        for i in range(num_input_sents):
+            word_scores[str(i)] = dict()
+            for j in range(1, num_cand_sents+1):
+                ww, ss = remove_spaces(info['all_words'], info[i][j]['scores'])
+                word_scores[str(i)][str(j)] = {
+                    "original": ab,
+                    "interpretation": list(zip(ww, ss))
+                }
 
         results[display_title[aa]] = {
             'title': tt,
             'abstract': ab,
+            'num_cand_sents': num_cand_sents,
             'doc_score': '%0.3f'%ds,
             'source_sentences': input_sentences,
             'highlight': word_scores,
@@ -117,6 +121,9 @@
     highlight_time = end - start
     print('done in [%0.2f] seconds'%(highlight_time))
 
+    # debugging only
+    pickle.dump(results, open('info.pkl', 'wb'))
+
     ## Set up output elements
 
     # first the list of top papers, sentences to select from, paper_title, affinity
@@ -180,13 +187,13 @@
     assert(len(out) == (top_num_info_show * 5 + 2) * top_papers_show + 5)
 
     out += [gr.update(value="""
-<h3>Top three relevant papers by the reviewer <a href="%s" target="_blank">%s</a></h3>
-
-For each paper, two sentence pairs (one from the submission, one from the paper) with the highest relevance scores are shown.
-
-**<span style="color:black;background-color:#65B5E3;">Blue highlights</span>**: phrases that appear in both sentences.
-"""%(author_id_input, results['name']),
-visible=True)] # result 1 description
+<h3>Top three relevant papers by the reviewer <a href="%s" target="_blank">%s</a></h3>
+
+For each paper, two sentence pairs (one from the submission, one from the paper) with the highest relevance scores are shown.
+
+**<span style="color:black;background-color:#65B5E3;">Blue highlights</span>**: phrases that appear in both sentences.
+"""%(author_id_input, results['name']),
+visible=True)] # result 1 description
 
     out += [gr.update(visible=True), gr.update(visible=True)] # demarcation line between results
 
@@ -195,11 +202,14 @@
 
     # result 2 description
     desc = """
-##### Click a paper by %s on the left (sorted by affinity scores), and a sentence from the submission on the right, to see which parts the paper are relevant.
+##### Click a paper by %s on the left (sorted by affinity scores), and a sentence from the submission on the right, to see which parts of the paper are relevant.
     """%results['name']
     out += [gr.update(value=desc)]
 
-    # add the search results to pass on to the Gradio State varaible
+    # slider to control the number of highlights
+    out += [gr.update(value=1, maximum=len(sent_tokenize(abstracts[0])))]
+
+    # finally add the search results to pass on to the Gradio State varaible
     out += [results]
 
     return tuple(out)
@@ -213,6 +223,7 @@ def show_more(info):
         gr.update(visible=True), # title row
         gr.update(visible=True), # affinity row
         gr.update(visible=True), # highlight legend
+        gr.update(visible=True), # highlight slider
         gr.update(visible=True), # highlight abstract
     )
 
@@ -226,33 +237,59 @@ def update_name(author_id_input):
 
     return gr.update(value=name)
 
-def change_sentence(selected_papers_radio, source_sent_choice, info={}):
+def change_sentence(
+    selected_papers_radio,
+    source_sent_choice,
+    highlight_slider,
+    info={}
+):
     # change the output highlight based on the sentence selected from the submission
     if len(info.keys()) != 0: # if the info is not empty
         source_sents = info[selected_papers_radio]['source_sentences']
         highlights = info[selected_papers_radio]['highlight']
-        for i, s in enumerate(source_sents):
-            if source_sent_choice == s:
-                return highlights[str(i)]
+        idx = source_sents.index(source_sent_choice)
+        return highlights[str(idx)][str(highlight_slider)]
     else:
         return
 
-def change_paper(selected_papers_radio, source_sent_choice, info={}):
+def change_paper(
+    selected_papers_radio,
+    source_sent_choice,
+    highlight_slider,
+    info={}
+):
     if len(info.keys()) != 0: # if the info is not empty
         source_sents = info[selected_papers_radio]['source_sentences']
         title = info[selected_papers_radio]['title']
+        num_sents = info[selected_papers_radio]['num_cand_sents']
        abstract = info[selected_papers_radio]['abstract']
         aff_score = info[selected_papers_radio]['doc_score']
         highlights = info[selected_papers_radio]['highlight']
         url = info[selected_papers_radio]['url']
         title_out = """<a href="%s" target="_blank"><h5>%s</h5></a>"""%(url, title)
         aff_score_out = '##### Affinity Score: %s'%aff_score
-        for i, s in enumerate(source_sents):
-            if source_sent_choice == s:
-                return title_out, abstract, aff_score_out, highlights[str(i)]
+        idx = source_sents.index(source_sent_choice)
+        if highlight_slider <= num_sents:
+            return title_out, abstract, aff_score_out, highlights[str(idx)][str(highlight_slider)], gr.update(value=highlight_slider, maximum=num_sents)
+        else: # if the slider is set to more than the current number of sentences, show the max number of highlights
+            return title_out, abstract, aff_score_out, highlights[str(idx)][str(num_sents)], gr.update(value=num_sents, maximum=num_sents)
     else:
         return
 
+def change_num_highlight(
+    selected_papers_radio,
+    source_sent_choice,
+    highlight_slider,
+    info={}
+):
+    if len(info.keys()) != 0: # if the info is not empty
+        source_sents = info[selected_papers_radio]['source_sentences']
+        highlights = info[selected_papers_radio]['highlight']
+        idx = source_sents.index(source_sent_choice)
+        return highlights[str(idx)][str(highlight_slider)]
+    else:
+        return
+
 with gr.Blocks(css='style.css') as demo:
     info = gr.State({}) # cached search results as a State variable shared throughout
 
@@ -281,6 +318,7 @@ R2P2 provides more information about each reviewer. It searches for the **most r
     gr.HTML(more_details_instruction)
     gr.Markdown("""---""")
 
+
     ### INPUT
     with gr.Row() as input_row:
         with gr.Column(scale=3):
@@ -311,7 +349,6 @@ R2P2 provides more information about each reviewer. It searches for the **most r
 
     with gr.Row():
         search_status = gr.Textbox(label='Search Status', interactive=False, visible=False)
-
 
 
     ### OVERVIEW
@@ -416,13 +453,11 @@ R2P2 provides more information about each reviewer. It searches for the **most r
 
     # Highlight description
     hl_desc = """
-**<span style="color:black;background-color:#DB7262;">Red</span>**: sentences simiar to the selected sentence from submission. Darker = more similar.
-
-**<span style="color:black;background-color:#65B5E3;">Blue</span>**: phrases that appear in both sentences.
-
----
-"""
-    # TODO allow users to change the number of highlights to show?
+<font size="2">**<span style="color:black;background-color:#DB7262;">Red</span>**: sentences simiar to the selected sentence from submission. Darker = more similar.</font>
+
+<font size="2">**<span style="color:black;background-color:#65B5E3;">Blue</span>**: phrases that appear in both sentences.</font>
+"""
+    #---"""
     # show multiple papers in radio check box to select from
     paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
     with gr.Row():
@@ -442,17 +477,29 @@ R2P2 provides more information about each reviewer. It searches for the **most r
         with gr.Column(scale=3):
             # selected paper and highlight
             with gr.Row():
+                # slider for highlight amount
+                highlight_slider = gr.Slider(
+                    label='Number of Highlighted Sentences',
+                    minimum=1,
+                    maximum=15,
+                    step=1,
+                    value=2,
+                    visible=False
+                )
+            with gr.Row():
+                # highlight legend
                 highlight_legend = gr.Markdown(value=hl_desc, visible=False)
             with gr.Row(visible=False) as title_row:
+                # selected paper title
                 paper_title = gr.Markdown(value='')
             with gr.Row(visible=False) as aff_row:
+                # selected paper's affinity score
                 affinity = gr.Markdown(value='')
             with gr.Row(visible=False) as hl_row:
                 # highlighted text from paper
                 highlight = gr.components.Interpretation(paper_abstract)
 
 
-
     ### EVENT LISTENERS
 
     compute_btn.click(
@@ -517,6 +564,7 @@ R2P2 provides more information about each reviewer. It searches for the **most r
             demarc2,
             search_status,
             result2_desc,
+            highlight_slider,
            info,
         ],
         show_progress=True,
@@ -534,7 +582,8 @@ R2P2 provides more information about each reviewer. It searches for the **most r
             title_row,
             aff_row,
             highlight_legend,
-            hl_row
+            highlight_slider,
+            hl_row,
         ]
     )
 
@@ -544,6 +593,7 @@ R2P2 provides more information about each reviewer. It searches for the **most r
         inputs=[
             selected_papers_radio,
             source_sentences,
+            highlight_slider,
            info
         ],
         outputs=highlight
@@ -555,12 +605,27 @@ R2P2 provides more information about each reviewer. It searches for the **most r
         inputs=[
             selected_papers_radio,
             source_sentences,
+            highlight_slider,
            info,
         ],
         outputs= [
             paper_title,
             paper_abstract,
             affinity,
+            highlight,
+            highlight_slider
+        ]
+    )
+
+    highlight_slider.change(
+        fn=change_num_highlight,
+        inputs=[
+            selected_papers_radio,
+            source_sentences,
+            highlight_slider,
+            info
+        ],
+        outputs=[
             highlight
         ]
     )
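
The new `highlight_slider` follows the usual Gradio pattern: a `Slider` whose `.change` event re-renders the highlight output for the current selection. A stripped-down sketch of that wiring, with a toy `render` function standing in for the app's precomputed `highlights[str(sent_idx)][str(k)]` lookup (component names below are illustrative, not the app's actual code):

```python
# Minimal sketch of the slider-driven re-rendering pattern used in app.py.
# Component names and the render() logic are illustrative only.
import gradio as gr

SENTENCES = ["First candidate sentence.", "Second one.", "Third one.", "Fourth one."]

def render(k):
    # Show the top-k "highlighted" sentences; app.py instead looks up
    # precomputed word scores keyed by the sentence index and k.
    return " ".join(SENTENCES[:int(k)])

with gr.Blocks() as demo:
    slider = gr.Slider(minimum=1, maximum=len(SENTENCES), step=1, value=2,
                       label='Number of Highlighted Sentences')
    preview = gr.Textbox(label='Highlighted abstract (toy)')
    slider.change(fn=render, inputs=slider, outputs=preview)

if __name__ == '__main__':
    demo.launch()
```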
details.html CHANGED
@@ -9,8 +9,8 @@ The tool is developed by <a href="https://wnstlr.github.io", target="_blank">Joo
 <h1>What Happens Behind the Scenes</h1>
 <ul>
 <li> The tool retrieves the reviewer's previous publications using <a href="https://www.semanticscholar.org/product/api", target="_blank">Semantic Scholar API</a>.</li>
-<li> The tool computes the affinity score between the submission abstract and each paper's abstract, using text representations from a <a href="https://github.com/allenai/specter/tree/master/specter", target="_blank">language model fine-tuned on academic papers</a>.</li>
-<li> The tool computes pairwise sentence relevance scores between the submission abstract and the reviewer paper's abstract, using text representations from a <a href="https://huggingface.co/sentence-transformers/gtr-t5-base", target="_blank">sentence-level langauge model</a>.</li>
+<li> The tool computes the affinity score between the submission abstract and each paper's abstract, using text representations from a <a href="https://huggingface.co/allenai/specter2", target="_blank">language model fine-tuned on academic papers</a>.</li>
+<li> The tool then computes pairwise sentence relevance scores between the submission abstract and the reviewer paper's abstract, using text representations from <a href="https://huggingface.co/allenai/specter2", target="_blank">the same model</a>.</li>
 <li> The tool highlights overlapping words (nouns) between setence pairs using <a href="https://www.nltk.org/book/ch05.html", target="_blank">POS tagging</a>.</li>
 </ul>
 
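The last bullet above (overlapping nouns between sentence pairs) corresponds to the `get_match_phrase` step in `score.py`. Below is a rough sketch of that idea with NLTK POS tagging; the `overlapping_nouns` function is hypothetical and only illustrates the approach, not the app's exact logic.

```python
# Rough sketch of the noun-overlap highlighting idea using NLTK POS tags.
# overlapping_nouns() is a hypothetical helper mirroring what score.py's
# get_match_phrase appears to do; details may differ.
import nltk
from nltk import word_tokenize, pos_tag

nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

def overlapping_nouns(query_sent, cand_sent):
    # Collect nouns (tags starting with NN) from the query sentence...
    query_nouns = {w.lower() for w, t in pos_tag(word_tokenize(query_sent)) if t.startswith('NN')}
    # ...and return candidate words that are nouns appearing in the query.
    return [w for w, t in pos_tag(word_tokenize(cand_sent))
            if t.startswith('NN') and w.lower() in query_nouns]

print(overlapping_nouns(
    "We match reviewers to submissions using paper abstracts.",
    "Abstracts of previous papers are used to rank reviewers."
))  # nouns shared with the query, e.g. ['Abstracts', 'reviewers']
```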
input_format.py CHANGED
@@ -81,19 +81,4 @@ def get_text_from_author_id(author_id, max_count=150):
     papers = data['papers'][:max_count]
     name = data['name']
 
-    return name, papers
-
-## TODO Preprocess Extracted Texts from PDFs
-# Get a portion of the text for actual task
-
-def get_title(text):
-    pass
-
-def get_abstract(text):
-    pass
-
-def get_introduction(text):
-    pass
-
-def get_conclusion(text):
-    pass
+    return name, papers
score.py CHANGED
@@ -40,12 +40,9 @@ def get_top_k(score_mat, K=3):
     """
     Pick top K sentences to show
     """
-    idx = torch.argsort(-score_mat)
-    picked_sent = idx[:,:K]
-    picked_scores = torch.vstack(
-        [score_mat[i,picked_sent[i]] for i in range(picked_sent.shape[0])]
-    )
-
+    picked_scores, picked_sent = torch.sort(-score_mat, axis=1)
+    picked_sent = picked_sent[:,:K]
+    picked_scores = -picked_scores[:,:K]
     return picked_sent, picked_scores
 
 def get_words(sent):
@@ -57,7 +54,6 @@ def get_words(sent):
     sent_start_id = [] # keep track of the word index where the new sentence starts
     counter = 0
     for x in sent:
-        #w = x.split()
         w = word_tokenize(x)
         nw = len(w)
         counter += nw
@@ -180,11 +176,21 @@ def remove_spaces(words, attrs):
     assert(len(word_out) == len(attr_out))
     return word_out, attr_out
 
+def scale_scores(arr, vmin=0.1, vmax=1):
+    # rescale positive and negative attributions to be between vmin and vmax.
+    # while keeping 0 at 0.
+    pos_max, pos_min = np.max(arr[arr > 0]), np.min(arr[arr > 0])
+    out = (arr - pos_min) / (pos_max - pos_min) * (vmax - vmin) + vmin
+    idx = np.where(arr == 0.0)[0]
+    out[idx] = 0.0
+    return out
+
 def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
     """
     Mark the words that are highlighted, both by in terms of sentence and phrase
     """
     num_query_sent = sent_ids.shape[0]
+    num_cand_sent = sent_ids.shape[1]
     num_words = len(all_words)
 
     output = dict()
@@ -193,53 +199,59 @@ def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scor
 
     # for each query sentence, mark the highlight information
     for i in range(num_query_sent):
-        query_words = word_tokenize(query_sents[i])
-        is_selected_sent = np.zeros(num_words)
-        is_selected_phrase = np.zeros(num_words)
-        word_scores = np.zeros(num_words)
-
-        # for each selected sentences from the candidate, compile information
-        for sid, sscore in zip(sent_ids[i], sent_scores[i]):
-            #print(len(sent_start_id), sid, sid+1)
-            if sid+1 < len(sent_start_id):
-                sent_range = (sent_start_id[sid], sent_start_id[sid+1])
-                is_selected_sent[sent_range[0]:sent_range[1]] = 1
-                word_scores[sent_range[0]:sent_range[1]] = sscore
-                _, is_selected_phrase[sent_range[0]:sent_range[1]] = \
-                    get_match_phrase(query_words, all_words[sent_range[0]:sent_range[1]])
-            else:
-                is_selected_sent[sent_start_id[sid]:] = 1
-                word_scores[sent_start_id[sid]:] = sscore
-                _, is_selected_phrase[sent_start_id[sid]:] = \
-                    get_match_phrase(query_words, all_words[sent_start_id[sid]:])
-
-        # update selected phrase scores (-1 meaning a different color in gradio)
-        word_scores[is_selected_sent+is_selected_phrase==2] = -0.5
-
-        output[i] = {
-            'is_selected_sent': is_selected_sent,
-            'is_selected_phrase': is_selected_phrase,
-            'scores': word_scores
-        }
+        output[i] = dict()
+        for j in range(1, num_cand_sent+1): # for each number of selected sentences from candidate
+            query_words = word_tokenize(query_sents[i])
+            is_selected_sent = np.zeros(num_words)
+            is_selected_phrase = np.zeros(num_words)
+            word_scores = np.zeros(num_words)
+
+            # for each selected sentences from the candidate, compile information
+            for sid, sscore in zip(sent_ids[i][:j], sent_scores[i][:j]):
+                #print(len(sent_start_id), sid, sid+1)
+                if sid+1 < len(sent_start_id):
+                    sent_range = (sent_start_id[sid], sent_start_id[sid+1])
+                    is_selected_sent[sent_range[0]:sent_range[1]] = 1
+                    word_scores[sent_range[0]:sent_range[1]] = sscore
+                    _, is_selected_phrase[sent_range[0]:sent_range[1]] = \
+                        get_match_phrase(query_words, all_words[sent_range[0]:sent_range[1]])
+                else:
+                    is_selected_sent[sent_start_id[sid]:] = 1
+                    word_scores[sent_start_id[sid]:] = sscore
+                    _, is_selected_phrase[sent_start_id[sid]:] = \
+                        get_match_phrase(query_words, all_words[sent_start_id[sid]:])
+
+            # scale the word_scores: maximum value gets the darkest, minimum value gets the lightest color
+            if j > 1:
+                word_scores = scale_scores(word_scores)
+
+            # update selected phrase scores (-1 meaning a different color in gradio)
+            word_scores[is_selected_sent+is_selected_phrase==2] = -0.5
+
+            output[i][j] = {
+                'is_selected_sent': is_selected_sent,
+                'is_selected_phrase': is_selected_phrase,
+                'scores': word_scores
+            }
 
     return output
 
-def get_highlight_info(model, tokenizer, text1, text2, K=None):
+def get_highlight_info(model, tokenizer, text1, text2, K=None, top_pair_num=5):
     """
     Get highlight information from two texts
     """
     sent1 = sent_tokenize(text1) # query
     sent2 = sent_tokenize(text2) # candidate
-    if K is None: # if K is not set, select based on the length of the candidate
-        K = int(len(sent2) / 3)
     score_mat = compute_sentencewise_scores(model, sent1, sent2, tokenizer=tokenizer)
-
+
+    if K is None: # if K is not set, get all information
+        K = score_mat.shape[1]
+
     sent_ids, sent_scores = get_top_k(score_mat, K=K)
     words2, all_words2, sent_start_id2 = get_words(sent2)
     info = mark_words(sent1, words2, all_words2, sent_start_id2, sent_ids, sent_scores)
 
-    # get top sentence pairs from the query and candidate (score, index_pair)
-    top_pair_num = 5
+    # get top sentence pairs from the query and candidate (score, index_pair) to show upfront
     top_pairs = []
     ii = np.unravel_index(np.argsort(np.array(sent_scores).ravel())[-top_pair_num:], sent_scores.shape)
     for i, j in zip(ii[0][::-1], ii[1][::-1]):
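
To see what the reworked `get_top_k` and the new `scale_scores` do, here is a small self-contained check. Both functions are lightly adapted from the diff above (using `dim=1` rather than `axis=1` in `torch.sort`), and the toy score matrix and word scores are made up for illustration.

```python
# Toy check of the top-k selection and score rescaling added in this commit.
# The inputs below are made up; the two functions mirror the diff above.
import numpy as np
import torch

def get_top_k(score_mat, K=3):
    # Sort each row in descending order (via negation) and keep the first K
    # columns, returning both sentence indices and their scores.
    picked_scores, picked_sent = torch.sort(-score_mat, dim=1)
    picked_sent = picked_sent[:, :K]
    picked_scores = -picked_scores[:, :K]
    return picked_sent, picked_scores

def scale_scores(arr, vmin=0.1, vmax=1):
    # Rescale positive attributions into [vmin, vmax] while keeping zeros at zero.
    pos_max, pos_min = np.max(arr[arr > 0]), np.min(arr[arr > 0])
    out = (arr - pos_min) / (pos_max - pos_min) * (vmax - vmin) + vmin
    out[np.where(arr == 0.0)[0]] = 0.0
    return out

score_mat = torch.tensor([[0.2, 0.9, 0.5],
                          [0.7, 0.1, 0.4]])
ids, scores = get_top_k(score_mat, K=2)
print(ids)     # tensor([[1, 2], [0, 2]])
print(scores)  # tensor([[0.9000, 0.5000], [0.7000, 0.4000]])

word_scores = np.array([0.0, 0.5, 0.9, 0.0, 0.2])
print(scale_scores(word_scores))  # zeros stay 0; 0.2 -> 0.1, 0.9 -> 1.0
```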