Spaces:

malteos
/

aspect-based-paper-similarity

Runtime error

App Files Files Community

malteos committed on Mar 21, 2022

Commit

7a888ed

1 Parent(s): 42610a4

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -101

app.py CHANGED Viewed

@@ -69,15 +69,10 @@ def st_load_dataset(name_or_path):
         if isinstance(dataset, DatasetDict):
             dataset = dataset['train']
-        # load existing faiss
         for a in aspects:
             dataset.load_faiss_index(f'{a}_embeddings', f'{a}_embeddings.faiss')
-        # add faiss
-        #dataset.add_faiss_index(column=f'{aspect}_embeddings')
-        #loaded_dataset.add_faiss_index(column='method_embeddings')
-        #loaded_dataset.add_faiss_index(column='dataset_embeddings')
     return dataset
@@ -99,64 +94,58 @@ def get_paper(doc_id):
 def find_related_papers(paper_id, user_aspect):
-    # Add result to session
-    paper = get_paper(paper_id)
-    if paper is None or 'title' not in paper or 'abstract' not in paper:
-        raise ValueError('Could not retrieve data for input paper')
-    title_abs = paper['title'] + ': ' + paper['abstract']
-    # preprocess the input
-    inputs = tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512)
-    # inference
-    outputs = aspect_to_model[user_aspect](**inputs)
-    # logger.info(f'attention_mask: {inputs["attention_mask"].shape}')
-    #
-    # logger.info(f'Outputs: {outputs["last_hidden_state"]}')
-    # logger.info(f'Outputs: {outputs["last_hidden_state"].shape}')
-    # Mean pool the token-level embeddings to get sentence-level embeddings
-    embeddings = torch.sum(
-        outputs["last_hidden_state"] * inputs['attention_mask'].unsqueeze(-1), dim=1
-    ) / torch.clamp(torch.sum(inputs['attention_mask'], dim=1, keepdims=True), min=1e-9)
-    result = dict(
-        paper=paper,
-        aspect=user_aspect,
-    )
-    result.update(dict(
-        #embeddings=embeddings.tolist(),
-    ))
-    # Retrieval
-    prompt = embeddings.detach().numpy()[0]
-    scores, retrieved_examples = dataset.get_nearest_examples(f'{user_aspect}_embeddings', prompt, k=10)
-    result.update(dict(
-        related_papers=retrieved_examples,
-    ))
-    # st.session_state.results.append(result)
     return result
-# # Start session
-# if 'results' not in st.session_state:
-#     st.session_state.results = []
 # Page
 st.title('Aspect-based Paper Similarity')
 st.markdown("""This demo showcases [Specialized Document Embeddings for Aspect-based Research Paper Similarity](#TODO).""")
 # Introduction
-st.markdown(f"""The model was trained using a triplet loss on machine learning papers from the [paperswithcode.com](https://paperswithcode.com/) corpus with the objective of pulling embeddings of papers with the same task, method, or datasetclose together. For a more comprehensive overview of the model check out the [model card on 🤗 Model Hub]({model_hub_url}) or read [our paper](#TODO).
-""")
 st.markdown("""Enter a ArXiv ID or a DOI of a paper for that you want find similar papers.
 Try it yourself! 👇""",
@@ -170,19 +159,20 @@ with st.form("aspect-input", clear_on_submit=False):
         placeholder='Any DOI, ACL, or ArXiv ID'
     )
     example = st.selectbox(
-        label='Or select example',
-        options=[
-            "arXiv:2202.06671",
-            '10.1016/j.eswa.2019.06.026'
-        ]
     )
-    # click_clear = st.button('clear text input', key=1)
-    # if click_clear:
-    #     paper_id = st.text_input(
-    #         label='Enter paper ID (arXiv:<arxiv_id>, or <doi>):', value="XXX", placeholder='123')
     user_aspect = st.radio(
         label="In what aspect are you interested?",
         options=aspects
@@ -194,61 +184,29 @@ with st.form("aspect-input", clear_on_submit=False):
 # Listener
 if submitted:
     if paper_id or example:
-        with st.spinner('Finding related papers...'):
-            try:
-                result = find_related_papers(paper_id if paper_id else example, user_aspect)
-                input_paper = result['paper']
-                related_papers = result['related_papers']
-                # with st.empty():
-                st.markdown(
-                    f'''Your input paper: \n\n<a href="{input_paper['url']}"><b>{input_paper['title']}</b></a> ({input_paper['year']})<hr />''',
-                    unsafe_allow_html=True)
-                related_html = '<ul>'
-                for i in range(len(related_papers['paper_id'])):
-                    related_html += f'''<li><a href="{related_papers['url_abs'][i]}">{related_papers['title'][i]}</a></li>'''
-                related_html += '</ul>'
-                st.markdown(f'''Related papers with similar {result['aspect']}: {related_html}''', unsafe_allow_html=True)
-            except (TypeError, ValueError, KeyError) as e:
-                st.error(f'**Error**: {e}')
     else:
         st.error('**Error**: No paper ID provided. Please provide a ArXiv ID or DOI.')
-# # Results
-# if 'results' in st.session_state and st.session_state.results:
-#     first = True
-#     for result in st.session_state.results[::-1]:
-#         if not first:
-#             st.markdown("---")
-#         # st.markdown(f"ID:\n> {result['paperId']}")
-#         # col_1, col_2, col_3 = st.columns([1,2,2])
-#         # col_1.metric(label='', value=json.dumps(result))
-#         # col_2.metric(label='Label', value=f"fooo")
-#         # col_3.metric(label='Score', value=f"123")
-#         input_paper = result['paper']
-#         related_papers = result['related_papers']
-#
-#         # with st.empty():
-#
-#         st.markdown(f'''Your input paper: \n\n<a href="{input_paper['url']}"><b>{input_paper['title']}</b></a> ({input_paper['year']})<hr />''', unsafe_allow_html=True)
-#
-#         related_html = '<ul>'
-#
-#         for i in range(len(related_papers['paper_id'])):
-#             related_html += f'''<li><a href="{related_papers['url_abs'][i]}">{related_papers['title'][i]}</a></li>'''
-#
-#         related_html += '</ul>'
-#
-#         st.markdown(f'''Related papers with similar {result['aspect']}: {related_html}''', unsafe_allow_html=True)
-#
-#         # st.markdown(f'''Related papers: {related_html}''', unsafe_allow_html=True)
-#
-#         first = False

         if isinstance(dataset, DatasetDict):
             dataset = dataset['train']
+        # load existing FAISS index for each aspect
         for a in aspects:
             dataset.load_faiss_index(f'{a}_embeddings', f'{a}_embeddings.faiss')
     return dataset
 def find_related_papers(paper_id, user_aspect):
+    with st.spinner('Searching for related papers...'):
+        paper = get_paper(paper_id)
+        if paper is None or 'title' not in paper or paper['title'] is None or 'abstract' not in paper or paper['abstract'] is None:
+            raise ValueError(f'Could not retrieve title and abstract for input paper: {paper_id}')
+        title_abs = paper['title'] + ': ' + paper['abstract']
+        # preprocess the input
+        inputs = tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512)
+        # inference
+        outputs = aspect_to_model[user_aspect](**inputs)
+        # logger.info(f'attention_mask: {inputs["attention_mask"].shape}')
+        #
+        # logger.info(f'Outputs: {outputs["last_hidden_state"]}')
+        # logger.info(f'Outputs: {outputs["last_hidden_state"].shape}')
+        # Mean pool the token-level embeddings to get sentence-level embeddings
+        embeddings = torch.sum(
+            outputs["last_hidden_state"] * inputs['attention_mask'].unsqueeze(-1), dim=1
+        ) / torch.clamp(torch.sum(inputs['attention_mask'], dim=1, keepdims=True), min=1e-9)
+        result = dict(
+            paper=paper,
+            aspect=user_aspect,
+        )
+        result.update(dict(
+            #embeddings=embeddings.tolist(),
+        ))
+        # Retrieval
+        prompt = embeddings.detach().numpy()[0]
+        scores, retrieved_examples = dataset.get_nearest_examples(f'{user_aspect}_embeddings', prompt, k=10)
+        result.update(dict(
+            related_papers=retrieved_examples,
+        ))
     return result
 # Page
 st.title('Aspect-based Paper Similarity')
 st.markdown("""This demo showcases [Specialized Document Embeddings for Aspect-based Research Paper Similarity](#TODO).""")
 # Introduction
+st.markdown(f"""The model was trained using a triplet loss on machine learning papers from the [paperswithcode.com](https://paperswithcode.com/) corpus with the objective of pulling embeddings of papers with the same task, method, or datasetclose together.
+For a more comprehensive overview of the model check out the [model card on 🤗 Model Hub]({model_hub_url}) or read [our paper](#TODO).""")
 st.markdown("""Enter a ArXiv ID or a DOI of a paper for that you want find similar papers.
 Try it yourself! 👇""",
         placeholder='Any DOI, ACL, or ArXiv ID'
     )
+    example_labels = {
+        "arXiv:1902.06818": "Data augmentation for low resource sentiment analysis using generative adversarial networks",
+        "arXiv:2202.06671": "Neighborhood Contrastive Learning for Scientific Document Representations with Citation Embeddings",
+        "ACL:N19-1423": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
+        "10.18653/v1/S16-1001": "SemEval-2016 Task 4: Sentiment Analysis in Twitter",
+        "10.1145/3065386": "ImageNet classification with deep convolutional neural networks",
+    }
     example = st.selectbox(
+        label='Or select an example:',
+        options=list(example_labels.keys()),
+        format_func=lambda option_key: f'{example_labels[option_key]} ({option_key})',
     )
     user_aspect = st.radio(
         label="In what aspect are you interested?",
         options=aspects
 # Listener
 if submitted:
     if paper_id or example:
+        try:
+            result = find_related_papers(paper_id if paper_id else example, user_aspect)
+            input_paper = result['paper']
+            related_papers = result['related_papers']
+            # with st.empty():
+            st.markdown(
+                f'''Your input paper: \n\n<a href="{input_paper['url']}"><b>{input_paper['title']}</b></a> ({input_paper['year']})<hr />''',
+                unsafe_allow_html=True)
+            related_html = '<ul>'
+            for i in range(len(related_papers['paper_id'])):
+                related_html += f'''<li><a href="{related_papers['url_abs'][i]}">{related_papers['title'][i]}</a></li>'''
+            related_html += '</ul>'
+            st.markdown(f'''Related papers with similar {result['aspect']}: {related_html}''', unsafe_allow_html=True)
+        except (TypeError, ValueError, KeyError) as e:
+            st.error(f'**Error**: {e}')
     else:
         st.error('**Error**: No paper ID provided. Please provide a ArXiv ID or DOI.')