ArneBinder committed on
Commit
b77f1d0
1 Parent(s): 4467900

Upload 7 files

Browse files
Files changed (2) hide show
  1. app.py +77 -17
  2. backend.py +3 -4
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import json
2
  import logging
3
  import os.path
 
4
  from functools import partial
5
  from typing import Dict, List, Optional, Tuple
6
 
@@ -71,7 +72,7 @@ def process_uploaded_files(
71
  str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
72
  ],
73
  vector_store: VectorStore[Tuple[str, str]],
74
- ) -> None:
75
  try:
76
  for file_name in file_names:
77
  if file_name.lower().endswith(".txt"):
@@ -86,6 +87,8 @@ def process_uploaded_files(
86
  except Exception as e:
87
  raise gr.Error(f"Failed to process uploaded files: {e}")
88
 
 
 
89
 
90
def open_accordion():
    """Build a Gradio Accordion update that expands the accordion."""
    opened = gr.Accordion(open=True)
    return opened
@@ -181,6 +184,41 @@ def set_relation_types(
181
  )
182
 
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  def main():
185
 
186
  example_text = "Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent."
@@ -283,15 +321,22 @@ def main():
283
  )
284
 
285
  with gr.Column(scale=1):
286
- with gr.Accordion("Indexed Documents", open=False):
 
 
287
  processed_documents_df = gr.DataFrame(
288
  headers=["id", "num_adus", "num_relations"],
289
  interactive=False,
290
  )
 
 
 
 
 
291
 
292
- with gr.Accordion("Reference ADU", open=False):
293
- reference_adu_id = gr.Textbox(label="ID", elem_id="reference_adu_id")
294
- reference_adu_text = gr.Textbox(label="Text")
295
 
296
  with gr.Accordion("Retrieval Configuration", open=False):
297
  min_similarity = gr.Slider(
@@ -318,13 +363,14 @@ def main():
318
  relevant_adus = gr.DataFrame(
319
  label="Relevant ADUs from other documents",
320
  headers=[
321
- "text",
322
  "relation",
323
- "doc_id",
324
  "reference_adu",
 
325
  "sim_score",
326
  "rel_score",
327
  ],
 
328
  )
329
 
330
  render_event_kwargs = dict(
@@ -354,12 +400,10 @@ def main():
354
  )
355
 
356
  upload_btn.upload(
 
 
357
  fn=process_uploaded_files,
358
  inputs=[upload_btn, models_state, processed_documents_state, vector_store_state],
359
- outputs=[],
360
- ).success(
361
- fn=update_processed_documents_df,
362
- inputs=[processed_documents_state],
363
  outputs=[processed_documents_df],
364
  )
365
  processed_documents_df.select(
@@ -368,30 +412,46 @@ def main():
368
  outputs=[document_state],
369
  )
370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  retrieve_relevant_adus_event_kwargs = dict(
372
  fn=get_relevant_adus,
373
  inputs=[
374
- reference_adu_id,
375
  document_state,
376
  vector_store_state,
377
  processed_documents_state,
378
  min_similarity,
379
  top_k,
380
  relation_types,
 
381
  ],
382
  outputs=[relevant_adus],
383
  )
384
 
385
- reference_adu_id.change(
386
  fn=partial(get_annotation_from_document, annotation_layer="labeled_spans"),
387
- inputs=[document_state, reference_adu_id],
388
- outputs=[reference_adu_text],
389
  ).success(**retrieve_relevant_adus_event_kwargs)
390
 
391
  retrieve_similar_adus_btn.click(
392
  fn=get_similar_adus,
393
  inputs=[
394
- reference_adu_id,
395
  document_state,
396
  vector_store_state,
397
  processed_documents_state,
@@ -475,7 +535,7 @@ def main():
475
  }
476
  function setReferenceAduId(entityId) {
477
  // get the textarea element that holds the reference adu id
478
- let referenceAduIdDiv = document.querySelector('#reference_adu_id textarea');
479
  // set the value of the input field
480
  referenceAduIdDiv.value = entityId;
481
  // trigger an input event to update the state
 
1
  import json
2
  import logging
3
  import os.path
4
+ import tempfile
5
  from functools import partial
6
  from typing import Dict, List, Optional, Tuple
7
 
 
72
  str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
73
  ],
74
  vector_store: VectorStore[Tuple[str, str]],
75
+ ) -> pd.DataFrame:
76
  try:
77
  for file_name in file_names:
78
  if file_name.lower().endswith(".txt"):
 
87
  except Exception as e:
88
  raise gr.Error(f"Failed to process uploaded files: {e}")
89
 
90
+ return update_processed_documents_df(processed_documents)
91
+
92
 
93
  def open_accordion():
94
  return gr.Accordion(open=True)
 
184
  )
185
 
186
 
187
def download_processed_documents(
    processed_documents: Dict[
        str, "TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions"
    ],
    file_name: str = "processed_documents.json",
) -> str:
    """Serialize all processed documents to a JSON file in the system temp dir.

    Each document is converted via its ``asdict()`` method and keyed by its
    document id. The file is written to ``tempfile.gettempdir()`` so it can be
    served by a Gradio DownloadButton.

    Args:
        processed_documents: Mapping from document id to processed document.
        file_name: Base name of the JSON file to create (no directory parts).

    Returns:
        The absolute path of the written JSON file.
    """
    # NOTE: the project class in the annotation is quoted (forward reference)
    # so the annotation is not eagerly evaluated at import time; Dict is used
    # for consistency with the file's existing `from typing import Dict`.
    processed_documents_json = {
        doc_id: document.asdict() for doc_id, document in processed_documents.items()
    }
    file_path = os.path.join(tempfile.gettempdir(), file_name)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(processed_documents_json, f, indent=2)
    return file_path
200
+
201
+
202
def upload_processed_documents(
    file_name: str,
    processed_documents: dict[
        str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
    ],
) -> Dict[str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions]:
    """Load documents from a JSON file and merge them into the given mapping.

    Args:
        file_name: Path of the JSON file produced by the download counterpart.
        processed_documents: Mapping (mutated in place) from document id to
            processed document; existing ids are overwritten with a warning.

    Returns:
        The same ``processed_documents`` mapping, updated with the loaded docs.
    """
    with open(file_name, "r", encoding="utf-8") as f:
        serialized_docs = json.load(f)
    for doc_id, doc_json in serialized_docs.items():
        restored = TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions.fromdict(
            doc_json
        )
        # metadata is not automatically deserialized, so we need to set it manually
        restored.metadata["embeddings"] = doc_json["metadata"]["embeddings"]
        if doc_id in processed_documents:
            gr.Warning(f"Document '{doc_id}' already exists. Overwriting.")
        processed_documents[doc_id] = restored
    return processed_documents
220
+
221
+
222
  def main():
223
 
224
  example_text = "Scholarly Argumentation Mining (SAM) has recently gained attention due to its potential to help scholars with the rapid growth of published scientific literature. It comprises two subtasks: argumentative discourse unit recognition (ADUR) and argumentative relation extraction (ARE), both of which are challenging since they require e.g. the integration of domain knowledge, the detection of implicit statements, and the disambiguation of argument structure. While previous work focused on dataset construction and baseline methods for specific document sections, such as abstract or results, full-text scholarly argumentation mining has seen little progress. In this work, we introduce a sequential pipeline model combining ADUR and ARE for full-text SAM, and provide a first analysis of the performance of pretrained language models (PLMs) on both subtasks. We establish a new SotA for ADUR on the Sci-Arg corpus, outperforming the previous best reported result by a large margin (+7% F1). We also present the first results for ARE, and thus for the full AM pipeline, on this benchmark dataset. Our detailed error analysis reveals that non-contiguous ADUs as well as the interpretation of discourse connectors pose major challenges and that data annotation needs to be more consistent."
 
321
  )
322
 
323
  with gr.Column(scale=1):
324
+ with gr.Accordion(
325
+ "Indexed Documents", open=False
326
+ ) as processed_documents_accordion:
327
  processed_documents_df = gr.DataFrame(
328
  headers=["id", "num_adus", "num_relations"],
329
  interactive=False,
330
  )
331
+ with gr.Row():
332
+ download_processed_documents_btn = gr.DownloadButton("Download")
333
+ upload_processed_documents_btn = gr.UploadButton(
334
+ "Upload", file_types=["json"]
335
+ )
336
 
337
+ with gr.Accordion("Selected ADU", open=False):
338
+ selected_adu_id = gr.Textbox(label="ID", elem_id="selected_adu_id")
339
+ selected_adu_text = gr.Textbox(label="Text")
340
 
341
  with gr.Accordion("Retrieval Configuration", open=False):
342
  min_similarity = gr.Slider(
 
363
  relevant_adus = gr.DataFrame(
364
  label="Relevant ADUs from other documents",
365
  headers=[
 
366
  "relation",
367
+ "adu",
368
  "reference_adu",
369
+ "doc_id",
370
  "sim_score",
371
  "rel_score",
372
  ],
373
+ interactive=False,
374
  )
375
 
376
  render_event_kwargs = dict(
 
400
  )
401
 
402
  upload_btn.upload(
403
+ fn=open_accordion, inputs=[], outputs=[processed_documents_accordion]
404
+ ).then(
405
  fn=process_uploaded_files,
406
  inputs=[upload_btn, models_state, processed_documents_state, vector_store_state],
 
 
 
 
407
  outputs=[processed_documents_df],
408
  )
409
  processed_documents_df.select(
 
412
  outputs=[document_state],
413
  )
414
 
415
+ download_processed_documents_btn.click(
416
+ fn=download_processed_documents,
417
+ inputs=[processed_documents_state],
418
+ outputs=[download_processed_documents_btn],
419
+ )
420
+ upload_processed_documents_btn.upload(
421
+ fn=upload_processed_documents,
422
+ inputs=[upload_processed_documents_btn, processed_documents_state],
423
+ outputs=[processed_documents_state],
424
+ ).success(
425
+ fn=update_processed_documents_df,
426
+ inputs=[processed_documents_state],
427
+ outputs=[processed_documents_df],
428
+ )
429
+
430
  retrieve_relevant_adus_event_kwargs = dict(
431
  fn=get_relevant_adus,
432
  inputs=[
433
+ selected_adu_id,
434
  document_state,
435
  vector_store_state,
436
  processed_documents_state,
437
  min_similarity,
438
  top_k,
439
  relation_types,
440
+ relevant_adus,
441
  ],
442
  outputs=[relevant_adus],
443
  )
444
 
445
+ selected_adu_id.change(
446
  fn=partial(get_annotation_from_document, annotation_layer="labeled_spans"),
447
+ inputs=[document_state, selected_adu_id],
448
+ outputs=[selected_adu_text],
449
  ).success(**retrieve_relevant_adus_event_kwargs)
450
 
451
  retrieve_similar_adus_btn.click(
452
  fn=get_similar_adus,
453
  inputs=[
454
+ selected_adu_id,
455
  document_state,
456
  vector_store_state,
457
  processed_documents_state,
 
535
  }
536
  function setReferenceAduId(entityId) {
537
  // get the textarea element that holds the reference adu id
538
+ let referenceAduIdDiv = document.querySelector('#selected_adu_id textarea');
539
  // set the value of the input field
540
  referenceAduIdDiv.value = entityId;
541
  // trigger an input event to update the state
backend.py CHANGED
@@ -253,6 +253,7 @@ def get_relevant_adus(
253
  min_similarity: float,
254
  top_k: int,
255
  relation_types: List[str],
 
256
  ) -> pd.DataFrame:
257
  similar_entries = vector_store.retrieve_similar(
258
  ref_id=(ref_document.id, ref_annotation_id),
@@ -289,12 +290,10 @@ def get_relevant_adus(
289
  "sim_score": score,
290
  "rel_score": rel.score,
291
  "relation": rel.label,
292
- "text": str(rel.tail),
293
  }
294
  )
295
 
296
  # define column order
297
- df = pd.DataFrame(
298
- result, columns=["text", "relation", "doc_id", "reference_adu", "sim_score", "rel_score"]
299
- )
300
  return df
 
253
  min_similarity: float,
254
  top_k: int,
255
  relation_types: List[str],
256
+ previous_result: pd.DataFrame,
257
  ) -> pd.DataFrame:
258
  similar_entries = vector_store.retrieve_similar(
259
  ref_id=(ref_document.id, ref_annotation_id),
 
290
  "sim_score": score,
291
  "rel_score": rel.score,
292
  "relation": rel.label,
293
+ "adu": str(rel.tail),
294
  }
295
  )
296
 
297
  # define column order
298
+ df = pd.DataFrame(result, columns=previous_result.columns)
 
 
299
  return df