ArneBinder committed • Commit fed112f • 1 parent: 38e5624

from https://github.com/ArneBinder/pie-document-level/pull/229

except that we still use `SimpleVectorStore` instead of `QdrantVectorStore`

Files changed:

- app.py +78 -16
- document_store.py +85 -1
- model_utils.py +42 -11
app.py
CHANGED
```diff
@@ -1,6 +1,7 @@
 import json
 import logging
 import os.path
+import re
 import tempfile
 from functools import partial
 from typing import List, Optional, Tuple
@@ -32,6 +33,19 @@ DEFAULT_EMBEDDING_MODEL_NAME = "allenai/scibert_scivocab_uncased"
 DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DEFAULT_EMBEDDING_MAX_LENGTH = 512
 DEFAULT_EMBEDDING_BATCH_SIZE = 32
+DEFAULT_SPLIT_REGEX = "\n\n\n+"
+
+
+def escape_regex(regex: str) -> str:
+    # "double escape" the backslashes
+    result = regex.encode("unicode_escape").decode("utf-8")
+    return result
+
+
+def unescape_regex(regex: str) -> str:
+    # reverse of escape_regex
+    result = regex.encode("utf-8").decode("unicode_escape")
+    return result
 
 
 def render_annotated_document(
@@ -55,9 +69,16 @@ def wrapped_process_text(
     doc_id: str,
     models: Tuple[Pipeline, Optional[EmbeddingModel]],
     document_store: DocumentStore,
+    split_regex_escaped: str,
 ) -> Tuple[dict, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions]:
     try:
-        document = create_document(
+        document = create_document(
+            text=text,
+            doc_id=doc_id,
+            split_regex=unescape_regex(split_regex_escaped)
+            if len(split_regex_escaped) > 0
+            else None,
+        )
         annotate_document(
             document=document,
             annotation_pipeline=models[0],
@@ -77,6 +98,8 @@ def process_uploaded_files(
     file_names: List[str],
     models: Tuple[Pipeline, Optional[EmbeddingModel]],
     document_store: DocumentStore,
+    split_regex_escaped: str,
+    show_max_cross_doc_sims: bool = False,
 ) -> pd.DataFrame:
     try:
         new_documents = []
@@ -90,6 +113,9 @@ def process_uploaded_files(
             new_document = create_document(
                 text=text,
                 doc_id=base_file_name,
+                split_regex=unescape_regex(split_regex_escaped)
+                if len(split_regex_escaped) > 0
+                else None,
             )
             annotate_document(
                 document=new_document,
@@ -103,7 +129,7 @@ def process_uploaded_files(
     except Exception as e:
         raise gr.Error(f"Failed to process uploaded files: {e}")
 
-    return document_store.overview()
+    return document_store.overview(with_max_cross_doc_sims=show_max_cross_doc_sims)
 
 
 def open_accordion():
@@ -137,7 +163,7 @@ def set_relation_types(
 
    return gr.Dropdown(
        choices=relation_types,
-       label="Relation Types",
+       label="Argumentative Relation Types",
        value=default,
        multiselect=True,
    )
@@ -204,7 +230,7 @@ def main():
        DocumentStore(
            span_annotation_caption="adu",
            relation_annotation_caption="relation",
-           vector_store=
+           vector_store=QdrantVectorStore(),
        )
    )
    # wrap the pipeline and the embedding model/tokenizer in a tuple to avoid that it gets called
@@ -264,6 +290,11 @@ def main():
                ],
                outputs=models_state,
            )
+           split_regex_escaped = gr.Textbox(
+               label="Regex to partition the text",
+               placeholder="Regular expression pattern to split the text into partitions",
+               value=escape_regex(DEFAULT_SPLIT_REGEX),
+           )
 
            predict_btn = gr.Button("Analyse")
 
@@ -289,11 +320,6 @@ def main():
 
            rendered_output = gr.HTML(label="Rendered Output")
 
-           # add_to_index_btn = gr.Button("Add current result to Index")
-           upload_btn = gr.UploadButton(
-               "Upload & Analyse Documents", file_types=["text"], file_count="multiple"
-           )
-
        with gr.Column(scale=1):
            with gr.Accordion(
                "Indexed Documents", open=False
@@ -302,12 +328,22 @@ def main():
                    headers=["id", "num_adus", "num_relations"],
                    interactive=False,
                )
+               show_max_cross_docu_sims = gr.Checkbox(
+                   label="Show max cross-document similarities", value=False
+               )
+               gr.Markdown("Data Snapshot:")
                with gr.Row():
                    download_processed_documents_btn = gr.DownloadButton("Download")
                    upload_processed_documents_btn = gr.UploadButton(
                        "Upload", file_types=["json"]
                    )
 
+               upload_btn = gr.UploadButton(
+                   "Upload & Analyse Reference Documents",
+                   file_types=["text"],
+                   file_count="multiple",
+               )
+
            with gr.Accordion("Selected ADU", open=False):
                selected_adu_id = gr.Textbox(label="ID", elem_id="selected_adu_id")
                selected_adu_text = gr.Textbox(label="Text")
@@ -329,6 +365,14 @@ def main():
                )
                retrieve_similar_adus_btn = gr.Button("Retrieve similar ADUs")
                similar_adus = gr.DataFrame(headers=["doc_id", "adu_id", "score", "text"])
+
+               all2all_adu_similarities_button = gr.Button(
+                   "Compute all ADU-to-ADU similarities"
+               )
+               all2all_adu_similarities = gr.DataFrame(
+                   headers=["sim_score", "doc_id", "other_doc_id", "text", "other_text"]
+               )
+
                relation_types = set_relation_types(
                    models_state.value, default=["supports", "contradicts"]
                )
@@ -353,16 +397,19 @@ def main():
            outputs=rendered_output,
        )
 
+       show_overview_kwargs = dict(
+           fn=lambda document_store, show_max_sims: document_store.overview(
+               with_max_cross_doc_sims=show_max_sims
+           ),
+           inputs=[document_store_state, show_max_cross_docu_sims],
+           outputs=[processed_documents_df],
+       )
        predict_btn.click(fn=open_accordion, inputs=[], outputs=[output_accordion]).then(
            fn=wrapped_process_text,
-           inputs=[doc_text, doc_id, models_state, document_store_state],
+           inputs=[doc_text, doc_id, models_state, document_store_state, split_regex_escaped],
            outputs=[document_json, document_state],
            api_name="predict",
-       ).success(
-           fn=lambda document_store: document_store.overview(),
-           inputs=[document_store_state],
-           outputs=[processed_documents_df],
-       )
+       ).success(**show_overview_kwargs)
        render_btn.click(**render_event_kwargs, api_name="render")
 
        document_state.change(
@@ -377,7 +424,13 @@ def main():
            fn=open_accordion, inputs=[], outputs=[processed_documents_accordion]
        ).then(
            fn=process_uploaded_files,
-           inputs=[
+           inputs=[
+               upload_btn,
+               models_state,
+               document_store_state,
+               split_regex_escaped,
+               show_max_cross_docu_sims,
+           ],
            outputs=[processed_documents_df],
        )
        processed_documents_df.select(
@@ -385,6 +438,7 @@ def main():
            inputs=[processed_documents_df, document_store_state],
            outputs=[document_state],
        )
+       show_max_cross_docu_sims.change(**show_overview_kwargs)
 
        download_processed_documents_btn.click(
            fn=partial(download_processed_documents, file_name="processed_documents.zip"),
@@ -446,6 +500,14 @@ def main():
            inputs=[models_state],
            outputs=[relation_types],
        )
+       all2all_adu_similarities_button.click(
+           fn=partial(
+               DocumentStore.get_all2all_adu_similarities,
+               columns=all2all_adu_similarities.headers,
+           ),
+           inputs=[document_store_state],
+           outputs=[all2all_adu_similarities],
+       )
 
        # retrieve_relevant_adus_btn.click(
        #     **retrieve_relevant_adus_event_kwargs
```
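Note on the split-regex round trip: the new textbox holds the pattern in escaped form (so backslash sequences stay visible and editable), and `unescape_regex` turns it back into the real pattern before `create_document` is called. A minimal, self-contained sketch of that round trip, copying the two helpers from the diff above (only the Python standard library is used; the `pattern` value is the default `DEFAULT_SPLIT_REGEX`):

```python
# Standalone copy of the helpers added in app.py.
def escape_regex(regex: str) -> str:
    # "double escape" the backslashes so the pattern is shown literally in the UI
    return regex.encode("unicode_escape").decode("utf-8")


def unescape_regex(regex: str) -> str:
    # reverse of escape_regex
    return regex.encode("utf-8").decode("unicode_escape")


pattern = "\n\n\n+"                      # DEFAULT_SPLIT_REGEX: three or more newlines
shown = escape_regex(pattern)            # '\\n\\n\\n+' -- what the textbox displays
assert unescape_regex(shown) == pattern  # what wrapped_process_text passes on as split_regex
```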
document_store.py
CHANGED
```diff
@@ -8,6 +8,7 @@ from collections import defaultdict
 from typing import Any, Dict, List, Optional
 
 import gradio as gr
+import numpy as np
 import pandas as pd
 from annotation_utils import labeled_span_to_id
 from pytorch_ie import Annotation
@@ -417,7 +418,7 @@ class DocumentStore:
 
        return document
 
-    def overview(self) -> pd.DataFrame:
+    def overview(self, with_max_cross_doc_sims: bool = False) -> pd.DataFrame:
        rows = []
        for doc_id, document in self.documents.items():
            layers = {
@@ -429,6 +430,38 @@ class DocumentStore:
            layer_sizes = {f"num_{caption}s": len(layer) for caption, layer in layers.items()}
            rows.append({"doc_id": doc_id, **layer_sizes})
        df = pd.DataFrame(rows)
+
+        # add highest cross-document similarity score for each document
+        if with_max_cross_doc_sims and len(self.documents) > 1:
+            # Setting min_similarity to None and top_k to None to get all similarities. Otherwise,
+            # it may happen that this occludes max cross-doc sim for some documents in the
+            # case that there are more than top_k ADUs in the reference document that have a higher
+            # similarity with each other than the highest similarity to any ADU in another document
+            # or if the cross-doc similarity is below the min_similarity threshold.
+            all2all_adu_similarities = self.get_all2all_adu_similarities(
+                min_similarity=None, top_k=None, columns=["doc_id", "other_doc_id", "sim_score"]
+            )
+            max_doc2doc_similarities = all2all_adu_similarities.pivot_table(
+                values="sim_score", index="doc_id", columns="other_doc_id", aggfunc="max"
+            )
+
+            max_doc2doc_similarities.sort_index(axis="index", inplace=True)
+            max_doc2doc_similarities.sort_index(axis="columns", inplace=True)
+            # check that the index and columns are the same
+            if (max_doc2doc_similarities.index != max_doc2doc_similarities.columns).any():
+                raise gr.Error("Index and columns of max_doc2doc_similarities are not the same.")
+            # set diagonal entries to minus infinity to exclude them from the maximum
+            np.fill_diagonal(max_doc2doc_similarities.values, -np.inf)
+
+            max_doc_ids = max_doc2doc_similarities.idxmax(axis="columns")
+            max_similarities = max_doc2doc_similarities.max(axis="columns")
+
+            # set the index to the doc_id to correctly join the series
+            df.set_index("doc_id", inplace=True)
+            df["max_cross_doc_sim_doc_id"] = max_doc_ids
+            df["max_cross_doc_sim_score"] = max_similarities
+            df.reset_index(inplace=True)
+
        return df
 
    def as_dict(self, include_embeddings: bool = True) -> dict:
@@ -441,3 +474,54 @@
            }
            result[doc_id] = doc_dict
        return result
+
+    def get_all2all_adu_similarities(
+        self,
+        min_similarity: Optional[float] = 0.5,
+        top_k: Optional[int] = 100,
+        columns: Optional[List[str]] = None,
+    ) -> pd.DataFrame:
+        """Get the similarities between all ADUs in the store.
+
+        Args:
+            min_similarity: The minimum similarity score to consider.
+            top_k: The number of similar ADUs to return.
+            columns: The columns to include in the result DataFrame. If None, all columns are included.
+
+        Returns:
+            A DataFrame with the columns: doc_id, text, other_doc_id, other_text, sim_score.
+        """
+        result = []
+        document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
+        for doc_id, document in self.documents.items():
+            for adu in document.labeled_spans.predictions:
+                adu_id = labeled_span_to_id(adu)
+                similar_entries = self.vector_store.retrieve_similar(
+                    ref_payload=self.construct_embedding_payload(document, adu_id),
+                    min_similarity=min_similarity,
+                    top_k=top_k,
+                )
+                for _, payload, score in similar_entries:
+                    other_doc_id = payload["doc_id"]
+                    other_document = self.documents[other_doc_id]
+                    other_adu = get_annotation_from_document(
+                        other_document,
+                        payload["annotation_id"],
+                        self.span_layer_name,
+                        use_predictions=self.use_predictions,
+                    )
+                    result.append(
+                        {
+                            "sim_score": score,
+                            "doc_id": doc_id,
+                            "other_doc_id": other_doc_id,
+                            "adu_id": adu_id,
+                            "other_adu_id": payload["annotation_id"],
+                            "text": str(adu),
+                            "other_text": str(other_adu),
+                        }
+                    )
+        result_df = pd.DataFrame(result)
+        if columns is not None:
+            result_df = result_df[columns]
+        return result_df
```
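To see what the new `overview(with_max_cross_doc_sims=True)` aggregation does with the pairwise similarities from `get_all2all_adu_similarities`, here is a small sketch on made-up scores (the doc ids and values are invented; only the pandas/numpy steps mirror the code above):

```python
import numpy as np
import pandas as pd

# Invented pairwise ADU similarities between two documents "a" and "b".
sims = pd.DataFrame(
    [
        {"doc_id": "a", "other_doc_id": "a", "sim_score": 0.99},  # within-document pair
        {"doc_id": "a", "other_doc_id": "b", "sim_score": 0.40},
        {"doc_id": "b", "other_doc_id": "a", "sim_score": 0.40},
        {"doc_id": "b", "other_doc_id": "b", "sim_score": 0.95},  # within-document pair
    ]
)

# Max similarity per document pair, as in overview().
max_doc2doc = sims.pivot_table(
    values="sim_score", index="doc_id", columns="other_doc_id", aggfunc="max"
)
max_doc2doc = max_doc2doc.sort_index(axis="index").sort_index(axis="columns")
# Exclude within-document similarities from the maximum.
np.fill_diagonal(max_doc2doc.values, -np.inf)

print(max_doc2doc.idxmax(axis="columns"))  # a -> b, b -> a (most similar other document)
print(max_doc2doc.max(axis="columns"))     # 0.4 for both (max cross-document score)
```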
model_utils.py
CHANGED
```diff
@@ -5,6 +5,7 @@ import gradio as gr
 import torch
 from annotation_utils import labeled_span_to_id
 from embedding import EmbeddingModel, HuggingfaceEmbeddingModel
+from pie_modules.document.processing import RegexPartitioner
 from pytorch_ie import Pipeline
 from pytorch_ie.annotations import LabeledSpan
 from pytorch_ie.auto import AutoPipeline
@@ -48,7 +49,7 @@ def annotate_document(
 
 
 def create_document(
-    text: str, doc_id: str
+    text: str, doc_id: str, split_regex: Optional[str] = None
 ) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
     """Create a TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions from the provided
     text.
@@ -56,6 +57,7 @@ def create_document(
     Parameters:
         text: The text to process.
         doc_id: The ID of the document.
+        split_regex: A regular expression pattern to use for splitting the text into partitions.
 
     Returns:
         The processed document.
@@ -64,8 +66,14 @@ def create_document(
     document = TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions(
        id=doc_id, text=text, metadata={}
     )
-
-
+    if split_regex is not None:
+        partitioner = RegexPartitioner(
+            pattern=split_regex, partition_layer_name="labeled_partitions"
+        )
+        document = partitioner(document)
+    else:
+        # add single partition from the whole text (the model only considers text in partitions)
+        document.labeled_partitions.append(LabeledSpan(start=0, end=len(text), label="text"))
     return document
 
 
@@ -92,24 +100,22 @@ def load_argumentation_model(
            taskmodule_kwargs=dict(revision=revision),
            model_kwargs=dict(revision=revision),
        )
+        gr.Info(
+            f"Loaded argumentation model: model_name={model_name}, revision={revision}, device={device}"
+        )
    except Exception as e:
        raise gr.Error(f"Failed to load argumentation model: {e}")
-
+
    return model
 
 
-def load_models(
-    model_name: str,
-    revision: Optional[str] = None,
+def load_embedding_model(
     embedding_model_name: Optional[str] = None,
     # embedding_model_revision: Optional[str] = None,
     embedding_max_length: int = 512,
     embedding_batch_size: int = 16,
     device: str = "cpu",
-) ->
-    torch.cuda.empty_cache()
-    argumentation_model = load_argumentation_model(model_name, revision=revision, device=device)
-    embedding_model = None
+) -> Optional[EmbeddingModel]:
    if embedding_model_name is not None and embedding_model_name.strip():
        try:
            embedding_model = HuggingfaceEmbeddingModel(
@@ -119,7 +125,32 @@ def load_models(
                max_length=embedding_max_length,
                batch_size=embedding_batch_size,
            )
+            gr.Info(f"Loaded embedding model: model_name={embedding_model_name}, device={device}")
        except Exception as e:
            raise gr.Error(f"Failed to load embedding model: {e}")
+    else:
+        embedding_model = None
+
+    return embedding_model
+
+
+def load_models(
+    model_name: str,
+    revision: Optional[str] = None,
+    embedding_model_name: Optional[str] = None,
+    # embedding_model_revision: Optional[str] = None,
+    embedding_max_length: int = 512,
+    embedding_batch_size: int = 16,
+    device: str = "cpu",
+) -> Tuple[Pipeline, Optional[EmbeddingModel]]:
+    torch.cuda.empty_cache()
+    argumentation_model = load_argumentation_model(model_name, revision=revision, device=device)
+    embedding_model = load_embedding_model(
+        embedding_model_name=embedding_model_name,
+        # embedding_model_revision=embedding_model_revision,
+        embedding_max_length=embedding_max_length,
+        embedding_batch_size=embedding_batch_size,
+        device=device,
+    )
 
    return argumentation_model, embedding_model
```
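A hypothetical usage of the updated `create_document` (assuming the Space's modules and `pie_modules` are importable): with a `split_regex`, the text is split into `labeled_partitions` via `RegexPartitioner`; without one, a single partition spanning the whole text is added, since the model only considers text inside partitions.

```python
# Hypothetical call against the updated create_document from model_utils.py.
from model_utils import create_document

doc = create_document(
    text="First part of the text.\n\n\n\nSecond part of the text.",
    doc_id="example-doc",
    split_regex="\n\n\n+",  # same pattern as DEFAULT_SPLIT_REGEX in app.py
)
# With the regex, RegexPartitioner fills the labeled_partitions layer;
# with split_regex=None a single whole-text partition labeled "text" is appended instead.
print(list(doc.labeled_partitions))
```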