Spaces:

ArneBinder
/

sam-pointer-bart-base-v0.3

Running

App Files Files Community

ArneBinder committed on Jul 15

Commit

a8df5fb

•

1 Parent(s): b0174fa

from https://github.com/ArneBinder/pie-document-level/pull/238

Browse files

Files changed (5) hide show

app.py +4 -4
document_store.py +108 -40
embedding.py +4 -4
rendering_utils.py +32 -12
vector_store.py +9 -1

app.py CHANGED Viewed

@@ -354,7 +354,7 @@ def main():
                         minimum=0.0,
                         maximum=1.0,
                         step=0.01,
-                        value=0.8,
                     )
                     top_k = gr.Slider(
                         label="Top K",
@@ -398,10 +398,10 @@ def main():
         )
         show_overview_kwargs = dict(
-            fn=lambda document_store, show_max_sims: document_store.overview(
                 with_max_cross_doc_sims=show_max_sims
             ),
-            inputs=[document_store_state, show_max_cross_docu_sims],
             outputs=[processed_documents_df],
         )
         predict_btn.click(fn=open_accordion, inputs=[], outputs=[output_accordion]).then(
@@ -505,7 +505,7 @@ def main():
                 DocumentStore.get_all2all_adu_similarities,
                 columns=all2all_adu_similarities.headers,
             ),
-            inputs=[document_store_state],
             outputs=[all2all_adu_similarities],
         )

                         minimum=0.0,
                         maximum=1.0,
                         step=0.01,
+                        value=0.95,
                     )
                     top_k = gr.Slider(
                         label="Top K",
         )
         show_overview_kwargs = dict(
+            # Forward the min_similarity input to the overview. Previously the lambda accepted
+            # the value but dropped it, so overview() always fell back to its own default.
+            fn=lambda document_store, show_max_sims, min_sim: document_store.overview(
+                with_max_cross_doc_sims=show_max_sims, min_similarity=min_sim
             ),
+            inputs=[document_store_state, show_max_cross_docu_sims, min_similarity],
             outputs=[processed_documents_df],
         )
         predict_btn.click(fn=open_accordion, inputs=[], outputs=[output_accordion]).then(
                 DocumentStore.get_all2all_adu_similarities,
                 columns=all2all_adu_similarities.headers,
             ),
+            inputs=[document_store_state, min_similarity],
             outputs=[all2all_adu_similarities],
         )

document_store.py CHANGED Viewed

@@ -16,6 +16,7 @@ from pytorch_ie.documents import (
     TextBasedDocument,
     TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
 )
 from vector_store import VectorStore
 logger = logging.getLogger(__name__)
@@ -342,6 +343,38 @@ class DocumentStore:
             f"Added {len(documents_json)} documents to the index ({len(self.documents)} documents in total)."
         )
     def add_documents_from_zip(self, file_path: str) -> None:
         temp_dir = os.path.join(tempfile.gettempdir(), "document_store")
         # remove the temporary directory if it already exists
@@ -418,7 +451,9 @@ class DocumentStore:
         return document
-    def overview(self, with_max_cross_doc_sims: bool = False) -> pd.DataFrame:
         rows = []
         for doc_id, document in self.documents.items():
             layers = {
@@ -433,13 +468,8 @@ class DocumentStore:
         # add highest cross-document similarity score for each document
         if with_max_cross_doc_sims and len(self.documents) > 1:
-            # Setting min_similarity to None and top_k to None to get all similarities. Otherwise,
-            # it may happen that this occludes max cross-doc sim for some documents in the
-            # case that there are more than top_k ADUs in the reference document that have a higher
-            # similarity with each other than the highest similarity to any ADU in another document
-            # or if the cross-doc similarity is below the min_similarity threshold.
             all2all_adu_similarities = self.get_all2all_adu_similarities(
-                min_similarity=None, top_k=None, columns=["doc_id", "other_doc_id", "sim_score"]
             )
             max_doc2doc_similarities = all2all_adu_similarities.pivot_table(
                 values="sim_score", index="doc_id", columns="other_doc_id", aggfunc="max"
@@ -478,50 +508,88 @@ class DocumentStore:
     def get_all2all_adu_similarities(
         self,
         min_similarity: Optional[float] = 0.5,
-        top_k: Optional[int] = 100,
         columns: Optional[List[str]] = None,
     ) -> pd.DataFrame:
         """Get the similarities between all ADUs in the store.
         Args:
-            min_similarity: The minimum similarity score to consider.
-            top_k: The number of similar ADUs to return.
             columns: The columns to include in the result DataFrame. If None, all columns are included.
         Returns:
             A DataFrame with the columns: doc_id, text, other_doc_id, other_text, sim_score.
         """
-        result = []
-        document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
-        for doc_id, document in self.documents.items():
-            for adu in document.labeled_spans.predictions:
-                adu_id = labeled_span_to_id(adu)
-                similar_entries = self.vector_store.retrieve_similar(
-                    ref_payload=self.construct_embedding_payload(document, adu_id),
-                    min_similarity=min_similarity,
-                    top_k=top_k,
                 )
-                for _, payload, score in similar_entries:
-                    other_doc_id = payload["doc_id"]
-                    other_document = self.documents[other_doc_id]
-                    other_adu = get_annotation_from_document(
-                        other_document,
-                        payload["annotation_id"],
-                        self.span_layer_name,
-                        use_predictions=self.use_predictions,
-                    )
-                    result.append(
-                        {
-                            "sim_score": score,
-                            "doc_id": doc_id,
-                            "other_doc_id": other_doc_id,
-                            "adu_id": adu_id,
-                            "other_adu_id": payload["annotation_id"],
-                            "text": str(adu),
-                            "other_text": str(other_adu),
-                        }
-                    )
-        result_df = pd.DataFrame(result)
         if columns is not None:
             result_df = result_df[columns]
         return result_df

     TextBasedDocument,
     TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
 )
+from scipy.sparse import csr_matrix
 from vector_store import VectorStore
 logger = logging.getLogger(__name__)
             f"Added {len(documents_json)} documents to the index ({len(self.documents)} documents in total)."
         )
+    def get_payloads_for_missing_and_unexpected_embeddings(self) -> dict[str, dict[str, Any]]:
+        """Compare the embeddings expected from the documents with those in the vector store.
+
+        A payload is "missing" if it is constructed from a predicted annotation of a document
+        but has no counterpart in the vector store. A payload is "unexpected" if it is in the
+        vector store but is not constructed from any predicted annotation of the documents.
+
+        Returns:
+            A dictionary with the keys "missing" and "unexpected". Each value maps the JSON
+            serialization (with sorted keys) of a payload to the payload itself.
+        """
+        # one payload per predicted annotation in the span layer of every document
+        expected_payloads = [
+            self.construct_embedding_payload(document, labeled_span_to_id(annotation))
+            for document in self.documents.values()
+            for annotation in document[self.span_layer_name].predictions
+        ]
+        # the payloads are the third element of the vector store export
+        vector_store_payloads = self.vector_store.as_indices_vectors_payloads()[2]
+
+        # payloads are dicts and thus not hashable, so key them by their canonical JSON form
+        expected_by_key = {}
+        for payload in expected_payloads:
+            expected_by_key[json.dumps(payload, sort_keys=True)] = payload
+        stored_by_key = {}
+        for payload in vector_store_payloads:
+            stored_by_key[json.dumps(payload, sort_keys=True)] = payload
+
+        missing = {
+            key: payload for key, payload in expected_by_key.items() if key not in stored_by_key
+        }
+        unexpected = {
+            key: payload for key, payload in stored_by_key.items() if key not in expected_by_key
+        }
+        return {"missing": missing, "unexpected": unexpected}
     def add_documents_from_zip(self, file_path: str) -> None:
         temp_dir = os.path.join(tempfile.gettempdir(), "document_store")
         # remove the temporary directory if it already exists
         return document
+    def overview(
+        self, with_max_cross_doc_sims: bool = False, min_similarity: float = 0.9
+    ) -> pd.DataFrame:
         rows = []
         for doc_id, document in self.documents.items():
             layers = {
         # add highest cross-document similarity score for each document
         if with_max_cross_doc_sims and len(self.documents) > 1:
             all2all_adu_similarities = self.get_all2all_adu_similarities(
+                min_similarity=min_similarity, columns=["doc_id", "other_doc_id", "sim_score"]
             )
             max_doc2doc_similarities = all2all_adu_similarities.pivot_table(
                 values="sim_score", index="doc_id", columns="other_doc_id", aggfunc="max"
     def get_all2all_adu_similarities(
         self,
         min_similarity: Optional[float] = 0.5,
         columns: Optional[List[str]] = None,
     ) -> pd.DataFrame:
+        """Get the pairwise cosine similarities between all ADU embeddings in the store.
+
+        Only the strict upper triangle of the similarity matrix is used, so every unordered
+        pair of embeddings is reported at most once and no embedding is paired with itself.
+
         Args:
+            min_similarity: The minimum similarity score to consider. If None, all non-zero
+                similarities are included.
             columns: The columns to include in the result DataFrame. If None, all columns are included.
+
         Returns:
+            A DataFrame with the columns: sim_score, doc_id, other_doc_id, adu_id, other_adu_id,
+            text, other_text (restricted to `columns` if given). The columns exist even when
+            no pair is found.
         """
+        all_columns = [
+            "sim_score",
+            "doc_id",
+            "other_doc_id",
+            "adu_id",
+            "other_adu_id",
+            "text",
+            "other_text",
+        ]
+        # shape of all_embeddings: (num_embeddings, embedding_dim)
+        _, all_embeddings, all_payloads = self.vector_store.as_indices_vectors_payloads()
+        # with fewer than two embeddings there is no pair to compare
+        if len(all_payloads) < 2:
+            empty_df = pd.DataFrame(columns=all_columns)
+            return empty_df if columns is None else empty_df[columns]
+        doc_id_and_annotation_id2annotation_text = {}
+        for doc in self.documents.values():
+            for annotation in doc[self.span_layer_name]:
+                doc_id_and_annotation_id2annotation_text[
+                    (doc.id, labeled_span_to_id(annotation))
+                ] = str(annotation)
+            for annotation in doc[self.span_layer_name].predictions:
+                doc_id_and_annotation_id2annotation_text[
+                    (doc.id, labeled_span_to_id(annotation))
+                ] = str(annotation)
+        # calculate cosine similarities between all embeddings
+        dot_prod = np.dot(all_embeddings, all_embeddings.T)
+        norm = np.linalg.norm(all_embeddings, axis=1)
+        norm_prod = np.outer(norm, norm)
+        similarities = dot_prod / norm_prod
+        gr.Info(f"Similarities shape: {similarities.shape}")
+        if min_similarity is not None:
+            gr.Info(f"Filtering similarities below {min_similarity}.")
+            # set similarities below min_similarity to 0
+            similarities[similarities < min_similarity] = 0.0
+        # zero the diagonal and the lower triangle: drops self-pairs and mirrored duplicates
+        similarities = np.triu(similarities, k=1)
+        # create a sparse matrix
+        sparse_matrix = csr_matrix(similarities)
+        sparse_matrix.eliminate_zeros()
+        # The COO form exposes row indices, column indices and values as parallel arrays, so
+        # each score stays attached to its (row, col) pair. Indexing the flat value array
+        # with a row index, as done before, returned the score of an unrelated pair.
+        coo = sparse_matrix.tocoo()
+        scores = coo.data
+        gr.Info(f"Number of similarities above {min_similarity}: {len(scores)}")
+        # construct the DataFrame
+        records = []
+        for idx1, idx2, score in zip(coo.row, coo.col, scores):
+            payload1 = all_payloads[idx1]
+            payload2 = all_payloads[idx2]
+            doc_id1 = payload1["doc_id"]
+            doc_id2 = payload2["doc_id"]
+            annotation_id1 = payload1["annotation_id"]
+            annotation_id2 = payload2["annotation_id"]
+            records.append(
+                {
+                    "sim_score": score,
+                    "doc_id": doc_id1,
+                    "other_doc_id": doc_id2,
+                    "adu_id": annotation_id1,
+                    "other_adu_id": annotation_id2,
+                    "text": doc_id_and_annotation_id2annotation_text[(doc_id1, annotation_id1)],
+                    "other_text": doc_id_and_annotation_id2annotation_text[
+                        (doc_id2, annotation_id2)
+                    ],
+                }
+            )
+        # passing the column names keeps the schema stable when there are no records
+        result_df = pd.DataFrame(records, columns=all_columns)
+        gr.Info(f"DataFrame shape: {result_df.shape}")
         if columns is not None:
             result_df = result_df[columns]
         return result_df

embedding.py CHANGED Viewed

@@ -114,10 +114,10 @@ class HuggingfaceEmbeddingModel(EmbeddingModel):
                     )
                     text_ann = tok2text_ann[tok_ann]
-                    if text_ann in embeddings:
-                        logger.warning(
-                            f"Overwriting embedding for annotation '{text_ann}' (do you use striding?)"
-                        )
                     embeddings[text_ann] = embedding
                 example_idx += 1

                     )
                     text_ann = tok2text_ann[tok_ann]
+                    # if text_ann in embeddings:
+                    #    logger.warning(
+                    #        f"Overwriting embedding for annotation '{text_ann}' (do you use striding?)"
+                    #    )
                     embeddings[text_ann] = embedding
                 example_idx += 1

rendering_utils.py CHANGED Viewed

@@ -4,12 +4,20 @@ from collections import defaultdict
 from typing import Dict, List, Optional, Union
 from annotation_utils import labeled_span_to_id
-from pytorch_ie.annotations import BinaryRelation
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 from rendering_utils_displacy import EntityRenderer
 logger = logging.getLogger(__name__)
 def render_pretty_table(
     document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions, **render_kwargs
@@ -36,15 +44,27 @@ def render_displacy(
     **render_kwargs,
 ):
-    spans = list(document.labeled_spans) + list(document.labeled_spans.predictions)
     spacy_doc = {
         "text": document.text,
         "ents": [
-            {"start": entity.start, "end": entity.end, "label": entity.label} for entity in spans
         ],
         "title": None,
     }
     renderer = EntityRenderer(options=entity_options)
     html = renderer.render([spacy_doc], page=True, minify=True).strip()
@@ -53,10 +73,9 @@ def render_displacy(
         binary_relations = list(document.binary_relations) + list(
             document.binary_relations.predictions
         )
-        sorted_entities = sorted(spans, key=lambda x: (x.start, x.end))
         html = inject_relation_data(
             html,
-            sorted_entities=sorted_entities,
             binary_relations=binary_relations,
             additional_colors=colors_hover,
         )
@@ -65,7 +84,7 @@ def render_displacy(
 def inject_relation_data(
     html: str,
-    sorted_entities,
     binary_relations: List[BinaryRelation],
     additional_colors: Optional[Dict[str, Union[str, dict]]] = None,
 ) -> str:
@@ -80,11 +99,10 @@ def inject_relation_data(
         entity2heads[relation.tail].append((relation.head, relation.label))
         entity2tails[relation.head].append((relation.tail, relation.label))
     # Add unique IDs to each entity
     entities = soup.find_all(class_="entity")
-    for idx, entity in enumerate(entities):
-        annotation = sorted_entities[idx]
-        entity["id"] = labeled_span_to_id(annotation)
         original_color = entity["style"].split("background:")[1].split(";")[0].strip()
         entity["data-color-original"] = original_color
         if additional_colors is not None:
@@ -92,9 +110,11 @@ def inject_relation_data(
                 entity[f"data-color-{key}"] = (
                     json.dumps(color) if isinstance(color, dict) else color
                 )
-        entity_annotation = sorted_entities[idx]
-        # sanity check
-        if str(entity_annotation) != entity.next:
             logger.warning(f"Entity text mismatch: {entity_annotation} != {entity.text}")
         entity["data-label"] = entity_annotation.label
         entity["data-relation-tails"] = json.dumps(

 from typing import Dict, List, Optional, Union
 from annotation_utils import labeled_span_to_id
+from pytorch_ie.annotations import BinaryRelation, LabeledSpan
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 from rendering_utils_displacy import EntityRenderer
 logger = logging.getLogger(__name__)
+# adjusted from rendering_utils_displacy.TPL_ENT
+TPL_ENT_WITH_ID = """
+<mark class="entity" id="{id}" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
+    {text}
+    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">{label}</span>
+</mark>
+"""
 def render_pretty_table(
     document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions, **render_kwargs
     **render_kwargs,
 ):
+    labeled_spans = list(document.labeled_spans) + list(document.labeled_spans.predictions)
     spacy_doc = {
         "text": document.text,
         "ents": [
+            {
+                "start": labeled_span.start,
+                "end": labeled_span.end,
+                "label": labeled_span.label,
+                # pass the ID as a parameter to the entity. The id is required to fetch the entity annotations
+                # on hover and to inject the relation data.
+                "params": {"id": labeled_span_to_id(labeled_span)},
+            }
+            for labeled_span in labeled_spans
         ],
         "title": None,
     }
+    # copy to avoid modifying the original options
+    entity_options = entity_options.copy()
+    # use the custom template with the entity ID
+    entity_options["template"] = TPL_ENT_WITH_ID
     renderer = EntityRenderer(options=entity_options)
     html = renderer.render([spacy_doc], page=True, minify=True).strip()
         binary_relations = list(document.binary_relations) + list(
             document.binary_relations.predictions
         )
         html = inject_relation_data(
             html,
+            labeled_spans=labeled_spans,
             binary_relations=binary_relations,
             additional_colors=colors_hover,
         )
 def inject_relation_data(
     html: str,
+    labeled_spans: List[LabeledSpan],
     binary_relations: List[BinaryRelation],
     additional_colors: Optional[Dict[str, Union[str, dict]]] = None,
 ) -> str:
         entity2heads[relation.tail].append((relation.head, relation.label))
         entity2tails[relation.head].append((relation.tail, relation.label))
+    ann_id2annotation = {labeled_span_to_id(entity): entity for entity in labeled_spans}
     # Add unique IDs to each entity
     entities = soup.find_all(class_="entity")
+    for entity in entities:
         original_color = entity["style"].split("background:")[1].split(";")[0].strip()
         entity["data-color-original"] = original_color
         if additional_colors is not None:
                 entity[f"data-color-{key}"] = (
                     json.dumps(color) if isinstance(color, dict) else color
                 )
+        entity_annotation = ann_id2annotation[entity["id"]]
+        # sanity check.
+        annotation_text_without_newline = str(entity_annotation).replace("\n", "")
+        # Just check the start, because the text has the label attached to the end
+        if not entity.text.startswith(annotation_text_without_newline):
             logger.warning(f"Entity text mismatch: {entity_annotation} != {entity.text}")
         entity["data-label"] = entity_annotation.label
         entity["data-relation-tails"] = json.dumps(

vector_store.py CHANGED Viewed

@@ -52,12 +52,16 @@ class VectorStore(Generic[T, E], abc.ABC):
     def get(self, emb_id: Optional[str] = None, payload: Optional[T] = None) -> Optional[E]:
         return self._get(emb_id=self._get_emb_id(emb_id=emb_id, payload=payload))
     @abc.abstractmethod
     def _retrieve_similar(
         self, ref_id: str, top_k: Optional[int] = None, min_similarity: Optional[float] = None
     ) -> List[Tuple[T, float]]:
         """Retrieve IDs, payloads and the respective similarity scores with respect to the
-        reference entry. Note that this requires the reference entry to be present in the store.
         Args:
             ref_id: The ID of the reference entry.
@@ -74,6 +78,8 @@ class VectorStore(Generic[T, E], abc.ABC):
     def retrieve_similar(
         self, ref_id: Optional[str] = None, ref_payload: Optional[T] = None, **kwargs
     ) -> List[Tuple[T, float]]:
         return self._retrieve_similar(
             ref_id=self._get_emb_id(emb_id=ref_id, payload=ref_payload), **kwargs
         )
@@ -244,6 +250,8 @@ class QdrantVectorStore(VectorStore[T, List[float]]):
         )
     def _get(self, emb_id: str) -> Optional[List[float]]:
         points = self.client.retrieve(
             collection_name=self.COLLECTION_NAME,
             ids=[self.emb_id2point_id[emb_id]],

     def get(self, emb_id: Optional[str] = None, payload: Optional[T] = None) -> Optional[E]:
         return self._get(emb_id=self._get_emb_id(emb_id=emb_id, payload=payload))
+    def has(self, emb_id: Optional[str] = None, payload: Optional[T] = None) -> bool:
+        """Tell whether `get` yields an embedding for the given ID or payload."""
+        stored_embedding = self.get(emb_id=emb_id, payload=payload)
+        return stored_embedding is not None
     @abc.abstractmethod
     def _retrieve_similar(
         self, ref_id: str, top_k: Optional[int] = None, min_similarity: Optional[float] = None
     ) -> List[Tuple[T, float]]:
         """Retrieve IDs, payloads and the respective similarity scores with respect to the
+        reference entry. In the case that the reference entry is not in the store itself, an empty
+        list will be returned.
         Args:
             ref_id: The ID of the reference entry.
     def retrieve_similar(
         self, ref_id: Optional[str] = None, ref_payload: Optional[T] = None, **kwargs
     ) -> List[Tuple[T, float]]:
+        """Retrieve the entries similar to the reference entry given by ID or payload.
+
+        Returns an empty list if the store has no embedding for the reference entry.
+        Otherwise the reference is resolved to its embedding ID and the lookup is delegated
+        to `_retrieve_similar`, with any extra keyword arguments passed through.
+        """
+        if self.has(emb_id=ref_id, payload=ref_payload):
+            resolved_ref_id = self._get_emb_id(emb_id=ref_id, payload=ref_payload)
+            return self._retrieve_similar(ref_id=resolved_ref_id, **kwargs)
+        return []
         )
     def _get(self, emb_id: str) -> Optional[List[float]]:
+        if emb_id not in self.emb_id2point_id:
+            return None
         points = self.client.retrieve(
             collection_name=self.COLLECTION_NAME,
             ids=[self.emb_id2point_id[emb_id]],