Spaces:

ArneBinder
/

sam-pointer-bart-base-v0.3

Running

App Files Files Community

ArneBinder committed on Aug 23

Commit

efae5be

•

1 Parent(s): 47cc11e

from https://github.com/ArneBinder/pie-document-level/pull/243

Browse files

Files changed (7) hide show

annotation_utils.py +26 -5
app.py +63 -107
document_store.py +35 -25
embedding.py +46 -13
model_utils.py +29 -5
rendering_utils.py +168 -22
requirements.txt +1 -0

annotation_utils.py CHANGED Viewed

@@ -1,10 +1,31 @@
-from pytorch_ie.annotations import LabeledSpan
-def labeled_span_to_id(span: LabeledSpan) -> str:
-    return f"span-{span.start}-{span.end}-{span.label}"
-def labeled_span_from_id(span_id: str) -> LabeledSpan:
     parts = span_id.split("-")
-    return LabeledSpan(int(parts[1]), int(parts[2]), parts[3])

+from typing import Union
+from pytorch_ie.annotations import LabeledMultiSpan, LabeledSpan
+def labeled_span_to_id(span: Union[LabeledSpan, LabeledMultiSpan]) -> str:
+    if isinstance(span, LabeledSpan):
+        # {type indicator}-{start}-{end}-{label}
+        return f"span-{span.start}-{span.end}-{span.label}"
+    elif isinstance(span, LabeledMultiSpan):
+        # {type indicator}-({start}-{end})*-{label}
+        starts_ends = "-".join(f"{start}-{end}" for start, end in span.slices)
+        return f"multispan-{starts_ends}-{span.label}"
+    else:
+        raise ValueError(f"Unsupported span type: {type(span)}")
+def labeled_span_from_id(span_id: str) -> Union[LabeledSpan, LabeledMultiSpan]:
     parts = span_id.split("-")
+    if parts[0] == "span":
+        return LabeledSpan(int(parts[1]), int(parts[2]), parts[3])
+    elif parts[0] == "multispan":
+        label = parts[-1]
+        # this contains: start1, end1, start2, end2, ...
+        starts_ends = parts[1:-1]
+        slices = tuple(
+            (int(start), int(end)) for start, end in zip(starts_ends[::2], starts_ends[1::2])
+        )
+        return LabeledMultiSpan(slices, label)
+    else:
+        raise ValueError(f"Unsupported span id: {span_id}")

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import os.path
 import re
 import tempfile
 from functools import partial
-from typing import List, Optional, Tuple
 import gradio as gr
 import pandas as pd
@@ -14,8 +14,11 @@ from embedding import EmbeddingModel
 from model_utils import annotate_document, create_document, load_models
 from pie_modules.taskmodules import PointerNetworkTaskModuleForEnd2EndRE
 from pytorch_ie import Pipeline
-from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
-from rendering_utils import render_displacy, render_pretty_table
 from transformers import PreTrainedModel, PreTrainedTokenizer
 from vector_store import QdrantVectorStore, SimpleVectorStore
@@ -35,6 +38,10 @@ DEFAULT_EMBEDDING_MAX_LENGTH = 512
 DEFAULT_EMBEDDING_BATCH_SIZE = 32
 DEFAULT_SPLIT_REGEX = "\n\n\n+"
 def escape_regex(regex: str) -> str:
     # "double escape" the backslashes
@@ -49,7 +56,10 @@ def unescape_regex(regex: str) -> str:
 def render_annotated_document(
-    document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
     render_with: str,
     render_kwargs_json: str,
 ) -> str:
@@ -70,7 +80,14 @@ def wrapped_process_text(
     models: Tuple[Pipeline, Optional[EmbeddingModel]],
     document_store: DocumentStore,
     split_regex_escaped: str,
-) -> Tuple[dict, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions]:
     try:
         document = create_document(
             text=text,
@@ -79,10 +96,11 @@ def wrapped_process_text(
             if len(split_regex_escaped) > 0
             else None,
         )
-        annotate_document(
             document=document,
             annotation_pipeline=models[0],
             embedding_model=models[1],
         )
         document_store.add_document(document)
     except Exception as e:
@@ -100,6 +118,8 @@ def process_uploaded_files(
     document_store: DocumentStore,
     split_regex_escaped: str,
     show_max_cross_doc_sims: bool = False,
 ) -> pd.DataFrame:
     try:
         new_documents = []
@@ -117,10 +137,11 @@ def process_uploaded_files(
                     if len(split_regex_escaped) > 0
                     else None,
                 )
-                annotate_document(
                     document=new_document,
                     annotation_pipeline=models[0],
                     embedding_model=models[1],
                 )
                 new_documents.append(new_document)
             else:
@@ -129,7 +150,9 @@ def process_uploaded_files(
     except Exception as e:
         raise gr.Error(f"Failed to process uploaded files: {e}")
-    return document_store.overview(with_max_cross_doc_sims=show_max_cross_doc_sims)
 def open_accordion():
@@ -144,9 +167,15 @@ def select_processed_document(
     evt: gr.SelectData,
     processed_documents_df: pd.DataFrame,
     document_store: DocumentStore,
-) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
     row_idx, col_idx = evt.index
-    doc_id = processed_documents_df.iloc[row_idx]["doc_id"]
     doc = document_store.get_document(doc_id, with_embeddings=False)
     return doc
@@ -231,6 +260,12 @@ def main():
                 span_annotation_caption="adu",
                 relation_annotation_caption="relation",
                 vector_store=QdrantVectorStore(),
             )
         )
         # wrap the pipeline and the embedding model/tokenizer in a tuple to avoid that it gets called
@@ -399,14 +434,20 @@ def main():
         show_overview_kwargs = dict(
             fn=lambda document_store, show_max_sims, min_sim: document_store.overview(
-                with_max_cross_doc_sims=show_max_sims
             ),
             inputs=[document_store_state, show_max_cross_docu_sims, min_similarity],
             outputs=[processed_documents_df],
         )
         predict_btn.click(fn=open_accordion, inputs=[], outputs=[output_accordion]).then(
-            fn=wrapped_process_text,
-            inputs=[doc_text, doc_id, models_state, document_store_state, split_regex_escaped],
             outputs=[document_json, document_state],
             api_name="predict",
         ).success(**show_overview_kwargs)
@@ -423,13 +464,14 @@ def main():
         upload_btn.upload(
             fn=open_accordion, inputs=[], outputs=[processed_documents_accordion]
         ).then(
-            fn=process_uploaded_files,
             inputs=[
                 upload_btn,
                 models_state,
                 document_store_state,
                 split_regex_escaped,
                 show_max_cross_docu_sims,
             ],
             outputs=[processed_documents_df],
         )
@@ -470,7 +512,9 @@ def main():
         selected_adu_id.change(
             fn=partial(
                 get_annotation_from_document,
-                annotation_layer="labeled_spans",
                 use_predictions=True,
             ),
             inputs=[document_state, selected_adu_id],
@@ -483,7 +527,9 @@ def main():
                 ref_document=document,
                 min_similarity=min_sim,
                 top_k=k,
-                annotation_layer="labeled_spans",
             ),
             inputs=[
                 document_store_state,
@@ -513,97 +559,7 @@ def main():
         #     **retrieve_relevant_adus_event_kwargs
         # )
-        js = """
-        () => {
-            function maybeSetColor(entity, colorAttributeKey, colorDictKey) {
-                var color = entity.getAttribute('data-color-' + colorAttributeKey);
-                // if color is a json string, parse it and use the value at colorDictKey
-                try {
-                    const colors = JSON.parse(color);
-                    color = colors[colorDictKey];
-                } catch (e) {}
-                if (color) {
-                    entity.style.backgroundColor = color;
-                    entity.style.color = '#000';
-                }
-            }
-            function highlightRelationArguments(entityId) {
-                const entities = document.querySelectorAll('.entity');
-                // reset all entities
-                entities.forEach(entity => {
-                    const color = entity.getAttribute('data-color-original');
-                    entity.style.backgroundColor = color;
-                    entity.style.color = '';
-                });
-                if (entityId !== null) {
-                    var visitedEntities = new Set();
-                    // highlight selected entity
-                    const selectedEntity = document.getElementById(entityId);
-                    if (selectedEntity) {
-                        const label = selectedEntity.getAttribute('data-label');
-                        maybeSetColor(selectedEntity, 'selected', label);
-                        visitedEntities.add(selectedEntity);
-                    }
-                    // highlight tails
-                    const relationTailsAndLabels = JSON.parse(selectedEntity.getAttribute('data-relation-tails'));
-                    relationTailsAndLabels.forEach(relationTail => {
-                        const tailEntity = document.getElementById(relationTail['entity-id']);
-                        if (tailEntity) {
-                            const label = relationTail['label'];
-                            maybeSetColor(tailEntity, 'tail', label);
-                            visitedEntities.add(tailEntity);
-                        }
-                    });
-                    // highlight heads
-                    const relationHeadsAndLabels = JSON.parse(selectedEntity.getAttribute('data-relation-heads'));
-                    relationHeadsAndLabels.forEach(relationHead => {
-                        const headEntity = document.getElementById(relationHead['entity-id']);
-                        if (headEntity) {
-                            const label = relationHead['label'];
-                            maybeSetColor(headEntity, 'head', label);
-                            visitedEntities.add(headEntity);
-                        }
-                    });
-                    // highlight other entities
-                    entities.forEach(entity => {
-                        if (!visitedEntities.has(entity)) {
-                            const label = entity.getAttribute('data-label');
-                            maybeSetColor(entity, 'other', label);
-                        }
-                    });
-                }
-            }
-            function setReferenceAduId(entityId) {
-                // get the textarea element that holds the reference adu id
-                let referenceAduIdDiv = document.querySelector('#selected_adu_id textarea');
-                // set the value of the input field
-                referenceAduIdDiv.value = entityId;
-                // trigger an input event to update the state
-                var event = new Event('input');
-                referenceAduIdDiv.dispatchEvent(event);
-            }
-            const entities = document.querySelectorAll('.entity');
-            entities.forEach(entity => {
-                const alreadyHasListener = entity.getAttribute('data-has-listener');
-                if (alreadyHasListener) {
-                    return;
-                }
-                entity.addEventListener('mouseover', () => {
-                    highlightRelationArguments(entity.id);
-                    setReferenceAduId(entity.id);
-                });
-                entity.addEventListener('mouseout', () => {
-                    highlightRelationArguments(null);
-                });
-                entity.setAttribute('data-has-listener', 'true');
-            });
-        }
-        """
-        rendered_output.change(fn=None, js=js, inputs=[], outputs=[])
     demo.launch()

 import re
 import tempfile
 from functools import partial
+from typing import List, Optional, Tuple, Union
 import gradio as gr
 import pandas as pd
 from model_utils import annotate_document, create_document, load_models
 from pie_modules.taskmodules import PointerNetworkTaskModuleForEnd2EndRE
 from pytorch_ie import Pipeline
+from pytorch_ie.documents import (
+    TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+    TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+)
+from rendering_utils import HIGHLIGHT_SPANS_JS, render_displacy, render_pretty_table
 from transformers import PreTrainedModel, PreTrainedTokenizer
 from vector_store import QdrantVectorStore, SimpleVectorStore
 DEFAULT_EMBEDDING_BATCH_SIZE = 32
 DEFAULT_SPLIT_REGEX = "\n\n\n+"
+# Whether to handle segmented entities in the document. If True, labeled_spans are converted
+# to labeled_multi_spans and binary_relations with label "parts_of_same" are used to merge them.
+HANDLE_PARTS_OF_SAME = True
 def escape_regex(regex: str) -> str:
     # "double escape" the backslashes
 def render_annotated_document(
+    document: Union[
+        TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+        TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+    ],
     render_with: str,
     render_kwargs_json: str,
 ) -> str:
     models: Tuple[Pipeline, Optional[EmbeddingModel]],
     document_store: DocumentStore,
     split_regex_escaped: str,
+    handle_parts_of_same: bool = False,
+) -> Tuple[
+    dict,
+    Union[
+        TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+        TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+    ],
+]:
     try:
         document = create_document(
             text=text,
             if len(split_regex_escaped) > 0
             else None,
         )
+        document = annotate_document(
             document=document,
             annotation_pipeline=models[0],
             embedding_model=models[1],
+            handle_parts_of_same=handle_parts_of_same,
         )
         document_store.add_document(document)
     except Exception as e:
     document_store: DocumentStore,
     split_regex_escaped: str,
     show_max_cross_doc_sims: bool = False,
+    min_similarity: float = 0.95,
+    handle_parts_of_same: bool = False,
 ) -> pd.DataFrame:
     try:
         new_documents = []
                     if len(split_regex_escaped) > 0
                     else None,
                 )
+                new_document = annotate_document(
                     document=new_document,
                     annotation_pipeline=models[0],
                     embedding_model=models[1],
+                    handle_parts_of_same=handle_parts_of_same,
                 )
                 new_documents.append(new_document)
             else:
     except Exception as e:
         raise gr.Error(f"Failed to process uploaded files: {e}")
+    return document_store.overview(
+        with_max_cross_doc_sims=show_max_cross_doc_sims, min_similarity=min_similarity
+    )
 def open_accordion():
     evt: gr.SelectData,
     processed_documents_df: pd.DataFrame,
     document_store: DocumentStore,
+) -> Union[
+    TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+    TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+]:
     row_idx, col_idx = evt.index
+    col_name = processed_documents_df.columns[col_idx]
+    if not col_name.endswith("doc_id"):
+        col_name = "doc_id"
+    doc_id = processed_documents_df.iloc[row_idx][col_name]
     doc = document_store.get_document(doc_id, with_embeddings=False)
     return doc
                 span_annotation_caption="adu",
                 relation_annotation_caption="relation",
                 vector_store=QdrantVectorStore(),
+                document_type=TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
+                if not HANDLE_PARTS_OF_SAME
+                else TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+                span_layer_name="labeled_spans"
+                if not HANDLE_PARTS_OF_SAME
+                else "labeled_multi_spans",
             )
         )
         # wrap the pipeline and the embedding model/tokenizer in a tuple to avoid that it gets called
         show_overview_kwargs = dict(
             fn=lambda document_store, show_max_sims, min_sim: document_store.overview(
+                with_max_cross_doc_sims=show_max_sims, min_similarity=min_sim
             ),
             inputs=[document_store_state, show_max_cross_docu_sims, min_similarity],
             outputs=[processed_documents_df],
         )
         predict_btn.click(fn=open_accordion, inputs=[], outputs=[output_accordion]).then(
+            fn=partial(wrapped_process_text, handle_parts_of_same=HANDLE_PARTS_OF_SAME),
+            inputs=[
+                doc_text,
+                doc_id,
+                models_state,
+                document_store_state,
+                split_regex_escaped,
+            ],
             outputs=[document_json, document_state],
             api_name="predict",
         ).success(**show_overview_kwargs)
         upload_btn.upload(
             fn=open_accordion, inputs=[], outputs=[processed_documents_accordion]
         ).then(
+            fn=partial(process_uploaded_files, handle_parts_of_same=HANDLE_PARTS_OF_SAME),
             inputs=[
                 upload_btn,
                 models_state,
                 document_store_state,
                 split_regex_escaped,
                 show_max_cross_docu_sims,
+                min_similarity,
             ],
             outputs=[processed_documents_df],
         )
         selected_adu_id.change(
             fn=partial(
                 get_annotation_from_document,
+                annotation_layer="labeled_spans"
+                if not HANDLE_PARTS_OF_SAME
+                else "labeled_multi_spans",
                 use_predictions=True,
             ),
             inputs=[document_state, selected_adu_id],
                 ref_document=document,
                 min_similarity=min_sim,
                 top_k=k,
+                annotation_layer="labeled_spans"
+                if not HANDLE_PARTS_OF_SAME
+                else "labeled_multi_spans",
             ),
             inputs=[
                 document_store_state,
         #     **retrieve_relevant_adus_event_kwargs
         # )
+        rendered_output.change(fn=None, js=HIGHLIGHT_SPANS_JS, inputs=[], outputs=[])
     demo.launch()

document_store.py CHANGED Viewed

@@ -14,6 +14,7 @@ from annotation_utils import labeled_span_to_id
 from pytorch_ie import Annotation
 from pytorch_ie.documents import (
     TextBasedDocument,
     TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
 )
 from scipy.sparse import csr_matrix
@@ -45,7 +46,7 @@ def get_annotation_from_document(
     if use_predictions:
         annotations = annotations.predictions
-    if annotation_layer == "labeled_spans":
         annotation_to_id_func = labeled_span_to_id
     else:
         raise gr.Error(f"Unknown annotation layer '{annotation_layer}'.")
@@ -301,6 +302,12 @@ class DocumentStore:
     def add_document(self, document: TextBasedDocument) -> None:
         try:
             if document.id in self.documents:
                 gr.Warning(f"Document '{document.id}' already in index. Overwriting.")
@@ -485,6 +492,11 @@ class DocumentStore:
             max_doc_ids = max_doc2doc_similarities.idxmax(axis="columns")
             max_similarities = max_doc2doc_similarities.max(axis="columns")
             # set the index to the doc_id to correctly join the series
             df.set_index("doc_id", inplace=True)
@@ -551,7 +563,8 @@ class DocumentStore:
             # set similarities below min_similarity to 0
             similarities[similarities < min_similarity] = 0.0
-        # set triangular part to 0
         similarities = np.triu(similarities, k=1)
         # create a sparse matrix
         sparse_matrix = csr_matrix(similarities)
@@ -564,29 +577,26 @@ class DocumentStore:
         # construct the DataFrame
         records = []
-        for idx1, idx2 in zip(non_zero_idx[0], non_zero_idx[1]):
-            if idx1 < idx2:
-                doc_id1 = all_payloads[idx1]["doc_id"]
-                doc_id2 = all_payloads[idx2]["doc_id"]
-                annotation_id1 = all_payloads[idx1]["annotation_id"]
-                annotation_id2 = all_payloads[idx2]["annotation_id"]
-                annotation_text1 = doc_id_and_annotation_id2annotation_text[
-                    (doc_id1, annotation_id1)
-                ]
-                annotation_text2 = doc_id_and_annotation_id2annotation_text[
-                    (doc_id2, annotation_id2)
-                ]
-                records.append(
-                    {
-                        "sim_score": scores[idx1],
-                        "doc_id": doc_id1,
-                        "other_doc_id": doc_id2,
-                        "adu_id": annotation_id1,
-                        "other_adu_id": annotation_id2,
-                        "text": annotation_text1,
-                        "other_text": annotation_text2,
-                    }
-                )
         result_df = pd.DataFrame(records)
         gr.Info(f"DataFrame shape: {result_df.shape}")

 from pytorch_ie import Annotation
 from pytorch_ie.documents import (
     TextBasedDocument,
+    TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
     TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
 )
 from scipy.sparse import csr_matrix
     if use_predictions:
         annotations = annotations.predictions
+    if annotation_layer in ["labeled_spans", "labeled_multi_spans"]:
         annotation_to_id_func = labeled_span_to_id
     else:
         raise gr.Error(f"Unknown annotation layer '{annotation_layer}'.")
     def add_document(self, document: TextBasedDocument) -> None:
         try:
+            if not isinstance(document, self.document_type):
+                raise gr.Error(
+                    f"The document to add must be of type {self.document_type}, but is of type "
+                    f"{type(document)}."
+                )
             if document.id in self.documents:
                 gr.Warning(f"Document '{document.id}' already in index. Overwriting.")
             max_doc_ids = max_doc2doc_similarities.idxmax(axis="columns")
             max_similarities = max_doc2doc_similarities.max(axis="columns")
+            # entries where max_similarities is -inf are documents with no entries in other documents
+            # with similarity > min_similarity
+            mask = max_similarities == -np.inf
+            max_doc_ids[mask] = np.nan
+            max_similarities[mask] = np.nan
             # set the index to the doc_id to correctly join the series
             df.set_index("doc_id", inplace=True)
             # set similarities below min_similarity to 0
             similarities[similarities < min_similarity] = 0.0
+        # set the lower triangular part and the diagonal to 0 because we only want the
+        # strictly upper triangular part, which contains entries with idx1 < idx2
         similarities = np.triu(similarities, k=1)
         # create a sparse matrix
         sparse_matrix = csr_matrix(similarities)
         # construct the DataFrame
         records = []
+        for sparse_idx, (idx1, idx2) in enumerate(zip(non_zero_idx[0], non_zero_idx[1])):
+            payload1 = all_payloads[idx1]
+            payload2 = all_payloads[idx2]
+            doc_id1 = payload1["doc_id"]
+            doc_id2 = payload2["doc_id"]
+            annotation_id1 = payload1["annotation_id"]
+            annotation_id2 = payload2["annotation_id"]
+            annotation_text1 = doc_id_and_annotation_id2annotation_text[(doc_id1, annotation_id1)]
+            annotation_text2 = doc_id_and_annotation_id2annotation_text[(doc_id2, annotation_id2)]
+            records.append(
+                {
+                    "sim_score": scores[sparse_idx],
+                    "doc_id": doc_id1,
+                    "other_doc_id": doc_id2,
+                    "adu_id": annotation_id1,
+                    "other_adu_id": annotation_id2,
+                    "text": annotation_text1,
+                    "other_text": annotation_text2,
+                }
+            )
         result_df = pd.DataFrame(records)
         gr.Info(f"DataFrame shape: {result_df.shape}")

embedding.py CHANGED Viewed

@@ -1,12 +1,15 @@
 import abc
 import logging
-from typing import Dict
 import torch
 from datasets import Dataset
 from pie_modules.document.processing import tokenize_document
-from pie_modules.documents import TokenDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
-from pytorch_ie.annotations import Span
 from pytorch_ie.documents import TextBasedDocument
 from torch import FloatTensor, Tensor
 from torch.utils.data import DataLoader
@@ -18,7 +21,7 @@ logger = logging.getLogger(__name__)
 class EmbeddingModel(abc.ABC):
     def __call__(
         self, document: TextBasedDocument, span_layer_name: str
-    ) -> Dict[Span, FloatTensor]:
         """Embed text annotations from a document.
         Args:
@@ -51,7 +54,7 @@ class HuggingfaceEmbeddingModel(EmbeddingModel):
     def __call__(
         self, document: TextBasedDocument, span_layer_name: str
-    ) -> Dict[Span, FloatTensor]:
         # to not modify the original document
         document = document.copy()
         # tokenize_document does not yet consider predictions, so we need to add them manually
@@ -65,10 +68,21 @@ class HuggingfaceEmbeddingModel(EmbeddingModel):
             "return_overflowing_tokens": True,
         }
         # tokenize once to get the tokenized documents with mapped annotations
         tokenized_documents = tokenize_document(
             document,
             tokenizer=self._tokenizer,
-            result_document_type=TokenDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
             partition_layer="labeled_partitions",
             added_annotations=added_annotations,
             strict_span_conversion=False,
@@ -104,14 +118,33 @@ class HuggingfaceEmbeddingModel(EmbeddingModel):
             for last_hidden_state in model_output.last_hidden_state:
                 text2tok_ann = added_annotations[example_idx][span_layer_name]
                 tok2text_ann = {v: k for k, v in text2tok_ann.items()}
-                for tok_ann in tokenized_documents[example_idx].labeled_spans:
-                    # skip "empty" annotations
-                    if tok_ann.start == tok_ann.end:
-                        continue
                     # use the max pooling strategy to get a single embedding for the annotation text
-                    embedding = (
-                        last_hidden_state[tok_ann.start : tok_ann.end].max(dim=0)[0].detach().cpu()
-                    )
                     text_ann = tok2text_ann[tok_ann]
                     # if text_ann in embeddings:

 import abc
 import logging
+from typing import Dict, Union
 import torch
 from datasets import Dataset
 from pie_modules.document.processing import tokenize_document
+from pie_modules.documents import (
+    TokenDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+    TokenDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+)
+from pytorch_ie.annotations import LabeledSpan, MultiSpan, Span
 from pytorch_ie.documents import TextBasedDocument
 from torch import FloatTensor, Tensor
 from torch.utils.data import DataLoader
 class EmbeddingModel(abc.ABC):
     def __call__(
         self, document: TextBasedDocument, span_layer_name: str
+    ) -> Dict[Union[Span, MultiSpan], FloatTensor]:
         """Embed text annotations from a document.
         Args:
     def __call__(
         self, document: TextBasedDocument, span_layer_name: str
+    ) -> Dict[Union[Span, MultiSpan], FloatTensor]:
         # to not modify the original document
         document = document.copy()
         # tokenize_document does not yet consider predictions, so we need to add them manually
             "return_overflowing_tokens": True,
         }
         # tokenize once to get the tokenized documents with mapped annotations
+        span_annotation_type = document.annotation_types()[span_layer_name]
+        if issubclass(span_annotation_type, Span):
+            result_document_type = TokenDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
+            tokenized_span_layer_name = "labeled_spans"
+        elif issubclass(span_annotation_type, MultiSpan):
+            result_document_type = (
+                TokenDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions
+            )
+            tokenized_span_layer_name = "labeled_multi_spans"
+        else:
+            raise ValueError(f"Unsupported annotation type: {span_annotation_type}")
         tokenized_documents = tokenize_document(
             document,
             tokenizer=self._tokenizer,
+            result_document_type=result_document_type,
             partition_layer="labeled_partitions",
             added_annotations=added_annotations,
             strict_span_conversion=False,
             for last_hidden_state in model_output.last_hidden_state:
                 text2tok_ann = added_annotations[example_idx][span_layer_name]
                 tok2text_ann = {v: k for k, v in text2tok_ann.items()}
+                for tok_ann in tokenized_documents[example_idx][tokenized_span_layer_name]:
+                    if isinstance(tok_ann, LabeledSpan):
+                        # skip "empty" annotations
+                        if tok_ann.start == tok_ann.end:
+                            continue
+                        embedded_tokens = last_hidden_state[tok_ann.start : tok_ann.end]
+                    elif isinstance(tok_ann, MultiSpan):
+                        # skip "empty" annotations
+                        if all(start == end for start, end in tok_ann.slices):
+                            continue
+                        # concatenate the embeddings of the tokens that make up the multi-span
+                        embedded_tokens = torch.concat(
+                            [
+                                last_hidden_state[start:end]
+                                for start, end in tok_ann.slices
+                                if start != end
+                            ],
+                            dim=0,
+                        )
+                    else:
+                        raise ValueError(f"Unsupported annotation type: {type(tok_ann)}")
                     # use the max pooling strategy to get a single embedding for the annotation text
+                    embedding = embedded_tokens.max(dim=0)[0].detach().cpu()
                     text_ann = tok2text_ann[tok_ann]
                     # if text_ann in embeddings:

model_utils.py CHANGED Viewed

@@ -1,15 +1,18 @@
 import logging
-from typing import Optional, Tuple
 import gradio as gr
 import torch
 from annotation_utils import labeled_span_to_id
 from embedding import EmbeddingModel, HuggingfaceEmbeddingModel
-from pie_modules.document.processing import RegexPartitioner
 from pytorch_ie import Pipeline
 from pytorch_ie.annotations import LabeledSpan
 from pytorch_ie.auto import AutoPipeline
-from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 logger = logging.getLogger(__name__)
@@ -18,7 +21,11 @@ def annotate_document(
     document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
     annotation_pipeline: Pipeline,
     embedding_model: Optional[EmbeddingModel] = None,
-) -> None:
     """Annotate a document with the provided pipeline. If an embedding model is provided, also
     extract embeddings for the labeled spans.
@@ -26,15 +33,30 @@ def annotate_document(
         document: The document to annotate.
         annotation_pipeline: The pipeline to use for annotation.
         embedding_model: The embedding model to use for extracting text span embeddings.
     """
     # execute prediction pipeline
     annotation_pipeline(document)
     if embedding_model is not None:
         text_span_embeddings = embedding_model(
             document=document,
-            span_layer_name="labeled_spans",
         )
         # convert keys to str because JSON keys must be strings
         text_span_embeddings_dict = {
@@ -47,6 +69,8 @@ def annotate_document(
             "model in the 'Model Configuration' section."
         )
 def create_document(
     text: str, doc_id: str, split_regex: Optional[str] = None

 import logging
+from typing import Optional, Tuple, Union
 import gradio as gr
 import torch
 from annotation_utils import labeled_span_to_id
 from embedding import EmbeddingModel, HuggingfaceEmbeddingModel
+from pie_modules.document.processing import RegexPartitioner, SpansViaRelationMerger
 from pytorch_ie import Pipeline
 from pytorch_ie.annotations import LabeledSpan
 from pytorch_ie.auto import AutoPipeline
+from pytorch_ie.documents import (
+    TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+    TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+)
 logger = logging.getLogger(__name__)
     document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
     annotation_pipeline: Pipeline,
     embedding_model: Optional[EmbeddingModel] = None,
+    handle_parts_of_same: bool = False,
+) -> Union[
+    TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+    TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+]:
     """Annotate a document with the provided pipeline. If an embedding model is provided, also
     extract embeddings for the labeled spans.
         document: The document to annotate.
         annotation_pipeline: The pipeline to use for annotation.
         embedding_model: The embedding model to use for extracting text span embeddings.
+        handle_parts_of_same: Whether to merge spans that are part of the same entity into a single multi span.
     """
     # execute prediction pipeline
     annotation_pipeline(document)
+    if handle_parts_of_same:
+        merger = SpansViaRelationMerger(
+            relation_layer="binary_relations",
+            link_relation_label="parts_of_same",
+            create_multi_spans=True,
+            result_document_type=TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+            result_field_mapping={
+                "labeled_spans": "labeled_multi_spans",
+                "binary_relations": "binary_relations",
+                "labeled_partitions": "labeled_partitions",
+            },
+        )
+        document = merger(document)
     if embedding_model is not None:
         text_span_embeddings = embedding_model(
             document=document,
+            span_layer_name="labeled_spans" if not handle_parts_of_same else "labeled_multi_spans",
         )
         # convert keys to str because JSON keys must be strings
         text_span_embeddings_dict = {
             "model in the 'Model Configuration' section."
         )
+    return document
 def create_document(
     text: str, doc_id: str, split_regex: Optional[str] = None

rendering_utils.py CHANGED Viewed

@@ -4,23 +4,130 @@ from collections import defaultdict
 from typing import Dict, List, Optional, Union
 from annotation_utils import labeled_span_to_id
-from pytorch_ie.annotations import BinaryRelation, LabeledSpan
-from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 from rendering_utils_displacy import EntityRenderer
 logger = logging.getLogger(__name__)
 # adjusted from rendering_utils_displacy.TPL_ENT
 TPL_ENT_WITH_ID = """
-<mark class="entity" id="{id}" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
     {text}
     <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">{label}</span>
 </mark>
 """
 def render_pretty_table(
-    document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions, **render_kwargs
 ):
     from prettytable import PrettyTable
@@ -37,27 +144,57 @@ def render_pretty_table(
 def render_displacy(
-    document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
     inject_relations=True,
     colors_hover=None,
     entity_options={},
     **render_kwargs,
 ):
-    labeled_spans = list(document.labeled_spans) + list(document.labeled_spans.predictions)
     spacy_doc = {
         "text": document.text,
-        "ents": [
-            {
-                "start": labeled_span.start,
-                "end": labeled_span.end,
-                "label": labeled_span.label,
-                # pass the ID as a parameter to the entity. The id is required to fetch the entity annotations
-                # on hover and to inject the relation data.
-                "params": {"id": labeled_span_to_id(labeled_span)},
-            }
-            for labeled_span in labeled_spans
-        ],
         "title": None,
     }
@@ -75,7 +212,7 @@ def render_displacy(
         )
         html = inject_relation_data(
             html,
-            labeled_spans=labeled_spans,
             binary_relations=binary_relations,
             additional_colors=colors_hover,
         )
@@ -84,7 +221,7 @@ def render_displacy(
 def inject_relation_data(
     html: str,
-    labeled_spans: List[LabeledSpan],
     binary_relations: List[BinaryRelation],
     additional_colors: Optional[Dict[str, Union[str, dict]]] = None,
 ) -> str:
@@ -99,7 +236,7 @@ def inject_relation_data(
         entity2heads[relation.tail].append((relation.head, relation.label))
         entity2tails[relation.head].append((relation.tail, relation.label))
-    ann_id2annotation = {labeled_span_to_id(entity): entity for entity in labeled_spans}
     # Add unique IDs to each entity
     entities = soup.find_all(class_="entity")
     for entity in entities:
@@ -110,12 +247,21 @@ def inject_relation_data(
                 entity[f"data-color-{key}"] = (
                     json.dumps(color) if isinstance(color, dict) else color
                 )
-        entity_annotation = ann_id2annotation[entity["id"]]
         # sanity check.
-        annotation_text_without_newline = str(entity_annotation).replace("\n", "")
         # Just check the start, because the text has the label attached to the end
         if not entity.text.startswith(annotation_text_without_newline):
             logger.warning(f"Entity text mismatch: {entity_annotation} != {entity.text}")
         entity["data-label"] = entity_annotation.label
         entity["data-relation-tails"] = json.dumps(
             [

 from typing import Dict, List, Optional, Union
 from annotation_utils import labeled_span_to_id
+from pytorch_ie.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan
+from pytorch_ie.documents import (
+    TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+    TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+)
 from rendering_utils_displacy import EntityRenderer
 logger = logging.getLogger(__name__)
 # adjusted from rendering_utils_displacy.TPL_ENT
 # HTML template for a single rendered entity <mark>. In addition to the {bg}, {text} and {label}
 # placeholders it exposes {entity_id} and {slice_idx}, which end up in the data-entity-id and
 # data-slice-idx attributes. render_displacy passes the same entity id for every slice of a
 # multi-span (with slice_idx counting the slices), so all parts of one annotation can be found
 # via their shared data-entity-id.
 TPL_ENT_WITH_ID = """
+<mark class="entity" data-entity-id="{entity_id}" data-slice-idx="{slice_idx}" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
     {text}
     <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">{label}</span>
 </mark>
 """
+HIGHLIGHT_SPANS_JS = """
+() => {
+    function maybeSetColor(entity, colorAttributeKey, colorDictKey) {
+        var color = entity.getAttribute('data-color-' + colorAttributeKey);
+        // if color is a json string, parse it and use the value at colorDictKey
+        try {
+            const colors = JSON.parse(color);
+            color = colors[colorDictKey];
+        } catch (e) {}
+        if (color) {
+            entity.style.backgroundColor = color;
+            entity.style.color = '#000';
+        }
+    }
+    function highlightRelationArguments(entityId) {
+        const entities = document.querySelectorAll('.entity');
+        // reset all entities
+        entities.forEach(entity => {
+            const color = entity.getAttribute('data-color-original');
+            entity.style.backgroundColor = color;
+            entity.style.color = '';
+        });
+        if (entityId !== null) {
+            var visitedEntities = new Set();
+            // highlight selected entity
+            // get all elements with attribute data-entity-id==entityId
+            const selectedEntityParts = document.querySelectorAll(`[data-entity-id="${entityId}"]`);
+            selectedEntityParts.forEach(selectedEntityPart => {
+                const label = selectedEntityPart.getAttribute('data-label');
+                maybeSetColor(selectedEntityPart, 'selected', label);
+                visitedEntities.add(selectedEntityPart);
+            }); // <-- Corrected closing parenthesis here
+            // if there is at least one part, get the first one and ...
+            if (selectedEntityParts.length > 0) {
+                const selectedEntity = selectedEntityParts[0];
+                // ... highlight tails and ...
+                const relationTailsAndLabels = JSON.parse(selectedEntity.getAttribute('data-relation-tails'));
+                relationTailsAndLabels.forEach(relationTail => {
+                    const tailEntityId = relationTail['entity-id'];
+                    const tailEntityParts = document.querySelectorAll(`[data-entity-id="${tailEntityId}"]`);
+                    tailEntityParts.forEach(tailEntity => {
+                        const label = relationTail['label'];
+                        maybeSetColor(tailEntity, 'tail', label);
+                        visitedEntities.add(tailEntity);
+                    }); // <-- Corrected closing parenthesis here
+                }); // <-- Corrected closing parenthesis here
+                // .. highlight heads
+                const relationHeadsAndLabels = JSON.parse(selectedEntity.getAttribute('data-relation-heads'));
+                relationHeadsAndLabels.forEach(relationHead => {
+                    const headEntityId = relationHead['entity-id'];
+                    const headEntityParts = document.querySelectorAll(`[data-entity-id="${headEntityId}"]`);
+                    headEntityParts.forEach(headEntity => {
+                        const label = relationHead['label'];
+                        maybeSetColor(headEntity, 'head', label);
+                        visitedEntities.add(headEntity);
+                    }); // <-- Corrected closing parenthesis here
+                }); // <-- Corrected closing parenthesis here
+            }
+            // highlight other entities
+            entities.forEach(entity => {
+                if (!visitedEntities.has(entity)) {
+                    const label = entity.getAttribute('data-label');
+                    maybeSetColor(entity, 'other', label);
+                }
+            });
+        }
+    }
+    function setReferenceAduId(entityId) {
+        // get the textarea element that holds the reference adu id
+        let referenceAduIdDiv = document.querySelector('#selected_adu_id textarea');
+        // set the value of the input field
+        referenceAduIdDiv.value = entityId;
+        // trigger an input event to update the state
+        var event = new Event('input');
+        referenceAduIdDiv.dispatchEvent(event);
+    }
+    const entities = document.querySelectorAll('.entity');
+    entities.forEach(entity => {
+        const alreadyHasListener = entity.getAttribute('data-has-listener');
+        if (alreadyHasListener) {
+            return;
+        }
+        entity.addEventListener('mouseover', () => {
+            const entityId = entity.getAttribute('data-entity-id');
+            highlightRelationArguments(entityId);
+            setReferenceAduId(entityId);
+        });
+        entity.addEventListener('mouseout', () => {
+            highlightRelationArguments(null);
+        });
+        entity.setAttribute('data-has-listener', 'true');
+    });
+}
+"""
 def render_pretty_table(
+    document: Union[
+        TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+        TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+    ],
+    **render_kwargs,
 ):
     from prettytable import PrettyTable
 def render_displacy(
+    document: Union[
+        TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+        TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions,
+    ],
     inject_relations=True,
     colors_hover=None,
     entity_options={},
     **render_kwargs,
 ):
+    if isinstance(document, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions):
+        span_layer = document.labeled_spans
+    elif isinstance(
+        document, TextDocumentWithLabeledMultiSpansBinaryRelationsAndLabeledPartitions
+    ):
+        span_layer = document.labeled_multi_spans
+    else:
+        raise ValueError(f"Unsupported document type: {type(document)}")
+    span_annotations = list(span_layer) + list(span_layer.predictions)
+    ents = []
+    for labeled_span in span_annotations:
+        entity_id = labeled_span_to_id(labeled_span)
+        # pass the ID as a parameter to the entity. The id is required to fetch the entity annotations
+        # on hover and to inject the relation data.
+        if isinstance(labeled_span, LabeledSpan):
+            ents.append(
+                {
+                    "start": labeled_span.start,
+                    "end": labeled_span.end,
+                    "label": labeled_span.label,
+                    "params": {"entity_id": entity_id, "slice_idx": 0},
+                }
+            )
+        elif isinstance(labeled_span, LabeledMultiSpan):
+            for i, (start, end) in enumerate(labeled_span.slices):
+                ents.append(
+                    {
+                        "start": start,
+                        "end": end,
+                        "label": labeled_span.label,
+                        "params": {"entity_id": entity_id, "slice_idx": i},
+                    }
+                )
+        else:
+            raise ValueError(f"Unsupported labeled span type: {type(labeled_span)}")
     spacy_doc = {
         "text": document.text,
+        # the ents MUST be sorted by start and end
+        "ents": sorted(ents, key=lambda x: (x["start"], x["end"])),
         "title": None,
     }
         )
         html = inject_relation_data(
             html,
+            span_annotations=span_annotations,
             binary_relations=binary_relations,
             additional_colors=colors_hover,
         )
 def inject_relation_data(
     html: str,
+    span_annotations: Union[List[LabeledSpan], List[LabeledMultiSpan]],
     binary_relations: List[BinaryRelation],
     additional_colors: Optional[Dict[str, Union[str, dict]]] = None,
 ) -> str:
         entity2heads[relation.tail].append((relation.head, relation.label))
         entity2tails[relation.head].append((relation.tail, relation.label))
+    ann_id2annotation = {labeled_span_to_id(entity): entity for entity in span_annotations}
     # Add unique IDs to each entity
     entities = soup.find_all(class_="entity")
     for entity in entities:
                 entity[f"data-color-{key}"] = (
                     json.dumps(color) if isinstance(color, dict) else color
                 )
+        entity_annotation = ann_id2annotation[entity["data-entity-id"]]
         # sanity check.
+        if isinstance(entity_annotation, LabeledSpan):
+            annotation_text = entity_annotation.resolve()[1]
+        elif isinstance(entity_annotation, LabeledMultiSpan):
+            slice_idx = int(entity["data-slice-idx"])
+            annotation_text = entity_annotation.resolve()[1][slice_idx]
+        else:
+            raise ValueError(f"Unsupported entity type: {type(entity_annotation)}")
+        annotation_text_without_newline = annotation_text.replace("\n", "")
         # Just check the start, because the text has the label attached to the end
         if not entity.text.startswith(annotation_text_without_newline):
             logger.warning(f"Entity text mismatch: {entity_annotation} != {entity.text}")
         entity["data-label"] = entity_annotation.label
         entity["data-relation-tails"] = json.dumps(
             [

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 gradio==4.36.0
 prettytable==3.10.0
 pie-modules==0.12.0

+pytorch-ie==0.31.1
 gradio==4.36.0
 prettytable==3.10.0
 pie-modules==0.12.0