ArneBinder committed
Commit 04ce9af
1 Parent(s): b77f1d0

Upload 7 files

Files changed (2):
  1. app.py +30 -78
  2. backend.py +160 -140

app.py CHANGED
@@ -3,18 +3,17 @@ import logging
 import os.path
 import tempfile
 from functools import partial
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional, Tuple
 
 import gradio as gr
 import pandas as pd
-from backend import get_annotation_from_document, get_relevant_adus, get_similar_adus, process_text
+from backend import DocumentStore, create_and_annotate_document, get_annotation_from_document
 from pie_modules.taskmodules import PointerNetworkTaskModuleForEnd2EndRE
 from pytorch_ie import Pipeline
 from pytorch_ie.auto import AutoPipeline
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 from rendering_utils import render_displacy, render_pretty_table
 from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizer
-from vector_store import SimpleVectorStore, VectorStore
 
 logger = logging.getLogger(__name__)
 
@@ -49,18 +48,14 @@ def wrapped_process_text(
     text: str,
     doc_id: str,
     models: Tuple[Pipeline, Optional[PreTrainedModel], Optional[PreTrainedTokenizer]],
-    processed_documents: dict[
-        str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
-    ],
-    vector_store: VectorStore[Tuple[str, str]],
+    document_store: DocumentStore,
 ) -> Tuple[dict, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions]:
-    document = process_text(
+    document = create_and_annotate_document(
         text=text,
         doc_id=doc_id,
         models=models,
-        processed_documents=processed_documents,
-        vector_store=vector_store,
     )
+    document_store.add_document(document)
     # Return as dict and document to avoid serialization issues
     return document.asdict(), document
 
@@ -68,10 +63,7 @@ def wrapped_process_text(
 def process_uploaded_files(
     file_names: List[str],
     models: Tuple[Pipeline, Optional[PreTrainedModel], Optional[PreTrainedTokenizer]],
-    processed_documents: dict[
-        str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
-    ],
-    vector_store: VectorStore[Tuple[str, str]],
+    document_store: DocumentStore,
 ) -> pd.DataFrame:
     try:
         for file_name in file_names:
@@ -81,13 +73,14 @@ def process_uploaded_files(
                     text = f.read()
                 base_file_name = os.path.basename(file_name)
                 gr.Info(f"Processing file '{base_file_name}' ...")
-                process_text(text, base_file_name, models, processed_documents, vector_store)
+                document = create_and_annotate_document(text, base_file_name, models)
+                document_store.add_document(document)
             else:
                 raise gr.Error(f"Unsupported file format: {file_name}")
     except Exception as e:
         raise gr.Error(f"Failed to process uploaded files: {e}")
 
-    return update_processed_documents_df(processed_documents)
+    return document_store.overview()
 
 
 def open_accordion():
@@ -135,34 +128,15 @@ def load_models(
     return argumentation_model, embedding_model, embedding_tokenizer
 
 
-def update_processed_documents_df(
-    processed_documents: dict[str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions]
-) -> pd.DataFrame:
-    df = pd.DataFrame(
-        [
-            (
-                doc_id,
-                len(document.labeled_spans.predictions),
-                len(document.binary_relations.predictions),
-            )
-            for doc_id, document in processed_documents.items()
-        ],
-        columns=["doc_id", "num_adus", "num_relations"],
-    )
-    return df
-
-
 def select_processed_document(
     evt: gr.SelectData,
     processed_documents_df: pd.DataFrame,
-    processed_documents: Dict[
-        str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
-    ],
+    document_store: DocumentStore,
 ) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
     row_idx, col_idx = evt.index
     doc_id = processed_documents_df.iloc[row_idx]["doc_id"]
     gr.Info(f"Select document: {doc_id}")
-    doc = processed_documents[doc_id]
+    doc = document_store.get_document(doc_id)
     return doc
 
 
@@ -185,38 +159,24 @@ def set_relation_types(
 
 
 def download_processed_documents(
-    processed_documents: dict[
-        str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
-    ],
+    document_store: DocumentStore,
     file_name: str = "processed_documents.json",
 ) -> str:
-    processed_documents_json = {
-        doc_id: document.asdict() for doc_id, document in processed_documents.items()
-    }
     file_path = os.path.join(tempfile.gettempdir(), file_name)
     with open(file_path, "w", encoding="utf-8") as f:
-        json.dump(processed_documents_json, f, indent=2)
+        json.dump(document_store.as_dict(), f, indent=2)
     return file_path
 
 
 def upload_processed_documents(
     file_name: str,
-    processed_documents: dict[
-        str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
-    ],
-) -> Dict[str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions]:
+    document_store: DocumentStore,
+) -> pd.DataFrame:
     with open(file_name, "r", encoding="utf-8") as f:
         processed_documents_json = json.load(f)
-    for doc_id, document_json in processed_documents_json.items():
-        document = TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions.fromdict(
-            document_json
-        )
-        # metadata is not automatically deserialized, so we need to set it manually
-        document.metadata["embeddings"] = document_json["metadata"]["embeddings"]
-        if doc_id in processed_documents:
-            gr.Warning(f"Document '{doc_id}' already exists. Overwriting.")
-        processed_documents[doc_id] = document
-    return processed_documents
+    for _, document_json in processed_documents_json.items():
+        document_store.add_document_from_dict(document_dict=document_json)
+    return document_store.overview()
 
 
 def main():
@@ -256,8 +216,7 @@ def main():
     }
 
     with gr.Blocks() as demo:
-        processed_documents_state = gr.State(dict())
-        vector_store_state = gr.State(SimpleVectorStore())
+        document_store_state = gr.State(DocumentStore())
         # wrap the pipeline and the embedding model/tokenizer in a tuple to avoid that it gets called
         models_state = gr.State((argumentation_model, embedding_model, embedding_tokenizer))
        with gr.Row():
@@ -381,12 +340,12 @@ def main():
 
         predict_btn.click(fn=open_accordion, inputs=[], outputs=[output_accordion]).then(
             fn=wrapped_process_text,
-            inputs=[doc_text, doc_id, models_state, processed_documents_state, vector_store_state],
+            inputs=[doc_text, doc_id, models_state, document_store_state],
             outputs=[document_json, document_state],
             api_name="predict",
         ).success(
-            fn=update_processed_documents_df,
-            inputs=[processed_documents_state],
+            fn=lambda document_store: document_store.overview(),
+            inputs=[document_store_state],
             outputs=[processed_documents_df],
         )
         render_btn.click(**render_event_kwargs, api_name="render")
@@ -403,41 +362,35 @@ def main():
             fn=open_accordion, inputs=[], outputs=[processed_documents_accordion]
         ).then(
             fn=process_uploaded_files,
-            inputs=[upload_btn, models_state, processed_documents_state, vector_store_state],
+            inputs=[upload_btn, models_state, document_store_state],
             outputs=[processed_documents_df],
         )
         processed_documents_df.select(
             select_processed_document,
-            inputs=[processed_documents_df, processed_documents_state],
+            inputs=[processed_documents_df, document_store_state],
             outputs=[document_state],
         )
 
         download_processed_documents_btn.click(
             fn=download_processed_documents,
-            inputs=[processed_documents_state],
+            inputs=[document_store_state],
             outputs=[download_processed_documents_btn],
         )
         upload_processed_documents_btn.upload(
             fn=upload_processed_documents,
-            inputs=[upload_processed_documents_btn, processed_documents_state],
-            outputs=[processed_documents_state],
-        ).success(
-            fn=update_processed_documents_df,
-            inputs=[processed_documents_state],
+            inputs=[upload_processed_documents_btn, document_store_state],
             outputs=[processed_documents_df],
         )
 
         retrieve_relevant_adus_event_kwargs = dict(
-            fn=get_relevant_adus,
+            fn=partial(DocumentStore.get_relevant_adus_df, columns=relevant_adus.headers),
            inputs=[
+                document_store_state,
                 selected_adu_id,
                 document_state,
-                vector_store_state,
-                processed_documents_state,
                 min_similarity,
                 top_k,
                 relation_types,
-                relevant_adus,
             ],
             outputs=[relevant_adus],
         )
@@ -449,12 +402,11 @@ def main():
         ).success(**retrieve_relevant_adus_event_kwargs)
 
         retrieve_similar_adus_btn.click(
-            fn=get_similar_adus,
+            fn=DocumentStore.get_similar_adus_df,
             inputs=[
+                document_store_state,
                 selected_adu_id,
                 document_state,
-                vector_store_state,
-                processed_documents_state,
                 min_similarity,
                 top_k,
             ],
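
Note on the event wiring above: unbound DocumentStore methods (e.g. DocumentStore.get_similar_adus_df, or partial(DocumentStore.get_relevant_adus_df, columns=relevant_adus.headers)) are passed as Gradio callbacks, and the gr.State that holds the per-session DocumentStore is listed as the first input, so Gradio fills the method's self parameter with the stored instance. A minimal, self-contained sketch of that pattern (the toy Store class and component names are illustrative, not from this repo):

import gradio as gr


class Store:
    """Toy stand-in for DocumentStore: keeps a list of entries per session."""

    def __init__(self):
        self.entries = []

    def add(self, text: str) -> str:
        # mutating the instance held in gr.State persists within the session
        self.entries.append(text)
        return "\n".join(self.entries)


with gr.Blocks() as demo:
    store_state = gr.State(Store())  # each session gets its own copy of the initial value
    text_in = gr.Textbox(label="entry")
    log_out = gr.Textbox(label="all entries")
    add_btn = gr.Button("Add")
    # unbound method as callback: the value of store_state fills the `self` parameter
    add_btn.click(fn=Store.add, inputs=[store_state, text_in], outputs=[log_out])

if __name__ == "__main__":
    demo.launch()
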
backend.py CHANGED
@@ -11,12 +11,12 @@ from pytorch_ie.annotations import LabeledSpan, Span
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 from rendering_utils import labeled_span_to_id
 from transformers import PreTrainedModel, PreTrainedTokenizer
-from vector_store import VectorStore
+from vector_store import SimpleVectorStore, VectorStore
 
 logger = logging.getLogger(__name__)
 
 
-def embed_text_annotations(
+def _embed_text_annotations(
     document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
     model: PreTrainedModel,
     tokenizer: PreTrainedTokenizer,
@@ -73,7 +73,7 @@ def embed_text_annotations(
     return embeddings
 
 
-def annotate(
+def _annotate(
     document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
     pipeline: Pipeline,
     embedding_model: Optional[PreTrainedModel] = None,
@@ -84,7 +84,7 @@ def annotate(
     pipeline(document)
 
     if embedding_model is not None and embedding_tokenizer is not None:
-        adu_embeddings = embed_text_annotations(
+        adu_embeddings = _embed_text_annotations(
             document=document,
             model=embedding_model,
             tokenizer=embedding_tokenizer,
@@ -102,38 +102,10 @@ def annotate(
         )
 
 
-def add_to_index(
-    document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
-    processed_documents: dict,
-    vector_store: VectorStore[Tuple[str, str]],
-) -> None:
-    try:
-        if document.id in processed_documents:
-            gr.Warning(f"Document '{document.id}' already in index. Overwriting.")
-
-        # save the processed document to the index
-        processed_documents[document.id] = document
-
-        # save the embeddings to the vector store
-        for adu_id, embedding in document.metadata["embeddings"].items():
-            vector_store.save((document.id, adu_id), embedding)
-
-        gr.Info(
-            f"Added document {document.id} to index (index contains {len(processed_documents)} "
-            f"documents and {len(vector_store)} embeddings)."
-        )
-    except Exception as e:
-        raise gr.Error(f"Failed to add document {document.id} to index: {e}")
-
-
-def process_text(
+def create_and_annotate_document(
     text: str,
     doc_id: str,
     models: Tuple[Pipeline, Optional[PreTrainedModel], Optional[PreTrainedTokenizer]],
-    processed_documents: dict[
-        str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
-    ],
-    vector_store: VectorStore[Tuple[str, str]],
 ) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
     """Create a TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions from the provided
     text, annotate it, and add it to the index.
@@ -142,8 +114,6 @@ def process_text(
         text: The text to process.
         doc_id: The ID of the document.
         models: A tuple containing the prediction pipeline and the embedding model and tokenizer.
-        processed_documents: The index of processed documents.
-        vector_store: The vector store to save the embeddings.
 
     Returns:
         The processed document.
@@ -156,14 +126,12 @@ def process_text(
         # add single partition from the whole text (the model only considers text in partitions)
         document.labeled_partitions.append(LabeledSpan(start=0, end=len(text), label="text"))
         # annotate the document
-        annotate(
+        _annotate(
             document=document,
             pipeline=models[0],
             embedding_model=models[1],
             embedding_tokenizer=models[2],
         )
-        # add the document to the index
-        add_to_index(document, processed_documents, vector_store)
 
         return document
     except Exception as e:
@@ -187,113 +155,165 @@ def get_annotation_from_document(
     return annotation
 
 
-def get_annotation_from_processed_documents(
-    doc_id: str,
-    annotation_id: str,
-    annotation_layer: str,
-    processed_documents: dict[
-        str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
-    ],
-) -> LabeledSpan:
-    document = processed_documents.get(doc_id)
-    if document is None:
-        raise gr.Error(
-            f"Document '{doc_id}' not found in index. Available documents: {list(processed_documents)}"
-        )
-    return get_annotation_from_document(document, annotation_id, annotation_layer)
-
-
-def get_similar_adus(
-    ref_annotation_id: str,
-    ref_document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
-    vector_store: VectorStore[Tuple[str, str]],
-    processed_documents: dict[
-        str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
-    ],
-    min_similarity: float,
-    top_k: int,
-) -> pd.DataFrame:
-    similar_entries = vector_store.retrieve_similar(
-        ref_id=(ref_document.id, ref_annotation_id),
-        min_similarity=min_similarity,
-        top_k=top_k,
-    )
-
-    similar_annotations = [
-        get_annotation_from_processed_documents(
-            doc_id=doc_id,
-            annotation_id=annotation_id,
-            annotation_layer="labeled_spans",
-            processed_documents=processed_documents,
-        )
-        for (doc_id, annotation_id), _ in similar_entries
-    ]
-    df = pd.DataFrame(
-        [
-            # unpack the tuple (doc_id, annotation_id) to separate columns
-            # and add the similarity score and the text of the annotation
-            (doc_id, annotation_id, score, str(annotation))
-            for ((doc_id, annotation_id), score), annotation in zip(
-                similar_entries, similar_annotations
-            )
-        ],
-        columns=["doc_id", "adu_id", "sim_score", "text"],
-    )
-
-    return df
-
-
-def get_relevant_adus(
-    ref_annotation_id: str,
-    ref_document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
-    vector_store: VectorStore[Tuple[str, str]],
-    processed_documents: dict[
-        str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
-    ],
-    min_similarity: float,
-    top_k: int,
-    relation_types: List[str],
-    previous_result: pd.DataFrame,
-) -> pd.DataFrame:
-    similar_entries = vector_store.retrieve_similar(
-        ref_id=(ref_document.id, ref_annotation_id),
-        min_similarity=min_similarity,
-        top_k=top_k,
-    )
-    result = []
-    for (doc_id, annotation_id), score in similar_entries:
-        # skip entries from the same document
-        if doc_id == ref_document.id:
-            continue
-        document = processed_documents[doc_id]
-        tail2rels = defaultdict(list)
-        head2rels = defaultdict(list)
-        for rel in document.binary_relations.predictions:
-            # skip non-argumentative relations
-            if rel.label not in relation_types:
-                continue
-            head2rels[rel.head].append(rel)
-            tail2rels[rel.tail].append(rel)
-
-        id2annotation = {
-            labeled_span_to_id(annotation): annotation
-            for annotation in document.labeled_spans.predictions
-        }
-        annotation = id2annotation.get(annotation_id)
-        # note: we do not need to check if the annotation is different from the reference annotation,
-        # because they come from different documents and we already skip entries from the same document
-        for rel in head2rels.get(annotation, []):
-            result.append(
-                {
-                    "doc_id": doc_id,
-                    "reference_adu": str(annotation),
-                    "sim_score": score,
-                    "rel_score": rel.score,
-                    "relation": rel.label,
-                    "adu": str(rel.tail),
-                }
-            )
-
-    # define column order
-    df = pd.DataFrame(result, columns=previous_result.columns)
-    return df
+class DocumentStore:
+
+    DOCUMENT_TYPE = TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
+
+    def __init__(self, vector_store: Optional[VectorStore[Tuple[str, str]]] = None):
+        self.documents = {}
+        self.vector_store = vector_store or SimpleVectorStore()
+
+    def get_annotation(
+        self,
+        doc_id: str,
+        annotation_id: str,
+        annotation_layer: str,
+    ) -> LabeledSpan:
+        document = self.documents.get(doc_id)
+        if document is None:
+            raise gr.Error(
+                f"Document '{doc_id}' not found in index. Available documents: {list(self.documents)}"
+            )
+        return get_annotation_from_document(document, annotation_id, annotation_layer)
+
+    def get_similar_adus_df(
+        self,
+        ref_annotation_id: str,
+        ref_document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+        min_similarity: float,
+        top_k: int,
+    ) -> pd.DataFrame:
+        similar_entries = self.vector_store.retrieve_similar(
+            ref_id=(ref_document.id, ref_annotation_id),
+            min_similarity=min_similarity,
+            top_k=top_k,
+        )
+
+        similar_annotations = [
+            self.get_annotation(
+                doc_id=doc_id,
+                annotation_id=annotation_id,
+                annotation_layer="labeled_spans",
+            )
+            for (doc_id, annotation_id), _ in similar_entries
+        ]
+        df = pd.DataFrame(
+            [
+                # unpack the tuple (doc_id, annotation_id) to separate columns
+                # and add the similarity score and the text of the annotation
+                (doc_id, annotation_id, score, str(annotation))
+                for ((doc_id, annotation_id), score), annotation in zip(
+                    similar_entries, similar_annotations
+                )
+            ],
+            columns=["doc_id", "adu_id", "sim_score", "text"],
+        )
+
+        return df
+
+    def get_relevant_adus_df(
+        self,
+        ref_annotation_id: str,
+        ref_document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+        min_similarity: float,
+        top_k: int,
+        relation_types: List[str],
+        columns: List[str],
+    ) -> pd.DataFrame:
+        similar_entries = self.vector_store.retrieve_similar(
+            ref_id=(ref_document.id, ref_annotation_id),
+            min_similarity=min_similarity,
+            top_k=top_k,
+        )
+        result = []
+        for (doc_id, annotation_id), score in similar_entries:
+            # skip entries from the same document
+            if doc_id == ref_document.id:
+                continue
+            document = self.documents[doc_id]
+            tail2rels = defaultdict(list)
+            head2rels = defaultdict(list)
+            for rel in document.binary_relations.predictions:
+                # skip non-argumentative relations
+                if rel.label not in relation_types:
+                    continue
+                head2rels[rel.head].append(rel)
+                tail2rels[rel.tail].append(rel)
+
+            id2annotation = {
+                labeled_span_to_id(annotation): annotation
+                for annotation in document.labeled_spans.predictions
+            }
+            annotation = id2annotation.get(annotation_id)
+            # note: we do not need to check if the annotation is different from the reference annotation,
+            # because they come from different documents and we already skip entries from the same document
+            for rel in head2rels.get(annotation, []):
+                result.append(
+                    {
+                        "doc_id": doc_id,
+                        "reference_adu": str(annotation),
+                        "sim_score": score,
+                        "rel_score": rel.score,
+                        "relation": rel.label,
+                        "adu": str(rel.tail),
+                    }
+                )
+
+        # define column order
+        df = pd.DataFrame(result, columns=columns)
+        return df
+
+    def add_document(
+        self, document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
+    ) -> None:
+        try:
+            if document.id in self.documents:
+                gr.Warning(f"Document '{document.id}' already in index. Overwriting.")
+
+            # save the processed document to the index
+            self.documents[document.id] = document
+
+            # save the embeddings to the vector store
+            for adu_id, embedding in document.metadata["embeddings"].items():
+                self.vector_store.save((document.id, adu_id), embedding)
+
+            gr.Info(
+                f"Added document {document.id} to index (index contains {len(self.documents)} "
+                f"documents and {len(self.vector_store)} embeddings)."
+            )
+        except Exception as e:
+            raise gr.Error(f"Failed to add document {document.id} to index: {e}")
+
+    def add_document_from_dict(self, document_dict: dict) -> None:
+        document = self.DOCUMENT_TYPE.fromdict(document_dict)
+        # metadata is not automatically deserialized, so we need to set it manually
+        document.metadata = document_dict["metadata"]
+        self.add_document(document)
+
+    def add_documents(
+        self, documents: List[TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions]
+    ) -> None:
+        for document in documents:
+            self.add_document(document)
+
+    def get_document(
+        self, doc_id: str
+    ) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
+        return self.documents[doc_id]
+
+    def overview(self) -> pd.DataFrame:
+        df = pd.DataFrame(
+            [
+                (
+                    doc_id,
+                    len(document.labeled_spans.predictions),
+                    len(document.binary_relations.predictions),
+                )
+                for doc_id, document in self.documents.items()
+            ],
+            columns=["doc_id", "num_adus", "num_relations"],
+        )
+        return df
+
+    def as_dict(self) -> dict:
+        return {doc_id: document.asdict() for doc_id, document in self.documents.items()}
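
Taken together, backend.py now separates document creation from indexing: create_and_annotate_document only builds and annotates a document, while all index state (the documents dict plus the vector store) lives in DocumentStore. A minimal usage sketch of the refactored API (not part of the commit; it assumes this Space's backend.py is importable, that models is the (pipeline, embedding model, embedding tokenizer) tuple produced by app.py's load_models, and that an embedding model was loaded so metadata["embeddings"] is populated):

from backend import DocumentStore, create_and_annotate_document


def index_and_query(models, store: DocumentStore, text: str, doc_id: str):
    # creation/annotation no longer touches the index ...
    document = create_and_annotate_document(text=text, doc_id=doc_id, models=models)
    # ... adding it to the store is now an explicit, separate step
    store.add_document(document)

    # overview() replaces the old update_processed_documents_df() helper
    print(store.overview())

    # retrieve ADUs similar to one ADU of the new document from the store's vector store
    some_adu_id = next(iter(document.metadata["embeddings"]))
    return store.get_similar_adus_df(
        ref_annotation_id=some_adu_id,
        ref_document=document,
        min_similarity=0.8,
        top_k=10,
    )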