Commit d7a2972
1 Parent(s): 1681237
from https://github.com/ArneBinder/pie-document-level/pull/225

Changed files:
- app.py +13 -5
- document_store.py +120 -27
- requirements.txt +1 -0
- vector_store.py +165 -17
app.py
CHANGED
@@ -16,6 +16,7 @@ from pytorch_ie import Pipeline
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 from rendering_utils import render_displacy, render_pretty_table
 from transformers import PreTrainedModel, PreTrainedTokenizer
+from vector_store import QdrantVectorStore, SimpleVectorStore
 
 logger = logging.getLogger(__name__)
 
@@ -65,6 +66,9 @@ def wrapped_process_text(
         document_store.add_document(document)
     except Exception as e:
         raise gr.Error(f"Failed to process text: {e}")
+    # remove the embeddings because they are very large
+    if document.metadata.get("embeddings"):
+        document.metadata = {k: v for k, v in document.metadata.items() if k != "embeddings"}
     # Return as dict and document to avoid serialization issues
     return document.asdict(), document
 
@@ -117,7 +121,7 @@ def select_processed_document(
 ) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
     row_idx, col_idx = evt.index
     doc_id = processed_documents_df.iloc[row_idx]["doc_id"]
-    doc = document_store.get_document(doc_id)
+    doc = document_store.get_document(doc_id, with_embeddings=False)
     return doc
 
 
@@ -144,7 +148,7 @@ def download_processed_documents(
     file_name: str = "processed_documents.json",
 ) -> str:
     file_path = os.path.join(tempfile.gettempdir(), file_name)
-    document_store.
+    document_store.save_to_file(file_path, indent=2)
     return file_path
 
 
@@ -152,7 +156,7 @@ def upload_processed_documents(
     file_name: str,
     document_store: DocumentStore,
 ) -> pd.DataFrame:
-    document_store.
+    document_store.add_documents_from_file(file_name)
     return document_store.overview()
 
 
@@ -197,7 +201,11 @@ def main():
 
     with gr.Blocks() as demo:
         document_store_state = gr.State(
-            DocumentStore(
+            DocumentStore(
+                span_annotation_caption="adu",
+                relation_annotation_caption="relation",
+                vector_store=QdrantVectorStore(),
+            )
         )
         # wrap the pipeline and the embedding model/tokenizer in a tuple to avoid that it gets called
         models_state = gr.State((argumentation_model, embedding_model))
@@ -379,7 +387,7 @@
         )
 
         download_processed_documents_btn.click(
-            fn=download_processed_documents,
+            fn=partial(download_processed_documents, file_name="processed_documents.zip"),
             inputs=[document_store_state],
             outputs=[download_processed_documents_btn],
         )
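Note on the last hunk: the click handler is now bound with `partial` so the archive name is fixed up front while Gradio only supplies the store state. A minimal sketch of that wiring, assuming the handler signature shown above and a `from functools import partial` import in app.py (not visible in this diff):

import os
import tempfile
from functools import partial

def download_processed_documents(document_store, file_name: str = "processed_documents.json") -> str:
    # stand-in for the handler above: persist the store and hand the path to Gradio
    file_path = os.path.join(tempfile.gettempdir(), file_name)
    document_store.save_to_file(file_path, indent=2)
    return file_path

# pre-bind file_name; the ".zip" suffix makes save_to_file dispatch to save_to_zip,
# so the download now carries the vector store alongside the documents
download_zip = partial(download_processed_documents, file_name="processed_documents.zip")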
document_store.py
CHANGED
@@ -1,7 +1,11 @@
 import json
 import logging
+import os
+import shutil
+import tempfile
+import zipfile
 from collections import defaultdict
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
 import gradio as gr
 import pandas as pd
@@ -11,7 +15,7 @@ from pytorch_ie.documents import (
     TextBasedDocument,
     TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
 )
-from vector_store import
+from vector_store import VectorStore
 
 logger = logging.getLogger(__name__)
 
@@ -134,9 +138,11 @@ class DocumentStore:
     are used, otherwise the gold annotations are used.
     """
 
+    JSON_FILE_NAME = "documents.json"
+
     def __init__(
         self,
-        vector_store:
+        vector_store: VectorStore[Dict[str, Any], List[float]],
         document_type: type[
             TextBasedDocument
         ] = TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
@@ -151,9 +157,7 @@ class DocumentStore:
         self.documents: Dict[str, TextBasedDocument] = {}
         # The vector store to efficiently retrieve similar spans. Can be constructed from the
         # documents.
-        self.vector_store = (
-            vector_store or SimpleVectorStore()
-        )
+        self.vector_store = vector_store
         # the document type (to create new documents from dicts)
         self.document_type = document_type
         self.span_layer_name = span_layer_name
@@ -180,6 +184,10 @@ class DocumentStore:
             document, annotation_id, annotation_layer, use_predictions=use_predictions
         )
 
+    def construct_embedding_payload(self, document: TextBasedDocument, annotation_id: str) -> dict:
+        payload = {"doc_id": document.id, "annotation_id": annotation_id}
+        return payload
+
     def get_similar_annotations_df(
         self,
         ref_annotation_id: str,
@@ -203,27 +211,25 @@ class DocumentStore:
         """
 
         similar_entries = self.vector_store.retrieve_similar(
-
+            ref_payload=self.construct_embedding_payload(ref_document, ref_annotation_id),
             **similarity_kwargs,
         )
 
         similar_annotations = [
             self.get_annotation(
-                doc_id=doc_id,
-                annotation_id=annotation_id,
+                doc_id=payload["doc_id"],
+                annotation_id=payload["annotation_id"],
                 annotation_layer=annotation_layer,
                 use_predictions=self.use_predictions,
             )
-            for
+            for _, payload, _ in similar_entries
         ]
         df = pd.DataFrame(
             [
                 # unpack the tuple (doc_id, annotation_id) to separate columns
                 # and add the similarity score and the text of the annotation
-                (doc_id, annotation_id, score, str(annotation))
-                for (
-                    similar_entries, similar_annotations
-                )
+                (payload["doc_id"], payload["annotation_id"], score, str(annotation))
+                for (_, payload, score), annotation in zip(similar_entries, similar_annotations)
             ],
            columns=["doc_id", "annotation_id", "sim_score", "text"],
         )
@@ -258,19 +264,20 @@ class DocumentStore:
         """
 
         similar_entries = self.vector_store.retrieve_similar(
-
+            ref_payload=self.construct_embedding_payload(ref_document, ref_annotation_id),
             min_similarity=min_similarity,
             top_k=top_k,
         )
         result = []
-        for
+        for _, payload, score in similar_entries:
+            doc_id = payload["doc_id"]
             # skip entries from the same document
             if doc_id == ref_document.id:
                 continue
             document = self.documents[doc_id]
             reference_annotation = get_annotation_from_document(
                 document=document,
-                annotation_id=annotation_id,
+                annotation_id=payload["annotation_id"],
                 annotation_layer=self.span_layer_name,
                 use_predictions=self.use_predictions,
             )
@@ -295,12 +302,21 @@ class DocumentStore:
             if document.id in self.documents:
                 gr.Warning(f"Document '{document.id}' already in index. Overwriting.")
 
+            # copy the document to avoid side effects
+            document = document.copy()
+
             # save the processed document to the index
             self.documents[document.id] = document
 
-            # save the embeddings to the vector store
-
-
+            # save the embeddings to the vector store, if available
+            if "embeddings" in document.metadata:
+                for annotation_id, embedding in document.metadata["embeddings"].items():
+                    payload = self.construct_embedding_payload(document, annotation_id)
+                    self.vector_store.add(payload=payload, embedding=embedding)
+                # remove the embeddings from the document metadata
+                document.metadata = {
+                    k: v for k, v in document.metadata.items() if k != "embeddings"
+                }
 
         except Exception as e:
             raise gr.Error(f"Failed to add document {document.id} to index: {e}")
@@ -325,12 +341,81 @@ class DocumentStore:
             f"Added {len(documents_json)} documents to the index ({len(self.documents)} documents in total)."
         )
 
-    def
+    def add_documents_from_zip(self, file_path: str) -> None:
+        temp_dir = os.path.join(tempfile.gettempdir(), "document_store")
+        # remove the temporary directory if it already exists
+        if os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir)
+        with zipfile.ZipFile(file_path, "r") as zipf:
+            # extract all files to the temporary directory
+            zipf.extractall(temp_dir)
+        json_file_path = os.path.join(temp_dir, self.JSON_FILE_NAME)
+        self.add_documents_from_json(json_file_path)
+        # load the vector store from the temporary directory
+        self.vector_store.load_from_directory(temp_dir)
+        # delete the temporary directory
+        shutil.rmtree(temp_dir)
+
+    def add_documents_from_file(self, file_path: str) -> None:
+        if file_path.endswith(".json"):
+            self.add_documents_from_json(file_path)
+        elif file_path.endswith(".zip"):
+            self.add_documents_from_zip(file_path)
+        else:
+            raise gr.Error(f"Unsupported file format: {file_path}")
+
+    def save_to_json(self, file_path: str, include_embeddings: bool = True, **kwargs) -> None:
         with open(file_path, "w", encoding="utf-8") as f:
-            json.dump(self.as_dict(), f, **kwargs)
-
-    def
-
+            json.dump(self.as_dict(include_embeddings=include_embeddings), f, **kwargs)
+
+    def save_to_zip(self, file_path: str, **kwargs) -> None:
+        # first create a new temporary directory and save the documents as json file in it
+        temp_dir = os.path.join(tempfile.gettempdir(), "document_store")
+        # remove the temporary directory if it already exists
+        if os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir)
+        os.makedirs(temp_dir)
+        temp_file_path = os.path.join(temp_dir, self.JSON_FILE_NAME)
+        self.save_to_json(temp_file_path, include_embeddings=False, **kwargs)
+        self.vector_store.save_to_directory(temp_dir)
+        # then zip all files in the temporary directory and write them to the target file
+        with zipfile.ZipFile(file_path, "w") as zipf:
+            for root, _, files in os.walk(temp_dir):
+                for file in files:
+                    zipf.write(
+                        os.path.join(root, file),
+                        os.path.relpath(os.path.join(root, file), temp_dir),
+                    )
+        # delete the temporary directory
+        shutil.rmtree(temp_dir)
+
+    def save_to_file(self, file_path: str, **kwargs) -> None:
+        if file_path.endswith(".json"):
+            self.save_to_json(file_path, **kwargs)
+        elif file_path.endswith(".zip"):
+            self.save_to_zip(file_path, **kwargs)
+        else:
+            raise gr.Error(f"Unsupported file format: {file_path}")
+
+    def get_document(self, doc_id: str, with_embeddings: bool = False) -> TextBasedDocument:
+        document = self.documents[doc_id]
+        if not with_embeddings:
+            return document
+
+        # TODO: is this really required?
+        # copy because we add the embeddings to the metadata
+        document = document.copy()
+        # get the embeddings from the vector store
+        embeddings = {}
+        for annotation in document[self.span_layer_name].predictions:
+            annotation_id = labeled_span_to_id(annotation)
+            payload = self.construct_embedding_payload(document, annotation_id)
+            embedding = self.vector_store.get(payload=payload)
+            if embedding is not None:
+                embeddings[annotation_id] = embedding
+        document.metadata["embeddings"] = embeddings
+
+        return document
 
     def overview(self) -> pd.DataFrame:
         rows = []
@@ -346,5 +431,13 @@ class DocumentStore:
         df = pd.DataFrame(rows)
         return df
 
-    def as_dict(self) -> dict:
-
+    def as_dict(self, include_embeddings: bool = True) -> dict:
+        result = {}
+        for doc_id, document in self.documents.items():
+            doc_dict = document.asdict()
+            if not include_embeddings and "embeddings" in (doc_dict.get("metadata") or {}):
+                doc_dict["metadata"] = {
+                    k: v for k, v in doc_dict["metadata"].items() if k != "embeddings"
+                }
+            result[doc_id] = doc_dict
+        return result
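Note: `save_to_zip` and `add_documents_from_zip` give the store a full persistence round trip: documents go into documents.json with embeddings stripped, the vector store serializes itself next to it, and both are zipped together. A hedged usage sketch (file paths and population are illustrative; SimpleVectorStore is used because it implements save_to_directory/load_from_directory, which the base class leaves as NotImplementedError):

store = DocumentStore(vector_store=SimpleVectorStore())
# ... add processed documents whose metadata carries "embeddings" ...

# the ".zip" suffix dispatches to save_to_zip: documents.json (without embeddings)
# plus the vector store files are staged in a temp dir and zipped together
store.save_to_file("processed_documents.zip")

# a fresh store rebuilds documents and embeddings from the archive;
# load_from_directory restores the vector store part
restored = DocumentStore(vector_store=SimpleVectorStore())
restored.add_documents_from_file("processed_documents.zip")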
requirements.txt
CHANGED
@@ -5,3 +5,4 @@ beautifulsoup4==4.12.3
 datasets==2.14.4
 # numpy 2.0.0 breaks the code
 numpy==1.25.2
+qdrant-client==1.9.1
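The new pin brings in the Qdrant client that backs `QdrantVectorStore` in vector_store.py below. A small sketch of the client calls the store relies on (in-memory local mode, a cosine collection, `upsert`, and `recommend`); the vector size and point IDs here are illustrative:

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams

client = QdrantClient(location=":memory:")  # no server needed in local mode
client.create_collection(
    collection_name="ADUs",
    vectors_config=VectorParams(size=3, distance=Distance.COSINE),
)
client.upsert(
    collection_name="ADUs",
    points=[
        PointStruct(id=0, vector=[0.1, 0.2, 0.3], payload={"doc_id": "doc-1"}),
        PointStruct(id=1, vector=[0.1, 0.2, 0.31], payload={"doc_id": "doc-2"}),
    ],
)
# recommend() returns scored points similar to the positive example(s)
hits = client.recommend(collection_name="ADUs", positive=[0], limit=1)
print([(hit.id, hit.payload, hit.score) for hit in hits])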
vector_store.py
CHANGED
@@ -1,22 +1,48 @@
 import abc
-
+import json
+import os
+from typing import Any, Generic, List, Optional, Tuple, TypeVar
 
-
+import numpy as np
+from qdrant_client import QdrantClient
+from qdrant_client.models import Distance, PointStruct, VectorParams
+
+T = TypeVar("T", bound=dict[str, Any])
 E = TypeVar("E")
 
 
 class VectorStore(Generic[T, E], abc.ABC):
     @abc.abstractmethod
-    def
-        """Save an embedding for a given ID."""
+    def _add(self, embedding: E, payload: T, emb_id: str) -> None:
+        """Save an embedding with payload for a given ID."""
         pass
 
     @abc.abstractmethod
-    def
-
+    def _get(self, emb_id: str) -> Optional[E]:
+        """Get the embedding for a given ID."""
+        pass
+
+    def _get_emb_id(self, emb_id: Optional[str] = None, payload: Optional[T] = None) -> str:
+        if emb_id is None:
+            if payload is None:
+                raise ValueError("Either emb_id or payload must be provided.")
+            emb_id = json.dumps(payload, sort_keys=True)
+        return emb_id
+
+    def add(self, embedding: E, payload: T, emb_id: Optional[str] = None) -> None:
+        if emb_id is None:
+            emb_id = json.dumps(payload, sort_keys=True)
+        self._add(embedding=embedding, payload=payload, emb_id=emb_id)
+
+    def get(self, emb_id: Optional[str] = None, payload: Optional[T] = None) -> Optional[E]:
+        return self._get(emb_id=self._get_emb_id(emb_id=emb_id, payload=payload))
+
+    @abc.abstractmethod
+    def _retrieve_similar(
+        self, ref_id: str, top_k: Optional[int] = None, min_similarity: Optional[float] = None
     ) -> List[Tuple[T, float]]:
-        """Retrieve IDs and the respective similarity scores with respect to the
-        Note that this requires the reference entry to be present in the store.
+        """Retrieve IDs, payloads and the respective similarity scores with respect to the
+        reference entry. Note that this requires the reference entry to be present in the store.
 
         Args:
             ref_id: The ID of the reference entry.
@@ -30,10 +56,28 @@ class VectorStore(Generic[T, E], abc.ABC):
         """
         pass
 
+    def retrieve_similar(
+        self, ref_id: Optional[str] = None, ref_payload: Optional[T] = None, **kwargs
+    ) -> List[Tuple[T, float]]:
+        return self._retrieve_similar(
+            ref_id=self._get_emb_id(emb_id=ref_id, payload=ref_payload), **kwargs
+        )
+
     @abc.abstractmethod
     def __len__(self):
         pass
 
+    def save_to_directory(self, directory: str) -> None:
+        """Save the vector store to a directory."""
+        raise NotImplementedError
+
+    def load_from_directory(self, directory: str, replace: bool = False) -> None:
+        """Load the vector store from a directory.
+
+        If `replace` is True, the current content of the store will be replaced.
+        """
+        raise NotImplementedError
+
 
 def vector_norm(vector: List[float]) -> float:
     return sum(x**2 for x in vector) ** 0.5
@@ -44,34 +88,43 @@ def cosine_similarity(a: List[float], b: List[float]) -> float:
 
 
 class SimpleVectorStore(VectorStore[T, List[float]]):
+
+    INDEX_FILE = "vectors_index.json"
+    EMBEDDINGS_FILE = "vectors_data.npy"
+    PAYLOADS_FILE = "vectors_payloads.json"
+
     def __init__(self):
-        self.vectors: dict[
+        self.vectors: dict[str, List[float]] = {}
+        self.payloads: dict[str, T] = {}
         self._cache = {}
         self._sim = cosine_similarity
 
-    def
+    def _add(self, embedding: E, payload: T, emb_id: str) -> None:
         self.vectors[emb_id] = embedding
+        self.payloads[emb_id] = payload
 
-    def
+    def _get(self, emb_id: str) -> Optional[E]:
         return self.vectors.get(emb_id)
 
-    def delete(self, emb_id:
+    def delete(self, emb_id: str) -> None:
         if emb_id in self.vectors:
             del self.vectors[emb_id]
+            del self.payloads[emb_id]
         # remove from cache
         self._cache = {k: v for k, v in self._cache.items() if emb_id not in k}
 
     def clear(self) -> None:
         self.vectors.clear()
         self._cache.clear()
+        self.payloads.clear()
 
     def __len__(self):
         return len(self.vectors)
 
-    def
-        self, ref_id:
-    ) -> List[Tuple[T, float]]:
-        ref_embedding = self.get(ref_id)
+    def _retrieve_similar(
+        self, ref_id: str, top_k: Optional[int] = None, min_similarity: Optional[float] = None
+    ) -> List[Tuple[str, T, float]]:
+        ref_embedding = self.get(emb_id=ref_id)
         if ref_embedding is None:
             raise ValueError(f"Reference embedding '{ref_id}' not found.")
 
@@ -93,4 +146,99 @@ class SimpleVectorStore(VectorStore[T, List[float]]):
         if top_k is not None:
             similar_entries = similar_entries[:top_k]
 
-        return similar_entries
+        return [(emb_id, self.payloads[emb_id], sim) for emb_id, sim in similar_entries]
+
+    def save_to_directory(self, directory: str) -> None:
+        os.makedirs(directory, exist_ok=True)
+        indices = list(self.vectors.keys())
+        with open(os.path.join(directory, self.INDEX_FILE), "w") as f:
+            json.dump(indices, f)
+        embeddings_np = np.array(list(self.vectors.values()))
+        np.save(os.path.join(directory, self.EMBEDDINGS_FILE), embeddings_np)
+        payloads = [self.payloads[idx] for idx in indices]
+        with open(os.path.join(directory, self.PAYLOADS_FILE), "w") as f:
+            json.dump(payloads, f)
+
+    def load_from_directory(self, directory: str, replace: bool = False) -> None:
+        if replace:
+            self.clear()
+        with open(os.path.join(directory, self.INDEX_FILE), "r") as f:
+            index = json.load(f)
+        embeddings_np = np.load(os.path.join(directory, self.EMBEDDINGS_FILE))
+        with open(os.path.join(directory, self.PAYLOADS_FILE), "r") as f:
+            payloads = json.load(f)
+        for emb_id, emb, payload in zip(index, embeddings_np, payloads):
+            self.vectors[emb_id] = emb.tolist()
+            self.payloads[emb_id] = payload
+
+
+class QdrantVectorStore(VectorStore[T, List[float]]):
+
+    COLLECTION_NAME = "ADUs"
+    MAX_LIMIT = 100
+
+    def __init__(
+        self,
+        location: str = ":memory:",
+        vector_size: int = 768,
+        distance: Distance = Distance.COSINE,
+    ):
+        self.client = QdrantClient(location=location)
+        self.id2idx = {}
+        self.idx2id = {}
+        self.client.create_collection(
+            collection_name=self.COLLECTION_NAME,
+            vectors_config=VectorParams(size=vector_size, distance=distance),
+        )
+
+    def __len__(self):
+        return self.client.get_collection(collection_name=self.COLLECTION_NAME).points_count
+
+    def _add(self, emb_id: str, payload: T, embedding: List[float]) -> None:
+
+        # we use the length of the id2idx dict as the index,
+        # because we assume that, even when we delete an entry from
+        # the store, we do not delete it from the index
+        _id = len(self.id2idx)
+        self.client.upsert(
+            collection_name=self.COLLECTION_NAME,
+            points=[PointStruct(id=_id, vector=embedding, payload=payload)],
+        )
+        self.id2idx[emb_id] = _id
+        self.idx2id[_id] = emb_id
+
+    def _get(self, emb_id: str) -> Optional[List[float]]:
+        points = self.client.retrieve(
+            collection_name=self.COLLECTION_NAME,
+            ids=[self.id2idx[emb_id]],
+            with_vectors=True,
+        )
+        if len(points) == 0:
+            return None
+        elif len(points) == 1:
+            return points[0].vector
+        else:
+            raise ValueError(f"Multiple points found for ID '{emb_id}'.")
+
+    def _retrieve_similar(
+        self, ref_id: str, top_k: Optional[int] = None, min_similarity: Optional[float] = None
+    ) -> List[Tuple[str, T, float]]:
+        similar_entries = self.client.recommend(
+            collection_name=self.COLLECTION_NAME,
+            positive=[self.id2idx[ref_id]],
+            limit=top_k or self.MAX_LIMIT,
+            score_threshold=min_similarity,
+        )
+        return [(self.idx2id[entry.id], entry.payload, entry.score) for entry in similar_entries]
+
+    def clear(self) -> None:
+        vectors_config = self.client.get_collection(
+            collection_name=self.COLLECTION_NAME
+        ).vectors_config
+        self.client.delete_collection(collection_name=self.COLLECTION_NAME)
+        self.client.create_collection(
+            collection_name=self.COLLECTION_NAME,
+            vectors_config=vectors_config,
+        )
+        self.id2idx.clear()
+        self.idx2id.clear()
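Note on the new addressing scheme: when no explicit `emb_id` is passed, entries are keyed by a canonical JSON dump of their payload (`json.dumps(payload, sort_keys=True)`), so the same (doc_id, annotation_id) pair always resolves to the same stored vector. A minimal sketch with `SimpleVectorStore` (the embedding values are illustrative):

store = SimpleVectorStore()
payload = {"doc_id": "doc-1", "annotation_id": "adu-0"}
store.add(embedding=[0.1, 0.2, 0.3], payload=payload)

# lookup by payload alone; the emb_id is derived internally
assert store.get(payload=payload) == [0.1, 0.2, 0.3]

# similarity search returns (emb_id, payload, score) triples; the reference
# entry must already be present in the store
store.add(embedding=[0.1, 0.2, 0.31], payload={"doc_id": "doc-2", "annotation_id": "adu-0"})
for emb_id, hit, score in store.retrieve_similar(ref_payload=payload, min_similarity=0.9):
    print(hit["doc_id"], round(score, 3))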