Spaces:

ArneBinder
/

sam-pointer-bart-base-v0.3

Running

App Files Files Community

ArneBinder committed 28 days ago

Commit

86277c0

•

1 Parent(s): 04ce9af

Upload 9 files

Browse files

Files changed (6) hide show

annotation_utils.py +10 -0
app.py +8 -49
document_store.py +218 -0
model_utils.py +173 -0
rendering_utils.py +2 -10
vector_store.py +18 -3

annotation_utils.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from pytorch_ie.annotations import LabeledSpan
+def labeled_span_to_id(span: LabeledSpan) -> str:
+    """Return the string identifier ``span-<start>-<end>-<label>`` for the given span."""
+    return "span-{}-{}-{}".format(span.start, span.end, span.label)
+def labeled_span_from_id(span_id: str) -> LabeledSpan:
+    """Rebuild a LabeledSpan from an identifier of the form ``span-<start>-<end>-<label>``.
+    Parameters:
+        span_id: The identifier as produced by labeled_span_to_id.
+    Returns:
+        A LabeledSpan with the start, end and label encoded in the identifier.
+    Raises:
+        IndexError: If the identifier has fewer than four "-" separated fields.
+        ValueError: If the start or end field is not an integer.
+    """
+    # Split on the first three separators only, so a label that itself contains "-" is kept
+    # intact instead of being cut off at its first hyphen.
+    parts = span_id.split("-", 3)
+    return LabeledSpan(int(parts[1]), int(parts[2]), parts[3])

app.py CHANGED Viewed

@@ -7,13 +7,13 @@ from typing import List, Optional, Tuple
 import gradio as gr
 import pandas as pd
-from backend import DocumentStore, create_and_annotate_document, get_annotation_from_document
 from pie_modules.taskmodules import PointerNetworkTaskModuleForEnd2EndRE
 from pytorch_ie import Pipeline
-from pytorch_ie.auto import AutoPipeline
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 from rendering_utils import render_displacy, render_pretty_table
-from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizer
 logger = logging.getLogger(__name__)
@@ -66,6 +66,7 @@ def process_uploaded_files(
     document_store: DocumentStore,
 ) -> pd.DataFrame:
     try:
         for file_name in file_names:
             if file_name.lower().endswith(".txt"):
                 # read the file content
@@ -73,10 +74,10 @@ def process_uploaded_files(
                     text = f.read()
                 base_file_name = os.path.basename(file_name)
                 gr.Info(f"Processing file '{base_file_name}' ...")
-                document = create_and_annotate_document(text, base_file_name, models)
-                document_store.add_document(document)
             else:
                 raise gr.Error(f"Unsupported file format: {file_name}")
     except Exception as e:
         raise gr.Error(f"Failed to process uploaded files: {e}")
@@ -91,43 +92,6 @@ def close_accordion():
     return gr.Accordion(open=False)
-def load_argumentation_model(model_name: str, revision: Optional[str] = None) -> Pipeline:
-    try:
-        model = AutoPipeline.from_pretrained(
-            model_name,
-            device=-1,
-            num_workers=0,
-            taskmodule_kwargs=dict(revision=revision),
-            model_kwargs=dict(revision=revision),
-        )
-    except Exception as e:
-        raise gr.Error(f"Failed to load argumentation model: {e}")
-    gr.Info(f"Loaded argumentation model: model_name={model_name}, revision={revision})")
-    return model
-def load_embedding_model(model_name: str) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
-    try:
-        embedding_model = AutoModel.from_pretrained(model_name)
-        embedding_tokenizer = AutoTokenizer.from_pretrained(model_name)
-    except Exception as e:
-        raise gr.Error(f"Failed to load embedding model: {e}")
-    gr.Info(f"Loaded embedding model: model_name={model_name})")
-    return embedding_model, embedding_tokenizer
-def load_models(
-    model_name: str, revision: Optional[str] = None, embedding_model_name: Optional[str] = None
-) -> Tuple[Pipeline, Optional[PreTrainedModel], Optional[PreTrainedTokenizer]]:
-    argumentation_model = load_argumentation_model(model_name, revision)
-    embedding_model = None
-    embedding_tokenizer = None
-    if embedding_model_name is not None and embedding_model_name.strip():
-        embedding_model, embedding_tokenizer = load_embedding_model(embedding_model_name)
-    return argumentation_model, embedding_model, embedding_tokenizer
 def select_processed_document(
     evt: gr.SelectData,
     processed_documents_df: pd.DataFrame,
@@ -135,7 +99,6 @@ def select_processed_document(
 ) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
     row_idx, col_idx = evt.index
     doc_id = processed_documents_df.iloc[row_idx]["doc_id"]
-    gr.Info(f"Select document: {doc_id}")
     doc = document_store.get_document(doc_id)
     return doc
@@ -163,8 +126,7 @@ def download_processed_documents(
     file_name: str = "processed_documents.json",
 ) -> str:
     file_path = os.path.join(tempfile.gettempdir(), file_name)
-    with open(file_path, "w", encoding="utf-8") as f:
-        json.dump(document_store.as_dict(), f, indent=2)
     return file_path
@@ -172,10 +134,7 @@ def upload_processed_documents(
     file_name: str,
     document_store: DocumentStore,
 ) -> pd.DataFrame:
-    with open(file_name, "r", encoding="utf-8") as f:
-        processed_documents_json = json.load(f)
-    for _, document_json in processed_documents_json.items():
-        document_store.add_document_from_dict(document_dict=document_json)
     return document_store.overview()

 import gradio as gr
 import pandas as pd
+from document_store import DocumentStore, get_annotation_from_document
+from model_utils import create_and_annotate_document, load_models
 from pie_modules.taskmodules import PointerNetworkTaskModuleForEnd2EndRE
 from pytorch_ie import Pipeline
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 from rendering_utils import render_displacy, render_pretty_table
+from transformers import PreTrainedModel, PreTrainedTokenizer
 logger = logging.getLogger(__name__)
     document_store: DocumentStore,
 ) -> pd.DataFrame:
     try:
+        new_documents = []
         for file_name in file_names:
             if file_name.lower().endswith(".txt"):
                 # read the file content
                     text = f.read()
                 base_file_name = os.path.basename(file_name)
                 gr.Info(f"Processing file '{base_file_name}' ...")
+                new_documents.append(create_and_annotate_document(text, base_file_name, models))
             else:
                 raise gr.Error(f"Unsupported file format: {file_name}")
+        document_store.add_documents(new_documents)
     except Exception as e:
         raise gr.Error(f"Failed to process uploaded files: {e}")
     return gr.Accordion(open=False)
 def select_processed_document(
     evt: gr.SelectData,
     processed_documents_df: pd.DataFrame,
 ) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
     row_idx, col_idx = evt.index
     doc_id = processed_documents_df.iloc[row_idx]["doc_id"]
     doc = document_store.get_document(doc_id)
     return doc
     file_name: str = "processed_documents.json",
 ) -> str:
     file_path = os.path.join(tempfile.gettempdir(), file_name)
+    document_store.save_to_json(file_path, indent=2)
     return file_path
     file_name: str,
     document_store: DocumentStore,
 ) -> pd.DataFrame:
+    document_store.add_from_json(file_name)
     return document_store.overview()

document_store.py ADDED Viewed

	@@ -0,0 +1,218 @@

+import json
+import logging
+from collections import defaultdict
+from typing import Dict, List, Optional, Tuple
+import gradio as gr
+import pandas as pd
+from annotation_utils import labeled_span_to_id
+from pytorch_ie.annotations import LabeledSpan
+from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
+from vector_store import SimpleVectorStore, VectorStore
+logger = logging.getLogger(__name__)
+def get_annotation_from_document(
+    document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+    annotation_id: str,
+    annotation_layer: str,
+) -> LabeledSpan:
+    """Look up a predicted annotation of a document by its string identifier.
+    Args:
+        document: The document to search.
+        annotation_id: The identifier as produced by labeled_span_to_id.
+        annotation_layer: The name of the annotation layer whose predictions are searched.
+    Returns:
+        The matching annotation.
+    Raises:
+        gr.Error: If no prediction in the layer has the requested identifier.
+    """
+    # only the predictions of the layer are considered
+    candidates = {}
+    for candidate in document[annotation_layer].predictions:
+        candidates[labeled_span_to_id(candidate)] = candidate
+    if annotation_id in candidates:
+        return candidates[annotation_id]
+    raise gr.Error(
+        f"annotation '{annotation_id}' not found in document '{document.id}'. Available "
+        f"annotations: {candidates}"
+    )
+class DocumentStore:
+    """In-memory index of annotated documents plus a vector store for their ADU embeddings.
+    Documents are kept in a dict keyed by document id. Embeddings found in
+    ``document.metadata["embeddings"]`` are mirrored into the vector store under the key
+    ``(doc_id, adu_id)``.
+    """
+    DOCUMENT_TYPE = TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
+    def __init__(self, vector_store: Optional[VectorStore[Tuple[str, str], List[float]]] = None):
+        """Create an empty document store.
+        Args:
+            vector_store: The vector store to use. If None, a new SimpleVectorStore is created.
+        """
+        # The annotated documents. As key, we use the document id. All documents keep the embeddings
+        # of the ADUs in the metadata.
+        self.documents: Dict[
+            str, TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
+        ] = {}
+        # The vector store to efficiently retrieve similar ADUs. Can be constructed from the
+        # documents.
+        # Compare against None explicitly: a store object that evaluates as falsy (for example
+        # because it is still empty) must not be silently replaced by a new one.
+        self.vector_store: VectorStore[Tuple[str, str], List[float]] = (
+            vector_store if vector_store is not None else SimpleVectorStore()
+        )
+    def get_annotation(
+        self,
+        doc_id: str,
+        annotation_id: str,
+        annotation_layer: str,
+    ) -> LabeledSpan:
+        """Return the predicted annotation with the given id from a stored document.
+        Raises:
+            gr.Error: If the document is not in the index or the annotation is not found in it.
+        """
+        document = self.documents.get(doc_id)
+        if document is None:
+            raise gr.Error(
+                f"Document '{doc_id}' not found in index. Available documents: {list(self.documents)}"
+            )
+        return get_annotation_from_document(document, annotation_id, annotation_layer)
+    def get_similar_adus_df(
+        self,
+        ref_annotation_id: str,
+        ref_document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+        min_similarity: float,
+        top_k: int,
+    ) -> pd.DataFrame:
+        """Collect the entries of the vector store that are similar to a reference ADU.
+        Args:
+            ref_annotation_id: The id of the reference annotation.
+            ref_document: The document that contains the reference annotation.
+            min_similarity: Passed to the vector store as the minimum similarity.
+            top_k: Passed to the vector store as the maximum number of entries.
+        Returns:
+            A DataFrame with the columns "doc_id", "adu_id", "sim_score" and "text", one row per
+            entry returned by the vector store.
+        """
+        similar_entries = self.vector_store.retrieve_similar(
+            ref_id=(ref_document.id, ref_annotation_id),
+            min_similarity=min_similarity,
+            top_k=top_k,
+        )
+        # unpack the tuple (doc_id, annotation_id) to separate columns
+        # and add the similarity score and the text of the annotation
+        rows = [
+            (
+                doc_id,
+                annotation_id,
+                score,
+                str(
+                    self.get_annotation(
+                        doc_id=doc_id,
+                        annotation_id=annotation_id,
+                        annotation_layer="labeled_spans",
+                    )
+                ),
+            )
+            for (doc_id, annotation_id), score in similar_entries
+        ]
+        return pd.DataFrame(rows, columns=["doc_id", "adu_id", "sim_score", "text"])
+    def get_relevant_adus_df(
+        self,
+        ref_annotation_id: str,
+        ref_document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+        min_similarity: float,
+        top_k: int,
+        relation_types: List[str],
+        columns: List[str],
+    ) -> pd.DataFrame:
+        """Collect ADUs from other documents that are related to ADUs similar to the reference.
+        For every similar entry from a document other than ref_document, each predicted relation
+        with a label in relation_types whose head is the similar ADU yields one row holding the
+        tail of that relation.
+        Args:
+            ref_annotation_id: The id of the reference annotation.
+            ref_document: The document that contains the reference annotation.
+            min_similarity: Passed to the vector store as the minimum similarity.
+            top_k: Passed to the vector store as the maximum number of entries.
+            relation_types: The relation labels to consider. Other relations are skipped.
+            columns: The columns (and their order) of the resulting DataFrame.
+        Returns:
+            A DataFrame built from records with the keys "doc_id", "reference_adu", "sim_score",
+            "rel_score", "relation" and "adu", restricted to the given columns.
+        """
+        similar_entries = self.vector_store.retrieve_similar(
+            ref_id=(ref_document.id, ref_annotation_id),
+            min_similarity=min_similarity,
+            top_k=top_k,
+        )
+        result = []
+        for (doc_id, annotation_id), score in similar_entries:
+            # skip entries from the same document
+            if doc_id == ref_document.id:
+                continue
+            document = self.documents[doc_id]
+            # index the relevant relations by their head (only outgoing relations are used below)
+            head2rels = defaultdict(list)
+            for rel in document.binary_relations.predictions:
+                # skip non-argumentative relations
+                if rel.label not in relation_types:
+                    continue
+                head2rels[rel.head].append(rel)
+            id2annotation = {
+                labeled_span_to_id(annotation): annotation
+                for annotation in document.labeled_spans.predictions
+            }
+            annotation = id2annotation.get(annotation_id)
+            # note: we do not need to check if the annotation is different from the reference annotation,
+            # because they come from different documents and we already skip entries from the same document
+            for rel in head2rels.get(annotation, []):
+                result.append(
+                    {
+                        "doc_id": doc_id,
+                        "reference_adu": str(annotation),
+                        "sim_score": score,
+                        "rel_score": rel.score,
+                        "relation": rel.label,
+                        "adu": str(rel.tail),
+                    }
+                )
+        # define column order
+        df = pd.DataFrame(result, columns=columns)
+        return df
+    def add_document(
+        self, document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
+    ) -> None:
+        """Add a document to the index and its embeddings to the vector store.
+        An existing document with the same id is overwritten (a warning is shown). Documents
+        without embeddings in their metadata are added to the index only.
+        Raises:
+            gr.Error: If adding the document fails.
+        """
+        try:
+            if document.id in self.documents:
+                gr.Warning(f"Document '{document.id}' already in index. Overwriting.")
+            # save the processed document to the index
+            self.documents[document.id] = document
+            # save the embeddings to the vector store; the entry is missing when the document was
+            # annotated without an embedding model, which must not make the upload fail
+            embeddings = document.metadata.get("embeddings") or {}
+            for adu_id, embedding in embeddings.items():
+                self.vector_store.save((document.id, adu_id), embedding)
+        except Exception as e:
+            raise gr.Error(f"Failed to add document {document.id} to index: {e}") from e
+    def add_document_from_dict(self, document_dict: dict) -> None:
+        """Deserialize a document from its dict representation and add it to the store."""
+        document = self.DOCUMENT_TYPE.fromdict(document_dict)
+        # metadata is not automatically deserialized, so we need to set it manually
+        document.metadata = document_dict["metadata"]
+        self.add_document(document)
+    def add_documents(
+        self, documents: List[TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions]
+    ) -> None:
+        """Add several documents and report how many new entries the index gained."""
+        size_before = len(self.documents)
+        for document in documents:
+            self.add_document(document)
+        size_after = len(self.documents)
+        gr.Info(
+            f"Added {size_after - size_before} documents to the index ({size_after} documents in total)."
+        )
+    def add_from_json(self, file_path: str) -> None:
+        """Load documents from a JSON file as written by save_to_json and add them to the store."""
+        size_before = len(self.documents)
+        with open(file_path, "r", encoding="utf-8") as f:
+            processed_documents_json = json.load(f)
+        # the keys of the mapping are not needed, each document dict is added as is
+        for document_json in processed_documents_json.values():
+            self.add_document_from_dict(document_dict=document_json)
+        size_after = len(self.documents)
+        gr.Info(
+            f"Added {size_after - size_before} documents to the index ({size_after} documents in total)."
+        )
+    def save_to_json(self, file_path: str, **kwargs) -> None:
+        """Write all documents to a JSON file. Additional kwargs are passed to json.dump."""
+        with open(file_path, "w", encoding="utf-8") as f:
+            json.dump(self.as_dict(), f, **kwargs)
+    def get_document(
+        self, doc_id: str
+    ) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
+        """Return the document with the given id. Raises KeyError if it is not in the index."""
+        return self.documents[doc_id]
+    def overview(self) -> pd.DataFrame:
+        """Return a DataFrame with the id and the number of predicted ADUs and relations per document."""
+        df = pd.DataFrame(
+            [
+                (
+                    doc_id,
+                    len(document.labeled_spans.predictions),
+                    len(document.binary_relations.predictions),
+                )
+                for doc_id, document in self.documents.items()
+            ],
+            columns=["doc_id", "num_adus", "num_relations"],
+        )
+        return df
+    def as_dict(self) -> dict:
+        """Return a mapping from document id to the dict representation of the document."""
+        return {doc_id: document.asdict() for doc_id, document in self.documents.items()}

model_utils.py ADDED Viewed

	@@ -0,0 +1,173 @@

+import logging
+from typing import Dict, List, Optional, Tuple
+import gradio as gr
+from annotation_utils import labeled_span_to_id
+from pie_modules.document.processing import tokenize_document
+from pie_modules.documents import TokenDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
+from pytorch_ie import Pipeline
+from pytorch_ie.annotations import LabeledSpan
+from pytorch_ie.auto import AutoPipeline
+from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
+from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizer
+logger = logging.getLogger(__name__)
+def _embed_text_annotations(
+    document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+    model: PreTrainedModel,
+    tokenizer: PreTrainedTokenizer,
+    text_layer_name: str,
+) -> Dict[LabeledSpan, List[float]]:
+    """Compute one embedding per annotation of a text layer by max pooling token embeddings.
+    The document is copied, tokenized into windows (max_length=512, stride=64, with overflowing
+    tokens), passed through the model, and for every non-empty token span the last hidden states
+    of its tokens are max pooled into a single vector.
+    Parameters:
+        document: The document whose annotations should be embedded. It is not modified.
+        model: The embedding model. It is called with the tokenizer output.
+        tokenizer: The tokenizer that belongs to the model.
+        text_layer_name: The name of the annotation layer to embed.
+    Returns:
+        A mapping from the text annotation to its embedding.
+        NOTE(review): the values stored here are the pooled model outputs, not plain lists as the
+        return annotation suggests (the caller in _annotate converts them via .detach().tolist()).
+    """
+    # to not modify the original document
+    document = document.copy()
+    # tokenize_document does not yet consider predictions, so we need to add them manually
+    # NOTE(review): this relies on predictions.clear() returning the removed annotations -
+    # confirm for the pinned pytorch_ie version
+    document[text_layer_name].extend(document[text_layer_name].predictions.clear())
+    # filled by tokenize_document; indexed per tokenized document and layer name below
+    added_annotations = []
+    tokenizer_kwargs = {
+        "max_length": 512,
+        "stride": 64,
+        "truncation": True,
+        "return_overflowing_tokens": True,
+    }
+    tokenized_documents = tokenize_document(
+        document,
+        tokenizer=tokenizer,
+        result_document_type=TokenDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+        partition_layer="labeled_partitions",
+        added_annotations=added_annotations,
+        strict_span_conversion=False,
+        **tokenizer_kwargs,
+    )
+    # just tokenize again to get tensors in the correct format for the model
+    model_inputs = tokenizer(document.text, return_tensors="pt", **tokenizer_kwargs)
+    # this is added when using return_overflowing_tokens=True, but the model does not accept it
+    model_inputs.pop("overflow_to_sample_mapping", None)
+    # both tokenization passes must yield the same number of windows, because the model output
+    # and the tokenized documents are indexed with the same batch_idx below
+    assert len(model_inputs.encodings) == len(tokenized_documents)
+    model_output = model(**model_inputs)
+    # get embeddings for all text annotations
+    embeddings = {}
+    for batch_idx in range(len(model_output.last_hidden_state)):
+        text2tok_ann = added_annotations[batch_idx][text_layer_name]
+        # invert the mapping to get from the token based annotation back to the text annotation
+        tok2text_ann = {v: k for k, v in text2tok_ann.items()}
+        # NOTE(review): the layer is hard-coded to labeled_spans here, while the mapping above uses
+        # text_layer_name - presumably both are always "labeled_spans"; verify against callers
+        for tok_ann in tokenized_documents[batch_idx].labeled_spans:
+            # skip "empty" annotations
+            if tok_ann.start == tok_ann.end:
+                continue
+            # use the max pooling strategy to get a single embedding for the annotation text
+            embedding = model_output.last_hidden_state[batch_idx, tok_ann.start : tok_ann.end].max(
+                dim=0
+            )[0]
+            text_ann = tok2text_ann[tok_ann]
+            # the same text annotation was already embedded from a previous window: last one wins
+            if text_ann in embeddings:
+                logger.warning(
+                    f"Overwriting embedding for annotation '{text_ann}' (do you use striding?)"
+                )
+            embeddings[text_ann] = embedding
+    return embeddings
+def _annotate(
+    document: TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions,
+    pipeline: Pipeline,
+    embedding_model: Optional[PreTrainedModel] = None,
+    embedding_tokenizer: Optional[PreTrainedTokenizer] = None,
+) -> None:
+    """Run the prediction pipeline on the document and, if possible, attach ADU embeddings.
+    The embeddings are stored in ``document.metadata["embeddings"]``, keyed by the annotation id.
+    If the embedding model or the tokenizer is missing, only a warning is shown.
+    """
+    # the pipeline adds its predictions to the document in place
+    pipeline(document)
+    if embedding_model is None or embedding_tokenizer is None:
+        gr.Warning(
+            "No embedding model provided. Skipping embedding extraction. You can load an embedding "
+            "model in the 'Model Configuration' section."
+        )
+        return
+    span2vector = _embed_text_annotations(
+        document=document,
+        model=embedding_model,
+        tokenizer=embedding_tokenizer,
+        text_layer_name="labeled_spans",
+    )
+    # JSON keys must be strings, so the annotations are replaced by their ids
+    serializable_embeddings = {}
+    for span, vector in span2vector.items():
+        serializable_embeddings[labeled_span_to_id(span)] = vector.detach().tolist()
+    document.metadata["embeddings"] = serializable_embeddings
+def create_and_annotate_document(
+    text: str,
+    doc_id: str,
+    models: Tuple[Pipeline, Optional[PreTrainedModel], Optional[PreTrainedTokenizer]],
+) -> TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions:
+    """Create a TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions from the provided
+    text and annotate it. The document is only returned, it is not added to any index.
+    Parameters:
+        text: The text to process.
+        doc_id: The ID of the document.
+        models: A tuple containing the prediction pipeline and the embedding model and tokenizer.
+    Returns:
+        The processed document.
+    Raises:
+        gr.Error: If creating or annotating the document fails.
+    """
+    try:
+        document = TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions(
+            id=doc_id, text=text, metadata={}
+        )
+        # add single partition from the whole text (the model only considers text in partitions)
+        document.labeled_partitions.append(LabeledSpan(start=0, end=len(text), label="text"))
+        # annotate the document
+        _annotate(
+            document=document,
+            pipeline=models[0],
+            embedding_model=models[1],
+            embedding_tokenizer=models[2],
+        )
+        return document
+    except Exception as e:
+        # keep the original exception as cause so the traceback stays available in the logs
+        raise gr.Error(f"Failed to process text: {e}") from e
+def load_argumentation_model(model_name: str, revision: Optional[str] = None) -> Pipeline:
+    """Load the argumentation pipeline via AutoPipeline.from_pretrained.
+    Parameters:
+        model_name: The name or path of the model to load.
+        revision: The revision that is passed to both the taskmodule and the model.
+    Returns:
+        The loaded pipeline.
+    Raises:
+        gr.Error: If loading fails.
+    """
+    try:
+        model = AutoPipeline.from_pretrained(
+            model_name,
+            device=-1,
+            num_workers=0,
+            taskmodule_kwargs=dict(revision=revision),
+            model_kwargs=dict(revision=revision),
+        )
+    except Exception as e:
+        # keep the original exception as cause so the traceback stays available in the logs
+        raise gr.Error(f"Failed to load argumentation model: {e}") from e
+    # the message previously ended with a stray, unbalanced ")"
+    gr.Info(f"Loaded argumentation model: model_name={model_name}, revision={revision}")
+    return model
+def load_embedding_model(model_name: str) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
+    """Load an embedding model and its tokenizer via AutoModel / AutoTokenizer.
+    Parameters:
+        model_name: The name or path of the model to load.
+    Returns:
+        A tuple of the model and the tokenizer.
+    Raises:
+        gr.Error: If loading the model or the tokenizer fails.
+    """
+    try:
+        embedding_model = AutoModel.from_pretrained(model_name)
+        embedding_tokenizer = AutoTokenizer.from_pretrained(model_name)
+    except Exception as e:
+        # keep the original exception as cause so the traceback stays available in the logs
+        raise gr.Error(f"Failed to load embedding model: {e}") from e
+    # the message previously ended with a stray, unbalanced ")"
+    gr.Info(f"Loaded embedding model: model_name={model_name}")
+    return embedding_model, embedding_tokenizer
+def load_models(
+    model_name: str, revision: Optional[str] = None, embedding_model_name: Optional[str] = None
+) -> Tuple[Pipeline, Optional[PreTrainedModel], Optional[PreTrainedTokenizer]]:
+    """Load the argumentation pipeline and, if a name is given, the embedding model and tokenizer.
+    Returns:
+        A tuple of the pipeline, the embedding model and the embedding tokenizer. The latter two
+        are None if embedding_model_name is None or contains only whitespace.
+    """
+    pipeline = load_argumentation_model(model_name, revision)
+    # without a usable embedding model name, only the pipeline is loaded
+    if embedding_model_name is None or not embedding_model_name.strip():
+        return pipeline, None, None
+    model, tokenizer = load_embedding_model(embedding_model_name)
+    return pipeline, model, tokenizer

rendering_utils.py CHANGED Viewed

@@ -2,7 +2,8 @@ import json
 from collections import defaultdict
 from typing import Dict, List, Optional, Union
-from pytorch_ie.annotations import BinaryRelation, LabeledSpan, Span
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 from rendering_utils_displacy import EntityRenderer
@@ -59,15 +60,6 @@ def render_displacy(
     return html
-def labeled_span_to_id(span: LabeledSpan) -> str:
-    return f"span-{span.start}-{span.end}-{span.label}"
-def labeled_span_from_id(span_id: str) -> LabeledSpan:
-    parts = span_id.split("-")
-    return LabeledSpan(int(parts[1]), int(parts[2]), parts[3])
 def inject_relation_data(
     html: str,
     sorted_entities,

 from collections import defaultdict
 from typing import Dict, List, Optional, Union
+from annotation_utils import labeled_span_to_id
+from pytorch_ie.annotations import BinaryRelation
 from pytorch_ie.documents import TextDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions
 from rendering_utils_displacy import EntityRenderer
     return html
 def inject_relation_data(
     html: str,
     sorted_entities,

vector_store.py CHANGED Viewed

@@ -2,17 +2,32 @@ import abc
 from typing import Generic, Hashable, List, Optional, Tuple, TypeVar
 T = TypeVar("T", bound=Hashable)
-class VectorStore(Generic[T], abc.ABC):
     @abc.abstractmethod
-    def save(self, emb_id: T, embedding: List[float]) -> None:
         pass
     @abc.abstractmethod
     def retrieve_similar(
         self, ref_id: T, top_k: Optional[int] = None, min_similarity: Optional[float] = None
     ) -> List[Tuple[T, float]]:
         pass
     @abc.abstractmethod
@@ -28,7 +43,7 @@ def cosine_similarity(a: List[float], b: List[float]) -> float:
     return sum(a * b for a, b in zip(a, b)) / (vector_norm(a) * vector_norm(b))
-class SimpleVectorStore(VectorStore[T]):
     def __init__(self):
         self.vectors: dict[T, List[float]] = {}
         self._cache = {}

 from typing import Generic, Hashable, List, Optional, Tuple, TypeVar
 T = TypeVar("T", bound=Hashable)
+E = TypeVar("E")
+class VectorStore(Generic[T, E], abc.ABC):
     @abc.abstractmethod
+    def save(self, emb_id: T, embedding: E) -> None:
+        """Save an embedding for a given ID."""
         pass
     @abc.abstractmethod
     def retrieve_similar(
         self, ref_id: T, top_k: Optional[int] = None, min_similarity: Optional[float] = None
     ) -> List[Tuple[T, float]]:
+        """Retrieve IDs and the respective similarity scores with respect to the reference entry.
+        Note that this requires the reference entry to be present in the store.
+        Args:
+            ref_id: The ID of the reference entry.
+            top_k: If provided, only the top-k most similar entries will be returned.
+            min_similarity: If provided, only entries with a similarity score greater or equal to
+                this value will be returned.
+        Returns:
+            A list of tuples consisting of the ID and the similarity score, sorted by similarity
+            score in descending order.
+        """
         pass
     @abc.abstractmethod
     return sum(a * b for a, b in zip(a, b)) / (vector_norm(a) * vector_norm(b))
+class SimpleVectorStore(VectorStore[T, List[float]]):
     def __init__(self):
         self.vectors: dict[T, List[float]] = {}
         self._cache = {}