|
from abc import ABC, abstractmethod |
|
from collections.abc import Sequence |
|
from typing import Any, Optional |
|
|
|
from pydantic import BaseModel, Field |
|
|
|
|
|
class Document(BaseModel):
    """Class for storing a piece of text and associated metadata.

    Attributes:
        page_content: The text of the document. Required; it has no default.
        vector: Optional list of floats stored alongside the text, ``None``
            by default (presumably the embedding of ``page_content`` --
            confirm against callers).
        metadata: Arbitrary metadata about the page content (e.g., source,
            relationships to other documents, etc.). May be ``None``; when
            omitted it defaults to an empty dict.
        provider: Optional provider label; defaults to ``"dify"``.
    """

    page_content: str

    vector: Optional[list[float]] = None

    # The description of this field lives in the class docstring: a bare
    # string literal placed between ``vector`` and ``metadata`` is a no-op
    # statement that doc tools would attribute to ``vector`` instead.
    # ``default_factory`` builds a fresh dict for every instance.
    metadata: Optional[dict] = Field(default_factory=dict)

    provider: Optional[str] = "dify"
|
|
|
|
|
class BaseDocumentTransformer(ABC):
    """Interface for components that map a sequence of Documents to another.

    A subclass has to override both the synchronous
    ``transform_documents`` and the asynchronous ``atransform_documents``;
    until both are overridden the class cannot be instantiated.

    Example:
        The helper names used in the snippet (``Embeddings``,
        ``cosine_similarity``, ``get_stateful_documents``,
        ``_get_embeddings_from_stateful_docs``,
        ``_filter_similar_embeddings``) are illustrative only; they are not
        defined in the visible part of this module.

        .. code-block:: python

            class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
                embeddings: Embeddings
                similarity_fn: Callable = cosine_similarity
                similarity_threshold: float = 0.95

                class Config:
                    arbitrary_types_allowed = True

                def transform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    stateful_documents = get_stateful_documents(documents)
                    embedded_documents = _get_embeddings_from_stateful_docs(
                        self.embeddings, stateful_documents
                    )
                    included_idxs = _filter_similar_embeddings(
                        embedded_documents, self.similarity_fn, self.similarity_threshold
                    )
                    return [stateful_documents[i] for i in sorted(included_idxs)]

                async def atransform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    raise NotImplementedError
    """

    @abstractmethod
    def transform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Transform a sequence of documents synchronously.

        Args:
            documents: The Documents to be transformed.
            **kwargs: Additional keyword arguments; their meaning is left to
                the implementing subclass.

        Returns:
            The transformed Documents.
        """

    @abstractmethod
    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Transform a sequence of documents asynchronously.

        Args:
            documents: The Documents to be transformed.
            **kwargs: Additional keyword arguments; their meaning is left to
                the implementing subclass.

        Returns:
            The transformed Documents.
        """
|
|