Spaces: AdrienB134 / matriv-rag-demo
AdrienB134 committed
Commit 7fdb8e9
Parent(s): 9ae6b81
Upload 54 files
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +1 -0
- Dockerfile +32 -0
- README.md +0 -8
- app.py +81 -0
- pyproject.toml +18 -0
- rag_demo/__init__.py +3 -0
- rag_demo/__pycache__/__init__.cpython-311.pyc +0 -0
- rag_demo/__pycache__/pipeline.cpython-311.pyc +0 -0
- rag_demo/__pycache__/settings.cpython-311.pyc +0 -0
- rag_demo/data/test.pdf +0 -0
- rag_demo/data/test2.pdf +3 -0
- rag_demo/infra/__pycache__/qdrant.cpython-311.pyc +0 -0
- rag_demo/infra/qdrant.py +25 -0
- rag_demo/pipeline.py +13 -0
- rag_demo/preprocessing/__init__.py +5 -0
- rag_demo/preprocessing/__pycache__/__init__.cpython-311.pyc +0 -0
- rag_demo/preprocessing/__pycache__/chunking.cpython-311.pyc +0 -0
- rag_demo/preprocessing/__pycache__/embed.cpython-311.pyc +0 -0
- rag_demo/preprocessing/__pycache__/load_to_vectordb.cpython-311.pyc +0 -0
- rag_demo/preprocessing/__pycache__/pdf_conversion.cpython-311.pyc +0 -0
- rag_demo/preprocessing/base/__init__.py +12 -0
- rag_demo/preprocessing/base/__pycache__/__init__.cpython-311.pyc +0 -0
- rag_demo/preprocessing/base/__pycache__/chunk.cpython-311.pyc +0 -0
- rag_demo/preprocessing/base/__pycache__/document.cpython-311.pyc +0 -0
- rag_demo/preprocessing/base/__pycache__/embedded_chunk.cpython-311.pyc +0 -0
- rag_demo/preprocessing/base/__pycache__/vectordb.cpython-311.pyc +0 -0
- rag_demo/preprocessing/base/chunk.py +13 -0
- rag_demo/preprocessing/base/document.py +19 -0
- rag_demo/preprocessing/base/embedded_chunk.py +34 -0
- rag_demo/preprocessing/base/embeddings.py +145 -0
- rag_demo/preprocessing/base/vectordb.py +289 -0
- rag_demo/preprocessing/chunking.py +26 -0
- rag_demo/preprocessing/embed.py +57 -0
- rag_demo/preprocessing/load_to_vectordb.py +30 -0
- rag_demo/preprocessing/pdf_conversion.py +33 -0
- rag_demo/rag/__pycache__/prompt_templates.cpython-311.pyc +0 -0
- rag_demo/rag/__pycache__/query_expansion.cpython-311.pyc +0 -0
- rag_demo/rag/__pycache__/reranker.cpython-311.pyc +0 -0
- rag_demo/rag/__pycache__/retriever.cpython-311.pyc +0 -0
- rag_demo/rag/base/__init__.py +3 -0
- rag_demo/rag/base/__pycache__/__init__.cpython-311.pyc +0 -0
- rag_demo/rag/base/__pycache__/query.cpython-311.pyc +0 -0
- rag_demo/rag/base/__pycache__/template_factory.cpython-311.pyc +0 -0
- rag_demo/rag/base/base.py +22 -0
- rag_demo/rag/base/query.py +29 -0
- rag_demo/rag/base/template_factory.py +22 -0
- rag_demo/rag/prompt_templates.py +38 -0
- rag_demo/rag/query_expansion.py +39 -0
- rag_demo/rag/reranker.py +24 -0
- rag_demo/rag/retriever.py +133 -0
.gitattributes
ADDED
@@ -0,0 +1 @@
rag_demo/data/test2.pdf filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,32 @@
# Use Python 3.11 as base image (pyproject.toml requires Python >= 3.11)
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies and uv
RUN apt-get update && apt-get install -y \
    poppler-utils \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && curl -LsSf https://astral.sh/uv/install.sh | sh \
    && echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc

# Copy requirements first to leverage the Docker layer cache
COPY pyproject.toml .

# Install Python dependencies using uv (source ~/.bashrc so uv is on PATH)
RUN . ~/.bashrc && uv pip install -r pyproject.toml --system

# Copy the rest of the application
COPY . .

# Create directories for uploads and embeddings if they don't exist
RUN mkdir -p uploads embeddings

# Expose the port the app runs on
EXPOSE 7860

# Run the app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,8 +0,0 @@
---
title: matriv-rag-demo
colorFrom: blue
colorTo: red
sdk: docker
app_file: app.py
pinned: false
---
app.py
ADDED
@@ -0,0 +1,81 @@
from fastapi import FastAPI, File, UploadFile, Request
from fastapi.templating import Jinja2Templates
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
import os
from rag_demo.pipeline import process_pdf
import nest_asyncio
from rag_demo.rag.retriever import RAGPipeline
from loguru import logger

app = FastAPI()

# Apply nest_asyncio at the start of the application
nest_asyncio.apply()

# Serve Jinja2 templates from the templates directory
templates = Jinja2Templates(directory="templates")

app.mount("/static", StaticFiles(directory="static"), name="static")


class ChatRequest(BaseModel):
    question: str


@app.get("/", response_class=HTMLResponse)
async def upload_page(request: Request):
    return templates.TemplateResponse("upload.html", {"request": request})


@app.get("/chat", response_class=HTMLResponse)
async def chat_page(request: Request):
    return templates.TemplateResponse("chat.html", {"request": request})


@app.post("/upload")
async def upload_pdf(request: Request, file: UploadFile = File(...)):
    try:
        # Create the data directory if it doesn't exist
        os.makedirs("data", exist_ok=True)

        file_path = f"data/{file.filename}"
        with open(file_path, "wb") as buffer:
            content = await file.read()
            buffer.write(content)

        # Process the PDF file (process_pdf is synchronous, so it is not awaited)
        process_pdf(file_path)

        # Return template response with success message
        return templates.TemplateResponse(
            "upload.html",
            {
                "request": request,
                "message": f"Successfully processed {file.filename}",
                "processing": False,
            },
        )
    except Exception as e:
        return templates.TemplateResponse(
            "upload.html", {"request": request, "error": str(e), "processing": False}
        )


@app.post("/chat")
async def chat(chat_request: ChatRequest):
    rag_pipeline = RAGPipeline()
    try:
        answer = rag_pipeline.rag(chat_request.question)
        logger.info(answer)
        return {"answer": answer}
    except Exception as e:
        return {"error": str(e)}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
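For reference, a minimal client sketch against the routes above (the base URL and file name are assumptions for a local run):

import requests

BASE = "http://localhost:7860"  # assumed local deployment

# Ingest a PDF through the /upload route
with open("my_doc.pdf", "rb") as f:  # hypothetical file
    requests.post(f"{BASE}/upload", files={"file": f})

# Ask a question through the /chat route
resp = requests.post(f"{BASE}/chat", json={"question": "What is this document about?"})
print(resp.json()["answer"])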
pyproject.toml
ADDED
@@ -0,0 +1,18 @@
[project]
name = "rag-base"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "loguru>=0.7.2",
    "langchain>=0.3.9",
    "marker-pdf>=1.0.2",
    "qdrant-client[fastembed]>=1.12.1",
    "fastapi>=0.115.6",
    "pydantic>=2.10.3",
    "python-multipart>=0.0.19",
    "uvicorn>=0.32.1",
    "huggingface-hub>=0.26.3",
    "llama-parse>=0.5.17",
]
rag_demo/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .infra.qdrant import connection

__all__ = ["connection"]
rag_demo/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (263 Bytes)
rag_demo/__pycache__/pipeline.cpython-311.pyc
ADDED
Binary file (738 Bytes)
rag_demo/__pycache__/settings.cpython-311.pyc
ADDED
Binary file (1.99 kB)
rag_demo/data/test.pdf
ADDED
Binary file (344 kB)
rag_demo/data/test2.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b3041eb7dd274b02a2f18049891dc3f184dff4151796f225b92cd34d676ba923
size 1962780
rag_demo/infra/__pycache__/qdrant.cpython-311.pyc
ADDED
Binary file (1.32 kB)
rag_demo/infra/qdrant.py
ADDED
@@ -0,0 +1,25 @@
from loguru import logger
from qdrant_client import QdrantClient


class QdrantDatabaseConnector:
    _instance: QdrantClient | None = None

    def __new__(cls, *args, **kwargs) -> QdrantClient:
        if cls._instance is None:
            try:
                # In-memory Qdrant instance; data is lost on restart
                cls._instance = QdrantClient(":memory:")

                logger.info("Connection to Qdrant DB successful")
            except Exception:
                logger.exception("Couldn't connect to Qdrant.")

                raise

        return cls._instance


connection = QdrantDatabaseConnector()
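Because __new__ caches and returns the QdrantClient itself, every instantiation yields the same in-memory client; a quick sketch of that behavior:

from rag_demo.infra.qdrant import QdrantDatabaseConnector, connection

client_a = QdrantDatabaseConnector()
client_b = QdrantDatabaseConnector()
assert client_a is client_b is connection  # one shared in-memory client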
rag_demo/pipeline.py
ADDED
@@ -0,0 +1,13 @@
from rag_demo.preprocessing import (
    convert_pdf_to_text,
    load_to_vector_db,
    chunk_and_embed,
)
from loguru import logger


def process_pdf(file_path: str):
    convert = convert_pdf_to_text([file_path])
    embedded_chunks = chunk_and_embed([convert])
    load_to_vector_db(embedded_chunks)
    return True
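The three preprocessing stages compose linearly, so the pipeline can also be driven directly; a minimal sketch (the PDF path is an assumption):

from rag_demo.pipeline import process_pdf

# convert -> chunk + embed -> load into Qdrant, end to end
process_pdf("rag_demo/data/test.pdf")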
rag_demo/preprocessing/__init__.py
ADDED
@@ -0,0 +1,5 @@
from .pdf_conversion import convert_pdf_to_text
from .load_to_vectordb import load_to_vector_db
from .embed import chunk_and_embed

__all__ = ["convert_pdf_to_text", "load_to_vector_db", "chunk_and_embed"]
rag_demo/preprocessing/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (441 Bytes)
rag_demo/preprocessing/__pycache__/chunking.cpython-311.pyc
ADDED
Binary file (1.25 kB)
rag_demo/preprocessing/__pycache__/embed.cpython-311.pyc
ADDED
Binary file (3.53 kB)
rag_demo/preprocessing/__pycache__/load_to_vectordb.cpython-311.pyc
ADDED
Binary file (2.39 kB)
rag_demo/preprocessing/__pycache__/pdf_conversion.cpython-311.pyc
ADDED
Binary file (1.8 kB)
rag_demo/preprocessing/base/__init__.py
ADDED
@@ -0,0 +1,12 @@
from .document import Document, CleanedDocument
from .chunk import Chunk
from .embedded_chunk import EmbeddedChunk
from .vectordb import VectorBaseDocument

__all__ = [
    "Document",
    "CleanedDocument",
    "Chunk",
    "EmbeddedChunk",
    "VectorBaseDocument",
]
rag_demo/preprocessing/base/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (528 Bytes)
rag_demo/preprocessing/base/__pycache__/chunk.cpython-311.pyc
ADDED
Binary file (927 Bytes)
rag_demo/preprocessing/base/__pycache__/document.cpython-311.pyc
ADDED
Binary file (1.12 kB)
rag_demo/preprocessing/base/__pycache__/embedded_chunk.cpython-311.pyc
ADDED
Binary file (2.04 kB)
rag_demo/preprocessing/base/__pycache__/vectordb.cpython-311.pyc
ADDED
Binary file (16.7 kB)
rag_demo/preprocessing/base/chunk.py
ADDED
@@ -0,0 +1,13 @@
from abc import ABC

from pydantic import UUID4, Field

from rag_demo.preprocessing.base.vectordb import VectorBaseDocument


class Chunk(VectorBaseDocument, ABC):
    content: str
    document_id: UUID4
    chunk_id: UUID4
    metadata: dict = Field(default_factory=dict)
rag_demo/preprocessing/base/document.py
ADDED
@@ -0,0 +1,19 @@
from abc import ABC

from pydantic import UUID4, BaseModel

from .vectordb import VectorBaseDocument


class CleanedDocument(VectorBaseDocument, ABC):
    content: str
    doc_id: UUID4
    doc_title: str
    # doc_url: str


class Document(BaseModel):
    text: str
    document_id: UUID4
    metadata: dict
rag_demo/preprocessing/base/embedded_chunk.py
ADDED
@@ -0,0 +1,34 @@
from abc import ABC

from pydantic import UUID4, Field


from .vectordb import VectorBaseDocument


class EmbeddedChunk(VectorBaseDocument, ABC):
    content: str
    embedding: list[float] | None
    document_id: UUID4
    chunk_id: UUID4
    metadata: dict = Field(default_factory=dict)
    similarity: float | None

    @classmethod
    def to_context(cls, chunks: list["EmbeddedChunk"]) -> str:
        context = ""
        for i, chunk in enumerate(chunks):
            context += f"""
            Chunk {i + 1}:
            Type: {chunk.__class__.__name__}
            Document ID: {chunk.document_id}
            Chunk ID: {chunk.chunk_id}
            Content: {chunk.content}\n
            """

        return context

    class Config:
        name = "embedded_documents"
        category = "Document"
        use_vector_index = True
rag_demo/preprocessing/base/embeddings.py
ADDED
@@ -0,0 +1,145 @@
from functools import cached_property
from pathlib import Path
from typing import Optional, ClassVar
from threading import Lock

import numpy as np
from loguru import logger
from numpy.typing import NDArray
from sentence_transformers.SentenceTransformer import SentenceTransformer
from transformers import AutoTokenizer

from rag_demo.settings import settings


class SingletonMeta(type):
    """
    This is a thread-safe implementation of Singleton.
    """

    _instances: ClassVar = {}

    _lock: Lock = Lock()

    """
    We now have a lock object that will be used to synchronize threads during
    first access to the Singleton.
    """

    def __call__(cls, *args, **kwargs):
        """
        Possible changes to the value of the `__init__` argument do not affect
        the returned instance.
        """
        # Imagine that the program has just been launched. Since there's no
        # Singleton instance yet, multiple threads can simultaneously pass the
        # previous conditional and reach this point almost at the same time. The
        # first of them will acquire the lock and proceed, while the rest will
        # wait here.
        with cls._lock:
            # The first thread to acquire the lock reaches this conditional,
            # goes inside and creates the Singleton instance. Once it leaves the
            # lock block, a thread that might have been waiting for the lock
            # release may then enter this section. But since the Singleton field
            # is already initialized, the thread won't create a new object.
            if cls not in cls._instances:
                instance = super().__call__(*args, **kwargs)
                cls._instances[cls] = instance

        return cls._instances[cls]


class EmbeddingModelSingleton(metaclass=SingletonMeta):
    """
    A singleton class that provides a pre-trained transformer model for generating embeddings of input text.
    """

    def __init__(
        self,
        model_id: str = settings.TEXT_EMBEDDING_MODEL_ID,
        device: str = settings.RAG_MODEL_DEVICE,
        cache_dir: Optional[Path] = None,
    ) -> None:
        self._model_id = model_id
        self._device = device

        self._model = SentenceTransformer(
            self._model_id,
            device=self._device,
            cache_folder=str(cache_dir) if cache_dir else None,
        )
        self._model.eval()

    @property
    def model_id(self) -> str:
        """
        Returns the identifier of the pre-trained transformer model to use.

        Returns:
            str: The identifier of the pre-trained transformer model to use.
        """

        return self._model_id

    @cached_property
    def embedding_size(self) -> int:
        """
        Returns the size of the embeddings generated by the pre-trained transformer model.

        Returns:
            int: The size of the embeddings generated by the pre-trained transformer model.
        """

        dummy_embedding = self._model.encode("")

        return dummy_embedding.shape[0]

    @property
    def max_input_length(self) -> int:
        """
        Returns the maximum length of input text to tokenize.

        Returns:
            int: The maximum length of input text to tokenize.
        """

        return self._model.max_seq_length

    @property
    def tokenizer(self) -> AutoTokenizer:
        """
        Returns the tokenizer used to tokenize input text.

        Returns:
            AutoTokenizer: The tokenizer used to tokenize input text.
        """

        return self._model.tokenizer

    def __call__(
        self, input_text: str | list[str], to_list: bool = True
    ) -> NDArray[np.float32] | list[float] | list[list[float]]:
        """
        Generates embeddings for the input text using the pre-trained transformer model.

        Args:
            input_text (str): The input text to generate embeddings for.
            to_list (bool): Whether to return the embeddings as a list or numpy array. Defaults to True.

        Returns:
            Union[np.ndarray, list]: The embeddings generated for the input text.
        """

        try:
            embeddings = self._model.encode(input_text)
        except Exception:
            logger.error(
                f"Error generating embeddings for {self._model_id=} and {input_text=}"
            )

            return [] if to_list else np.array([])

        if to_list:
            embeddings = embeddings.tolist()

        return embeddings
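A short sketch of what the metaclass guarantees in practice (model defaults come from rag_demo.settings, so no arguments are needed here):

from rag_demo.preprocessing.base.embeddings import EmbeddingModelSingleton

embedder_a = EmbeddingModelSingleton()
embedder_b = EmbeddingModelSingleton()
assert embedder_a is embedder_b  # SingletonMeta returns the cached instance

vector = embedder_a("some text")  # list[float] with to_list=True (the default)
print(len(vector), embedder_a.embedding_size)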
rag_demo/preprocessing/base/vectordb.py
ADDED
@@ -0,0 +1,289 @@
import uuid
from abc import ABC
from typing import Any, Callable, Dict, Generic, Type, TypeVar
from uuid import UUID

import numpy as np
from loguru import logger
from pydantic import UUID4, BaseModel, Field
from qdrant_client.http import exceptions
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.models import CollectionInfo, PointStruct, Record


from rag_demo.infra.qdrant import connection

T = TypeVar("T", bound="VectorBaseDocument")

EMBEDDING_SIZE = 1024


class VectorBaseDocument(BaseModel, Generic[T], ABC):
    id: UUID4 = Field(default_factory=uuid.uuid4)

    def __eq__(self, value: object) -> bool:
        if not isinstance(value, self.__class__):
            return False

        return self.id == value.id

    def __hash__(self) -> int:
        return hash(self.id)

    @classmethod
    def from_record(cls: Type[T], point: Record) -> T:
        _id = UUID(point.id, version=4)
        payload = point.payload or {}

        attributes = {
            "id": _id,
            **payload,
        }
        if cls._has_class_attribute("embedding"):
            attributes["embedding"] = point.vector or None

        return cls(**attributes)

    def to_point(self: T, **kwargs) -> PointStruct:
        exclude_unset = kwargs.pop("exclude_unset", False)
        by_alias = kwargs.pop("by_alias", True)

        payload = self.model_dump(
            exclude_unset=exclude_unset, by_alias=by_alias, **kwargs
        )

        _id = str(payload.pop("id"))
        vector = payload.pop("embedding", {})
        if vector and isinstance(vector, np.ndarray):
            vector = vector.tolist()

        return PointStruct(id=_id, vector=vector, payload=payload)

    def model_dump(self: T, **kwargs) -> dict:
        dict_ = super().model_dump(**kwargs)

        dict_ = self._uuid_to_str(dict_)

        return dict_

    def _uuid_to_str(self, item: Any) -> Any:
        if isinstance(item, dict):
            for key, value in item.items():
                if isinstance(value, UUID):
                    item[key] = str(value)
                elif isinstance(value, list):
                    item[key] = [self._uuid_to_str(v) for v in value]
                elif isinstance(value, dict):
                    item[key] = {k: self._uuid_to_str(v) for k, v in value.items()}

        return item

    @classmethod
    def bulk_insert(cls: Type[T], documents: list["VectorBaseDocument"]) -> bool:
        try:
            cls._bulk_insert(documents)
            logger.info(
                f"Successfully inserted {len(documents)} documents into {cls.get_collection_name()}"
            )

        except Exception as e:
            logger.error(f"Error inserting documents: {e}")
            logger.info(
                f"Collection '{cls.get_collection_name()}' does not exist. Trying to create the collection and reinsert the documents."
            )

            cls.create_collection()

            try:
                cls._bulk_insert(documents)
            except Exception as e:
                logger.error(f"Error inserting documents: {e}")
                logger.error(
                    f"Failed to insert documents in '{cls.get_collection_name()}'."
                )

                return False

        return True

    @classmethod
    def _bulk_insert(cls: Type[T], documents: list["VectorBaseDocument"]) -> None:
        points = [doc.to_point() for doc in documents]

        connection.upsert(collection_name=cls.get_collection_name(), points=points)

    @classmethod
    def bulk_find(
        cls: Type[T], limit: int = 10, **kwargs
    ) -> tuple[list[T], UUID | None]:
        try:
            documents, next_offset = cls._bulk_find(limit=limit, **kwargs)
        except exceptions.UnexpectedResponse:
            logger.error(
                f"Failed to search documents in '{cls.get_collection_name()}'."
            )

            documents, next_offset = [], None

        return documents, next_offset

    @classmethod
    def _bulk_find(
        cls: Type[T], limit: int = 10, **kwargs
    ) -> tuple[list[T], UUID | None]:
        collection_name = cls.get_collection_name()

        offset = kwargs.pop("offset", None)
        offset = str(offset) if offset else None

        records, next_offset = connection.scroll(
            collection_name=collection_name,
            limit=limit,
            with_payload=kwargs.pop("with_payload", True),
            with_vectors=kwargs.pop("with_vectors", False),
            offset=offset,
            **kwargs,
        )
        documents = [cls.from_record(record) for record in records]
        if next_offset is not None:
            next_offset = UUID(next_offset, version=4)

        return documents, next_offset

    @classmethod
    def search(cls: Type[T], query_vector: list, limit: int = 10, **kwargs) -> list[T]:
        try:
            documents = cls._search(query_vector=query_vector, limit=limit, **kwargs)
        except exceptions.UnexpectedResponse:
            logger.error(
                f"Failed to search documents in '{cls.get_collection_name()}'."
            )

            documents = []

        return documents

    @classmethod
    def _search(cls: Type[T], query_vector: list, limit: int = 10, **kwargs) -> list[T]:
        collection_name = cls.get_collection_name()
        records = connection.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=limit,
            with_payload=kwargs.pop("with_payload", True),
            with_vectors=kwargs.pop("with_vectors", False),
            **kwargs,
        )
        documents = [cls.from_record(record) for record in records]

        return documents

    @classmethod
    def get_or_create_collection(cls: Type[T]) -> CollectionInfo:
        collection_name = cls.get_collection_name()

        try:
            return connection.get_collection(collection_name=collection_name)
        except exceptions.UnexpectedResponse:
            use_vector_index = cls.get_use_vector_index()

            collection_created = cls._create_collection(
                collection_name=collection_name, use_vector_index=use_vector_index
            )
            if collection_created is False:
                raise RuntimeError(
                    f"Couldn't create collection {collection_name}"
                ) from None

            return connection.get_collection(collection_name=collection_name)

    @classmethod
    def create_collection(cls: Type[T]) -> bool:
        collection_name = cls.get_collection_name()
        use_vector_index = cls.get_use_vector_index()
        logger.info(
            f"Creating collection {collection_name} with use_vector_index={use_vector_index}"
        )
        return cls._create_collection(
            collection_name=collection_name, use_vector_index=use_vector_index
        )

    @classmethod
    def _create_collection(
        cls, collection_name: str, use_vector_index: bool = True
    ) -> bool:
        if use_vector_index is True:
            vectors_config = VectorParams(size=EMBEDDING_SIZE, distance=Distance.COSINE)
        else:
            vectors_config = {}

        return connection.create_collection(
            collection_name=collection_name, vectors_config=vectors_config
        )

    @classmethod
    def get_collection_name(cls: Type[T]) -> str:
        if not hasattr(cls, "Config") or not hasattr(cls.Config, "name"):
            raise Exception(
                f"The class {cls} should define a Config class with the 'name' property that reflects the collection's name."
            )

        return cls.Config.name

    @classmethod
    def get_use_vector_index(cls: Type[T]) -> bool:
        if not hasattr(cls, "Config") or not hasattr(cls.Config, "use_vector_index"):
            return True

        return cls.Config.use_vector_index

    @classmethod
    def group_by_class(
        cls: Type["VectorBaseDocument"], documents: list["VectorBaseDocument"]
    ) -> Dict["VectorBaseDocument", list["VectorBaseDocument"]]:
        return cls._group_by(documents, selector=lambda doc: doc.__class__)

    @classmethod
    def _group_by(
        cls: Type[T], documents: list[T], selector: Callable[[T], Any]
    ) -> Dict[Any, list[T]]:
        grouped = {}
        for doc in documents:
            key = selector(doc)

            if key not in grouped:
                grouped[key] = []
            grouped[key].append(doc)

        return grouped

    @classmethod
    def collection_name_to_class(
        cls: Type["VectorBaseDocument"], collection_name: str
    ) -> type["VectorBaseDocument"]:
        for subclass in cls.__subclasses__():
            try:
                if subclass.get_collection_name() == collection_name:
                    return subclass
            except Exception:
                pass

            try:
                return subclass.collection_name_to_class(collection_name)
            except ValueError:
                continue

        raise ValueError(f"No subclass found for collection name: {collection_name}")

    @classmethod
    def _has_class_attribute(cls: Type[T], attribute_name: str) -> bool:
        if attribute_name in cls.__annotations__:
            return True

        for base in cls.__bases__:
            if hasattr(base, "_has_class_attribute") and base._has_class_attribute(
                attribute_name
            ):
                return True

        return False
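To use this ODM-style base, a subclass only needs its fields plus a Config naming the Qdrant collection; a hypothetical sketch (ReportChunk and its collection name are illustrative, not part of the repo):

from uuid import uuid4
from pydantic import UUID4, Field
from rag_demo.preprocessing.base.vectordb import VectorBaseDocument

class ReportChunk(VectorBaseDocument):  # hypothetical subclass
    content: str
    document_id: UUID4
    embedding: list[float] | None = None
    metadata: dict = Field(default_factory=dict)

    class Config:
        name = "report_chunks"   # collection name used by get_collection_name()
        use_vector_index = True  # 1024-dim cosine vectors (EMBEDDING_SIZE)

# bulk_insert() creates the collection on the first failure, then retries:
ReportChunk.bulk_insert(
    [ReportChunk(content="hello", document_id=uuid4(), embedding=[0.0] * 1024)]
)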
rag_demo/preprocessing/chunking.py
ADDED
@@ -0,0 +1,26 @@
from uuid import uuid4

from langchain.text_splitter import MarkdownTextSplitter
from rag_demo.preprocessing.base import Chunk, Document


def chunk_text(
    document: Document, chunk_size: int = 500, chunk_overlap: int = 50
) -> list[Chunk]:
    text_splitter = MarkdownTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(document.text)
    result = []
    for chunk in chunks:
        result.append(
            Chunk(
                content=chunk,
                document_id=document.document_id,
                chunk_id=uuid4(),
                metadata=document.metadata,
            )
        )

    return result
rag_demo/preprocessing/embed.py
ADDED
@@ -0,0 +1,57 @@
from typing_extensions import Annotated
from typing import Generator
from .base import Chunk
from .base import EmbeddedChunk
from .chunking import chunk_text
from huggingface_hub import InferenceClient
import os
from dotenv import load_dotenv
from uuid import uuid4
from loguru import logger

load_dotenv()


def batch(list_: list, size: int) -> Generator[list, None, None]:
    yield from (list_[i : i + size] for i in range(0, len(list_), size))


def embed_chunks(chunks: list[Chunk]) -> list[EmbeddedChunk]:
    api = InferenceClient(
        model="intfloat/multilingual-e5-large-instruct",
        token=os.getenv("HF_API_TOKEN"),
    )
    logger.info(f"Embedding {len(chunks)} chunks")
    embedded_chunks = []
    for chunk in chunks:
        try:
            embedded_chunks.append(
                EmbeddedChunk(
                    id=uuid4(),
                    content=chunk.content,
                    # feature_extraction returns an ndarray; convert it to a
                    # plain list for the pydantic list[float] field
                    embedding=api.feature_extraction(chunk.content).tolist(),
                    document_id=chunk.document_id,
                    chunk_id=chunk.chunk_id,
                    metadata=chunk.metadata,
                    similarity=None,
                )
            )
        except Exception as e:
            logger.error(f"Error embedding chunk: {e}")
    logger.info(f"{len(embedded_chunks)} chunks embedded successfully")

    return embedded_chunks


def chunk_and_embed(
    cleaned_documents: Annotated[list, "cleaned_documents"],
) -> Annotated[list, "embedded_documents"]:
    embedded_chunks = []
    for document in cleaned_documents:
        chunks = chunk_text(document)

        for batched_chunks in batch(chunks, 10):
            batched_embedded_chunks = embed_chunks(batched_chunks)
            embedded_chunks.extend(batched_embedded_chunks)
    logger.info(f"{len(embedded_chunks)} chunks embedded successfully")
    return embedded_chunks
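The batch helper drives the 10-chunk embedding batches above; a quick illustration of its behavior:

from rag_demo.preprocessing.embed import batch

print(list(batch(list(range(7)), size=3)))
# -> [[0, 1, 2], [3, 4, 5], [6]]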
rag_demo/preprocessing/load_to_vectordb.py
ADDED
@@ -0,0 +1,30 @@
from loguru import logger
from typing_extensions import Annotated
from typing import Generator

from .base import VectorBaseDocument


def batch(list_: list, size: int) -> Generator[list, None, None]:
    yield from (list_[i : i + size] for i in range(0, len(list_), size))


def load_to_vector_db(
    documents: Annotated[list, "documents"],
) -> Annotated[bool, "successful"]:
    logger.info(f"Loading {len(documents)} documents into the vector database.")

    grouped_documents = VectorBaseDocument.group_by_class(documents)
    for document_class, documents in grouped_documents.items():
        logger.info(f"Loading documents into {document_class.get_collection_name()}")
        for documents_batch in batch(documents, size=4):
            try:
                document_class.bulk_insert(documents_batch)
            except Exception as e:
                logger.error(
                    f"Failed to insert documents into {document_class.get_collection_name()}: {e}"
                )

                return False

    return True
rag_demo/preprocessing/pdf_conversion.py
ADDED
@@ -0,0 +1,33 @@
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
from uuid import uuid4
from .base import Document
from loguru import logger
import os

from dotenv import load_dotenv

load_dotenv()


# Set up the parser. The API key is read from the environment rather than
# hard-coded, so no credential is committed to the repo.
parser = LlamaParse(
    api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
    result_type="markdown",  # "markdown" and "text" are available
)


def convert_pdf_to_text(filepaths: list[str]) -> Document:
    file_extractor = {".pdf": parser}
    # Use SimpleDirectoryReader to parse our files
    documents = SimpleDirectoryReader(
        input_files=filepaths, file_extractor=file_extractor
    ).load_data()

    logger.info(f"Converted {len(documents)} documents")

    return Document(
        document_id=uuid4(),
        text=" ".join(document.text for document in documents),
        metadata={"filename": filepaths[0].split("/")[-1]},
    )
rag_demo/rag/__pycache__/prompt_templates.cpython-311.pyc
ADDED
Binary file (2.75 kB)
rag_demo/rag/__pycache__/query_expansion.cpython-311.pyc
ADDED
Binary file (2.4 kB)
rag_demo/rag/__pycache__/reranker.cpython-311.pyc
ADDED
Binary file (1.96 kB)
rag_demo/rag/__pycache__/retriever.cpython-311.pyc
ADDED
Binary file (8.21 kB)
rag_demo/rag/base/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .template_factory import PromptTemplateFactory

__all__ = ["PromptTemplateFactory"]
rag_demo/rag/base/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (283 Bytes)
rag_demo/rag/base/__pycache__/query.cpython-311.pyc
ADDED
Binary file (2.08 kB)
rag_demo/rag/base/__pycache__/template_factory.cpython-311.pyc
ADDED
Binary file (1.64 kB)
rag_demo/rag/base/base.py
ADDED
@@ -0,0 +1,22 @@
from abc import ABC, abstractmethod
from typing import Any

from langchain.prompts import PromptTemplate
from pydantic import BaseModel

from rag_demo.rag.base.query import Query


class PromptTemplateFactory(ABC, BaseModel):
    @abstractmethod
    def create_template(self) -> PromptTemplate:
        pass


class RAGStep(ABC):
    def __init__(self, mock: bool = False) -> None:
        self._mock = mock

    @abstractmethod
    def generate(self, query: Query, *args, **kwargs) -> Any:
        pass
rag_demo/rag/base/query.py
ADDED
@@ -0,0 +1,29 @@
from pydantic import Field

from rag_demo.preprocessing.base import VectorBaseDocument


class Query(VectorBaseDocument):
    content: str
    metadata: dict = Field(default_factory=dict)

    class Config:
        category = "query"

    @classmethod
    def from_str(cls, query: str) -> "Query":
        return Query(content=query.strip("\n "))

    def replace_content(self, new_content: str) -> "Query":
        return Query(
            id=self.id,
            content=new_content,
            metadata=self.metadata,
        )


class EmbeddedQuery(Query):
    embedding: list[float]

    class Config:
        category = "query"
rag_demo/rag/base/template_factory.py
ADDED
@@ -0,0 +1,22 @@
from abc import ABC, abstractmethod
from typing import Any

from langchain.prompts import PromptTemplate
from pydantic import BaseModel

from .query import Query


class PromptTemplateFactory(ABC, BaseModel):
    @abstractmethod
    def create_template(self) -> PromptTemplate:
        pass


class RAGStep(ABC):
    def __init__(self, mock: bool = False) -> None:
        self._mock = mock

    @abstractmethod
    def generate(self, query: Query, *args, **kwargs) -> Any:
        pass
rag_demo/rag/prompt_templates.py
ADDED
@@ -0,0 +1,38 @@
from langchain.prompts import PromptTemplate

from .base import PromptTemplateFactory


class QueryExpansionTemplate(PromptTemplateFactory):
    prompt: str = """You are an AI language model assistant. Your task is to generate {expand_to_n}
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by '{separator}'.
Original question: {question}"""

    @property
    def separator(self) -> str:
        return "#next-question#"

    def create_template(self, expand_to_n: int) -> PromptTemplate:
        return PromptTemplate(
            template=self.prompt,
            input_variables=["question"],
            partial_variables={
                "separator": self.separator,
                "expand_to_n": expand_to_n,
            },
        )


class AnswerGenerationTemplate(PromptTemplateFactory):
    prompt: str = """You are an AI language model assistant. Your task is to generate an answer to the given user question based on the provided context.
Context: {context}
Question: {question}

Give your answer in markdown format if needed, for example if a table is the best way to answer the question, or if titles and subheadings are needed.
Give only your answer, do not include any other text like 'Certainly! Here is the answer:' or 'The answer is:' or anything similar."""

    def create_template(self, context: str, question: str) -> str:
        return self.prompt.format(context=context, question=question)
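A quick sketch of how the expansion template renders (the question is illustrative):

from rag_demo.rag.prompt_templates import QueryExpansionTemplate

template = QueryExpansionTemplate()
prompt = template.create_template(expand_to_n=2)
print(prompt.format(question="What is the warranty period?"))
# The model's reply is later split on template.separator ("#next-question#")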
rag_demo/rag/query_expansion.py
ADDED
@@ -0,0 +1,39 @@
import os
from typing import Any

from huggingface_hub import InferenceClient

from rag_demo.rag.base.query import Query
from rag_demo.rag.base.template_factory import RAGStep
from rag_demo.rag.prompt_templates import QueryExpansionTemplate


class QueryExpansion(RAGStep):
    def generate(self, query: Query, expand_to_n: int) -> Any:
        api = InferenceClient(
            model="Qwen/Qwen2.5-72B-Instruct",
            token=os.getenv("HF_API_TOKEN"),
        )
        query_expansion_template = QueryExpansionTemplate()
        # The original query counts as one, so ask for expand_to_n - 1 rewrites
        prompt = query_expansion_template.create_template(expand_to_n - 1)
        response = api.chat_completion(
            [
                {
                    "role": "user",
                    # PromptTemplate.format fills in the separator and
                    # expand_to_n partials configured above
                    "content": prompt.format(question=query.content),
                }
            ]
        )
        result = response.choices[0].message.content
        queries_content = result.split(query_expansion_template.separator)
        queries = [query]
        queries += [
            query.replace_content(stripped_content)
            for content in queries_content
            if (stripped_content := content.strip())
        ]
        return queries
rag_demo/rag/reranker.py
ADDED
@@ -0,0 +1,24 @@
import os

from huggingface_hub import InferenceClient

from rag_demo.rag.base.query import Query
from rag_demo.rag.base.template_factory import RAGStep
from rag_demo.preprocessing.embed import EmbeddedChunk


class Reranker(RAGStep):
    def generate(
        self, query: Query, chunks: list[EmbeddedChunk], keep_top_k: int
    ) -> list[EmbeddedChunk]:
        api = InferenceClient(
            model="intfloat/multilingual-e5-large-instruct",
            token=os.getenv("HF_API_TOKEN"),
        )
        similarity = api.sentence_similarity(
            query.content, [chunk.content for chunk in chunks]
        )
        for chunk, sim in zip(chunks, similarity):
            chunk.similarity = sim

        return sorted(chunks, key=lambda x: x.similarity, reverse=True)[:keep_top_k]
rag_demo/rag/retriever.py
ADDED
@@ -0,0 +1,133 @@
import concurrent.futures
import os

from loguru import logger
from huggingface_hub import InferenceClient

from rag_demo.preprocessing.base import (
    EmbeddedChunk,
)
from rag_demo.rag.base.query import EmbeddedQuery, Query

from .query_expansion import QueryExpansion
from .reranker import Reranker
from .prompt_templates import AnswerGenerationTemplate

from dotenv import load_dotenv

load_dotenv()


def flatten(nested_list: list) -> list:
    """Flatten a list of lists into a single list."""

    return [item for sublist in nested_list for item in sublist]


class RAGPipeline:
    def __init__(self, mock: bool = False) -> None:
        self._query_expander = QueryExpansion(mock=mock)
        self._reranker = Reranker(mock=mock)

    def search(
        self,
        query: str,
        k: int = 3,
        expand_to_n_queries: int = 3,
    ) -> list:
        query_model = Query.from_str(query)

        n_generated_queries = self._query_expander.generate(
            query_model, expand_to_n=expand_to_n_queries
        )
        logger.info(
            f"Successfully generated {len(n_generated_queries)} search queries.",
        )

        with concurrent.futures.ThreadPoolExecutor() as executor:
            search_tasks = [
                executor.submit(self._search, _query_model, k)
                for _query_model in n_generated_queries
            ]

            n_k_documents = [
                task.result() for task in concurrent.futures.as_completed(search_tasks)
            ]
            n_k_documents = flatten(n_k_documents)
            n_k_documents = list(set(n_k_documents))

        logger.info(f"{len(n_k_documents)} documents retrieved successfully")

        if len(n_k_documents) > 0:
            k_documents = self.rerank(query, chunks=n_k_documents, keep_top_k=k)
        else:
            k_documents = []

        return k_documents

    def _search(self, query: Query, k: int = 3) -> list[EmbeddedChunk]:
        assert k >= 3, "k should be >= 3"

        def _search_data(
            data_category_odm: type[EmbeddedChunk], embedded_query: EmbeddedQuery
        ) -> list[EmbeddedChunk]:
            return data_category_odm.search(
                query_vector=embedded_query.embedding,
                limit=k,
            )

        api = InferenceClient(
            model="intfloat/multilingual-e5-large-instruct",
            token=os.getenv("HF_API_TOKEN"),
        )
        embedded_query: EmbeddedQuery = EmbeddedQuery(
            embedding=api.feature_extraction(query.content),
            id=query.id,
            content=query.content,
        )

        retrieved_chunks = _search_data(EmbeddedChunk, embedded_query)
        logger.info(f"{len(retrieved_chunks)} documents retrieved successfully")

        return retrieved_chunks

    def rerank(
        self, query: str | Query, chunks: list[EmbeddedChunk], keep_top_k: int
    ) -> list[EmbeddedChunk]:
        if isinstance(query, str):
            query = Query.from_str(query)

        reranked_documents = self._reranker.generate(
            query=query, chunks=chunks, keep_top_k=keep_top_k
        )

        logger.info(f"{len(reranked_documents)} documents reranked successfully.")

        return reranked_documents

    def generate_answer(self, query: str, reranked_chunks: list[EmbeddedChunk]) -> str:
        context = ""
        for chunk in reranked_chunks:
            context += "\n Document: "
            context += chunk.content
        api = InferenceClient(
            model="meta-llama/Llama-3.1-8B-Instruct",
            token=os.getenv("HF_API_TOKEN"),
        )
        answer_generation_template = AnswerGenerationTemplate()
        prompt = answer_generation_template.create_template(context, query)
        logger.info(prompt)
        response = api.chat_completion(
            [{"role": "user", "content": prompt}],
            max_tokens=8192,
        )
        return response.choices[0].message.content

    def rag(self, query: str) -> tuple[str, list[str]]:
        docs = self.search(query, k=10)
        reranked_docs = self.rerank(query, docs, keep_top_k=10)
        return (
            self.generate_answer(query, reranked_docs),
            [doc.metadata["filename"].split(".pdf")[0] for doc in reranked_docs],
        )
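Taken together, a minimal end-to-end sketch of the pipeline (assuming documents were already ingested and HF_API_TOKEN is set in the environment):

from rag_demo.rag.retriever import RAGPipeline

pipeline = RAGPipeline()
answer, source_files = pipeline.rag("What does the uploaded document cover?")
print(answer)
print(source_files)  # source PDF names with the ".pdf" suffix stripped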