geekyrakshit
committed on
Merge pull request #13 from soumik12345/feat/retrieval
- .github/workflows/tests.yml +21 -0
- .gitignore +3 -0
- docs/retreival/bm25s.md +3 -0
- docs/retreival/colpali.md +3 -0
- docs/retreival/contriever.md +3 -0
- docs/retreival/medcpt.md +3 -0
- docs/retreival/multi_modal_retrieval.md +0 -3
- docs/retreival/nv_embed_2.md +3 -0
- medrag_multi_modal/retrieval/__init__.py +14 -2
- medrag_multi_modal/retrieval/bm25s_retrieval.py +213 -0
- medrag_multi_modal/retrieval/{multi_modal_retrieval.py → colpali_retrieval.py} +163 -57
- medrag_multi_modal/retrieval/common.py +45 -0
- medrag_multi_modal/retrieval/contriever_retrieval.py +240 -0
- medrag_multi_modal/retrieval/medcpt_retrieval.py +279 -0
- medrag_multi_modal/retrieval/nv_embed_2.py +282 -0
- medrag_multi_modal/utils.py +20 -1
- mkdocs.yml +5 -1
- pyproject.toml +18 -8
.github/workflows/tests.yml
ADDED
@@ -0,0 +1,21 @@
name: Tests
on:
  pull_request:
    paths:
      - .github/workflows/tests.yml
      - medrag_multi_modal/**
      - pyproject.toml

jobs:
  code-format:
    name: check code format using black
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: psf/black@stable
  lint:
    name: Check linting using ruff
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: chartboost/ruff-action@v1
.gitignore
CHANGED
@@ -18,3 +18,6 @@ wandb/
 cursor_prompt.txt
 test.py
 uv.lock
+grays-anatomy-bm25s/
+prompt**.txt
+**.safetensors
docs/retreival/bm25s.md
ADDED
@@ -0,0 +1,3 @@
# BM25-Sparse Retrieval

::: medrag_multi_modal.retrieval.bm25s_retrieval
docs/retreival/colpali.md
ADDED
@@ -0,0 +1,3 @@
# ColPali Retrieval

::: medrag_multi_modal.retrieval.colpali_retrieval
docs/retreival/contriever.md
ADDED
@@ -0,0 +1,3 @@
# Contriever Retrieval

::: medrag_multi_modal.retrieval.contriever_retrieval
docs/retreival/medcpt.md
ADDED
@@ -0,0 +1,3 @@
# MedCPT Retrieval

::: medrag_multi_modal.retrieval.medcpt_retrieval
docs/retreival/multi_modal_retrieval.md
DELETED
@@ -1,3 +0,0 @@
-# Multi-Modal Retrieval
-
-::: medrag_multi_modal.retrieval.multi_modal_retrieval
docs/retreival/nv_embed_2.md
ADDED
@@ -0,0 +1,3 @@
# NV-Embed-v2 Retrieval

::: medrag_multi_modal.retrieval.nv_embed_2
medrag_multi_modal/retrieval/__init__.py
CHANGED
@@ -1,3 +1,15 @@
-from .…
+from .bm25s_retrieval import BM25sRetriever
+from .colpali_retrieval import CalPaliRetriever
+from .common import SimilarityMetric
+from .contriever_retrieval import ContrieverRetriever
+from .medcpt_retrieval import MedCPTRetriever
+from .nv_embed_2 import NVEmbed2Retriever
 
-__all__ = […
+__all__ = [
+    "CalPaliRetriever",
+    "BM25sRetriever",
+    "ContrieverRetriever",
+    "SimilarityMetric",
+    "MedCPTRetriever",
+    "NVEmbed2Retriever",
+]
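With these exports in place, downstream code can import every retriever from the package root instead of the individual modules. A minimal sketch, assuming the package and its optional model dependencies are installed:

```python
# Import the retrievers re-exported by medrag_multi_modal/retrieval/__init__.py.
from medrag_multi_modal.retrieval import (
    BM25sRetriever,
    ContrieverRetriever,
    MedCPTRetriever,
    NVEmbed2Retriever,
    SimilarityMetric,
)

# Each retriever follows the same pattern: build or load an index, then query it.
retriever = BM25sRetriever(language="english", use_stemmer=True)
print(SimilarityMetric.COSINE.value)  # "cosine"
```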
medrag_multi_modal/retrieval/bm25s_retrieval.py
ADDED
@@ -0,0 +1,213 @@
import os
from glob import glob
from typing import Optional

import bm25s
import weave
from Stemmer import Stemmer

import wandb

LANGUAGE_DICT = {
    "english": "en",
    "french": "fr",
    "german": "de",
}


class BM25sRetriever(weave.Model):
    """
    `BM25sRetriever` is a class that provides functionality for indexing and
    retrieving documents using the [BM25-Sparse](https://github.com/xhluca/bm25s).

    Args:
        language (str): The language of the documents to be indexed and retrieved.
        use_stemmer (bool): A flag indicating whether to use stemming during tokenization.
        retriever (Optional[bm25s.BM25]): An instance of the BM25 retriever. If not provided,
            a new instance is created.
    """

    language: str
    use_stemmer: bool
    _retriever: Optional[bm25s.BM25]

    def __init__(
        self,
        language: str = "english",
        use_stemmer: bool = True,
        retriever: Optional[bm25s.BM25] = None,
    ):
        super().__init__(language=language, use_stemmer=use_stemmer)
        self._retriever = retriever or bm25s.BM25()

    def index(self, chunk_dataset_name: str, index_name: Optional[str] = None):
        """
        Indexes a dataset of text chunks using the BM25 algorithm.

        This function takes a dataset of text chunks identified by `chunk_dataset_name`,
        tokenizes the text using the BM25 tokenizer with optional stemming, and indexes
        the tokenized text using the BM25 retriever. If an `index_name` is provided, the
        index is saved to disk and logged as a Weights & Biases artifact.

        !!! example "Example Usage"
            ```python
            import weave
            from dotenv import load_dotenv

            import wandb
            from medrag_multi_modal.retrieval import BM25sRetriever

            load_dotenv()
            weave.init(project_name="ml-colabs/medrag-multi-modal")
            wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="bm25s-index")
            retriever = BM25sRetriever()
            retriever.index(chunk_dataset_name="grays-anatomy-text:v13", index_name="grays-anatomy-bm25s")
            ```

        Args:
            chunk_dataset_name (str): The name of the dataset containing text chunks to be indexed.
            index_name (Optional[str]): The name to save the index under. If provided, the index
                is saved to disk and logged as a Weights & Biases artifact.
        """
        chunk_dataset = weave.ref(chunk_dataset_name).get().rows
        corpus = [row["text"] for row in chunk_dataset]
        corpus_tokens = bm25s.tokenize(
            corpus,
            stopwords=LANGUAGE_DICT[self.language],
            stemmer=Stemmer(self.language) if self.use_stemmer else None,
        )
        self._retriever.index(corpus_tokens)
        if index_name:
            self._retriever.save(
                index_name, corpus=[dict(row) for row in chunk_dataset]
            )
            if wandb.run:
                artifact = wandb.Artifact(
                    name=index_name,
                    type="bm25s-index",
                    metadata={
                        "language": self.language,
                        "use_stemmer": self.use_stemmer,
                    },
                )
                artifact.add_dir(index_name, name=index_name)
                artifact.save()

    @classmethod
    def from_wandb_artifact(cls, index_artifact_address: str):
        """
        Creates an instance of the class from a Weights & Biases artifact.

        This class method retrieves a BM25 index artifact from Weights & Biases,
        downloads the artifact, and loads the BM25 retriever with the index and its
        associated corpus. The method also extracts metadata from the artifact to
        initialize the class instance with the appropriate language and stemming
        settings.

        !!! example "Example Usage"
            ```python
            import weave
            from dotenv import load_dotenv

            from medrag_multi_modal.retrieval import BM25sRetriever

            load_dotenv()
            weave.init(project_name="ml-colabs/medrag-multi-modal")
            retriever = BM25sRetriever.from_wandb_artifact(
                index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-bm25s:latest"
            )
            ```

        Args:
            index_artifact_address (str): The address of the Weights & Biases artifact
                containing the BM25 index.

        Returns:
            An instance of the class initialized with the BM25 retriever and metadata
            from the artifact.
        """
        if wandb.run:
            artifact = wandb.run.use_artifact(
                index_artifact_address, type="bm25s-index"
            )
            artifact_dir = artifact.download()
        else:
            api = wandb.Api()
            artifact = api.artifact(index_artifact_address)
            artifact_dir = artifact.download()
        retriever = bm25s.BM25.load(
            glob(os.path.join(artifact_dir, "*"))[0], load_corpus=True
        )
        metadata = artifact.metadata
        return cls(
            language=metadata["language"],
            use_stemmer=metadata["use_stemmer"],
            retriever=retriever,
        )

    @weave.op()
    def retrieve(self, query: str, top_k: int = 2):
        """
        Retrieves the top-k most relevant chunks for a given query using the BM25 algorithm.

        This method tokenizes the input query using the BM25 tokenizer, which takes into
        account the language-specific stopwords and optional stemming. It then retrieves
        the top-k most relevant chunks from the BM25 index based on the tokenized query.
        The results are returned as a list of dictionaries, each containing a chunk and
        its corresponding relevance score.

        Args:
            query (str): The input query string to search for relevant chunks.
            top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.

        Returns:
            list: A list of dictionaries, each containing a retrieved chunk and its
                relevance score.
        """
        query_tokens = bm25s.tokenize(
            query,
            stopwords=LANGUAGE_DICT[self.language],
            stemmer=Stemmer(self.language) if self.use_stemmer else None,
        )
        results = self._retriever.retrieve(query_tokens, k=top_k)
        retrieved_chunks = []
        for chunk, score in zip(
            results.documents.flatten().tolist(),
            results.scores.flatten().tolist(),
        ):
            retrieved_chunks.append({"chunk": chunk, "score": score})
        return retrieved_chunks

    @weave.op()
    def predict(self, query: str, top_k: int = 2):
        """
        Predicts the top-k most relevant chunks for a given query using the BM25 algorithm.

        This function is a wrapper around the `retrieve` method. It takes an input query string,
        tokenizes it using the BM25 tokenizer, and retrieves the top-k most relevant chunks from
        the BM25 index. The results are returned as a list of dictionaries, each containing a chunk
        and its corresponding relevance score.

        !!! example "Example Usage"
            ```python
            import weave
            from dotenv import load_dotenv

            from medrag_multi_modal.retrieval import BM25sRetriever

            load_dotenv()
            weave.init(project_name="ml-colabs/medrag-multi-modal")
            retriever = BM25sRetriever.from_wandb_artifact(
                index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-bm25s:latest"
            )
            retrieved_chunks = retriever.predict(query="What are Ribosomes?")
            ```

        Args:
            query (str): The input query string to search for relevant chunks.
            top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.

        Returns:
            list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
        """
        return self.retrieve(query, top_k)
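The class above is a thin `weave.Model` wrapper around the `bm25s` library, and the indexing and querying calls it makes can be exercised directly. A rough, self-contained sketch with a toy in-memory corpus (the corpus and query below are invented for illustration):

```python
import bm25s
from Stemmer import Stemmer

# Toy corpus standing in for the text chunks pulled from the Weave dataset.
corpus = [
    "Ribosomes synthesize proteins from amino acids.",
    "The femur is the longest bone in the human body.",
]

stemmer = Stemmer("english")
corpus_tokens = bm25s.tokenize(corpus, stopwords="en", stemmer=stemmer)

retriever = bm25s.BM25()
retriever.index(corpus_tokens)

# Mirrors BM25sRetriever.retrieve(): tokenize the query the same way, then fetch top-k results.
query_tokens = bm25s.tokenize("What are ribosomes?", stopwords="en", stemmer=stemmer)
results = retriever.retrieve(query_tokens, corpus=corpus, k=1)
for chunk, score in zip(results.documents.flatten().tolist(), results.scores.flatten().tolist()):
    print({"chunk": chunk, "score": score})
```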
medrag_multi_modal/retrieval/{multi_modal_retrieval.py → colpali_retrieval.py}
RENAMED
@@ -1,8 +1,11 @@
 import os
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional
 
 import weave
-…
+
+if TYPE_CHECKING:
+    from byaldi import RAGMultiModalModel
+
 from PIL import Image
 
 import wandb
@@ -10,64 +13,33 @@ import wandb
 from ..utils import get_wandb_artifact
 
 
-class …
+class CalPaliRetriever(weave.Model):
     """
-    …
+    CalPaliRetriever is a class that facilitates the retrieval of page images using ColPali.
 
     This class leverages the `byaldi.RAGMultiModalModel` to perform document retrieval tasks.
     It can be initialized with a pre-trained model or from a specified W&B artifact. The class
    also provides methods to index new data and to predict/retrieve documents based on a query.
 
-    !!! example "Indexing Data"
-        ```python
-        import wandb
-        from medrag_multi_modal.retrieval import MultiModalRetriever
-
-        wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="index")
-        retriever = MultiModalRetriever()
-        retriever.index(
-            data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
-            weave_dataset_name="grays-anatomy-images:v0",
-            index_name="grays-anatomy",
-        )
-        ```
-
-    !!! example "Retrieving Documents"
-        ```python
-        import weave
-
-        import wandb
-        from medrag_multi_modal.retrieval import MultiModalRetriever
-
-        weave.init(project_name="ml-colabs/medrag-multi-modal")
-        retriever = MultiModalRetriever.from_artifact(
-            index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
-            metadata_dataset_name="grays-anatomy-images:v0",
-            data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
-        )
-        retriever.predict(
-            query="which neurotransmitters convey information between Merkel cells and sensory afferents?",
-            top_k=3,
-        )
-        ```
-
     Attributes:
         model_name (str): The name of the model to be used for retrieval.
     """
 
     model_name: str
-    _docs_retrieval_model: Optional[RAGMultiModalModel] = None
+    _docs_retrieval_model: Optional["RAGMultiModalModel"] = None
     _metadata: Optional[dict] = None
     _data_artifact_dir: Optional[str] = None
 
     def __init__(
         self,
         model_name: str = "vidore/colpali-v1.2",
-        docs_retrieval_model: Optional[RAGMultiModalModel] = None,
+        docs_retrieval_model: Optional["RAGMultiModalModel"] = None,
         data_artifact_dir: Optional[str] = None,
         metadata_dataset_name: Optional[str] = None,
     ):
         super().__init__(model_name=model_name)
+        from byaldi import RAGMultiModalModel
+
         self._docs_retrieval_model = (
             docs_retrieval_model or RAGMultiModalModel.from_pretrained(self.model_name)
         )
@@ -78,25 +50,54 @@ class MultiModalRetriever(weave.Model):
             else None
         )
 
-    @classmethod
-    def from_artifact(
-        cls,
-        index_artifact_name: str,
-        metadata_dataset_name: str,
-        data_artifact_name: str,
-    ):
-        index_artifact_dir = get_wandb_artifact(index_artifact_name, "colpali-index")
-        data_artifact_dir = get_wandb_artifact(data_artifact_name, "dataset")
-        docs_retrieval_model = RAGMultiModalModel.from_index(
-            index_path=os.path.join(index_artifact_dir, "index")
-        )
-        return cls(
-            docs_retrieval_model=docs_retrieval_model,
-            metadata_dataset_name=metadata_dataset_name,
-            data_artifact_dir=data_artifact_dir,
-        )
-
     def index(self, data_artifact_name: str, weave_dataset_name: str, index_name: str):
+        """
+        Indexes a dataset of documents and saves the index as a Weave artifact.
+
+        This method retrieves a dataset of documents from a Weave artifact using the provided
+        data artifact name. It then indexes the documents using the document retrieval model
+        and assigns the specified index name. The index is stored locally without storing the
+        collection with the index and overwrites any existing index with the same name.
+
+        If a Weave run is active, the method creates a new Weave artifact with the specified
+        index name and type "colpali-index". It adds the local index directory to the artifact
+        and saves it to Weave, including metadata with the provided Weave dataset name.
+
+        !!! example "Indexing Data"
+            First you need to install `Byaldi` library by Answer.ai.
+
+            ```bash
+            uv pip install Byaldi>=0.0.5
+            ```
+
+            Next, you can index the data by running the following code:
+
+            ```python
+            import wandb
+            from medrag_multi_modal.retrieval import CalPaliRetriever
+
+            wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="index")
+            retriever = CalPaliRetriever()
+            retriever.index(
+                data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
+                weave_dataset_name="grays-anatomy-images:v0",
+                index_name="grays-anatomy",
+            )
+            ```
+
+        ??? note "Optional Speedup using Flash Attention"
+            If you have a GPU with Flash Attention support, you can enable it for ColPali by simply
+            installing the `flash-attn` package.
+
+            ```bash
+            uv pip install flash-attn --no-build-isolation
+            ```
+
+        Args:
+            data_artifact_name (str): The name of the Weave artifact containing the dataset.
+            weave_dataset_name (str): The name of the Weave dataset to include in the artifact metadata.
+            index_name (str): The name to assign to the created index.
+        """
         data_artifact_dir = get_wandb_artifact(data_artifact_name, "dataset")
         self._docs_retrieval_model.index(
             input_path=data_artifact_dir,
@@ -115,6 +116,76 @@ class MultiModalRetriever(weave.Model):
         )
         artifact.save()
 
+    @classmethod
+    def from_wandb_artifact(
+        cls,
+        index_artifact_name: str,
+        metadata_dataset_name: str,
+        data_artifact_name: str,
+    ):
+        """
+        Creates an instance of the class from Weights & Biases (wandb) artifacts.
+
+        This method retrieves the necessary artifacts from wandb to initialize the
+        ColPaliRetriever. It fetches the index artifact directory and the data artifact
+        directory using the provided artifact names. It then loads the document retrieval
+        model from the index path within the index artifact directory. Finally, it returns
+        an instance of the class initialized with the retrieved document retrieval model,
+        metadata dataset name, and data artifact directory.
+
+        !!! example "Retrieving Documents"
+            First you need to install `Byaldi` library by Answer.ai.
+
+            ```bash
+            uv pip install Byaldi>=0.0.5
+            ```
+
+            Next, you can retrieve the documents by running the following code:
+
+            ```python
+            import weave
+
+            import wandb
+            from medrag_multi_modal.retrieval import CalPaliRetriever
+
+            weave.init(project_name="ml-colabs/medrag-multi-modal")
+            retriever = CalPaliRetriever.from_wandb_artifact(
+                index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
+                metadata_dataset_name="grays-anatomy-images:v0",
+                data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
+            )
+            ```
+
+        ??? note "Optional Speedup using Flash Attention"
+            If you have a GPU with Flash Attention support, you can enable it for ColPali by simply
+            installing the `flash-attn` package.
+
+            ```bash
+            uv pip install flash-attn --no-build-isolation
+            ```
+
+        Args:
+            index_artifact_name (str): The name of the wandb artifact containing the index.
+            metadata_dataset_name (str): The name of the dataset containing metadata.
+            data_artifact_name (str): The name of the wandb artifact containing the data.
+
+        Returns:
+            An instance of the class initialized with the retrieved document retrieval model,
+            metadata dataset name, and data artifact directory.
+        """
+        from byaldi import RAGMultiModalModel
+
+        index_artifact_dir = get_wandb_artifact(index_artifact_name, "colpali-index")
+        data_artifact_dir = get_wandb_artifact(data_artifact_name, "dataset")
+        docs_retrieval_model = RAGMultiModalModel.from_index(
+            index_path=os.path.join(index_artifact_dir, "index")
+        )
+        return cls(
+            docs_retrieval_model=docs_retrieval_model,
+            metadata_dataset_name=metadata_dataset_name,
+            data_artifact_dir=data_artifact_dir,
+        )
+
     @weave.op()
     def predict(self, query: str, top_k: int = 3) -> list[dict[str, Any]]:
         """
@@ -125,6 +196,41 @@ class MultiModalRetriever(weave.Model):
         documents based on the provided query. It returns a list of dictionaries, each
         containing the document image, document ID, and the relevance score.
 
+        !!! example "Retrieving Documents"
+            First you need to install `Byaldi` library by Answer.ai.
+
+            ```bash
+            uv pip install Byaldi>=0.0.5
+            ```
+
+            Next, you can retrieve the documents by running the following code:
+
+            ```python
+            import weave
+
+            import wandb
+            from medrag_multi_modal.retrieval import CalPaliRetriever
+
+            weave.init(project_name="ml-colabs/medrag-multi-modal")
+            retriever = CalPaliRetriever.from_wandb_artifact(
+                index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
+                metadata_dataset_name="grays-anatomy-images:v0",
+                data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
+            )
+            retriever.predict(
+                query="which neurotransmitters convey information between Merkel cells and sensory afferents?",
+                top_k=3,
+            )
+            ```
+
+        ??? note "Optional Speedup using Flash Attention"
+            If you have a GPU with Flash Attention support, you can enable it for ColPali by simply
+            installing the `flash-attn` package.
+
+            ```bash
+            uv pip install flash-attn --no-build-isolation
+            ```
+
         Args:
             query (str): The search query string.
             top_k (int, optional): The number of top results to retrieve. Defaults to 10.
medrag_multi_modal/retrieval/common.py
ADDED
@@ -0,0 +1,45 @@
from enum import Enum

import safetensors
import safetensors.torch
import torch

import wandb


class SimilarityMetric(Enum):
    COSINE = "cosine"
    EUCLIDEAN = "euclidean"


def mean_pooling(token_embeddings, mask):
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.0)
    sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
    return sentence_embeddings


def argsort_scores(scores: list[float], descending: bool = False):
    return [
        {"item": item, "original_index": idx}
        for idx, item in sorted(
            list(enumerate(scores)), key=lambda x: x[1], reverse=descending
        )
    ]


def save_vector_index(
    vector_index: torch.Tensor,
    type: str,
    index_name: str,
    metadata: dict,
    filename: str = "vector_index.safetensors",
):
    safetensors.torch.save_file({"vector_index": vector_index.cpu()}, filename)
    if wandb.run:
        artifact = wandb.Artifact(
            name=index_name,
            type=type,
            metadata=metadata,
        )
        artifact.add_file(filename)
        artifact.save()
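For readers skimming the helpers above, here is a tiny self-contained check of what `mean_pooling` and `argsort_scores` compute (the toy tensors and scores are invented for illustration):

```python
import torch

from medrag_multi_modal.retrieval.common import argsort_scores, mean_pooling

# Three token embeddings of dimension 2; the third token is padding (mask = 0),
# so the sentence embedding is the mean of the first two tokens only.
token_embeddings = torch.tensor([[[1.0, 1.0], [3.0, 3.0], [9.0, 9.0]]])
mask = torch.tensor([[1, 1, 0]])
print(mean_pooling(token_embeddings, mask))  # tensor([[2., 2.]])

# argsort_scores keeps track of where each score came from, which is how the
# retrievers map sorted scores back to their original chunks.
print(argsort_scores([0.1, 0.9, 0.5], descending=True))
# [{'item': 0.9, 'original_index': 1}, {'item': 0.5, 'original_index': 2}, {'item': 0.1, 'original_index': 0}]
```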
medrag_multi_modal/retrieval/contriever_retrieval.py
ADDED
@@ -0,0 +1,240 @@
import os
from typing import Optional

import safetensors
import safetensors.torch
import torch
import torch.nn.functional as F
import weave
from transformers import (
    AutoModel,
    AutoTokenizer,
    BertPreTrainedModel,
    PreTrainedTokenizerFast,
)

from ..utils import get_torch_backend, get_wandb_artifact
from .common import SimilarityMetric, argsort_scores, mean_pooling, save_vector_index


class ContrieverRetriever(weave.Model):
    """
    `ContrieverRetriever` is a class to perform retrieval tasks using the Contriever model.

    It provides methods to encode text data into embeddings, index a dataset of text chunks,
    and retrieve the most relevant chunks for a given query based on similarity metrics.

    Args:
        model_name (str): The name of the pre-trained model to use for encoding.
        vector_index (Optional[torch.Tensor]): The tensor containing the vector representations
            of the indexed chunks.
        chunk_dataset (Optional[list[dict]]): The weave dataset of text chunks to be indexed.
    """

    model_name: str
    _chunk_dataset: Optional[list[dict]]
    _tokenizer: PreTrainedTokenizerFast
    _model: BertPreTrainedModel
    _vector_index: Optional[torch.Tensor]

    def __init__(
        self,
        model_name: str = "facebook/contriever",
        vector_index: Optional[torch.Tensor] = None,
        chunk_dataset: Optional[list[dict]] = None,
    ):
        super().__init__(model_name=model_name)
        self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self._model = AutoModel.from_pretrained(self.model_name)
        self._vector_index = vector_index
        self._chunk_dataset = chunk_dataset

    def encode(self, corpus: list[str]) -> torch.Tensor:
        inputs = self._tokenizer(
            corpus, padding=True, truncation=True, return_tensors="pt"
        )
        outputs = self._model(**inputs)
        return mean_pooling(outputs[0], inputs["attention_mask"])

    def index(self, chunk_dataset_name: str, index_name: Optional[str] = None):
        """
        Indexes a dataset of text chunks and optionally saves the vector index to a file.

        This method retrieves a dataset of text chunks from a Weave reference, encodes the
        text chunks into vector representations using the Contriever model, and stores the
        resulting vector index. If an index name is provided, the vector index is saved to
        a file in the safetensors format. Additionally, if a Weave run is active, the vector
        index file is logged as an artifact to Weave.

        !!! example "Example Usage"
            ```python
            import weave
            from dotenv import load_dotenv

            import wandb
            from medrag_multi_modal.retrieval import ContrieverRetriever, SimilarityMetric

            load_dotenv()
            weave.init(project_name="ml-colabs/medrag-multi-modal")
            wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="contriever-index")
            retriever = ContrieverRetriever(model_name="facebook/contriever")
            retriever.index(
                chunk_dataset_name="grays-anatomy-chunks:v0",
                index_name="grays-anatomy-contriever",
            )
            ```

        Args:
            chunk_dataset_name (str): The name of the Weave dataset containing the text chunks
                to be indexed.
            index_name (Optional[str]): The name of the index artifact to be saved. If provided,
                the vector index is saved to a file and logged as an artifact to Weave.
        """
        self._chunk_dataset = weave.ref(chunk_dataset_name).get().rows
        corpus = [row["text"] for row in self._chunk_dataset]
        with torch.no_grad():
            vector_index = self.encode(corpus)
        self._vector_index = vector_index
        if index_name:
            save_vector_index(
                self._vector_index,
                "contriever-index",
                index_name,
                {"model_name": self.model_name},
            )

    @classmethod
    def from_wandb_artifact(cls, chunk_dataset_name: str, index_artifact_address: str):
        """
        Creates an instance of the class from a Weave artifact.

        This method retrieves a vector index and metadata from a Weave artifact stored in
        Weights & Biases (wandb). It also retrieves a dataset of text chunks from a Weave
        reference. The vector index is loaded from a safetensors file and moved to the
        appropriate device (CPU or GPU). The text chunks are converted into a list of
        dictionaries. The method then returns an instance of the class initialized with
        the retrieved model name, vector index, and chunk dataset.

        !!! example "Example Usage"
            ```python
            import weave
            from dotenv import load_dotenv

            from medrag_multi_modal.retrieval import ContrieverRetriever, SimilarityMetric

            load_dotenv()
            weave.init(project_name="ml-colabs/medrag-multi-modal")
            retriever = ContrieverRetriever.from_wandb_artifact(
                chunk_dataset_name="grays-anatomy-chunks:v0",
                index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-contriever:v1",
            )
            ```

        Args:
            chunk_dataset_name (str): The name of the Weave dataset containing the text chunks.
            index_artifact_address (str): The address of the Weave artifact containing the
                vector index.

        Returns:
            An instance of the class initialized with the retrieved model name, vector index,
            and chunk dataset.
        """
        artifact_dir, metadata = get_wandb_artifact(
            index_artifact_address, "contriever-index", get_metadata=True
        )
        with safetensors.torch.safe_open(
            os.path.join(artifact_dir, "vector_index.safetensors"), framework="pt"
        ) as f:
            vector_index = f.get_tensor("vector_index")
        device = torch.device(get_torch_backend())
        vector_index = vector_index.to(device)
        chunk_dataset = [dict(row) for row in weave.ref(chunk_dataset_name).get().rows]
        return cls(
            model_name=metadata["model_name"],
            vector_index=vector_index,
            chunk_dataset=chunk_dataset,
        )

    @weave.op()
    def retrieve(
        self,
        query: str,
        top_k: int = 2,
        metric: SimilarityMetric = SimilarityMetric.COSINE,
    ):
        """
        Retrieves the top-k most relevant chunks for a given query using the specified similarity metric.

        This method encodes the input query into an embedding and computes similarity scores between
        the query embedding and the precomputed vector index. The similarity metric can be either
        cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
        are returned as a list of dictionaries, each containing a chunk and its corresponding score.

        Args:
            query (str): The input query string to search for relevant chunks.
            top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
            metric (SimilarityMetric, optional): The similarity metric to use for scoring.

        Returns:
            list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
        """
        query = [query]
        device = torch.device(get_torch_backend())
        with torch.no_grad():
            query_embedding = self.encode(query).to(device)
            if metric == SimilarityMetric.EUCLIDEAN:
                scores = torch.squeeze(query_embedding @ self._vector_index.T)
            else:
                scores = F.cosine_similarity(query_embedding, self._vector_index)
            scores = scores.cpu().numpy().tolist()
        scores = argsort_scores(scores, descending=True)[:top_k]
        retrieved_chunks = []
        for score in scores:
            retrieved_chunks.append(
                {
                    "chunk": self._chunk_dataset[score["original_index"]],
                    "score": score["item"],
                }
            )
        return retrieved_chunks

    @weave.op()
    def predict(
        self,
        query: str,
        top_k: int = 2,
        metric: SimilarityMetric = SimilarityMetric.COSINE,
    ):
        """
        Predicts the top-k most relevant chunks for a given query using the specified similarity metric.

        This function is a wrapper around the `retrieve` method. It takes an input query string,
        retrieves the top-k most relevant chunks from the precomputed vector index based on the
        specified similarity metric, and returns the results as a list of dictionaries, each containing
        a chunk and its corresponding relevance score.

        !!! example "Example Usage"
            ```python
            import weave
            from dotenv import load_dotenv

            from medrag_multi_modal.retrieval import ContrieverRetriever, SimilarityMetric

            load_dotenv()
            weave.init(project_name="ml-colabs/medrag-multi-modal")
            retriever = ContrieverRetriever.from_wandb_artifact(
                chunk_dataset_name="grays-anatomy-chunks:v0",
                index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-contriever:v1",
            )
            scores = retriever.predict(query="What are Ribosomes?", metric=SimilarityMetric.COSINE)
            ```

        Args:
            query (str): The input query string to search for relevant chunks.
            top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
            metric (SimilarityMetric, optional): The similarity metric to use for scoring. Defaults to cosine similarity.

        Returns:
            list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
        """
        return self.retrieve(query, top_k, metric)
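The scoring step in `retrieve` above is a broadcasted comparison of one query embedding against every row of the index. A shape-level sketch with random tensors (the sizes below are invented for illustration):

```python
import torch
import torch.nn.functional as F

query_embedding = torch.randn(1, 768)   # one encoded query
vector_index = torch.randn(10, 768)     # ten encoded chunks

# Cosine similarity broadcasts the single query over all index rows -> shape (10,)
scores = F.cosine_similarity(query_embedding, vector_index)
top = torch.topk(scores, k=2)
print(top.indices.tolist(), top.values.tolist())
```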
medrag_multi_modal/retrieval/medcpt_retrieval.py
ADDED
@@ -0,0 +1,279 @@
import os
from typing import Optional

import safetensors
import safetensors.torch
import torch
import torch.nn.functional as F
import weave
from transformers import (
    AutoModel,
    AutoTokenizer,
    BertPreTrainedModel,
    PreTrainedTokenizerFast,
)

from ..utils import get_torch_backend, get_wandb_artifact
from .common import SimilarityMetric, argsort_scores, save_vector_index


class MedCPTRetriever(weave.Model):
    """
    A class to retrieve relevant text chunks using MedCPT models.

    This class provides methods to index a dataset of text chunks and retrieve the most relevant
    chunks for a given query using MedCPT models. It uses separate models for encoding queries
    and articles, and supports both cosine similarity and Euclidean distance as similarity metrics.

    Args:
        query_encoder_model_name (str): The name of the model used for encoding queries.
        article_encoder_model_name (str): The name of the model used for encoding articles.
        chunk_size (Optional[int]): The maximum length of text chunks.
        vector_index (Optional[torch.Tensor]): The vector index of encoded text chunks.
        chunk_dataset (Optional[list[dict]]): The dataset of text chunks.
    """

    query_encoder_model_name: str
    article_encoder_model_name: str
    chunk_size: Optional[int]
    _chunk_dataset: Optional[list[dict]]
    _query_tokenizer: PreTrainedTokenizerFast
    _article_tokenizer: PreTrainedTokenizerFast
    _query_encoder_model: BertPreTrainedModel
    _article_encoder_model: BertPreTrainedModel
    _vector_index: Optional[torch.Tensor]

    def __init__(
        self,
        query_encoder_model_name: str,
        article_encoder_model_name: str,
        chunk_size: Optional[int] = None,
        vector_index: Optional[torch.Tensor] = None,
        chunk_dataset: Optional[list[dict]] = None,
    ):
        super().__init__(
            query_encoder_model_name=query_encoder_model_name,
            article_encoder_model_name=article_encoder_model_name,
            chunk_size=chunk_size,
        )
        self._query_tokenizer = AutoTokenizer.from_pretrained(
            self.query_encoder_model_name
        )
        self._article_tokenizer = AutoTokenizer.from_pretrained(
            self.article_encoder_model_name
        )
        self._query_encoder_model = AutoModel.from_pretrained(
            self.query_encoder_model_name
        )
        self._article_encoder_model = AutoModel.from_pretrained(
            self.article_encoder_model_name
        )
        self._chunk_dataset = chunk_dataset
        self._vector_index = vector_index

    def index(self, chunk_dataset_name: str, index_name: Optional[str] = None):
        """
        Indexes a dataset of text chunks and optionally saves the vector index.

        This method retrieves a dataset of text chunks from a Weave reference, encodes the text
        chunks using the article encoder model, and stores the resulting vector index. If an
        index name is provided, the vector index is saved to a file using the `save_vector_index`
        function.

        !!! example "Example Usage"
            ```python
            import weave
            from dotenv import load_dotenv

            import wandb
            from medrag_multi_modal.retrieval import MedCPTRetriever

            load_dotenv()
            weave.init(project_name="ml-colabs/medrag-multi-modal")
            wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="medcpt-index")
            retriever = MedCPTRetriever(
                query_encoder_model_name="ncbi/MedCPT-Query-Encoder",
                article_encoder_model_name="ncbi/MedCPT-Article-Encoder",
            )
            retriever.index(
                chunk_dataset_name="grays-anatomy-chunks:v0",
                index_name="grays-anatomy-medcpt",
            )
            ```

        Args:
            chunk_dataset_name (str): The name of the dataset containing text chunks to be indexed.
            index_name (Optional[str]): The name to use when saving the vector index. If not provided,
                the vector index is not saved.

        """
        self._chunk_dataset = weave.ref(chunk_dataset_name).get().rows
        corpus = [row["text"] for row in self._chunk_dataset]
        with torch.no_grad():
            encoded = self._article_tokenizer(
                corpus,
                truncation=True,
                padding=True,
                return_tensors="pt",
                max_length=self.chunk_size,
            )
            vector_index = (
                self._article_encoder_model(**encoded)
                .last_hidden_state[:, 0, :]
                .contiguous()
            )
        self._vector_index = vector_index
        if index_name:
            save_vector_index(
                self._vector_index,
                "medcpt-index",
                index_name,
                {
                    "query_encoder_model_name": self.query_encoder_model_name,
                    "article_encoder_model_name": self.article_encoder_model_name,
                    "chunk_size": self.chunk_size,
                },
            )

    @classmethod
    def from_wandb_artifact(cls, chunk_dataset_name: str, index_artifact_address: str):
        """
        Initializes an instance of the class from a Weave artifact.

        This method retrieves a precomputed vector index and its associated metadata from a Weave artifact
        stored in Weights & Biases (wandb). It then loads the vector index into memory and initializes an
        instance of the class with the retrieved model names, vector index, and chunk dataset.

        !!! example "Example Usage"
            ```python
            import weave
            from dotenv import load_dotenv

            import wandb
            from medrag_multi_modal.retrieval import MedCPTRetriever

            load_dotenv()
            weave.init(project_name="ml-colabs/medrag-multi-modal")
            retriever = MedCPTRetriever.from_wandb_artifact(
                chunk_dataset_name="grays-anatomy-chunks:v0",
                index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-medcpt:v0",
            )
            ```

        Args:
            chunk_dataset_name (str): The name of the dataset containing text chunks to be indexed.
            index_artifact_address (str): The address of the Weave artifact containing the precomputed vector index.

        Returns:
            An instance of the class initialized with the retrieved model name, vector index, and chunk dataset.
        """
        artifact_dir, metadata = get_wandb_artifact(
            index_artifact_address, "medcpt-index", get_metadata=True
        )
        with safetensors.torch.safe_open(
            os.path.join(artifact_dir, "vector_index.safetensors"), framework="pt"
        ) as f:
            vector_index = f.get_tensor("vector_index")
        device = torch.device(get_torch_backend())
        vector_index = vector_index.to(device)
        chunk_dataset = [dict(row) for row in weave.ref(chunk_dataset_name).get().rows]
        return cls(
            query_encoder_model_name=metadata["query_encoder_model_name"],
            article_encoder_model_name=metadata["article_encoder_model_name"],
            chunk_size=metadata["chunk_size"],
            vector_index=vector_index,
            chunk_dataset=chunk_dataset,
        )

    @weave.op()
    def retrieve(
        self,
        query: str,
        top_k: int = 2,
        metric: SimilarityMetric = SimilarityMetric.COSINE,
    ):
        """
        Retrieves the top-k most relevant chunks for a given query using the specified similarity metric.

        This method encodes the input query into an embedding and computes similarity scores between
        the query embedding and the precomputed vector index. The similarity metric can be either
        cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
        are returned as a list of dictionaries, each containing a chunk and its corresponding score.

        Args:
            query (str): The input query string to search for relevant chunks.
            top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
            metric (SimilarityMetric, optional): The similarity metric to use for scoring. Defaults to cosine similarity.

        Returns:
            list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
        """
        query = [query]
        device = torch.device(get_torch_backend())
        with torch.no_grad():
            encoded = self._query_tokenizer(
                query,
                truncation=True,
                padding=True,
                return_tensors="pt",
            )
            query_embedding = self._query_encoder_model(**encoded).last_hidden_state[
                :, 0, :
            ]
            query_embedding = query_embedding.to(device)
            if metric == SimilarityMetric.EUCLIDEAN:
                scores = torch.squeeze(query_embedding @ self._vector_index.T)
            else:
                scores = F.cosine_similarity(query_embedding, self._vector_index)
            scores = scores.cpu().numpy().tolist()
        scores = argsort_scores(scores, descending=True)[:top_k]
        retrieved_chunks = []
        for score in scores:
            retrieved_chunks.append(
                {
                    "chunk": self._chunk_dataset[score["original_index"]],
                    "score": score["item"],
                }
            )
        return retrieved_chunks

    @weave.op()
    def predict(
        self,
        query: str,
        top_k: int = 2,
        metric: SimilarityMetric = SimilarityMetric.COSINE,
    ):
        """
        Predicts the most relevant chunks for a given query.

        This function uses the `retrieve` method to find the top-k relevant chunks
        from the dataset based on the input query. It allows specifying the number
        of top relevant chunks to retrieve and the similarity metric to use for scoring.

        !!! example "Example Usage"
            ```python
            import weave
            from dotenv import load_dotenv

            import wandb
            from medrag_multi_modal.retrieval import MedCPTRetriever

            load_dotenv()
            weave.init(project_name="ml-colabs/medrag-multi-modal")
            retriever = MedCPTRetriever.from_wandb_artifact(
                chunk_dataset_name="grays-anatomy-chunks:v0",
                index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-medcpt:v0",
            )
            retriever.predict(query="What are Ribosomes?")
            ```

        Args:
            query (str): The input query string to search for relevant chunks.
            top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
            metric (SimilarityMetric, optional): The similarity metric to use for scoring. Defaults to cosine similarity.

        Returns:
            list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
        """
        return self.retrieve(query, top_k, metric)
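One detail worth noting in `MedCPTRetriever.index`: the article embeddings are taken from `last_hidden_state[:, 0, :]`, i.e. the [CLS] token vector of each encoded chunk. A shape-only sketch (the sizes are invented for illustration):

```python
import torch

# Pretend encoder output for 4 chunks, 128 tokens each, hidden size 768.
last_hidden_state = torch.randn(4, 128, 768)

# Keep the first ([CLS]) token of every chunk as its embedding, as index() does.
vector_index = last_hidden_state[:, 0, :].contiguous()
print(vector_index.shape)  # torch.Size([4, 768])
```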
medrag_multi_modal/retrieval/nv_embed_2.py
ADDED
@@ -0,0 +1,282 @@
import os
from typing import Optional

import safetensors
import torch
import torch.nn.functional as F
import weave
from sentence_transformers import SentenceTransformer

from ..utils import get_torch_backend, get_wandb_artifact
from .common import SimilarityMetric, argsort_scores, save_vector_index


class NVEmbed2Retriever(weave.Model):
    """
    `NVEmbed2Retriever` is a class for retrieving relevant text chunks from a dataset using the
    [NV-Embed-v2](https://huggingface.co/nvidia/NV-Embed-v2) model.

    This class leverages the SentenceTransformer model to encode text chunks into vector representations and
    performs similarity-based retrieval. It supports indexing a dataset of text chunks, saving the vector index,
    and retrieving the most relevant chunks for a given query.

    Args:
        model_name (str): The name of the pre-trained model to use for encoding.
        vector_index (Optional[torch.Tensor]): The tensor containing the vector representations of the indexed chunks.
        chunk_dataset (Optional[list[dict]]): The dataset of text chunks to be indexed.
    """

    model_name: str
    _chunk_dataset: Optional[list[dict]]
    _model: SentenceTransformer
    _vector_index: Optional[torch.Tensor]

    def __init__(
        self,
        model_name: str = "sentence-transformers/nvembed2-nli-v1",
        vector_index: Optional[torch.Tensor] = None,
        chunk_dataset: Optional[list[dict]] = None,
    ):
        super().__init__(model_name=model_name)
        self._model = SentenceTransformer(
            self.model_name,
            trust_remote_code=True,
            model_kwargs={"torch_dtype": torch.float16},
            device=get_torch_backend(),
        )
        self._model.max_seq_length = 32768
        self._model.tokenizer.padding_side = "right"
        self._vector_index = vector_index
        self._chunk_dataset = chunk_dataset

    def add_eos(self, input_examples):
        input_examples = [
            input_example + self._model.tokenizer.eos_token
            for input_example in input_examples
        ]
        return input_examples

    def index(self, chunk_dataset_name: str, index_name: Optional[str] = None):
        """
        Indexes a dataset of text chunks and optionally saves the vector index to a file.

        This method retrieves a dataset of text chunks from a Weave reference, encodes the
        text chunks into vector representations using the NV-Embed-v2 model, and stores the
        resulting vector index. If an index name is provided, the vector index is saved to
        a file in the safetensors format. Additionally, if a Weave run is active, the vector
        index file is logged as an artifact to Weave.

        !!! example "Example Usage"
            ```python
            import weave
            from dotenv import load_dotenv

            import wandb
            from medrag_multi_modal.retrieval import NVEmbed2Retriever

            load_dotenv()
            weave.init(project_name="ml-colabs/medrag-multi-modal")
            wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="nvembed2-index")
            retriever = NVEmbed2Retriever(model_name="nvidia/NV-Embed-v2")
            retriever.index(
                chunk_dataset_name="grays-anatomy-chunks:v0",
                index_name="grays-anatomy-nvembed2",
            )
            ```

        ??? note "Optional Speedup using Flash Attention"
            If you have a GPU with Flash Attention support, you can enable it for NV-Embed-v2 by simply
            installing the `flash-attn` package.

            ```bash
            uv pip install flash-attn --no-build-isolation
            ```

        Args:
            chunk_dataset_name (str): The name of the Weave dataset containing the text chunks
                to be indexed.
            index_name (Optional[str]): The name of the index artifact to be saved. If provided,
                the vector index is saved to a file and logged as an artifact to Weave.
        """
        self._chunk_dataset = weave.ref(chunk_dataset_name).get().rows
        corpus = [row["text"] for row in self._chunk_dataset]
        self._vector_index = self._model.encode(
            self.add_eos(corpus), batch_size=len(corpus), normalize_embeddings=True
        )
        with torch.no_grad():
            if index_name:
                save_vector_index(
                    torch.from_numpy(self._vector_index),
                    "nvembed2-index",
                    index_name,
                    {"model_name": self.model_name},
                )

    @classmethod
    def from_wandb_artifact(cls, chunk_dataset_name: str, index_artifact_address: str):
        """
        Creates an instance of the class from a Weave artifact.

        This method retrieves a vector index and metadata from a Weave artifact stored in
        Weights & Biases (wandb). It also retrieves a dataset of text chunks from a Weave
        reference. The vector index is loaded from a safetensors file and moved to the
        appropriate device (CPU or GPU). The text chunks are converted into a list of
        dictionaries. The method then returns an instance of the class initialized with
        the retrieved model name, vector index, and chunk dataset.

        !!! example "Example Usage"
            ```python
            import weave
            from dotenv import load_dotenv

            import wandb
            from medrag_multi_modal.retrieval import NVEmbed2Retriever

            load_dotenv()
            weave.init(project_name="ml-colabs/medrag-multi-modal")
            retriever = NVEmbed2Retriever(model_name="nvidia/NV-Embed-v2")
            retriever.index(
                chunk_dataset_name="grays-anatomy-chunks:v0",
                index_name="grays-anatomy-nvembed2",
            )
            retriever = NVEmbed2Retriever.from_wandb_artifact(
                chunk_dataset_name="grays-anatomy-chunks:v0",
                index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-nvembed2:v0",
            )
            ```

        ??? note "Optional Speedup using Flash Attention"
            If you have a GPU with Flash Attention support, you can enable it for NV-Embed-v2 by simply
            installing the `flash-attn` package.

+import os
+from typing import Optional
+
+import safetensors
+import torch
+import torch.nn.functional as F
+import weave
+from sentence_transformers import SentenceTransformer
+
+from ..utils import get_torch_backend, get_wandb_artifact
+from .common import SimilarityMetric, argsort_scores, save_vector_index
+
+
+class NVEmbed2Retriever(weave.Model):
+    """
+    `NVEmbed2Retriever` is a class for retrieving relevant text chunks from a dataset using the
+    [NV-Embed-v2](https://huggingface.co/nvidia/NV-Embed-v2) model.
+
+    This class leverages the SentenceTransformer model to encode text chunks into vector representations and
+    performs similarity-based retrieval. It supports indexing a dataset of text chunks, saving the vector index,
+    and retrieving the most relevant chunks for a given query.
+
+    Args:
+        model_name (str): The name of the pre-trained model to use for encoding.
+        vector_index (Optional[torch.Tensor]): The tensor containing the vector representations of the indexed chunks.
+        chunk_dataset (Optional[list[dict]]): The dataset of text chunks to be indexed.
+    """
+
+    model_name: str
+    _chunk_dataset: Optional[list[dict]]
+    _model: SentenceTransformer
+    _vector_index: Optional[torch.Tensor]
+
+    def __init__(
+        self,
+        model_name: str = "sentence-transformers/nvembed2-nli-v1",
+        vector_index: Optional[torch.Tensor] = None,
+        chunk_dataset: Optional[list[dict]] = None,
+    ):
+        super().__init__(model_name=model_name)
+        self._model = SentenceTransformer(
+            self.model_name,
+            trust_remote_code=True,
+            model_kwargs={"torch_dtype": torch.float16},
+            device=get_torch_backend(),
+        )
+        self._model.max_seq_length = 32768
+        self._model.tokenizer.padding_side = "right"
+        self._vector_index = vector_index
+        self._chunk_dataset = chunk_dataset
+
+    def add_eos(self, input_examples):
+        input_examples = [
+            input_example + self._model.tokenizer.eos_token
+            for input_example in input_examples
+        ]
+        return input_examples
+
+    def index(self, chunk_dataset_name: str, index_name: Optional[str] = None):
+        """
+        Indexes a dataset of text chunks and optionally saves the vector index to a file.
+
+        This method retrieves a dataset of text chunks from a Weave reference, encodes the
+        text chunks into vector representations using the NV-Embed-v2 model, and stores the
+        resulting vector index. If an index name is provided, the vector index is saved to
+        a file in the safetensors format. Additionally, if a Weave run is active, the vector
+        index file is logged as an artifact to Weave.
+
+        !!! example "Example Usage"
+            ```python
+            import weave
+            from dotenv import load_dotenv
+
+            import wandb
+            from medrag_multi_modal.retrieval import NVEmbed2Retriever
+
+            load_dotenv()
+            weave.init(project_name="ml-colabs/medrag-multi-modal")
+            wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="nvembed2-index")
+            retriever = NVEmbed2Retriever(model_name="nvidia/NV-Embed-v2")
+            retriever.index(
+                chunk_dataset_name="grays-anatomy-chunks:v0",
+                index_name="grays-anatomy-nvembed2",
+            )
+            ```
+
+        ??? note "Optional Speedup using Flash Attention"
+            If you have a GPU with Flash Attention support, you can enable it for NV-Embed-v2 by simply
+            installing the `flash-attn` package.
+
+            ```bash
+            uv pip install flash-attn --no-build-isolation
+            ```
+
+        Args:
+            chunk_dataset_name (str): The name of the Weave dataset containing the text chunks
+                to be indexed.
+            index_name (Optional[str]): The name of the index artifact to be saved. If provided,
+                the vector index is saved to a file and logged as an artifact to Weave.
+        """
+        self._chunk_dataset = weave.ref(chunk_dataset_name).get().rows
+        corpus = [row["text"] for row in self._chunk_dataset]
+        self._vector_index = self._model.encode(
+            self.add_eos(corpus), batch_size=len(corpus), normalize_embeddings=True
+        )
+        with torch.no_grad():
+            if index_name:
+                save_vector_index(
+                    torch.from_numpy(self._vector_index),
+                    "nvembed2-index",
+                    index_name,
+                    {"model_name": self.model_name},
+                )
+
+    @classmethod
+    def from_wandb_artifact(cls, chunk_dataset_name: str, index_artifact_address: str):
+        """
+        Creates an instance of the class from a Weave artifact.
+
+        This method retrieves a vector index and metadata from a Weave artifact stored in
+        Weights & Biases (wandb). It also retrieves a dataset of text chunks from a Weave
+        reference. The vector index is loaded from a safetensors file and moved to the
+        appropriate device (CPU or GPU). The text chunks are converted into a list of
+        dictionaries. The method then returns an instance of the class initialized with
+        the retrieved model name, vector index, and chunk dataset.
+
+        !!! example "Example Usage"
+            ```python
+            import weave
+            from dotenv import load_dotenv
+
+            import wandb
+            from medrag_multi_modal.retrieval import NVEmbed2Retriever
+
+            load_dotenv()
+            weave.init(project_name="ml-colabs/medrag-multi-modal")
+            retriever = NVEmbed2Retriever(model_name="nvidia/NV-Embed-v2")
+            retriever.index(
+                chunk_dataset_name="grays-anatomy-chunks:v0",
+                index_name="grays-anatomy-nvembed2",
+            )
+            retriever = NVEmbed2Retriever.from_wandb_artifact(
+                chunk_dataset_name="grays-anatomy-chunks:v0",
+                index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-nvembed2:v0",
+            )
+            ```
+
+        ??? note "Optional Speedup using Flash Attention"
+            If you have a GPU with Flash Attention support, you can enable it for NV-Embed-v2 by simply
+            installing the `flash-attn` package.
+
+            ```bash
+            uv pip install flash-attn --no-build-isolation
+            ```
+
+        Args:
+            chunk_dataset_name (str): The name of the Weave dataset containing the text chunks.
+            index_artifact_address (str): The address of the Weave artifact containing the
+                vector index.
+
+        Returns:
+            An instance of the class initialized with the retrieved model name, vector index,
+            and chunk dataset.
+        """
+        artifact_dir, metadata = get_wandb_artifact(
+            index_artifact_address, "nvembed2-index", get_metadata=True
+        )
+        with safetensors.torch.safe_open(
+            os.path.join(artifact_dir, "vector_index.safetensors"), framework="pt"
+        ) as f:
+            vector_index = f.get_tensor("vector_index")
+        device = torch.device(get_torch_backend())
+        vector_index = vector_index.to(device)
+        chunk_dataset = [dict(row) for row in weave.ref(chunk_dataset_name).get().rows]
+        return cls(
+            model_name=metadata["model_name"],
+            vector_index=vector_index,
+            chunk_dataset=chunk_dataset,
+        )
+
+    @weave.op()
+    def retrieve(
+        self,
+        query: list[str],
+        top_k: int = 2,
+        metric: SimilarityMetric = SimilarityMetric.COSINE,
+    ):
+        """
+        Retrieves the top-k most relevant chunks for a given query using the specified similarity metric.
+
+        This method encodes the input query into an embedding and computes similarity scores between
+        the query embedding and the precomputed vector index. The similarity metric can be either
+        cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
+        are returned as a list of dictionaries, each containing a chunk and its corresponding score.
+
+        Args:
+            query (list[str]): The input query strings to search for relevant chunks.
+            top_k (int, optional): The number of top relevant chunks to retrieve.
+            metric (SimilarityMetric, optional): The similarity metric to use for scoring.
+
+        Returns:
+            list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
+        """
+        device = torch.device(get_torch_backend())
+        with torch.no_grad():
+            query_embedding = self._model.encode(
+                self.add_eos(query), normalize_embeddings=True
+            )
+            query_embedding = torch.from_numpy(query_embedding).to(device)
+            if metric == SimilarityMetric.EUCLIDEAN:
+                scores = torch.squeeze(query_embedding @ self._vector_index.T)
+            else:
+                scores = F.cosine_similarity(query_embedding, self._vector_index)
+            scores = scores.cpu().numpy().tolist()
+        scores = argsort_scores(scores, descending=True)[:top_k]
+        retrieved_chunks = []
+        for score in scores:
+            retrieved_chunks.append(
+                {
+                    "chunk": self._chunk_dataset[score["original_index"]],
+                    "score": score["item"],
+                }
+            )
+        return retrieved_chunks
+
+    @weave.op()
+    def predict(
+        self,
+        query: str,
+        top_k: int = 2,
+        metric: SimilarityMetric = SimilarityMetric.COSINE,
+    ):
+        """
+        Predicts the top-k most relevant chunks for a given query using the specified similarity metric.
+
+        This method formats the input query string by prepending an instruction prompt and then calls the
+        `retrieve` method to get the most relevant chunks. The similarity metric can be either cosine similarity
+        or Euclidean distance. The top-k chunks with the highest similarity scores are returned.
+
+        !!! example "Example Usage"
+            ```python
+            import weave
+            from dotenv import load_dotenv
+
+            import wandb
+            from medrag_multi_modal.retrieval import NVEmbed2Retriever
+
+            load_dotenv()
+            weave.init(project_name="ml-colabs/medrag-multi-modal")
+            retriever = NVEmbed2Retriever(model_name="nvidia/NV-Embed-v2")
+            retriever.index(
+                chunk_dataset_name="grays-anatomy-chunks:v0",
+                index_name="grays-anatomy-nvembed2",
+            )
+            retriever = NVEmbed2Retriever.from_wandb_artifact(
+                chunk_dataset_name="grays-anatomy-chunks:v0",
+                index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-nvembed2:v0",
+            )
+            retriever.predict(query="What are Ribosomes?")
+            ```
+
+        ??? note "Optional Speedup using Flash Attention"
+            If you have a GPU with Flash Attention support, you can enable it for NV-Embed-v2 by simply
+            installing the `flash-attn` package.
+
+            ```bash
+            uv pip install flash-attn --no-build-isolation
+            ```
+
+        Args:
+            query (str): The input query string to search for relevant chunks.
+            top_k (int, optional): The number of top relevant chunks to retrieve.
+            metric (SimilarityMetric, optional): The similarity metric to use for scoring.
+
+        Returns:
+            list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
+        """
+        query = [
+            f"""Instruct: Given a question, retrieve passages that answer the question
+Query: {query}"""
+        ]
+        return self.retrieve(query, top_k, metric)
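For orientation, the ranking that `NVEmbed2Retriever.retrieve` performs reduces to a cosine-similarity comparison between the encoded query and the rows of the precomputed vector index, followed by a top-k sort. The standalone sketch below mirrors that step with plain PyTorch; the embedding dimension, the random tensors, and the `chunks` list are illustrative stand-ins, not values produced by this module.

```python
import torch
import torch.nn.functional as F

# Illustrative stand-ins: 4 indexed chunks with 8-dim embeddings.
# Real NV-Embed-v2 embeddings are much higher-dimensional and come from
# SentenceTransformer.encode(..., normalize_embeddings=True).
vector_index = F.normalize(torch.randn(4, 8), dim=-1)
query_embedding = F.normalize(torch.randn(1, 8), dim=-1)
chunks = [{"text": f"chunk-{i}"} for i in range(4)]

# Cosine similarity of the query against every indexed chunk, then top-k.
scores = F.cosine_similarity(query_embedding, vector_index)  # shape: (4,)
top_scores, top_indices = torch.topk(scores, k=2)

retrieved = [
    {"chunk": chunks[i], "score": s.item()}
    for i, s in zip(top_indices.tolist(), top_scores)
]
print(retrieved)
```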
medrag_multi_modal/utils.py
CHANGED
@@ -1,7 +1,13 @@
+import torch
+
 import wandb
 
 
-def get_wandb_artifact(artifact_name: str, artifact_type: str) -> str:
+def get_wandb_artifact(
+    artifact_name: str,
+    artifact_type: str,
+    get_metadata: bool = False,
+) -> str:
     if wandb.run:
         artifact = wandb.use_artifact(artifact_name, type=artifact_type)
         artifact_dir = artifact.download()
@@ -9,4 +15,17 @@ def get_wandb_artifact(artifact_name: str, artifact_type: str) -> str:
         api = wandb.Api()
         artifact = api.artifact(artifact_name)
         artifact_dir = artifact.download()
+    if get_metadata:
+        return artifact_dir, artifact.metadata
     return artifact_dir
+
+
+def get_torch_backend():
+    if torch.cuda.is_available():
+        if torch.backends.cuda.is_built():
+            return "cuda"
+    if torch.backends.mps.is_available():
+        if torch.backends.mps.is_built():
+            return "mps"
+        return "cpu"
+    return "cpu"
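A minimal usage sketch of the two helpers changed above, showing device selection and metadata retrieval together; the artifact address and artifact type below are placeholders rather than values from this repository.

```python
import torch

from medrag_multi_modal.utils import get_torch_backend, get_wandb_artifact

# Resolves to "cuda", "mps", or "cpu" depending on the local PyTorch build.
device = torch.device(get_torch_backend())

# With get_metadata=True the helper returns the download directory together
# with the artifact's metadata dict (placeholder artifact address below).
artifact_dir, metadata = get_wandb_artifact(
    "entity/project/some-index:v0", "some-index-type", get_metadata=True
)
print(device, artifact_dir, metadata)
```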
mkdocs.yml
CHANGED
@@ -78,6 +78,10 @@ nav:
       - FitzPIL: 'document_loader/image_loader/fitzpil_img_loader.md'
   - Chunking: 'chunking.md'
   - Retrieval:
-    - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
+    - BM25-Sparse: 'retreival/bm25s.md'
+    - ColPali: 'retreival/colpali.md'
+    - Contriever: 'retreival/contriever.md'
+    - MedCPT: 'retreival/medcpt.md'
+    - NV-Embed-v2: 'retreival/nv_embed_2.md'
 
 repo_url: https://github.com/soumik12345/medrag-multi-modal
pyproject.toml
CHANGED
@@ -5,8 +5,12 @@ description = ""
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
-    "
+    "adapters>=1.0.0",
+    "bm25s[full]>=0.2.2",
+    "datasets>=3.0.1",
+    "einops>=0.8.0",
     "firerequests>=0.0.7",
+    "jax[cpu]>=0.4.34",
     "pdf2image>=1.17.0",
     "python-dotenv>=1.0.1",
     "pymupdf4llm>=0.0.17",
@@ -16,6 +20,8 @@ dependencies = [
     "uv>=0.4.20",
     "pytest>=8.3.3",
     "PyPDF2>=3.0.1",
+    "PyStemmer>=2.2.0.3",
+    "safetensors>=0.4.5",
     "isort>=5.13.2",
     "black>=24.10.0",
     "ruff>=0.6.9",
@@ -31,30 +37,34 @@ dependencies = [
     "pdfplumber>=0.11.4",
     "semchunk>=2.2.0",
     "tiktoken>=0.8.0",
+    "sentence-transformers>=3.2.0",
 ]
 
 [project.optional-dependencies]
 core = [
-    "
+    "adapters>=1.0.0",
+    "bm25s[full]>=0.2.2",
+    "datasets>=3.0.1",
+    "einops>=0.8.0",
     "firerequests>=0.0.7",
+    "jax[cpu]>=0.4.34",
     "marker-pdf>=0.2.17",
     "pdf2image>=1.17.0",
     "pdfplumber>=0.11.4",
     "PyPDF2>=3.0.1",
+    "PyStemmer>=2.2.0.3",
     "python-dotenv>=1.0.1",
     "pymupdf4llm>=0.0.17",
+    "safetensors>=0.4.5",
     "semchunk>=2.2.0",
     "tiktoken>=0.8.0",
     "torch>=2.4.1",
     "weave>=0.51.14",
+    "sentence-transformers>=3.2.0",
 ]
 
-dev = [
-
-    "isort>=5.13.2",
-    "black>=24.10.0",
-    "ruff>=0.6.9",
-]
+dev = ["pytest>=8.3.3", "isort>=5.13.2", "black>=24.10.0", "ruff>=0.6.9"]
+
 
 docs = [
     "mkdocs>=1.6.1",