Spaces:

geekyrakshit
/

medrag

Running

App Files Files Community

geekyrakshit committed on Oct 20, 2024

Commit

0d77bb1

1 Parent(s): 77a97ce

add: MedCPTRetriever

Browse files

Files changed (7) hide show

docs/retreival/medcpt.md +3 -0
medrag_multi_modal/retrieval/__init__.py +4 -1
medrag_multi_modal/retrieval/common.py +2 -1
medrag_multi_modal/retrieval/contriever_retrieval.py +11 -15
medrag_multi_modal/retrieval/medcpt_retrieval.py +255 -0
medrag_multi_modal/utils.py +4 -1
mkdocs.yml +1 -0

docs/retreival/medcpt.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # MedCPT Retrieval
2	+
3	+ ::: medrag_multi_modal.retrieval.medcpt_retrieval

medrag_multi_modal/retrieval/__init__.py CHANGED Viewed

@@ -1,10 +1,13 @@
 from .bm25s_retrieval import BM25sRetriever
 from .colpali_retrieval import CalPaliRetriever
-from .contriever_retrieval import ContrieverRetriever, SimilarityMetric
 __all__ = [
     "CalPaliRetriever",
     "BM25sRetriever",
     "ContrieverRetriever",
     "SimilarityMetric",
 ]

 from .bm25s_retrieval import BM25sRetriever
 from .colpali_retrieval import CalPaliRetriever
+from .common import SimilarityMetric
+from .contriever_retrieval import ContrieverRetriever
+from .medcpt_retrieval import MedCPTRetriever
 __all__ = [
     "CalPaliRetriever",
     "BM25sRetriever",
     "ContrieverRetriever",
     "SimilarityMetric",
+    "MedCPTRetriever",
 ]

medrag_multi_modal/retrieval/common.py CHANGED Viewed

@@ -29,6 +29,7 @@ def argsort_scores(scores: list[float], descending: bool = False):
 def save_vector_index(
     vector_index: torch.Tensor,
     index_name: str,
     metadata: dict,
     filename: str = "vector_index.safetensors",
@@ -37,7 +38,7 @@ def save_vector_index(
     if wandb.run:
         artifact = wandb.Artifact(
             name=index_name,
-            type="contriever-index",
             metadata=metadata,
         )
         artifact.add_file(filename)

 def save_vector_index(
     vector_index: torch.Tensor,
+    type: str,
     index_name: str,
     metadata: dict,
     filename: str = "vector_index.safetensors",
     if wandb.run:
         artifact = wandb.Artifact(
             name=index_name,
+            type=type,
             metadata=metadata,
         )
         artifact.add_file(filename)

medrag_multi_modal/retrieval/contriever_retrieval.py CHANGED Viewed

@@ -13,10 +13,8 @@ from transformers import (
     PreTrainedTokenizerFast,
 )
-import wandb
-from ..utils import get_wandb_artifact, get_torch_backend
-from .common import SimilarityMetric, argsort_scores, mean_pooling
 class ContrieverRetriever(weave.Model):
@@ -80,7 +78,10 @@ class ContrieverRetriever(weave.Model):
             weave.init(project_name="ml-colabs/medrag-multi-modal")
             wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="contriever-index")
             retriever = ContrieverRetriever(model_name="facebook/contriever")
-            retriever.index(chunk_dataset_name="grays-anatomy-chunks:v0", index_name="grays-anatomy-contriever")
             ```
         Args:
@@ -95,17 +96,12 @@ class ContrieverRetriever(weave.Model):
             vector_index = self.encode(corpus)
             self._vector_index = vector_index
             if index_name:
-                safetensors.torch.save_file(
-                    {"vector_index": vector_index.cpu()}, "vector_index.safetensors"
                 )
-                if wandb.run:
-                    artifact = wandb.Artifact(
-                        name=index_name,
-                        type="contriever-index",
-                        metadata={"model_name": self.model_name},
-                    )
-                    artifact.add_file("vector_index.safetensors")
-                    artifact.save()
     @classmethod
     def from_wandb_artifact(cls, chunk_dataset_name: str, index_artifact_address: str):

     PreTrainedTokenizerFast,
 )
+from ..utils import get_torch_backend, get_wandb_artifact
+from .common import SimilarityMetric, argsort_scores, mean_pooling, save_vector_index
 class ContrieverRetriever(weave.Model):
             weave.init(project_name="ml-colabs/medrag-multi-modal")
             wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="contriever-index")
             retriever = ContrieverRetriever(model_name="facebook/contriever")
+            retriever.index(
+                chunk_dataset_name="grays-anatomy-chunks:v0",
+                index_name="grays-anatomy-contriever",
+            )
             ```
         Args:
             vector_index = self.encode(corpus)
             self._vector_index = vector_index
             if index_name:
+                save_vector_index(
+                    self._vector_index,
+                    "contriever-index",
+                    index_name,
+                    {"model_name": self.model_name},
                 )
     @classmethod
     def from_wandb_artifact(cls, chunk_dataset_name: str, index_artifact_address: str):

medrag_multi_modal/retrieval/medcpt_retrieval.py ADDED Viewed

	@@ -0,0 +1,255 @@

+import os
+from typing import Optional
+import safetensors
+import safetensors.torch
+import torch
+import torch.nn.functional as F
+import weave
+from transformers import (
+    AutoModel,
+    AutoTokenizer,
+    BertPreTrainedModel,
+    PreTrainedTokenizerFast,
+)
+from ..utils import get_torch_backend, get_wandb_artifact
+from .common import SimilarityMetric, argsort_scores, save_vector_index
+class MedCPTRetriever(weave.Model):
+    """
+    A class to retrieve relevant text chunks using MedCPT models.
+    This class provides methods to index a dataset of text chunks and retrieve the most relevant
+    chunks for a given query using MedCPT models. It uses separate models for encoding queries
+    and articles, and supports both cosine similarity and Euclidean distance as similarity metrics.
+    Args:
+        query_encoder_model_name (str): The name of the model used for encoding queries.
+        article_encoder_model_name (str): The name of the model used for encoding articles.
+        chunk_size (Optional[int]): The maximum length of text chunks.
+        vector_index (Optional[torch.Tensor]): The vector index of encoded text chunks.
+        chunk_dataset (Optional[list[dict]]): The dataset of text chunks.
+    """
+    query_encoder_model_name: str
+    article_encoder_model_name: str
+    chunk_size: Optional[int]
+    _chunk_dataset: Optional[list[dict]]
+    _query_tokenizer: PreTrainedTokenizerFast
+    _article_tokenizer: PreTrainedTokenizerFast
+    _query_encoder_model: BertPreTrainedModel
+    _article_encoder_model: BertPreTrainedModel
+    _vector_index: Optional[torch.Tensor]
+    def __init__(
+        self,
+        query_encoder_model_name: str,
+        article_encoder_model_name: str,
+        chunk_size: Optional[int] = None,
+        vector_index: Optional[torch.Tensor] = None,
+        chunk_dataset: Optional[list[dict]] = None,
+    ):
+        super().__init__(
+            query_encoder_model_name=query_encoder_model_name,
+            article_encoder_model_name=article_encoder_model_name,
+            chunk_size=chunk_size,
+        )
+        self._query_tokenizer = AutoTokenizer.from_pretrained(
+            self.query_encoder_model_name
+        )
+        self._article_tokenizer = AutoTokenizer.from_pretrained(
+            self.article_encoder_model_name
+        )
+        self._query_encoder_model = AutoModel.from_pretrained(
+            self.query_encoder_model_name
+        )
+        self._article_encoder_model = AutoModel.from_pretrained(
+            self.article_encoder_model_name
+        )
+        self._chunk_dataset = chunk_dataset
+        self._vector_index = vector_index
+    def index(self, chunk_dataset_name: str, index_name: Optional[str] = None):
+        """
+        Indexes a dataset of text chunks and optionally saves the vector index.
+        This method retrieves a dataset of text chunks from a Weave reference, encodes the text
+        chunks using the article encoder model, and stores the resulting vector index. If an
+        index name is provided, the vector index is saved to a file using the `save_vector_index`
+        function.
+        !!! example "Example Usage"
+            ```python
+            import weave
+            from dotenv import load_dotenv
+            import wandb
+            from medrag_multi_modal.retrieval import MedCPTRetriever
+            load_dotenv()
+            weave.init(project_name="ml-colabs/medrag-multi-modal")
+            wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="medcpt-index")
+            retriever = MedCPTRetriever(
+                query_encoder_model_name="ncbi/MedCPT-Query-Encoder",
+                article_encoder_model_name="ncbi/MedCPT-Article-Encoder",
+            )
+            retriever.index(
+                chunk_dataset_name="grays-anatomy-chunks:v0",
+                index_name="grays-anatomy-medcpt",
+            )
+            ```
+        Args:
+            chunk_dataset_name (str): The name of the dataset containing text chunks to be indexed.
+            index_name (Optional[str]): The name to use when saving the vector index. If not provided,
+                the vector index is not saved.
+        """
+        self._chunk_dataset = weave.ref(chunk_dataset_name).get().rows
+        corpus = [row["text"] for row in self._chunk_dataset]
+        with torch.no_grad():
+            encoded = self._article_tokenizer(
+                corpus,
+                truncation=True,
+                padding=True,
+                return_tensors="pt",
+                max_length=self.chunk_size,
+            )
+            vector_index = (
+                self._article_encoder_model(**encoded)
+                .last_hidden_state[:, 0, :]
+                .contiguous()
+            )
+            self._vector_index = vector_index
+            if index_name:
+                save_vector_index(
+                    self._vector_index,
+                    "medcpt-index",
+                    index_name,
+                    {
+                        "query_encoder_model_name": self.query_encoder_model_name,
+                        "article_encoder_model_name": self.article_encoder_model_name,
+                        "chunk_size": self.chunk_size,
+                    },
+                )
+    @classmethod
+    def from_wandb_artifact(cls, chunk_dataset_name: str, index_artifact_address: str):
+        """
+        Initializes an instance of the class from a Weave artifact.
+        This method retrieves a precomputed vector index and its associated metadata from a Weave artifact
+        stored in Weights & Biases (wandb). It then loads the vector index into memory and initializes an
+        instance of the class with the retrieved model names, vector index, and chunk dataset.
+        !!! example "Example Usage"
+            ```python
+            import weave
+            from dotenv import load_dotenv
+            import wandb
+            from medrag_multi_modal.retrieval import MedCPTRetriever
+            load_dotenv()
+            weave.init(project_name="ml-colabs/medrag-multi-modal")
+            retriever = MedCPTRetriever.from_wandb_artifact(
+                chunk_dataset_name="grays-anatomy-chunks:v0",
+                index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-medcpt:v0",
+            )
+            ```
+        Args:
+            chunk_dataset_name (str): The name of the dataset containing text chunks to be indexed.
+            index_artifact_address (str): The address of the Weave artifact containing the precomputed vector index.
+        Returns:
+            An instance of the class initialized with the retrieved model name, vector index, and chunk dataset.
+        """
+        artifact_dir, metadata = get_wandb_artifact(
+            index_artifact_address, "medcpt-index", get_metadata=True
+        )
+        with safetensors.torch.safe_open(
+            os.path.join(artifact_dir, "vector_index.safetensors"), framework="pt"
+        ) as f:
+            vector_index = f.get_tensor("vector_index")
+        device = torch.device(get_torch_backend())
+        vector_index = vector_index.to(device)
+        chunk_dataset = [dict(row) for row in weave.ref(chunk_dataset_name).get().rows]
+        return cls(
+            query_encoder_model_name=metadata["query_encoder_model_name"],
+            article_encoder_model_name=metadata["article_encoder_model_name"],
+            chunk_size=metadata["chunk_size"],
+            vector_index=vector_index,
+            chunk_dataset=chunk_dataset,
+        )
+    @weave.op()
+    def retrieve(
+        self,
+        query: str,
+        top_k: int = 2,
+        metric: SimilarityMetric = SimilarityMetric.COSINE,
+    ):
+        """
+        Retrieves the top-k most relevant chunks for a given query using the specified similarity metric.
+        This method encodes the input query into an embedding and computes similarity scores between
+        the query embedding and the precomputed vector index. The similarity metric can be either
+        cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
+        are returned as a list of dictionaries, each containing a chunk and its corresponding score.
+        !!! example "Example Usage"
+            ```python
+            import weave
+            from dotenv import load_dotenv
+            import wandb
+            from medrag_multi_modal.retrieval import MedCPTRetriever
+            load_dotenv()
+            weave.init(project_name="ml-colabs/medrag-multi-modal")
+            retriever = MedCPTRetriever.from_wandb_artifact(
+                chunk_dataset_name="grays-anatomy-chunks:v0",
+                index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-medcpt:v0",
+            )
+            retriever.retrieve(query="What are Ribosomes?")
+            ```
+        Args:
+            query (str): The input query string to search for relevant chunks.
+            top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
+            metric (SimilarityMetric, optional): The similarity metric to use for scoring. Defaults to cosine similarity.
+        Returns:
+            list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
+        """
+        query = [query]
+        device = torch.device(get_torch_backend())
+        with torch.no_grad():
+            encoded = self._query_tokenizer(
+                query,
+                truncation=True,
+                padding=True,
+                return_tensors="pt",
+            )
+            query_embedding = self._query_encoder_model(**encoded).last_hidden_state[
+                :, 0, :
+            ]
+            query_embedding = query_embedding.to(device)
+            if metric == SimilarityMetric.EUCLIDEAN:
+                scores = torch.squeeze(query_embedding @ self._vector_index.T)
+            else:
+                scores = F.cosine_similarity(query_embedding, self._vector_index)
+            scores = scores.cpu().numpy().tolist()
+        scores = argsort_scores(scores, descending=True)[:top_k]
+        retrieved_chunks = []
+        for score in scores:
+            retrieved_chunks.append(
+                {
+                    "chunk": self._chunk_dataset[score["original_index"]],
+                    "score": score["item"],
+                }
+            )
+        return retrieved_chunks

medrag_multi_modal/utils.py CHANGED Viewed

@@ -1,9 +1,12 @@
 import torch
 import wandb
 def get_wandb_artifact(
-    artifact_name: str, artifact_type: str, get_metadata: bool = False
 ) -> str:
     if wandb.run:
         artifact = wandb.use_artifact(artifact_name, type=artifact_type)

 import torch
 import wandb
 def get_wandb_artifact(
+    artifact_name: str,
+    artifact_type: str,
+    get_metadata: bool = False,
 ) -> str:
     if wandb.run:
         artifact = wandb.use_artifact(artifact_name, type=artifact_type)

mkdocs.yml CHANGED Viewed

@@ -81,5 +81,6 @@ nav:
     - BM25-Sparse: 'retreival/bm25s.md'
     - ColPali: 'retreival/colpali.md'
     - Contriever: 'retreival/contriever.md'
 repo_url: https://github.com/soumik12345/medrag-multi-modal

     - BM25-Sparse: 'retreival/bm25s.md'
     - ColPali: 'retreival/colpali.md'
     - Contriever: 'retreival/contriever.md'
+    - MedCPT: 'retreival/medcpt.md'
 repo_url: https://github.com/soumik12345/medrag-multi-modal