Commit a19b9c3 (unverified) · Atanu Sarkar
Parents: 8bd2693 6c6905f

Merge pull request #21 from soumik12345/feat/llm-client
.gitignore CHANGED
@@ -17,6 +17,7 @@ wandb/
 .byaldi/
 cursor_prompt.txt
 test.py
+test.ipynb
 uv.lock
 grays-anatomy-bm25s/
 prompt**.txt
docs/assistant/figure_annotation.md ADDED
@@ -0,0 +1,3 @@
+# Figure Annotation
+
+::: medrag_multi_modal.assistant.figure_annotation
docs/assistant/llm_client.md ADDED
@@ -0,0 +1,3 @@
+# LLM Client
+
+::: medrag_multi_modal.assistant.llm_client
docs/assistant/medqa_assistant.md ADDED
@@ -0,0 +1,3 @@
+# MedQA Assistant
+
+::: medrag_multi_modal.assistant.medqa_assistant
medrag_multi_modal/assistant/__init__.py ADDED
@@ -0,0 +1,5 @@
+from .figure_annotation import FigureAnnotatorFromPageImage
+from .llm_client import ClientType, LLMClient
+from .medqa_assistant import MedQAAssistant
+
+__all__ = ["LLMClient", "ClientType", "MedQAAssistant", "FigureAnnotatorFromPageImage"]
medrag_multi_modal/assistant/figure_annotation.py ADDED
@@ -0,0 +1,156 @@
+import os
+from glob import glob
+from typing import Optional, Union
+
+import cv2
+import weave
+from PIL import Image
+from pydantic import BaseModel
+
+from ..utils import get_wandb_artifact, read_jsonl_file
+from .llm_client import LLMClient
+
+
+class FigureAnnotation(BaseModel):
+    figure_id: str
+    figure_description: str
+
+
+class FigureAnnotations(BaseModel):
+    annotations: list[FigureAnnotation]
+
+
+class FigureAnnotatorFromPageImage(weave.Model):
+    """
+    `FigureAnnotatorFromPageImage` is a class that leverages two LLM clients to annotate
+    figures from a page image of a scientific textbook.
+
+    !!! example "Example Usage"
+        ```python
+        import weave
+        from dotenv import load_dotenv
+
+        from medrag_multi_modal.assistant import (
+            FigureAnnotatorFromPageImage, LLMClient
+        )
+
+        load_dotenv()
+        weave.init(project_name="ml-colabs/medrag-multi-modal")
+        figure_annotator = FigureAnnotatorFromPageImage(
+            figure_extraction_llm_client=LLMClient(model_name="pixtral-12b-2409"),
+            structured_output_llm_client=LLMClient(model_name="gpt-4o"),
+            image_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-images-marker:v6",
+        )
+        annotations = figure_annotator.predict(page_idx=34)
+        ```
+
+    Args:
+        figure_extraction_llm_client (LLMClient): An LLM client used to extract figure annotations
+            from the page image.
+        structured_output_llm_client (LLMClient): An LLM client used to convert the extracted
+            annotations into a structured format.
+        image_artifact_address (Optional[str]): The address of the image artifact containing the
+            page images.
+    """
+
+    figure_extraction_llm_client: LLMClient
+    structured_output_llm_client: LLMClient
+    _artifact_dir: str
+
+    def __init__(
+        self,
+        figure_extraction_llm_client: LLMClient,
+        structured_output_llm_client: LLMClient,
+        image_artifact_address: Optional[str] = None,
+    ):
+        super().__init__(
+            figure_extraction_llm_client=figure_extraction_llm_client,
+            structured_output_llm_client=structured_output_llm_client,
+        )
+        self._artifact_dir = get_wandb_artifact(image_artifact_address, "dataset")
+
+    @weave.op()
+    def annotate_figures(
+        self, page_image: Image.Image
+    ) -> dict[str, Union[Image.Image, str]]:
+        annotation = self.figure_extraction_llm_client.predict(
+            system_prompt="""
+            You are an expert in the domain of scientific textbooks, especially medical texts.
+            You are presented with a page from a scientific textbook from the domain of biology, specifically anatomy.
+            You are to first identify all the figures in the page image, which could be images or biological diagrams, charts, graphs, etc.
+            Then you are to identify the figure IDs associated with each figure in the page image.
+            Then, you are to extract only the exact figure descriptions from the page image.
+            You need to output the figure IDs and figure descriptions only, in a structured manner as a JSON object.
+
+            Here are some clues you need to follow:
+            1. Figure IDs are unique identifiers for each figure in the page image.
+            2. Sometimes figure IDs can also be found as captions to the immediate left, right, top, or bottom of the figure.
+            3. Figure IDs are in the form "Fig X.Y" where X and Y are integers. For example, 1.1, 1.2, 1.3, etc.
+            4. Figure descriptions are contained as captions under the figures in the image, just after the figure ID.
+            5. The text in the page image is written in English and is present in a two-column format.
+            6. There is a clear distinction between the figure caption and the regular text in the page image in the form of extra white space.
+               You are to carefully identify all the figures in the page image.
+            7. There might be multiple figures or even no figures present in the page image. Sometimes the figures can be present side-by-side
+               or one above the other.
+            8. The figures may or may not have a distinct border against a white background.
+            9. You are not supposed to alter the figure description in any way present in the page image and you are to extract it as is.
+            """,
+            user_prompt=[page_image],
+        )
+        return {"page_image": page_image, "annotations": annotation}
+
+    @weave.op
+    def extract_structured_output(self, annotations: str) -> FigureAnnotations:
+        return self.structured_output_llm_client.predict(
+            system_prompt="You are supposed to extract a list of figure annotations consisting of figure IDs and corresponding figure descriptions.",
+            user_prompt=[annotations],
+            schema=FigureAnnotations,
+        )
+
+    @weave.op()
+    def predict(self, page_idx: int) -> dict[int, list[FigureAnnotation]]:
+        """
+        Predicts figure annotations for a specific page in a document.
+
+        This function retrieves the artifact directory from the given image artifact address,
+        reads the metadata from the 'metadata.jsonl' file, and iterates through the metadata
+        to find the specified page index. If the page index matches, it reads the page image
+        and associated figure images, and then uses the `annotate_figures` method to extract
+        figure annotations from the page image. The extracted annotations are then structured
+        using the `extract_structured_output` method and returned as a dictionary.
+
+        Args:
+            page_idx (int): The index of the page to annotate.
+
+        Returns:
+            dict: A dictionary containing the page index as the key and the extracted figure
+                annotations as the value.
+        """
+
+        metadata = read_jsonl_file(os.path.join(self._artifact_dir, "metadata.jsonl"))
+        annotations = {}
+        for item in metadata:
+            if item["page_idx"] == page_idx:
+                page_image_file = os.path.join(
+                    self._artifact_dir, f"page{item['page_idx']}.png"
+                )
+                figure_image_files = glob(
+                    os.path.join(self._artifact_dir, f"page{item['page_idx']}_fig*.png")
+                )
+                if len(figure_image_files) > 0:
+                    page_image = cv2.imread(page_image_file)
+                    page_image = cv2.cvtColor(page_image, cv2.COLOR_BGR2RGB)
+                    page_image = Image.fromarray(page_image)
+                    figure_extracted_annotations = self.annotate_figures(
+                        page_image=page_image
+                    )
+                    figure_extracted_annotations = self.extract_structured_output(
+                        figure_extracted_annotations["annotations"]
+                    ).model_dump()
+                    annotations[item["page_idx"]] = figure_extracted_annotations[
+                        "annotations"
+                    ]
+                break
+        return annotations
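
A note for reviewers on the return shape of `FigureAnnotatorFromPageImage.predict`: the structured annotations are dumped per page, so callers index the result by the page they asked for. A minimal sketch, assuming the artifact addresses from the docstring above and that the requested page actually contains extracted figure images:

```python
import weave
from dotenv import load_dotenv

from medrag_multi_modal.assistant import FigureAnnotatorFromPageImage, LLMClient

load_dotenv()
weave.init(project_name="ml-colabs/medrag-multi-modal")

figure_annotator = FigureAnnotatorFromPageImage(
    figure_extraction_llm_client=LLMClient(model_name="pixtral-12b-2409"),
    structured_output_llm_client=LLMClient(model_name="gpt-4o"),
    image_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-images-marker:v6",
)

# `predict` returns {page_idx: [{"figure_id": ..., "figure_description": ...}, ...]};
# a page with no extracted figure images yields an empty dict.
annotations = figure_annotator.predict(page_idx=34)
for figure in annotations.get(34, []):
    print(figure["figure_id"], "->", figure["figure_description"][:80])
```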
medrag_multi_modal/assistant/llm_client.py ADDED
@@ -0,0 +1,237 @@
+import os
+from enum import Enum
+from typing import Any, Optional, Union
+
+import instructor
+import weave
+from PIL import Image
+
+from ..utils import base64_encode_image
+
+
+class ClientType(str, Enum):
+    GEMINI = "gemini"
+    MISTRAL = "mistral"
+    OPENAI = "openai"
+
+
+GOOGLE_MODELS = [
+    "gemini-1.0-pro-latest",
+    "gemini-1.0-pro",
+    "gemini-pro",
+    "gemini-1.0-pro-001",
+    "gemini-1.0-pro-vision-latest",
+    "gemini-pro-vision",
+    "gemini-1.5-pro-latest",
+    "gemini-1.5-pro-001",
+    "gemini-1.5-pro-002",
+    "gemini-1.5-pro",
+    "gemini-1.5-pro-exp-0801",
+    "gemini-1.5-pro-exp-0827",
+    "gemini-1.5-flash-latest",
+    "gemini-1.5-flash-001",
+    "gemini-1.5-flash-001-tuning",
+    "gemini-1.5-flash",
+    "gemini-1.5-flash-exp-0827",
+    "gemini-1.5-flash-002",
+    "gemini-1.5-flash-8b",
+    "gemini-1.5-flash-8b-001",
+    "gemini-1.5-flash-8b-latest",
+    "gemini-1.5-flash-8b-exp-0827",
+    "gemini-1.5-flash-8b-exp-0924",
+]
+
+MISTRAL_MODELS = [
+    "ministral-3b-latest",
+    "ministral-8b-latest",
+    "mistral-large-latest",
+    "mistral-small-latest",
+    "codestral-latest",
+    "pixtral-12b-2409",
+    "open-mistral-nemo",
+    "open-codestral-mamba",
+    "open-mistral-7b",
+    "open-mixtral-8x7b",
+    "open-mixtral-8x22b",
+]
+
+OPENAI_MODELS = ["gpt-4o", "gpt-4o-2024-08-06", "gpt-4o-mini", "gpt-4o-mini-2024-07-18"]
+
+
+class LLMClient(weave.Model):
+    """
+    LLMClient is a class that interfaces with different large language model (LLM) providers
+    such as Google Gemini, Mistral, and OpenAI. It abstracts the complexity of interacting with
+    these different APIs and provides a unified interface for making predictions.
+
+    Args:
+        model_name (str): The name of the model to be used for predictions.
+        client_type (Optional[ClientType]): The type of client (e.g., GEMINI, MISTRAL, OPENAI).
+            If not provided, it is inferred from the model_name.
+    """
+
+    model_name: str
+    client_type: Optional[ClientType]
+
+    def __init__(self, model_name: str, client_type: Optional[ClientType] = None):
+        if client_type is None:
+            if model_name in GOOGLE_MODELS:
+                client_type = ClientType.GEMINI
+            elif model_name in MISTRAL_MODELS:
+                client_type = ClientType.MISTRAL
+            elif model_name in OPENAI_MODELS:
+                client_type = ClientType.OPENAI
+            else:
+                raise ValueError(f"Invalid model name: {model_name}")
+        super().__init__(model_name=model_name, client_type=client_type)
+
+    @weave.op()
+    def execute_gemini_sdk(
+        self,
+        user_prompt: Union[str, list[str]],
+        system_prompt: Optional[Union[str, list[str]]] = None,
+        schema: Optional[Any] = None,
+    ) -> Union[str, Any]:
+        import google.generativeai as genai
+
+        system_prompt = (
+            [system_prompt] if isinstance(system_prompt, str) else system_prompt
+        )
+        user_prompt = [user_prompt] if isinstance(user_prompt, str) else user_prompt
+
+        genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
+        model = genai.GenerativeModel(self.model_name)
+        generation_config = (
+            None
+            if schema is None
+            else genai.GenerationConfig(
+                response_mime_type="application/json", response_schema=list[schema]
+            )
+        )
+        response = model.generate_content(
+            system_prompt + user_prompt, generation_config=generation_config
+        )
+        return response.text if schema is None else response
+
+    @weave.op()
+    def execute_mistral_sdk(
+        self,
+        user_prompt: Union[str, list[str]],
+        system_prompt: Optional[Union[str, list[str]]] = None,
+        schema: Optional[Any] = None,
+    ) -> Union[str, Any]:
+        from mistralai import Mistral
+
+        system_prompt = (
+            [system_prompt] if isinstance(system_prompt, str) else system_prompt
+        )
+        user_prompt = [user_prompt] if isinstance(user_prompt, str) else user_prompt
+        system_messages = [{"type": "text", "text": prompt} for prompt in system_prompt]
+        user_messages = []
+        for prompt in user_prompt:
+            if isinstance(prompt, Image.Image):
+                user_messages.append(
+                    {
+                        "type": "image_url",
+                        "image_url": base64_encode_image(prompt, "image/png"),
+                    }
+                )
+            else:
+                user_messages.append({"type": "text", "text": prompt})
+        messages = [
+            {"role": "system", "content": system_messages},
+            {"role": "user", "content": user_messages},
+        ]
+
+        client = Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))
+        client = instructor.from_mistral(client) if schema is not None else client
+
+        response = (
+            client.chat.complete(model=self.model_name, messages=messages)
+            if schema is None
+            else client.messages.create(
+                response_model=schema, messages=messages, temperature=0
+            )
+        )
+        return response.choices[0].message.content
+
+    @weave.op()
+    def execute_openai_sdk(
+        self,
+        user_prompt: Union[str, list[str]],
+        system_prompt: Optional[Union[str, list[str]]] = None,
+        schema: Optional[Any] = None,
+    ) -> Union[str, Any]:
+        from openai import OpenAI
+
+        system_prompt = (
+            [system_prompt] if isinstance(system_prompt, str) else system_prompt
+        )
+        user_prompt = [user_prompt] if isinstance(user_prompt, str) else user_prompt
+
+        system_messages = [
+            {"role": "system", "content": prompt} for prompt in system_prompt
+        ]
+        user_messages = []
+        for prompt in user_prompt:
+            if isinstance(prompt, Image.Image):
+                user_messages.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": base64_encode_image(prompt, "image/png"),
+                        },
+                    },
+                )
+            else:
+                user_messages.append({"type": "text", "text": prompt})
+        messages = system_messages + [{"role": "user", "content": user_messages}]
+
+        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+        if schema is None:
+            completion = client.chat.completions.create(
+                model=self.model_name, messages=messages
+            )
+            return completion.choices[0].message.content
+
+        completion = weave.op()(client.beta.chat.completions.parse)(
+            model=self.model_name, messages=messages, response_format=schema
+        )
+        return completion.choices[0].message.parsed
+
+    @weave.op()
+    def predict(
+        self,
+        user_prompt: Union[str, list[str]],
+        system_prompt: Optional[Union[str, list[str]]] = None,
+        schema: Optional[Any] = None,
+    ) -> Union[str, Any]:
+        """
+        Predicts the response from a language model based on the provided prompts and schema.
+
+        This function determines the client type and calls the appropriate SDK execution function
+        to get the response from the language model. It supports multiple client types including
+        GEMINI, MISTRAL, and OPENAI. Depending on the client type, it calls the corresponding
+        execution function with the provided user and system prompts, and an optional schema.
+
+        Args:
+            user_prompt (Union[str, list[str]]): The user prompt(s) to be sent to the language model.
+            system_prompt (Optional[Union[str, list[str]]]): The system prompt(s) to be sent to the language model.
+            schema (Optional[Any]): The schema to be used for parsing the response, if applicable.
+
+        Returns:
+            Union[str, Any]: The response from the language model, which could be a string or any other type
+                depending on the schema provided.
+
+        Raises:
+            ValueError: If the client type is invalid.
+        """
+        if self.client_type == ClientType.GEMINI:
+            return self.execute_gemini_sdk(user_prompt, system_prompt, schema)
+        elif self.client_type == ClientType.MISTRAL:
+            return self.execute_mistral_sdk(user_prompt, system_prompt, schema)
+        elif self.client_type == ClientType.OPENAI:
+            return self.execute_openai_sdk(user_prompt, system_prompt, schema)
+        else:
+            raise ValueError(f"Invalid client type: {self.client_type}")
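
A short usage sketch of the unified `predict` interface may help when reviewing the provider-specific paths above. The `PageSummary` schema here is hypothetical and only illustrates structured output; on the OpenAI path this goes through `client.beta.chat.completions.parse` and returns the parsed Pydantic object:

```python
import weave
from dotenv import load_dotenv
from pydantic import BaseModel

from medrag_multi_modal.assistant import LLMClient


class PageSummary(BaseModel):  # hypothetical schema, for illustration only
    title: str
    summary: str


load_dotenv()
weave.init(project_name="ml-colabs/medrag-multi-modal")

client = LLMClient(model_name="gpt-4o")  # client_type inferred from OPENAI_MODELS

# Plain text generation: returns completion.choices[0].message.content (a str).
text = client.predict(
    system_prompt="You are a concise medical editor.",
    user_prompt=["Summarize the role of ribosomes in one sentence."],
)

# Structured generation: passing `schema` yields the parsed PageSummary instance.
structured = client.predict(
    system_prompt="Return a title and a one-sentence summary.",
    user_prompt=["Ribosomes synthesize proteins from mRNA templates."],
    schema=PageSummary,
)
```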
medrag_multi_modal/assistant/medqa_assistant.py ADDED
@@ -0,0 +1,108 @@
+import weave
+
+from ..retrieval import SimilarityMetric
+from .figure_annotation import FigureAnnotatorFromPageImage
+from .llm_client import LLMClient
+
+
+class MedQAAssistant(weave.Model):
+    """
+    `MedQAAssistant` is a class designed to assist with medical queries by leveraging a
+    language model client, a retriever model, and a figure annotator.
+
+    !!! example "Usage Example"
+        ```python
+        import weave
+        from dotenv import load_dotenv
+
+        from medrag_multi_modal.assistant import (
+            FigureAnnotatorFromPageImage,
+            LLMClient,
+            MedQAAssistant,
+        )
+        from medrag_multi_modal.retrieval import MedCPTRetriever
+
+        load_dotenv()
+        weave.init(project_name="ml-colabs/medrag-multi-modal")
+
+        llm_client = LLMClient(model_name="gemini-1.5-flash")
+
+        retriever = MedCPTRetriever.from_wandb_artifact(
+            chunk_dataset_name="grays-anatomy-chunks:v0",
+            index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-medcpt:v0",
+        )
+
+        figure_annotator = FigureAnnotatorFromPageImage(
+            figure_extraction_llm_client=LLMClient(model_name="pixtral-12b-2409"),
+            structured_output_llm_client=LLMClient(model_name="gpt-4o"),
+            image_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-images-marker:v6",
+        )
+        medqa_assistant = MedQAAssistant(
+            llm_client=llm_client, retriever=retriever, figure_annotator=figure_annotator
+        )
+        medqa_assistant.predict(query="What is ribosome?")
+        ```
+
+    Args:
+        llm_client (LLMClient): The language model client used to generate responses.
+        retriever (weave.Model): The model used to retrieve relevant chunks of text from a medical document.
+        figure_annotator (FigureAnnotatorFromPageImage): The annotator used to extract figure descriptions from pages.
+        top_k_chunks (int): The number of top chunks to retrieve based on the similarity metric.
+        retrieval_similarity_metric (SimilarityMetric): The metric used to measure similarity for retrieval.
+    """
+
+    llm_client: LLMClient
+    retriever: weave.Model
+    figure_annotator: FigureAnnotatorFromPageImage
+    top_k_chunks: int = 2
+    retrieval_similarity_metric: SimilarityMetric = SimilarityMetric.COSINE
+
+    @weave.op()
+    def predict(self, query: str) -> str:
+        """
+        Generates a response to a medical query by retrieving relevant text chunks and figure descriptions
+        from a medical document and using a language model to generate the final response.
+
+        This function performs the following steps:
+        1. Retrieves relevant text chunks from the medical document based on the query using the retriever model.
+        2. Extracts the text and page indices from the retrieved chunks.
+        3. Retrieves figure descriptions from the pages identified in the previous step using the figure annotator.
+        4. Constructs a system prompt and a user prompt combining the query, retrieved text chunks, and figure descriptions.
+        5. Uses the language model client to generate a response based on the constructed prompts.
+        6. Appends the source information (page numbers) to the generated response.
+
+        Args:
+            query (str): The medical query to be answered.
+
+        Returns:
+            str: The generated response to the query, including source information.
+        """
+        retrieved_chunks = self.retriever.predict(
+            query, top_k=self.top_k_chunks, metric=self.retrieval_similarity_metric
+        )
+
+        retrieved_chunk_texts = []
+        page_indices = set()
+        for chunk in retrieved_chunks:
+            retrieved_chunk_texts.append(chunk["text"])
+            page_indices.add(int(chunk["page_idx"]))
+
+        figure_descriptions = []
+        for page_idx in page_indices:
+            figure_annotations = self.figure_annotator.predict(page_idx=page_idx)[
+                page_idx
+            ]
+            figure_descriptions += [
+                item["figure_description"] for item in figure_annotations
+            ]
+
+        system_prompt = """
+        You are an expert in medical science. You are given a query and a list of chunks from a medical document.
+        """
+        response = self.llm_client.predict(
+            system_prompt=system_prompt,
+            user_prompt=[query, *retrieved_chunk_texts, *figure_descriptions],
+        )
+        page_numbers = ", ".join([str(int(page_idx) + 1) for page_idx in page_indices])
+        response += f"\n\n**Source:** {'Pages' if len(page_indices) > 1 else 'Page'} {page_numbers} from Gray's Anatomy"
+        return response
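
Worth noting while reviewing: `predict` assumes each retrieved chunk exposes `"text"` and `"page_idx"` at the top level, which is exactly what the flattened retriever outputs further down in this PR provide. A tiny sketch with made-up chunk values showing that contract and how the source footer is built:

```python
# Hypothetical retriever output; real values come from retriever.predict(query, ...).
retrieved_chunks = [
    {"text": "The ribosome is ...", "page_idx": 33, "score": 0.87},
    {"text": "Protein synthesis ...", "page_idx": 34, "score": 0.79},
]

page_indices = {int(chunk["page_idx"]) for chunk in retrieved_chunks}
# 0-based page indices are shifted to 1-based page numbers in the footer
# (sorted here only for readability; the assistant iterates a set).
page_numbers = ", ".join(str(idx + 1) for idx in sorted(page_indices))
print(f"**Source:** {'Pages' if len(page_indices) > 1 else 'Page'} {page_numbers} from Gray's Anatomy")
```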
medrag_multi_modal/document_loader/image_loader/base_img_loader.py CHANGED
@@ -3,6 +3,7 @@ import os
 from abc import abstractmethod
 from typing import Dict, List, Optional
 
+import jsonlines
 import rich
 
 import wandb
@@ -41,7 +42,8 @@ class BaseImageLoader(BaseTextLoader):
         end_page: Optional[int] = None,
         wandb_artifact_name: Optional[str] = None,
         image_save_dir: str = "./images",
-        cleanup: bool = True,
+        exclude_file_extensions: list[str] = [],
+        cleanup: bool = False,
         **kwargs,
     ) -> List[Dict[str, str]]:
         """
@@ -61,10 +63,11 @@ class BaseImageLoader(BaseTextLoader):
        If a wandb_artifact_name is provided, the processed pages are published to a WandB artifact.
 
        Args:
-            start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
-            end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
+            start_page (Optional[int]): The starting page index (0-based) to process.
+            end_page (Optional[int]): The ending page index (0-based) to process.
             wandb_artifact_name (Optional[str]): The name of the WandB artifact to publish the pages to, if provided.
             image_save_dir (str): The directory to save the extracted images.
+            exclude_file_extensions (list[str]): A list of file extensions to exclude from the image_save_dir.
             cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
             **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
 
@@ -99,8 +102,21 @@ class BaseImageLoader(BaseTextLoader):
         for task in asyncio.as_completed(tasks):
             await task
 
+        with jsonlines.open(
+            os.path.join(image_save_dir, "metadata.jsonl"), mode="w"
+        ) as writer:
+            writer.write(pages)
+
+        for file in os.listdir(image_save_dir):
+            if file.endswith(tuple(exclude_file_extensions)):
+                os.remove(os.path.join(image_save_dir, file))
+
         if wandb_artifact_name:
-            artifact = wandb.Artifact(name=wandb_artifact_name, type="dataset")
+            artifact = wandb.Artifact(
+                name=wandb_artifact_name,
+                type="dataset",
+                metadata={"loader_name": self.__class__.__name__},
+            )
             artifact.add_dir(local_path=image_save_dir)
             artifact.save()
             rich.print("Artifact saved and uploaded to wandb!")
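
Two small behavioral points in the new `load_data` parameters, sketched below: `cleanup` now defaults to `False`, and the extension-based pruning is a no-op at the default empty `exclude_file_extensions`, because `str.endswith` with an empty tuple never matches:

```python
# Pruning check used above: file.endswith(tuple(exclude_file_extensions))
print("page0.png".endswith(tuple([])))      # False -> nothing is removed by default
print("page0.md".endswith(tuple([".md"])))  # True  -> removed before the artifact upload
```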
medrag_multi_modal/document_loader/image_loader/marker_img_loader.py CHANGED
@@ -1,11 +1,14 @@
 import os
-from typing import Any, Dict
+from typing import Any, Coroutine, Dict, List
 
 from marker.convert import convert_single_pdf
 from marker.models import load_all_models
+from pdf2image.pdf2image import convert_from_path
 
 from .base_img_loader import BaseImageLoader
 
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+
 
 class MarkerImageLoader(BaseImageLoader):
     """
@@ -46,10 +49,18 @@ class MarkerImageLoader(BaseImageLoader):
         url (str): The URL of the PDF document.
         document_name (str): The name of the document.
         document_file_path (str): The path to the PDF file.
+        save_page_image (bool): Whether to additionally save the image of the entire page.
     """
 
-    def __init__(self, url: str, document_name: str, document_file_path: str):
+    def __init__(
+        self,
+        url: str,
+        document_name: str,
+        document_file_path: str,
+        save_page_image: bool = False,
+    ):
         super().__init__(url, document_name, document_file_path)
+        self.save_page_image = save_page_image
         self.model_lst = load_all_models()
 
     async def extract_page_data(
@@ -90,11 +101,42 @@ class MarkerImageLoader(BaseImageLoader):
             image.save(image_file_path, "png")
             image_file_paths.append(image_file_path)
 
+        if self.save_page_image:
+            page_image = convert_from_path(
+                self.document_file_path,
+                first_page=page_idx + 1,
+                last_page=page_idx + 1,
+                **kwargs,
+            )[0]
+            page_image.save(os.path.join(image_save_dir, f"page{page_idx}.png"))
+
         return {
             "page_idx": page_idx,
             "document_name": self.document_name,
            "file_path": self.document_file_path,
             "file_url": self.url,
-            "image_file_paths": image_file_paths,
+            "image_file_paths": os.path.join(image_save_dir, "*.png"),
             "meta": out_meta,
         }
+
+    def load_data(
+        self,
+        start_page: int | None = None,
+        end_page: int | None = None,
+        wandb_artifact_name: str | None = None,
+        image_save_dir: str = "./images",
+        exclude_file_extensions: list[str] = [],
+        cleanup: bool = False,
+        **kwargs,
+    ) -> Coroutine[Any, Any, List[Dict[str, str]]]:
+        start_page = start_page - 1 if start_page is not None else None
+        end_page = end_page - 1 if end_page is not None else None
+        return super().load_data(
+            start_page,
+            end_page,
+            wandb_artifact_name,
+            image_save_dir,
+            exclude_file_extensions,
+            cleanup,
+            **kwargs,
+        )
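
A hedged end-to-end sketch of the loader with the new options; the URL, file paths, and the `.md` exclusion are placeholder values. The overridden `load_data` treats `start_page`/`end_page` as 1-based and shifts them to the 0-based indices that `BaseImageLoader` expects:

```python
import asyncio

from medrag_multi_modal.document_loader.image_loader.marker_img_loader import (
    MarkerImageLoader,
)

# Illustrative document; substitute a real PDF URL and local path.
loader = MarkerImageLoader(
    url="https://example.com/grays-anatomy.pdf",
    document_name="Gray's Anatomy",
    document_file_path="grays-anatomy.pdf",
    save_page_image=True,  # also dump page{idx}.png via pdf2image
)

# Pages 33-38 (1-based) become page_idx 32-37 internally; metadata.jsonl is
# written into image_save_dir and any excluded extensions are pruned before upload.
pages = asyncio.run(
    loader.load_data(
        start_page=33,
        end_page=38,
        wandb_artifact_name="grays-anatomy-images-marker",
        image_save_dir="./images",
        exclude_file_extensions=[".md"],
    )
)
```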
medrag_multi_modal/document_loader/text_loader/base_text_loader.py CHANGED
@@ -131,6 +131,7 @@ class BaseTextLoader(ABC):
         async def process_page(page_idx):
             nonlocal processed_pages_counter
             page_data = await self.extract_page_data(page_idx, **kwargs)
+            page_data["loader_name"] = self.__class__.__name__
             pages.append(page_data)
             rich.print(
                 f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
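
With this change every page record (and therefore every entry written to `metadata.jsonl` by `BaseImageLoader.load_data`) carries the loader class name. Illustrative values only:

```python
# Example of a page record after the change (field values are made up):
page_data = {
    "page_idx": 34,
    "document_name": "Gray's Anatomy",
    "file_path": "grays-anatomy.pdf",
    "file_url": "https://example.com/grays-anatomy.pdf",
    "loader_name": "MarkerTextLoader",  # added by process_page
}
```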
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py CHANGED
@@ -1,3 +1,4 @@
+import os
 from typing import Dict
 
 from marker.convert import convert_single_pdf
@@ -5,6 +6,8 @@ from marker.models import load_all_models
 
 from .base_text_loader import BaseTextLoader
 
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+
 
 class MarkerTextLoader(BaseTextLoader):
     """
medrag_multi_modal/retrieval/bm25s_retrieval.py CHANGED
@@ -175,7 +175,7 @@ class BM25sRetriever(weave.Model):
             results.documents.flatten().tolist(),
             results.scores.flatten().tolist(),
         ):
-            retrieved_chunks.append({"chunk": chunk, "score": score})
+            retrieved_chunks.append({**chunk, **{"score": score}})
         return retrieved_chunks
 
     @weave.op()
medrag_multi_modal/retrieval/contriever_retrieval.py CHANGED
@@ -192,8 +192,8 @@ class ContrieverRetriever(weave.Model):
         for score in scores:
             retrieved_chunks.append(
                 {
-                    "chunk": self._chunk_dataset[score["original_index"]],
-                    "score": score["item"],
+                    **self._chunk_dataset[score["original_index"]],
+                    **{"score": score["item"]},
                 }
             )
         return retrieved_chunks
medrag_multi_modal/retrieval/medcpt_retrieval.py CHANGED
@@ -231,8 +231,8 @@ class MedCPTRetriever(weave.Model):
         for score in scores:
             retrieved_chunks.append(
                 {
-                    "chunk": self._chunk_dataset[score["original_index"]],
-                    "score": score["item"],
+                    **self._chunk_dataset[score["original_index"]],
+                    **{"score": score["item"]},
                 }
             )
         return retrieved_chunks
medrag_multi_modal/retrieval/nv_embed_2.py CHANGED
@@ -217,8 +217,8 @@ class NVEmbed2Retriever(weave.Model):
         for score in scores:
             retrieved_chunks.append(
                 {
-                    "chunk": self._chunk_dataset[score["original_index"]],
-                    "score": score["item"],
+                    **self._chunk_dataset[score["original_index"]],
+                    **{"score": score["item"]},
                 }
            )
         return retrieved_chunks
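
The same change is applied to all four retrievers (BM25s, Contriever, MedCPT, NV-Embed-v2): instead of nesting the chunk under a `"chunk"` key, its fields are merged into the result alongside `"score"`. A small sketch with made-up values shows why `MedQAAssistant.predict` can then read `chunk["text"]` and `chunk["page_idx"]` directly:

```python
chunk = {"text": "The ribosome is ...", "page_idx": 33, "document_name": "Gray's Anatomy"}
score = 0.87

# Before: the chunk was nested, so callers had to reach into result["chunk"]["text"].
old_result = {"chunk": chunk, "score": score}

# After: chunk fields are flattened to the top level next to the score.
new_result = {**chunk, **{"score": score}}
assert new_result["text"] == chunk["text"]
assert new_result["page_idx"] == 33
```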
medrag_multi_modal/utils.py CHANGED
@@ -1,4 +1,9 @@
+import base64
+import io
+
+import jsonlines
 import torch
+from PIL import Image
 
 import wandb
 
@@ -29,3 +34,20 @@ def get_torch_backend():
            return "mps"
        return "cpu"
    return "cpu"
+
+
+def base64_encode_image(image: Image.Image, mimetype: str) -> str:
+    image.load()
+    if image.mode not in ("RGB", "RGBA"):
+        image = image.convert("RGB")
+    byte_arr = io.BytesIO()
+    image.save(byte_arr, format="PNG")
+    encoded_string = base64.b64encode(byte_arr.getvalue()).decode("utf-8")
+    encoded_string = f"data:{mimetype};base64,{encoded_string}"
+    return str(encoded_string)
+
+
+def read_jsonl_file(file_path: str) -> list[dict[str, any]]:
+    with jsonlines.open(file_path) as reader:
+        for obj in reader:
+            return obj
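
A brief sketch of the two new helpers; note that `read_jsonl_file` returns only the first record of the file, which matches how `BaseImageLoader` writes the whole `pages` list as a single JSON line. Paths and the placeholder image are illustrative:

```python
from PIL import Image

from medrag_multi_modal.utils import base64_encode_image, read_jsonl_file

# Build the data URL used by the Mistral/OpenAI image messages in LLMClient.
image = Image.new("RGB", (64, 64), color="white")  # placeholder image
data_url = base64_encode_image(image, "image/png")
assert data_url.startswith("data:image/png;base64,")

# First (and only) record of metadata.jsonl is the full list of page dicts.
metadata = read_jsonl_file("./images/metadata.jsonl")  # path from a prior load_data run
```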
mkdocs.yml CHANGED
@@ -83,5 +83,9 @@ nav:
     - Contriever: 'retreival/contriever.md'
     - MedCPT: 'retreival/medcpt.md'
     - NV-Embed-v2: 'retreival/nv_embed_2.md'
+  - Assistant:
+    - MedQA Assistant: 'assistant/medqa_assistant.md'
+    - Figure Annotation: 'assistant/figure_annotation.md'
+    - LLM Client: 'assistant/llm_client.md'
 
 repo_url: https://github.com/soumik12345/medrag-multi-modal
pyproject.toml CHANGED
@@ -38,6 +38,12 @@ dependencies = [
     "semchunk>=2.2.0",
     "tiktoken>=0.8.0",
     "sentence-transformers>=3.2.0",
+    "google-generativeai>=0.8.3",
+    "mistralai>=1.1.0",
+    "instructor>=1.6.3",
+    "jsonlines>=4.0.0",
+    "opencv-python>=4.10.0.84",
+    "openai>=1.52.2",
 ]
 
 [project.optional-dependencies]
@@ -61,6 +67,12 @@ core = [
     "torch>=2.4.1",
     "weave>=0.51.14",
     "sentence-transformers>=3.2.0",
+    "google-generativeai>=0.8.3",
+    "mistralai>=1.1.0",
+    "instructor>=1.6.3",
+    "jsonlines>=4.0.0",
+    "opencv-python>=4.10.0.84",
+    "openai>=1.52.2",
 ]
 
 dev = ["pytest>=8.3.3", "isort>=5.13.2", "black>=24.10.0", "ruff>=0.6.9"]