Spaces:

geekyrakshit
/

medrag

Running

App Files Files Community

mratanusarkar committed on Oct 19, 2024

Commit

5c74069

1 Parent(s): 694a076

add: base image loader + pdf2img from load_image

Browse files

Files changed (5) hide show

medrag_multi_modal/document_loader/__init__.py +2 -4
medrag_multi_modal/document_loader/image_loader/__init__.py +4 -0
medrag_multi_modal/document_loader/image_loader/base_img_loader.py +63 -0
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py +34 -0
medrag_multi_modal/document_loader/load_image.py +0 -131

medrag_multi_modal/document_loader/__init__.py CHANGED Viewed

@@ -1,5 +1,4 @@
-from .load_image import ImageLoader
-from .load_text_image import TextImageLoader
 from .text_loader import (
     MarkerTextLoader,
     PDFPlumberTextLoader,
@@ -12,6 +11,5 @@ __all__ = [
     "PyPDF2TextLoader",
     "PDFPlumberTextLoader",
     "MarkerTextLoader",
-    "ImageLoader",
-    "TextImageLoader",
 ]

+from .image_loader import PDF2ImageLoader
 from .text_loader import (
     MarkerTextLoader,
     PDFPlumberTextLoader,
     "PyPDF2TextLoader",
     "PDFPlumberTextLoader",
     "MarkerTextLoader",
+    "PDF2ImageLoader",
 ]

medrag_multi_modal/document_loader/image_loader/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .base_img_loader import BaseImageLoader
+from .pdf2image_img_loader import PDF2ImageLoader
+__all__ = ["PDF2ImageLoader", "BaseImageLoader"]

medrag_multi_modal/document_loader/image_loader/base_img_loader.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import asyncio
+import os
+from abc import abstractmethod
+from typing import Dict, List, Optional
+import rich
+import wandb
+from medrag_multi_modal.document_loader.text_loader.base_text_loader import (
+    BaseTextLoader,
+)
+class BaseImageLoader(BaseTextLoader):
+    def __init__(self, url: str, document_name: str, document_file_path: str):
+        super().__init__(url, document_name, document_file_path)
+    @abstractmethod
+    async def extract_page_data(
+        self, page_idx: int, image_save_dir: str, **kwargs
+    ) -> Dict[str, str]:
+        pass
+    async def load_data(
+        self,
+        start_page: Optional[int] = None,
+        end_page: Optional[int] = None,
+        wandb_artifact_name: Optional[str] = None,
+        image_save_dir: str = "./images",
+        cleanup: bool = True,
+        **kwargs,
+    ) -> List[Dict[str, str]]:
+        os.makedirs(image_save_dir, exist_ok=True)
+        start_page, end_page = self.get_page_indices(start_page, end_page)
+        pages = []
+        processed_pages_counter: int = 1
+        total_pages = end_page - start_page
+        async def process_page(page_idx):
+            nonlocal processed_pages_counter
+            page_data = await self.extract_page_data(page_idx, image_save_dir, **kwargs)
+            pages.append(page_data)
+            rich.print(
+                f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
+            )
+            processed_pages_counter += 1
+        tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
+        for task in asyncio.as_completed(tasks):
+            await task
+        if wandb_artifact_name:
+            artifact = wandb.Artifact(name=wandb_artifact_name, type="dataset")
+            artifact.add_dir(local_path=image_save_dir)
+            artifact.save()
+            rich.print("Artifact saved and uploaded to wandb!")
+        if cleanup:
+            for file in os.listdir(image_save_dir):
+                file_path = os.path.join(image_save_dir, file)
+                if os.path.isfile(file_path):
+                    os.remove(file_path)
+        return pages

medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import os
+from typing import Any, Dict
+from pdf2image.pdf2image import convert_from_path
+from .base_img_loader import BaseImageLoader
+class PDF2ImageLoader(BaseImageLoader):
+    def __init__(self, url: str, document_name: str, document_file_path: str):
+        super().__init__(url, document_name, document_file_path)
+    async def extract_page_data(
+        self, page_idx: int, image_save_dir: str, **kwargs
+    ) -> Dict[str, Any]:
+        image = convert_from_path(
+            self.document_file_path,
+            first_page=page_idx + 1,
+            last_page=page_idx + 1,
+            **kwargs,
+        )[0]
+        image_file_name = f"page{page_idx}.png"
+        image_file_path = os.path.join(image_save_dir, image_file_name)
+        image.save(image_file_path)
+        return {
+            "page_idx": page_idx,
+            "document_name": self.document_name,
+            "file_path": self.document_file_path,
+            "file_url": self.url,
+            "image_file_path": image_file_path,
+        }

medrag_multi_modal/document_loader/load_image.py DELETED Viewed

@@ -1,131 +0,0 @@
-import asyncio
-import os
-from typing import Optional
-import rich
-import wandb
-import weave
-from pdf2image.pdf2image import convert_from_path
-from PIL import Image
-from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
-class ImageLoader(PyMuPDF4LLMTextLoader):
-    """
-    `ImageLoader` is a class that extends the `TextLoader` class to handle the extraction and
-    loading of pages from a PDF file as images.
-    This class provides functionality to convert specific pages of a PDF document into images
-    and optionally publish these images to a Weave dataset.
-    !!! example "Example Usage"
-        ```python
-        import asyncio
-        import wandb
-        from dotenv import load_dotenv
-        from medrag_multi_modal.document_loader import ImageLoader
-        load_dotenv()
-        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
-        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
-        loader = ImageLoader(
-            url=url,
-            document_name="Gray's Anatomy",
-            document_file_path="grays_anatomy.pdf",
-        )
-        asyncio.run(
-            loader.load_data(
-                start_page=31,
-                end_page=33,
-                dataset_name="grays-anatomy-images",
-            )
-        )
-        ```
-    Args:
-        url (str): The URL of the PDF document.
-        document_name (str): The name of the document.
-        document_file_path (str): The path to the PDF file.
-    """
-    def __init__(self, url: str, document_name: str, document_file_path: str):
-        super().__init__(url, document_name, document_file_path)
-    def extract_data_from_pdf_file(
-        self, pdf_file: str, page_number: int
-    ) -> Image.Image:
-        image = convert_from_path(
-            pdf_file, first_page=page_number + 1, last_page=page_number + 1
-        )[0]
-        return image
-    async def load_data(
-        self,
-        start_page: Optional[int] = None,
-        end_page: Optional[int] = None,
-        image_save_dir: str = "./images",
-        dataset_name: Optional[str] = None,
-    ):
-        """
-        Asynchronously loads images from a PDF file specified by a URL or local file path,
-        processes the images for the specified range of pages, and optionally publishes them
-        to a Weave dataset.
-        This function reads the specified range of pages from a PDF document, converts each page
-        to an image using the `pdf2image` library, and returns a list of dictionaries containing
-        the image and metadata for each processed page. It processes pages concurrently using
-        `asyncio` for efficiency. If a `dataset_name` is provided, the processed page images are
-        published to Weights & Biases artifact and the corresponding metadata to a Weave dataset
-        with the specified name.
-        Args:
-            start_page (Optional[int]): The starting page index (0-based) to process.
-            end_page (Optional[int]): The ending page index (0-based) to process.
-            dataset_name (Optional[str]): The name of the Weave dataset to publish the
-                processed images to. Defaults to None.
-        Returns:
-            list[dict]: A list of dictionaries, each containing the image and metadata for a
-                processed page.
-        Raises:
-            ValueError: If the specified start_page or end_page is out of bounds of the document's
-                page count.
-        """
-        os.makedirs(image_save_dir, exist_ok=True)
-        start_page, end_page = self.get_page_indices(start_page, end_page)
-        pages = []
-        processed_pages_counter: int = 1
-        total_pages = end_page - start_page
-        async def process_page(page_idx):
-            nonlocal processed_pages_counter
-            image = convert_from_path(
-                self.document_file_path,
-                first_page=page_idx + 1,
-                last_page=page_idx + 1,
-            )[0]
-            pages.append(
-                {
-                    "page_idx": page_idx,
-                    "document_name": self.document_name,
-                    "file_path": self.document_file_path,
-                    "file_url": self.url,
-                }
-            )
-            image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
-            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
-            processed_pages_counter += 1
-        tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
-        for task in asyncio.as_completed(tasks):
-            await task
-        if dataset_name:
-            artifact = wandb.Artifact(name=dataset_name, type="dataset")
-            artifact.add_dir(local_path=image_save_dir)
-            artifact.save()
-            weave.publish(weave.Dataset(name=dataset_name, rows=pages))
-        return pages