Spaces:

geekyrakshit
/

medrag

Building

App Files Files Community

mratanusarkar committed on Oct 19, 2024

Commit

cc5cebc

1 Parent(s): 5c74069

add: docs for base img loader + pdf2image

Browse files

Files changed (6) hide show

docs/document_loader/image_loader/base_img_loader.md +3 -0
docs/document_loader/image_loader/pdf2image_img_loader.md +3 -0
docs/document_loader/load_image.md +0 -3
medrag_multi_modal/document_loader/image_loader/base_img_loader.py +50 -0
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py +24 -0
mkdocs.yml +3 -1

docs/document_loader/image_loader/base_img_loader.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ ## Load images from PDF files
2	+
3	+ ::: medrag_multi_modal.document_loader.image_loader.base_img_loader

docs/document_loader/image_loader/pdf2image_img_loader.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # Load images from PDF files (using pdf2image)
2	+
3	+ ::: medrag_multi_modal.document_loader.image_loader.pdf2image_img_loader

docs/document_loader/load_image.md DELETED Viewed

@@ -1,3 +0,0 @@
-# Load PDF pages as images
-::: medrag_multi_modal.document_loader.load_image

medrag_multi_modal/document_loader/image_loader/base_img_loader.py CHANGED Viewed

@@ -19,6 +19,20 @@ class BaseImageLoader(BaseTextLoader):
     async def extract_page_data(
         self, page_idx: int, image_save_dir: str, **kwargs
     ) -> Dict[str, str]:
         pass
     async def load_data(
@@ -30,6 +44,42 @@ class BaseImageLoader(BaseTextLoader):
         cleanup: bool = True,
         **kwargs,
     ) -> List[Dict[str, str]]:
         os.makedirs(image_save_dir, exist_ok=True)
         start_page, end_page = self.get_page_indices(start_page, end_page)
         pages = []

     async def extract_page_data(
         self, page_idx: int, image_save_dir: str, **kwargs
     ) -> Dict[str, str]:
+        """
+        Abstract method to process a single page of the PDF and extract the image data.
+        Override this method in the subclass to provide the actual implementation and
+        processing logic for each page of the PDF using various PDF processing libraries.
+        Args:
+            page_idx (int): The index of the page to process.
+            image_save_dir (str): The directory to save the extracted images.
+            **kwargs: Additional keyword arguments that may be used by underlying libraries.
+        Returns:
+            Dict[str, str]: A dictionary containing the processed page data.
+        """
         pass
     async def load_data(
         cleanup: bool = True,
         **kwargs,
     ) -> List[Dict[str, str]]:
+        """
+        Asynchronously loads images from a PDF file specified by a URL or local file path.
+        The overridden abstract processing method then processes the images,
+        which are optionally published to a Weave artifact.
+        This function downloads a PDF from a given URL if it does not already exist locally,
+        reads the specified range of pages, scans each page's content to extract images, and
+        returns a list of dictionaries containing the image file paths and metadata.
+        It uses `PyPDF2` to calculate the number of pages in the PDF and the
+        overridden `extract_page_data` method provides the actual implementation to process
+        each page, extract the image content from the PDF, and convert it to png format.
+        It processes pages concurrently using `asyncio` for efficiency.
+        If a wandb_artifact_name is provided, the processed pages are published to a Weave artifact.
+        Args:
+            start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
+            end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
+            wandb_artifact_name (Optional[str]): The name of the Weave artifact to publish the pages to, if provided.
+            image_save_dir (str): The directory to save the extracted images.
+            cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
+            **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries, each containing the image and metadata for a processed page.
+            Each dictionary will have the following keys and values:
+            - "page_idx": (int) the index of the page.
+            - "document_name": (str) the name of the document.
+            - "file_path": (str) the local file path where the PDF is stored.
+            - "file_url": (str) the URL of the PDF file.
+            - "image_file_path" or "image_file_paths": (str) the local file path where the image/images are stored.
+        Raises:
+            ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
+        """
         os.makedirs(image_save_dir, exist_ok=True)
         start_page, end_page = self.get_page_indices(start_page, end_page)
         pages = []

medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py CHANGED Viewed

@@ -7,6 +7,19 @@ from .base_img_loader import BaseImageLoader
 class PDF2ImageLoader(BaseImageLoader):
     def __init__(self, url: str, document_name: str, document_file_path: str):
         super().__init__(url, document_name, document_file_path)
@@ -14,6 +27,17 @@ class PDF2ImageLoader(BaseImageLoader):
     async def extract_page_data(
         self, page_idx: int, image_save_dir: str, **kwargs
     ) -> Dict[str, Any]:
         image = convert_from_path(
             self.document_file_path,
             first_page=page_idx + 1,

 class PDF2ImageLoader(BaseImageLoader):
+    """
+    `PDF2ImageLoader` is a class that extends the `BaseImageLoader` class to handle the extraction and
+    loading of pages from a PDF file as images using the pdf2image library.
+    This class provides functionality to convert specific pages of a PDF document into images
+    and optionally publish these images to a Weave artifact.
+    It is like a snapshot image version of each of the pages from the PDF.
+    Args:
+        url (str): The URL of the PDF document.
+        document_name (str): The name of the document.
+        document_file_path (str): The path to the PDF file.
+    """
     def __init__(self, url: str, document_name: str, document_file_path: str):
         super().__init__(url, document_name, document_file_path)
     async def extract_page_data(
         self, page_idx: int, image_save_dir: str, **kwargs
     ) -> Dict[str, Any]:
+        """
+        Extracts a single page from the PDF as an image using pdf2image library.
+        Args:
+            page_idx (int): The index of the page to process.
+            image_save_dir (str): The directory to save the extracted image.
+            **kwargs: Additional keyword arguments that may be used by pdf2image.
+        Returns:
+            Dict[str, Any]: A dictionary containing the processed page data.
+        """
         image = convert_from_path(
             self.document_file_path,
             first_page=page_idx + 1,

mkdocs.yml CHANGED Viewed

@@ -69,7 +69,9 @@ nav:
       - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
       - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
       - Marker: 'document_loader/text_loader/marker_text_loader.md'
-    - Image Loader: 'document_loader/load_image.md'
   - Chunking: 'chunking.md'
   - Retrieval:
     - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'

       - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
       - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
       - Marker: 'document_loader/text_loader/marker_text_loader.md'
+    - Image Loader:
+      - Base: 'document_loader/image_loader/base_img_loader.md'
+      - PDF2Image: 'document_loader/image_loader/pdf2image_img_loader.md'
   - Chunking: 'chunking.md'
   - Retrieval:
     - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'