mratanusarkar committed on
Commit
cc5cebc
·
1 Parent(s): 5c74069

add: docs for base img loader + pdf2image

Browse files
docs/document_loader/image_loader/base_img_loader.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Load images from PDF files
2
+
3
+ ::: medrag_multi_modal.document_loader.image_loader.base_img_loader
docs/document_loader/image_loader/pdf2image_img_loader.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Load images from PDF files (using pdf2image)
2
+
3
+ ::: medrag_multi_modal.document_loader.image_loader.pdf2image_img_loader
docs/document_loader/load_image.md DELETED
@@ -1,3 +0,0 @@
1
- # Load PDF pages as images
2
-
3
- ::: medrag_multi_modal.document_loader.load_image
 
 
 
 
medrag_multi_modal/document_loader/image_loader/base_img_loader.py CHANGED
@@ -19,6 +19,20 @@ class BaseImageLoader(BaseTextLoader):
19
  async def extract_page_data(
20
  self, page_idx: int, image_save_dir: str, **kwargs
21
  ) -> Dict[str, str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  pass
23
 
24
  async def load_data(
@@ -30,6 +44,42 @@ class BaseImageLoader(BaseTextLoader):
30
  cleanup: bool = True,
31
  **kwargs,
32
  ) -> List[Dict[str, str]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  os.makedirs(image_save_dir, exist_ok=True)
34
  start_page, end_page = self.get_page_indices(start_page, end_page)
35
  pages = []
 
19
  async def extract_page_data(
20
  self, page_idx: int, image_save_dir: str, **kwargs
21
  ) -> Dict[str, str]:
22
+ """
23
+ Abstract method to process a single page of the PDF and extract the image data.
24
+
25
+ Override this method in the subclass to provide the actual implementation and
26
+ processing logic for each page of the PDF using various PDF processing libraries.
27
+
28
+ Args:
29
+ page_idx (int): The index of the page to process.
30
+ image_save_dir (str): The directory to save the extracted images.
31
+ **kwargs: Additional keyword arguments that may be used by underlying libraries.
32
+
33
+ Returns:
34
+ Dict[str, str]: A dictionary containing the processed page data.
35
+ """
36
  pass
37
 
38
  async def load_data(
 
44
  cleanup: bool = True,
45
  **kwargs,
46
  ) -> List[Dict[str, str]]:
47
+ """
48
+ Asynchronously loads images from a PDF file specified by a URL or local file path.
49
+ The overridden `extract_page_data` method then processes the images,
50
+ and optionally publishes them to a Weave artifact.
51
+
52
+ This function downloads a PDF from a given URL if it does not already exist locally,
53
+ reads the specified range of pages, scans each page's content to extract images, and
54
+ returns a list of dictionaries containing the image file paths and metadata.
55
+
56
+ It uses `PyPDF2` to calculate the number of pages in the PDF and the
57
+ overridden `extract_page_data` method provides the actual implementation to process
58
+ each page, extract the image content from the PDF, and convert it to png format.
59
+ It processes pages concurrently using `asyncio` for efficiency.
60
+
61
+ If a wandb_artifact_name is provided, the processed pages are published to a Weave artifact.
62
+
63
+ Args:
64
+ start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
65
+ end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
66
+ wandb_artifact_name (Optional[str]): The name of the Weave artifact to publish the pages to, if provided.
67
+ image_save_dir (str): The directory to save the extracted images.
68
+ cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
69
+ **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
70
+
71
+ Returns:
72
+ List[Dict[str, Any]]: A list of dictionaries, each containing the image and metadata for a processed page.
73
+ Each dictionary will have the following keys and values:
74
+
75
+ - "page_idx": (int) the index of the page.
76
+ - "document_name": (str) the name of the document.
77
+ - "file_path": (str) the local file path where the PDF is stored.
78
+ - "file_url": (str) the URL of the PDF file.
79
+ - "image_file_path" or "image_file_paths": (str) the local file path where the image/images are stored.
80
+ Raises:
81
+ ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
82
+ """
83
  os.makedirs(image_save_dir, exist_ok=True)
84
  start_page, end_page = self.get_page_indices(start_page, end_page)
85
  pages = []
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py CHANGED
@@ -7,6 +7,19 @@ from .base_img_loader import BaseImageLoader
7
 
8
 
9
  class PDF2ImageLoader(BaseImageLoader):
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  def __init__(self, url: str, document_name: str, document_file_path: str):
12
  super().__init__(url, document_name, document_file_path)
@@ -14,6 +27,17 @@ class PDF2ImageLoader(BaseImageLoader):
14
  async def extract_page_data(
15
  self, page_idx: int, image_save_dir: str, **kwargs
16
  ) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
 
17
  image = convert_from_path(
18
  self.document_file_path,
19
  first_page=page_idx + 1,
 
7
 
8
 
9
  class PDF2ImageLoader(BaseImageLoader):
10
+ """
11
+ `PDF2ImageLoader` is a class that extends the `BaseImageLoader` class to handle the extraction and
12
+ loading of pages from a PDF file as images using the pdf2image library.
13
+
14
+ This class provides functionality to convert specific pages of a PDF document into images
15
+ and optionally publish these images to a Weave artifact.
16
+ It is like a snapshot image version of each of the pages from the PDF.
17
+
18
+ Args:
19
+ url (str): The URL of the PDF document.
20
+ document_name (str): The name of the document.
21
+ document_file_path (str): The path to the PDF file.
22
+ """
23
 
24
  def __init__(self, url: str, document_name: str, document_file_path: str):
25
  super().__init__(url, document_name, document_file_path)
 
27
  async def extract_page_data(
28
  self, page_idx: int, image_save_dir: str, **kwargs
29
  ) -> Dict[str, Any]:
30
+ """
31
+ Extracts a single page from the PDF as an image using pdf2image library.
32
+
33
+ Args:
34
+ page_idx (int): The index of the page to process.
35
+ image_save_dir (str): The directory to save the extracted image.
36
+ **kwargs: Additional keyword arguments that may be used by pdf2image.
37
+
38
+ Returns:
39
+ Dict[str, Any]: A dictionary containing the processed page data.
40
+ """
41
  image = convert_from_path(
42
  self.document_file_path,
43
  first_page=page_idx + 1,
mkdocs.yml CHANGED
@@ -69,7 +69,9 @@ nav:
69
  - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
70
  - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
71
  - Marker: 'document_loader/text_loader/marker_text_loader.md'
72
- - Image Loader: 'document_loader/load_image.md'
 
 
73
  - Chunking: 'chunking.md'
74
  - Retrieval:
75
  - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
 
69
  - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
70
  - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
71
  - Marker: 'document_loader/text_loader/marker_text_loader.md'
72
+ - Image Loader:
73
+ - Base: 'document_loader/image_loader/base_img_loader.md'
74
+ - PDF2Image: 'document_loader/image_loader/pdf2image_img_loader.md'
75
  - Chunking: 'chunking.md'
76
  - Retrieval:
77
  - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'