mratanusarkar committed on
Commit
5c74069
·
1 Parent(s): 694a076

add: base image loader + pdf2img from load_image

Browse files
medrag_multi_modal/document_loader/__init__.py CHANGED
@@ -1,5 +1,4 @@
1
- from .load_image import ImageLoader
2
- from .load_text_image import TextImageLoader
3
  from .text_loader import (
4
  MarkerTextLoader,
5
  PDFPlumberTextLoader,
@@ -12,6 +11,5 @@ __all__ = [
12
  "PyPDF2TextLoader",
13
  "PDFPlumberTextLoader",
14
  "MarkerTextLoader",
15
- "ImageLoader",
16
- "TextImageLoader",
17
  ]
 
1
+ from .image_loader import PDF2ImageLoader
 
2
  from .text_loader import (
3
  MarkerTextLoader,
4
  PDFPlumberTextLoader,
 
11
  "PyPDF2TextLoader",
12
  "PDFPlumberTextLoader",
13
  "MarkerTextLoader",
14
+ "PDF2ImageLoader",
 
15
  ]
medrag_multi_modal/document_loader/image_loader/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .base_img_loader import BaseImageLoader
2
+ from .pdf2image_img_loader import PDF2ImageLoader
3
+
4
+ __all__ = ["PDF2ImageLoader", "BaseImageLoader"]
medrag_multi_modal/document_loader/image_loader/base_img_loader.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ from abc import abstractmethod
4
+ from typing import Dict, List, Optional
5
+
6
+ import rich
7
+
8
+ import wandb
9
+ from medrag_multi_modal.document_loader.text_loader.base_text_loader import (
10
+ BaseTextLoader,
11
+ )
12
+
13
+
14
class BaseImageLoader(BaseTextLoader):
    """Base class for loaders that export document pages as image files.

    Subclasses implement `extract_page_data` for a single page; `load_data`
    runs it over a page range, optionally uploads the image directory as a
    Weights & Biases artifact, and optionally removes the image files again.

    Args:
        url (str): Forwarded unchanged to the parent loader's constructor.
        document_name (str): Forwarded unchanged to the parent loader's constructor.
        document_file_path (str): Forwarded unchanged to the parent loader's constructor.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    @abstractmethod
    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, str]:
        """Process a single page and return its data as a dictionary.

        Args:
            page_idx (int): Index of the page to process, taken from the range
                that `load_data` iterates over.
            image_save_dir (str): Directory that `load_data` has already created
                and passes through for this page.
            **kwargs: Extra keyword arguments forwarded verbatim from `load_data`.

        Returns:
            Dict[str, str]: The data describing the processed page.
        """
        pass

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        wandb_artifact_name: Optional[str] = None,
        image_save_dir: str = "./images",
        cleanup: bool = True,
        **kwargs,
    ) -> List[Dict[str, str]]:
        """Process a range of pages and return their data in page order.

        The requested bounds are resolved through `self.get_page_indices`, and
        every index in `range(start_page, end_page)` of the resolved bounds is
        passed to `extract_page_data`.

        Args:
            start_page (Optional[int]): Requested first page, resolved by
                `get_page_indices`.
            end_page (Optional[int]): Requested end page, resolved by
                `get_page_indices`.
            wandb_artifact_name (Optional[str]): When truthy, `image_save_dir`
                is added to a `wandb.Artifact` of type "dataset" with this name
                and the artifact is saved.
            image_save_dir (str): Directory for the image files; created if it
                does not exist.
            cleanup (bool): When true, every regular file directly inside
                `image_save_dir` is deleted before returning. This includes
                files that were already there before the call, and any file
                paths recorded in the returned page data may then no longer
                exist.
            **kwargs: Forwarded verbatim to `extract_page_data`.

        Returns:
            List[Dict[str, str]]: One entry per processed page, ordered by
            ascending page index.
        """
        os.makedirs(image_save_dir, exist_ok=True)
        start_page, end_page = self.get_page_indices(start_page, end_page)
        processed_pages_counter: int = 1
        total_pages = end_page - start_page

        async def process_page(page_idx):
            nonlocal processed_pages_counter
            page_data = await self.extract_page_data(page_idx, image_save_dir, **kwargs)
            rich.print(
                f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
            )
            processed_pages_counter += 1
            return page_data

        # gather() returns results in the order of its arguments, so the page
        # list is deterministic; appending from as_completed() tasks would
        # yield them in whatever order they happened to finish.
        pages = list(
            await asyncio.gather(
                *(process_page(page_idx) for page_idx in range(start_page, end_page))
            )
        )

        if wandb_artifact_name:
            artifact = wandb.Artifact(name=wandb_artifact_name, type="dataset")
            artifact.add_dir(local_path=image_save_dir)
            artifact.save()
            rich.print("Artifact saved and uploaded to wandb!")

        if cleanup:
            # Only regular files at the top level are removed; sub-directories
            # and the directory itself are left in place.
            for file in os.listdir(image_save_dir):
                file_path = os.path.join(image_save_dir, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
        return pages
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, Dict
3
+
4
+ from pdf2image.pdf2image import convert_from_path
5
+
6
+ from .base_img_loader import BaseImageLoader
7
+
8
+
9
class PDF2ImageLoader(BaseImageLoader):
    """Image loader that renders PDF pages to PNG files with `pdf2image`.

    Args:
        url (str): Forwarded unchanged to the parent loader's constructor.
        document_name (str): Forwarded unchanged to the parent loader's constructor.
        document_file_path (str): Forwarded unchanged to the parent loader's constructor.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """Render one page of the document to a PNG file and describe it.

        Args:
            page_idx (int): Index of the page to render; `page_idx + 1` is
                passed to `convert_from_path` as both `first_page` and
                `last_page`.
            image_save_dir (str): Directory the PNG is written into, as
                `page{page_idx}.png`.
            **kwargs: Extra keyword arguments forwarded to `convert_from_path`.

        Returns:
            Dict[str, Any]: The page index, document name, document file path,
            document URL and the path of the saved image.
        """
        # NOTE(review): convert_from_path is invoked synchronously and this
        # coroutine contains no await, so it appears to run to completion
        # without yielding — confirm whether that is intended.
        page_number = page_idx + 1
        rendered_pages = convert_from_path(
            self.document_file_path,
            first_page=page_number,
            last_page=page_number,
            **kwargs,
        )
        page_image = rendered_pages[0]

        image_file_path = os.path.join(image_save_dir, f"page{page_idx}.png")
        page_image.save(image_file_path)

        page_data = {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_path": image_file_path,
        }
        return page_data
medrag_multi_modal/document_loader/load_image.py DELETED
@@ -1,131 +0,0 @@
1
- import asyncio
2
- import os
3
- from typing import Optional
4
-
5
- import rich
6
- import wandb
7
- import weave
8
- from pdf2image.pdf2image import convert_from_path
9
- from PIL import Image
10
-
11
- from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
12
-
13
-
14
- class ImageLoader(PyMuPDF4LLMTextLoader):
15
- """
16
- `ImageLoader` is a class that extends the `TextLoader` class to handle the extraction and
17
- loading of pages from a PDF file as images.
18
-
19
- This class provides functionality to convert specific pages of a PDF document into images
20
- and optionally publish these images to a Weave dataset.
21
-
22
- !!! example "Example Usage"
23
- ```python
24
- import asyncio
25
-
26
- import wandb
27
- from dotenv import load_dotenv
28
-
29
- from medrag_multi_modal.document_loader import ImageLoader
30
-
31
- load_dotenv()
32
- wandb.init(project="medrag-multi-modal", entity="ml-colabs")
33
- url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
34
- loader = ImageLoader(
35
- url=url,
36
- document_name="Gray's Anatomy",
37
- document_file_path="grays_anatomy.pdf",
38
- )
39
- asyncio.run(
40
- loader.load_data(
41
- start_page=31,
42
- end_page=33,
43
- dataset_name="grays-anatomy-images",
44
- )
45
- )
46
- ```
47
-
48
- Args:
49
- url (str): The URL of the PDF document.
50
- document_name (str): The name of the document.
51
- document_file_path (str): The path to the PDF file.
52
- """
53
-
54
- def __init__(self, url: str, document_name: str, document_file_path: str):
55
- super().__init__(url, document_name, document_file_path)
56
-
57
- def extract_data_from_pdf_file(
58
- self, pdf_file: str, page_number: int
59
- ) -> Image.Image:
60
- image = convert_from_path(
61
- pdf_file, first_page=page_number + 1, last_page=page_number + 1
62
- )[0]
63
- return image
64
-
65
- async def load_data(
66
- self,
67
- start_page: Optional[int] = None,
68
- end_page: Optional[int] = None,
69
- image_save_dir: str = "./images",
70
- dataset_name: Optional[str] = None,
71
- ):
72
- """
73
- Asynchronously loads images from a PDF file specified by a URL or local file path,
74
- processes the images for the specified range of pages, and optionally publishes them
75
- to a Weave dataset.
76
-
77
- This function reads the specified range of pages from a PDF document, converts each page
78
- to an image using the `pdf2image` library, and returns a list of dictionaries containing
79
- the image and metadata for each processed page. It processes pages concurrently using
80
- `asyncio` for efficiency. If a `dataset_name` is provided, the processed page images are
81
- published to Weights & Biases artifact and the corresponding metadata to a Weave dataset
82
- with the specified name.
83
-
84
- Args:
85
- start_page (Optional[int]): The starting page index (0-based) to process.
86
- end_page (Optional[int]): The ending page index (0-based) to process.
87
- dataset_name (Optional[str]): The name of the Weave dataset to publish the
88
- processed images to. Defaults to None.
89
-
90
- Returns:
91
- list[dict]: A list of dictionaries, each containing the image and metadata for a
92
- processed page.
93
-
94
- Raises:
95
- ValueError: If the specified start_page or end_page is out of bounds of the document's
96
- page count.
97
- """
98
- os.makedirs(image_save_dir, exist_ok=True)
99
- start_page, end_page = self.get_page_indices(start_page, end_page)
100
- pages = []
101
- processed_pages_counter: int = 1
102
- total_pages = end_page - start_page
103
-
104
- async def process_page(page_idx):
105
- nonlocal processed_pages_counter
106
- image = convert_from_path(
107
- self.document_file_path,
108
- first_page=page_idx + 1,
109
- last_page=page_idx + 1,
110
- )[0]
111
- pages.append(
112
- {
113
- "page_idx": page_idx,
114
- "document_name": self.document_name,
115
- "file_path": self.document_file_path,
116
- "file_url": self.url,
117
- }
118
- )
119
- image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
120
- rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
121
- processed_pages_counter += 1
122
-
123
- tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
124
- for task in asyncio.as_completed(tasks):
125
- await task
126
- if dataset_name:
127
- artifact = wandb.Artifact(name=dataset_name, type="dataset")
128
- artifact.add_dir(local_path=image_save_dir)
129
- artifact.save()
130
- weave.publish(weave.Dataset(name=dataset_name, rows=pages))
131
- return pages