mratanusarkar committed on
Commit
f9d44bd
·
1 Parent(s): 3d948a1

add: two modules on fitz to handle img extractions

Browse files
medrag_multi_modal/document_loader/__init__.py CHANGED
@@ -1,4 +1,10 @@
1
- from .image_loader import MarkerImageLoader, PDF2ImageLoader, PDFPlumberImageLoader
 
 
 
 
 
 
2
  from .text_loader import (
3
  MarkerTextLoader,
4
  PDFPlumberTextLoader,
@@ -14,4 +20,6 @@ __all__ = [
14
  "PDF2ImageLoader",
15
  "MarkerImageLoader",
16
  "PDFPlumberImageLoader",
 
 
17
  ]
 
1
+ from .image_loader import (
2
+ FitzPILImageLoader,
3
+ MarkerImageLoader,
4
+ PDF2ImageLoader,
5
+ PDFPlumberImageLoader,
6
+ PyMuPDFImageLoader,
7
+ )
8
  from .text_loader import (
9
  MarkerTextLoader,
10
  PDFPlumberTextLoader,
 
20
  "PDF2ImageLoader",
21
  "MarkerImageLoader",
22
  "PDFPlumberImageLoader",
23
+ "PyMuPDFImageLoader",
24
+ "FitzPILImageLoader",
25
  ]
medrag_multi_modal/document_loader/image_loader/__init__.py CHANGED
@@ -1,5 +1,13 @@
 
1
  from .marker_img_loader import MarkerImageLoader
2
  from .pdf2image_img_loader import PDF2ImageLoader
3
  from .pdfplumber_img_loader import PDFPlumberImageLoader
 
4
 
5
- __all__ = ["PDF2ImageLoader", "MarkerImageLoader", "PDFPlumberImageLoader"]
 
 
 
 
 
 
 
1
+ from .fitzpil_img_loader import FitzPILImageLoader
2
  from .marker_img_loader import MarkerImageLoader
3
  from .pdf2image_img_loader import PDF2ImageLoader
4
  from .pdfplumber_img_loader import PDFPlumberImageLoader
5
+ from .pymupdf_img_loader import PyMuPDFImageLoader
6
 
7
+ __all__ = [
8
+ "PDF2ImageLoader",
9
+ "MarkerImageLoader",
10
+ "PDFPlumberImageLoader",
11
+ "PyMuPDFImageLoader",
12
+ "FitzPILImageLoader",
13
+ ]
medrag_multi_modal/document_loader/image_loader/fitzpil_img_loader.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ from typing import Any, Dict
4
+
5
+ import fitz
6
+ from PIL import Image, ImageOps, UnidentifiedImageError
7
+
8
+ from .base_img_loader import BaseImageLoader
9
+
10
+
11
class FitzPILImageLoader(BaseImageLoader):
    """
    `FitzPILImageLoader` is a class that extends the `BaseImageLoader` class to handle the extraction
    of the images embedded in the pages of a PDF file using the fitz (PyMuPDF) and PIL libraries.

    fitz is used to locate and pull the raw bytes of every image referenced by a page, and PIL is
    used to validate those bytes and, for formats other than PNG/JPEG, to re-encode them as PNG.
    Publishing the images to a WandB artifact (as in the example below) is driven by `load_data`,
    which is not defined in this class and is presumably inherited from `BaseImageLoader`.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        import wandb
        from medrag_multi_modal.document_loader.image_loader import FitzPILImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = FitzPILImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=32,
                end_page=37,
                wandb_artifact_name="grays-anatomy-images",
                cleanup=False,
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Extracts every image embedded in a single PDF page and saves each one to disk.

        Images whose native format is PNG or JPEG are written byte-for-byte as extracted by fitz.
        Any other format is re-encoded to PNG through PIL. Images that PIL cannot identify, or
        that fail with an `OSError` while being processed or written, are reported on stdout
        and skipped.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted images.
            **kwargs: Additional keyword arguments; accepted for interface compatibility
                and not used by this implementation.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_paths": (list) the local file paths where the images are stored.
        """
        image_file_paths = []

        pdf_document = fitz.open(self.document_file_path)
        try:
            page = pdf_document.load_page(page_idx)

            images = page.get_images(full=True)
            for img_idx, image in enumerate(images):
                # The first entry of each image record is the xref used for extraction.
                xref = image[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                try:
                    with Image.open(io.BytesIO(image_bytes)) as source_image:
                        img = source_image

                        if img.mode in ["L"]:
                            # Greyscale images look inverted; this still needs to be
                            # tested on other PDFs.
                            img = ImageOps.invert(img)

                        if img.mode == "CMYK":
                            img = img.convert("RGB")

                        # NOTE(review): the inversion / CMYK conversion above only reaches
                        # the output for re-encoded images; PNG/JPEG images are written
                        # from the raw bytes below. Confirm this is the intended behaviour.
                        if image_ext not in ["png", "jpg", "jpeg"]:
                            image_ext = "png"
                            image_file_name = f"page{page_idx}_fig{img_idx}.{image_ext}"
                            image_file_path = os.path.join(
                                image_save_dir, image_file_name
                            )

                            img.save(image_file_path, format="PNG")
                        else:
                            image_file_name = f"page{page_idx}_fig{img_idx}.{image_ext}"
                            image_file_path = os.path.join(
                                image_save_dir, image_file_name
                            )

                            with open(image_file_path, "wb") as image_file:
                                image_file.write(image_bytes)

                    image_file_paths.append(image_file_path)

                except (UnidentifiedImageError, OSError) as e:
                    print(
                        f"Skipping image at page {page_idx}, fig {img_idx} due to an error: {e}"
                    )
        finally:
            # Always release the document handle, even when page loading or
            # image extraction raises.
            pdf_document.close()

        return {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_paths": image_file_paths,
        }
medrag_multi_modal/document_loader/image_loader/pymupdf_img_loader.py CHANGED
@@ -1,9 +1,9 @@
 
1
  import os
2
  from typing import Any, Dict
3
 
4
  import fitz
5
- from PIL import Image, ImageOps, UnidentifiedImageError
6
- import io
7
 
8
  from .base_img_loader import BaseImageLoader
9
 
@@ -76,7 +76,7 @@ class PyMuPDFImageLoader(BaseImageLoader):
76
  image_file_paths = []
77
 
78
  pdf_document = fitz.open(self.document_file_path)
79
- page = pdf_document.load_page(page_idx)
80
 
81
  images = page.get_images(full=True)
82
  for img_idx, image in enumerate(images):
@@ -85,33 +85,33 @@ class PyMuPDFImageLoader(BaseImageLoader):
85
  image_bytes = base_image["image"]
86
  image_ext = base_image["ext"]
87
 
88
- try:
89
- img = Image.open(io.BytesIO(image_bytes))
90
-
91
- if img.mode in ['1', 'P']:
92
- img = ImageOps.invert(img.convert('L'))
93
-
94
- if img.mode == 'CMYK':
95
- img = img.convert('RGB')
96
-
97
- if image_ext not in ['png', 'jpg', 'jpeg']:
98
- image_ext = 'png'
99
- image_file_name = f"page{page_idx}_fig{img_idx}.png"
100
- image_file_path = os.path.join(image_save_dir, image_file_name)
101
-
102
- img.save(image_file_path, format="PNG")
103
- else:
104
- image_file_name = f"page{page_idx}_fig{img_idx}.{image_ext}"
105
- image_file_path = os.path.join(image_save_dir, image_file_name)
106
-
107
- with open(image_file_path, "wb") as image_file:
108
- image_file.write(image_bytes)
109
-
110
- image_file_paths.append(image_file_path)
111
-
112
- except (UnidentifiedImageError, OSError) as e:
113
- print(f"Skipping image at page {page_idx}, fig {img_idx} due to an error: {e}")
114
- continue
115
 
116
  pdf_document.close()
117
 
 
1
+ import io
2
  import os
3
  from typing import Any, Dict
4
 
5
  import fitz
6
+ from PIL import Image
 
7
 
8
  from .base_img_loader import BaseImageLoader
9
 
 
76
  image_file_paths = []
77
 
78
  pdf_document = fitz.open(self.document_file_path)
79
+ page = pdf_document[page_idx]
80
 
81
  images = page.get_images(full=True)
82
  for img_idx, image in enumerate(images):
 
85
  image_bytes = base_image["image"]
86
  image_ext = base_image["ext"]
87
 
88
+ if image_ext == "jb2":
89
+ image_ext = "png"
90
+ elif image_ext == "jpx":
91
+ image_ext = "jpg"
92
+
93
+ image_file_name = f"page{page_idx}_fig{img_idx}.{image_ext}"
94
+ image_file_path = os.path.join(image_save_dir, image_file_name)
95
+
96
+ # For JBIG2 and JPEG2000, we need to convert the image
97
+ if base_image["ext"] in ["jb2", "jpx"]:
98
+ try:
99
+ pix = fitz.Pixmap(image_bytes)
100
+ pix.save(image_file_path)
101
+ except Exception as err_fitz:
102
+ print(f"Error processing image with fitz: {err_fitz}")
103
+ # Fallback to using PIL for image conversion
104
+ try:
105
+ img = Image.open(io.BytesIO(image_bytes))
106
+ img.save(image_file_path)
107
+ except Exception as err_pil:
108
+ print(f"Failed to process image with PIL: {err_pil}")
109
+ continue # Skip this image if both methods fail
110
+ else:
111
+ with open(image_file_path, "wb") as image_file:
112
+ image_file.write(image_bytes)
113
+
114
+ image_file_paths.append(image_file_path)
115
 
116
  pdf_document.close()
117