geekyrakshit commited on
Commit
bf14736
·
unverified ·
2 Parent(s): 694a076 ff75fe0

Merge pull request #14 from soumik12345/feat/ensemble-of-image-loaders

Browse files
docs/document_loader/image_loader/base_img_loader.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ## Load images from PDF files
2
+
3
+ ::: medrag_multi_modal.document_loader.image_loader.base_img_loader
docs/document_loader/image_loader/fitzpil_img_loader.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Load images from PDF files (using Fitz & PIL)
2
+
3
+ ??? note "Note"
4
+ **Underlying Library:** `fitz` & `pillow`
5
+
6
+ Extract images from PDF files using `fitz` and `pillow`.
7
+
8
+ Use it in our library with:
9
+ ```python
10
+ from medrag_multi_modal.document_loader.image_loader import FitzPILImageLoader
11
+ ```
12
+
13
+ For more details, please refer to the sources below.
14
+
15
+ **Sources:**
16
+
17
+ - [Docs](https://pymupdf.readthedocs.io/en/latest/intro.html)
18
+ - [GitHub](https://github.com/pymupdf/PyMuPDF)
19
+ - [PyPI](https://pypi.org/project/PyMuPDF/)
20
+ - [PyPI](https://pypi.org/project/pillow/)
21
+
22
+ ::: medrag_multi_modal.document_loader.image_loader.fitzpil_img_loader
docs/document_loader/image_loader/marker_img_loader.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Load images from PDF files (using Marker)
2
+
3
+ ??? note "Note"
4
+ **Underlying Library:** `marker-pdf`
5
+
6
+ Extract images from PDF files using `marker-pdf`.
7
+
8
+ Use it in our library with:
9
+ ```python
10
+ from medrag_multi_modal.document_loader.image_loader import MarkerImageLoader
11
+ ```
12
+
13
+ For details, please refer to the sources below.
14
+
15
+ **Sources:**
16
+
17
+ - [DataLab](https://www.datalab.to)
18
+ - [GitHub](https://github.com/VikParuchuri/marker)
19
+ - [PyPI](https://pypi.org/project/marker-pdf/)
20
+
21
+ ::: medrag_multi_modal.document_loader.image_loader.marker_img_loader
docs/document_loader/image_loader/pdf2image_img_loader.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Load images from PDF files (using PDF2Image)
2
+
3
+ !!! danger "Warning"
4
+ Unlike other image extraction methods in `document_loader.image_loader`, this loader does not extract embedded images from the PDF.
5
+ Instead, it creates a snapshot image version of each selected page from the PDF.
6
+
7
+ ??? note "Note"
8
+ **Underlying Library:** `pdf2image`
9
+
10
+ Extract images from PDF files using `pdf2image`.
11
+
12
+
13
+ Use it in our library with:
14
+ ```python
15
+ from medrag_multi_modal.document_loader.image_loader import PDF2ImageLoader
16
+ ```
17
+
18
+ For details and available `**kwargs`, please refer to the sources below.
19
+
20
+ **Sources:**
21
+
22
+ - [Docs](https://belval.github.io/pdf2image/)
23
+ - [GitHub](https://github.com/Belval/pdf2image)
24
+ - [PyPI](https://pypi.org/project/pdf2image/)
25
+
26
+ ::: medrag_multi_modal.document_loader.image_loader.pdf2image_img_loader
docs/document_loader/image_loader/pdfplumber_img_loader.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Load images from PDF files (using PDFPlumber)
2
+
3
+ ??? note "Note"
4
+ **Underlying Library:** `pdfplumber`
5
+
6
+ Extract images from PDF files using `pdfplumber`.
7
+
8
+ You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
9
+
10
+ Use it in our library with:
11
+ ```python
12
+ from medrag_multi_modal.document_loader.image_loader import PDFPlumberImageLoader
13
+ ```
14
+
15
+ For details, please refer to the sources below.
16
+
17
+ **Sources:**
18
+
19
+ - [GitHub](https://github.com/jsvine/pdfplumber)
20
+ - [PyPI](https://pypi.org/project/pdfplumber/)
21
+
22
+ ::: medrag_multi_modal.document_loader.image_loader.pdfplumber_img_loader
docs/document_loader/image_loader/pymupdf_img_loader.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Load images from PDF files (using PyMuPDF)
2
+
3
+ ??? note "Note"
4
+ **Underlying Library:** `pymupdf`
5
+
6
+ PyMuPDF is a high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents.
7
+
8
+ You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
9
+
10
+ Use it in our library with:
11
+ ```python
12
+ from medrag_multi_modal.document_loader.image_loader import PyMuPDFImageLoader
13
+ ```
14
+
15
+ For details, please refer to the sources below.
16
+
17
+ **Sources:**
18
+
19
+ - [Docs](https://pymupdf.readthedocs.io/en/latest/)
20
+ - [GitHub](https://github.com/pymupdf/PyMuPDF)
21
+ - [PyPI](https://pypi.org/project/PyMuPDF/)
22
+
23
+ ::: medrag_multi_modal.document_loader.image_loader.pymupdf_img_loader
docs/document_loader/load_image.md DELETED
@@ -1,3 +0,0 @@
1
- # Load PDF pages as images
2
-
3
- ::: medrag_multi_modal.document_loader.load_image
 
 
 
 
medrag_multi_modal/document_loader/__init__.py CHANGED
@@ -1,5 +1,10 @@
1
- from .load_image import ImageLoader
2
- from .load_text_image import TextImageLoader
 
 
 
 
 
3
  from .text_loader import (
4
  MarkerTextLoader,
5
  PDFPlumberTextLoader,
@@ -12,6 +17,9 @@ __all__ = [
12
  "PyPDF2TextLoader",
13
  "PDFPlumberTextLoader",
14
  "MarkerTextLoader",
15
- "ImageLoader",
16
- "TextImageLoader",
 
 
 
17
  ]
 
1
+ from .image_loader import (
2
+ FitzPILImageLoader,
3
+ MarkerImageLoader,
4
+ PDF2ImageLoader,
5
+ PDFPlumberImageLoader,
6
+ PyMuPDFImageLoader,
7
+ )
8
  from .text_loader import (
9
  MarkerTextLoader,
10
  PDFPlumberTextLoader,
 
17
  "PyPDF2TextLoader",
18
  "PDFPlumberTextLoader",
19
  "MarkerTextLoader",
20
+ "PDF2ImageLoader",
21
+ "MarkerImageLoader",
22
+ "PDFPlumberImageLoader",
23
+ "PyMuPDFImageLoader",
24
+ "FitzPILImageLoader",
25
  ]
medrag_multi_modal/document_loader/image_loader/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .fitzpil_img_loader import FitzPILImageLoader
2
+ from .marker_img_loader import MarkerImageLoader
3
+ from .pdf2image_img_loader import PDF2ImageLoader
4
+ from .pdfplumber_img_loader import PDFPlumberImageLoader
5
+ from .pymupdf_img_loader import PyMuPDFImageLoader
6
+
7
+ __all__ = [
8
+ "PDF2ImageLoader",
9
+ "MarkerImageLoader",
10
+ "PDFPlumberImageLoader",
11
+ "PyMuPDFImageLoader",
12
+ "FitzPILImageLoader",
13
+ ]
medrag_multi_modal/document_loader/image_loader/base_img_loader.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ from abc import abstractmethod
4
+ from typing import Dict, List, Optional
5
+
6
+ import rich
7
+
8
+ import wandb
9
+ from medrag_multi_modal.document_loader.text_loader.base_text_loader import (
10
+ BaseTextLoader,
11
+ )
12
+
13
+
14
class BaseImageLoader(BaseTextLoader):
    """
    Abstract base class for loaders that extract images from the pages of a PDF.

    Subclasses implement `extract_page_data` with a specific PDF library, while
    `load_data` drives the per-page extraction and the optional publication of
    the extracted images as a WandB artifact.

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    @abstractmethod
    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, str]:
        """
        Abstract method to process a single page of the PDF and extract the image data.

        Override this method in the subclass to provide the actual implementation and
        processing logic for each page of the PDF using various PDF processing libraries.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted images.
            **kwargs: Additional keyword arguments that may be used by underlying libraries.

        Returns:
            Dict[str, str]: A dictionary containing the processed page data.
        """
        pass

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        wandb_artifact_name: Optional[str] = None,
        image_save_dir: str = "./images",
        cleanup: bool = True,
        **kwargs,
    ) -> List[Dict[str, str]]:
        """
        Asynchronously extracts images from the selected pages of the PDF and
        optionally publishes them to a WandB artifact.

        The page range is resolved by the inherited `get_page_indices` helper, and
        every page in `range(start_page, end_page)` is handed to the overridden
        `extract_page_data` method, which performs the actual extraction and writes
        the image files into `image_save_dir`. The per-page results are returned
        in ascending page order.

        If a `wandb_artifact_name` is provided, the whole `image_save_dir` directory
        is added to a WandB artifact of type "dataset" and saved.

        Args:
            start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
            end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
            wandb_artifact_name (Optional[str]): The name of the WandB artifact to publish the pages to, if provided.
            image_save_dir (str): The directory to save the extracted images.
            cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
            **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.

        Returns:
            List[Dict[str, Any]]: A list of dictionaries, each containing the image and metadata for a processed page.
            Each dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_path" or "image_file_paths": (str) the local file path where the image/images are stored.

        Raises:
            ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
                (Raised by `get_page_indices` per the original documentation; not visible in this class.)
        """
        os.makedirs(image_save_dir, exist_ok=True)
        start_page, end_page = self.get_page_indices(start_page, end_page)
        processed_pages_counter: int = 1
        total_pages = end_page - start_page

        async def process_page(page_idx):
            nonlocal processed_pages_counter
            page_data = await self.extract_page_data(page_idx, image_save_dir, **kwargs)
            rich.print(
                f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
            )
            processed_pages_counter += 1
            return page_data

        # gather() returns results in the order the awaitables were passed, so
        # the output is deterministic and sorted by page index regardless of
        # the order in which individual pages finish.
        pages = list(
            await asyncio.gather(
                *(process_page(page_idx) for page_idx in range(start_page, end_page))
            )
        )

        if wandb_artifact_name:
            artifact = wandb.Artifact(name=wandb_artifact_name, type="dataset")
            artifact.add_dir(local_path=image_save_dir)
            artifact.save()
            rich.print("Artifact saved and uploaded to wandb!")

            # NOTE(review): cleanup is performed only after an upload, matching
            # the `cleanup` argument documentation — confirm this nesting
            # against the intended behavior.
            if cleanup:
                for file in os.listdir(image_save_dir):
                    file_path = os.path.join(image_save_dir, file)
                    if os.path.isfile(file_path):
                        os.remove(file_path)
        return pages
medrag_multi_modal/document_loader/image_loader/fitzpil_img_loader.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ from typing import Any, Dict
4
+
5
+ import fitz
6
+ from PIL import Image, ImageOps, UnidentifiedImageError
7
+
8
+ from .base_img_loader import BaseImageLoader
9
+
10
+
11
class FitzPILImageLoader(BaseImageLoader):
    """
    `FitzPILImageLoader` is a class that extends the `BaseImageLoader` class to handle the extraction and
    loading of pages from a PDF file as images using the fitz and PIL libraries.

    This class provides functionality to extract images from a PDF file using fitz and PIL libraries,
    and optionally publish these images to a WandB artifact.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        import wandb
        from medrag_multi_modal.document_loader.image_loader import FitzPILImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = FitzPILImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=32,
                end_page=37,
                wandb_artifact_name="grays-anatomy-images-fitzpil",
                cleanup=False,
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Extracts the images embedded in a single PDF page using fitz and PIL.

        Every image referenced by the page is decoded with PIL to verify that it
        is readable. Images whose native extension is not png/jpg/jpeg are
        re-encoded and saved as PNG; all others are written to disk with their
        original bytes. Images that PIL cannot open or save are skipped.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted image.
            **kwargs: Additional keyword arguments; accepted for interface
                compatibility but not used by this implementation.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_paths": (list) the local file paths where the images are stored.
        """
        image_file_paths = []

        # The context manager guarantees the document is closed even when page
        # loading or image extraction raises.
        with fitz.open(self.document_file_path) as pdf_document:
            page = pdf_document.load_page(page_idx)

            for img_idx, image in enumerate(page.get_images(full=True)):
                xref = image[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                try:
                    img = Image.open(io.BytesIO(image_bytes))

                    if img.mode in ["L"]:
                        # Greyscale images look inverted; this still needs to
                        # be tested on other PDFs.
                        img = ImageOps.invert(img)

                    if img.mode == "CMYK":
                        img = img.convert("RGB")

                    if image_ext not in ["png", "jpg", "jpeg"]:
                        image_file_name = f"page{page_idx}_fig{img_idx}.png"
                        image_file_path = os.path.join(image_save_dir, image_file_name)

                        img.save(image_file_path, format="PNG")
                    else:
                        # NOTE(review): the raw bytes are written here, so the
                        # inversion / CMYK conversion above does not affect
                        # png/jpg/jpeg outputs — confirm this is intended.
                        image_file_name = f"page{page_idx}_fig{img_idx}.{image_ext}"
                        image_file_path = os.path.join(image_save_dir, image_file_name)

                        with open(image_file_path, "wb") as image_file:
                            image_file.write(image_bytes)

                    image_file_paths.append(image_file_path)

                except (UnidentifiedImageError, OSError) as e:
                    print(
                        f"Skipping image at page {page_idx}, fig {img_idx} due to an error: {e}"
                    )
                    continue

        return {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_paths": image_file_paths,
        }
medrag_multi_modal/document_loader/image_loader/marker_img_loader.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, Dict
3
+
4
+ from marker.convert import convert_single_pdf
5
+ from marker.models import load_all_models
6
+
7
+ from .base_img_loader import BaseImageLoader
8
+
9
+
10
class MarkerImageLoader(BaseImageLoader):
    """
    Image loader built on the marker library.

    `MarkerImageLoader` extends `BaseImageLoader` and uses marker to extract
    the images found on the pages of a PDF file. The extracted images can
    optionally be published to a WandB artifact through `load_data`.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        import wandb
        from medrag_multi_modal.document_loader.image_loader import MarkerImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = MarkerImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=36,
                wandb_artifact_name="grays-anatomy-images-marker",
                cleanup=False,
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)
        # Models are loaded once per loader and reused for every page.
        self.model_lst = load_all_models()

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Runs marker on a single page and saves every image it returns as PNG.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted images.
            **kwargs: Additional keyword arguments forwarded to marker's
                `convert_single_pdf`.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_paths": (list) the local file paths where the images are stored.
            - "meta": the metadata object returned by marker for the conversion.
        """
        # Restrict the conversion to exactly one page, starting at `page_idx`.
        _text, extracted_images, conversion_meta = convert_single_pdf(
            self.document_file_path,
            self.model_lst,
            max_pages=1,
            batch_multiplier=1,
            start_page=page_idx,
            ocr_all_pages=True,
            **kwargs,
        )

        saved_paths = []
        for fig_number, figure in enumerate(extracted_images.values()):
            target_path = os.path.join(
                image_save_dir, f"page{page_idx}_fig{fig_number}.png"
            )
            figure.save(target_path, "png")
            saved_paths.append(target_path)

        page_record = {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_paths": saved_paths,
            "meta": conversion_meta,
        }
        return page_record
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, Dict
3
+
4
+ from pdf2image.pdf2image import convert_from_path
5
+
6
+ from .base_img_loader import BaseImageLoader
7
+
8
+
9
class PDF2ImageLoader(BaseImageLoader):
    """
    Image loader built on the pdf2image library.

    `PDF2ImageLoader` extends `BaseImageLoader` and converts selected pages of
    a PDF document into images, i.e. it produces a snapshot image of each whole
    page. The images can optionally be published to a WandB artifact through
    `load_data`.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        import wandb
        from medrag_multi_modal.document_loader.image_loader import PDF2ImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = PDF2ImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=36,
                wandb_artifact_name="grays-anatomy-images-pdf2image",
                cleanup=False,
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Renders a single PDF page to an image file using pdf2image.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the rendered image.
            **kwargs: Additional keyword arguments forwarded to pdf2image's
                `convert_from_path`.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_path": (str) the local file path where the image is stored.
        """
        # pdf2image receives `page_idx + 1` for both bounds, so exactly one
        # page is rendered.
        page_number = page_idx + 1
        rendered_pages = convert_from_path(
            self.document_file_path,
            first_page=page_number,
            last_page=page_number,
            **kwargs,
        )
        snapshot = rendered_pages[0]

        target_path = os.path.join(image_save_dir, f"page{page_idx}.png")
        snapshot.save(target_path)

        page_record = {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_path": target_path,
        }
        return page_record
medrag_multi_modal/document_loader/image_loader/pdfplumber_img_loader.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, Dict
3
+
4
+ import pdfplumber
5
+
6
+ from .base_img_loader import BaseImageLoader
7
+
8
+
9
class PDFPlumberImageLoader(BaseImageLoader):
    """
    Image loader built on the pdfplumber library.

    `PDFPlumberImageLoader` extends `BaseImageLoader` and uses pdfplumber to
    extract the images found on the pages of a PDF file. The images can
    optionally be published to a WandB artifact through `load_data`.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        import wandb
        from medrag_multi_modal.document_loader.image_loader import PDFPlumberImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = PDFPlumberImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=32,
                end_page=37,
                wandb_artifact_name="grays-anatomy-images-pdfplumber",
                cleanup=False,
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Crops every image region reported by pdfplumber on one page and saves
        each crop as a PNG rendered at a resolution of 300.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted images.
            **kwargs: Additional keyword arguments; accepted for interface
                compatibility but not used by this implementation.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_paths": (list) the local file paths where the images are stored.
        """
        with pdfplumber.open(self.document_file_path) as pdf:
            page = pdf.pages[page_idx]
            page_figures = page.images

            saved_paths = []
            for fig_number, figure in enumerate(page_figures):
                # Bounding box of the figure as reported by pdfplumber.
                bounding_box = (
                    figure["x0"],
                    figure["top"],
                    figure["x1"],
                    figure["bottom"],
                )
                cropped_region = page.crop(bounding_box)
                rendered = cropped_region.to_image(resolution=300)

                target_path = os.path.join(
                    image_save_dir, f"page{page_idx}_fig{fig_number}.png"
                )
                rendered.save(target_path, "png")
                saved_paths.append(target_path)

        page_record = {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_paths": saved_paths,
        }
        return page_record
medrag_multi_modal/document_loader/image_loader/pymupdf_img_loader.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ from typing import Any, Dict
4
+
5
+ import fitz
6
+ from PIL import Image
7
+
8
+ from .base_img_loader import BaseImageLoader
9
+
10
+
11
class PyMuPDFImageLoader(BaseImageLoader):
    """
    `PyMuPDFImageLoader` is a class that extends the `BaseImageLoader` class to handle the extraction and
    loading of pages from a PDF file as images using the pymupdf library.

    This class provides functionality to extract images from a PDF file using pymupdf library,
    and optionally publish these images to a WandB artifact.

    !!! example "Example Usage"
        ```python
        import asyncio

        import weave

        import wandb
        from medrag_multi_modal.document_loader.image_loader import PyMuPDFImageLoader

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = PyMuPDFImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=32,
                end_page=37,
                wandb_artifact_name="grays-anatomy-images-pymupdf",
                cleanup=False,
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """
        Extracts the images embedded in a single PDF page using pymupdf.

        Images reported with the "jb2" or "jpx" extension are converted
        (saved as png and jpg respectively) via `fitz.Pixmap`, falling back to
        PIL if that fails; an image is skipped when both conversions fail.
        All other images are written to disk with their original bytes.

        Args:
            page_idx (int): The index of the page to process.
            image_save_dir (str): The directory to save the extracted image.
            **kwargs: Additional keyword arguments; accepted for interface
                compatibility but not used by this implementation.

        Returns:
            Dict[str, Any]: A dictionary containing the processed page data.
            The dictionary will have the following keys and values:

            - "page_idx": (int) the index of the page.
            - "document_name": (str) the name of the document.
            - "file_path": (str) the local file path where the PDF is stored.
            - "file_url": (str) the URL of the PDF file.
            - "image_file_paths": (list) the local file paths where the images are stored.
        """
        # Extensions that require conversion, mapped to the extension of the
        # file that is written instead.
        converted_extensions = {"jb2": "png", "jpx": "jpg"}
        image_file_paths = []

        # The context manager guarantees the document is closed even when page
        # access or image extraction raises.
        with fitz.open(self.document_file_path) as pdf_document:
            page = pdf_document[page_idx]

            for img_idx, image in enumerate(page.get_images(full=True)):
                xref = image[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                source_ext = base_image["ext"]
                image_ext = converted_extensions.get(source_ext, source_ext)

                image_file_name = f"page{page_idx}_fig{img_idx}.{image_ext}"
                image_file_path = os.path.join(image_save_dir, image_file_name)

                # For JBIG2 and JPEG2000, we need to convert the image
                if source_ext in converted_extensions:
                    try:
                        pix = fitz.Pixmap(image_bytes)
                        pix.save(image_file_path)
                    except Exception as err_fitz:
                        print(f"Error processing image with fitz: {err_fitz}")
                        # Fallback to using PIL for image conversion
                        try:
                            img = Image.open(io.BytesIO(image_bytes))
                            img.save(image_file_path)
                        except Exception as err_pil:
                            print(f"Failed to process image with PIL: {err_pil}")
                            continue  # Skip this image if both methods fail
                else:
                    with open(image_file_path, "wb") as image_file:
                        image_file.write(image_bytes)

                image_file_paths.append(image_file_path)

        return {
            "page_idx": page_idx,
            "document_name": self.document_name,
            "file_path": self.document_file_path,
            "file_url": self.url,
            "image_file_paths": image_file_paths,
        }
medrag_multi_modal/document_loader/load_image.py DELETED
@@ -1,131 +0,0 @@
1
- import asyncio
2
- import os
3
- from typing import Optional
4
-
5
- import rich
6
- import wandb
7
- import weave
8
- from pdf2image.pdf2image import convert_from_path
9
- from PIL import Image
10
-
11
- from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
12
-
13
-
14
- class ImageLoader(PyMuPDF4LLMTextLoader):
15
- """
16
- `ImageLoader` is a class that extends the `TextLoader` class to handle the extraction and
17
- loading of pages from a PDF file as images.
18
-
19
- This class provides functionality to convert specific pages of a PDF document into images
20
- and optionally publish these images to a Weave dataset.
21
-
22
- !!! example "Example Usage"
23
- ```python
24
- import asyncio
25
-
26
- import wandb
27
- from dotenv import load_dotenv
28
-
29
- from medrag_multi_modal.document_loader import ImageLoader
30
-
31
- load_dotenv()
32
- wandb.init(project="medrag-multi-modal", entity="ml-colabs")
33
- url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
34
- loader = ImageLoader(
35
- url=url,
36
- document_name="Gray's Anatomy",
37
- document_file_path="grays_anatomy.pdf",
38
- )
39
- asyncio.run(
40
- loader.load_data(
41
- start_page=31,
42
- end_page=33,
43
- dataset_name="grays-anatomy-images",
44
- )
45
- )
46
- ```
47
-
48
- Args:
49
- url (str): The URL of the PDF document.
50
- document_name (str): The name of the document.
51
- document_file_path (str): The path to the PDF file.
52
- """
53
-
54
- def __init__(self, url: str, document_name: str, document_file_path: str):
55
- super().__init__(url, document_name, document_file_path)
56
-
57
- def extract_data_from_pdf_file(
58
- self, pdf_file: str, page_number: int
59
- ) -> Image.Image:
60
- image = convert_from_path(
61
- pdf_file, first_page=page_number + 1, last_page=page_number + 1
62
- )[0]
63
- return image
64
-
65
- async def load_data(
66
- self,
67
- start_page: Optional[int] = None,
68
- end_page: Optional[int] = None,
69
- image_save_dir: str = "./images",
70
- dataset_name: Optional[str] = None,
71
- ):
72
- """
73
- Asynchronously loads images from a PDF file specified by a URL or local file path,
74
- processes the images for the specified range of pages, and optionally publishes them
75
- to a Weave dataset.
76
-
77
- This function reads the specified range of pages from a PDF document, converts each page
78
- to an image using the `pdf2image` library, and returns a list of dictionaries containing
79
- the image and metadata for each processed page. It processes pages concurrently using
80
- `asyncio` for efficiency. If a `dataset_name` is provided, the processed page images are
81
- published to Weights & Biases artifact and the corresponding metadata to a Weave dataset
82
- with the specified name.
83
-
84
- Args:
85
- start_page (Optional[int]): The starting page index (0-based) to process.
86
- end_page (Optional[int]): The ending page index (0-based) to process.
87
- dataset_name (Optional[str]): The name of the Weave dataset to publish the
88
- processed images to. Defaults to None.
89
-
90
- Returns:
91
- list[dict]: A list of dictionaries, each containing the image and metadata for a
92
- processed page.
93
-
94
- Raises:
95
- ValueError: If the specified start_page or end_page is out of bounds of the document's
96
- page count.
97
- """
98
- os.makedirs(image_save_dir, exist_ok=True)
99
- start_page, end_page = self.get_page_indices(start_page, end_page)
100
- pages = []
101
- processed_pages_counter: int = 1
102
- total_pages = end_page - start_page
103
-
104
- async def process_page(page_idx):
105
- nonlocal processed_pages_counter
106
- image = convert_from_path(
107
- self.document_file_path,
108
- first_page=page_idx + 1,
109
- last_page=page_idx + 1,
110
- )[0]
111
- pages.append(
112
- {
113
- "page_idx": page_idx,
114
- "document_name": self.document_name,
115
- "file_path": self.document_file_path,
116
- "file_url": self.url,
117
- }
118
- )
119
- image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
120
- rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
121
- processed_pages_counter += 1
122
-
123
- tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
124
- for task in asyncio.as_completed(tasks):
125
- await task
126
- if dataset_name:
127
- artifact = wandb.Artifact(name=dataset_name, type="dataset")
128
- artifact.add_dir(local_path=image_save_dir)
129
- artifact.save()
130
- weave.publish(weave.Dataset(name=dataset_name, rows=pages))
131
- return pages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py CHANGED
@@ -53,15 +53,16 @@ class MarkerTextLoader(BaseTextLoader):
53
  """
54
  Process a single page of the PDF and extract its structured text using marker-pdf.
55
 
56
- Returns a dictionary with the processed page data.
57
- The dictionary will have the following keys and values:
58
-
59
- - "text": (str) the extracted structured text from the page.
60
- - "page_idx": (int) the index of the page.
61
- - "document_name": (str) the name of the document.
62
- - "file_path": (str) the local file path where the PDF is stored.
63
- - "file_url": (str) the URL of the PDF file.
64
- - "meta": (dict) the metadata extracted from the page by marker-pdf.
 
65
 
66
  Args:
67
  page_idx (int): The index of the page to process.
 
53
  """
54
  Process a single page of the PDF and extract its structured text using marker-pdf.
55
 
56
+ Returns:
57
+ Dict[str, Any]: A dictionary with the processed page data.
58
+ The dictionary will have the following keys and values:
59
+
60
+ - "text": (str) the extracted structured text from the page.
61
+ - "page_idx": (int) the index of the page.
62
+ - "document_name": (str) the name of the document.
63
+ - "file_path": (str) the local file path where the PDF is stored.
64
+ - "file_url": (str) the URL of the PDF file.
65
+ - "meta": (dict) the metadata extracted from the page by marker-pdf.
66
 
67
  Args:
68
  page_idx (int): The index of the page to process.
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py CHANGED
@@ -52,14 +52,15 @@ class PDFPlumberTextLoader(BaseTextLoader):
52
  """
53
  Process a single page of the PDF and extract its text using pdfplumber.
54
 
55
- Returns a dictionary with the processed page data.
56
- The dictionary will have the following keys and values:
57
-
58
- - "text": (str) the extracted text from the page.
59
- - "page_idx": (int) the index of the page.
60
- - "document_name": (str) the name of the document.
61
- - "file_path": (str) the local file path where the PDF is stored.
62
- - "file_url": (str) the URL of the PDF file.
 
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
 
52
  """
53
  Process a single page of the PDF and extract its text using pdfplumber.
54
 
55
+ Returns:
56
+ Dict[str, Any]: A dictionary with the processed page data.
57
+ The dictionary will have the following keys and values:
58
+
59
+ - "text": (str) the extracted text from the page.
60
+ - "page_idx": (int) the index of the page.
61
+ - "document_name": (str) the name of the document.
62
+ - "file_path": (str) the local file path where the PDF is stored.
63
+ - "file_url": (str) the URL of the PDF file.
64
 
65
  Args:
66
  page_idx (int): The index of the page to process.
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py CHANGED
@@ -52,14 +52,15 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
52
  """
53
  Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
54
 
55
- Returns a dictionary with the processed page data.
56
- The dictionary will have the following keys and values:
57
-
58
- - "text": (str) the processed page data in markdown format.
59
- - "page_idx": (int) the index of the page.
60
- - "document_name": (str) the name of the document.
61
- - "file_path": (str) the local file path where the PDF is stored.
62
- - "file_url": (str) the URL of the PDF file.
 
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
 
52
  """
53
  Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
54
 
55
+ Returns:
56
+ Dict[str, Any]: A dictionary with the processed page data.
57
+ The dictionary will have the following keys and values:
58
+
59
+ - "text": (str) the processed page data in markdown format.
60
+ - "page_idx": (int) the index of the page.
61
+ - "document_name": (str) the name of the document.
62
+ - "file_path": (str) the local file path where the PDF is stored.
63
+ - "file_url": (str) the URL of the PDF file.
64
 
65
  Args:
66
  page_idx (int): The index of the page to process.
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py CHANGED
@@ -52,14 +52,15 @@ class PyPDF2TextLoader(BaseTextLoader):
52
  """
53
  Process a single page of the PDF and extract its text using PyPDF2.
54
 
55
- Returns a dictionary with the processed page data.
56
- The dictionary will have the following keys and values:
57
-
58
- - "text": (str) the extracted text from the page.
59
- - "page_idx": (int) the index of the page.
60
- - "document_name": (str) the name of the document.
61
- - "file_path": (str) the local file path where the PDF is stored.
62
- - "file_url": (str) the URL of the PDF file.
 
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
 
52
  """
53
  Process a single page of the PDF and extract its text using PyPDF2.
54
 
55
+ Returns:
56
+ Dict[str, Any]: A dictionary with the processed page data.
57
+ The dictionary will have the following keys and values:
58
+
59
+ - "text": (str) the extracted text from the page.
60
+ - "page_idx": (int) the index of the page.
61
+ - "document_name": (str) the name of the document.
62
+ - "file_path": (str) the local file path where the PDF is stored.
63
+ - "file_url": (str) the URL of the PDF file.
64
 
65
  Args:
66
  page_idx (int): The index of the page to process.
medrag_multi_modal/retrieval/multi_modal_retrieval.py CHANGED
@@ -1,11 +1,12 @@
1
  import os
2
  from typing import Any, Optional
3
 
4
- import wandb
5
  import weave
6
  from byaldi import RAGMultiModalModel
7
  from PIL import Image
8
 
 
 
9
  from ..utils import get_wandb_artifact
10
 
11
 
 
1
  import os
2
  from typing import Any, Optional
3
 
 
4
  import weave
5
  from byaldi import RAGMultiModalModel
6
  from PIL import Image
7
 
8
+ import wandb
9
+
10
  from ..utils import get_wandb_artifact
11
 
12
 
mkdocs.yml CHANGED
@@ -69,7 +69,13 @@ nav:
69
  - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
70
  - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
71
  - Marker: 'document_loader/text_loader/marker_text_loader.md'
72
- - Image Loader: 'document_loader/load_image.md'
 
 
 
 
 
 
73
  - Chunking: 'chunking.md'
74
  - Retrieval:
75
  - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
 
69
  - PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
70
  - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
71
  - Marker: 'document_loader/text_loader/marker_text_loader.md'
72
+ - Image Loader:
73
+ - Base: 'document_loader/image_loader/base_img_loader.md'
74
+ - PDF2Image: 'document_loader/image_loader/pdf2image_img_loader.md'
75
+ - Marker: 'document_loader/image_loader/marker_img_loader.md'
76
+ - PDFPlumber: 'document_loader/image_loader/pdfplumber_img_loader.md'
77
+ - PyMuPDF: 'document_loader/image_loader/pymupdf_img_loader.md'
78
+ - FitzPIL: 'document_loader/image_loader/fitzpil_img_loader.md'
79
  - Chunking: 'chunking.md'
80
  - Retrieval:
81
  - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'