Spaces:
Sleeping
Sleeping
Commit
·
cc5cebc
1
Parent(s):
5c74069
add: docs for base img loader + pdf2image
Browse files- docs/document_loader/image_loader/base_img_loader.md +3 -0
- docs/document_loader/image_loader/pdf2image_img_loader.md +3 -0
- docs/document_loader/load_image.md +0 -3
- medrag_multi_modal/document_loader/image_loader/base_img_loader.py +50 -0
- medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py +24 -0
- mkdocs.yml +3 -1
docs/document_loader/image_loader/base_img_loader.md
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Load images from PDF files
|
2 |
+
|
3 |
+
::: medrag_multi_modal.document_loader.image_loader.base_img_loader
|
docs/document_loader/image_loader/pdf2image_img_loader.md
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Load images from PDF files (using pdf2image)
|
2 |
+
|
3 |
+
::: medrag_multi_modal.document_loader.image_loader.pdf2image_img_loader
|
docs/document_loader/load_image.md
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
# Load PDF pages as images
|
2 |
-
|
3 |
-
::: medrag_multi_modal.document_loader.load_image
|
|
|
|
|
|
|
|
medrag_multi_modal/document_loader/image_loader/base_img_loader.py
CHANGED
@@ -19,6 +19,20 @@ class BaseImageLoader(BaseTextLoader):
|
|
19 |
async def extract_page_data(
|
20 |
self, page_idx: int, image_save_dir: str, **kwargs
|
21 |
) -> Dict[str, str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
pass
|
23 |
|
24 |
async def load_data(
|
@@ -30,6 +44,42 @@ class BaseImageLoader(BaseTextLoader):
|
|
30 |
cleanup: bool = True,
|
31 |
**kwargs,
|
32 |
) -> List[Dict[str, str]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
os.makedirs(image_save_dir, exist_ok=True)
|
34 |
start_page, end_page = self.get_page_indices(start_page, end_page)
|
35 |
pages = []
|
|
|
19 |
async def extract_page_data(
|
20 |
self, page_idx: int, image_save_dir: str, **kwargs
|
21 |
) -> Dict[str, str]:
|
22 |
+
"""
|
23 |
+
Abstract method to process a single page of the PDF and extract the image data.
|
24 |
+
|
25 |
+
Override this method in the subclass to provide the actual implementation and
|
26 |
+
processing logic for each page of the PDF using various PDF processing libraries.
|
27 |
+
|
28 |
+
Args:
|
29 |
+
page_idx (int): The index of the page to process.
|
30 |
+
image_save_dir (str): The directory to save the extracted images.
|
31 |
+
**kwargs: Additional keyword arguments that may be used by underlying libraries.
|
32 |
+
|
33 |
+
Returns:
|
34 |
+
Dict[str, str]: A dictionary containing the processed page data.
|
35 |
+
"""
|
36 |
pass
|
37 |
|
38 |
async def load_data(
|
|
|
44 |
cleanup: bool = True,
|
45 |
**kwargs,
|
46 |
) -> List[Dict[str, str]]:
|
47 |
+
"""
|
48 |
+
Asynchronously loads images from a PDF file specified by a URL or local file path.
|
49 |
+
The overridden abstract processing method then processes the images,
|
50 |
+
and optionally publishes them to a Weave artifact.
|
51 |
+
|
52 |
+
This function downloads a PDF from a given URL if it does not already exist locally,
|
53 |
+
reads the specified range of pages, scans each page's content to extract images, and
|
54 |
+
returns a list of dictionaries containing the image file paths and metadata.
|
55 |
+
|
56 |
+
It uses `PyPDF2` to calculate the number of pages in the PDF and the
|
57 |
+
overridden `extract_page_data` method provides the actual implementation to process
|
58 |
+
each page, extract the image content from the PDF, and convert it to png format.
|
59 |
+
It processes pages concurrently using `asyncio` for efficiency.
|
60 |
+
|
61 |
+
If a wandb_artifact_name is provided, the processed pages are published to a Weave artifact.
|
62 |
+
|
63 |
+
Args:
|
64 |
+
start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
|
65 |
+
end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
|
66 |
+
wandb_artifact_name (Optional[str]): The name of the Weave artifact to publish the pages to, if provided.
|
67 |
+
image_save_dir (str): The directory to save the extracted images.
|
68 |
+
cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
|
69 |
+
**kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.
|
70 |
+
|
71 |
+
Returns:
|
72 |
+
List[Dict[str, Any]]: A list of dictionaries, each containing the image and metadata for a processed page.
|
73 |
+
Each dictionary will have the following keys and values:
|
74 |
+
|
75 |
+
- "page_idx": (int) the index of the page.
|
76 |
+
- "document_name": (str) the name of the document.
|
77 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
78 |
+
- "file_url": (str) the URL of the PDF file.
|
79 |
+
- "image_file_path" or "image_file_paths": (str) the local file path where the image/images are stored.
|
80 |
+
Raises:
|
81 |
+
ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
|
82 |
+
"""
|
83 |
os.makedirs(image_save_dir, exist_ok=True)
|
84 |
start_page, end_page = self.get_page_indices(start_page, end_page)
|
85 |
pages = []
|
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py
CHANGED
@@ -7,6 +7,19 @@ from .base_img_loader import BaseImageLoader
|
|
7 |
|
8 |
|
9 |
class PDF2ImageLoader(BaseImageLoader):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
def __init__(self, url: str, document_name: str, document_file_path: str):
|
12 |
super().__init__(url, document_name, document_file_path)
|
@@ -14,6 +27,17 @@ class PDF2ImageLoader(BaseImageLoader):
|
|
14 |
async def extract_page_data(
|
15 |
self, page_idx: int, image_save_dir: str, **kwargs
|
16 |
) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
image = convert_from_path(
|
18 |
self.document_file_path,
|
19 |
first_page=page_idx + 1,
|
|
|
7 |
|
8 |
|
9 |
class PDF2ImageLoader(BaseImageLoader):
|
10 |
+
"""
|
11 |
+
`PDF2ImageLoader` is a class that extends the `BaseImageLoader` class to handle the extraction and
|
12 |
+
loading of pages from a PDF file as images using the pdf2image library.
|
13 |
+
|
14 |
+
This class provides functionality to convert specific pages of a PDF document into images
|
15 |
+
and optionally publish these images to a Weave artifact.
|
16 |
+
Each resulting image is effectively a snapshot of the corresponding page of the PDF.
|
17 |
+
|
18 |
+
Args:
|
19 |
+
url (str): The URL of the PDF document.
|
20 |
+
document_name (str): The name of the document.
|
21 |
+
document_file_path (str): The path to the PDF file.
|
22 |
+
"""
|
23 |
|
24 |
def __init__(self, url: str, document_name: str, document_file_path: str):
|
25 |
super().__init__(url, document_name, document_file_path)
|
|
|
27 |
async def extract_page_data(
|
28 |
self, page_idx: int, image_save_dir: str, **kwargs
|
29 |
) -> Dict[str, Any]:
|
30 |
+
"""
|
31 |
+
Extracts a single page from the PDF as an image using pdf2image library.
|
32 |
+
|
33 |
+
Args:
|
34 |
+
page_idx (int): The index of the page to process.
|
35 |
+
image_save_dir (str): The directory to save the extracted image.
|
36 |
+
**kwargs: Additional keyword arguments that may be used by pdf2image.
|
37 |
+
|
38 |
+
Returns:
|
39 |
+
Dict[str, Any]: A dictionary containing the processed page data.
|
40 |
+
"""
|
41 |
image = convert_from_path(
|
42 |
self.document_file_path,
|
43 |
first_page=page_idx + 1,
|
mkdocs.yml
CHANGED
@@ -69,7 +69,9 @@ nav:
|
|
69 |
- PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
|
70 |
- PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
|
71 |
- Marker: 'document_loader/text_loader/marker_text_loader.md'
|
72 |
-
- Image Loader:
|
|
|
|
|
73 |
- Chunking: 'chunking.md'
|
74 |
- Retrieval:
|
75 |
- Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
|
|
|
69 |
- PyPDF2: 'document_loader/text_loader/pypdf2_text_loader.md'
|
70 |
- PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
|
71 |
- Marker: 'document_loader/text_loader/marker_text_loader.md'
|
72 |
+
- Image Loader:
|
73 |
+
- Base: 'document_loader/image_loader/base_img_loader.md'
|
74 |
+
- PDF2Image: 'document_loader/image_loader/pdf2image_img_loader.md'
|
75 |
- Chunking: 'chunking.md'
|
76 |
- Retrieval:
|
77 |
- Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
|