Spaces:
Sleeping
Sleeping
Commit
·
5c74069
1
Parent(s):
694a076
add: base image loader + pdf2img from load_image
Browse files- medrag_multi_modal/document_loader/__init__.py +2 -4
- medrag_multi_modal/document_loader/image_loader/__init__.py +4 -0
- medrag_multi_modal/document_loader/image_loader/base_img_loader.py +63 -0
- medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py +34 -0
- medrag_multi_modal/document_loader/load_image.py +0 -131
medrag_multi_modal/document_loader/__init__.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
-
from .load_image import ImageLoader
|
2 |
-
from .load_text_image import TextImageLoader
|
3 |
from .text_loader import (
|
4 |
MarkerTextLoader,
|
5 |
PDFPlumberTextLoader,
|
@@ -12,6 +11,5 @@ __all__ = [
|
|
12 |
"PyPDF2TextLoader",
|
13 |
"PDFPlumberTextLoader",
|
14 |
"MarkerTextLoader",
|
15 |
-
"ImageLoader",
|
16 |
-
"TextImageLoader",
|
17 |
]
|
|
|
1 |
+
from .image_loader import PDF2ImageLoader
|
|
|
2 |
from .text_loader import (
|
3 |
MarkerTextLoader,
|
4 |
PDFPlumberTextLoader,
|
|
|
11 |
"PyPDF2TextLoader",
|
12 |
"PDFPlumberTextLoader",
|
13 |
"MarkerTextLoader",
|
14 |
+
"PDF2ImageLoader",
|
|
|
15 |
]
|
medrag_multi_modal/document_loader/image_loader/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .base_img_loader import BaseImageLoader
|
2 |
+
from .pdf2image_img_loader import PDF2ImageLoader
|
3 |
+
|
4 |
+
__all__ = ["PDF2ImageLoader", "BaseImageLoader"]
|
medrag_multi_modal/document_loader/image_loader/base_img_loader.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import os
|
3 |
+
from abc import abstractmethod
|
4 |
+
from typing import Dict, List, Optional
|
5 |
+
|
6 |
+
import rich
|
7 |
+
|
8 |
+
import wandb
|
9 |
+
from medrag_multi_modal.document_loader.text_loader.base_text_loader import (
|
10 |
+
BaseTextLoader,
|
11 |
+
)
|
12 |
+
|
13 |
+
|
14 |
+
class BaseImageLoader(BaseTextLoader):
    """Abstract base for loaders that turn document pages into image files.

    Subclasses implement `extract_page_data` to handle a single page;
    `load_data` drives that hook over a page range, optionally uploads the
    image directory as a Weights & Biases artifact, and optionally deletes
    the files afterwards.

    Args:
        url (str): Forwarded unchanged to `BaseTextLoader.__init__`.
        document_name (str): Forwarded unchanged to `BaseTextLoader.__init__`.
        document_file_path (str): Forwarded unchanged to
            `BaseTextLoader.__init__`.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    @abstractmethod
    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, str]:
        """Process one page and return a dictionary describing it.

        Args:
            page_idx (int): Index of the page to process, taken from
                `range(start_page, end_page)` in `load_data`.
            image_save_dir (str): Directory that `load_data` has already
                created; implementations are expected to write their image
                output here.
            **kwargs: Extra keyword arguments passed through from `load_data`.

        Returns:
            Dict[str, str]: Per-page data; collected into the list returned
                by `load_data`.
        """
        pass

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        wandb_artifact_name: Optional[str] = None,
        image_save_dir: str = "./images",
        cleanup: bool = True,
        **kwargs,
    ) -> List[Dict[str, str]]:
        """Run `extract_page_data` over a page range and collect the results.

        Args:
            start_page (Optional[int]): First page index to process, as
                resolved by `self.get_page_indices`.
            end_page (Optional[int]): Exclusive upper bound of the page range
                (pages are taken from `range(start_page, end_page)`), as
                resolved by `self.get_page_indices`.
            wandb_artifact_name (Optional[str]): When set, `image_save_dir` is
                added to a `wandb.Artifact` of type "dataset" with this name
                and saved.
            image_save_dir (str): Directory handed to `extract_page_data`;
                created if it does not exist.
            cleanup (bool): When true, every regular file directly inside
                `image_save_dir` is removed before returning. This includes
                files that were already there before the call, and it means
                files written by `extract_page_data` into that directory no
                longer exist once this method returns.
            **kwargs: Forwarded to `extract_page_data` for every page.

        Returns:
            List[Dict[str, str]]: One entry per page, ordered by page index.
        """
        os.makedirs(image_save_dir, exist_ok=True)
        start_page, end_page = self.get_page_indices(start_page, end_page)
        processed_pages_counter: int = 1
        total_pages = end_page - start_page

        async def process_page(page_idx):
            nonlocal processed_pages_counter
            page_data = await self.extract_page_data(page_idx, image_save_dir, **kwargs)
            rich.print(
                f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
            )
            processed_pages_counter += 1
            return page_data

        # gather() returns results in the order the awaitables were passed,
        # so the page list is ordered by page index no matter which page
        # finishes first (as_completed gave no such guarantee).
        pages = list(
            await asyncio.gather(
                *(process_page(page_idx) for page_idx in range(start_page, end_page))
            )
        )

        if wandb_artifact_name:
            artifact = wandb.Artifact(name=wandb_artifact_name, type="dataset")
            artifact.add_dir(local_path=image_save_dir)
            artifact.save()
            rich.print("Artifact saved and uploaded to wandb!")

        if cleanup:
            # Only plain files at the top level are removed; sub-directories
            # and the directory itself are left in place.
            for file in os.listdir(image_save_dir):
                file_path = os.path.join(image_save_dir, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
        return pages
|
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import Any, Dict
|
3 |
+
|
4 |
+
from pdf2image.pdf2image import convert_from_path
|
5 |
+
|
6 |
+
from .base_img_loader import BaseImageLoader
|
7 |
+
|
8 |
+
|
9 |
+
class PDF2ImageLoader(BaseImageLoader):
    """Image loader that renders PDF pages to PNG files using `pdf2image`.

    Args:
        url (str): Forwarded unchanged to `BaseImageLoader.__init__`.
        document_name (str): Forwarded unchanged to `BaseImageLoader.__init__`.
        document_file_path (str): Forwarded unchanged to
            `BaseImageLoader.__init__`.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    async def extract_page_data(
        self, page_idx: int, image_save_dir: str, **kwargs
    ) -> Dict[str, Any]:
        """Render a single page to `page{page_idx}.png` and describe it.

        Args:
            page_idx (int): Page index; `page_idx + 1` is passed to
                `convert_from_path` as both `first_page` and `last_page`.
            image_save_dir (str): Directory the PNG is written into.
            **kwargs: Extra keyword arguments forwarded to
                `convert_from_path`.

        Returns:
            Dict[str, Any]: The page index, the loader's `document_name`,
                `document_file_path` and `url` attributes, and the path of
                the saved image.
        """
        # Request exactly one page from pdf2image by using the same value for
        # both ends of the range.
        pdf_page_number = page_idx + 1
        rendered_pages = convert_from_path(
            self.document_file_path,
            first_page=pdf_page_number,
            last_page=pdf_page_number,
            **kwargs,
        )
        page_image = rendered_pages[0]

        target_path = os.path.join(image_save_dir, f"page{page_idx}.png")
        page_image.save(target_path)

        page_record = {}
        page_record["page_idx"] = page_idx
        page_record["document_name"] = self.document_name
        page_record["file_path"] = self.document_file_path
        page_record["file_url"] = self.url
        page_record["image_file_path"] = target_path
        return page_record
|
medrag_multi_modal/document_loader/load_image.py
DELETED
@@ -1,131 +0,0 @@
|
|
1 |
-
import asyncio
|
2 |
-
import os
|
3 |
-
from typing import Optional
|
4 |
-
|
5 |
-
import rich
|
6 |
-
import wandb
|
7 |
-
import weave
|
8 |
-
from pdf2image.pdf2image import convert_from_path
|
9 |
-
from PIL import Image
|
10 |
-
|
11 |
-
from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
|
12 |
-
|
13 |
-
|
14 |
-
class ImageLoader(PyMuPDF4LLMTextLoader):
    """
    `ImageLoader` is a class that extends the `PyMuPDF4LLMTextLoader` class to handle the
    extraction and loading of pages from a PDF file as images.

    This class provides functionality to convert specific pages of a PDF document into images
    and optionally publish these images to a Weights & Biases artifact and the page metadata
    to a Weave dataset.

    !!! example "Example Usage"
        ```python
        import asyncio

        import wandb
        from dotenv import load_dotenv

        from medrag_multi_modal.document_loader import ImageLoader

        load_dotenv()
        wandb.init(project="medrag-multi-modal", entity="ml-colabs")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        loader = ImageLoader(
            url=url,
            document_name="Gray's Anatomy",
            document_file_path="grays_anatomy.pdf",
        )
        asyncio.run(
            loader.load_data(
                start_page=31,
                end_page=33,
                dataset_name="grays-anatomy-images",
            )
        )
        ```

    Args:
        url (str): The URL of the PDF document.
        document_name (str): The name of the document.
        document_file_path (str): The path to the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        super().__init__(url, document_name, document_file_path)

    def extract_data_from_pdf_file(
        self, pdf_file: str, page_number: int
    ) -> Image.Image:
        """Render one page of `pdf_file` to an image.

        Args:
            pdf_file (str): Path of the PDF file to read.
            page_number (int): Page index; `page_number + 1` is passed to
                `convert_from_path` as both `first_page` and `last_page`.

        Returns:
            Image.Image: The first (and only requested) rendered page.
        """
        image = convert_from_path(
            pdf_file, first_page=page_number + 1, last_page=page_number + 1
        )[0]
        return image

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        image_save_dir: str = "./images",
        dataset_name: Optional[str] = None,
    ):
        """
        Asynchronously converts a range of pages of the PDF at `self.document_file_path` to
        PNG images, saves them under `image_save_dir`, and optionally publishes them.

        Each page is rendered with `extract_data_from_pdf_file` and written to
        `{page_idx}.png` inside `image_save_dir`. If a `dataset_name` is provided, the image
        directory is saved as a Weights & Biases artifact and the page metadata is published
        as a Weave dataset, both under that name.

        Args:
            start_page (Optional[int]): The starting page index to process, as resolved by
                `self.get_page_indices`.
            end_page (Optional[int]): The exclusive upper bound of the page range (pages are
                taken from `range(start_page, end_page)`), as resolved by
                `self.get_page_indices`.
            image_save_dir (str): Directory the page images are written to; created if it
                does not exist. Defaults to "./images".
            dataset_name (Optional[str]): The name used for both the Weights & Biases
                artifact and the Weave dataset. Defaults to None.

        Returns:
            list[dict]: One metadata dictionary per processed page (`page_idx`,
                `document_name`, `file_path`, `file_url`), ordered by page index.

        Raises:
            ValueError: Presumably raised by `get_page_indices` when the requested range is
                out of bounds of the document's page count; confirm against the parent class.
        """
        os.makedirs(image_save_dir, exist_ok=True)
        start_page, end_page = self.get_page_indices(start_page, end_page)
        processed_pages_counter: int = 1
        total_pages = end_page - start_page

        async def process_page(page_idx):
            nonlocal processed_pages_counter
            # Reuse the class's single-page renderer instead of repeating the
            # convert_from_path call inline.
            image = self.extract_data_from_pdf_file(self.document_file_path, page_idx)
            page_metadata = {
                "page_idx": page_idx,
                "document_name": self.document_name,
                "file_path": self.document_file_path,
                "file_url": self.url,
            }
            image.save(os.path.join(image_save_dir, f"{page_idx}.png"))
            rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
            processed_pages_counter += 1
            return page_metadata

        # gather() returns results in the order the awaitables were passed,
        # which keeps the metadata rows ordered by page index.
        pages = list(
            await asyncio.gather(
                *(process_page(page_idx) for page_idx in range(start_page, end_page))
            )
        )
        if dataset_name:
            artifact = wandb.Artifact(name=dataset_name, type="dataset")
            artifact.add_dir(local_path=image_save_dir)
            artifact.save()
            weave.publish(weave.Dataset(name=dataset_name, rows=pages))
        return pages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|