Spaces:
Sleeping
Sleeping
Commit
·
e31ec78
1
Parent(s):
fc27062
update: convert _process_page to extract_page_data
Browse files- medrag_multi_modal/document_loader/text_loader/base_text_loader.py +10 -9
- medrag_multi_modal/document_loader/text_loader/marker_text_loader.py +8 -7
- medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py +7 -6
- medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py +7 -7
- medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py +7 -6
medrag_multi_modal/document_loader/text_loader/base_text_loader.py
CHANGED
@@ -65,9 +65,9 @@ class BaseTextLoader(ABC):
|
|
65 |
return start_page, end_page
|
66 |
|
67 |
@abstractmethod
|
68 |
-
async def
|
69 |
"""
|
70 |
-
Abstract method to process a single page of the PDF.
|
71 |
|
72 |
Override this method in the subclass to provide the actual implementation and
|
73 |
processing logic for each page of the PDF using various PDF processing libraries.
|
@@ -96,7 +96,7 @@ class BaseTextLoader(ABC):
|
|
96 |
returns a list of Page objects containing the text and metadata.
|
97 |
|
98 |
It uses `PyPDF2` to calculate the number of pages in the PDF and the
|
99 |
-
overridden `
|
100 |
each page, extract the text from the PDF, and convert it to markdown.
|
101 |
It processes pages concurrently using `asyncio` for efficiency.
|
102 |
|
@@ -110,11 +110,12 @@ class BaseTextLoader(ABC):
|
|
110 |
Returns:
|
111 |
List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
|
112 |
Each dictionary will have the following keys and values:
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
|
|
118 |
|
119 |
Raises:
|
120 |
ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
|
@@ -126,7 +127,7 @@ class BaseTextLoader(ABC):
|
|
126 |
|
127 |
async def process_page(page_idx):
|
128 |
nonlocal processed_pages_counter
|
129 |
-
page_data = await self.
|
130 |
pages.append(page_data)
|
131 |
rich.print(
|
132 |
f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
|
|
|
65 |
return start_page, end_page
|
66 |
|
67 |
@abstractmethod
|
68 |
+
async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
|
69 |
"""
|
70 |
+
Abstract method to process a single page of the PDF and extract the text data.
|
71 |
|
72 |
Override this method in the subclass to provide the actual implementation and
|
73 |
processing logic for each page of the PDF using various PDF processing libraries.
|
|
|
96 |
returns a list of Page objects containing the text and metadata.
|
97 |
|
98 |
It uses `PyPDF2` to calculate the number of pages in the PDF and the
|
99 |
+
overridden `extract_page_data` method provides the actual implementation to process
|
100 |
each page, extract the text from the PDF, and convert it to markdown.
|
101 |
It processes pages concurrently using `asyncio` for efficiency.
|
102 |
|
|
|
110 |
Returns:
|
111 |
List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
|
112 |
Each dictionary will have the following keys and values:
|
113 |
+
|
114 |
+
- "text": (str) the processed page data in markdown format.
|
115 |
+
- "page_idx": (int) the index of the page.
|
116 |
+
- "document_name": (str) the name of the document.
|
117 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
118 |
+
- "file_url": (str) the URL of the PDF file.
|
119 |
|
120 |
Raises:
|
121 |
ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
|
|
|
127 |
|
128 |
async def process_page(page_idx):
|
129 |
nonlocal processed_pages_counter
|
130 |
+
page_data = await self.extract_page_data(page_idx)
|
131 |
pages.append(page_data)
|
132 |
rich.print(
|
133 |
f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
|
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py
CHANGED
@@ -49,18 +49,19 @@ class MarkerTextLoader(BaseTextLoader):
|
|
49 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
50 |
"""
|
51 |
|
52 |
-
async def
|
53 |
"""
|
54 |
Process a single page of the PDF and extract its structured text using marker-pdf.
|
55 |
|
56 |
Returns a dictionary with the processed page data.
|
57 |
The dictionary will have the following keys and values:
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
64 |
|
65 |
Args:
|
66 |
page_idx (int): The index of the page to process.
|
|
|
49 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
50 |
"""
|
51 |
|
52 |
+
async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
|
53 |
"""
|
54 |
Process a single page of the PDF and extract its structured text using marker-pdf.
|
55 |
|
56 |
Returns a dictionary with the processed page data.
|
57 |
The dictionary will have the following keys and values:
|
58 |
+
|
59 |
+
- "text": (str) the extracted structured text from the page.
|
60 |
+
- "page_idx": (int) the index of the page.
|
61 |
+
- "document_name": (str) the name of the document.
|
62 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
63 |
+
- "file_url": (str) the URL of the PDF file.
|
64 |
+
- "meta": (dict) the metadata extracted from the page by marker-pdf.
|
65 |
|
66 |
Args:
|
67 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py
CHANGED
@@ -48,17 +48,18 @@ class PDFPlumberTextLoader(BaseTextLoader):
|
|
48 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
49 |
"""
|
50 |
|
51 |
-
async def
|
52 |
"""
|
53 |
Process a single page of the PDF and extract its text using pdfplumber.
|
54 |
|
55 |
Returns a dictionary with the processed page data.
|
56 |
The dictionary will have the following keys and values:
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
62 |
|
63 |
Args:
|
64 |
page_idx (int): The index of the page to process.
|
|
|
48 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
49 |
"""
|
50 |
|
51 |
+
async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
|
52 |
"""
|
53 |
Process a single page of the PDF and extract its text using pdfplumber.
|
54 |
|
55 |
Returns a dictionary with the processed page data.
|
56 |
The dictionary will have the following keys and values:
|
57 |
+
|
58 |
+
- "text": (str) the extracted text from the page.
|
59 |
+
- "page_idx": (int) the index of the page.
|
60 |
+
- "document_name": (str) the name of the document.
|
61 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
62 |
+
- "file_url": (str) the URL of the PDF file.
|
63 |
|
64 |
Args:
|
65 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py
CHANGED
@@ -48,18 +48,18 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
|
|
48 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
49 |
"""
|
50 |
|
51 |
-
async def
|
52 |
"""
|
53 |
-
|
54 |
Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
|
55 |
|
56 |
Returns a dictionary with the processed page data.
|
57 |
The dictionary will have the following keys and values:
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
63 |
|
64 |
Args:
|
65 |
page_idx (int): The index of the page to process.
|
|
|
48 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
49 |
"""
|
50 |
|
51 |
+
async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
|
52 |
"""
|
|
|
53 |
Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
|
54 |
|
55 |
Returns a dictionary with the processed page data.
|
56 |
The dictionary will have the following keys and values:
|
57 |
+
|
58 |
+
- "text": (str) the processed page data in markdown format.
|
59 |
+
- "page_idx": (int) the index of the page.
|
60 |
+
- "document_name": (str) the name of the document.
|
61 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
62 |
+
- "file_url": (str) the URL of the PDF file.
|
63 |
|
64 |
Args:
|
65 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py
CHANGED
@@ -48,17 +48,18 @@ class PyPDF2TextLoader(BaseTextLoader):
|
|
48 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
49 |
"""
|
50 |
|
51 |
-
async def
|
52 |
"""
|
53 |
Process a single page of the PDF and extract its text using PyPDF2.
|
54 |
|
55 |
Returns a dictionary with the processed page data.
|
56 |
The dictionary will have the following keys and values:
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
62 |
|
63 |
Args:
|
64 |
page_idx (int): The index of the page to process.
|
|
|
48 |
document_file_path (str): The local file path where the PDF is stored or will be downloaded.
|
49 |
"""
|
50 |
|
51 |
+
async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
|
52 |
"""
|
53 |
Process a single page of the PDF and extract its text using PyPDF2.
|
54 |
|
55 |
Returns a dictionary with the processed page data.
|
56 |
The dictionary will have the following keys and values:
|
57 |
+
|
58 |
+
- "text": (str) the extracted text from the page.
|
59 |
+
- "page_idx": (int) the index of the page.
|
60 |
+
- "document_name": (str) the name of the document.
|
61 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
62 |
+
- "file_url": (str) the URL of the PDF file.
|
63 |
|
64 |
Args:
|
65 |
page_idx (int): The index of the page to process.
|