Spaces:

geekyrakshit
/

medrag

Running

App Files Files Community

mratanusarkar committed on Oct 17, 2024

Commit

e31ec78

1 Parent(s): fc27062

update: convert _process_page to extract_page_data

Browse files

Files changed (5) hide show

medrag_multi_modal/document_loader/text_loader/base_text_loader.py +10 -9
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py +8 -7
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py +7 -6
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py +7 -7
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py +7 -6

medrag_multi_modal/document_loader/text_loader/base_text_loader.py CHANGED Viewed

@@ -65,9 +65,9 @@ class BaseTextLoader(ABC):
         return start_page, end_page
     @abstractmethod
-    async def _process_page(self, page_idx: int) -> Dict[str, str]:
         """
-        Abstract method to process a single page of the PDF.
         Overwrite this method in the subclass to provide the actual implementation and
         processing logic for each page of the PDF using various PDF processing libraries.
@@ -96,7 +96,7 @@ class BaseTextLoader(ABC):
         returns a list of Page objects containing the text and metadata.
         It uses `PyPDF2` to calculate the number of pages in the PDF and the
-        overriden `_process_page` method provides the actual implementation to process
         each page, extract the text from the PDF, and convert it to markdown.
         It processes pages concurrently using `asyncio` for efficiency.
@@ -110,11 +110,12 @@ class BaseTextLoader(ABC):
         Returns:
             List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
             Each dictionary will have the following keys and values:
-                - "text": (str) the processed page data in markdown format.
-                - "page_idx": (int) the index of the page.
-                - "document_name": (str) the name of the document.
-                - "file_path": (str) the local file path where the PDF is stored.
-                - "file_url": (str) the URL of the PDF file.
         Raises:
             ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
@@ -126,7 +127,7 @@ class BaseTextLoader(ABC):
         async def process_page(page_idx):
             nonlocal processed_pages_counter
-            page_data = await self._process_page(page_idx)
             pages.append(page_data)
             rich.print(
                 f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"

         return start_page, end_page
     @abstractmethod
+    async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
         """
+        Abstract method to process a single page of the PDF and extract the text data.
         Overwrite this method in the subclass to provide the actual implementation and
         processing logic for each page of the PDF using various PDF processing libraries.
         returns a list of Page objects containing the text and metadata.
         It uses `PyPDF2` to calculate the number of pages in the PDF and the
+        overridden `extract_page_data` method provides the actual implementation to process
         each page, extract the text from the PDF, and convert it to markdown.
         It processes pages concurrently using `asyncio` for efficiency.
         Returns:
             List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
             Each dictionary will have the following keys and values:
+            - "text": (str) the processed page data in markdown format.
+            - "page_idx": (int) the index of the page.
+            - "document_name": (str) the name of the document.
+            - "file_path": (str) the local file path where the PDF is stored.
+            - "file_url": (str) the URL of the PDF file.
         Raises:
             ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
         async def process_page(page_idx):
             nonlocal processed_pages_counter
+            page_data = await self.extract_page_data(page_idx)
             pages.append(page_data)
             rich.print(
                 f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"

medrag_multi_modal/document_loader/text_loader/marker_text_loader.py CHANGED Viewed

@@ -49,18 +49,19 @@ class MarkerTextLoader(BaseTextLoader):
         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
-    async def _process_page(self, page_idx: int) -> Dict[str, str]:
         """
         Process a single page of the PDF and extract its structured text using marker-pdf.
         Returns a dictionary with the processed page data.
         The dictionary will have the following keys and values:
-            - "text": (str) the extracted structured text from the page.
-            - "page_idx": (int) the index of the page.
-            - "document_name": (str) the name of the document.
-            - "file_path": (str) the local file path where the PDF is stored.
-            - "file_url": (str) the URL of the PDF file.
-            - "meta": (dict) the metadata extracted from the page by marker-pdf.
         Args:
             page_idx (int): The index of the page to process.

         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
+    async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
         """
         Process a single page of the PDF and extract its structured text using marker-pdf.
         Returns a dictionary with the processed page data.
         The dictionary will have the following keys and values:
+        - "text": (str) the extracted structured text from the page.
+        - "page_idx": (int) the index of the page.
+        - "document_name": (str) the name of the document.
+        - "file_path": (str) the local file path where the PDF is stored.
+        - "file_url": (str) the URL of the PDF file.
+        - "meta": (dict) the metadata extracted from the page by marker-pdf.
         Args:
             page_idx (int): The index of the page to process.

medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py CHANGED Viewed

@@ -48,17 +48,18 @@ class PDFPlumberTextLoader(BaseTextLoader):
         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
-    async def _process_page(self, page_idx: int) -> Dict[str, str]:
         """
         Process a single page of the PDF and extract its text using pdfplumber.
         Returns a dictionary with the processed page data.
         The dictionary will have the following keys and values:
-            - "text": (str) the extracted text from the page.
-            - "page_idx": (int) the index of the page.
-            - "document_name": (str) the name of the document.
-            - "file_path": (str) the local file path where the PDF is stored.
-            - "file_url": (str) the URL of the PDF file.
         Args:
             page_idx (int): The index of the page to process.

         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
+    async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
         """
         Process a single page of the PDF and extract its text using pdfplumber.
         Returns a dictionary with the processed page data.
         The dictionary will have the following keys and values:
+        - "text": (str) the extracted text from the page.
+        - "page_idx": (int) the index of the page.
+        - "document_name": (str) the name of the document.
+        - "file_path": (str) the local file path where the PDF is stored.
+        - "file_url": (str) the URL of the PDF file.
         Args:
             page_idx (int): The index of the page to process.

medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py CHANGED Viewed

@@ -48,18 +48,18 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
-    async def _process_page(self, page_idx: int) -> Dict[str, str]:
         """
         Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
         Returns a dictionary with the processed page data.
         The dictionary will have the following keys and values:
-            - "text": (str) the processed page data in markdown format.
-            - "page_idx": (int) the index of the page.
-            - "document_name": (str) the name of the document.
-            - "file_path": (str) the local file path where the PDF is stored.
-            - "file_url": (str) the URL of the PDF file.
         Args:
             page_idx (int): The index of the page to process.

         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
+    async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
         """
         Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
         Returns a dictionary with the processed page data.
         The dictionary will have the following keys and values:
+        - "text": (str) the processed page data in markdown format.
+        - "page_idx": (int) the index of the page.
+        - "document_name": (str) the name of the document.
+        - "file_path": (str) the local file path where the PDF is stored.
+        - "file_url": (str) the URL of the PDF file.
         Args:
             page_idx (int): The index of the page to process.

medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py CHANGED Viewed

@@ -48,17 +48,18 @@ class PyPDF2TextLoader(BaseTextLoader):
         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
-    async def _process_page(self, page_idx: int) -> Dict[str, str]:
         """
         Process a single page of the PDF and extract its text using PyPDF2.
         Returns a dictionary with the processed page data.
         The dictionary will have the following keys and values:
-            - "text": (str) the extracted text from the page.
-            - "page_idx": (int) the index of the page.
-            - "document_name": (str) the name of the document.
-            - "file_path": (str) the local file path where the PDF is stored.
-            - "file_url": (str) the URL of the PDF file.
         Args:
             page_idx (int): The index of the page to process.

         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
     """
+    async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
         """
         Process a single page of the PDF and extract its text using PyPDF2.
         Returns a dictionary with the processed page data.
         The dictionary will have the following keys and values:
+        - "text": (str) the extracted text from the page.
+        - "page_idx": (int) the index of the page.
+        - "document_name": (str) the name of the document.
+        - "file_path": (str) the local file path where the PDF is stored.
+        - "file_url": (str) the URL of the PDF file.
         Args:
             page_idx (int): The index of the page to process.