mratanusarkar committed on
Commit
e31ec78
·
1 Parent(s): fc27062

update: convert _process_page to extract_page_data

Browse files
medrag_multi_modal/document_loader/text_loader/base_text_loader.py CHANGED
@@ -65,9 +65,9 @@ class BaseTextLoader(ABC):
65
  return start_page, end_page
66
 
67
  @abstractmethod
68
- async def _process_page(self, page_idx: int) -> Dict[str, str]:
69
  """
70
- Abstract method to process a single page of the PDF.
71
 
72
  Override this method in the subclass to provide the actual implementation and
73
  processing logic for each page of the PDF using various PDF processing libraries.
@@ -96,7 +96,7 @@ class BaseTextLoader(ABC):
96
  returns a list of Page objects containing the text and metadata.
97
 
98
  It uses `PyPDF2` to calculate the number of pages in the PDF and the
99
- overridden `_process_page` method provides the actual implementation to process
100
  each page, extract the text from the PDF, and convert it to markdown.
101
  It processes pages concurrently using `asyncio` for efficiency.
102
 
@@ -110,11 +110,12 @@ class BaseTextLoader(ABC):
110
  Returns:
111
  List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
112
  Each dictionary will have the following keys and values:
113
- - "text": (str) the processed page data in markdown format.
114
- - "page_idx": (int) the index of the page.
115
- - "document_name": (str) the name of the document.
116
- - "file_path": (str) the local file path where the PDF is stored.
117
- - "file_url": (str) the URL of the PDF file.
 
118
 
119
  Raises:
120
  ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
@@ -126,7 +127,7 @@ class BaseTextLoader(ABC):
126
 
127
  async def process_page(page_idx):
128
  nonlocal processed_pages_counter
129
- page_data = await self._process_page(page_idx)
130
  pages.append(page_data)
131
  rich.print(
132
  f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
 
65
  return start_page, end_page
66
 
67
  @abstractmethod
68
+ async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
69
  """
70
+ Abstract method to process a single page of the PDF and extract the text data.
71
 
72
  Override this method in the subclass to provide the actual implementation and
73
  processing logic for each page of the PDF using various PDF processing libraries.
 
96
  returns a list of Page objects containing the text and metadata.
97
 
98
  It uses `PyPDF2` to calculate the number of pages in the PDF and the
99
+ overridden `extract_page_data` method provides the actual implementation to process
100
  each page, extract the text from the PDF, and convert it to markdown.
101
  It processes pages concurrently using `asyncio` for efficiency.
102
 
 
110
  Returns:
111
  List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
112
  Each dictionary will have the following keys and values:
113
+
114
+ - "text": (str) the processed page data in markdown format.
115
+ - "page_idx": (int) the index of the page.
116
+ - "document_name": (str) the name of the document.
117
+ - "file_path": (str) the local file path where the PDF is stored.
118
+ - "file_url": (str) the URL of the PDF file.
119
 
120
  Raises:
121
  ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
 
127
 
128
  async def process_page(page_idx):
129
  nonlocal processed_pages_counter
130
+ page_data = await self.extract_page_data(page_idx)
131
  pages.append(page_data)
132
  rich.print(
133
  f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py CHANGED
@@ -49,18 +49,19 @@ class MarkerTextLoader(BaseTextLoader):
49
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
50
  """
51
 
52
- async def _process_page(self, page_idx: int) -> Dict[str, str]:
53
  """
54
  Process a single page of the PDF and extract its structured text using marker-pdf.
55
 
56
  Returns a dictionary with the processed page data.
57
  The dictionary will have the following keys and values:
58
- - "text": (str) the extracted structured text from the page.
59
- - "page_idx": (int) the index of the page.
60
- - "document_name": (str) the name of the document.
61
- - "file_path": (str) the local file path where the PDF is stored.
62
- - "file_url": (str) the URL of the PDF file.
63
- - "meta": (dict) the metadata extracted from the page by marker-pdf.
 
64
 
65
  Args:
66
  page_idx (int): The index of the page to process.
 
49
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
50
  """
51
 
52
+ async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
53
  """
54
  Process a single page of the PDF and extract its structured text using marker-pdf.
55
 
56
  Returns a dictionary with the processed page data.
57
  The dictionary will have the following keys and values:
58
+
59
+ - "text": (str) the extracted structured text from the page.
60
+ - "page_idx": (int) the index of the page.
61
+ - "document_name": (str) the name of the document.
62
+ - "file_path": (str) the local file path where the PDF is stored.
63
+ - "file_url": (str) the URL of the PDF file.
64
+ - "meta": (dict) the metadata extracted from the page by marker-pdf.
65
 
66
  Args:
67
  page_idx (int): The index of the page to process.
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py CHANGED
@@ -48,17 +48,18 @@ class PDFPlumberTextLoader(BaseTextLoader):
48
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
  """
50
 
51
- async def _process_page(self, page_idx: int) -> Dict[str, str]:
52
  """
53
  Process a single page of the PDF and extract its text using pdfplumber.
54
 
55
  Returns a dictionary with the processed page data.
56
  The dictionary will have the following keys and values:
57
- - "text": (str) the extracted text from the page.
58
- - "page_idx": (int) the index of the page.
59
- - "document_name": (str) the name of the document.
60
- - "file_path": (str) the local file path where the PDF is stored.
61
- - "file_url": (str) the URL of the PDF file.
 
62
 
63
  Args:
64
  page_idx (int): The index of the page to process.
 
48
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
  """
50
 
51
+ async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
52
  """
53
  Process a single page of the PDF and extract its text using pdfplumber.
54
 
55
  Returns a dictionary with the processed page data.
56
  The dictionary will have the following keys and values:
57
+
58
+ - "text": (str) the extracted text from the page.
59
+ - "page_idx": (int) the index of the page.
60
+ - "document_name": (str) the name of the document.
61
+ - "file_path": (str) the local file path where the PDF is stored.
62
+ - "file_url": (str) the URL of the PDF file.
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py CHANGED
@@ -48,18 +48,18 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
48
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
  """
50
 
51
- async def _process_page(self, page_idx: int) -> Dict[str, str]:
52
  """
53
-
54
  Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
55
 
56
  Returns a dictionary with the processed page data.
57
  The dictionary will have the following keys and values:
58
- - "text": (str) the processed page data in markdown format.
59
- - "page_idx": (int) the index of the page.
60
- - "document_name": (str) the name of the document.
61
- - "file_path": (str) the local file path where the PDF is stored.
62
- - "file_url": (str) the URL of the PDF file.
 
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
 
48
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
  """
50
 
51
+ async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
52
  """
 
53
  Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
54
 
55
  Returns a dictionary with the processed page data.
56
  The dictionary will have the following keys and values:
57
+
58
+ - "text": (str) the processed page data in markdown format.
59
+ - "page_idx": (int) the index of the page.
60
+ - "document_name": (str) the name of the document.
61
+ - "file_path": (str) the local file path where the PDF is stored.
62
+ - "file_url": (str) the URL of the PDF file.
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py CHANGED
@@ -48,17 +48,18 @@ class PyPDF2TextLoader(BaseTextLoader):
48
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
  """
50
 
51
- async def _process_page(self, page_idx: int) -> Dict[str, str]:
52
  """
53
  Process a single page of the PDF and extract its text using PyPDF2.
54
 
55
  Returns a dictionary with the processed page data.
56
  The dictionary will have the following keys and values:
57
- - "text": (str) the extracted text from the page.
58
- - "page_idx": (int) the index of the page.
59
- - "document_name": (str) the name of the document.
60
- - "file_path": (str) the local file path where the PDF is stored.
61
- - "file_url": (str) the URL of the PDF file.
 
62
 
63
  Args:
64
  page_idx (int): The index of the page to process.
 
48
  document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
  """
50
 
51
+ async def extract_page_data(self, page_idx: int) -> Dict[str, str]:
52
  """
53
  Process a single page of the PDF and extract its text using PyPDF2.
54
 
55
  Returns a dictionary with the processed page data.
56
  The dictionary will have the following keys and values:
57
+
58
+ - "text": (str) the extracted text from the page.
59
+ - "page_idx": (int) the index of the page.
60
+ - "document_name": (str) the name of the document.
61
+ - "file_path": (str) the local file path where the PDF is stored.
62
+ - "file_url": (str) the URL of the PDF file.
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.