mratanusarkar committed on
Commit
f37090a
·
1 Parent(s): cc5cebc

chore: improve doc + code formatting

Browse files
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py CHANGED
@@ -36,7 +36,14 @@ class PDF2ImageLoader(BaseImageLoader):
36
  **kwargs: Additional keyword arguments that may be used by pdf2image.
37
 
38
  Returns:
39
- Dict[str, str]: A dictionary containing the processed page data.
 
 
 
 
 
 
 
40
  """
41
  image = convert_from_path(
42
  self.document_file_path,
 
36
  **kwargs: Additional keyword arguments that may be used by pdf2image.
37
 
38
  Returns:
39
+ Dict[str, Any]: A dictionary containing the processed page data.
40
+ The dictionary will have the following keys and values:
41
+
42
+ - "page_idx": (int) the index of the page.
43
+ - "document_name": (str) the name of the document.
44
+ - "file_path": (str) the local file path where the PDF is stored.
45
+ - "file_url": (str) the URL of the PDF file.
46
+ - "image_file_path": (str) the local file path where the image is stored.
47
  """
48
  image = convert_from_path(
49
  self.document_file_path,
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py CHANGED
@@ -53,15 +53,16 @@ class MarkerTextLoader(BaseTextLoader):
53
  """
54
  Process a single page of the PDF and extract its structured text using marker-pdf.
55
 
56
- Returns a dictionary with the processed page data.
57
- The dictionary will have the following keys and values:
58
-
59
- - "text": (str) the extracted structured text from the page.
60
- - "page_idx": (int) the index of the page.
61
- - "document_name": (str) the name of the document.
62
- - "file_path": (str) the local file path where the PDF is stored.
63
- - "file_url": (str) the URL of the PDF file.
64
- - "meta": (dict) the metadata extracted from the page by marker-pdf.
 
65
 
66
  Args:
67
  page_idx (int): The index of the page to process.
 
53
  """
54
  Process a single page of the PDF and extract its structured text using marker-pdf.
55
 
56
+ Returns:
57
+ Dict[str, Any]: A dictionary with the processed page data.
58
+ The dictionary will have the following keys and values:
59
+
60
+ - "text": (str) the extracted structured text from the page.
61
+ - "page_idx": (int) the index of the page.
62
+ - "document_name": (str) the name of the document.
63
+ - "file_path": (str) the local file path where the PDF is stored.
64
+ - "file_url": (str) the URL of the PDF file.
65
+ - "meta": (dict) the metadata extracted from the page by marker-pdf.
66
 
67
  Args:
68
  page_idx (int): The index of the page to process.
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py CHANGED
@@ -52,14 +52,15 @@ class PDFPlumberTextLoader(BaseTextLoader):
52
  """
53
  Process a single page of the PDF and extract its text using pdfplumber.
54
 
55
- Returns a dictionary with the processed page data.
56
- The dictionary will have the following keys and values:
57
-
58
- - "text": (str) the extracted text from the page.
59
- - "page_idx": (int) the index of the page.
60
- - "document_name": (str) the name of the document.
61
- - "file_path": (str) the local file path where the PDF is stored.
62
- - "file_url": (str) the URL of the PDF file.
 
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
 
52
  """
53
  Process a single page of the PDF and extract its text using pdfplumber.
54
 
55
+ Returns:
56
+ Dict[str, Any]: A dictionary with the processed page data.
57
+ The dictionary will have the following keys and values:
58
+
59
+ - "text": (str) the extracted text from the page.
60
+ - "page_idx": (int) the index of the page.
61
+ - "document_name": (str) the name of the document.
62
+ - "file_path": (str) the local file path where the PDF is stored.
63
+ - "file_url": (str) the URL of the PDF file.
64
 
65
  Args:
66
  page_idx (int): The index of the page to process.
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py CHANGED
@@ -52,14 +52,15 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
52
  """
53
  Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
54
 
55
- Returns a dictionary with the processed page data.
56
- The dictionary will have the following keys and values:
57
-
58
- - "text": (str) the processed page data in markdown format.
59
- - "page_idx": (int) the index of the page.
60
- - "document_name": (str) the name of the document.
61
- - "file_path": (str) the local file path where the PDF is stored.
62
- - "file_url": (str) the URL of the PDF file.
 
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
 
52
  """
53
  Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
54
 
55
+ Returns:
56
+ Dict[str, Any]: A dictionary with the processed page data.
57
+ The dictionary will have the following keys and values:
58
+
59
+ - "text": (str) the processed page data in markdown format.
60
+ - "page_idx": (int) the index of the page.
61
+ - "document_name": (str) the name of the document.
62
+ - "file_path": (str) the local file path where the PDF is stored.
63
+ - "file_url": (str) the URL of the PDF file.
64
 
65
  Args:
66
  page_idx (int): The index of the page to process.
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py CHANGED
@@ -52,14 +52,15 @@ class PyPDF2TextLoader(BaseTextLoader):
52
  """
53
  Process a single page of the PDF and extract its text using PyPDF2.
54
 
55
- Returns a dictionary with the processed page data.
56
- The dictionary will have the following keys and values:
57
-
58
- - "text": (str) the extracted text from the page.
59
- - "page_idx": (int) the index of the page.
60
- - "document_name": (str) the name of the document.
61
- - "file_path": (str) the local file path where the PDF is stored.
62
- - "file_url": (str) the URL of the PDF file.
 
63
 
64
  Args:
65
  page_idx (int): The index of the page to process.
 
52
  """
53
  Process a single page of the PDF and extract its text using PyPDF2.
54
 
55
+ Returns:
56
+ Dict[str, Any]: A dictionary with the processed page data.
57
+ The dictionary will have the following keys and values:
58
+
59
+ - "text": (str) the extracted text from the page.
60
+ - "page_idx": (int) the index of the page.
61
+ - "document_name": (str) the name of the document.
62
+ - "file_path": (str) the local file path where the PDF is stored.
63
+ - "file_url": (str) the URL of the PDF file.
64
 
65
  Args:
66
  page_idx (int): The index of the page to process.
medrag_multi_modal/retrieval/multi_modal_retrieval.py CHANGED
@@ -1,11 +1,12 @@
1
  import os
2
  from typing import Any, Optional
3
 
4
- import wandb
5
  import weave
6
  from byaldi import RAGMultiModalModel
7
  from PIL import Image
8
 
 
 
9
  from ..utils import get_wandb_artifact
10
 
11
 
 
1
  import os
2
  from typing import Any, Optional
3
 
 
4
  import weave
5
  from byaldi import RAGMultiModalModel
6
  from PIL import Image
7
 
8
+ import wandb
9
+
10
  from ..utils import get_wandb_artifact
11
 
12