Spaces:

geekyrakshit
/

medrag

Running

mratanusarkar committed on Oct 19, 2024

Commit

f37090a

1 Parent(s): cc5cebc

chore: improve doc + code formatting

Files changed (6) hide show

medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py CHANGED Viewed

@@ -36,7 +36,14 @@ class PDF2ImageLoader(BaseImageLoader):
             **kwargs: Additional keyword arguments that may be used by pdf2image.
         Returns:
-            Dict[str, str]: A dictionary containing the processed page data.
         """
         image = convert_from_path(
             self.document_file_path,

             **kwargs: Additional keyword arguments that may be used by pdf2image.
         Returns:
+            Dict[str, Any]: A dictionary containing the processed page data.
+            The dictionary will have the following keys and values:
+            - "page_idx": (int) the index of the page.
+            - "document_name": (str) the name of the document.
+            - "file_path": (str) the local file path where the PDF is stored.
+            - "file_url": (str) the URL of the PDF file.
+            - "image_file_path": (str) the local file path where the image is stored.
         """
         image = convert_from_path(
             self.document_file_path,

medrag_multi_modal/document_loader/text_loader/marker_text_loader.py CHANGED Viewed

@@ -53,15 +53,16 @@ class MarkerTextLoader(BaseTextLoader):
         """
         Process a single page of the PDF and extract its structured text using marker-pdf.
-        Returns a dictionary with the processed page data.
-        The dictionary will have the following keys and values:
-        - "text": (str) the extracted structured text from the page.
-        - "page_idx": (int) the index of the page.
-        - "document_name": (str) the name of the document.
-        - "file_path": (str) the local file path where the PDF is stored.
-        - "file_url": (str) the URL of the PDF file.
-        - "meta": (dict) the metadata extracted from the page by marker-pdf.
         Args:
             page_idx (int): The index of the page to process.

         """
         Process a single page of the PDF and extract its structured text using marker-pdf.
+        Returns:
+            Dict[str, Any]: A dictionary with the processed page data.
+            The dictionary will have the following keys and values:
+            - "text": (str) the extracted structured text from the page.
+            - "page_idx": (int) the index of the page.
+            - "document_name": (str) the name of the document.
+            - "file_path": (str) the local file path where the PDF is stored.
+            - "file_url": (str) the URL of the PDF file.
+            - "meta": (dict) the metadata extracted from the page by marker-pdf.
         Args:
             page_idx (int): The index of the page to process.

medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py CHANGED Viewed

@@ -52,14 +52,15 @@ class PDFPlumberTextLoader(BaseTextLoader):
         """
         Process a single page of the PDF and extract its text using pdfplumber.
-        Returns a dictionary with the processed page data.
-        The dictionary will have the following keys and values:
-        - "text": (str) the extracted text from the page.
-        - "page_idx": (int) the index of the page.
-        - "document_name": (str) the name of the document.
-        - "file_path": (str) the local file path where the PDF is stored.
-        - "file_url": (str) the URL of the PDF file.
         Args:
             page_idx (int): The index of the page to process.

         """
         Process a single page of the PDF and extract its text using pdfplumber.
+        Returns:
+            Dict[str, Any]: A dictionary with the processed page data.
+            The dictionary will have the following keys and values:
+            - "text": (str) the extracted text from the page.
+            - "page_idx": (int) the index of the page.
+            - "document_name": (str) the name of the document.
+            - "file_path": (str) the local file path where the PDF is stored.
+            - "file_url": (str) the URL of the PDF file.
         Args:
             page_idx (int): The index of the page to process.

medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py CHANGED Viewed

@@ -52,14 +52,15 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
         """
         Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
-        Returns a dictionary with the processed page data.
-        The dictionary will have the following keys and values:
-        - "text": (str) the processed page data in markdown format.
-        - "page_idx": (int) the index of the page.
-        - "document_name": (str) the name of the document.
-        - "file_path": (str) the local file path where the PDF is stored.
-        - "file_url": (str) the URL of the PDF file.
         Args:
             page_idx (int): The index of the page to process.

         """
         Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
+        Returns:
+            Dict[str, Any]: A dictionary with the processed page data.
+            The dictionary will have the following keys and values:
+            - "text": (str) the processed page data in markdown format.
+            - "page_idx": (int) the index of the page.
+            - "document_name": (str) the name of the document.
+            - "file_path": (str) the local file path where the PDF is stored.
+            - "file_url": (str) the URL of the PDF file.
         Args:
             page_idx (int): The index of the page to process.

medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py CHANGED Viewed

@@ -52,14 +52,15 @@ class PyPDF2TextLoader(BaseTextLoader):
         """
         Process a single page of the PDF and extract its text using PyPDF2.
-        Returns a dictionary with the processed page data.
-        The dictionary will have the following keys and values:
-        - "text": (str) the extracted text from the page.
-        - "page_idx": (int) the index of the page.
-        - "document_name": (str) the name of the document.
-        - "file_path": (str) the local file path where the PDF is stored.
-        - "file_url": (str) the URL of the PDF file.
         Args:
             page_idx (int): The index of the page to process.

         """
         Process a single page of the PDF and extract its text using PyPDF2.
+        Returns:
+            Dict[str, Any]: A dictionary with the processed page data.
+            The dictionary will have the following keys and values:
+            - "text": (str) the extracted text from the page.
+            - "page_idx": (int) the index of the page.
+            - "document_name": (str) the name of the document.
+            - "file_path": (str) the local file path where the PDF is stored.
+            - "file_url": (str) the URL of the PDF file.
         Args:
             page_idx (int): The index of the page to process.

medrag_multi_modal/retrieval/multi_modal_retrieval.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import os
 from typing import Any, Optional
-import wandb
 import weave
 from byaldi import RAGMultiModalModel
 from PIL import Image
 from ..utils import get_wandb_artifact

 import os
 from typing import Any, Optional
 import weave
 from byaldi import RAGMultiModalModel
 from PIL import Image
+import wandb
 from ..utils import get_wandb_artifact