Spaces:
Sleeping
Sleeping
Commit
·
f37090a
1
Parent(s):
cc5cebc
chore: improve doc + code formatting
Browse files- medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py +8 -1
- medrag_multi_modal/document_loader/text_loader/marker_text_loader.py +10 -9
- medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py +9 -8
- medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py +9 -8
- medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py +9 -8
- medrag_multi_modal/retrieval/multi_modal_retrieval.py +2 -1
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py
CHANGED
@@ -36,7 +36,14 @@ class PDF2ImageLoader(BaseImageLoader):
|
|
36 |
**kwargs: Additional keyword arguments that may be used by pdf2image.
|
37 |
|
38 |
Returns:
|
39 |
-
Dict[str,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
"""
|
41 |
image = convert_from_path(
|
42 |
self.document_file_path,
|
|
|
36 |
**kwargs: Additional keyword arguments that may be used by pdf2image.
|
37 |
|
38 |
Returns:
|
39 |
+
Dict[str, Any]: A dictionary containing the processed page data.
|
40 |
+
The dictionary will have the following keys and values:
|
41 |
+
|
42 |
+
- "page_idx": (int) the index of the page.
|
43 |
+
- "document_name": (str) the name of the document.
|
44 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
45 |
+
- "file_url": (str) the URL of the PDF file.
|
46 |
+
- "image_file_path": (str) the local file path where the image is stored.
|
47 |
"""
|
48 |
image = convert_from_path(
|
49 |
self.document_file_path,
|
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py
CHANGED
@@ -53,15 +53,16 @@ class MarkerTextLoader(BaseTextLoader):
|
|
53 |
"""
|
54 |
Process a single page of the PDF and extract its structured text using marker-pdf.
|
55 |
|
56 |
-
Returns
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
65 |
|
66 |
Args:
|
67 |
page_idx (int): The index of the page to process.
|
|
|
53 |
"""
|
54 |
Process a single page of the PDF and extract its structured text using marker-pdf.
|
55 |
|
56 |
+
Returns:
|
57 |
+
Dict[str, Any]: A dictionary with the processed page data.
|
58 |
+
The dictionary will have the following keys and values:
|
59 |
+
|
60 |
+
- "text": (str) the extracted structured text from the page.
|
61 |
+
- "page_idx": (int) the index of the page.
|
62 |
+
- "document_name": (str) the name of the document.
|
63 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
64 |
+
- "file_url": (str) the URL of the PDF file.
|
65 |
+
- "meta": (dict) the metadata extracted from the page by marker-pdf.
|
66 |
|
67 |
Args:
|
68 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py
CHANGED
@@ -52,14 +52,15 @@ class PDFPlumberTextLoader(BaseTextLoader):
|
|
52 |
"""
|
53 |
Process a single page of the PDF and extract its text using pdfplumber.
|
54 |
|
55 |
-
Returns
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
63 |
|
64 |
Args:
|
65 |
page_idx (int): The index of the page to process.
|
|
|
52 |
"""
|
53 |
Process a single page of the PDF and extract its text using pdfplumber.
|
54 |
|
55 |
+
Returns:
|
56 |
+
Dict[str, Any]: A dictionary with the processed page data.
|
57 |
+
The dictionary will have the following keys and values:
|
58 |
+
|
59 |
+
- "text": (str) the extracted text from the page.
|
60 |
+
- "page_idx": (int) the index of the page.
|
61 |
+
- "document_name": (str) the name of the document.
|
62 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
63 |
+
- "file_url": (str) the URL of the PDF file.
|
64 |
|
65 |
Args:
|
66 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py
CHANGED
@@ -52,14 +52,15 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
|
|
52 |
"""
|
53 |
Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
|
54 |
|
55 |
-
Returns
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
63 |
|
64 |
Args:
|
65 |
page_idx (int): The index of the page to process.
|
|
|
52 |
"""
|
53 |
Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
|
54 |
|
55 |
+
Returns:
|
56 |
+
Dict[str, Any]: A dictionary with the processed page data.
|
57 |
+
The dictionary will have the following keys and values:
|
58 |
+
|
59 |
+
- "text": (str) the processed page data in markdown format.
|
60 |
+
- "page_idx": (int) the index of the page.
|
61 |
+
- "document_name": (str) the name of the document.
|
62 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
63 |
+
- "file_url": (str) the URL of the PDF file.
|
64 |
|
65 |
Args:
|
66 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py
CHANGED
@@ -52,14 +52,15 @@ class PyPDF2TextLoader(BaseTextLoader):
|
|
52 |
"""
|
53 |
Process a single page of the PDF and extract its text using PyPDF2.
|
54 |
|
55 |
-
Returns
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
63 |
|
64 |
Args:
|
65 |
page_idx (int): The index of the page to process.
|
|
|
52 |
"""
|
53 |
Process a single page of the PDF and extract its text using PyPDF2.
|
54 |
|
55 |
+
Returns:
|
56 |
+
Dict[str, Any]: A dictionary with the processed page data.
|
57 |
+
The dictionary will have the following keys and values:
|
58 |
+
|
59 |
+
- "text": (str) the extracted text from the page.
|
60 |
+
- "page_idx": (int) the index of the page.
|
61 |
+
- "document_name": (str) the name of the document.
|
62 |
+
- "file_path": (str) the local file path where the PDF is stored.
|
63 |
+
- "file_url": (str) the URL of the PDF file.
|
64 |
|
65 |
Args:
|
66 |
page_idx (int): The index of the page to process.
|
medrag_multi_modal/retrieval/multi_modal_retrieval.py
CHANGED
@@ -1,11 +1,12 @@
|
|
1 |
import os
|
2 |
from typing import Any, Optional
|
3 |
|
4 |
-
import wandb
|
5 |
import weave
|
6 |
from byaldi import RAGMultiModalModel
|
7 |
from PIL import Image
|
8 |
|
|
|
|
|
9 |
from ..utils import get_wandb_artifact
|
10 |
|
11 |
|
|
|
1 |
import os
|
2 |
from typing import Any, Optional
|
3 |
|
|
|
4 |
import weave
|
5 |
from byaldi import RAGMultiModalModel
|
6 |
from PIL import Image
|
7 |
|
8 |
+
import wandb
|
9 |
+
|
10 |
from ..utils import get_wandb_artifact
|
11 |
|
12 |
|