Spaces:
Sleeping
Sleeping
import asyncio | |
from medrag_multi_modal.document_loader import ( | |
PDFPlumberTextLoader, | |
PyMuPDF4LLMTextLoader, | |
PyPDF2TextLoader, | |
) | |
# Source PDF used by every loader test in this module (hosted on archive.org).
# The literal is split across two lines only for readability; the resulting
# string is unchanged.
URL = (
    "https://archive.org/download/GraysAnatomy41E2015PDF/"
    "Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
)

# Column names, in order, that each test compares against
# ``dataset.column_names``. Kept as a list because the comparison uses ``==``.
COLUMN_NAMES = [
    "text",
    "page_idx",
    "document_name",
    "file_path",
    "file_url",
    "loader_name",
]
def test_pdfplumber_text_loader():
    """Check the dataset produced by ``PDFPlumberTextLoader`` for pages 31-36.

    Builds the loader for the module-level ``URL``, runs its ``load_data``
    coroutine to completion with ``asyncio.run``, and asserts that the result
    has six rows and exactly the columns listed in ``COLUMN_NAMES``.
    """
    text_loader = PDFPlumberTextLoader(
        url=URL,
        document_name="Gray's Anatomy",
        document_file_path="grays_anatomy.pdf",
    )

    # ``load_data`` is awaited via asyncio.run, so drive it from sync code.
    pending_load = text_loader.load_data(start_page=31, end_page=36)
    pages = asyncio.run(pending_load)

    assert pages.num_rows == 6
    assert pages.column_names == COLUMN_NAMES
def test_pymupdf_text_loader():
    """Check the dataset produced by ``PyMuPDF4LLMTextLoader`` for pages 31-36.

    Builds the loader for the module-level ``URL``, runs its ``load_data``
    coroutine to completion with ``asyncio.run``, and asserts that the result
    has six rows and exactly the columns listed in ``COLUMN_NAMES``.
    """
    text_loader = PyMuPDF4LLMTextLoader(
        url=URL,
        document_name="Gray's Anatomy",
        document_file_path="grays_anatomy.pdf",
    )

    # ``load_data`` is awaited via asyncio.run, so drive it from sync code.
    pending_load = text_loader.load_data(start_page=31, end_page=36)
    pages = asyncio.run(pending_load)

    assert pages.num_rows == 6
    assert pages.column_names == COLUMN_NAMES
def test_pypdf2_text_loader():
    """Check the dataset produced by ``PyPDF2TextLoader`` for pages 31-36.

    Builds the loader for the module-level ``URL``, runs its ``load_data``
    coroutine to completion with ``asyncio.run``, and asserts that the result
    has six rows and exactly the columns listed in ``COLUMN_NAMES``.
    """
    text_loader = PyPDF2TextLoader(
        url=URL,
        document_name="Gray's Anatomy",
        document_file_path="grays_anatomy.pdf",
    )

    # ``load_data`` is awaited via asyncio.run, so drive it from sync code.
    pending_load = text_loader.load_data(start_page=31, end_page=36)
    pages = asyncio.run(pending_load)

    assert pages.num_rows == 6
    assert pages.column_names == COLUMN_NAMES