medrag / tests /document_loader /text_loader.py
geekyrakshit's picture
update: app
170d9a9
raw
history blame
1.37 kB
import asyncio
from medrag_multi_modal.document_loader import (
PDFPlumberTextLoader,
PyMuPDF4LLMTextLoader,
PyPDF2TextLoader,
)
URL = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
COLUMN_NAMES = [
"text",
"page_idx",
"document_name",
"file_path",
"file_url",
"loader_name",
]
def test_pdfplumber_text_loader():
loader = PDFPlumberTextLoader(
url=URL,
document_name="Gray's Anatomy",
document_file_path="grays_anatomy.pdf",
)
dataset = asyncio.run(loader.load_data(start_page=31, end_page=36))
assert dataset.num_rows == 6
assert dataset.column_names == COLUMN_NAMES
def test_pymupdf_text_loader():
loader = PyMuPDF4LLMTextLoader(
url=URL,
document_name="Gray's Anatomy",
document_file_path="grays_anatomy.pdf",
)
dataset = asyncio.run(loader.load_data(start_page=31, end_page=36))
assert dataset.num_rows == 6
assert dataset.column_names == COLUMN_NAMES
def test_pypdf2_text_loader():
loader = PyPDF2TextLoader(
url=URL,
document_name="Gray's Anatomy",
document_file_path="grays_anatomy.pdf",
)
dataset = asyncio.run(loader.load_data(start_page=31, end_page=36))
assert dataset.num_rows == 6
assert dataset.column_names == COLUMN_NAMES