# Spaces: metisllm-dashboard / extraction_pipeline/document_metadata_extractor/openai_document_metadata_extractor_test.py (status: Sleeping)
import logging | |
import unittest | |
import uuid | |
from typing import List | |
from domain.chunk_d import DocumentD | |
from extraction_pipeline.document_metadata_extractor.openai_document_metadata_extractor import OpenAIDocumentMetadataExtractor, AuthorsError, CreationDateError | |
from llm_handler.mock_llm_handler import MockLLMHandler | |
# Canned chat-completion payload handed to MockLLMHandler in
# test__process_element. It must stay valid JSON with an "authors" list and a
# "publish_date" string.
DOCUMENT_METADATA_EXTRACTION_RESPONSE = '''
{
  "authors": ["BofA Global Research", "Michael Hartnett", "Elyas Galou", "Anya Shelekhin", "Myung-Jee Jung"],
  "publish_date": "2023-04-13"
}
'''
class TestOpenAIDocumentMetadataExtractor(unittest.TestCase):
    """Unit tests for ``OpenAIDocumentMetadataExtractor``.

    Exercises ``_validate_text`` with a payload missing authors, a payload with
    a malformed date and a well-formed payload, and ``_process_element`` with a
    ``MockLLMHandler`` that returns a canned response.
    """

    @classmethod
    def setUpClass(cls) -> None:
        """Create the fixtures shared by every test in this class.

        unittest invokes this hook as ``cls.setUpClass()``, so it has to be a
        classmethod; otherwise the call fails before any test runs.
        """
        cls.test_pdf_path = "extraction_pipeline/test_data/test.pdf"
        # Input document: metadata fields start out empty.
        cls.start_document_d = DocumentD(file_path=cls.test_pdf_path, authors="", publish_date="")
        # Expected result: same file, with the authors from the canned
        # response as one comma-separated string plus its publish date.
        cls.final_document_d = DocumentD(
            file_path=cls.test_pdf_path,
            authors="BofA Global Research, Michael Hartnett, Elyas Galou, Anya Shelekhin, Myung-Jee Jung",
            publish_date="2023-04-13",
        )
        # Default-constructed extractor, used only by the _validate_text tests.
        cls.openai_publish_details_extractor = OpenAIDocumentMetadataExtractor()

    def test__validate_text_missing_publishers(self):
        """A payload without an ``authors`` key must raise ``AuthorsError``."""
        missing_publishers_text = {"publish_date": "2023-12-13"}
        with self.assertRaises(AuthorsError):
            self.openai_publish_details_extractor._validate_text(
                missing_publishers_text)  # type: ignore

    def test__validate_text_invalid_date(self):
        """A malformed ``publish_date`` must raise ``CreationDateError``."""
        invalid_date_text = {
            "authors": [
                "BofA Global Research",
                "Michael Hartnett",
                "Elyas Galou",
                "Anya Shelekhin",
                "Myung-Jee Jung",
            ],
            "publish_date": "2-13",
        }
        with self.assertRaises(CreationDateError):
            self.openai_publish_details_extractor._validate_text(invalid_date_text)  # type: ignore

    def test__validate_text_valid(self):
        """A well-formed payload validates; the test passes if nothing is raised."""
        valid_text = {
            "authors": [
                "BofA Global Research",
                "Michael Hartnett",
                "Elyas Galou",
                "Anya Shelekhin",
                "Myung-Jee Jung",
            ],
            "publish_date": "2023-04-13",
        }
        self.openai_publish_details_extractor._validate_text(valid_text)  # type: ignore

    def test__process_element(self):
        """``_process_element`` yields the expected document first.

        The extractor is built around a ``MockLLMHandler`` that returns
        ``DOCUMENT_METADATA_EXTRACTION_RESPONSE`` as its chat completion.
        """
        handler = MockLLMHandler(chat_completion=[DOCUMENT_METADATA_EXTRACTION_RESPONSE])
        openai_publish_details_extractor = OpenAIDocumentMetadataExtractor(handler)
        # Materialise the result so the first yielded document can be indexed.
        output = list(openai_publish_details_extractor._process_element(self.start_document_d))
        self.assertEqual(output[0], self.final_document_d)
if __name__ == '__main__':
    # Show INFO-level log output when this module is executed directly as a
    # script, then hand control to the unittest runner.
    logging.basicConfig(level=logging.INFO)
    unittest.main()