"""Loader that extracts text from PDF files, running OCR on embedded images."""
import os
from typing import List

import fitz
import nltk
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from paddleocr import PaddleOCR

from configs.model_config import NLTK_DATA_PATH

# Make the project's bundled NLTK data take precedence over system locations.
nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path


class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
    """Loader that turns a PDF into unstructured text elements.

    For every page, the embedded text layer is extracted with PyMuPDF and
    every embedded image is run through PaddleOCR. The combined text is
    written to a ``.txt`` file next to the PDF, and that file is partitioned
    with ``unstructured``.
    """

    def _get_elements(self) -> List:
        """Return the unstructured elements for ``self.file_path``.

        Returns:
            The list produced by ``partition_text`` for the intermediate text
            file generated from the PDF.
        """

        def pdf_ocr_txt(filepath, dir_path="tmp_files"):
            """Dump page text and OCR'd image text of a PDF to a text file.

            Args:
                filepath: Path of the PDF to read.
                dir_path: Name of the sub-directory (created beside the PDF)
                    that receives the output text file and the scratch image.

            Returns:
                Path of the written text file, ``<dir_path>/<pdf name>.txt``.
            """
            full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
            # exist_ok avoids the check-then-create race of exists()/makedirs().
            os.makedirs(full_dir_path, exist_ok=True)

            ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=False, show_log=False)
            txt_file_path = os.path.join(full_dir_path, f"{os.path.split(filepath)[-1]}.txt")
            # Scratch file reused for every image; PaddleOCR is fed a path.
            # NOTE(review): the fixed name is not safe if two PDFs in the same
            # directory are processed concurrently.
            img_name = os.path.join(full_dir_path, 'tmp.png')

            try:
                # Both the PDF handle and the output file are closed on exit,
                # even when OCR raises.
                with fitz.open(filepath) as doc, \
                        open(txt_file_path, 'w', encoding='utf-8') as fout:
                    for page in doc:
                        fout.write(page.get_text(""))
                        fout.write("\n")

                        for img in page.get_images():
                            # img[0] is the image's xref inside the document.
                            pix = fitz.Pixmap(doc, img[0])
                            # More than three colour components (e.g. CMYK)
                            # must be converted to RGB before saving as PNG.
                            if pix.n - pix.alpha >= 4:
                                pix = fitz.Pixmap(fitz.csRGB, pix)
                            pix.save(img_name)

                            result = ocr.ocr(img_name)
                            # PaddleOCR may report None for an image without
                            # any detected text; skip those entries.
                            ocr_result = [
                                entry[1][0]
                                for line in (result or [])
                                if line
                                for entry in line
                            ]
                            if ocr_result:
                                fout.write("\n".join(ocr_result))
                                # Keep the last OCR line from being glued to
                                # whatever is written next.
                                fout.write("\n")
            finally:
                # Always remove the scratch image, including on failure.
                if os.path.exists(img_name):
                    os.remove(img_name)

            return txt_file_path

        txt_file_path = pdf_ocr_txt(self.file_path)
        # Imported lazily so the dependency is only loaded when elements are
        # actually requested.
        from unstructured.partition.text import partition_text
        return partition_text(filename=txt_file_path, **self.unstructured_kwargs)


if __name__ == "__main__":
    import sys

    sys.path.append(os.path.dirname(os.path.dirname(__file__)))
    filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base", "samples", "content", "test.pdf")
    loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
    docs = loader.load()
    for doc in docs:
        print(doc)