|
|
|
import os |
|
import sys |
|
from timeit import default_timer as timer |
|
from typing import List |
|
|
|
from langchain.document_loaders import PyPDFDirectoryLoader |
|
from langchain.embeddings import HuggingFaceInstructEmbeddings |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.vectorstores.base import VectorStore |
|
from langchain.vectorstores.chroma import Chroma |
|
from langchain.vectorstores.faiss import FAISS |
|
|
|
from app_modules.init import app_init, get_device_types |
|
from app_modules.llm_summarize_chain import SummarizeChain |
|
|
|
|
|
def load_documents(source_pdfs_path, keep_page_info) -> List:
    """Load the PDFs found under *source_pdfs_path* as a list of documents.

    Args:
        source_pdfs_path: Directory handed to ``PyPDFDirectoryLoader``.
        keep_page_info: When truthy, the documents are returned exactly as
            the loader produced them. When falsy, the ``page_content`` of
            every loaded document is joined with newlines into the first
            document, and a one-element list holding that document is
            returned.

    Returns:
        The loaded documents, or an empty list when the loader found nothing.
    """
    # NOTE(review): silent_errors=True presumably makes the loader skip
    # unreadable PDFs instead of raising -- confirm against the LangChain docs.
    loader = PyPDFDirectoryLoader(source_pdfs_path, silent_errors=True)
    documents = loader.load()

    # Guard on `documents` so an empty directory yields [] rather than an
    # IndexError from documents[0].
    if not keep_page_info and documents:
        merged = documents[0]
        # One join instead of repeated concatenation; the first document is
        # mutated in place, so its other attributes are kept as loaded.
        merged.page_content = "\n".join(doc.page_content for doc in documents)
        documents = [merged]

    return documents
|
|
|
|
|
def split_chunks(documents: List, chunk_size, chunk_overlap) -> List:
    """Split *documents* into chunks with a ``RecursiveCharacterTextSplitter``.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to the
    splitter's constructor; whatever its ``split_documents`` call returns is
    handed back to the caller.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(
        documents
    )
|
|
|
|
|
# Initialise the application; the first element of what app_init returns is
# used as the LLM loader.
# NOTE(review): this runs before the os.environ lookups below. Keep that order
# in case app_init populates the environment (e.g. from a .env file) -- confirm.
llm_loader = app_init(False)[0]

# Settings: positional CLI arguments win over environment variables.
#   argv[1] -> SOURCE_PDFS_PATH   directory containing the PDFs
#   argv[2] -> CHUNCK_SIZE        chunk size (converted with int() below)
#   argv[3] -> CHUNK_OVERLAP      chunk overlap (converted with int() below)
#   argv[4] -> KEEP_PAGE_INFO     the literal string "true" enables it
source_pdfs_path = (
    sys.argv[1] if len(sys.argv) > 1 else os.environ.get("SOURCE_PDFS_PATH")
)
# NOTE(review): "CHUNCK_SIZE" looks misspelled but is kept byte-for-byte since
# existing environments may define it under this name -- confirm before renaming.
chunk_size = sys.argv[2] if len(sys.argv) > 2 else os.environ.get("CHUNCK_SIZE")
chunk_overlap = sys.argv[3] if len(sys.argv) > 3 else os.environ.get("CHUNK_OVERLAP")
# The flag lives in its own slot (argv[4]); argv[3] is the chunk overlap, so
# reading the flag from there would never match "true" and would also bypass
# the KEEP_PAGE_INFO environment fallback.
keep_page_info = (
    sys.argv[4] if len(sys.argv) > 4 else os.environ.get("KEEP_PAGE_INFO")
) == "true"

# Fail early with a readable message instead of an opaque TypeError from
# int(None) or from the loader further down.
_missing_settings = [
    label
    for label, value in (
        ("SOURCE_PDFS_PATH", source_pdfs_path),
        ("CHUNCK_SIZE", chunk_size),
        ("CHUNK_OVERLAP", chunk_overlap),
    )
    if value is None
]
if _missing_settings:
    sys.exit(
        "Missing required setting(s): "
        + ", ".join(_missing_settings)
        + " (pass them as positional arguments or set the environment variables)"
    )

sources = load_documents(source_pdfs_path, keep_page_info)

print(f"Splitting {len(sources)} PDF pages in to chunks ...")

chunks = split_chunks(
    sources, chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap)
)

print(f"Summarizing {len(chunks)} chunks ...")
start = timer()

summarize_chain = SummarizeChain(llm_loader)
# NOTE(review): the meaning of the three trailing positional arguments
# (None, None, True) is defined by SummarizeChain.call_chain, which is not
# visible here -- check that method's signature before changing them.
result = summarize_chain.call_chain(
    {"input_documents": chunks},
    None,
    None,
    True,
)

end = timer()
print(f"Completed in {end - start:.3f}s")

print("\n\n***Summary:")
print(result["output_text"])
|
|