# vectorstore.py
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_experimental.text_splitter import SemanticChunker


def load_or_build_vectorstore(local_file: str, index_folder: str, embeddings):
    """
    Load a local FAISS index if it exists; otherwise, build a new index
    from the specified PDF file and persist it to disk.
    """
    if os.path.exists(index_folder):
        print("Loading existing FAISS index from disk...")
        vectorstore = FAISS.load_local(
            index_folder,
            embeddings,
            allow_dangerous_deserialization=True,
        )
    else:
        print("Building a new FAISS index...")

        # Load the PDF (one Document per page).
        loader = PyPDFLoader(local_file)
        documents = loader.load()

        # Split on semantic breakpoints: start a new chunk wherever the
        # embedding distance between adjacent sentences exceeds the 90th
        # percentile of distances across the document.
        text_splitter = SemanticChunker(
            embeddings=embeddings,
            breakpoint_threshold_type="percentile",
            breakpoint_threshold_amount=90,
        )
        chunked_docs = text_splitter.split_documents(documents)
        print(f"Document split into {len(chunked_docs)} chunks.")

        # Embed the chunks, build the FAISS index, and save it for reuse.
        vectorstore = FAISS.from_documents(chunked_docs, embeddings)
        vectorstore.save_local(index_folder)

    return vectorstore
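

# Example usage: a minimal sketch, not part of the original module. The
# embedding backend, model name, PDF path, and index folder below are
# illustrative assumptions only.
if __name__ == "__main__":
    from langchain_huggingface import HuggingFaceEmbeddings  # assumed embedding backend

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"  # hypothetical model choice
    )
    vectorstore = load_or_build_vectorstore(
        local_file="document.pdf",    # hypothetical input PDF
        index_folder="faiss_index",   # hypothetical index directory
        embeddings=embeddings,
    )

    # Quick sanity check: retrieve the three chunks most similar to a query.
    for doc in vectorstore.similarity_search("What is this document about?", k=3):
        print(doc.page_content[:200])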