Spaces:
Running
Running
# vectorstore.py | |
import os | |
from langchain_community.document_loaders import PyPDFLoader | |
from langchain_experimental.text_splitter import SemanticChunker | |
from langchain_community.vectorstores import FAISS | |
def load_or_build_vectorstore(local_file: str, index_folder: str, embeddings):
    """Return a FAISS vector store, reusing a saved index when one exists.

    If ``index_folder`` already contains a saved FAISS index, it is loaded
    from disk. Otherwise the PDF at ``local_file`` is loaded, split into
    chunks with ``SemanticChunker``, indexed with FAISS, and the new index is
    saved to ``index_folder`` before being returned.

    Args:
        local_file: Path to the PDF used to build the index. Only read when
            no saved index is found.
        index_folder: Directory the index is loaded from or saved to.
        embeddings: Embeddings object handed to both ``SemanticChunker`` and
            FAISS.

    Returns:
        The loaded or newly built FAISS vector store.
    """
    # A directory can exist without a usable index inside it (created ahead
    # of time, or left behind by an interrupted build). Checking only the
    # directory would send us into load_local() and fail, so look for the
    # files themselves. "index.faiss" / "index.pkl" are the names used by
    # save_local()/load_local() with their default index_name.
    index_is_saved = all(
        os.path.isfile(os.path.join(index_folder, file_name))
        for file_name in ("index.faiss", "index.pkl")
    )

    if index_is_saved:
        print("Loading existing FAISS index from disk...")
        # NOTE(security): allow_dangerous_deserialization=True lets FAISS
        # unpickle the saved docstore. Unpickling can execute arbitrary code,
        # so index_folder must only ever hold files this application wrote.
        vectorstore = FAISS.load_local(
            index_folder,
            embeddings,
            allow_dangerous_deserialization=True,
        )
    else:
        print("Building a new FAISS index...")
        loader = PyPDFLoader(local_file)
        documents = loader.load()

        # Split at semantic breakpoints, using the 90th percentile as the
        # breakpoint threshold.
        text_splitter = SemanticChunker(
            embeddings=embeddings,
            breakpoint_threshold_type='percentile',
            breakpoint_threshold_amount=90,
        )
        chunked_docs = text_splitter.split_documents(documents)
        print(f"Document split into {len(chunked_docs)} chunks.")

        vectorstore = FAISS.from_documents(chunked_docs, embeddings)
        # Persist the index so later calls take the load branch above.
        vectorstore.save_local(index_folder)

    return vectorstore