MOHAMMED-N committed on
Commit
7867b31
·
verified ·
1 Parent(s): cca3d6b

Create vectorstore.py

Browse files
Files changed (1) hide show
  1. vectorstore.py +32 -0
vectorstore.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # vectorstore.py
2
+
3
+ import os
4
+ from langchain_community.document_loaders import PyPDFLoader
5
+ from langchain_experimental.text_splitter import SemanticChunker
6
+ from langchain_community.vectorstores import FAISS
7
+
8
def load_or_build_vectorstore(local_file: str, index_folder: str, embeddings):
    """
    Load a saved FAISS index from ``index_folder``, or build one from a PDF.

    A saved index is considered present only when both files written by
    ``FAISS.save_local`` under its default index name (``index.faiss`` and
    ``index.pkl``) exist inside ``index_folder``. A folder that merely exists
    (empty, or left half-written by an interrupted run) triggers a rebuild
    instead of a failing load.

    Args:
        local_file: Path to the PDF used to build the index when no saved
            index is found.
        index_folder: Directory the index is loaded from / saved to.
        embeddings: Embeddings object passed to ``FAISS`` and to
            ``SemanticChunker``.

    Returns:
        The loaded or newly built FAISS vector store.
    """
    # Default file names used by FAISS.save_local / FAISS.load_local.
    index_files = ("index.faiss", "index.pkl")
    has_saved_index = all(
        os.path.isfile(os.path.join(index_folder, name)) for name in index_files
    )

    if has_saved_index:
        print("Loading existing FAISS index from disk...")
        # SECURITY: allow_dangerous_deserialization=True unpickles index.pkl,
        # which can execute arbitrary code. Only point index_folder at
        # directories this application wrote itself.
        return FAISS.load_local(
            index_folder,
            embeddings,
            allow_dangerous_deserialization=True,
        )

    print("Building a new FAISS index...")
    documents = PyPDFLoader(local_file).load()

    # Split on semantic breakpoints: a new chunk starts where the embedding
    # distance between adjacent sentences exceeds the 90th percentile.
    text_splitter = SemanticChunker(
        embeddings=embeddings,
        breakpoint_threshold_type='percentile',
        breakpoint_threshold_amount=90,
    )
    chunked_docs = text_splitter.split_documents(documents)
    print(f"Document split into {len(chunked_docs)} chunks.")

    vectorstore = FAISS.from_documents(chunked_docs, embeddings)
    # Persist so subsequent calls take the fast load path above.
    vectorstore.save_local(index_folder)
    return vectorstore