# legalchatbotD / ingest (1).py
# divy131's picture
# Upload ingest (1).py
# 8df2836 verified
# Importing Dependencies
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
# Dataset Directory Path: folder that embed_all() scans for "*.pdf" files
DATASET = "dataset/"
# Faiss Index Path: folder that embed_all() saves the FAISS vector store into
FAISS_INDEX = "vectorstore/"
# Create Vector Store and Index
def embed_all(
    dataset_dir=None,
    index_path=None,
    chunk_size=800,
    chunk_overlap=200,
):
    """
    Embed all PDF files in the dataset directory and save a FAISS index.

    Loads the files matching the ``*.pdf`` glob in the dataset directory with
    ``PyPDFLoader``, splits the loaded documents into overlapping chunks,
    embeds the chunks with a default-constructed ``HuggingFaceEmbeddings``
    and saves the resulting FAISS vector store locally.

    Args:
        dataset_dir: Directory to scan for PDF files. Defaults to ``DATASET``.
        index_path: Location the vector store is saved to. Defaults to
            ``FAISS_INDEX``.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` handed to
            ``RecursiveCharacterTextSplitter``.

    Raises:
        ValueError: If no documents were loaded from ``dataset_dir`` or the
            loaded documents produced no chunks to embed.
    """
    # Resolve the defaults at call time so a plain embed_all() keeps using the
    # module-level paths.
    if dataset_dir is None:
        dataset_dir = DATASET
    if index_path is None:
        index_path = FAISS_INDEX

    # Create the document loader
    loader = DirectoryLoader(dataset_dir, glob="*.pdf", loader_cls=PyPDFLoader)
    # Load the documents
    documents = loader.load()
    # Fail early with a readable message instead of letting an empty list
    # reach the vector store builder further down.
    if not documents:
        raise ValueError(f"No PDF documents could be loaded from {dataset_dir!r}")

    # Create the splitter
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Split the documents into chunks
    chunks = splitter.split_documents(documents)
    # Documents without any extractable text yield no chunks; stop before the
    # embedding model is loaded.
    if not chunks:
        raise ValueError(
            f"Documents loaded from {dataset_dir!r} produced no text chunks to embed"
        )

    # Load the embeddings
    embeddings = HuggingFaceEmbeddings()
    # Create the vector store
    vector_store = FAISS.from_documents(chunks, embeddings)
    # Save the vector store
    vector_store.save_local(index_path)
# Script entry point: build and save the index only when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    embed_all()