# Page header residue from extraction: "Spaces: Sleeping"
# Importing Dependencies
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
# Directory scanned for the source PDF files (relative path).
DATASET = "dataset/"

# Directory the FAISS index is saved to (relative path).
FAISS_INDEX = "vectorstore/"
# Create Vector Store and Index
def embed_all(dataset_dir=None, index_dir=None, chunk_size=800, chunk_overlap=200):
    """
    Embed all PDF files in the dataset directory and save a FAISS index.

    Loads every file matching ``*.pdf`` in ``dataset_dir`` with PyPDFLoader,
    splits the loaded documents with RecursiveCharacterTextSplitter, embeds
    the chunks with HuggingFaceEmbeddings (constructed with its defaults) and
    saves the resulting FAISS vector store to ``index_dir``.

    Args:
        dataset_dir: Directory to scan for PDFs. Defaults to ``DATASET``.
        index_dir: Directory the index is saved to. Defaults to
            ``FAISS_INDEX``.
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.

    Raises:
        ValueError: If no documents were loaded, or if splitting produced no
            chunks, so there is nothing to index.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for the default locations.
    if dataset_dir is None:
        dataset_dir = DATASET
    if index_dir is None:
        index_dir = FAISS_INDEX

    # Create the document loader
    loader = DirectoryLoader(dataset_dir, glob="*.pdf", loader_cls=PyPDFLoader)
    # Load the documents
    documents = loader.load()
    if not documents:
        raise ValueError(f"No PDF documents found in {dataset_dir!r}")

    # Create the splitter
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Split the documents into chunks
    chunks = splitter.split_documents(documents)
    # NOTE(review): building a FAISS store from zero chunks is expected to
    # fail inside the library with an unhelpful error -- confirm against the
    # installed langchain version. Fail early with a clear message instead.
    if not chunks:
        raise ValueError(
            f"No text chunks could be extracted from the PDFs in {dataset_dir!r}"
        )

    # Load the embeddings
    embeddings = HuggingFaceEmbeddings()
    # Create the vector store
    vector_store = FAISS.from_documents(chunks, embeddings)
    # Save the vector store
    vector_store.save_local(index_dir)
if __name__ == "__main__":
    # Build the index only when executed as a script, not on import.
    embed_all()