Spaces:

divy131
/

legalchatbotD

Sleeping

legalchatbotD / ingest (1).py

Upload ingest (1).py

8df2836 verified 12 months ago

1.09 kB

	# Importing Dependencies
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.document_loaders import PyPDFLoader, DirectoryLoader
	from langchain.embeddings import HuggingFaceEmbeddings
	from langchain.vectorstores import FAISS

	# Dataset Directory Path
	DATASET = "dataset/"

	# Faiss Index Path
	FAISS_INDEX = "vectorstore/"

	# Create Vector Store and Index
	def embed_all():
	"""
	Embed all files in the dataset directory
	"""
	# Create the document loader
	loader = DirectoryLoader(DATASET, glob="*.pdf", loader_cls=PyPDFLoader)
	# Load the documents
	documents = loader.load()
	# Create the splitter
	splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
	# Split the documents into chunks
	chunks = splitter.split_documents(documents)
	# Load the embeddings
	embeddings = HuggingFaceEmbeddings()
	# Create the vector store
	vector_store = FAISS.from_documents(chunks, embeddings)
	# Save the vector store
	vector_store.save_local(FAISS_INDEX)

	if __name__ == "__main__":
	embed_all()