import os
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Set your Hugging Face token (not required for the local sentence-transformers model below)
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# Load documents
loader = DirectoryLoader('data2/text/range/0-5000', loader_cls=TextLoader)
documents = loader.load()
print('len of documents are', len(documents))

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=250)
all_splits = text_splitter.split_documents(documents)
print("Length of all_splits:", len(all_splits))

# Generate embeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Store embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)
vectorstore.save_local('faiss_index')
print("Embeddings stored successfully!")
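
# Optional follow-up: a minimal sketch (not part of the original script) showing how the
# saved index can be reloaded and queried with a similarity search, using the same
# `embeddings` object defined above. The query string is a placeholder assumption.
loaded_vectorstore = FAISS.load_local('faiss_index', embeddings)
results = loaded_vectorstore.similarity_search("example query", k=3)
print("Top result preview:", results[0].page_content[:200])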