"""Build a Chroma vector store from the PDF documents found under DATA_PATH.

Pipeline: load documents -> split them into overlapping chunks -> embed the
chunks and write them to a Chroma database at CHROMA_PATH.
"""

import logging
import os
import shutil

from dotenv import load_dotenv
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Legacy import paths, kept for reference:
# from langchain.document_loaders import DirectoryLoader
# from langchain.embeddings import OpenAIEmbeddings

logger = logging.getLogger(__name__)

# Load environment variables. Assumes that the project contains a .env file
# with API keys.
load_dotenv()

# ---- Set OpenAI API key
# Change the environment variable name from "OPENAI_API_KEY" to the name
# given in your .env file.
# NOTE(review): nothing in this file reads that variable any more (embeddings
# come from HuggingFaceEmbeddings) -- confirm whether this note is still needed.

CHROMA_PATH = "chroma"
DATA_PATH = "data/"

# Index of the chunk echoed to stdout as a sanity check after splitting.
_PREVIEW_CHUNK_INDEX = 10


def main() -> None:
    """Script entry point: configure logging and rebuild the vector store."""
    # Without a configured handler the INFO-level progress messages emitted
    # below are dropped. basicConfig is a no-op if the root logger already
    # has handlers, so an embedding application keeps its own setup.
    logging.basicConfig(level=logging.INFO)
    generate_data_store()


def generate_data_store() -> None:
    """Load the source documents, split them, and persist them to Chroma."""
    logger.info("Loading documents..")
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)


def load_documents() -> list[Document]:
    """Load every file matching ``*.pdf`` in DATA_PATH.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    loader = DirectoryLoader(DATA_PATH, glob="*.pdf")
    documents = loader.load()
    logger.info("Found %d documents..", len(documents))
    return documents


def split_text(documents: list[Document]) -> list[Document]:
    """Split documents into overlapping chunks and print a sample chunk.

    Chunks are at most 1800 characters (measured with ``len``) with a
    100-character overlap; ``add_start_index=True`` asks the splitter to
    record each chunk's start offset in its metadata.

    Args:
        documents: Documents to split.

    Returns:
        The resulting chunks, in the order produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1800,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Show one chunk as a sanity check. The index is clamped so that a small
    # corpus (fewer than 11 chunks) no longer raises IndexError, and the
    # preview is skipped entirely when there are no chunks at all.
    if chunks:
        sample = chunks[min(_PREVIEW_CHUNK_INDEX, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)

    return chunks


def save_to_chroma(chunks: list[Document]) -> None:
    """Replace the Chroma database at CHROMA_PATH with one built from chunks.

    Any existing database directory is deleted first. When ``chunks`` is
    empty the function returns early and leaves the existing database
    untouched, rather than wiping it with nothing to put in its place.

    Args:
        chunks: Document chunks to embed and store.
    """
    if not chunks:
        logger.warning(
            "No chunks to save; leaving existing store at %s untouched.",
            CHROMA_PATH,
        )
        return

    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks,
        HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
        persist_directory=CHROMA_PATH,
    )
    # NOTE(review): persist() is reported as deprecated in recent
    # langchain-community releases (Chroma >= 0.4 persists automatically) --
    # confirm against the pinned versions before removing this call.
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")


if __name__ == "__main__":
    main()