"""Load the Tao Te Ching PDF, chunk it, and index the chunks in a Chroma store."""

# import
import logging
from pathlib import PurePosixPath

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader

from .embeddings import EMBEDDING_MODEL_NAME
from .vectorstore import get_vectorstore

logger = logging.getLogger(__name__)

# Defaults reproduce the values that were previously hard-coded in the functions.
DEFAULT_PDF_PATH = "data/daoism/tao-te-ching.pdf"
DEFAULT_PERSIST_DIRECTORY = "./chroma_db"
DEFAULT_CHUNK_SIZE = 250
DEFAULT_CHUNK_OVERLAP = 0


def load_data(persist_directory: str = DEFAULT_PERSIST_DIRECTORY) -> Chroma:
    """Parse the source PDF, embed its chunks and index them in Chroma.

    Args:
        persist_directory: Directory passed to ``Chroma.from_documents`` as
            its ``persist_directory``.

    Returns:
        The Chroma store built from the parsed documents.
    """
    docs = parse_data()
    embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    # NOTE(review): this call and the type check are kept from the original so
    # that whatever setup get_vectorstore performs still happens; confirm
    # whether it is still needed now that its handle is no longer returned.
    vectorstore = get_vectorstore(embedding_function)
    assert isinstance(vectorstore, Chroma)

    # from_documents is a classmethod that builds and returns a *new* store.
    # The original invoked it on the instance and discarded the result, then
    # returned the handle that had never been given the documents.
    return Chroma.from_documents(
        docs, embedding_function, persist_directory=persist_directory
    )


def parse_data(
    pdf_path: str = DEFAULT_PDF_PATH,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> list:
    """Load a PDF, split it into chunks and annotate each chunk's metadata.

    Every returned document gains the metadata keys ``name``, ``domain``,
    ``page_number`` and ``short_name``, derived from the ``source`` and
    ``page`` entries already present in its metadata.

    Args:
        pdf_path: Path of the PDF to load.
        chunk_size: ``chunk_size`` given to the text splitter.
        chunk_overlap: ``chunk_overlap`` given to the text splitter.

    Returns:
        The list of chunked documents.
    """
    loader = PyPDFLoader(pdf_path)
    pages = loader.load_and_split()

    # split it into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    docs = text_splitter.split_documents(pages)
    # Log a count instead of printing every chunk to stdout.
    logger.debug("Parsed %d chunks from %s", len(docs), pdf_path)

    for doc in docs:
        source = doc.metadata["source"]
        doc.metadata["name"] = parse_name(source)
        doc.metadata["domain"] = parse_domain(source)
        doc.metadata["page_number"] = doc.metadata["page"]
        doc.metadata["short_name"] = doc.metadata["name"]
    return docs


def parse_name(source: str) -> str:
    """Return the last ``/``-separated segment of *source* up to its first dot."""
    return source.split("/")[-1].split(".")[0]


def parse_domain(source: str) -> str:
    """Return the name of the directory that directly contains *source*.

    For ``"data/daoism/tao-te-ching.pdf"`` this is ``"daoism"``. The previous
    fixed index ``[2]`` returned the file name for that path and raised
    ``IndexError`` for shorter ones; a path with no directory part now yields
    an empty string.
    """
    # PurePosixPath keeps the original "/"-only separator semantics.
    return PurePosixPath(source).parent.name


if __name__ == "__main__":
    db = load_data()

    # query it
    query = (
        "He who can bear the misfortune of a nation is called the ruler of the world."
    )
    docs = db.similarity_search(query)
    print(docs)