LOUIS SANNA
feat(loader)
cc2ce8c
raw
history blame
No virus
1.62 kB
# import
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from .embeddings import EMBEDDING_MODEL_NAME
from .vectorstore import get_vectorstore
def load_data():
    """Parse the source documents, embed them, and return the populated store.

    The chunks returned by ``parse_data`` are embedded with the HuggingFace
    model named by ``EMBEDDING_MODEL_NAME`` and written to a store created
    with ``persist_directory="./chroma_db"``.

    Returns:
        The vector store built from the parsed documents.

    Raises:
        AssertionError: if ``get_vectorstore`` does not return a ``Chroma``
            instance (the check is skipped when Python runs with ``-O``).
    """
    docs = parse_data()
    embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    vectorstore = get_vectorstore(embedding_function)
    assert isinstance(vectorstore, Chroma)

    # from_documents is a classmethod: it builds and returns a *new* store
    # rather than filling the instance it is called through. The original
    # code discarded that return value and handed back the instance from
    # get_vectorstore instead. Return the store that actually received the
    # documents. The call and its arguments are otherwise unchanged.
    # NOTE(review): if the intent was to fill the store configured by
    # get_vectorstore, `vectorstore.add_documents(docs)` is the alternative;
    # confirm against get_vectorstore's persist directory before switching.
    store_cls = type(vectorstore)
    return store_cls.from_documents(
        docs, embedding_function, persist_directory="./chroma_db"
    )
def parse_data(path: str = "data/daoism/tao-te-ching.pdf") -> list:
    """Load a PDF and split it into small chunks tagged with metadata.

    Args:
        path: Location of the PDF to load. Defaults to the Tao Te Ching PDF
            that this function previously hard-coded.

    Returns:
        The chunk documents produced by the 250-character splitter. Each
        chunk's metadata gains ``name``, ``domain``, ``page_number`` and
        ``short_name`` entries derived from the loader-provided ``source``
        and ``page`` entries.
    """
    # Use load() rather than load_and_split(): the latter runs the loader's
    # default text splitter first, so the text would be split twice (once
    # with that default configuration, then again below).
    pages = PyPDFLoader(path).load()

    # Split into chunks of at most 250 characters with no overlap.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=0)
    docs = text_splitter.split_documents(pages)

    for doc in docs:
        source = doc.metadata["source"]
        name = parse_name(source)
        doc.metadata["name"] = name
        doc.metadata["domain"] = parse_domain(source)
        doc.metadata["page_number"] = doc.metadata["page"]
        # short_name currently mirrors name.
        doc.metadata["short_name"] = name
    return docs
def parse_name(source: str) -> str:
    """Return the file name in *source* up to (not including) its first dot.

    Only ``/`` is treated as a path separator. Everything after the first
    ``.`` in the file name is dropped, so ``"a/b/archive.tar.gz"`` yields
    ``"archive"``.
    """
    # Text after the last "/" (the whole string when there is no "/").
    _, _, filename = source.rpartition("/")
    # Text before the first "." (the whole name when there is no ".").
    stem, _, _ = filename.partition(".")
    return stem
def parse_domain(source: str) -> str:
    """Return the name of the directory that directly contains *source*.

    The domain is the parent directory of the file, e.g.
    ``"data/daoism/tao-te-ching.pdf"`` -> ``"daoism"``. Only ``/`` is treated
    as a path separator.

    The previous implementation used the fixed index ``[2]``, which returned
    the file name itself for a path of the form ``data/<domain>/<file>``.
    Indexing from the end gives the parent directory at any depth.

    Raises:
        IndexError: if *source* contains no ``/`` (no parent directory).
    """
    return source.split("/")[-2]
if __name__ == "__main__":
    # Manual smoke test: build the store, then run one similarity query
    # against it and print whatever comes back.
    # The relative imports at the top of this file mean it has to be run as
    # a module inside its package (python -m <package>.<module>).
    store = load_data()
    sample_query = (
        "He who can bear the misfortune of a nation is called the ruler of the world."
    )
    matches = store.similarity_search(sample_query)
    print(matches)