File size: 1,622 Bytes
cc2ce8c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# import
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from .embeddings import EMBEDDING_MODEL_NAME
from .vectorstore import get_vectorstore
def load_data():
docs = parse_data()
embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
vectorstore = get_vectorstore(embedding_function)
assert isinstance(vectorstore, Chroma)
vectorstore.from_documents(
docs, embedding_function, persist_directory="./chroma_db"
)
return vectorstore
def parse_data():
loader = PyPDFLoader("data/daoism/tao-te-ching.pdf")
pages = loader.load_and_split()
# split it into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=0)
docs = text_splitter.split_documents(pages)
print(docs)
for doc in docs:
doc.metadata["name"] = parse_name(doc.metadata["source"])
doc.metadata["domain"] = parse_domain(doc.metadata["source"])
doc.metadata["page_number"] = doc.metadata["page"]
doc.metadata["short_name"] = doc.metadata["name"]
return docs
def parse_name(source: str) -> str:
return source.split("/")[-1].split(".")[0]
def parse_domain(source: str) -> str:
return source.split("/")[2]
if __name__ == "__main__":
db = load_data()
# query it
query = (
"He who can bear the misfortune of a nation is called the ruler of the world."
)
docs = db.similarity_search(query)
print(docs)
|