|
from langchain_community.document_loaders import PyMuPDFLoader |
|
from langchain_community.document_loaders import UnstructuredMarkdownLoader |
|
from langchain.schema import Document |
|
from langchain_community.embeddings import OpenAIEmbeddings |
|
from langchain_community.vectorstores import Chroma |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
import re |
|
import os |
|
from dotenv import load_dotenv, find_dotenv |
|
|
|
from zhipuai_embedding import ZhipuAIEmbeddings |
|
|
|
|
|
|
|
|
|
|
|
# Pull API keys and other settings from the nearest .env file into the
# environment before any client objects are constructed.
load_dotenv(find_dotenv())

# Read the knowledge-base PDF; PyMuPDFLoader yields one Document per page.
# NOTE(review): absolute, user-specific path — consider making it configurable.
pdf_loader = PyMuPDFLoader("/Users/chenshuyi/Documents/agent/data_base/knowledge_db/merck.pdf")
pdf_pages = pdf_loader.load()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# A newline NOT adjacent to a CJK character is a PDF hard-wrap artifact,
# not a real paragraph break, and should be removed.  Lookarounds are used
# instead of consuming character classes: the original consuming pattern
# skipped every other wrapped line (re.sub resumes after the consumed
# trailing char) and could never match a newline at the start or end of a
# page.  The former re.DOTALL flag was a no-op (no '.' in the pattern).
_hard_wrap_newline = re.compile(r'(?<![\u4e00-\u9fff])\n(?![\u4e00-\u9fff])')


def _clean_page_text(text: str) -> str:
    """Normalize the raw text extracted from one PDF page.

    Drops hard-wrap newlines not adjacent to Chinese characters, strips
    bullet glyphs, and collapses any run of 2+ newlines to a single one
    (a plain ``replace('\\n\\n', '\\n')`` left triple newlines as doubles).
    """
    text = _hard_wrap_newline.sub('', text)
    text = text.replace('•', '')
    return re.sub(r'\n{2,}', '\n', text)


for pdf_page in pdf_pages:
    pdf_page.page_content = _clean_page_text(pdf_page.page_content)

# Chunk the cleaned pages for embedding: ~500 characters per chunk with a
# 50-character overlap so content spanning a boundary is not lost.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=50)
split_docs = text_splitter.split_documents(pdf_pages)
|
|
|
|
|
|
|
|
|
# Embed every chunk with ZhipuAI and persist the vectors in a local
# Chroma collection on disk.
embedder = ZhipuAIEmbeddings()

# On-disk location of the Chroma collection, relative to this script.
chroma_dir = '../../data_base/vector_db/chroma'

vector_store = Chroma.from_documents(
    documents=split_docs,
    embedding=embedder,
    persist_directory=chroma_dir,
)
vector_store.persist()

print(f"向量库中存储的数量:{vector_store._collection.count()}")
print(f"Chroma 数据存储在: {vector_store._persist_directory}")

# Sanity-check retrieval: pull the three chunks most similar to a query
# and show a 200-character preview of each.
question = "headache"
sim_docs = vector_store.similarity_search(question, k=3)
print(f"检索到的内容数:{len(sim_docs)}")

for i, sim_doc in enumerate(sim_docs):
    preview = sim_doc.page_content[:200]
    print(f"检索到的第{i}个内容: \n{preview}", end="\n--------------\n")