File size: 2,415 Bytes
d98ba57 3657397 d98ba57 cc2ce8c fe19632 cc2ce8c d98ba57 cc2ce8c d6936f0 cc2ce8c fe19632 cc2ce8c d6936f0 cc2ce8c d98ba57 cc2ce8c d6936f0 cc2ce8c d98ba57 fe19632 419f9af fe19632 cc2ce8c d98ba57 3657397 d98ba57 3657397 d98ba57 3657397 d98ba57 3657397 cc2ce8c d98ba57 cc2ce8c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import os
import shutil
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from .config import get_sources
from .embeddings import EMBEDDING_MODEL_NAME
from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
def load_data():
    """Parse all configured sources, embed them, and index them into Chroma.

    Returns:
        The populated Chroma vectorstore.
    """
    print("Loading data...")
    docs = parse_data()
    print("Documents loaded")
    embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    print("Building index...")
    vectorstore = get_vectorstore(embedding_function)
    if not isinstance(vectorstore, Chroma):
        # `assert` is stripped under `python -O`; raise explicitly instead.
        raise TypeError(
            f"Expected a Chroma vectorstore, got {type(vectorstore).__name__}"
        )
    # BUG FIX: the original called `vectorstore.from_documents(...)` —
    # `from_documents` is a classmethod that builds and RETURNS a new store,
    # and that return value was discarded, so the instance returned below
    # never received the documents. Add them to the existing store instead.
    vectorstore.add_documents(docs)
    # Persist explicitly so the index survives process exit (legacy
    # langchain Chroma does not auto-persist).
    vectorstore.persist()
    print("Index built")
    return vectorstore
def parse_data():
    """Load every configured PDF source and split it into metadata-tagged chunks.

    Returns:
        A flat list of document chunks; each carries ``name``, ``domain``,
        ``page_number`` and ``short_name`` metadata, plus ``url`` when the
        source defines one.
    """
    # The splitter is loop-invariant — build it once, not once per source.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = []
    for source in get_sources():
        loader = PyPDFLoader(source["file_path"])
        pages = loader.load_and_split()
        doc_chunks = text_splitter.split_documents(pages)
        url = source.get("url")
        for chunk in doc_chunks:
            chunk.metadata["name"] = source["name"]
            chunk.metadata["domain"] = source["domain"]
            if url:
                # Reuse the value fetched above instead of a second lookup.
                chunk.metadata["url"] = url
            # `page` is set by PyPDFLoader; mirror it under a clearer key.
            chunk.metadata["page_number"] = chunk.metadata["page"]
            chunk.metadata["short_name"] = chunk.metadata["name"]
            docs.append(chunk)
    return docs
def clear_index(directory_path=None):
    """Delete every file and subdirectory inside the persisted index directory.

    Failures on individual entries are reported and skipped (best-effort).

    Args:
        directory_path: Directory to clear. Defaults to ``PERSIST_DIRECTORY``
            (parameter added for generality/testability; default behavior is
            unchanged). A missing directory is a no-op instead of an error.
    """
    if directory_path is None:
        directory_path = PERSIST_DIRECTORY
    # Robustness: nothing to clear if the index was never created.
    if not os.path.isdir(directory_path):
        return
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        try:
            print(f"Deleting {file_path}")
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Best-effort: report and continue with the remaining entries.
            print(f"Failed to delete {file_path}. Reason: {e}")
if __name__ == "__main__":
    # Script entry point: wipe any stale index, rebuild it from the
    # configured sources, then sanity-check retrieval with a sample query.
    clear_index()
    vectorstore = load_data()
    sample_query = (
        "He who can bear the misfortune of a nation is called the ruler of the world."
    )
    matches = vectorstore.similarity_search(sample_query)
    print(matches)
|