|
import os |
|
import shutil |
|
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.vectorstores import Chroma |
|
from langchain.embeddings import HuggingFaceEmbeddings |
|
from langchain.document_loaders import PyPDFLoader |
|
|
|
from .config import get_sources |
|
from .embeddings import EMBEDDING_MODEL_NAME |
|
from .vectorstore import PERSIST_DIRECTORY, get_vectorstore |
|
|
|
|
|
def load_data():
    """Build (or rebuild) the vector index from the configured sources.

    Parses every configured PDF into metadata-tagged chunks, embeds them
    with the project's HuggingFace model, and adds them to the persisted
    Chroma store.

    Returns:
        Chroma: the vector store containing the newly indexed documents.
    """
    print("Loading data...")
    docs = parse_data()
    print("Documents loaded")

    embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    print("Building index...")
    vectorstore = get_vectorstore(embedding_function)

    # Sanity check only; get_vectorstore is expected to hand back a Chroma
    # store rooted at PERSIST_DIRECTORY.
    assert isinstance(vectorstore, Chroma)

    # BUG FIX: the original called ``vectorstore.from_documents(...)`` —
    # ``from_documents`` is a classmethod that builds and returns a *new*
    # store, so the result was discarded and the returned ``vectorstore``
    # instance never received the documents. Add them to the existing
    # store instead, then persist to PERSIST_DIRECTORY.
    vectorstore.add_documents(docs)
    vectorstore.persist()

    print("Index built")
    return vectorstore
|
|
|
|
|
def parse_data():
    """Load every configured PDF source and split it into tagged chunks.

    Returns:
        list: document chunks (chunk_size=1000, no overlap) whose metadata
        carries ``name``, ``domain``, ``page_number``, ``short_name`` and,
        when the source defines one, ``url``.
    """
    # The splitter is stateless and loop-invariant — build it once instead
    # of once per source (the original recreated it on every iteration).
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

    docs = []
    for source in get_sources():
        file_path = source["file_path"]
        loader = PyPDFLoader(file_path)
        pages = loader.load_and_split()

        doc_chunks = text_splitter.split_documents(pages)

        # ``url`` is per-source, not per-chunk — resolve it once.
        url = source.get("url")
        for chunk in doc_chunks:
            chunk.metadata["name"] = source["name"]
            chunk.metadata["domain"] = source["domain"]
            if url:
                chunk.metadata["url"] = url
            # Mirror the loader's "page" metadata under the keys the rest
            # of the app reads (presumably PyPDF's page index — confirm).
            chunk.metadata["page_number"] = chunk.metadata["page"]
            chunk.metadata["short_name"] = chunk.metadata["name"]
            docs.append(chunk)

    return docs
|
|
|
|
|
def clear_index():
    """Empty the persisted index directory, entry by entry.

    Each deletion failure is printed and skipped so one bad entry does not
    abort the rest of the cleanup (best-effort by design).
    """
    for entry in os.listdir(PERSIST_DIRECTORY):
        target = os.path.join(PERSIST_DIRECTORY, entry)
        try:
            print(f"Deleting {target}")
            if os.path.isdir(target) and not os.path.islink(target):
                # Real directory: remove the whole tree.
                shutil.rmtree(target)
            elif os.path.isfile(target) or os.path.islink(target):
                # Regular file or symlink (including symlinks to dirs).
                os.unlink(target)
        except Exception as e:
            print(f"Failed to delete {target}. Reason: {e}")
|
|
|
|
|
if __name__ == "__main__":
    # Manual entry point: wipe any previous index, rebuild it from the
    # configured sources, then run a quick similarity-search smoke test.
    clear_index()
    db = load_data()

    query = "He who can bear the misfortune of a nation is called the ruler of the world."
    results = db.similarity_search(query)
    print(results)
|
|