import logging
import os
import shutil

import chromadb
from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.chroma import ChromaVectorStore

logger = logging.getLogger(__name__)

# All working directories live under the root given by the ROOT_PATH
# environment variable (a missing variable raises KeyError at import time).
root_path = os.environ['ROOT_PATH']
# Files that have already been embedded are parked here.
DOCUMENTS_DIRECTORY = os.path.join(root_path, "doc", "loaded")
# Files waiting to be embedded are read from here.
INPUT_DIRECTORY = os.path.join(root_path, "doc", "input")
# On-disk location of the persistent Chroma database.
DB_DIRECTORY = os.path.join(root_path, "chromadb")
COLLECTION_NAME = "SmartAgri"
DB_METADATA = {"hnsw:space": "cosine"}


def move_files(src, dst):
    """Move every file under *src* into *dst*, mirroring the sub-directory tree.

    A file whose destination path already exists is not moved: the copy
    already present in *dst* is kept and the source file stays where it is.
    Directories inside *src* are left in place (only files are moved).

    Args:
        src: Directory to move files out of. Nothing happens if it does
            not exist, because ``os.walk`` then yields nothing.
        dst: Directory to move files into. Created on demand.
    """
    for root, subdirs, files in os.walk(src):
        # Recreate the same sub-directory structure in the destination.
        for subdir in subdirs:
            mirrored = os.path.join(
                dst, os.path.relpath(os.path.join(root, subdir), src)
            )
            os.makedirs(mirrored, exist_ok=True)

        # Move the files themselves.
        for file in files:
            src_file = os.path.join(root, file)
            dst_file = os.path.join(dst, os.path.relpath(src_file, src))
            # If the destination file already exists, keep the existing one.
            if os.path.exists(dst_file):
                continue
            # The loop above only creates directories for sub-directories
            # of src; make sure the parent exists for top-level files too,
            # otherwise shutil.move fails when dst has not been created yet.
            parent = os.path.dirname(dst_file)
            if parent:
                os.makedirs(parent, exist_ok=True)
            shutil.move(src_file, dst_file)


def ChromaVectorIndex(force_new=False):
    """Build or load the vector index backed by the persistent Chroma collection.

    Documents found in ``INPUT_DIRECTORY`` are split, embedded with
    ``Settings.embed_model`` and stored in the ``COLLECTION_NAME`` collection.
    Once the index has been built, the input files are moved to
    ``DOCUMENTS_DIRECTORY`` so that they are not embedded again on the next
    call. When there is nothing to ingest, the index is opened from the
    existing collection instead.

    Note: this function assigns ``Settings.text_splitter`` as a side effect
    whenever documents are ingested.

    Args:
        force_new: When true, move every previously loaded file back to the
            input directory and drop the existing collection so that
            everything is embedded again from scratch.

    Returns:
        The ``VectorStoreIndex`` bound to the Chroma vector store.
    """
    chroma_client = chromadb.PersistentClient(path=DB_DIRECTORY)

    if force_new:
        # Move everything back to the input directory so that all documents
        # are embedded again from the beginning.
        move_files(DOCUMENTS_DIRECTORY, INPUT_DIRECTORY)
        try:
            # Drop the existing database collection.
            chroma_client.delete_collection(COLLECTION_NAME)
        except Exception:
            # Best effort: the collection presumably does not exist yet.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            logger.debug(
                "Could not delete collection %r; continuing",
                COLLECTION_NAME,
                exc_info=True,
            )

    try:
        reader = SimpleDirectoryReader(input_dir=INPUT_DIRECTORY, recursive=True)
        documents = reader.load_data()
    except ValueError:
        # NOTE(review): presumably raised when the input directory is
        # missing or contains no readable files - treated as "nothing new".
        documents = None

    chroma_collection = chroma_client.get_or_create_collection(
        COLLECTION_NAME, metadata=DB_METADATA
    )
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    if documents:
        # Alternative splitter kept for reference:
        # SemanticSplitterNodeParser(include_metadata=True,
        #     include_prev_next_rel=True, embed_model=Settings.embed_model,
        #     breakpoint_percentile_threshold=95)
        Settings.text_splitter = SentenceSplitter(chunk_size=1500, chunk_overlap=500)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents,
            transformations=[Settings.text_splitter],
            storage_context=storage_context,
            embed_model=Settings.embed_model,
        )
    else:
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store, embed_model=Settings.embed_model
        )

    # Move the input files to the loaded directory only after embedding has
    # finished. If indexing raises, the files stay in the input directory and
    # are picked up again on the next run instead of being silently skipped.
    move_files(INPUT_DIRECTORY, DOCUMENTS_DIRECTORY)
    return index