"""Ingest PDF documents into a Chroma vector store.

Parses every PDF under DOCUMENT_PATH with Unstructured, splits the text
into overlapping chunks, embeds them with OpenAI, and persists the
resulting vector store to DB_DIR.
"""

import os
from glob import glob

from dotenv import load_dotenv
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Load environment variables (e.g. OPENAI_API_KEY) from a .env file.
load_dotenv()

DOCUMENT_PATH = "data/raw/cixiidae"
DB_DIR = "chroma"


def parse_documents(path):
    """Parse every PDF file in *path* and return the combined documents.

    Files that fail to parse are skipped (parse_document returns an
    empty list for them), so one bad PDF does not abort the batch.
    """
    pdf_files = glob(os.path.join(path, "*.pdf"))
    documents = []
    for file_path in pdf_files:
        documents.extend(parse_document(file_path))
    return documents


def parse_document(file_path):
    """Parse a single file with UnstructuredFileLoader.

    Returns the list of loaded documents, or an empty list if parsing
    fails.  Returning [] (instead of falling through to None) keeps the
    caller's documents.extend(...) from raising TypeError on a bad file
    — which would defeat the purpose of catching the exception here.
    """
    try:
        loader = UnstructuredFileLoader(file_path)
        document = loader.load()
        print(f"File parsed: {file_path}")
        return document
    except Exception as e:
        print(f"An error occurred while processing the file {file_path}: {str(e)}")
        return []  # bug fix: was implicitly None, crashing extend() upstream


def split(documents):
    """Split *documents* into ~1000-character chunks with 20-char overlap."""
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    return text_splitter.split_documents(documents)


def persist(documents):
    """Embed *documents* with OpenAI and persist them to the Chroma DB."""
    embeddings = OpenAIEmbeddings()
    vectordb = Chroma.from_documents(
        documents, embedding=embeddings, persist_directory=DB_DIR
    )
    vectordb.persist()


def main():
    """Run the full ingestion pipeline: parse, split, embed, persist."""
    documents = parse_documents(DOCUMENT_PATH)
    documents = split(documents)
    print(f"Total pages: {len(documents)}")
    persist(documents)


if __name__ == "__main__":
    main()