import logging

import ray
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from faiss import IndexFlatL2  # Using L2 (Euclidean) distance for simplicity

# Initialize Ray
ray.init()

# Set up basic configuration for logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load documents with logging
logging.info("Loading documents...")
loader = DirectoryLoader('data', glob="./*.txt")
documents = loader.load()

# Extract text from documents and split into manageable chunks with logging
logging.info("Extracting and splitting texts from documents...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
texts = []
for document in documents:
    # LangChain loaders return Document objects; their text lives in `page_content`
    text_content = getattr(document, "page_content", "")

    # Check that text_content is valid before splitting
    if text_content and isinstance(text_content, str):
        texts.extend(text_splitter.split_text(text_content))
    else:
        logging.warning(f"Invalid document or empty content encountered: {document}")

# Instantiate the embedding model once and reuse it for every chunk
embeddings_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")

# Define embedding function
def embedding_function(text):
    # Ensure input is valid
    if not text or not isinstance(text, str):
        raise ValueError(f"Invalid text for embedding: {text}")
    return embeddings_model.embed_query(text)

# Create FAISS index for embeddings
index = IndexFlatL2(768)  # Dimension of InLegalBERT embeddings, adjust as needed

# Initialize FAISS with an empty in-memory docstore; texts and their vectors are
# added together below so the index and docstore ids stay in sync
docstore = InMemoryDocstore({})
index_to_docstore_id = {}
faiss_db = FAISS(embedding_function, index, docstore, index_to_docstore_id)

# Store embeddings in FAISS
logging.info("Storing embeddings in FAISS...")
for i, text in enumerate(texts):
    try:
        if text:  # Check that the text is not None or empty
            # add_texts embeds the chunk and stores both the vector and the text
            faiss_db.add_texts([text])
        else:
            logging.warning(f"Skipping invalid or empty text at index {i}.")
    except Exception as e:
        logging.error(f"Error while processing text at index {i}: {text}, Error: {e}")

# Exporting the vector embeddings database with logging
logging.info("Exporting the vector embeddings database...")
faiss_db.save_local("ipc_embed_db")

# Log a message to indicate the completion of the process
logging.info("Process completed successfully.")

# Shutdown Ray after the process
ray.shutdown()
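
# Usage sketch (an addition, not part of the original pipeline): reload the exported
# "ipc_embed_db" index and run a quick similarity search to verify it round-trips.
# The query string is hypothetical, and depending on your langchain_community
# version, load_local may require allow_dangerous_deserialization=True because the
# docstore is stored with pickle.
logging.info("Reloading the exported index for a quick smoke test...")
reloaded_db = FAISS.load_local(
    "ipc_embed_db",
    embeddings_model,
    allow_dangerous_deserialization=True,
)
for doc in reloaded_db.similarity_search("punishment for criminal breach of trust", k=3):
    logging.info("Match: %s", doc.page_content[:200])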