#%%
# Index setup: load a PDF, split it into chunks, embed the chunks with Azure
# OpenAI, and persist them in a Chroma vector store for retrieval.
# Expects the following keys in .env: TAVILY_API_KEY, LANGCHAIN_API_KEY,
# API_BASE, API_KEY, API_VERSION.
import os
import sys
import time

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm import tqdm  # Progress bar for batch embedding

# Load environment variables from .env
load_dotenv()

# Read configuration from environment variables
try:
    tavily_api_key = os.getenv("TAVILY_API_KEY")
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
    os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
    os.environ["LANGCHAIN_PROJECT"] = "legalairag"

    azure_endpoint = os.getenv("API_BASE")
    api_key = os.getenv("API_KEY")
    api_version = os.getenv("API_VERSION")
    print("Environment variables loaded successfully.")
except Exception as e:
    print(f"Error loading environment variables: {e}")
    sys.exit(1)

# Set up Azure OpenAI embeddings and chat model
try:
    embd = AzureOpenAIEmbeddings(
        api_key=api_key,
        api_version=api_version,
        azure_endpoint=azure_endpoint,
    )
    llm = AzureChatOpenAI(
        api_key=api_key,
        api_version=api_version,
        azure_endpoint=azure_endpoint,
        azure_deployment="gpt-4o",
        temperature=0.3,
    )
    print("Azure OpenAI embeddings and model set up successfully.")
except Exception as e:
    print(f"Error setting up Azure OpenAI: {e}")
    sys.exit(1)


# Check whether a persisted vector store already exists
def vector_store_exists(persist_directory):
    return os.path.exists(persist_directory) and len(os.listdir(persist_directory)) > 0


# Load and split the source document
try:
    print("Loading PDF document...")
    docs = PyPDFLoader("assets/data/IPC_and_Constitution.pdf").load()
    print("PDF loaded successfully.")

    print("Splitting documents...")
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=500, chunk_overlap=100
    )
    doc_splits = text_splitter.split_documents(docs)
    print(f"Documents split into {len(doc_splits)} chunks.")
except Exception as e:
    print(f"Error processing documents: {e}")
    sys.exit(1)


@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
def create_vector_store_batch(persist_directory, documents, embedding, batch_size=50):
    """Embed documents in batches so large inputs are sent to the endpoint gradually."""
    vectorstore = None
    for i in tqdm(range(0, len(documents), batch_size), desc="Processing batches"):
        batch = documents[i:i + batch_size]
        if vectorstore is None:
            vectorstore = Chroma.from_documents(
                documents=batch,
                collection_name="rag-chroma",
                embedding=embedding,
                persist_directory=persist_directory,
            )
        else:
            vectorstore.add_documents(batch)
        time.sleep(1)  # Small delay between batches
    return vectorstore


# Create a new vector store or load the existing one
try:
    persist_directory = "./vectordb"
    if not vector_store_exists(persist_directory):
        print("Creating new vector store...")
        vectorstore = create_vector_store_batch(persist_directory, doc_splits, embd)
        print("New vector store created and populated.")
    else:
        print("Loading existing vector store...")
        vectorstore = Chroma(
            persist_directory=persist_directory,
            embedding_function=embd,
            collection_name="rag-chroma",
        )
        print("Existing vector store loaded.")

    retriever = vectorstore.as_retriever()
    print("Retriever set up successfully.")
except Exception as e:
    print(f"Error with vector store operations: {e}")
    sys.exit(1)

print("Index setup completed successfully.")
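
#%%
# Optional sanity check (a minimal sketch, not part of the pipeline above):
# run one retrieval against the freshly built index to confirm that the
# embeddings and the Chroma collection are wired together correctly.
# The sample question below is hypothetical; use any query that matches
# the content of the indexed PDF.
sample_query = "What does the Indian Penal Code say about culpable homicide?"
retrieved_docs = retriever.invoke(sample_query)  # returns a list of Documents
print(f"Retrieved {len(retrieved_docs)} chunks for the sample query.")
for doc in retrieved_docs[:2]:
    # Preview the first 200 characters of the top two chunks
    print(doc.page_content[:200].replace("\n", " "), "...")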