#%%
"""Build (or reuse) a Pinecone index from a PDF and expose a LangChain retriever.

Running this cell/script:

1. loads configuration from the environment (``.env`` supported via dotenv),
2. creates the Azure OpenAI embedding and chat clients,
3. loads ``assets/data/IPC_and_Constitution.pdf`` and splits it into chunks,
4. creates the Pinecone index if it does not exist yet,
5. embeds and upserts the chunks when the index is empty, and
6. leaves ``vectorstore`` and ``retriever`` defined at module level.

Any setup failure is printed and terminates the process with exit status 1.
"""
import os
import sys
import time
import traceback

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Pinecone as LangchainPinecone
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm import tqdm

# Load environment variables
load_dotenv()

# Set up environment variables
try:
    tavily_api_key = os.getenv("TAVILY_API_KEY")
    langchain_api_key = os.getenv("LANGCHAIN_API_KEY")

    azure_endpoint = os.getenv("API_BASE")
    api_key = os.getenv("API_KEY")
    api_version = os.getenv("API_VERSION")

    # Pinecone environment variables
    PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
    PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
    PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")

    # Fail fast with a readable message. Without this check an unset
    # LANGCHAIN_API_KEY surfaces as "str expected, not NoneType", and an unset
    # index name is only noticed after several retried Pinecone calls.
    _required = {
        "LANGCHAIN_API_KEY": langchain_api_key,
        "PINECONE_API_KEY": PINECONE_API_KEY,
        "PINECONE_INDEX_NAME": PINECONE_INDEX_NAME,
    }
    _missing = [name for name, value in _required.items() if not value]
    if _missing:
        raise ValueError(
            f"missing required environment variable(s): {', '.join(_missing)}"
        )

    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
    os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
    os.environ["LANGCHAIN_PROJECT"] = "legalairag"

    print("Environment variables loaded successfully.")
except Exception as e:
    print(f"Error loading environment variables: {e}")
    sys.exit(1)

# Set up Azure OpenAI embeddings and model
try:
    embd = AzureOpenAIEmbeddings(
        api_key=api_key,
        api_version=api_version,
        azure_endpoint=azure_endpoint,
    )
    llm = AzureChatOpenAI(
        api_key=api_key,
        api_version=api_version,
        azure_endpoint=azure_endpoint,
        azure_deployment="gpt-4o",
        temperature=0.3,
    )
    print("Azure OpenAI embeddings and model set up successfully.")
except Exception as e:
    print(f"Error setting up Azure OpenAI: {e}")
    sys.exit(1)

# Initialize Pinecone (guarded like the other setup steps so that a bad
# configuration produces a message and exit status 1, not a raw traceback)
try:
    pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
except Exception as e:
    print(f"Error initializing Pinecone client: {e}")
    sys.exit(1)


# Function to check if Pinecone index exists
def pinecone_index_exists(index_name):
    """Return True if ``index_name`` is among the indexes listed by ``pc``."""
    return index_name in pc.list_indexes().names()


# Load and process documents
try:
    print("Loading PDF document...")
    docs = PyPDFLoader("assets/data/IPC_and_Constitution.pdf").load()
    print("PDF loaded successfully.")

    print("Splitting documents...")
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=400, chunk_overlap=100
    )
    doc_splits = text_splitter.split_documents(docs)
    print(f"Documents split into {len(doc_splits)} chunks.")
except Exception as e:
    print(f"Error processing documents: {e}")
    sys.exit(1)


@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
def create_pinecone_index(index_name, dimension, spec):
    """Create the index if it is missing, then return a handle to it.

    Args:
        index_name: Name of the Pinecone index.
        dimension: Vector dimension used when the index has to be created.
        spec: Index spec passed through to ``pc.create_index``.

    Returns:
        The object returned by ``pc.Index(index_name)``.

    Raises:
        tenacity.RetryError: If all five attempts fail.
    """
    try:
        if not pinecone_index_exists(index_name):
            print(f"Creating new Pinecone index: {index_name}")
            pc.create_index(
                name=index_name,
                dimension=dimension,
                metric='cosine',
                spec=spec,
            )
        print(f"Connecting to Pinecone index: {index_name}")
        return pc.Index(index_name)
    except Exception as e:
        print(f"Error creating/connecting to Pinecone index: {e}")
        raise  # re-raise so the retry decorator can schedule another attempt


@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
def _upsert_batch(index, ids, batch, embedding):
    """Embed one batch of documents and upsert it (retried independently)."""
    embeds = embedding.embed_documents([doc.page_content for doc in batch])
    metadata = [{"text": doc.page_content} for doc in batch]
    index.upsert(vectors=list(zip(ids, embeds, metadata)))


def upsert_to_pinecone(index, documents, embedding, batch_size=50):
    """Embed ``documents`` and upsert them into ``index`` in batches.

    Each vector's id is the string form of the document's position in
    ``documents`` and its metadata is ``{"text": page_content}``.

    Retries are applied per batch, so a transient failure does not restart the
    upload from the first batch and re-embed chunks that were already stored.

    Args:
        index: Pinecone index handle exposing ``upsert(vectors=...)``.
        documents: Sequence of objects with a ``page_content`` attribute.
        embedding: Object exposing ``embed_documents(list_of_texts)``.
        batch_size: Number of documents per upsert call; must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        tenacity.RetryError: If a batch still fails after five attempts.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for start in tqdm(range(0, len(documents), batch_size), desc="Processing batches"):
        batch = documents[start:start + batch_size]
        ids = [str(start + offset) for offset in range(len(batch))]
        _upsert_batch(index, ids, batch, embedding)
        time.sleep(1)  # Add a small delay between batches


# Create or load Pinecone index
try:
    print("Setting up Pinecone index...")
    # Must equal the output size of the embedding model behind `embd`.
    dimension = 1536  # Dimension for Azure OpenAI embeddings
    pinecone_index = create_pinecone_index(
        PINECONE_INDEX_NAME,
        dimension,
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

    print("Checking index statistics...")
    index_stats = pinecone_index.describe_index_stats()
    print(f"Index stats: {index_stats}")

    # Only populate an empty index; a non-empty one is assumed to hold these
    # documents already.
    if index_stats['total_vector_count'] == 0:
        print("Upserting documents to Pinecone...")
        upsert_to_pinecone(pinecone_index, doc_splits, embd)
        print("Documents upserted to Pinecone successfully.")
    else:
        print("Pinecone index already populated.")

    print("Creating LangChain vectorstore...")
    # Pass the Embeddings object itself: the vector store treats a bare
    # callable such as `embd.embed_query` as deprecated.
    vectorstore = LangchainPinecone(pinecone_index, embd, "text")
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    print("Retriever set up successfully.")
except Exception as e:
    print(f"Error with Pinecone operations: {e}")
    traceback.print_exc()
    sys.exit(1)

print("Index setup completed successfully.")