# Hugging Face Spaces page header captured by the scrape ("Spaces: Sleeping") —
# not part of the program; kept here as a comment so the file parses.
#%%
import os
import sys
import time

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Pinecone as LangchainPinecone
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm import tqdm
# Load environment variables from a local .env file (no-op if the file is absent).
load_dotenv()

# Read configuration from the environment.
# NOTE: os.getenv() never raises for a missing key -- it returns None -- so the
# original try/except could not detect absent variables; instead a missing
# LANGCHAIN_API_KEY crashed later with an opaque TypeError when assigned into
# os.environ (whose values must be strings). Validate explicitly and exit with
# a clear message.
tavily_api_key = os.getenv("TAVILY_API_KEY")
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
azure_endpoint = os.getenv("API_BASE")
api_key = os.getenv("API_KEY")
api_version = os.getenv("API_VERSION")

# Pinecone environment variables
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")

_required = {
    "LANGCHAIN_API_KEY": langchain_api_key,
    "API_BASE": azure_endpoint,
    "API_KEY": api_key,
    "API_VERSION": api_version,
    "PINECONE_API_KEY": PINECONE_API_KEY,
    "PINECONE_INDEX_NAME": PINECONE_INDEX_NAME,
}
_missing = [name for name, value in _required.items() if not value]
if _missing:
    print(f"Error loading environment variables: missing {', '.join(_missing)}")
    sys.exit(1)

# Enable LangSmith tracing for this project.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
os.environ["LANGCHAIN_PROJECT"] = "legalairag"
print("Environment variables loaded successfully.")
# --- Azure OpenAI clients -------------------------------------------------
# Build the embedding client and the chat model; abort the script if either
# cannot be constructed (e.g. bad credentials or endpoint).
try:
    embd = AzureOpenAIEmbeddings(
        azure_endpoint=azure_endpoint,
        api_key=api_key,
        api_version=api_version,
    )
    llm = AzureChatOpenAI(
        azure_endpoint=azure_endpoint,
        api_key=api_key,
        api_version=api_version,
        azure_deployment="gpt-4o",
        temperature=0.3,
    )
    print("Azure OpenAI embeddings and model set up successfully.")
except Exception as e:
    print(f"Error setting up Azure OpenAI: {e}")
    sys.exit(1)
# Initialize the Pinecone client.
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)


def pinecone_index_exists(index_name):
    """Return True if an index named *index_name* already exists in Pinecone."""
    existing_names = pc.list_indexes().names()
    return index_name in existing_names
# --- Document ingestion ---------------------------------------------------
# Load the source PDF and split it into overlapping, token-sized chunks.
try:
    print("Loading PDF document...")
    loader = PyPDFLoader("assets/data/IPC_and_Constitution.pdf")
    docs = loader.load()
    print("PDF loaded successfully.")

    print("Splitting documents...")
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=400, chunk_overlap=100
    )
    doc_splits = splitter.split_documents(docs)
    print(f"Documents split into {len(doc_splits)} chunks.")
except Exception as e:
    print(f"Error processing documents: {e}")
    sys.exit(1)
def create_pinecone_index(index_name, dimension, spec):
    """Create the Pinecone index if needed and return a handle to it.

    Args:
        index_name: Name of the index to create or connect to.
        dimension: Embedding vector dimension, used only on creation.
        spec: Pinecone ServerlessSpec/PodSpec, used only on creation.

    Returns:
        A Pinecone Index handle for *index_name*.

    Raises:
        Re-raises any Pinecone client error after logging it.
    """
    try:
        if not pinecone_index_exists(index_name):
            print(f"Creating new Pinecone index: {index_name}")
            pc.create_index(
                name=index_name,
                dimension=dimension,
                metric='cosine',
                spec=spec
            )
            # Index creation is asynchronous: poll until Pinecone reports the
            # index ready, so the immediate describe/upsert calls don't fail.
            while not pc.describe_index(index_name).status['ready']:
                time.sleep(1)
        print(f"Connecting to Pinecone index: {index_name}")
        return pc.Index(index_name)
    except Exception as e:
        print(f"Error creating/connecting to Pinecone index: {e}")
        raise
def upsert_to_pinecone(index, documents, embedding, batch_size=50):
    """Embed *documents* and upsert them into *index* in batches.

    Args:
        index: Pinecone Index handle accepting index.upsert(vectors=...).
        documents: Sequence of LangChain Documents (only .page_content is used).
        embedding: Embedding client exposing embed_documents(list[str]).
        batch_size: Number of documents embedded/upserted per request.

    Vector ids are the documents' positional indices (as strings), so
    re-running the ingestion overwrites rather than duplicates vectors.
    """
    total = len(documents)
    for start in tqdm(range(0, total, batch_size), desc="Processing batches"):
        batch = documents[start:start + batch_size]
        ids = [str(j) for j in range(start, min(start + batch_size, total))]
        embeds = embedding.embed_documents([doc.page_content for doc in batch])
        metadata = [{"text": doc.page_content} for doc in batch]
        index.upsert(vectors=list(zip(ids, embeds, metadata)))
        # Throttle between batches only -- the original also slept after the
        # final batch, delaying the script by one pointless extra second.
        if start + batch_size < total:
            time.sleep(1)
# --- Pinecone index setup and retriever -----------------------------------
try:
    print("Setting up Pinecone index...")
    dimension = 1536  # Dimension for Azure OpenAI embeddings
    pinecone_index = create_pinecone_index(
        PINECONE_INDEX_NAME,
        dimension,
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

    print("Checking index statistics...")
    index_stats = pinecone_index.describe_index_stats()
    print(f"Index stats: {index_stats}")

    # Ingest only when the index is empty; otherwise reuse existing vectors.
    if index_stats['total_vector_count'] == 0:
        print("Upserting documents to Pinecone...")
        upsert_to_pinecone(pinecone_index, doc_splits, embd)
        print("Documents upserted to Pinecone successfully.")
    else:
        print("Pinecone index already populated.")

    print("Creating LangChain vectorstore...")
    vectorstore = LangchainPinecone(pinecone_index, embd.embed_query, "text")
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    print("Retriever set up successfully.")
except Exception as e:
    print(f"Error with Pinecone operations: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
print("Index setup completed successfully.")