# Hugging Face Spaces page header captured by the scrape ("Spaces: Sleeping") —
# not part of the program; kept here as a comment so the file parses.
#%%
import os
import sys
import time

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Pinecone as LangchainPinecone
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm import tqdm
# Load environment variables from a local .env file (no-op if the file is absent).
load_dotenv()

# Read configuration from the environment.
# NOTE: os.getenv() never raises for a missing key -- it returns None -- so the
# original try/except could not detect absent variables; instead a missing
# LANGCHAIN_API_KEY crashed later with an opaque TypeError when assigned into
# os.environ (whose values must be strings). Validate explicitly and exit with
# a clear message.
tavily_api_key = os.getenv("TAVILY_API_KEY")
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
azure_endpoint = os.getenv("API_BASE")
api_key = os.getenv("API_KEY")
api_version = os.getenv("API_VERSION")

# Pinecone environment variables
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")

_required = {
    "LANGCHAIN_API_KEY": langchain_api_key,
    "API_BASE": azure_endpoint,
    "API_KEY": api_key,
    "API_VERSION": api_version,
    "PINECONE_API_KEY": PINECONE_API_KEY,
    "PINECONE_INDEX_NAME": PINECONE_INDEX_NAME,
}
_missing = [name for name, value in _required.items() if not value]
if _missing:
    print(f"Error loading environment variables: missing {', '.join(_missing)}")
    sys.exit(1)

# Enable LangSmith tracing for this project.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
os.environ["LANGCHAIN_PROJECT"] = "legalairag"
print("Environment variables loaded successfully.")
# --- Azure OpenAI clients -------------------------------------------------
# Build the embedding client and the chat model; abort the script if either
# cannot be constructed (e.g. bad credentials or endpoint).
try:
    embd = AzureOpenAIEmbeddings(
        azure_endpoint=azure_endpoint,
        api_key=api_key,
        api_version=api_version,
    )
    llm = AzureChatOpenAI(
        azure_endpoint=azure_endpoint,
        api_key=api_key,
        api_version=api_version,
        azure_deployment="gpt-4o",
        temperature=0.3,
    )
    print("Azure OpenAI embeddings and model set up successfully.")
except Exception as e:
    print(f"Error setting up Azure OpenAI: {e}")
    sys.exit(1)
# Initialize the Pinecone client.
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)


def pinecone_index_exists(index_name):
    """Return True if an index named *index_name* already exists in Pinecone."""
    existing_names = pc.list_indexes().names()
    return index_name in existing_names
# --- Document ingestion ---------------------------------------------------
# Load the source PDF and split it into overlapping, token-sized chunks.
try:
    print("Loading PDF document...")
    loader = PyPDFLoader("assets/data/IPC_and_Constitution.pdf")
    docs = loader.load()
    print("PDF loaded successfully.")

    print("Splitting documents...")
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=400, chunk_overlap=100
    )
    doc_splits = splitter.split_documents(docs)
    print(f"Documents split into {len(doc_splits)} chunks.")
except Exception as e:
    print(f"Error processing documents: {e}")
    sys.exit(1)
def create_pinecone_index(index_name, dimension, spec):
    """Create the Pinecone index if needed and return a handle to it.

    Args:
        index_name: Name of the index to create or connect to.
        dimension: Embedding vector dimension, used only on creation.
        spec: Pinecone ServerlessSpec/PodSpec, used only on creation.

    Returns:
        A Pinecone Index handle for *index_name*.

    Raises:
        Re-raises any Pinecone client error after logging it.
    """
    try:
        if not pinecone_index_exists(index_name):
            print(f"Creating new Pinecone index: {index_name}")
            pc.create_index(
                name=index_name,
                dimension=dimension,
                metric='cosine',
                spec=spec
            )
            # Index creation is asynchronous: poll until Pinecone reports the
            # index ready, so the immediate describe/upsert calls don't fail.
            while not pc.describe_index(index_name).status['ready']:
                time.sleep(1)
        print(f"Connecting to Pinecone index: {index_name}")
        return pc.Index(index_name)
    except Exception as e:
        print(f"Error creating/connecting to Pinecone index: {e}")
        raise
def upsert_to_pinecone(index, documents, embedding, batch_size=50):
    """Embed *documents* and upsert them into *index* in batches.

    Args:
        index: Pinecone Index handle accepting index.upsert(vectors=...).
        documents: Sequence of LangChain Documents (only .page_content is used).
        embedding: Embedding client exposing embed_documents(list[str]).
        batch_size: Number of documents embedded/upserted per request.

    Vector ids are the documents' positional indices (as strings), so
    re-running the ingestion overwrites rather than duplicates vectors.
    """
    total = len(documents)
    for start in tqdm(range(0, total, batch_size), desc="Processing batches"):
        batch = documents[start:start + batch_size]
        ids = [str(j) for j in range(start, min(start + batch_size, total))]
        embeds = embedding.embed_documents([doc.page_content for doc in batch])
        metadata = [{"text": doc.page_content} for doc in batch]
        index.upsert(vectors=list(zip(ids, embeds, metadata)))
        # Throttle between batches only -- the original also slept after the
        # final batch, delaying the script by one pointless extra second.
        if start + batch_size < total:
            time.sleep(1)
# --- Pinecone index setup and retriever -----------------------------------
try:
    print("Setting up Pinecone index...")
    dimension = 1536  # Dimension for Azure OpenAI embeddings
    pinecone_index = create_pinecone_index(
        PINECONE_INDEX_NAME,
        dimension,
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

    print("Checking index statistics...")
    index_stats = pinecone_index.describe_index_stats()
    print(f"Index stats: {index_stats}")

    # Ingest only when the index is empty; otherwise reuse existing vectors.
    if index_stats['total_vector_count'] == 0:
        print("Upserting documents to Pinecone...")
        upsert_to_pinecone(pinecone_index, doc_splits, embd)
        print("Documents upserted to Pinecone successfully.")
    else:
        print("Pinecone index already populated.")

    print("Creating LangChain vectorstore...")
    vectorstore = LangchainPinecone(pinecone_index, embd.embed_query, "text")
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    print("Retriever set up successfully.")
except Exception as e:
    print(f"Error with Pinecone operations: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
print("Index setup completed successfully.")