Rohil Bansal committed on
Commit
e5068f9
·
1 Parent(s): a531f4b

Shifted to Pinecone

Browse files
src/__pycache__/graph.cpython-312.pyc CHANGED
Binary files a/src/__pycache__/graph.cpython-312.pyc and b/src/__pycache__/graph.cpython-312.pyc differ
 
src/__pycache__/index.cpython-312.pyc CHANGED
Binary files a/src/__pycache__/index.cpython-312.pyc and b/src/__pycache__/index.cpython-312.pyc differ
 
src/graph.py CHANGED
@@ -4,6 +4,7 @@ from src.websearch import *
4
  from src.llm import *
5
  from langchain.schema import Document, AIMessage, HumanMessage, SystemMessage
6
  from typing import Annotated
 
7
 
8
  from typing_extensions import TypedDict
9
 
@@ -75,7 +76,7 @@ def retrieve(state):
75
  chat_context = [serialize_messages(chat_context) for chat_context in chat_context]
76
  chat_context = "\n".join([d.content if hasattr(d, 'content') else d["content"] for d in chat_context])
77
 
78
- documents = retriever.invoke("Question: " + question + " Chat Context: " + chat_context)
79
  print("---RETRIEVED---")
80
  return {"documents": documents}
81
 
 
4
  from src.llm import *
5
  from langchain.schema import Document, AIMessage, HumanMessage, SystemMessage
6
  from typing import Annotated
7
+ from langchain_community.vectorstores import Pinecone as LangchainPinecone
8
 
9
  from typing_extensions import TypedDict
10
 
 
76
  chat_context = [serialize_messages(chat_context) for chat_context in chat_context]
77
  chat_context = "\n".join([d.content if hasattr(d, 'content') else d["content"] for d in chat_context])
78
 
79
+ documents = retriever.get_relevant_documents("Question: " + question + " Chat Context: " + chat_context)
80
  print("---RETRIEVED---")
81
  return {"documents": documents}
82
 
src/index.py CHANGED
@@ -4,11 +4,12 @@ import os
4
  from dotenv import load_dotenv
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain_community.document_loaders import PyPDFLoader
7
- from langchain_community.vectorstores import Chroma
8
  from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
9
  import time
10
  from tenacity import retry, stop_after_attempt, wait_exponential
11
- from tqdm import tqdm # Add this import for progress bar
 
 
12
 
13
  # Load environment variables
14
  load_dotenv()
@@ -25,6 +26,11 @@ try:
25
  api_key = os.getenv("API_KEY")
26
  api_version = os.getenv("API_VERSION")
27
 
 
 
 
 
 
28
  print("Environment variables loaded successfully.")
29
  except Exception as e:
30
  print(f"Error loading environment variables: {e}")
@@ -49,9 +55,12 @@ except Exception as e:
49
  print(f"Error setting up Azure OpenAI: {e}")
50
  sys.exit(1)
51
 
52
- # Function to check if vector store exists
53
- def vector_store_exists(persist_directory):
54
- return os.path.exists(persist_directory) and len(os.listdir(persist_directory)) > 0
 
 
 
55
 
56
  # Load and process documents
57
  try:
@@ -61,7 +70,7 @@ try:
61
 
62
  print("Splitting documents...")
63
  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
64
- chunk_size=300, chunk_overlap=100
65
  )
66
  doc_splits = text_splitter.split_documents(docs)
67
  print(f"Documents split into {len(doc_splits)} chunks.")
@@ -70,42 +79,58 @@ except Exception as e:
70
  sys.exit(1)
71
 
72
  @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
73
- def create_vector_store_batch(persist_directory, documents, embedding, batch_size=50):
74
- vectorstore = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  for i in tqdm(range(0, len(documents), batch_size), desc="Processing batches"):
76
  batch = documents[i:i+batch_size]
77
- if vectorstore is None:
78
- vectorstore = Chroma.from_documents(
79
- documents=batch,
80
- collection_name="rag-chroma",
81
- embedding=embedding,
82
- persist_directory=persist_directory
83
- )
84
- else:
85
- vectorstore.add_documents(batch)
86
  time.sleep(1) # Add a small delay between batches
87
- return vectorstore
88
 
89
- # Create or load vector store
90
  try:
91
- persist_directory = './vectordb'
92
- if not vector_store_exists(persist_directory):
93
- print("Creating new vector store...")
94
- vectorstore = create_vector_store_batch(persist_directory, doc_splits, embd)
95
- print("New vector store created and populated.")
 
 
 
 
 
 
 
96
  else:
97
- print("Loading existing vector store...")
98
- vectorstore = Chroma(
99
- persist_directory=persist_directory,
100
- embedding_function=embd,
101
- collection_name="rag-chroma"
102
- )
103
- print("Existing vector store loaded.")
104
 
105
- retriever = vectorstore.as_retriever(search_kwargs={"k": 10, "score_threshold": 0.6}, search_type="similarity_score_threshold")
 
 
106
  print("Retriever set up successfully.")
107
  except Exception as e:
108
- print(f"Error with vector store operations: {e}")
 
 
109
  sys.exit(1)
110
 
111
  print("Index setup completed successfully.")
 
4
  from dotenv import load_dotenv
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain_community.document_loaders import PyPDFLoader
 
7
  from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
8
  import time
9
  from tenacity import retry, stop_after_attempt, wait_exponential
10
+ from tqdm import tqdm
11
+ from pinecone import Pinecone, ServerlessSpec
12
+ from langchain_community.vectorstores import Pinecone as LangchainPinecone
13
 
14
  # Load environment variables
15
  load_dotenv()
 
26
  api_key = os.getenv("API_KEY")
27
  api_version = os.getenv("API_VERSION")
28
 
29
+ # Pinecone environment variables
30
+ PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
31
+ PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
32
+ PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
33
+
34
  print("Environment variables loaded successfully.")
35
  except Exception as e:
36
  print(f"Error loading environment variables: {e}")
 
55
  print(f"Error setting up Azure OpenAI: {e}")
56
  sys.exit(1)
57
 
58
+ # Initialize Pinecone
59
+ pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
60
+
61
+ # Function to check if Pinecone index exists
62
+ def pinecone_index_exists(index_name):
63
+ return index_name in pc.list_indexes().names()
64
 
65
  # Load and process documents
66
  try:
 
70
 
71
  print("Splitting documents...")
72
  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
73
+ chunk_size=400, chunk_overlap=100
74
  )
75
  doc_splits = text_splitter.split_documents(docs)
76
  print(f"Documents split into {len(doc_splits)} chunks.")
 
79
  sys.exit(1)
80
 
81
  @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
82
+ def create_pinecone_index(index_name, dimension, spec):
83
+ try:
84
+ if not pinecone_index_exists(index_name):
85
+ print(f"Creating new Pinecone index: {index_name}")
86
+ pc.create_index(
87
+ name=index_name,
88
+ dimension=dimension,
89
+ metric='cosine',
90
+ spec=spec
91
+ )
92
+ print(f"Connecting to Pinecone index: {index_name}")
93
+ return pc.Index(index_name)
94
+ except Exception as e:
95
+ print(f"Error creating/connecting to Pinecone index: {e}")
96
+ raise
97
+
98
+ @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
99
+ def upsert_to_pinecone(index, documents, embedding, batch_size=50):
100
  for i in tqdm(range(0, len(documents), batch_size), desc="Processing batches"):
101
  batch = documents[i:i+batch_size]
102
+ ids = [str(j) for j in range(i, min(i+batch_size, len(documents)))]
103
+ embeds = embedding.embed_documents([doc.page_content for doc in batch])
104
+ metadata = [{"text": doc.page_content} for doc in batch]
105
+ to_upsert = list(zip(ids, embeds, metadata))
106
+ index.upsert(vectors=to_upsert)
 
 
 
 
107
  time.sleep(1) # Add a small delay between batches
 
108
 
109
+ # Create or load Pinecone index
110
  try:
111
+ print("Setting up Pinecone index...")
112
+ dimension = 1536 # Dimension for Azure OpenAI embeddings
113
+ pinecone_index = create_pinecone_index(PINECONE_INDEX_NAME, dimension, spec=ServerlessSpec(cloud='aws', region='us-east-1'))
114
+
115
+ print("Checking index statistics...")
116
+ index_stats = pinecone_index.describe_index_stats()
117
+ print(f"Index stats: {index_stats}")
118
+
119
+ if index_stats['total_vector_count'] == 0:
120
+ print("Upserting documents to Pinecone...")
121
+ upsert_to_pinecone(pinecone_index, doc_splits, embd)
122
+ print("Documents upserted to Pinecone successfully.")
123
  else:
124
+ print("Pinecone index already populated.")
 
 
 
 
 
 
125
 
126
+ print("Creating LangChain vectorstore...")
127
+ vectorstore = LangchainPinecone(pinecone_index, embd.embed_query, "text")
128
+ retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
129
  print("Retriever set up successfully.")
130
  except Exception as e:
131
+ print(f"Error with Pinecone operations: {e}")
132
+ import traceback
133
+ traceback.print_exc()
134
  sys.exit(1)
135
 
136
  print("Index setup completed successfully.")