Spaces:
Sleeping
Sleeping
Rohil Bansal
commited on
Commit
·
e5068f9
1
Parent(s):
a531f4b
Shifted to Pinecone
Browse files- src/__pycache__/graph.cpython-312.pyc +0 -0
- src/__pycache__/index.cpython-312.pyc +0 -0
- src/graph.py +2 -1
- src/index.py +58 -33
src/__pycache__/graph.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/graph.cpython-312.pyc and b/src/__pycache__/graph.cpython-312.pyc differ
|
|
src/__pycache__/index.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/index.cpython-312.pyc and b/src/__pycache__/index.cpython-312.pyc differ
|
|
src/graph.py
CHANGED
@@ -4,6 +4,7 @@ from src.websearch import *
|
|
4 |
from src.llm import *
|
5 |
from langchain.schema import Document, AIMessage, HumanMessage, SystemMessage
|
6 |
from typing import Annotated
|
|
|
7 |
|
8 |
from typing_extensions import TypedDict
|
9 |
|
@@ -75,7 +76,7 @@ def retrieve(state):
|
|
75 |
chat_context = [serialize_messages(chat_context) for chat_context in chat_context]
|
76 |
chat_context = "\n".join([d.content if hasattr(d, 'content') else d["content"] for d in chat_context])
|
77 |
|
78 |
-
documents = retriever.
|
79 |
print("---RETRIEVED---")
|
80 |
return {"documents": documents}
|
81 |
|
|
|
4 |
from src.llm import *
|
5 |
from langchain.schema import Document, AIMessage, HumanMessage, SystemMessage
|
6 |
from typing import Annotated
|
7 |
+
from langchain_community.vectorstores import Pinecone as LangchainPinecone
|
8 |
|
9 |
from typing_extensions import TypedDict
|
10 |
|
|
|
76 |
chat_context = [serialize_messages(chat_context) for chat_context in chat_context]
|
77 |
chat_context = "\n".join([d.content if hasattr(d, 'content') else d["content"] for d in chat_context])
|
78 |
|
79 |
+
documents = retriever.get_relevant_documents("Question: " + question + " Chat Context: " + chat_context)
|
80 |
print("---RETRIEVED---")
|
81 |
return {"documents": documents}
|
82 |
|
src/index.py
CHANGED
@@ -4,11 +4,12 @@ import os
|
|
4 |
from dotenv import load_dotenv
|
5 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
6 |
from langchain_community.document_loaders import PyPDFLoader
|
7 |
-
from langchain_community.vectorstores import Chroma
|
8 |
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
|
9 |
import time
|
10 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
11 |
-
from tqdm import tqdm
|
|
|
|
|
12 |
|
13 |
# Load environment variables
|
14 |
load_dotenv()
|
@@ -25,6 +26,11 @@ try:
|
|
25 |
api_key = os.getenv("API_KEY")
|
26 |
api_version = os.getenv("API_VERSION")
|
27 |
|
|
|
|
|
|
|
|
|
|
|
28 |
print("Environment variables loaded successfully.")
|
29 |
except Exception as e:
|
30 |
print(f"Error loading environment variables: {e}")
|
@@ -49,9 +55,12 @@ except Exception as e:
|
|
49 |
print(f"Error setting up Azure OpenAI: {e}")
|
50 |
sys.exit(1)
|
51 |
|
52 |
-
#
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
55 |
|
56 |
# Load and process documents
|
57 |
try:
|
@@ -61,7 +70,7 @@ try:
|
|
61 |
|
62 |
print("Splitting documents...")
|
63 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
64 |
-
chunk_size=
|
65 |
)
|
66 |
doc_splits = text_splitter.split_documents(docs)
|
67 |
print(f"Documents split into {len(doc_splits)} chunks.")
|
@@ -70,42 +79,58 @@ except Exception as e:
|
|
70 |
sys.exit(1)
|
71 |
|
72 |
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
|
73 |
-
def
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
for i in tqdm(range(0, len(documents), batch_size), desc="Processing batches"):
|
76 |
batch = documents[i:i+batch_size]
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
persist_directory=persist_directory
|
83 |
-
)
|
84 |
-
else:
|
85 |
-
vectorstore.add_documents(batch)
|
86 |
time.sleep(1) # Add a small delay between batches
|
87 |
-
return vectorstore
|
88 |
|
89 |
-
# Create or load
|
90 |
try:
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
else:
|
97 |
-
print("
|
98 |
-
vectorstore = Chroma(
|
99 |
-
persist_directory=persist_directory,
|
100 |
-
embedding_function=embd,
|
101 |
-
collection_name="rag-chroma"
|
102 |
-
)
|
103 |
-
print("Existing vector store loaded.")
|
104 |
|
105 |
-
|
|
|
|
|
106 |
print("Retriever set up successfully.")
|
107 |
except Exception as e:
|
108 |
-
print(f"Error with
|
|
|
|
|
109 |
sys.exit(1)
|
110 |
|
111 |
print("Index setup completed successfully.")
|
|
|
4 |
from dotenv import load_dotenv
|
5 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
6 |
from langchain_community.document_loaders import PyPDFLoader
|
|
|
7 |
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
|
8 |
import time
|
9 |
from tenacity import retry, stop_after_attempt, wait_exponential
|
10 |
+
from tqdm import tqdm
|
11 |
+
from pinecone import Pinecone, ServerlessSpec
|
12 |
+
from langchain_community.vectorstores import Pinecone as LangchainPinecone
|
13 |
|
14 |
# Load environment variables
|
15 |
load_dotenv()
|
|
|
26 |
api_key = os.getenv("API_KEY")
|
27 |
api_version = os.getenv("API_VERSION")
|
28 |
|
29 |
+
# Pinecone environment variables
|
30 |
+
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
|
31 |
+
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
|
32 |
+
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
|
33 |
+
|
34 |
print("Environment variables loaded successfully.")
|
35 |
except Exception as e:
|
36 |
print(f"Error loading environment variables: {e}")
|
|
|
55 |
print(f"Error setting up Azure OpenAI: {e}")
|
56 |
sys.exit(1)
|
57 |
|
58 |
+
# Initialize Pinecone
|
59 |
+
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
|
60 |
+
|
61 |
+
# Function to check if Pinecone index exists
|
62 |
+
def pinecone_index_exists(index_name):
|
63 |
+
return index_name in pc.list_indexes().names()
|
64 |
|
65 |
# Load and process documents
|
66 |
try:
|
|
|
70 |
|
71 |
print("Splitting documents...")
|
72 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
73 |
+
chunk_size=400, chunk_overlap=100
|
74 |
)
|
75 |
doc_splits = text_splitter.split_documents(docs)
|
76 |
print(f"Documents split into {len(doc_splits)} chunks.")
|
|
|
79 |
sys.exit(1)
|
80 |
|
81 |
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
|
82 |
+
def create_pinecone_index(index_name, dimension, spec):
|
83 |
+
try:
|
84 |
+
if not pinecone_index_exists(index_name):
|
85 |
+
print(f"Creating new Pinecone index: {index_name}")
|
86 |
+
pc.create_index(
|
87 |
+
name=index_name,
|
88 |
+
dimension=dimension,
|
89 |
+
metric='cosine',
|
90 |
+
spec=spec
|
91 |
+
)
|
92 |
+
print(f"Connecting to Pinecone index: {index_name}")
|
93 |
+
return pc.Index(index_name)
|
94 |
+
except Exception as e:
|
95 |
+
print(f"Error creating/connecting to Pinecone index: {e}")
|
96 |
+
raise
|
97 |
+
|
98 |
+
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
|
99 |
+
def upsert_to_pinecone(index, documents, embedding, batch_size=50):
|
100 |
for i in tqdm(range(0, len(documents), batch_size), desc="Processing batches"):
|
101 |
batch = documents[i:i+batch_size]
|
102 |
+
ids = [str(j) for j in range(i, min(i+batch_size, len(documents)))]
|
103 |
+
embeds = embedding.embed_documents([doc.page_content for doc in batch])
|
104 |
+
metadata = [{"text": doc.page_content} for doc in batch]
|
105 |
+
to_upsert = list(zip(ids, embeds, metadata))
|
106 |
+
index.upsert(vectors=to_upsert)
|
|
|
|
|
|
|
|
|
107 |
time.sleep(1) # Add a small delay between batches
|
|
|
108 |
|
109 |
+
# Create or load Pinecone index
|
110 |
try:
|
111 |
+
print("Setting up Pinecone index...")
|
112 |
+
dimension = 1536 # Dimension for Azure OpenAI embeddings
|
113 |
+
pinecone_index = create_pinecone_index(PINECONE_INDEX_NAME, dimension, spec=ServerlessSpec(cloud='aws', region='us-east-1'))
|
114 |
+
|
115 |
+
print("Checking index statistics...")
|
116 |
+
index_stats = pinecone_index.describe_index_stats()
|
117 |
+
print(f"Index stats: {index_stats}")
|
118 |
+
|
119 |
+
if index_stats['total_vector_count'] == 0:
|
120 |
+
print("Upserting documents to Pinecone...")
|
121 |
+
upsert_to_pinecone(pinecone_index, doc_splits, embd)
|
122 |
+
print("Documents upserted to Pinecone successfully.")
|
123 |
else:
|
124 |
+
print("Pinecone index already populated.")
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
+
print("Creating LangChain vectorstore...")
|
127 |
+
vectorstore = LangchainPinecone(pinecone_index, embd.embed_query, "text")
|
128 |
+
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
|
129 |
print("Retriever set up successfully.")
|
130 |
except Exception as e:
|
131 |
+
print(f"Error with Pinecone operations: {e}")
|
132 |
+
import traceback
|
133 |
+
traceback.print_exc()
|
134 |
sys.exit(1)
|
135 |
|
136 |
print("Index setup completed successfully.")
|