DrishtiSharma committed on
Commit
0b3783b
·
verified ·
1 Parent(s): f1054fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -28
app.py CHANGED
@@ -65,13 +65,11 @@ def load_docs(document_path):
65
  text_splitter = NLTKTextSplitter(chunk_size=1000)
66
  split_docs = text_splitter.split_documents(documents)
67
 
68
- # Filter metadata to only include str, int, float, or bool
69
- for doc in split_docs:
70
- if hasattr(doc, "metadata") and isinstance(doc.metadata, dict):
71
- doc.metadata = {
72
- k: v for k, v in doc.metadata.items()
73
- if isinstance(v, (str, int, float, bool))
74
- }
75
  return split_docs
76
  except Exception as e:
77
  st.error(f"Failed to load and process PDF: {e}")
@@ -86,32 +84,68 @@ def already_indexed(vectordb, file_name):
86
  def load_chain(file_name=None):
87
  loaded_patent = st.session_state.get("LOADED_PATENT")
88
 
 
 
89
  vectordb = Chroma(
90
  persist_directory=PERSISTED_DIRECTORY,
91
  embedding_function=HuggingFaceEmbeddings(),
92
  )
 
 
93
  if loaded_patent == file_name or already_indexed(vectordb, file_name):
94
  st.write("✅ Already indexed.")
95
  else:
 
 
 
96
  vectordb.delete_collection()
97
  docs = load_docs(file_name)
98
- st.write("🔍 Number of Documents: ", len(docs))
99
 
 
 
 
 
 
 
100
  vectordb = Chroma.from_documents(
101
  docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
102
  )
103
  vectordb.persist()
 
 
 
104
  st.session_state["LOADED_PATENT"] = file_name
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  memory = ConversationBufferMemory(
107
  memory_key="chat_history",
108
  return_messages=True,
109
  input_key="question",
110
  output_key="answer",
111
  )
 
112
  return ConversationalRetrievalChain.from_llm(
113
  OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
114
- vectordb.as_retriever(search_kwargs={"k": 3}),
115
  return_source_documents=False,
116
  memory=memory,
117
  )
@@ -160,14 +194,9 @@ if __name__ == "__main__":
160
  )
161
 
162
  # Initialize session state
163
- if "LOADED_PATENT" not in st.session_state:
164
- st.session_state.LOADED_PATENT = None
165
- if "pdf_preview" not in st.session_state:
166
- st.session_state.pdf_preview = None
167
- if "loaded_pdf_path" not in st.session_state:
168
- st.session_state.loaded_pdf_path = None
169
- if "chain" not in st.session_state:
170
- st.session_state.chain = None
171
 
172
  # Button to load and process patent
173
  if st.button("Load and Process Patent"):
@@ -187,8 +216,12 @@ if __name__ == "__main__":
187
  pdf_path = os.path.join(tempfile.gettempdir(), f"{patent_number}.pdf")
188
  if not os.path.isfile(pdf_path):
189
  st.write("📥 Downloading patent file...")
190
- pdf_path = download_pdf(patent_number)
191
- st.write(f"✅ File downloaded: {pdf_path}")
 
 
 
 
192
  else:
193
  st.write("✅ File already downloaded.")
194
 
@@ -204,20 +237,22 @@ if __name__ == "__main__":
204
 
205
  # Load the document into the system
206
  st.write("🔄 Loading document into the system...")
207
- st.session_state.chain = load_chain(pdf_path)
208
- st.session_state.LOADED_PATENT = patent_number
209
- st.session_state.loaded_pdf_path = pdf_path
210
-
211
- # Initialize messages AFTER processing
212
- st.session_state.messages = [{"role": "assistant", "content": "Hello! How can I assist you with this patent?"}]
213
- st.success("🚀 Document successfully loaded! You can now start asking questions.")
 
 
214
 
215
  # Display the PDF preview if available
216
  if st.session_state.pdf_preview:
217
  st.image(st.session_state.pdf_preview, caption="First Page Preview", use_container_width=True)
218
 
219
  # Display previous chat messages
220
- if "messages" in st.session_state:
221
  for message in st.session_state.messages:
222
  with st.chat_message(message["role"]):
223
  st.markdown(message["content"])
@@ -237,8 +272,9 @@ if __name__ == "__main__":
237
 
238
  with st.spinner("Generating response..."):
239
  try:
 
240
  assistant_response = st.session_state.chain({"question": user_input})
241
- full_response = assistant_response.get("answer", "I couldn't process that question.")
242
  except Exception as e:
243
  full_response = f"An error occurred: {e}"
244
 
 
65
  text_splitter = NLTKTextSplitter(chunk_size=1000)
66
  split_docs = text_splitter.split_documents(documents)
67
 
68
+ # Debug: Check text chunking
69
+ st.write(f"🔍 Loaded Documents: {len(split_docs)}")
70
+ for i, doc in enumerate(split_docs[:5]): # Show first 5 chunks
71
+ st.write(f"Chunk {i + 1}: {doc.page_content[:200]}...")
72
+
 
 
73
  return split_docs
74
  except Exception as e:
75
  st.error(f"Failed to load and process PDF: {e}")
 
84
  def load_chain(file_name=None):
85
  loaded_patent = st.session_state.get("LOADED_PATENT")
86
 
87
+ # Debug: Check PERSISTED_DIRECTORY
88
+ st.write(f"Using Persisted Directory: {PERSISTED_DIRECTORY}")
89
  vectordb = Chroma(
90
  persist_directory=PERSISTED_DIRECTORY,
91
  embedding_function=HuggingFaceEmbeddings(),
92
  )
93
+
94
+ # Debug: Confirm already indexed
95
  if loaded_patent == file_name or already_indexed(vectordb, file_name):
96
  st.write("✅ Already indexed.")
97
  else:
98
+ st.write("🔄 Starting document processing and vectorstore update...")
99
+
100
+ # Remove existing collection and load new docs
101
  vectordb.delete_collection()
102
  docs = load_docs(file_name)
 
103
 
104
+ # Debug: Verify text chunking
105
+ st.write(f"🔍 Number of Documents Loaded: {len(docs)}")
106
+ for i, doc in enumerate(docs[:5]): # Show first 5 chunks for debugging
107
+ st.write(f"Chunk {i + 1}: {doc.page_content[:200]}...")
108
+
109
+ # Update vectorstore
110
  vectordb = Chroma.from_documents(
111
  docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
112
  )
113
  vectordb.persist()
114
+ st.write("✅ Vectorstore successfully updated and persisted.")
115
+
116
+ # Save loaded patent in session state
117
  st.session_state["LOADED_PATENT"] = file_name
118
 
119
+ # Debug: Check vectorstore indexing
120
+ indexed_docs = vectordb.get(include=["documents"])
121
+ st.write(f"✅ Indexed Documents in Vectorstore: {len(indexed_docs['documents'])}")
122
+ for i, doc in enumerate(indexed_docs["documents"][:3]): # Show first 3 indexed docs
123
+ st.write(f"Indexed Doc {i + 1}: {doc[:200]}...")
124
+
125
+ # Test retrieval with a sample query
126
+ retriever = vectordb.as_retriever(search_kwargs={"k": 3})
127
+ test_query = "What is this document about?"
128
+ results = retriever.get_relevant_documents(test_query)
129
+
130
+ # Debug: Verify document retrieval
131
+ st.write("🔍 Test Retrieval Results for Query:")
132
+ if results:
133
+ for i, res in enumerate(results):
134
+ st.write(f"Retrieved Doc {i + 1}: {res.page_content[:200]}...")
135
+ else:
136
+ st.warning("No documents retrieved for test query.")
137
+
138
+ # Configure memory for conversation
139
  memory = ConversationBufferMemory(
140
  memory_key="chat_history",
141
  return_messages=True,
142
  input_key="question",
143
  output_key="answer",
144
  )
145
+
146
  return ConversationalRetrievalChain.from_llm(
147
  OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
148
+ retriever,
149
  return_source_documents=False,
150
  memory=memory,
151
  )
 
194
  )
195
 
196
  # Initialize session state
197
+ for key in ["LOADED_PATENT", "pdf_preview", "loaded_pdf_path", "chain", "messages"]:
198
+ if key not in st.session_state:
199
+ st.session_state[key] = None
 
 
 
 
 
200
 
201
  # Button to load and process patent
202
  if st.button("Load and Process Patent"):
 
216
  pdf_path = os.path.join(tempfile.gettempdir(), f"{patent_number}.pdf")
217
  if not os.path.isfile(pdf_path):
218
  st.write("📥 Downloading patent file...")
219
+ try:
220
+ pdf_path = download_pdf(patent_number)
221
+ st.write(f"✅ File downloaded: {pdf_path}")
222
+ except Exception as e:
223
+ st.error(f"Failed to download patent: {e}")
224
+ st.stop()
225
  else:
226
  st.write("✅ File already downloaded.")
227
 
 
237
 
238
  # Load the document into the system
239
  st.write("🔄 Loading document into the system...")
240
+ try:
241
+ st.session_state.chain = load_chain(pdf_path)
242
+ st.session_state.LOADED_PATENT = patent_number
243
+ st.session_state.loaded_pdf_path = pdf_path
244
+ st.session_state.messages = [{"role": "assistant", "content": "Hello! How can I assist you with this patent?"}]
245
+ st.success("🚀 Document successfully loaded! You can now start asking questions.")
246
+ except Exception as e:
247
+ st.error(f"Failed to load the document: {e}")
248
+ st.stop()
249
 
250
  # Display the PDF preview if available
251
  if st.session_state.pdf_preview:
252
  st.image(st.session_state.pdf_preview, caption="First Page Preview", use_container_width=True)
253
 
254
  # Display previous chat messages
255
+ if st.session_state.messages:
256
  for message in st.session_state.messages:
257
  with st.chat_message(message["role"]):
258
  st.markdown(message["content"])
 
272
 
273
  with st.spinner("Generating response..."):
274
  try:
275
+ # Generate response using the chain
276
  assistant_response = st.session_state.chain({"question": user_input})
277
+ full_response = assistant_response.get("answer", "I'm sorry, I couldn't process that question.")
278
  except Exception as e:
279
  full_response = f"An error occurred: {e}"
280