DrishtiSharma committed on
Commit
08b924e
·
verified ·
1 Parent(s): 0c77c36

Update interim/app.py

Browse files
Files changed (1) hide show
  1. interim/app.py +70 -57
interim/app.py CHANGED
@@ -56,6 +56,9 @@ def check_poppler_installed():
56
  check_poppler_installed()
57
 
58
  def load_docs(document_path):
 
 
 
59
  try:
60
  import fitz # PyMuPDF for text extraction
61
 
@@ -71,11 +74,11 @@ def load_docs(document_path):
71
 
72
  doc.close()
73
 
74
- # Step 2: Combine cleaned text
75
  full_text = "\n".join(extracted_text)
76
  st.write(f"📄 Total Cleaned Text Length: {len(full_text)} characters")
77
 
78
- # Step 3: Chunk the cleaned text
79
  text_splitter = RecursiveCharacterTextSplitter(
80
  chunk_size=1000,
81
  chunk_overlap=100,
@@ -83,9 +86,9 @@ def load_docs(document_path):
83
  )
84
  split_docs = text_splitter.create_documents([full_text])
85
 
86
- # Debug: Show filtered chunks
87
  st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
88
- for i, doc in enumerate(split_docs[:5]): # Show first 5 chunks
89
  st.write(f"Chunk {i + 1}: {doc.page_content[:300]}...")
90
 
91
  return split_docs
@@ -126,30 +129,28 @@ def already_indexed(vectordb, file_name):
126
  return file_name in indexed_sources
127
 
128
  def load_chain(file_name=None):
 
 
 
129
  loaded_patent = st.session_state.get("LOADED_PATENT")
130
 
131
- # Debug: Check PERSISTED_DIRECTORY
132
- st.write(f"Using Persisted Directory: {PERSISTED_DIRECTORY}")
 
133
  vectordb = Chroma(
134
  persist_directory=PERSISTED_DIRECTORY,
135
  embedding_function=HuggingFaceEmbeddings(),
136
  )
137
 
138
- # Debug: Confirm already indexed
139
  if loaded_patent == file_name or already_indexed(vectordb, file_name):
140
  st.write("✅ Already indexed.")
141
  else:
142
  st.write("🔄 Starting document processing and vectorstore update...")
143
-
144
  # Remove existing collection and load new docs
145
  vectordb.delete_collection()
146
  docs = load_docs(file_name)
147
 
148
- # Debug: Verify text chunking
149
- st.write(f"🔍 Number of Documents Loaded: {len(docs)}")
150
- for i, doc in enumerate(docs[:5]): # Show first 5 chunks for debugging
151
- st.write(f"Chunk {i + 1}: {doc.page_content[:200]}...")
152
-
153
  # Update vectorstore
154
  vectordb = Chroma.from_documents(
155
  docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
@@ -160,18 +161,15 @@ def load_chain(file_name=None):
160
  # Save loaded patent in session state
161
  st.session_state["LOADED_PATENT"] = file_name
162
 
163
- # Debug: Check vectorstore indexing
164
  indexed_docs = vectordb.get(include=["documents"])
165
- st.write(f"✅ Indexed Documents in Vectorstore: {len(indexed_docs['documents'])}")
166
- for i, doc in enumerate(indexed_docs["documents"][:3]): # Show first 3 indexed docs
167
- st.write(f"Indexed Doc {i + 1}: {doc[:200]}...")
168
 
169
- # Test retrieval with a sample query
170
  retriever = vectordb.as_retriever(search_kwargs={"k": 3})
171
  test_query = "What is this document about?"
172
  results = retriever.get_relevant_documents(test_query)
173
 
174
- # Debug: Verify document retrieval
175
  st.write("🔍 Test Retrieval Results for Query:")
176
  if results:
177
  for i, res in enumerate(results):
@@ -182,18 +180,16 @@ def load_chain(file_name=None):
182
  # Configure memory for conversation
183
  memory = ConversationBufferMemory(
184
  memory_key="chat_history",
185
- return_messages=True,
186
- input_key="question",
187
- output_key="answer",
188
  )
189
 
190
  return ConversationalRetrievalChain.from_llm(
191
  OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
192
  retriever,
193
- return_source_documents=False,
194
- memory=memory,
195
  )
196
 
 
197
  def extract_patent_number(url):
198
  pattern = r"/patent/([A-Z]{2}\d+)"
199
  match = re.search(pattern, url)
@@ -208,19 +204,36 @@ def download_pdf(patent_number):
208
  st.error(f"Failed to download patent PDF: {e}")
209
  st.stop()
210
 
211
- def preview_pdf(pdf_path):
212
- """Generate and display the first page of the PDF as an image."""
 
 
 
 
 
 
 
213
  try:
214
- doc = fitz.open(pdf_path) # Open PDF
215
- first_page = doc[0] # Extract the first page
216
- pix = first_page.get_pixmap() # Render page to a Pixmap (image)
 
 
 
 
 
 
217
  temp_image_path = os.path.join(tempfile.gettempdir(), "pdf_preview.png")
218
- pix.save(temp_image_path) # Save the image temporarily
 
 
219
  return temp_image_path
 
220
  except Exception as e:
221
  st.error(f"Error generating PDF preview: {e}")
222
  return None
223
 
 
224
  if __name__ == "__main__":
225
  st.set_page_config(
226
  page_title="Patent Chat: Google Patents Chat Demo",
@@ -234,7 +247,7 @@ if __name__ == "__main__":
234
  patent_link = st.text_area(
235
  "Enter Google Patent Link:",
236
  value="https://patents.google.com/patent/US8676427B1/en",
237
- height=100
238
  )
239
 
240
  # Initialize session state
@@ -259,39 +272,39 @@ if __name__ == "__main__":
259
  # File handling
260
  pdf_path = os.path.join(tempfile.gettempdir(), f"{patent_number}.pdf")
261
  if not os.path.isfile(pdf_path):
262
- st.write("📥 Downloading patent file...")
263
- try:
264
- pdf_path = download_pdf(patent_number)
265
- st.write(f"✅ File downloaded: {pdf_path}")
266
- except Exception as e:
267
- st.error(f"Failed to download patent: {e}")
268
- st.stop()
269
  else:
270
  st.write("✅ File already downloaded.")
271
 
272
  # Generate PDF preview only if not already displayed
273
  if not st.session_state.get("pdf_preview_displayed", False):
274
- st.write("🖼️ Generating PDF preview...")
275
- preview_image_path = preview_pdf(pdf_path)
276
- if preview_image_path:
277
- st.session_state.pdf_preview = preview_image_path
278
- st.image(preview_image_path, caption="First Page Preview", use_container_width=True)
279
- st.session_state["pdf_preview_displayed"] = True
280
- else:
281
- st.warning("Failed to generate PDF preview.")
282
- st.session_state.pdf_preview = None
283
 
284
  # Load the document into the system
285
- st.write("🔄 Loading document into the system...")
286
- try:
287
- st.session_state.chain = load_chain(pdf_path)
288
- st.session_state.LOADED_PATENT = patent_number
289
- st.session_state.loaded_pdf_path = pdf_path
290
- st.session_state.messages = [{"role": "assistant", "content": "Hello! How can I assist you with this patent?"}]
291
- st.success("🚀 Document successfully loaded! You can now start asking questions.")
292
- except Exception as e:
293
- st.error(f"Failed to load the document: {e}")
294
- st.stop()
295
 
296
  # Display previous chat messages
297
  if st.session_state.messages:
 
56
  check_poppler_installed()
57
 
58
  def load_docs(document_path):
59
+ """
60
+ Load and clean the PDF content, then split into chunks.
61
+ """
62
  try:
63
  import fitz # PyMuPDF for text extraction
64
 
 
74
 
75
  doc.close()
76
 
77
+ # Combine all pages into one text
78
  full_text = "\n".join(extracted_text)
79
  st.write(f"📄 Total Cleaned Text Length: {len(full_text)} characters")
80
 
81
+ # Step 2: Chunk the cleaned text
82
  text_splitter = RecursiveCharacterTextSplitter(
83
  chunk_size=1000,
84
  chunk_overlap=100,
 
86
  )
87
  split_docs = text_splitter.create_documents([full_text])
88
 
89
+ # Debug: Show total chunks count and first 3 chunks for verification
90
  st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
91
+ for i, doc in enumerate(split_docs[:3]): # Show first 3 chunks only
92
  st.write(f"Chunk {i + 1}: {doc.page_content[:300]}...")
93
 
94
  return split_docs
 
129
  return file_name in indexed_sources
130
 
131
  def load_chain(file_name=None):
132
+ """
133
+ Index the PDF into the vectorstore (load, clean, and chunk it first unless it is already indexed) and return a conversational retrieval chain over it.
134
+ """
135
  loaded_patent = st.session_state.get("LOADED_PATENT")
136
 
137
+ # Debug: Show persist directory
138
+ st.write(f"🗂 Using Persisted Directory: {PERSISTED_DIRECTORY}")
139
+
140
  vectordb = Chroma(
141
  persist_directory=PERSISTED_DIRECTORY,
142
  embedding_function=HuggingFaceEmbeddings(),
143
  )
144
 
 
145
  if loaded_patent == file_name or already_indexed(vectordb, file_name):
146
  st.write("✅ Already indexed.")
147
  else:
148
  st.write("🔄 Starting document processing and vectorstore update...")
149
+
150
  # Remove existing collection and load new docs
151
  vectordb.delete_collection()
152
  docs = load_docs(file_name)
153
 
 
 
 
 
 
154
  # Update vectorstore
155
  vectordb = Chroma.from_documents(
156
  docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
 
161
  # Save loaded patent in session state
162
  st.session_state["LOADED_PATENT"] = file_name
163
 
164
+ # Debug: Check vectorstore indexing summary
165
  indexed_docs = vectordb.get(include=["documents"])
166
+ st.write(f"✅ Total Indexed Documents: {len(indexed_docs['documents'])}")
 
 
167
 
168
+ # Test retrieval with a simple query
169
  retriever = vectordb.as_retriever(search_kwargs={"k": 3})
170
  test_query = "What is this document about?"
171
  results = retriever.get_relevant_documents(test_query)
172
 
 
173
  st.write("🔍 Test Retrieval Results for Query:")
174
  if results:
175
  for i, res in enumerate(results):
 
180
  # Configure memory for conversation
181
  memory = ConversationBufferMemory(
182
  memory_key="chat_history",
183
+ return_messages=True
 
 
184
  )
185
 
186
  return ConversationalRetrievalChain.from_llm(
187
  OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
188
  retriever,
189
+ memory=memory
 
190
  )
191
 
192
+
193
  def extract_patent_number(url):
194
  pattern = r"/patent/([A-Z]{2}\d+)"
195
  match = re.search(pattern, url)
 
204
  st.error(f"Failed to download patent PDF: {e}")
205
  st.stop()
206
 
207
+ def preview_pdf(pdf_path, scale_factor=0.5):
208
+ """
209
+ Generate a resized preview image of the first page of the PDF and save it to a temporary file.
210
+ Args:
211
+ pdf_path (str): Path to the PDF file.
212
+ scale_factor (float): Factor to reduce the image size (default is 0.5).
213
+ Returns:
214
+ str: Path to the resized image preview, or None if the preview could not be generated.
215
+ """
216
  try:
217
+ # Open the PDF and extract the first page
218
+ doc = fitz.open(pdf_path)
219
+ first_page = doc[0]
220
+
221
+ # Apply scaling using a transformation matrix
222
+ matrix = fitz.Matrix(scale_factor, scale_factor) # Scale down the image
223
+ pix = first_page.get_pixmap(matrix=matrix) # Generate scaled image
224
+
225
+ # Save the preview image
226
  temp_image_path = os.path.join(tempfile.gettempdir(), "pdf_preview.png")
227
+ pix.save(temp_image_path)
228
+
229
+ doc.close()
230
  return temp_image_path
231
+
232
  except Exception as e:
233
  st.error(f"Error generating PDF preview: {e}")
234
  return None
235
 
236
+
237
  if __name__ == "__main__":
238
  st.set_page_config(
239
  page_title="Patent Chat: Google Patents Chat Demo",
 
247
  patent_link = st.text_area(
248
  "Enter Google Patent Link:",
249
  value="https://patents.google.com/patent/US8676427B1/en",
250
+ height=90
251
  )
252
 
253
  # Initialize session state
 
272
  # File handling
273
  pdf_path = os.path.join(tempfile.gettempdir(), f"{patent_number}.pdf")
274
  if not os.path.isfile(pdf_path):
275
+ with st.spinner("📥 Downloading patent file..."):
276
+ try:
277
+ pdf_path = download_pdf(patent_number)
278
+ st.write(f"✅ File downloaded: {pdf_path}")
279
+ except Exception as e:
280
+ st.error(f"Failed to download patent: {e}")
281
+ st.stop()
282
  else:
283
  st.write("✅ File already downloaded.")
284
 
285
  # Generate PDF preview only if not already displayed
286
  if not st.session_state.get("pdf_preview_displayed", False):
287
+ with st.spinner("🖼️ Generating PDF preview..."):
288
+ preview_image_path = preview_pdf(pdf_path, scale_factor=0.5)
289
+ if preview_image_path:
290
+ st.session_state.pdf_preview = preview_image_path
291
+ st.image(preview_image_path, caption="First Page Preview", use_container_width=False)
292
+ st.session_state["pdf_preview_displayed"] = True
293
+ else:
294
+ st.warning("Failed to generate PDF preview.")
295
+ st.session_state.pdf_preview = None
296
 
297
  # Load the document into the system
298
+ with st.spinner("🔄 Loading document into the system..."):
299
+ try:
300
+ st.session_state.chain = load_chain(pdf_path)
301
+ st.session_state.LOADED_PATENT = patent_number
302
+ st.session_state.loaded_pdf_path = pdf_path
303
+ st.session_state.messages = [{"role": "assistant", "content": "Hello! How can I assist you with this patent?"}]
304
+ st.success("🚀 Document successfully loaded! You can now start asking questions.")
305
+ except Exception as e:
306
+ st.error(f"Failed to load the document: {e}")
307
+ st.stop()
308
 
309
  # Display previous chat messages
310
  if st.session_state.messages: