DrishtiSharma committed on
Commit
8e22da0
·
verified ·
1 Parent(s): acd48bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -24
app.py CHANGED
@@ -56,6 +56,9 @@ def check_poppler_installed():
56
  check_poppler_installed()
57
 
58
  def load_docs(document_path):
 
 
 
59
  try:
60
  import fitz # PyMuPDF for text extraction
61
 
@@ -71,11 +74,11 @@ def load_docs(document_path):
71
 
72
  doc.close()
73
 
74
- # Step 2: Combine cleaned text
75
  full_text = "\n".join(extracted_text)
76
  st.write(f"📄 Total Cleaned Text Length: {len(full_text)} characters")
77
 
78
- # Step 3: Chunk the cleaned text
79
  text_splitter = RecursiveCharacterTextSplitter(
80
  chunk_size=1000,
81
  chunk_overlap=100,
@@ -83,9 +86,9 @@ def load_docs(document_path):
83
  )
84
  split_docs = text_splitter.create_documents([full_text])
85
 
86
- # Debug: Show filtered chunks
87
  st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
88
- for i, doc in enumerate(split_docs[:5]): # Show first 5 chunks
89
  st.write(f"Chunk {i + 1}: {doc.page_content[:300]}...")
90
 
91
  return split_docs
@@ -126,30 +129,28 @@ def already_indexed(vectordb, file_name):
126
  return file_name in indexed_sources
127
 
128
  def load_chain(file_name=None):
 
 
 
129
  loaded_patent = st.session_state.get("LOADED_PATENT")
130
 
131
- # Debug: Check PERSISTED_DIRECTORY
132
- st.write(f"Using Persisted Directory: {PERSISTED_DIRECTORY}")
 
133
  vectordb = Chroma(
134
  persist_directory=PERSISTED_DIRECTORY,
135
  embedding_function=HuggingFaceEmbeddings(),
136
  )
137
 
138
- # Debug: Confirm already indexed
139
  if loaded_patent == file_name or already_indexed(vectordb, file_name):
140
  st.write("✅ Already indexed.")
141
  else:
142
  st.write("🔄 Starting document processing and vectorstore update...")
143
-
144
  # Remove existing collection and load new docs
145
  vectordb.delete_collection()
146
  docs = load_docs(file_name)
147
 
148
- # Debug: Verify text chunking
149
- st.write(f"🔍 Number of Documents Loaded: {len(docs)}")
150
- for i, doc in enumerate(docs[:5]): # Show first 5 chunks for debugging
151
- st.write(f"Chunk {i + 1}: {doc.page_content[:200]}...")
152
-
153
  # Update vectorstore
154
  vectordb = Chroma.from_documents(
155
  docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
@@ -160,18 +161,15 @@ def load_chain(file_name=None):
160
  # Save loaded patent in session state
161
  st.session_state["LOADED_PATENT"] = file_name
162
 
163
- # Debug: Check vectorstore indexing
164
  indexed_docs = vectordb.get(include=["documents"])
165
- st.write(f"✅ Indexed Documents in Vectorstore: {len(indexed_docs['documents'])}")
166
- for i, doc in enumerate(indexed_docs["documents"][:3]): # Show first 3 indexed docs
167
- st.write(f"Indexed Doc {i + 1}: {doc[:200]}...")
168
 
169
- # Test retrieval with a sample query
170
  retriever = vectordb.as_retriever(search_kwargs={"k": 3})
171
  test_query = "What is this document about?"
172
  results = retriever.get_relevant_documents(test_query)
173
 
174
- # Debug: Verify document retrieval
175
  st.write("🔍 Test Retrieval Results for Query:")
176
  if results:
177
  for i, res in enumerate(results):
@@ -182,18 +180,16 @@ def load_chain(file_name=None):
182
  # Configure memory for conversation
183
  memory = ConversationBufferMemory(
184
  memory_key="chat_history",
185
- return_messages=True,
186
- input_key="question",
187
- output_key="answer",
188
  )
189
 
190
  return ConversationalRetrievalChain.from_llm(
191
  OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
192
  retriever,
193
- return_source_documents=False,
194
- memory=memory,
195
  )
196
 
 
197
  def extract_patent_number(url):
198
  pattern = r"/patent/([A-Z]{2}\d+)"
199
  match = re.search(pattern, url)
 
56
  check_poppler_installed()
57
 
58
  def load_docs(document_path):
59
+ """
60
+ Load and clean the PDF content, then split into chunks.
61
+ """
62
  try:
63
  import fitz # PyMuPDF for text extraction
64
 
 
74
 
75
  doc.close()
76
 
77
+ # Combine all pages into one text
78
  full_text = "\n".join(extracted_text)
79
  st.write(f"📄 Total Cleaned Text Length: {len(full_text)} characters")
80
 
81
+ # Step 2: Chunk the cleaned text
82
  text_splitter = RecursiveCharacterTextSplitter(
83
  chunk_size=1000,
84
  chunk_overlap=100,
 
86
  )
87
  split_docs = text_splitter.create_documents([full_text])
88
 
89
+ # Debug: Show total chunks count and first 3 chunks for verification
90
  st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
91
+ for i, doc in enumerate(split_docs[:3]): # Show first 3 chunks only
92
  st.write(f"Chunk {i + 1}: {doc.page_content[:300]}...")
93
 
94
  return split_docs
 
129
  return file_name in indexed_sources
130
 
131
  def load_chain(file_name=None):
132
+ """
133
+ Load cleaned PDF text, split into chunks, and update the vectorstore.
134
+ """
135
  loaded_patent = st.session_state.get("LOADED_PATENT")
136
 
137
+ # Debug: Show persist directory
138
+ st.write(f"🗂 Using Persisted Directory: {PERSISTED_DIRECTORY}")
139
+
140
  vectordb = Chroma(
141
  persist_directory=PERSISTED_DIRECTORY,
142
  embedding_function=HuggingFaceEmbeddings(),
143
  )
144
 
 
145
  if loaded_patent == file_name or already_indexed(vectordb, file_name):
146
  st.write("✅ Already indexed.")
147
  else:
148
  st.write("🔄 Starting document processing and vectorstore update...")
149
+
150
  # Remove existing collection and load new docs
151
  vectordb.delete_collection()
152
  docs = load_docs(file_name)
153
 
 
 
 
 
 
154
  # Update vectorstore
155
  vectordb = Chroma.from_documents(
156
  docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
 
161
  # Save loaded patent in session state
162
  st.session_state["LOADED_PATENT"] = file_name
163
 
164
+ # Debug: Check vectorstore indexing summary
165
  indexed_docs = vectordb.get(include=["documents"])
166
+ st.write(f"✅ Total Indexed Documents: {len(indexed_docs['documents'])}")
 
 
167
 
168
+ # Test retrieval with a simple query
169
  retriever = vectordb.as_retriever(search_kwargs={"k": 3})
170
  test_query = "What is this document about?"
171
  results = retriever.get_relevant_documents(test_query)
172
 
 
173
  st.write("🔍 Test Retrieval Results for Query:")
174
  if results:
175
  for i, res in enumerate(results):
 
180
  # Configure memory for conversation
181
  memory = ConversationBufferMemory(
182
  memory_key="chat_history",
183
+ return_messages=True
 
 
184
  )
185
 
186
  return ConversationalRetrievalChain.from_llm(
187
  OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
188
  retriever,
189
+ memory=memory
 
190
  )
191
 
192
+
193
  def extract_patent_number(url):
194
  pattern = r"/patent/([A-Z]{2}\d+)"
195
  match = re.search(pattern, url)