DrishtiSharma committed
Commit e7b04df · verified · 1 Parent(s): 0b3783b

Update app.py

Files changed (1)
  1. app.py +26 -6
app.py CHANGED
@@ -64,17 +64,37 @@ def load_docs(document_path):
         documents = loader.load()
         text_splitter = NLTKTextSplitter(chunk_size=1000)
         split_docs = text_splitter.split_documents(documents)
-
-        # Debug: Check text chunking
-        st.write(f"🔍 Loaded Documents: {len(split_docs)}")
-        for i, doc in enumerate(split_docs[:5]):  # Show first 5 chunks
-            st.write(f"Chunk {i + 1}: {doc.page_content[:200]}...")
 
-        return split_docs
+        # Filter out metadata, very short, or redundant chunks
+        filtered_docs = []
+        seen_chunks = set()
+
+        for doc in split_docs:
+            content = doc.page_content.strip()
+
+            # Filter conditions: Ignore short chunks, common metadata, or duplicates
+            if (
+                len(content) < 50 or
+                "United States Patent" in content or
+                re.match(r"^\(?\d+\)?$", content) or  # Matches lines like "(12)" or "10"
+                content in seen_chunks
+            ):
+                continue
+
+            filtered_docs.append(doc)
+            seen_chunks.add(content)
+
+        # Debugging: Show filtered chunks
+        st.write(f"🔍 Filtered Documents: {len(filtered_docs)}")
+        for i, doc in enumerate(filtered_docs[:5]):  # Show first 5 chunks
+            st.write(f"Filtered Chunk {i + 1}: {doc.page_content[:200]}...")
+
+        return filtered_docs
     except Exception as e:
         st.error(f"Failed to load and process PDF: {e}")
         st.stop()
 
+
 def already_indexed(vectordb, file_name):
     indexed_sources = set(
         x["source"] for x in vectordb.get(include=["metadatas"])["metadatas"]