DrishtiSharma committed
Commit e7b04df · verified · 1 Parent(s): 0b3783b

Update app.py

Files changed (1)
  1. app.py +26 -6
app.py CHANGED
@@ -64,17 +64,37 @@ def load_docs(document_path):
         documents = loader.load()
         text_splitter = NLTKTextSplitter(chunk_size=1000)
         split_docs = text_splitter.split_documents(documents)
-
-        # Debug: Check text chunking
-        st.write(f"🔍 Loaded Documents: {len(split_docs)}")
-        for i, doc in enumerate(split_docs[:5]):  # Show first 5 chunks
-            st.write(f"Chunk {i + 1}: {doc.page_content[:200]}...")
 
-        return split_docs
+        # Filter out metadata, very short, or redundant chunks
+        filtered_docs = []
+        seen_chunks = set()
+
+        for doc in split_docs:
+            content = doc.page_content.strip()
+
+            # Filter conditions: Ignore short chunks, common metadata, or duplicates
+            if (
+                len(content) < 50 or
+                "United States Patent" in content or
+                re.match(r"^\(?\d+\)?$", content) or  # Matches lines like "(12)" or "10"
+                content in seen_chunks
+            ):
+                continue
+
+            filtered_docs.append(doc)
+            seen_chunks.add(content)
+
+        # Debugging: Show filtered chunks
+        st.write(f"🔍 Filtered Documents: {len(filtered_docs)}")
+        for i, doc in enumerate(filtered_docs[:5]):  # Show first 5 chunks
+            st.write(f"Filtered Chunk {i + 1}: {doc.page_content[:200]}...")
+
+        return filtered_docs
     except Exception as e:
         st.error(f"Failed to load and process PDF: {e}")
         st.stop()
 
+
 def already_indexed(vectordb, file_name):
     indexed_sources = set(
         x["source"] for x in vectordb.get(include=["metadatas"])["metadatas"]