DrishtiSharma committed
Commit e9ee2aa · verified · 1 Parent(s): e7b04df

Update app.py

Files changed (1)
  1. app.py +25 -34
app.py CHANGED
@@ -36,6 +36,8 @@ from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import NLTKTextSplitter
 from patent_downloader import PatentDownloader
+from langchain.document_loaders import PyMuPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 PERSISTED_DIRECTORY = tempfile.mkdtemp()
 
@@ -55,41 +57,30 @@ check_poppler_installed()
 
 def load_docs(document_path):
     try:
-        loader = UnstructuredPDFLoader(
-            document_path,
-            mode="elements",
-            strategy="fast",
-            ocr_languages=None
-        )
+        # Load the entire PDF content
+        loader = PyMuPDFLoader(document_path)
         documents = loader.load()
-        text_splitter = NLTKTextSplitter(chunk_size=1000)
-        split_docs = text_splitter.split_documents(documents)
-
-        # Filter out metadata, very short, or redundant chunks
-        filtered_docs = []
-        seen_chunks = set()
-
-        for doc in split_docs:
-            content = doc.page_content.strip()
-
-            # Filter conditions: Ignore short chunks, common metadata, or duplicates
-            if (
-                len(content) < 50 or
-                "United States Patent" in content or
-                re.match(r"^\(?\d+\)?$", content) or  # Matches lines like "(12)" or "10"
-                content in seen_chunks
-            ):
-                continue
-
-            filtered_docs.append(doc)
-            seen_chunks.add(content)
-
-        # Debugging: Show filtered chunks
-        st.write(f"🔍 Filtered Documents: {len(filtered_docs)}")
-        for i, doc in enumerate(filtered_docs[:5]):  # Show first 5 chunks
-            st.write(f"Filtered Chunk {i + 1}: {doc.page_content[:200]}...")
-
-        return filtered_docs
+
+        # Combine all pages into a single string
+        full_text = "\n".join([doc.page_content for doc in documents])
+
+        # Debug: Verify total text size
+        st.write(f"📄 Total Text Length: {len(full_text)} characters")
+
+        # Split the text into meaningful chunks
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=100,
+            separators=["\n\n", "\n", " ", ""]
+        )
+        split_docs = text_splitter.create_documents([full_text])
+
+        # Debug: Show chunks after splitting
+        st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
+        for i, doc in enumerate(split_docs[:5]):  # Show first 5 chunks
+            st.write(f"Chunk {i + 1}: {doc.page_content[:200]}...")
+
+        return split_docs
     except Exception as e:
         st.error(f"Failed to load and process PDF: {e}")
         st.stop()
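
For reference, a minimal standalone sketch of the new loading-and-splitting flow introduced by this commit, runnable outside Streamlit. It assumes langchain and pymupdf are installed; "sample.pdf" is a hypothetical placeholder path, and print replaces the st.write debug calls:

    from langchain.document_loaders import PyMuPDFLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    # Load the PDF; PyMuPDFLoader yields one Document per page
    loader = PyMuPDFLoader("sample.pdf")  # hypothetical path, not from this repo
    documents = loader.load()

    # Join pages so chunk boundaries are not forced at page breaks
    full_text = "\n".join(doc.page_content for doc in documents)
    print(f"Total text length: {len(full_text)} characters")

    # Split on paragraph, then line, then word, then character boundaries
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,      # max characters per chunk
        chunk_overlap=100,    # characters shared between adjacent chunks
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.create_documents([full_text])
    print(f"Chunks after splitting: {len(split_docs)}")

Compared to the removed NLTKTextSplitter path, the chunk_overlap=100 preserves context that spans chunk boundaries, and joining the pages before splitting keeps sentences that cross a page break inside a single chunk.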