Spaces:

ivyblossom
/

question-answering

Running

App Files Files Community

ivyblossom committed on Aug 3, 2023

Commit

09ef786

•

1 Parent(s): 472d748

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -11

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import streamlit as st
 from sentence_transformers import SentenceTransformer, util
 # Load a pre-trained SentenceTransformer model
-model_name = "paraphrase-MiniLM-L6-v2"
 model = SentenceTransformer(model_name)
 # Function to extract text from a PDF file
@@ -13,8 +13,9 @@ def extract_text_from_pdf(pdf_path):
     with fitz.open(pdf_path) as pdf_document:
         for page_num in range(pdf_document.page_count):
             page = pdf_document.load_page(page_num)
-            text += page.get_text()
-    return text
 # Function to perform semantic search
 def semantic_search(query, documents, top_k=5):
@@ -29,16 +30,16 @@ def semantic_search(query, documents, top_k=5):
     # Sort the results in decreasing order
     results = []
     for idx in range(len(cosine_scores)):
-        results.append((documents[idx], cosine_scores[idx].item()))
-    results = sorted(results, key=lambda x: x[1], reverse=True)
     return results[:top_k]
 def main():
     st.title("Semantic Search on PDF Documents")
-    pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
     query = st.text_input("Enter your query:")
     if st.button("Search"):
         if pdf_file:
@@ -46,14 +47,21 @@ def main():
             with open(pdf_path, "wb") as f:
                 f.write(pdf_file.read())
-            pdf_text = extract_text_from_pdf(pdf_path)
-            search_results = semantic_search(query, [pdf_text])
             os.remove(pdf_path)  # Delete the uploaded file after processing
             st.write(f"Search results for query: '{query}'")
-            for i, (result, score) in enumerate(search_results, start=1):
-                st.write(f"{i}. Score: {score:.2f}")
-                st.write(result)
 if __name__ == "__main__":
     main()

 from sentence_transformers import SentenceTransformer, util
 # Load a pre-trained SentenceTransformer model
+model_name = "paraphrase-MiniLM-L6-v2"
 model = SentenceTransformer(model_name)
 # Function to extract text from a PDF file
     with fitz.open(pdf_path) as pdf_document:
         for page_num in range(pdf_document.page_count):
             page = pdf_document.load_page(page_num)
+            page_text = page.get_text()
+            text += page_text
+            yield page_num + 1, page_text  # Return the page number (1-based) and the extracted text
 # Function to perform semantic search
 def semantic_search(query, documents, top_k=5):
     # Sort the results in decreasing order
     results = []
     for idx in range(len(cosine_scores)):
+        results.append((documents[idx][0], documents[idx][1], cosine_scores[idx].item()))
+    results = sorted(results, key=lambda x: x[2], reverse=True)
     return results[:top_k]
 def main():
     st.title("Semantic Search on PDF Documents")
     query = st.text_input("Enter your query:")
+    pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
     if st.button("Search"):
         if pdf_file:
             with open(pdf_path, "wb") as f:
                 f.write(pdf_file.read())
+            # Extract text from the PDF along with page numbers
+            pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
+            # Extract the text content from the tuple list
+            pdf_text = [text for _, text in pdf_text_with_pages]
+            search_results = semantic_search(query, pdf_text)
             os.remove(pdf_path)  # Delete the uploaded file after processing
             st.write(f"Search results for query: '{query}'")
+            for i, (page_num, result_text, score) in enumerate(search_results, start=1):
+                with st.container():
+                    st.write(f"Result {i} - Page {page_num}")
+                    st.write(f"Score: {score:.2f}")
+                    st.write(result_text)
 if __name__ == "__main__":
     main()