ivyblossom committed on
Commit
09ef786
1 Parent(s): 472d748

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -11
app.py CHANGED
@@ -4,7 +4,7 @@ import streamlit as st
4
  from sentence_transformers import SentenceTransformer, util
5
 
6
  # Load a pre-trained SentenceTransformer model
7
- model_name = "paraphrase-MiniLM-L6-v2"
8
  model = SentenceTransformer(model_name)
9
 
10
  # Function to extract text from a PDF file
@@ -13,8 +13,9 @@ def extract_text_from_pdf(pdf_path):
13
  with fitz.open(pdf_path) as pdf_document:
14
  for page_num in range(pdf_document.page_count):
15
  page = pdf_document.load_page(page_num)
16
- text += page.get_text()
17
- return text
 
18
 
19
  # Function to perform semantic search
20
  def semantic_search(query, documents, top_k=5):
@@ -29,16 +30,16 @@ def semantic_search(query, documents, top_k=5):
29
  # Sort the results in decreasing order
30
  results = []
31
  for idx in range(len(cosine_scores)):
32
- results.append((documents[idx], cosine_scores[idx].item()))
33
- results = sorted(results, key=lambda x: x[1], reverse=True)
34
 
35
  return results[:top_k]
36
 
37
  def main():
38
  st.title("Semantic Search on PDF Documents")
39
 
40
- pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
41
  query = st.text_input("Enter your query:")
 
42
 
43
  if st.button("Search"):
44
  if pdf_file:
@@ -46,14 +47,21 @@ def main():
46
  with open(pdf_path, "wb") as f:
47
  f.write(pdf_file.read())
48
 
49
- pdf_text = extract_text_from_pdf(pdf_path)
50
- search_results = semantic_search(query, [pdf_text])
 
 
 
 
 
51
  os.remove(pdf_path) # Delete the uploaded file after processing
52
 
53
  st.write(f"Search results for query: '{query}'")
54
- for i, (result, score) in enumerate(search_results, start=1):
55
- st.write(f"{i}. Score: {score:.2f}")
56
- st.write(result)
 
 
57
 
58
  if __name__ == "__main__":
59
  main()
 
4
  from sentence_transformers import SentenceTransformer, util
5
 
6
  # Load a pre-trained SentenceTransformer model
7
+ model_name = "paraphrase-MiniLM-L6-v2"
8
  model = SentenceTransformer(model_name)
9
 
10
  # Function to extract text from a PDF file
 
13
  with fitz.open(pdf_path) as pdf_document:
14
  for page_num in range(pdf_document.page_count):
15
  page = pdf_document.load_page(page_num)
16
+ page_text = page.get_text()
17
+ text += page_text
18
+ yield page_num + 1, page_text # Return the page number (1-based) and the extracted text
19
 
20
  # Function to perform semantic search
21
  def semantic_search(query, documents, top_k=5):
 
30
  # Sort the results in decreasing order
31
  results = []
32
  for idx in range(len(cosine_scores)):
33
+ results.append((documents[idx][0], documents[idx][1], cosine_scores[idx].item()))
34
+ results = sorted(results, key=lambda x: x[2], reverse=True)
35
 
36
  return results[:top_k]
37
 
38
  def main():
39
  st.title("Semantic Search on PDF Documents")
40
 
 
41
  query = st.text_input("Enter your query:")
42
+ pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
43
 
44
  if st.button("Search"):
45
  if pdf_file:
 
47
  with open(pdf_path, "wb") as f:
48
  f.write(pdf_file.read())
49
 
50
+ # Extract text from the PDF along with page numbers
51
+ pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
52
+
53
+ # Extract the text content from the tuple list
54
+ pdf_text = [text for _, text in pdf_text_with_pages]
55
+
56
+ search_results = semantic_search(query, pdf_text)
57
  os.remove(pdf_path) # Delete the uploaded file after processing
58
 
59
  st.write(f"Search results for query: '{query}'")
60
+ for i, (page_num, result_text, score) in enumerate(search_results, start=1):
61
+ with st.container():
62
+ st.write(f"Result {i} - Page {page_num}")
63
+ st.write(f"Score: {score:.2f}")
64
+ st.write(result_text)
65
 
66
  if __name__ == "__main__":
67
  main()