ivyblossom committed on
Commit
7b208e8
1 Parent(s): 0f897d9

Update app.py

Files changed (1)
1. app.py +25 -33
app.py CHANGED
@@ -1,13 +1,9 @@
 import os
 import fitz # PyMuPDF for parsing PDF
 import streamlit as st
-from sentence_transformers import SentenceTransformer, util
+from transformers import pipeline
 import re
 
-# Load a pre-trained SentenceTransformer model
-model_name = "paraphrase-MiniLM-L6-v2"
-model = SentenceTransformer(model_name)
-
 # Function to extract text from a PDF file
 def extract_text_from_pdf(pdf_path):
     text = ""
@@ -19,38 +15,28 @@ def extract_text_from_pdf(pdf_path):
             yield page_num + 1, page_text # Return the page number (1-based) and the extracted text
 
 # Function to truncate text to the nearest word boundary
-def truncate_to_word_boundary(text, max_words=500):
+def truncate_to_word_boundary(text, max_words=100):
     words = re.findall(r'\w+', text)
     truncated_text = ' '.join(words[:max_words])
     return truncated_text
 
-# Function to perform semantic search
-def semantic_search(query, documents, top_k=5, max_words=500):
-    query_embedding = model.encode(query, convert_to_tensor=True)
-
-    # Convert the list of documents to embeddings
-    document_embeddings = model.encode([text for _, text in documents], convert_to_tensor=True)
-
-    # Compute cosine similarity scores of query with documents
-    cosine_scores = util.pytorch_cos_sim(query_embedding.unsqueeze(0), document_embeddings)[0]
+# Function to perform question-answering
+def question_answering(question, pdf_text_with_pages):
+    pdf_text = "\n".join([text for _, text in pdf_text_with_pages])
 
-    # Sort the results in decreasing order
-    results = []
-    for idx in range(len(cosine_scores)):
-        page_num, text = documents[idx]
-        truncated_text = truncate_to_word_boundary(text, max_words)
-        results.append((page_num, truncated_text, cosine_scores[idx].item()))
-    results = sorted(results, key=lambda x: x[2], reverse=True)
+    # Perform question-answering using Hugging Face's Transformers
+    question_answerer = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", tokenizer="distilbert-base-cased-distilled-squad")
+    answer = question_answerer(question=question, context=pdf_text)
 
-    return results[:top_k]
+    return answer
 
 def main():
-    st.title("Semantic Search on PDF Documents")
+    st.title("Question Answering using a PDF Document")
 
     pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
-    query = st.text_input("Enter your query:")
+    question = st.text_input("Ask your question:")
 
-    if st.button("Search"):
+    if st.button("Answer"):
         if pdf_file:
             pdf_path = os.path.join(os.getcwd(), pdf_file.name)
             with open(pdf_path, "wb") as f:
@@ -59,15 +45,21 @@ def main():
             # Extract text from the PDF along with page numbers
             pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
 
-            search_results = semantic_search(query, pdf_text_with_pages)
+            # Perform question-answering
+            answer = question_answering(question, pdf_text_with_pages)
+
             os.remove(pdf_path) # Delete the uploaded file after processing
 
-            st.write(f"Search results for query: '{query}'")
-            for i, (page_num, result_text, score) in enumerate(search_results, start=1):
-                with st.container():
-                    st.write(f"Result {i} - Page {page_num}")
-                    st.write(f"Score: {score:.2f}")
-                    st.write(result_text)
+            st.write(f"Question: '{question}'")
+            st.write("Answer:", answer['answer'])
+            st.write("Score:", answer['score'])
+            st.write("Page Number:", answer['start'] + 1) # Add 1 to convert 0-based index to 1-based page number
+
+            # Display truncated context
+            start_page = answer['start']
+            context = pdf_text_with_pages[start_page][1]
+            truncated_context = truncate_to_word_boundary(context)
+            st.write("Context:", truncated_context)
 
 if __name__ == "__main__":
     main()
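
For reference, a minimal standalone sketch (not part of this commit) of the extractive question-answering call the updated app.py relies on. The model and tokenizer names match the diff; the question and context strings are made-up placeholders. In the pipeline's output dictionary, 'start' and 'end' are character offsets of the answer span within the context string, returned alongside 'answer' and 'score'.

from transformers import pipeline

# Same model/tokenizer pair as the updated app.py
qa = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
    tokenizer="distilbert-base-cased-distilled-squad",
)

# Hypothetical context standing in for the text extracted from a PDF
context = "PyMuPDF (imported as fitz) extracts the text of each PDF page before it is searched."
result = qa(question="What library extracts the text?", context=context)

# result is a dict: 'answer' (text span), 'score' (confidence),
# 'start'/'end' (character offsets of the span within `context`)
print(result["answer"], round(result["score"], 3), result["start"], result["end"])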