ivyblossom committed on
Commit
f601880
1 Parent(s): 7b208e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -35
app.py CHANGED
@@ -1,19 +1,8 @@
1
  import os
2
- import fitz # PyMuPDF for parsing PDF
3
  import streamlit as st
4
  from transformers import pipeline
5
  import re
6
 
7
- # Function to extract text from a PDF file
8
- def extract_text_from_pdf(pdf_path):
9
- text = ""
10
- with fitz.open(pdf_path) as pdf_document:
11
- for page_num in range(pdf_document.page_count):
12
- page = pdf_document.load_page(page_num)
13
- page_text = page.get_text()
14
- text += page_text
15
- yield page_num + 1, page_text # Return the page number (1-based) and the extracted text
16
-
17
  # Function to truncate text to the nearest word boundary
18
  def truncate_to_word_boundary(text, max_words=100):
19
  words = re.findall(r'\w+', text)
@@ -21,45 +10,53 @@ def truncate_to_word_boundary(text, max_words=100):
21
  return truncated_text
22
 
23
  # Function to perform question-answering
24
- def question_answering(question, pdf_text_with_pages):
25
- pdf_text = "\n".join([text for _, text in pdf_text_with_pages])
26
-
27
  # Perform question-answering using Hugging Face's Transformers
28
  question_answerer = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", tokenizer="distilbert-base-cased-distilled-squad")
29
- answer = question_answerer(question=question, context=pdf_text)
30
 
31
  return answer
32
 
33
  def main():
34
- st.title("Question Answering using a PDF Document")
35
 
36
- pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
37
  question = st.text_input("Ask your question:")
38
 
39
- if st.button("Answer"):
40
- if pdf_file:
41
- pdf_path = os.path.join(os.getcwd(), pdf_file.name)
42
- with open(pdf_path, "wb") as f:
43
- f.write(pdf_file.read())
 
 
44
 
45
- # Extract text from the PDF along with page numbers
46
- pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
 
 
 
 
 
47
 
 
 
 
48
  # Perform question-answering
49
- answer = question_answering(question, pdf_text_with_pages)
50
 
51
- os.remove(pdf_path) # Delete the uploaded file after processing
52
 
53
- st.write(f"Question: '{question}'")
54
- st.write("Answer:", answer['answer'])
55
- st.write("Score:", answer['score'])
56
- st.write("Page Number:", answer['start'] + 1) # Add 1 to convert 0-based index to 1-based page number
57
 
58
- # Display truncated context
59
- start_page = answer['start']
60
- context = pdf_text_with_pages[start_page][1]
61
- truncated_context = truncate_to_word_boundary(context)
62
- st.write("Context:", truncated_context)
63
 
64
  if __name__ == "__main__":
65
  main()
 
1
  import os
 
2
  import streamlit as st
3
  from transformers import pipeline
4
  import re
5
 
 
 
 
 
 
 
 
 
 
 
6
  # Function to truncate text to the nearest word boundary
7
  def truncate_to_word_boundary(text, max_words=100):
8
  words = re.findall(r'\w+', text)
 
10
  return truncated_text
11
 
12
  # Function to perform question-answering
13
def question_answering(question, text):
    """Answer *question* using *text* as the context.

    Builds a Hugging Face ``question-answering`` pipeline backed by the
    ``distilbert-base-cased-distilled-squad`` checkpoint (used for both the
    model and the tokenizer) and runs it once.

    Args:
        question: The question to ask.
        text: The context passed to the pipeline.

    Returns:
        Whatever the pipeline returns for a single question/context pair;
        the caller in this file reads its ``'answer'``, ``'score'`` and
        ``'start'`` entries.
    """
    # One checkpoint name serves as both model and tokenizer.
    checkpoint = "distilbert-base-cased-distilled-squad"
    qa_pipeline = pipeline("question-answering", model=checkpoint, tokenizer=checkpoint)
    return qa_pipeline(question=question, context=text)
19
 
20
  def main():
21
+ st.title("Question Answering on Uploaded Files")
22
 
23
+ uploaded_file = st.file_uploader("Upload a file:", type=["pdf", "txt"]) # , "docx", "csv", "json", "txt"
24
  question = st.text_input("Ask your question:")
25
 
26
+ if st.button("Answer") and uploaded_file is not None:
27
+ file_extension = os.path.splitext(uploaded_file.name)[1].lower()
28
+ file_contents = uploaded_file.read()
29
+
30
+ if file_extension == ".pdf":
31
+ # Handle PDF files
32
+ import fitz # PyMuPDF for parsing PDF
33
 
34
+ pdf_text = ""
35
+ with fitz.open(stream=uploaded_file, filetype="pdf") as pdf_document:
36
+ for page in pdf_document:
37
+ pdf_text += page.get_text()
38
+
39
+ # Perform question-answering
40
+ answer = question_answering(question, pdf_text)
41
 
42
+ elif file_extension == ".txt":
43
+ # Handle plain text files
44
+ text = file_contents.decode("utf-8")
45
  # Perform question-answering
46
+ answer = question_answering(question, text)
47
 
48
+ # Add support for other file types (e.g., docx, csv, json) if needed
49
 
50
+ st.write(f"Question: '{question}'")
51
+ st.write("Answer:", answer['answer'])
52
+ st.write("Score:", answer['score'])
53
+ st.write("Page Number:", answer['start'] + 1) # Add 1 to convert 0-based index to 1-based page number
54
 
55
+ # Display truncated context
56
+ start_page = answer['start']
57
+ context = pdf_text if file_extension == ".pdf" else text
58
+ truncated_context = truncate_to_word_boundary(context)
59
+ st.write("Context:", truncated_context)
60
 
61
  if __name__ == "__main__":
62
  main()