ivyblossom's picture
Update app.py
7b208e8
raw
history blame
2.51 kB
import os
import fitz # PyMuPDF for parsing PDF
import streamlit as st
from transformers import pipeline
import re
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Yield ``(page_number, page_text)`` for every page of a PDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Yields:
        A tuple of the 1-based page number and the text that
        ``page.get_text()`` returns for that page, in document order.
    """
    # The document stays open only while pages are being produced; the
    # ``with`` block closes it once the generator is exhausted or closed.
    with fitz.open(pdf_path) as pdf_document:
        for page_index in range(pdf_document.page_count):
            page = pdf_document.load_page(page_index)
            # Convert the 0-based index to a human-friendly page number.
            yield page_index + 1, page.get_text()
# Function to truncate text to the nearest word boundary
def truncate_to_word_boundary(text, max_words=100):
    """Return *text* cut off after at most *max_words* words.

    Words are runs of ``\\w`` characters. The text is cut just before the
    first word that exceeds the limit, so punctuation attached to the kept
    words is preserved. Runs of whitespace (including newlines) in the
    result are collapsed to single spaces and the ends are stripped.

    Args:
        text: The text to shorten.
        max_words: Maximum number of words to keep. Values of zero or
            less yield an empty string.

    Returns:
        The shortened, whitespace-normalised text.
    """
    if max_words <= 0:
        return ""

    cut = len(text)  # keep everything unless a word past the limit is found
    for count, match in enumerate(re.finditer(r'\w+', text), start=1):
        if count > max_words:
            # Stop right before the first surplus word.
            cut = match.start()
            break

    return ' '.join(text[:cut].split())
# Function to perform question-answering
def question_answering(question, pdf_text_with_pages):
    """Answer *question* using the text of the extracted PDF pages.

    The page texts are joined with a newline into a single context string,
    which is handed to a Hugging Face ``question-answering`` pipeline
    together with the question.

    Args:
        question: The question to ask.
        pdf_text_with_pages: Iterable of ``(page_number, page_text)`` pairs.

    Returns:
        Whatever the pipeline returns for the question/context pair. The
        caller in this file reads its ``'answer'``, ``'score'`` and
        ``'start'`` entries.
    """
    model_name = "distilbert-base-cased-distilled-squad"

    # Only the text part of each pair is needed to build the context.
    page_texts = [page_text for _page_number, page_text in pdf_text_with_pages]
    context = "\n".join(page_texts)

    # The pipeline is built on every call, with the same checkpoint for both
    # model and tokenizer.
    qa_pipeline = pipeline(
        "question-answering",
        model=model_name,
        tokenizer=model_name,
    )
    return qa_pipeline(question=question, context=context)
def _page_index_for_offset(pdf_text_with_pages, offset, separator_length=1):
    """Return the list index of the page that contains character *offset*.

    *offset* is a position in the string obtained by joining the page texts
    with a separator of *separator_length* characters (``question_answering``
    joins with a single newline). Offsets past the end map to the last page.
    """
    page_start = 0
    for index, (_page_number, page_text) in enumerate(pdf_text_with_pages):
        next_page_start = page_start + len(page_text) + separator_length
        if offset < next_page_start:
            return index
        page_start = next_page_start
    return len(pdf_text_with_pages) - 1


def main():
    """Run the Streamlit app: upload a PDF, ask a question, show the answer.

    When the "Answer" button is pressed, the uploaded PDF is written to a
    temporary file, its text is extracted page by page, and the question is
    answered against that text. The answer, its score, the page on which the
    answer starts and a truncated excerpt of that page are displayed.
    """
    # Local import so this function carries its own dependency.
    import tempfile

    st.title("Question Answering using a PDF Document")
    pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
    question = st.text_input("Ask your question:")

    if not st.button("Answer"):
        return
    if not pdf_file:
        st.warning("Please upload a PDF file first.")
        return
    if not question.strip():
        st.warning("Please enter a question.")
        return

    # Write the upload to a private temporary file instead of the working
    # directory under the client-supplied name, which could overwrite
    # existing files.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(pdf_file.getvalue())
        pdf_path = tmp_file.name
    try:
        # Extract text from the PDF along with page numbers
        pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
    finally:
        # Always delete the temporary copy, even if extraction fails.
        os.remove(pdf_path)

    if not any(page_text.strip() for _, page_text in pdf_text_with_pages):
        st.warning("No extractable text was found in the PDF.")
        return

    # Perform question-answering
    answer = question_answering(question, pdf_text_with_pages)

    # answer['start'] is a character offset into the joined page texts, not
    # a page index, so translate it to the page that contains it.
    page_index = _page_index_for_offset(pdf_text_with_pages, answer['start'])
    page_number, page_text = pdf_text_with_pages[page_index]

    st.write(f"Question: '{question}'")
    st.write("Answer:", answer['answer'])
    st.write("Score:", answer['score'])
    st.write("Page Number:", page_number)

    # Display truncated context
    st.write("Context:", truncate_to_word_boundary(page_text))
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()