Spaces:

ivyblossom
/

question-answering

Running

App Files Files Community

question-answering / app.py

ivyblossom

Update app.py

7b208e8 over 1 year ago

raw

history blame

2.51 kB

	import os
	import fitz # PyMuPDF for parsing PDF
	import streamlit as st
	from transformers import pipeline
	import re

	# Function to extract text from a PDF file, one page at a time
	def extract_text_from_pdf(pdf_path):
	    """Lazily yield ``(page_number, page_text)`` for each page of a PDF.

	    This is a generator: nothing is opened or read until it is iterated,
	    and the document stays open until the generator is exhausted or closed.

	    Args:
	        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

	    Yields:
	        A tuple of the 1-based page number and the text that
	        ``page.get_text()`` returns for that page.
	    """
	    with fitz.open(pdf_path) as pdf_document:
	        for page_index in range(pdf_document.page_count):
	            page = pdf_document.load_page(page_index)
	            # load_page() takes a 0-based index; callers get 1-based numbers.
	            yield page_index + 1, page.get_text()

	# Function to keep only the leading words of a text
	def truncate_to_word_boundary(text, max_words=100):
	    """Return the first ``max_words`` words of ``text`` joined by single spaces.

	    A "word" is a maximal run of ``\\w`` characters, so punctuation and the
	    original whitespace are not part of the result.

	    Args:
	        text: The string to shorten.
	        max_words: Upper bound used to slice the list of words.

	    Returns:
	        The selected words separated by one space each; an empty string
	        when no word is selected.
	    """
	    word_pattern = re.compile(r'\w+')
	    leading_words = word_pattern.findall(text)[:max_words]
	    return ' '.join(leading_words)

	# Function to answer a question from the extracted PDF text
	def question_answering(question, pdf_text_with_pages):
	    """Run extractive question answering over the text of all pages.

	    Args:
	        question: The question string handed to the pipeline.
	        pdf_text_with_pages: Iterable of ``(page_number, page_text)`` pairs.
	            Only the texts are used; they are joined with ``"\\n"`` into a
	            single context string.

	    Returns:
	        Whatever the Transformers question-answering pipeline returns for
	        this question/context pair (the caller reads its ``'answer'``,
	        ``'score'`` and ``'start'`` entries).
	    """
	    model_name = "distilbert-base-cased-distilled-squad"
	    context = "\n".join(page_text for _page, page_text in pdf_text_with_pages)

	    # NOTE(review): the pipeline is constructed on every call, which
	    # presumably reloads the model each time - consider caching it.
	    qa_pipeline = pipeline("question-answering", model=model_name, tokenizer=model_name)
	    return qa_pipeline(question=question, context=context)

	def page_index_for_offset(pdf_text_with_pages, offset):
	    """Map a character offset in the joined context to a page's list index.

	    The context is the page texts joined with a single ``"\\n"``, so page
	    ``i`` starts right after the separator that follows page ``i - 1``.
	    A separator is attributed to the page it follows.

	    Args:
	        pdf_text_with_pages: Non-empty sequence of ``(page_number, text)``.
	        offset: 0-based character position in the joined context.

	    Returns:
	        Index into ``pdf_text_with_pages`` of the page containing ``offset``;
	        the last index when ``offset`` lies beyond the end of the context.

	    Raises:
	        ValueError: If ``pdf_text_with_pages`` is empty.
	    """
	    if not pdf_text_with_pages:
	        raise ValueError("pdf_text_with_pages must not be empty")

	    next_page_start = 0
	    for index, (_, page_text) in enumerate(pdf_text_with_pages):
	        # +1 accounts for the "\n" separator inserted after each page.
	        next_page_start += len(page_text) + 1
	        if offset < next_page_start:
	            return index
	    return len(pdf_text_with_pages) - 1


	def main():
	    """Streamlit entry point: upload a PDF, ask a question, show the answer.

	    Renders the upload and question widgets and, once "Answer" is pressed,
	    extracts the PDF text, runs question answering and displays the answer,
	    its score, the page it was found on and a truncated page context.
	    """
	    import tempfile  # local import keeps this block self-contained

	    st.title("Question Answering using a PDF Document")

	    pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
	    question = st.text_input("Ask your question:")

	    if not st.button("Answer"):
	        return
	    if not pdf_file:
	        st.warning("Please upload a PDF file first.")
	        return
	    if not question.strip():
	        st.warning("Please enter a question.")
	        return

	    # Write the upload to a private temporary file instead of the working
	    # directory under the client-supplied name, so an existing file can
	    # never be overwritten or deleted by an upload with the same name.
	    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
	        tmp_file.write(pdf_file.read())
	        pdf_path = tmp_file.name

	    try:
	        # Extract text from the PDF along with page numbers
	        pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
	    finally:
	        os.remove(pdf_path)  # Always delete the temporary copy, even on errors

	    if not any(page_text.strip() for _, page_text in pdf_text_with_pages):
	        st.warning("No extractable text was found in this PDF.")
	        return

	    # Perform question-answering
	    answer = question_answering(question, pdf_text_with_pages)

	    st.write(f"Question: '{question}'")
	    st.write("Answer:", answer['answer'])
	    st.write("Score:", answer['score'])

	    # answer['start'] is a character offset into the context built by
	    # question_answering (page texts joined with "\n"), not a page index,
	    # so translate it to the page that contains that offset.
	    page_index = page_index_for_offset(pdf_text_with_pages, answer['start'])
	    page_number, context = pdf_text_with_pages[page_index]
	    st.write("Page Number:", page_number)

	    # Display truncated context
	    truncated_context = truncate_to_word_boundary(context)
	    st.write("Context:", truncated_context)

	# Launch the app only when this file is executed as a script (not on import).
	if __name__ == "__main__":
	    main()