# Spaces: Running
import os
import re
import tempfile

import fitz  # PyMuPDF for parsing PDF
import streamlit as st
from transformers import pipeline
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Yield the text of each page of a PDF, one page at a time.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Yields:
        ``(page_number, page_text)`` tuples, where ``page_number`` is
        1-based and ``page_text`` is what ``page.get_text()`` returns for
        that page.
    """
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            # Pages are streamed to the caller; no whole-document string is
            # accumulated here because nothing ever read it.
            yield page_num + 1, page.get_text()
# Function to truncate text to the nearest word boundary
def truncate_to_word_boundary(text, max_words=100):
    """Return at most the first ``max_words`` words of ``text``.

    Words are whitespace-delimited, so punctuation, apostrophes and decimal
    points stay attached to their word ("Don't" counts as one word and is
    kept intact). Runs of whitespace between the kept words are collapsed to
    a single space.

    Args:
        text: The text to shorten.
        max_words: Maximum number of words to keep (default 100).

    Returns:
        The kept words joined by single spaces; an empty string when
        ``text`` contains no words.
    """
    words = text.split()
    return ' '.join(words[:max_words])
# Function to perform question-answering
def question_answering(question, pdf_text_with_pages):
    """Answer ``question`` using the text of all pages as one context.

    Args:
        question: The question string handed to the model.
        pdf_text_with_pages: Iterable of ``(page_number, page_text)`` pairs;
            only the text part is used.

    Returns:
        Whatever the question-answering pipeline returns for the question
        and the combined context.

    Note:
        The pipeline (model and tokenizer) is constructed on every call.
    """
    # The page texts are glued together with a single newline between pages.
    page_texts = (page_text for _, page_text in pdf_text_with_pages)
    full_context = "\n".join(page_texts)

    # Perform question-answering using Hugging Face's Transformers
    qa_model = pipeline(
        "question-answering",
        model="distilbert-base-cased-distilled-squad",
        tokenizer="distilbert-base-cased-distilled-squad",
    )
    return qa_model(question=question, context=full_context)
def locate_page_index(pdf_text_with_pages, char_offset):
    """Map a character offset in the joined page text to a page index.

    The offset is interpreted against the string obtained by joining every
    page's text with a single ``"\\n"`` separator, in order.

    Args:
        pdf_text_with_pages: Sequence of ``(page_number, page_text)`` pairs.
        char_offset: Character position in the newline-joined text.

    Returns:
        The 0-based index into ``pdf_text_with_pages`` of the page that
        contains ``char_offset``. An offset that falls on a separator is
        attributed to the page before it; an offset past the end is
        attributed to the last page.

    Raises:
        ValueError: If ``pdf_text_with_pages`` is empty.
    """
    if not pdf_text_with_pages:
        raise ValueError("pdf_text_with_pages must contain at least one page")

    page_start = 0
    for index, (_, page_text) in enumerate(pdf_text_with_pages):
        # page_end is the position of the separator that follows this page.
        page_end = page_start + len(page_text)
        if char_offset <= page_end:
            return index
        page_start = page_end + 1  # skip the "\n" separator
    return len(pdf_text_with_pages) - 1


def main():
    """Run the Streamlit UI: upload a PDF, ask a question, show the answer.

    When the "Answer" button is pressed, the uploaded PDF is written to a
    temporary file, its per-page text is extracted, the question is answered
    against the combined text, and the answer, score, page number and a
    truncated excerpt of the matching page are displayed.
    """
    st.title("Question Answering using a PDF Document")
    pdf_file = st.file_uploader("Upload a PDF file:", type=["pdf"])
    question = st.text_input("Ask your question:")

    if not st.button("Answer"):
        return
    if not pdf_file:
        st.warning("Please upload a PDF file first.")
        return
    if not question.strip():
        st.warning("Please enter a question.")
        return

    # Write the upload to a uniquely named temporary file instead of the
    # working directory under the user-supplied name, so uploads cannot
    # collide with each other or overwrite existing files.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        # getvalue() returns the whole buffer regardless of read position.
        tmp_file.write(pdf_file.getvalue())
        pdf_path = tmp_file.name
    try:
        # Extract text from the PDF along with page numbers
        pdf_text_with_pages = list(extract_text_from_pdf(pdf_path))
    finally:
        os.remove(pdf_path)  # Delete the temporary file even if parsing fails

    if not any(page_text.strip() for _, page_text in pdf_text_with_pages):
        st.warning("No text could be extracted from this PDF.")
        return

    # Perform question-answering
    answer = question_answering(question, pdf_text_with_pages)

    # answer['start'] is a character offset into the combined text, not a
    # page index, so translate it to the page that contains the answer.
    page_index = locate_page_index(pdf_text_with_pages, answer['start'])
    page_number, page_text = pdf_text_with_pages[page_index]

    st.write(f"Question: '{question}'")
    st.write("Answer:", answer['answer'])
    st.write("Score:", answer['score'])
    st.write("Page Number:", page_number)

    # Display truncated context
    st.write("Context:", truncate_to_word_boundary(page_text))


if __name__ == "__main__":
    main()