# ivyblossom's picture
# Update app.py
# 6b1590b
# raw history blame
# No virus
# 2.39 kB
import os
import streamlit as st
from transformers import pipeline
import re
from PyPDF2 import PdfFileReader
def truncate_to_word_boundary(text, max_words=100):
    """Return ``text`` cut off at the end of its ``max_words``-th word.

    A "word" is a run of ``\\w`` characters. The original text is sliced
    rather than rebuilt from its tokens, so punctuation and spacing inside
    the kept portion survive (``"don't"`` stays ``"don't"``).

    Args:
        text: The text to shorten.
        max_words: Maximum number of words to keep.

    Returns:
        ``text`` unchanged when it holds at most ``max_words`` words;
        otherwise the prefix of ``text`` ending at the last kept word.
        An empty string when ``text`` is empty or ``max_words`` is not
        positive.
    """
    if not text or max_words <= 0:
        return ""

    cut_at = 0
    for word_count, match in enumerate(re.finditer(r'\w+', text), start=1):
        if word_count > max_words:
            # One word too many: cut at the end of the previous word.
            return text[:cut_at]
        cut_at = match.end()

    # The text never exceeded the limit, so there is nothing to truncate.
    return text
# Checkpoint used for both the model and its tokenizer.
_QA_MODEL_NAME = "distilbert-base-cased-distilled-squad"


@st.cache_resource
def _load_question_answerer():
    """Build the question-answering pipeline once and reuse it.

    Streamlit re-executes the script on every interaction; without this
    cache the model and tokenizer would be reloaded on every button click.
    NOTE(review): ``st.cache_resource`` requires Streamlit >= 1.18.
    """
    return pipeline("question-answering", model=_QA_MODEL_NAME, tokenizer=_QA_MODEL_NAME)


def question_answering(question, text):
    """Answer ``question`` using ``text`` as the context.

    Args:
        question: The question to ask.
        text: The context passage in which to look for the answer.

    Returns:
        The result of the Transformers question-answering pipeline; the
        caller in this file reads its ``'answer'``, ``'score'`` and
        ``'start'`` entries.
    """
    question_answerer = _load_question_answerer()
    return question_answerer(question=question, context=text)
# Inserted between PDF pages so the last word of one page does not fuse with
# the first word of the next.
_PAGE_SEPARATOR = "\n"


def _extract_pdf_pages(uploaded_file):
    """Return the extracted text of every page of the PDF, in page order."""
    # NOTE(review): these are the legacy PyPDF2 (< 3.0) names, kept to match
    # the file's `PdfFileReader` import; PyPDF2 3.x renamed them
    # (PdfReader / .pages / extract_text) - confirm the installed version.
    pdf_reader = PdfFileReader(uploaded_file)
    return [
        # `or ""` guards against a page that yields no text.
        pdf_reader.getPage(page_num).extractText() or ""
        for page_num in range(pdf_reader.getNumPages())
    ]


def _page_number_for_offset(page_texts, offset):
    """Map a character offset in the joined page text to a 1-based page number."""
    page_end = 0
    for page_number, page_text in enumerate(page_texts, start=1):
        # Each page owns its text plus the separator that follows it.
        page_end += len(page_text) + len(_PAGE_SEPARATOR)
        if offset < page_end:
            return page_number
    # Offsets beyond the end are attributed to the last page.
    return len(page_texts)


def main():
    """Run the Streamlit app: upload a file, ask a question, show the answer.

    PDF and plain-text uploads are supported. After "Answer" is pressed the
    app shows the question, the answer and its score, where the answer was
    found (page number for PDFs, character offset for text files) and a
    truncated excerpt of the document. Missing input, unsupported file
    types and files without extractable text produce a message instead of
    an exception.
    """
    st.title("Question Answering on an Uploaded File")
    uploaded_file = st.file_uploader("Upload a file:", type=["pdf", "txt", "docx", "csv", "json"])
    question = st.text_input("Ask your question:")

    if not st.button("Answer"):
        return
    if uploaded_file is None:
        st.warning("Please upload a file first.")
        return
    if not question.strip():
        st.warning("Please enter a question.")
        return

    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
    page_texts = None  # Stays None for non-PDF input.
    if file_extension == ".pdf":
        page_texts = _extract_pdf_pages(uploaded_file)
        context = _PAGE_SEPARATOR.join(page_texts)
    elif file_extension == ".txt":
        # getvalue() does not depend on the stream position; undecodable
        # bytes are replaced rather than aborting the request.
        context = uploaded_file.getvalue().decode("utf-8", errors="replace")
    else:
        # docx/csv/json can be uploaded but are not parsed yet.
        st.error(f"Unsupported file type '{file_extension}'. Only PDF and plain-text files can be processed.")
        return

    if not context.strip():
        st.warning("No text could be extracted from the uploaded file.")
        return

    answer = question_answering(question, context)

    st.write(f"Question: '{question}'")
    st.write("Answer:", answer['answer'])
    st.write("Score:", answer['score'])
    # answer['start'] is a character offset into the context, not a page
    # index, so it is translated to a page for PDFs and labelled honestly
    # for plain text.
    if page_texts is not None:
        st.write("Page Number:", _page_number_for_offset(page_texts, answer['start']))
    else:
        st.write("Character Offset:", answer['start'])

    # Display truncated context
    st.write("Context:", truncate_to_word_boundary(context))
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()