import os
import re
from bisect import bisect_right
from io import BytesIO

import streamlit as st
from PyPDF2 import PdfFileReader
from transformers import pipeline

# Model (and matching tokenizer) used for extractive question answering.
QA_MODEL = "distilbert-base-cased-distilled-squad"

# Extensions whose contents are treated as UTF-8 encoded plain text.
TEXT_EXTENSIONS = (".txt", ".csv", ".json")

# Inserted between PDF pages so the last word of one page is not fused with
# the first word of the next when the pages are joined into one context.
PAGE_SEPARATOR = "\n"

# Roughly how many characters of lead-in to show before the answer.
CONTEXT_LEAD_CHARS = 200


def truncate_to_word_boundary(text, max_words=100):
    """Return the first ``max_words`` whitespace-separated words of ``text``.

    Words are split on whitespace (not on ``\\w+``), so punctuation and
    contractions such as "don't" are preserved. Runs of whitespace are
    collapsed to single spaces in the result.

    Args:
        text: The text to shorten.
        max_words: Maximum number of words to keep.

    Returns:
        The kept words joined by single spaces ("" for empty input).
    """
    # Limiting the number of splits avoids tokenising the whole document
    # when only the first few words are needed.
    return " ".join(text.split(None, max_words)[:max_words])


def question_answering(question, text):
    """Run extractive question answering over ``text``.

    Args:
        question: The question to answer.
        text: The context in which the answer is searched.

    Returns:
        The pipeline result; ``main`` reads its 'answer', 'score' and
        'start' entries ('start' is a character offset into ``text``).
    """
    # NOTE: the pipeline is rebuilt on every call, which is slow; callers
    # that ask many questions may want to cache it.
    question_answerer = pipeline(
        "question-answering",
        model=QA_MODEL,
        tokenizer=QA_MODEL,
    )
    return question_answerer(question=question, context=text)


def _extract_pdf_pages(data):
    """Return the extracted text of every page of the PDF held in ``data``."""
    reader = PdfFileReader(BytesIO(data))
    return [
        reader.getPage(page_num).extractText() or ""
        for page_num in range(reader.getNumPages())
    ]


def _page_for_offset(page_texts, offset, separator_len=0):
    """Map a character offset in the joined page texts to a 1-based page."""
    page_starts = []
    position = 0
    for page_text in page_texts:
        page_starts.append(position)
        position += len(page_text) + separator_len
    # bisect_right counts the pages that start at or before the offset.
    return max(1, bisect_right(page_starts, offset))


def main():
    """Streamlit app: upload a file, ask a question, show the answer."""
    st.title("Question Answering on an Uploaded File")
    uploaded_file = st.file_uploader(
        "Upload a file:", type=["pdf", "txt", "docx", "csv", "json"]
    )
    question = st.text_input("Ask your question:")

    if not st.button("Answer") or uploaded_file is None:
        return

    if not question.strip():
        st.warning("Please enter a question.")
        return

    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
    file_contents = uploaded_file.read()

    page_texts = None  # Only set for PDFs, where page numbers exist.
    if file_extension == ".pdf":
        # Parse from the bytes already read rather than from the consumed
        # upload stream.
        page_texts = _extract_pdf_pages(file_contents)
        context = PAGE_SEPARATOR.join(page_texts)
    elif file_extension in TEXT_EXTENSIONS:
        try:
            context = file_contents.decode("utf-8")
        except UnicodeDecodeError:
            st.error("The file could not be decoded as UTF-8 text.")
            return
    else:
        # Previously this path fell through and crashed on an unbound
        # `answer`; report it explicitly instead.
        st.error(f"Files of type '{file_extension}' are not supported yet.")
        return

    if not context.strip():
        st.warning("No text could be extracted from the uploaded file.")
        return

    answer = question_answering(question, context)
    answer_start = answer['start']

    st.write(f"Question: '{question}'")
    st.write("Answer:", answer['answer'])
    st.write("Score:", answer['score'])
    if page_texts is not None:
        # 'start' is a character offset, so translate it into the page
        # that actually contains the answer.
        st.write(
            "Page Number:",
            _page_for_offset(page_texts, answer_start, len(PAGE_SEPARATOR)),
        )
    else:
        st.write("Character Offset:", answer_start)

    # Show the text surrounding the answer: begin just after the last space
    # that precedes the lead-in window (or at the start of the document).
    lead_limit = max(0, answer_start - CONTEXT_LEAD_CHARS)
    snippet_start = context.rfind(" ", 0, lead_limit) + 1
    truncated_context = truncate_to_word_boundary(context[snippet_start:])
    st.write("Context:", truncated_context)


if __name__ == "__main__":
    main()