File size: 2,388 Bytes
e4a1f31
 
7b208e8
0f897d9
6b1590b
e4a1f31
0f897d9
7b208e8
0f897d9
 
 
 
7b208e8
f601880
7b208e8
 
f601880
e4a1f31
7b208e8
e4a1f31
 
4bd7bfe
e4a1f31
6b1590b
7b208e8
e4a1f31
f601880
 
 
 
 
6b1590b
 
f601880
6b1590b
 
 
f601880
 
 
09ef786
f601880
 
 
7b208e8
f601880
7b208e8
f601880
e4a1f31
f601880
 
 
 
7b208e8
f601880
 
 
 
 
e4a1f31
 
b06f609
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import streamlit as st
from transformers import pipeline
import re
from PyPDF2 import PdfFileReader

def truncate_to_word_boundary(text, max_words=100):
    """Return at most the first ``max_words`` words of ``text``.

    Words are whitespace-delimited, so punctuation and contractions stay
    attached to their word ("Don't stop." keeps its apostrophe and period).
    Runs of whitespace between the kept words are collapsed to single
    spaces and leading/trailing whitespace is dropped.

    Args:
        text: The string to shorten.
        max_words: Maximum number of words to keep. Zero or a negative
            value yields an empty string.

    Returns:
        The leading words of ``text`` joined by single spaces; the whole
        (whitespace-normalised) text when it has ``max_words`` words or fewer.
    """
    if max_words <= 0:
        return ""
    # maxsplit stops the scan after ``max_words`` words; anything left over
    # arrives as one trailing piece, which the slice discards. This avoids
    # tokenising an entire document just to show its first few words.
    leading_words = text.split(None, max_words)[:max_words]
    return " ".join(leading_words)

# Checkpoint used for both the model weights and the tokenizer.
_QA_MODEL_NAME = "distilbert-base-cased-distilled-squad"

# Holds the pipeline once it has been built so repeated calls reuse it.
# NOTE(review): Streamlit re-executes the script on each interaction, which
# presumably resets this module-level cache; if the installed Streamlit
# provides st.cache_resource, wrapping the loader with it would persist the
# model across reruns -- confirm against the deployed version.
_qa_pipeline_cache = {}


def _get_question_answerer():
    """Build the question-answering pipeline on first use and reuse it afterwards."""
    if "qa" not in _qa_pipeline_cache:
        _qa_pipeline_cache["qa"] = pipeline(
            "question-answering",
            model=_QA_MODEL_NAME,
            tokenizer=_QA_MODEL_NAME,
        )
    return _qa_pipeline_cache["qa"]


def question_answering(question, text):
    """Answer ``question`` using ``text`` as the context.

    Args:
        question: The question to ask.
        text: The passage the answer is looked up in.

    Returns:
        Whatever the Transformers question-answering pipeline returns for
        this question/context pair. Callers in this file read its
        ``'answer'``, ``'score'`` and ``'start'`` entries.

    Raises:
        ValueError: If ``question`` or ``text`` is empty or only whitespace.
            This is checked before the model is loaded so a request that
            cannot be answered does not pay for the load.
    """
    if not question or not question.strip():
        raise ValueError("question must not be empty")
    if not text or not text.strip():
        raise ValueError("text must not be empty")

    question_answerer = _get_question_answerer()
    return question_answerer(question=question, context=text)

def _page_number_for_offset(page_texts, offset):
    """Return the 1-based page whose text contains character ``offset``.

    ``page_texts`` is the list of per-page strings that were concatenated,
    without separators, to form the context; ``offset`` is an index into
    that concatenation. Offsets past the end map to the last page.
    """
    consumed = 0
    for page_number, page_text in enumerate(page_texts, start=1):
        consumed += len(page_text)
        if offset < consumed:
            return page_number
    return len(page_texts)


def main():
    """Render the Streamlit page: upload a file, ask a question, show the answer.

    PDF and plain-text uploads are supported. Other extensions accepted by
    the uploader produce a warning rather than a crash. For PDFs the page
    that contains the answer is reported; for text files the character
    offset of the answer is reported instead.
    """
    st.title("Question Answering on an Uploaded File")

    uploaded_file = st.file_uploader("Upload a file:", type=["pdf", "txt", "docx", "csv", "json"])
    question = st.text_input("Ask your question:")

    # The button is evaluated first so it is rendered on every run,
    # regardless of whether a file has been uploaded yet.
    if not st.button("Answer"):
        return
    if uploaded_file is None:
        st.warning("Please upload a file first.")
        return
    if not question or not question.strip():
        st.warning("Please enter a question.")
        return

    file_extension = os.path.splitext(uploaded_file.name)[1].lower()

    # Per-page text is kept only for PDFs so the answer offset can be mapped
    # back to a page; it stays None for formats without pages.
    page_texts = None
    if file_extension == ".pdf":
        # NOTE(review): PdfFileReader/getNumPages/getPage/extractText are the
        # legacy PyPDF2 names; newer releases rename them -- confirm the
        # pinned PyPDF2 version before upgrading.
        pdf_reader = PdfFileReader(uploaded_file)
        page_texts = [
            pdf_reader.getPage(page_num).extractText() or ""
            for page_num in range(pdf_reader.getNumPages())
        ]
        context = "".join(page_texts)
    elif file_extension == ".txt":
        # Undecodable bytes are replaced rather than aborting the request.
        context = uploaded_file.read().decode("utf-8", errors="replace")
    else:
        st.warning(
            f"Files of type '{file_extension}' are not supported yet; "
            "please upload a PDF or TXT file."
        )
        return

    if not context.strip():
        st.warning("No text could be extracted from the uploaded file.")
        return

    answer = question_answering(question, context)

    st.write(f"Question: '{question}'")
    st.write("Answer:", answer['answer'])
    st.write("Score:", answer['score'])
    # answer['start'] is an offset into the context string, not a page index,
    # so it is translated to a page for PDFs and labelled as an offset otherwise.
    if page_texts is not None:
        st.write("Page Number:", _page_number_for_offset(page_texts, answer['start']))
    else:
        st.write("Character Offset:", answer['start'])

    # Display truncated context
    st.write("Context:", truncate_to_word_boundary(context))

# Start the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()