File size: 4,500 Bytes
26998f0
 
 
 
 
 
 
 
 
 
 
c9f5d3f
 
 
 
 
26998f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c9f5d3f
 
 
 
 
 
26998f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c9f5d3f
26998f0
c9f5d3f
 
 
 
 
 
 
 
 
26998f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import streamlit as st

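# Use the centered page layout. Streamlit requires set_page_config to be the
# first Streamlit command executed, so it runs before the remaining imports.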
st.set_page_config(layout="centered")


from utils.footer import cust_footer
import docx2txt
import requests
import pdfplumber

def load_css(file_path):
    """Inject the contents of a CSS file into the current Streamlit page.

    Args:
        file_path: Path of the stylesheet to read.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(file_path, encoding="utf-8") as css_file:
        css = css_file.read()
    # unsafe_allow_html is needed for the <style> tag to be emitted as HTML.
    st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)

# Concatenate the text of every page of an uploaded PDF.
def _extract_pdf_text(document):
    """Return the text of all pages in ``document``, each followed by a newline.

    Args:
        document: A file-like object (or path) accepted by ``pdfplumber.open``.
    """
    with pdfplumber.open(document) as pdf:
        # extract_text() may return None for a page without a text layer;
        # treat such a page as empty instead of failing on None + "\n".
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)


# function to run the enter button
def run_function(documents):
    """Extract and display the text of the uploaded documents.

    For each document its name, MIME type and size are written to the page,
    then its text is extracted according to the MIME type (plain text, PDF or
    DOCX). Documents of any other type are listed but contribute no text.
    The combined text is shown in a text area.

    Args:
        documents: Iterable of uploaded files exposing ``name``, ``type``,
            ``size`` and ``read()``, or ``None`` when nothing was uploaded.

    Returns:
        A ``(status, data)`` tuple. ``status`` is ``False`` only when
        ``documents`` is ``None``; ``data`` is the concatenated text.
    """
    if documents is None:
        st.error("Error: An error occurred while fetching content.")
        # return extract status, and the data extracted
        return False, ""

    parts = []
    for document in documents:
        st.write(
            {
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size,
            }
        )

        # Extract content from the txt file
        if document.type == "text/plain":
            # Uploaded text is not guaranteed to be UTF-8; substitute
            # undecodable bytes instead of aborting the whole extraction.
            parts.append(document.read().decode("utf-8", errors="replace"))

        # Extract content from the pdf file
        elif document.type == "application/pdf":
            try:
                parts.append(_extract_pdf_text(document))
            except Exception as exc:
                # Deliberately broad: a corrupt or encrypted PDF can raise
                # various parser errors. Report it for this file and continue
                # with the remaining documents.
                st.error(f"Could not extract text from {document.name}: {exc}")

        # Extract content from the docx file
        elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            parts.append(docx2txt.process(document))

    data = "".join(parts)

    # Display the extracted text content from file
    st.text_area("Extracted Text", value=data, height=200)
    # return extract status, and the data extracted
    return True, data


# Reset the page's session flags and restart the script run.
def _clear_and_rerun():
    """Clear the extraction state and rerun the Streamlit script."""
    st.session_state.button_enter_doc = False
    st.session_state.extracted_doc = False
    # st.experimental_rerun is deprecated in favour of st.rerun; prefer the
    # new API when it exists and fall back for older Streamlit versions.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()


def main():
    """Render the "Extract Data from Documents" page.

    Shows a file uploader and an "Enter" button. Once "Enter" has been
    pressed (remembered in ``st.session_state.button_enter_doc``), the
    uploaded files are passed to ``run_function``. On success "Save" and
    "Clear" buttons are offered; otherwise a warning and a "clear" button.
    """
    st.subheader("Extract Data from Documents")

    documents = st.file_uploader(
        "", type=["pdf", "txt", "docx"], accept_multiple_files=True
    )

    # Session flags survive the rerun Streamlit performs on every interaction.
    if "button_enter_doc" not in st.session_state:
        st.session_state.button_enter_doc = False

    if "extracted_doc" not in st.session_state:
        st.session_state.extracted_doc = False

    # The button goes in the left column; the right column is left empty.
    enter_col, _ = st.columns([0.5, 0.5])
    with enter_col:
        if st.button("Enter"):
            st.session_state.button_enter_doc = True

    # the enter button
    if st.session_state.button_enter_doc:
        # An empty upload list is passed on as None so that run_function
        # reports it as a failure.
        if not documents:
            documents = None
        else:
            for doc in documents:
                if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
                    # if documents is not the relevant type
                    st.error("Unsupported file: " + doc.name)

        st.session_state.extracted_doc, data = run_function(documents)

        if st.session_state.extracted_doc:
            save_col, clear_col = st.columns([0.5, 0.5])
            with save_col:
                saved_button = st.download_button(label="Save", data=data)

            with clear_col:
                if st.button("Clear"):
                    _clear_and_rerun()

            if saved_button:
                # Confirmation message
                st.success("File saved successfully.")

        else:
            clear_col, _ = st.columns([0.5, 0.5])
            st.warning("Data not extracted")
            with clear_col:
                if st.button("clear"):
                    _clear_and_rerun()

    # Spacer elements before the footer.
    st.write("#")
    st.write("#")

    # Add a success message to the sidebar
    st.sidebar.success("Select a page above.")

    # importing the custom footer from utils
    cust_footer()


# Render the page when this file is executed as a script.
if __name__ == "__main__":
    main()