File size: 5,395 Bytes
6c40526
90186c7
6c40526
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1714bd5
 
 
 
6c40526
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90186c7
6c40526
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6663845
6c40526
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import streamlit as st
from PIL import Image
# from pdf2image import convert_from_path
import pandas as pd
import yake
import fitz
import nltk
from gtts import gTTS
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import os
import re

# Bootstrap the OCR dependencies.  Streamlit re-executes the whole script on
# every widget interaction, so each install is attempted only when the tool is
# actually missing instead of shelling out on every rerun.
import importlib
import shutil
import subprocess
import sys

if (
    shutil.which('tesseract') is None
    and shutil.which('sudo') is not None
    and shutil.which('apt-get') is not None
):
    # '-y' stops apt-get from waiting at its confirmation prompt, which nobody
    # can answer in a non-interactive run.  An argument list avoids the shell,
    # and check=False keeps a failed install non-fatal, as os.system() was.
    subprocess.run(
        ['sudo', 'apt-get', 'install', '-y', 'tesseract-ocr'], check=False
    )

try:
    import pytesseract
except ImportError:
    # Install into the interpreter that is running this script, then retry.
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-q', 'pytesseract'],
        check=False,
    )
    importlib.invalidate_caches()  # make the freshly installed package visible
    import pytesseract

# ---- Page header and sidebar controls -------------------------------------
st.title("Extract info from Files")

st.sidebar.title('Hyper Params')

# Kind of input to process; selects the branch run at the bottom of the script.
menu = ["Image","Dataset","DocumentFiles","About"]
choice = st.sidebar.selectbox("Select the type of data", menu)

# Upper bound on how many keywords rees() reports.
no_of_keys = st.sidebar.slider('Select the no of keywords', 1, 20, 2, 2)

# 'response' makes rees() print the matching sentence next to each keyword;
# 'keys' prints the keyword alone.
output = st.selectbox('Select the type of output', ('keys', 'response'))

# pre processing the images
# NOTE(review): the selected filter is not read anywhere else in this file, and
# the name shadows the builtin `filter`; kept as-is so the module-level name
# does not change.
filters = ['Gaussian', 'Low pass', 'High Pass', 'System defined']
filter = st.sidebar.selectbox("Select the type of filter to preprocess the image", filters)

# Point pytesseract at the Windows install location only when that binary
# really exists.  Elsewhere (e.g. a Linux host with tesseract on PATH)
# pytesseract's default command is left untouched so OCR keeps working.
tes = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
if os.path.isfile(tes):
    pytesseract.pytesseract.tesseract_cmd = tes

# ---- Keyword extraction setup ----------------------------------------------
# NOTE(review): `extractor` is not referenced elsewhere in this file.
extractor = yake.KeywordExtractor()
language = 'en'
max_ngram_size = st.sidebar.slider('Select the parameter for ngram', 1, 20, 3, 2)
# The slider works in whole steps 1..10; YAKE receives it scaled to 0.1..1.0.
deduplication_threshold = st.sidebar.slider('Select the parameter for DD threshold', 1, 10, 9, 1)
deduplication_threshold = deduplication_threshold/10
numOfKeywords = 100
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)

# Shared lemmatizer used by LemTokens().
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    """Return a list with the lemma of every token, in the original order.

    Each token is passed through the module-level ``lemmer``.
    """
    lemmas = []
    for tok in tokens:
        lemmas.append(lemmer.lemmatize(tok))
    return lemmas
# str.translate() table that deletes every ASCII punctuation character
# (each code point maps to None); LemNormalize() applies it before tokenizing.
remove_punct_dict = dict.fromkeys(map(ord, string.punctuation))

def LemNormalize(text):
    """Lower-case ``text``, drop punctuation, tokenize it and lemmatize the tokens.

    Passed to ``TfidfVectorizer`` as its ``tokenizer`` in ``rees``.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    words = nltk.word_tokenize(cleaned)
    return LemTokens(words)

def rees(glo_text, keys):
    """Report, for each leading keyword, the sentence that matches it best.

    The sentences of ``glo_text`` are ranked against each keyword by TF-IDF
    cosine similarity.  The keyword is written to the page, followed by the
    best sentence when the module-level ``output`` is ``'response'``, and the
    best sentence is also played back as synthesized speech.

    Args:
        glo_text: Full text extracted from the uploaded file.
        keys: Candidate keywords; only ``keys[:no_of_keys]`` are processed.

    Returns:
        None. All results are rendered through Streamlit.
    """
    import io  # function-scope import: only the in-memory audio buffer needs it

    # Sentence splitting does not depend on the keyword, so do it once.
    sentences = nltk.sent_tokenize(glo_text)
    if not sentences:
        # Nothing to rank against; avoid indexing into an empty ranking.
        return

    # Collapse every run of non-alphanumerics into ONE space.  Deleting them
    # outright (as before) also removed the blanks between words, so the speech
    # engine received a single run-together token.
    not_speakable = re.compile(r"[^a-zA-Z0-9]+")

    for key in keys[:no_of_keys]:
        # The keyword goes last, so row -1 of the matrix is the query vector.
        TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
        tfidf = TfidfVec.fit_transform(sentences + [key])

        # Score the keyword against the sentences only.  Ranking it against
        # itself and taking the runner-up relied on the keyword always sorting
        # last, which does not hold when several scores tie.
        vals = cosine_similarity(tfidf[-1], tfidf[:-1])
        response = sentences[int(vals.argmax())]

        if output == 'response':
            st.write(' - ' + key + ':' + response)
        else:
            st.write(' - ' + key)

        spoken = not_speakable.sub(" ", response).strip()
        if not spoken:
            # gTTS refuses empty text; there is nothing to read out.
            continue

        # Synthesize into memory instead of a shared "audio.mp3" on disk, so
        # concurrent sessions cannot overwrite or delete each other's file.
        audio = io.BytesIO()
        gTTS(text=spoken, lang=language, slow=False).write_to_fp(audio)
        # gTTS produces MP3 data, so advertise the MP3 MIME type (not ogg).
        st.audio(audio.getvalue(), format='audio/mpeg')

def load_image(image_file):
    """Show an image on the page and return the text OCR finds in it.

    Args:
        image_file: Image source handed to ``PIL.Image.open``.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the image.
    """
    img = Image.open(image_file)
    try:
        st.image(img, width=250)
        return pytesseract.image_to_string(img)
    finally:
        # Release the image even when rendering or OCR raises.
        img.close()

def load_pdf(data_file):
    """Extract the text of a PDF upload and pick keywords from it.

    Args:
        data_file: Binary file-like object; its full content is read and
            parsed as a PDF.
            NOTE(review): the parser is always told ``filetype="pdf"``, so
            other document formats are not handled here.

    Returns:
        A ``(text, keywords)`` tuple: the concatenated text of all pages and a
        new list of the selected keyword strings.
    """
    doc = fitz.open(stream=data_file.read(), filetype="pdf")
    try:
        # join() avoids rebuilding an ever-growing string once per page.
        glo_text = "".join(page.get_text() for page in doc)
    finally:
        # Close the document even if a page fails to render its text.
        doc.close()

    keywords = custom_kw_extractor.extract_keywords(glo_text)

    # Walk the extractor's result back-to-front, keeping entries scoring > 0.1.
    # NOTE(review): YAKE documents LOWER scores as more relevant - confirm that
    # keeping the higher-scoring entries, in reverse order, is intended.
    # A fresh list is built on every call rather than appending to the
    # module-level `keys`, so repeated calls cannot accumulate stale keywords.
    selected = [kw[0] for kw in keywords[::-1] if kw[1] > 0.1]
    return glo_text, selected

# Keywords for the current script run; starts empty and is rebound by the
# upload handlers at the bottom of the script.
keys = list()

def tes_image(image_file):
    """OCR an uploaded image and pick keywords from the recognized text.

    Args:
        image_file: Image source forwarded to ``load_image``; ``None`` means
            nothing was uploaded.

    Returns:
        A ``(text, keywords)`` tuple holding the OCR text and a new list of
        the selected keyword strings, or ``None`` when ``image_file`` is
        ``None``.
    """
    if image_file is None:
        return None

    # add filters if time permits
    # load_image() also renders the picture on the page before running OCR.
    glo_text = load_image(image_file)
    keywords = custom_kw_extractor.extract_keywords(glo_text)

    # Walk the extractor's result back-to-front, keeping entries scoring > 0.1.
    # NOTE(review): YAKE documents LOWER scores as more relevant - confirm that
    # keeping the higher-scoring entries, in reverse order, is intended.
    # A fresh list is built on every call rather than appending to the
    # module-level `keys`, so repeated calls cannot accumulate stale keywords.
    selected = [kw[0] for kw in keywords[::-1] if kw[1] > 0.1]
    return glo_text, selected

def tes_doc(data_file):
    """Run ``load_pdf`` on an uploaded document.

    Args:
        data_file: Uploaded file object, or ``None`` when nothing was uploaded.

    Returns:
        Whatever ``load_pdf`` returns for the file, or ``None`` when
        ``data_file`` is ``None``.
    """
    # Identity check: `!= None` would defer to the object's own __ne__.
    if data_file is None:
        return None
    return load_pdf(data_file)

def convert_df_to_text(df):
    """Placeholder for converting an uploaded DataFrame to text; does nothing yet.

    TODO: implement key to text here using key2text package.
    """
    return None

# ---- Main dispatch: one branch per entry of the sidebar menu ---------------
if choice == "Image":
    st.subheader("Image")
    image_file = st.file_uploader("Upload Images", type=["png","jpg","jpeg"])
    if image_file is not None:
        file_details = {"filename":image_file.name, "filetype":image_file.type, "filesize":image_file.size}
        st.write(file_details)
        glo_text, keys = tes_image(image_file)
        rees(glo_text, keys)

elif choice == "Dataset":
    st.subheader("Dataset")
    data_file = st.file_uploader("Upload CSV",type=["csv"])
    if data_file is not None:
        # Report the file's name, like the other branches do, rather than
        # embedding the whole upload object in the details.
        file_details = {"filename":data_file.name, "filetype":data_file.type, "filesize":data_file.size}
        st.write(file_details)
        df = pd.read_csv(data_file)
        st.write(df)
        convert_df_to_text(df)


elif choice == "DocumentFiles":
    st.subheader("DocumentFiles")
    docx_file = st.file_uploader("Upload Document", type=["pdf","docx","txt"])
    if st.button("Process"):
        if docx_file is not None:
            file_details = {"filename":docx_file.name, "filetype":docx_file.type, "filesize":docx_file.size}
            st.write(file_details)
            # load_pdf() opens every upload as a PDF, so only hand it PDFs and
            # tell the user plainly instead of failing inside the parser.
            if docx_file.name.lower().endswith(".pdf"):
                glo_text, keys = tes_doc(docx_file)
                rees(glo_text, keys)
            else:
                st.error("Only PDF documents can be processed at the moment.")

elif choice == "About":
    # The menu offers this entry; show at least its heading.
    # TODO: add a description of the app.
    st.subheader("About")