VasudevaK committed on
Commit
6c40526
1 Parent(s): 18860d8

deployed first version

Files changed (2)
  1. app.py +159 -0
  2. requirements.txt +145 -0
app.py ADDED
@@ -0,0 +1,159 @@
+ import streamlit as st
+ import pytesseract
+ from PIL import Image
+ # from pdf2image import convert_from_path
+ import pandas as pd
+ import yake
+ import fitz  # PyMuPDF
+ import nltk
+ from gtts import gTTS
+ nltk.download('punkt')
+ nltk.download('wordnet')
+ nltk.download('omw-1.4')
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import string
+ import os
+ import re
+
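+ # Pipeline: pull raw text out of an image (Tesseract OCR) or a PDF (PyMuPDF),
+ # extract keywords with YAKE, retrieve the most relevant sentence for each
+ # keyword via TF-IDF cosine similarity, and optionally read it aloud with gTTS.
+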
+ st.title("Extract info from Files")
+
+ st.sidebar.title('Hyper Params')
+
+ menu = ["Image", "Dataset", "DocumentFiles", "About"]
+ choice = st.sidebar.selectbox("Select the type of data", menu)
+
+ no_of_keys = st.sidebar.slider('Select the no. of keywords', 1, 20, 2, 2)
+
+ output = st.selectbox('Select the type of output', ('keys', 'response'))
+
+ # Pre-processing filters for the images (selected but not applied yet; see
+ # the note in tes_image).
+ filters = ['Gaussian', 'Low pass', 'High Pass', 'System defined']
+ img_filter = st.sidebar.selectbox("Select the type of filter to preprocess the image", filters)
+
+ # Tesseract binary for local Windows runs; on the deployed (Linux) host,
+ # pytesseract falls back to the `tesseract` found on PATH.
+ tes = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
+ if os.path.exists(tes):
+     pytesseract.pytesseract.tesseract_cmd = tes
+
+ # YAKE keyword-extractor configuration; dedupLim expects a value in [0, 1],
+ # so the 1-10 slider value is scaled down.
+ language = 'en'
+ max_ngram_size = st.sidebar.slider('Select the parameter for ngram', 1, 20, 3, 2)
+ deduplication_threshold = st.sidebar.slider('Select the parameter for DD threshold', 1, 10, 9, 1)
+ deduplication_threshold = deduplication_threshold / 10
+ numOfKeywords = 100
+ custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
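+ # Note: YAKE scores are inverse relevance -- a lower score marks a more
+ # relevant keyword -- and extract_keywords() returns (keyword, score) pairs
+ # sorted best (lowest score) first.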
+
+ lemmer = nltk.stem.WordNetLemmatizer()
+
+ def LemTokens(tokens):
+     return [lemmer.lemmatize(token) for token in tokens]
+
+ # Map every punctuation character to None for str.translate.
+ remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
+
+ def LemNormalize(text):
+     # Lowercase, strip punctuation, tokenize, and lemmatize.
+     return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))
+
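+ # For each selected keyword, rees() appends the keyword to the sentence
+ # tokens, TF-IDF-vectorizes everything with LemNormalize, and scores the
+ # keyword row against all rows with cosine similarity; the second-highest
+ # match is the best real sentence (the highest is the keyword itself), which
+ # becomes the "response" shown and optionally spoken.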
+ def rees(glo_text, keys):
+     for key in keys[:no_of_keys]:
+         sent_tokens = nltk.sent_tokenize(glo_text)
+         sent_tokens.append(key)
+         TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
+         tfidf = TfidfVec.fit_transform(sent_tokens)
+         vals = cosine_similarity(tfidf[-1], tfidf)
+         # Index -1 is the keyword itself, so -2 is the closest real sentence.
+         idx = vals.argsort()[0][-2]
+         response = sent_tokens[idx]
+         if output == 'response':
+             st.write(' - ' + key + ': ' + response)
+         else:
+             st.write(' - ' + key)
+         # Keep spaces when stripping non-alphanumerics, otherwise gTTS is
+         # handed one long run-together word. gTTS synthesizes the MP3 through
+         # Google's TTS endpoint, so this step needs network access at runtime.
+         response = re.sub("[^a-zA-Z0-9 ]", "", response)
+         myobj = gTTS(text=response, lang=language, slow=False)
+         myobj.save("audio.mp3")
+         st.audio("audio.mp3", format='audio/mp3')
+         os.remove("audio.mp3")
+
+ def load_image(image_file):
+     # Preview the upload and OCR it; image_to_string also accepts a lang=
+     # argument when a specific Tesseract language pack is installed.
+     img = Image.open(image_file)
+     st.image(img, width=250)
+     text = pytesseract.image_to_string(img)
+     img.close()
+     return text
+
+ def load_pdf(data_file):
+     # Read the uploaded PDF from memory with PyMuPDF, gather the text of
+     # every page once, and extract keywords from the full document.
+     doc = fitz.open(stream=data_file.read(), filetype="pdf")
+     text = ""
+     for page in doc:
+         text += page.get_text()
+     glo_text = text
+     keywords = custom_kw_extractor.extract_keywords(text)
+
+     for kw in keywords[::-1]:
+         if kw[1] > 0.1:
+             keys.append(kw[0])
+     doc.close()
+     return glo_text, keys
+
+ # Keywords collected by the loaders (shared module-level list).
+ keys = []
+
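+ # tes_image mirrors load_pdf for a single uploaded image: OCR the image,
+ # run YAKE on the recognized text, and collect keywords scoring above 0.1.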
+ def tes_image(image_file):
+     if image_file is not None:
+         # add filters if time permits
+         glo_text = ''
+         text = load_image(image_file)
+         glo_text += text
+         keywords = custom_kw_extractor.extract_keywords(text)
+
+         for kw in keywords[::-1]:
+             if kw[1] > 0.1:
+                 keys.append(kw[0])
+
+         return glo_text, keys
+
+ def tes_doc(data_file):
+     if data_file is not None:
+         tup = load_pdf(data_file)
+         return tup
+
+ def convert_df_to_text(df):
+     pass  # implement keyword-to-text here using the key2text package
+
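+ # A minimal placeholder sketch for the stub above until key2text is wired in
+ # (assumption: flatten every cell to text and reuse the YAKE extractor):
+ # def convert_df_to_text(df):
+ #     text = ' '.join(df.astype(str).values.flatten())
+ #     return custom_kw_extractor.extract_keywords(text)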
+ if choice == "Image":
+     st.subheader("Image")
+     image_file = st.file_uploader("Upload Images", type=["png", "jpg", "jpeg"])
+     if image_file is not None:
+         file_details = {"filename": image_file.name, "filetype": image_file.type, "filesize": image_file.size}
+         st.write(file_details)
+         glo_text, keys = tes_image(image_file)
+         rees(glo_text, keys)
+
+ elif choice == "Dataset":
+     st.subheader("Dataset")
+     data_file = st.file_uploader("Upload CSV", type=["csv"])
+     if data_file is not None:
+         file_details = {"filename": data_file.name, "filetype": data_file.type, "filesize": data_file.size}
+         st.write(file_details)
+         df = pd.read_csv(data_file)
+         st.write(df)
+         convert_df_to_text(df)
+
+ elif choice == "DocumentFiles":
+     st.subheader("DocumentFiles")
+     # Note: only PDFs are parsed by load_pdf, although docx/txt uploads are
+     # accepted here.
+     docx_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])
+     if st.button("Process"):
+         if docx_file is not None:
+             file_details = {"filename": docx_file.name, "filetype": docx_file.type, "filesize": docx_file.size}
+             st.write(file_details)
+             glo_text, keys = tes_doc(docx_file)
+             rees(glo_text, keys)
requirements.txt ADDED
@@ -0,0 +1,145 @@
+ altair==4.2.0
+ argon2-cffi==21.3.0
+ argon2-cffi-bindings==21.2.0
+ asttokens==2.0.5
+ attrs==21.4.0
+ backcall==0.2.0
+ beautifulsoup4==4.11.1
+ bleach==5.0.0
+ blinker==1.4
+ blis==0.7.7
+ cachetools==5.0.0
+ catalogue==2.0.7
+ certifi==2021.10.8
+ cffi==1.15.0
+ charset-normalizer==2.0.12
+ ci-info==0.2.0
+ # click, colorama, joblib, nltk, and tqdm are listed as plain PyPI names:
+ # local `@ file:///...` conda-build paths cannot be resolved by pip on the
+ # deployment host.
+ click
+ colorama
+ configobj==5.0.6
+ configparser==5.2.0
+ cymem==2.0.6
+ debugpy==1.6.0
+ decorator==5.1.1
+ defusedxml==0.7.1
+ docopt==0.6.2
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
+ entrypoints==0.4
+ etelemetry==0.3.0
+ executing==0.8.3
+ fastjsonschema==2.15.3
+ filelock==3.6.0
+ # The `fitz` package on PyPI is unrelated to PyMuPDF (pinned below), which
+ # provides the `fitz` module the app imports, so it is not listed here.
+ gitdb==4.0.9
+ GitPython==3.1.27
+ gTTS==2.2.4
+ httplib2==0.20.4
+ idna==3.3
+ importlib-metadata==4.11.3
+ ipykernel==6.13.0
+ ipython==8.2.0
+ ipython-genutils==0.2.0
+ ipywidgets==7.7.0
+ isodate==0.6.1
+ jedi==0.18.1
+ jellyfish==0.9.0
+ Jinja2==3.1.1
+ joblib
+ jsonschema==4.4.0
+ jupyter-client==7.3.0
+ jupyter-core==4.10.0
+ jupyterlab-pygments==0.2.2
+ jupyterlab-widgets==1.1.0
+ langcodes==3.3.0
+ lxml==4.8.0
+ MarkupSafe==2.1.1
+ matplotlib-inline==0.1.3
+ mistune==0.8.4
+ murmurhash==1.0.7
+ nbclient==0.6.0
+ nbconvert==6.5.0
+ nbformat==5.3.0
+ nest-asyncio==1.5.5
+ networkx==2.8
+ nibabel==3.2.2
+ nipype==1.7.1
+ nltk
+ notebook==6.4.11
+ numpy==1.22.3
+ packaging==21.3
+ pandas==1.4.2
+ pandocfilters==1.5.0
+ parso==0.8.3
+ pathy==0.6.1
+ pickleshare==0.7.5
+ Pillow==9.1.0
+ preshed==3.0.6
+ prometheus-client==0.14.1
+ prompt-toolkit==3.0.29
+ protobuf==3.20.1
+ prov==2.0.0
+ psutil==5.9.0
+ pure-eval==0.2.2
+ pyarrow==7.0.0
+ pycparser==2.21
+ pydantic==1.8.2
+ pydeck==0.7.1
+ pydot==1.4.2
+ Pygments==2.12.0
+ Pympler==1.0.1
+ PyMuPDF==1.19.6
+ pyparsing==3.0.8
+ pyrsistent==0.18.1
+ pytesseract==0.3.9
+ python-dateutil==2.8.2
+ pytz==2022.1
+ pytz-deprecation-shim==0.1.0.post0
+ # Windows-only packages (pywin32, pywinpty, wincertstore) are omitted: they
+ # fail to install on the Linux deployment host.
+ pyxnat==1.3
+ pyzmq==22.3.0
+ rdflib==6.1.1
+ regex==2022.4.24
+ requests==2.27.1
+ scikit-learn==1.0.2
+ scipy==1.8.0
+ segtok==1.5.11
+ semver==2.13.0
+ Send2Trash==1.8.0
+ simplejson==3.17.6
+ six==1.16.0
+ # `sklearn==0.0` was the deprecated dummy package; scikit-learn is already
+ # pinned above.
+ smart-open==5.2.1
+ smmap==5.0.0
+ soupsieve==2.3.2.post1
+ spacy-legacy==3.0.9
+ spacy-loggers==1.0.2
+ srsly==2.4.3
+ stack-data==0.2.0
+ streamlit==1.8.1
+ tabulate==0.8.9
+ terminado==0.13.3
+ thinc==8.0.15
+ threadpoolctl==3.1.0
+ tinycss2==1.1.1
+ toml==0.10.2
+ toolz==0.11.2
+ tornado==6.1
+ tqdm
+ traitlets==5.1.1
+ traits==6.3.2
+ typer==0.4.1
+ typing_extensions==4.2.0
+ tzdata==2022.1
+ tzlocal==4.2
+ urllib3==1.26.9
+ validators==0.18.2
+ wasabi==0.9.1
+ watchdog==2.1.7
+ wcwidth==0.2.5
+ webencodings==0.5.1
+ widgetsnbextension==3.6.0
+ yake==0.4.8
+ yarg==0.1.9
+ zipp==3.8.0