Spaces:
Runtime error
deployed first version
Browse files
- app.py +159 -0
- requirements.txt +145 -0
app.py
ADDED
@@ -0,0 +1,159 @@
import os
import re
import string

import streamlit as st
import pytesseract
from PIL import Image
# from pdf2image import convert_from_path
import pandas as pd
import yake
import fitz  # PyMuPDF
import nltk
from gtts import gTTS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Tokenizer and lemmatizer resources used by LemNormalize below.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

st.title("Extract info from Files")

st.sidebar.title('Hyper Params')

menu = ["Image", "Dataset", "DocumentFiles", "About"]
choice = st.sidebar.selectbox("Select the type of data", menu)

no_of_keys = st.sidebar.slider('Select the no of keywords', 1, 20, 2, 2)

output = st.selectbox('Select the type of output', ('keys', 'response'))

# Pre-processing filters for images (selected here, not applied yet).
filters = ['Gaussian', 'Low pass', 'High Pass', 'System defined']
img_filter = st.sidebar.selectbox("Select the type of filter to preprocess the image", filters)

# Point pytesseract at the Tesseract binary. The hard-coded path only exists on
# Windows; on a Linux host (e.g. a Hugging Face Space) the binary on PATH is used.
if os.name == 'nt':
    pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

# YAKE keyword extractor, tuned from the sidebar.
language = 'en'
max_ngram_size = st.sidebar.slider('Select the parameter for ngram', 1, 20, 3, 2)
deduplication_threshold = st.sidebar.slider('Select the parameter for DD threshold', 1, 10, 9, 1) / 10
numOfKeywords = 100
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)

lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

def rees(glo_text, keys):
    # For each selected keyword, retrieve the sentence most similar to it
    # (TF-IDF over the sentence tokens + cosine similarity) and read it aloud.
    for key in keys[:no_of_keys]:
        sent_tokens = nltk.sent_tokenize(glo_text)
        sent_tokens.append(key)
        TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
        tfidf = TfidfVec.fit_transform(sent_tokens)
        vals = cosine_similarity(tfidf[-1], tfidf)
        idx = vals.argsort()[0][-2]  # most similar sentence, skipping the keyword itself
        response = sent_tokens[idx]
        if output == 'response':
            st.write(' - ' + key + ': ' + response)
        else:
            st.write(' - ' + key)
        response = re.sub("[^a-zA-Z0-9]", " ", response)  # keep word boundaries for TTS
        myobj = gTTS(text=response, lang=language, slow=False)
        myobj.save("audio.mp3")
        st.audio("audio.mp3", format='audio/ogg')
        os.remove("audio.mp3")

def load_image(image_file):
    img = Image.open(image_file)
    st.image(img, width=250)
    text = pytesseract.image_to_string(img)  # OCR the displayed image
    img.close()
    return text

def load_pdf(data_file):
    # Read the uploaded PDF with PyMuPDF, accumulating each page's text once.
    doc = fitz.open(stream=data_file.read(), filetype="pdf")
    glo_text = ''
    for page in doc:
        glo_text += page.get_text()
    # Collect keywords whose YAKE score exceeds 0.1.
    keywords = custom_kw_extractor.extract_keywords(glo_text)
    for kw in keywords[::-1]:
        if kw[1] > 0.1:
            keys.append(kw[0])
    doc.close()
    return glo_text, keys

keys = []  # keywords collected from the uploaded file, shared by the handlers below

def tes_image(image_file):
    if image_file is not None:
        # TODO: apply the selected pre-processing filter before OCR if time permits.
        glo_text = ''
        text = load_image(image_file)  # a specific OCR language could be passed here
        glo_text += text
        keywords = custom_kw_extractor.extract_keywords(text)
        for kw in keywords[::-1]:
            if kw[1] > 0.1:
                keys.append(kw[0])
        return glo_text, keys

def tes_doc(data_file):
    if data_file is not None:
        return load_pdf(data_file)

def convert_df_to_text(df):
    pass  # implement key-to-text here using the key2text package

if choice == "Image":
    st.subheader("Image")
    image_file = st.file_uploader("Upload Images", type=["png", "jpg", "jpeg"])
    if image_file is not None:
        file_details = {"filename": image_file.name, "filetype": image_file.type, "filesize": image_file.size}
        st.write(file_details)
        glo_text, keys = tes_image(image_file)
        rees(glo_text, keys)

elif choice == "Dataset":
    st.subheader("Dataset")
    data_file = st.file_uploader("Upload CSV", type=["csv"])
    if data_file is not None:
        file_details = {"filename": data_file.name, "filetype": data_file.type, "filesize": data_file.size}
        st.write(file_details)
        df = pd.read_csv(data_file)
        st.write(df)
        convert_df_to_text(df)

elif choice == "DocumentFiles":
    st.subheader("DocumentFiles")
    docx_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])
    if st.button("Process"):
        if docx_file is not None:
            file_details = {"filename": docx_file.name, "filetype": docx_file.type, "filesize": docx_file.size}
            st.write(file_details)
            glo_text, keys = tes_doc(docx_file)
            rees(glo_text, keys)
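
The retrieval step inside rees() can be exercised on its own: the keyword is appended to the sentence list, everything is vectorized with TF-IDF, and the sentence with the highest cosine similarity to the keyword (excluding the keyword itself) is returned. A minimal standalone sketch of that step, with a hypothetical sample_text and keyword rather than the app's OCR/PDF input:

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')

sample_text = ("Tesseract is an OCR engine. It converts images of text into strings. "
               "YAKE then extracts keywords from those strings.")
keyword = "keywords"

sents = nltk.sent_tokenize(sample_text)
sents.append(keyword)                       # the query goes in last
tfidf = TfidfVectorizer(stop_words='english').fit_transform(sents)
sims = cosine_similarity(tfidf[-1], tfidf)  # similarities of the query to every sentence
best = sims.argsort()[0][-2]                # [-1] is the query itself, [-2] the best match
print(sents[best])                          # expected: the sentence mentioning keywords
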
requirements.txt
ADDED
@@ -0,0 +1,145 @@
altair==4.2.0
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.0.5
attrs==21.4.0
backcall==0.2.0
beautifulsoup4==4.11.1
bleach==5.0.0
blinker==1.4
blis==0.7.7
cachetools==5.0.0
catalogue==2.0.7
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.12
ci-info==0.2.0
# click, colorama, joblib, nltk, tqdm: plain PyPI names; the original file:///
# conda-build pins point at the dev machine and cannot resolve on the build host.
click
colorama
configobj==5.0.6
configparser==5.2.0
cymem==2.0.6
debugpy==1.6.0
decorator==5.1.1
defusedxml==0.7.1
docopt==0.6.2
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
entrypoints==0.4
etelemetry==0.3.0
executing==0.8.3
fastjsonschema==2.15.3
filelock==3.6.0
# fitz==0.0.1.dev2 is omitted: the PyPI 'fitz' stub shadows the fitz module
# that PyMuPDF (pinned below) provides.
gitdb==4.0.9
GitPython==3.1.27
gTTS==2.2.4
httplib2==0.20.4
idna==3.3
importlib-metadata==4.11.3
ipykernel==6.13.0
ipython==8.2.0
ipython-genutils==0.2.0
ipywidgets==7.7.0
isodate==0.6.1
jedi==0.18.1
jellyfish==0.9.0
Jinja2==3.1.1
joblib
jsonschema==4.4.0
jupyter-client==7.3.0
jupyter-core==4.10.0
jupyterlab-pygments==0.2.2
jupyterlab-widgets==1.1.0
langcodes==3.3.0
lxml==4.8.0
MarkupSafe==2.1.1
matplotlib-inline==0.1.3
mistune==0.8.4
murmurhash==1.0.7
nbclient==0.6.0
nbconvert==6.5.0
nbformat==5.3.0
nest-asyncio==1.5.5
networkx==2.8
nibabel==3.2.2
nipype==1.7.1
nltk
notebook==6.4.11
numpy==1.22.3
packaging==21.3
pandas==1.4.2
pandocfilters==1.5.0
parso==0.8.3
pathy==0.6.1
pickleshare==0.7.5
Pillow==9.1.0
preshed==3.0.6
prometheus-client==0.14.1
prompt-toolkit==3.0.29
protobuf==3.20.1
prov==2.0.0
psutil==5.9.0
pure-eval==0.2.2
pyarrow==7.0.0
pycparser==2.21
pydantic==1.8.2
pydeck==0.7.1
pydot==1.4.2
Pygments==2.12.0
Pympler==1.0.1
PyMuPDF==1.19.6
pyparsing==3.0.8
pyrsistent==0.18.1
pytesseract==0.3.9
python-dateutil==2.8.2
pytz==2022.1
pytz-deprecation-shim==0.1.0.post0
# Windows-only packages (pywin32, pywinpty, wincertstore) are omitted:
# they do not install on the Linux image that Spaces build on.
pyxnat==1.3
pyzmq==22.3.0
rdflib==6.1.1
regex==2022.4.24
requests==2.27.1
scikit-learn==1.0.2
scipy==1.8.0
segtok==1.5.11
semver==2.13.0
Send2Trash==1.8.0
simplejson==3.17.6
six==1.16.0
# The deprecated 'sklearn==0.0' meta-package is omitted; scikit-learn above
# is the real dependency.
smart-open==5.2.1
smmap==5.0.0
soupsieve==2.3.2.post1
spacy-legacy==3.0.9
spacy-loggers==1.0.2
srsly==2.4.3
stack-data==0.2.0
streamlit==1.8.1
tabulate==0.8.9
terminado==0.13.3
thinc==8.0.15
threadpoolctl==3.1.0
tinycss2==1.1.1
toml==0.10.2
toolz==0.11.2
tornado==6.1
tqdm
traitlets==5.1.1
traits==6.3.2
typer==0.4.1
typing_extensions==4.2.0
tzdata==2022.1
tzlocal==4.2
urllib3==1.26.9
validators==0.18.2
wasabi==0.9.1
watchdog==2.1.7
wcwidth==0.2.5
webencodings==0.5.1
widgetsnbextension==3.6.0
yake==0.4.8
yarg==0.1.9
zipp==3.8.0
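
Before redeploying, a quick smoke test (a hypothetical helper, not part of this commit) can confirm the pinned environment satisfies every import app.py makes:

# check_env.py: import everything app.py needs and report what is missing.
import importlib

modules = ["streamlit", "pytesseract", "PIL", "pandas", "yake",
           "fitz", "nltk", "gtts", "sklearn"]
missing = []
for name in modules:
    try:
        importlib.import_module(name)
    except ImportError as err:
        missing.append((name, str(err)))

print("missing:", missing or "none")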