Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

File size: 9,822 Bytes

9c37e72
dba2773
 
9c37e72
1a16a58
9c37e72
0c5b55b
9c37e72
0c5b55b
9c37e72
 
 
1a16a58
9c37e72
 
 
0c5b55b
9c37e72
 
6e58c44
4834995
bd18577
 
 
 
 
 
 
2f51bd6
c75cc74
9eb1dec
9c37e72
234b51a
09d4214
 
 
06dd768
09d4214
0842639
baf370a
 
06dd768
9c37e72
 
 
2c22d61
9c37e72
 
 
 
1ecea99
fa73ddc
419e04c
9c37e72
9eb1dec
 
dd55b25
8c11fa3
 
dd55b25
 
dec4937
6fe4e2e
dd55b25
 
 
 
 
 
8c11fa3
dd55b25
10ef8bd
dd55b25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6ec27d
9eb1dec
 
9c37e72
 
 
 
 
 
9eb1dec
 
9531d63
 
 
 
9c37e72
9eb1dec
 
9c37e72
 
 
 
 
 
 
 
e113d20
3f6c2be
9c37e72
1a16a58
 
 
 
2c22d61
f1ae271
 
 
0a7287e
 
2c22d61
 
63c4e55
 
 
cb6a8b6
2bfe916
7b01ac0
 
 
06b5ba0
ea8c799
e0988f4
 
4c8e372
e8b7e7a
8cc1e8b
6e25163
dd55b25
3e4f1f9
dd55b25
ed0375d
dd55b25
3e4f1f9
8c11fa3
f5aabdb
 
3e4f1f9
dd55b25
3e4f1f9
dd55b25
c6ec27d
c08e6a6
1e17e2b
cd370f7
 
 
c6ec27d
cd370f7
1e17e2b
cd370f7
 
 
fda8d0d
9eb1dec
b9b4937
 
9eb1dec
b9b4937
9eb1dec
b9b4937
 
 
 
9eb1dec
b9b4937
 
9eb1dec
4387995
 
 
 
 
2c22d61
 
 
 
 
 
 
c6ec27d
c95ac40
9eb1dec
b9b4937
 
c95ac40
b9b4937
9c37e72
0d9c3fb
9c37e72
b9b4937
 
 
ef54be3
9eb1dec
ef54be3
9c37e72
b9b4937

"""
#App: NLP App with Streamlit
Credits: Streamlit Team, Marc Skov Madsen(For Awesome-streamlit gallery)
Description
This is a Natural Language Processing (NLP) based application that is useful for basic NLP tasks such as the following:

+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy

+ Named Entity Recognition(NER)/Trigger word detection using SpaCy

+ Sentiment Analysis using TextBlob

+ Document/Text Summarization using Gensim/T5 for both Bangla Extractive and English Abstractive.

This is built with Streamlit Framework, an awesome framework for building ML and NLP tools.
Purpose
To perform basic and useful NLP tasks with Streamlit, Spacy, Textblob, and Gensim
"""
# Core Pkgs
import os
# Set before `transformers` is imported further down; presumably this silences
# the Hugging Face tokenizers parallelism/fork warning — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Disabled one-off environment setup (Tesseract language data, pytesseract,
# poppler); kept for reference only, none of it runs.
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')

#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
import streamlit as st
# Page configuration is issued right after importing Streamlit, ahead of every
# other st.* call in this file.
# NOTE(review): page_title reads "Anomaly_Detection_Tool" while the page
# heading is "NLP APPLICATION" — confirm which name is intended.
st.set_page_config(page_title="Anomaly_Detection_Tool", layout="wide", initial_sidebar_state="expanded")
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
import docx2txt
from PIL import Image 
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image


# NLP Pkgs
from textblob import TextBlob 
import spacy
#from gensim.summarization import summarize
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
#@st.experimental_singleton
@st.cache_resource(experimental_allow_widgets=True)
def read_pdf(file):
    """Extract the text of every page of a PDF into a single string.

    Args:
        file: PDF source handed straight to ``PdfFileReader`` (the caller in
            this file passes the Streamlit upload object).

    Returns:
        A string that begins with one space, followed by each page's
        extracted text with one trailing space per page.
    """
    reader = PdfFileReader(file)
    page_texts = [
        reader.getPage(page_index).extractText() + " "
        for page_index in range(reader.numPages)
    ]
    # The leading space is part of the returned value, so it is kept here.
    return " " + "".join(page_texts)
# def read_pdf_with_pdfplumber(file):
#     all_page_text=" "
# #     all_page_text = ""
#     with pdfplumber.open(file) as pdf:
# 	    page = pdf.pages[0]
#     ge=page.to_image()
#     img = Image.open(ge)
#     img = img.save("img.png")
#     image_name = cv2.imread("img.png")
# # get co-ordinates to c
# #         return page.extract_text()
# #         get co-ordinates to cr  
# # # get co-ordinates to cr
#     text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
#     all_page_text += text + " " #page.extractText()
#     return all_page_text
# Page heading; emitted at module level, so it is drawn before main() adds its widgets.
st.title("NLP APPLICATION")
#@st.experimental_singleton
@st.cache_resource(experimental_allow_widgets=True)
def text_analyzer(my_text):
    """Pair every token of ``my_text`` with its lemma.

    Args:
        my_text: Text to run through the spaCy ``en_core_web_sm`` pipeline.

    Returns:
        A list with one string per token; each string holds the token text
        on a ``"Token":`` line and its lemma on a ``"Lemma":`` line.
    """
    pipeline = spacy.load('en_core_web_sm')
    token_lines = []
    for tok in pipeline(my_text):
        token_lines.append(f'"Token":{tok.text},\n"Lemma":{tok.lemma_}')
    return token_lines
#@st.experimental_singleton
@st.cache_resource(experimental_allow_widgets=True)
def load_models():
    """Load the ``gpt2-large`` tokenizer and language model.

    Returns:
        A ``(tokenizer, model)`` tuple built with ``AutoTokenizer`` and
        ``GPT2LMHeadModel`` from the same checkpoint name.
    """
    checkpoint = 'gpt2-large'
    gpt2_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    gpt2_model = GPT2LMHeadModel.from_pretrained(checkpoint)
    return gpt2_tokenizer, gpt2_model
# Function For Extracting Entities
#@st.experimental_singleton
@st.cache_resource(experimental_allow_widgets=True)
def entity_analyzer(my_text):
    """Collect the tokens and named entities spaCy finds in ``my_text``.

    Args:
        my_text: Text to run through the spaCy ``en_core_web_sm`` pipeline.

    Returns:
        A one-element list holding a single string: the list of token texts
        on a ``"Token":`` line and the list of ``(entity text, label)``
        tuples on an ``"Entities":`` line.
    """
    doc = spacy.load('en_core_web_sm')(my_text)
    token_texts = [tok.text for tok in doc]
    labelled_entities = [(ent.text, ent.label_) for ent in doc.ents]
    report = f'"Token":{token_texts},\n"Entities":{labelled_entities}'
    return [report]
def main():
    """Render the NLP demo page and run whichever services the user ticks.

    The input text comes from, in priority order: an uploaded PDF (text
    extraction plus a line-range slider), an uploaded image (OCR), a camera
    photo (OCR), or the free-text box. Nothing below the input widgets is
    drawn until the user has typed a message or a photo/upload widget has
    changed (tracked in ``st.session_state["photo"]``).

    The services (named entities, sentiment, spell correction, GPT-2 text
    generation, T5 summarization) each sit behind a checkbox and write
    their result straight to the page. The function returns ``None``.
    """
    st.markdown("""
    	#### Description
    	##This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows:
+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy
+ Named Entity Recognition(NER)/Trigger word detection using SpaCy
+ Sentiment Analysis using TextBlob
+ Document/Text Summarization using T5 for English Abstractive.
    	""")

    def change_photo_state():
        # Widget callback: remember that a photo or upload has been supplied.
        st.session_state["photo"] = "done"

    st.subheader("Please, feed your image/text, features/services will appear automatically!")
    message = st.text_input("Type your text here!")
    camera_photo = st.camera_input("Take a photo, Containing English texts", on_change=change_photo_state)
    uploaded_photo = st.file_uploader("Upload your PDF", type=['jpg', 'png', 'jpeg', 'pdf'], on_change=change_photo_state)
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    # Guard clause: no input yet, so none of the services are offered.
    if not (st.session_state["photo"] == "done" or message):
        return

    if uploaded_photo and uploaded_photo.type == 'application/pdf':
        tet = read_pdf(uploaded_photo)
        # A "line" is approximated as 70 characters of extracted text.
        chars_per_line = 7 * 10
        total_lines = len(tet) // chars_per_line
        values = st.slider('Select a approximate number of lines to see and summarize', value=[0, total_lines])
        start_line, end_line = values
        if start_line != total_lines:
            text = tet[start_line * chars_per_line:end_line * chars_per_line]
        else:
            # Start handle parked on the last full line: show the remaining
            # tail. The offset must be in characters, not lines.
            text = tet[start_line * chars_per_line:]
        st.success(text)
    elif uploaded_photo:
        text = _ocr_image(uploaded_photo)
        st.success(text)
    elif camera_photo:
        text = _ocr_image(camera_photo)
        st.success(text)
    else:
        # Neither an upload nor a camera photo: fall back to the typed text.
        text = message

    if st.checkbox("Show Named Entities English/Bangla"):
        st.cache_data.clear()
        st.json(entity_analyzer(text))

    if st.checkbox("Show Sentiment Analysis for English"):
        st.cache_data.clear()
        st.success(TextBlob(text).sentiment)

    if st.checkbox("Spell Corrections for English"):
        st.cache_data.clear()
        st.success(TextBlob(text).correct())

    if st.checkbox("Text Generation"):
        st.cache_data.clear()
        tokenizer, model = load_models()
        input_ids = tokenizer(text, return_tensors='pt').input_ids
        # NOTE(review): the label mentions contrastive search, but generate()
        # only receives max_length here — confirm the intended decoding.
        st.text("Using Hugging Face Transformer, Contrastive Search ..")
        output = model.generate(input_ids, max_length=128)
        st.success(tokenizer.decode(output[0], skip_special_tokens=True))

    if st.checkbox("Mark to English Text Summarization!"):
        st.cache_data.clear()
        tokenizer, model = _load_summarizer()
        inputs = tokenizer.encode(
            "summarize: " + text,
            return_tensors='pt',
            max_length=512,
            truncation=True,
        )
        summary_ids = model.generate(inputs, max_length=150, min_length=80, length_penalty=5., num_beams=2)
        # Strip special tokens so the user sees only the summary text, as the
        # text-generation branch already does.
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        st.success(summary)

    if st.button("refresh"):
        st.cache_data.clear()
        st.experimental_rerun()


@st.cache_resource
def _load_summarizer():
    """Load the ``t5-base`` tokenizer and model once and reuse them across reruns."""
    tokenizer = AutoTokenizer.from_pretrained('t5-base')
    model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
    return tokenizer, model


def _ocr_image(image_file):
    """Return the text Tesseract reads from an uploaded or captured image.

    The image is decoded in memory rather than through a shared "img.png"
    in the working directory, so concurrent sessions cannot overwrite each
    other's picture. The array is converted to BGR channel order, which is
    what ``cv2.imread`` produced before.
    """
    rgb_pixels = np.array(Image.open(image_file).convert("RGB"))
    bgr_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
    return pytesseract.image_to_string(bgr_pixels)
# Entry point: build the page only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()