Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

App Files Files Community

Text-Summarization-and-NLP-tasks / app.py

Soumen

Update app.py

991d44a over 2 years ago

raw

history blame

5.39 kB

	"""
	#App: NLP App with Streamlit
	Credits: Streamlit Team, Marc Skov Madsen(For Awesome-streamlit gallery)
	Description
	This is a Natural Language Processing (NLP) based app that demonstrates basic NLP tasks, such as the following:

	+ Tokenization & Lemmatization using Spacy

	+ Named Entity Recognition(NER) using SpaCy

	+ Sentiment Analysis using TextBlob

	+ Document/Text Summarization using Gensim/T5 for both Bangla and English

	This is built with Streamlit Framework, an awesome framework for building ML and NLP tools.
	Purpose
	To perform basic and useful NLP tasks with Streamlit, spaCy, TextBlob and Gensim
	"""
	# Core Pkgs
	import os
	#os.system('sudo apt-get install tesseract-ocr-eng')
	#os.system('sudo apt-get install tesseract-ocr-ben')

	#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
	#os.system('gunzip ben.traineddata.gz ')
	#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
	#os.system('pip install -q pytesseract')
	import streamlit as st
	import os
	import torch
	from transformers import AutoTokenizer, AutoModelWithLMHead

	# NLP Pkgs
	from textblob import TextBlob
	import spacy
	from gensim.summarization import summarize
	import requests
	import cv2
	import numpy as np
	import pytesseract
	#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
	from PIL import Image
	@st.cache
	def text_analyzer(my_text):
	    """Tokenize *my_text* with spaCy and pair every token with its lemma.

	    Returns a list holding one formatted string per token; each string
	    shows the token text and, on the following line, its lemma.
	    """
	    pipeline = spacy.load('en_core_web_sm')
	    template = '"Token":{},\n"Lemma":{}'
	    formatted = []
	    for tok in pipeline(my_text):
	        formatted.append(template.format(tok.text, tok.lemma_))
	    return formatted

	# Named-entity extraction
	@st.cache
	def entity_analyzer(my_text):
	    """Report the tokens and named entities spaCy finds in *my_text*.

	    Returns a single-element list whose only item is a string containing
	    the list of token texts followed by the list of (entity text, label)
	    pairs.
	    """
	    parsed = spacy.load('en_core_web_sm')(my_text)
	    words = []
	    for tok in parsed:
	        words.append(tok.text)
	    found = [(ent.text, ent.label_) for ent in parsed.ents]
	    report = '"Token":{},\n"Entities":{}'.format(words, found)
	    return [report]


	def _ocr_image(image_file, bangla_label):
	    """Run Tesseract OCR on an uploaded or captured image and return its text.

	    image_file: the file object handed back by ``st.file_uploader`` or
	        ``st.camera_input``.
	    bangla_label: label of the checkbox that switches OCR to the Bangla
	        ("ben") language data; unchecked uses pytesseract's default.
	    """
	    # Pass the decoded image straight to pytesseract instead of writing it to
	    # a fixed "img.png" and re-reading it: that file was shared by every
	    # session, and saving as PNG fails for modes PNG cannot store.
	    image = Image.open(image_file).convert("RGB")
	    if st.checkbox(bangla_label):
	        return pytesseract.image_to_string(image, lang="ben")
	    return pytesseract.image_to_string(image)


	def main():
	    """Render the Streamlit NLP app.

	    The page offers three mutually exclusive text tools (spell correction,
	    named-entity extraction, sentiment analysis) followed by a summary
	    section that takes text from a camera photo, an uploaded image or a
	    text box and summarizes it with Gensim or T5.
	    """
	    st.title("Streamlit NLP APP")
	    st.markdown("""
	#### Description
	+ This is a Natural Language Processing(NLP) Based App useful for basic NLP task
	NER,Sentiment, Spell Corrections and Summarization
	""")

	    # The tools form an if/elif chain, so each later checkbox is only
	    # rendered while the earlier ones are unchecked.
	    if st.checkbox("Spell Corrections"):
	        st.subheader("Correct Your Text")
	        message = st.text_area("Enter the Text","Type please ..")
	        if st.button("Spell Corrections"):
	            st.text("Using TextBlob ..")
	            st.success(TextBlob(message).correct())
	    elif st.checkbox("Show Named Entities"):
	        st.subheader("Analyze Your Text")
	        message = st.text_area("Enter your Text","Typing Here ..")
	        if st.button("Extract"):
	            entity_result = entity_analyzer(message)
	            st.json(entity_result)
	    elif st.checkbox("Show Sentiment Analysis"):
	        st.subheader("Analyse Your Text")
	        message = st.text_area("Enter Text plz, Type Here ...")
	        if st.button("Analyze"):
	            blob = TextBlob(message)
	            result_sentiment = blob.sentiment
	            st.success(result_sentiment)

	    def change_photo_state():
	        # Widget callback: remember that the user has supplied a photo.
	        st.session_state["photo"]="done"

	    st.subheader("Summary section, feed your image!")
	    camera_photo = st.camera_input("Take a photo, Containing English or Bangla texts", on_change=change_photo_state)
	    uploaded_photo = st.file_uploader("Upload Image, Containing English or Bangla texts",type=['jpg','png','jpeg'], on_change=change_photo_state)
	    message = st.text_input("Or, drop your text here, only English text!")
	    if "photo" not in st.session_state:
	        st.session_state["photo"]="not done"

	    if st.session_state["photo"]=="done" or message:
	        # Default to the typed text; OCR output from an image overrides it
	        # (the camera photo wins when both images are present). Binding the
	        # default first guarantees `text` exists for the summarizers below.
	        text = message
	        if uploaded_photo:
	            text = _ocr_image(uploaded_photo, "Mark here to see in Bangla for Bangla Images only")
	            st.success(text)
	        if camera_photo:
	            text = _ocr_image(camera_photo, "Mark here to see Bangla")
	            st.success(text)

	        if st.checkbox("Mark here, Text Summarization for English or Bangla!"):
	            try:
	                summary_result = summarize(text)
	            except ValueError as err:
	                # gensim's summarize rejects input it cannot rank (e.g. a
	                # single sentence); show that instead of crashing the page.
	                st.warning(str(err))
	            else:
	                st.success(summary_result)
	        elif st.checkbox("Mark here, Better Text Summarization for English only!"):
	            tokenizer = AutoTokenizer.from_pretrained('t5-base')
	            model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
	            inputs = tokenizer.encode("summarize: " + text,
	                                      return_tensors='pt',
	                                      max_length=512,
	                                      truncation=True)
	            summary_ids = model.generate(inputs, max_length=150, min_length=80, length_penalty=5., num_beams=2)
	            # Drop <pad> / </s> markers so only the summary text is shown.
	            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
	            st.success(summary)

	    st.sidebar.subheader("About App")
	    st.sidebar.subheader("By")
	    st.sidebar.text("Soumen Sarker")

	# Build the page only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()