"""
#App: NLP App with Streamlit
Description
This is a Natural Language Processing(NLP) base Application that is useful for
Document/Text Summarization from Bangla images and English Images/PDF files.
"""
# Core Pkgs
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')
#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
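# The commented-out system commands above install the Tesseract language packs and
# poppler that the OCR/PDF features need; on a Hugging Face Space these are typically
# supplied via packages.txt / requirements.txt instead, so they are kept here for reference.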
import streamlit as st
st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")
st.title("Chatbot and Bangla/English Text Summarizer: Upload Images/Pdf or input texts to summarize!")
import torch
import docx2txt
from PIL import Image
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
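# Hugging Face Inference API endpoints used by the summarizers below:
#   API_URL0 -> csebuetnlp/mT5_multilingual_XLSum (multilingual summarization, used for Bangla)
#   API_URL1 -> Michael-Vptn/text-summarization-t5-base (English summarization)
#   API_URL2 -> gpt2 (defined but unused in this version)
# Each call is a POST with a "Bearer" token in the Authorization header and a JSON payload.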
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = {"Authorization": "Bearer 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"}
API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
headers1 = {"Authorization": "Bearer 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"}
API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
headers2 = {"Authorization": "Bearer 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"}
@st.cache
def read_pdf(file):
    # Extract raw text from every page of the uploaded PDF (PyPDF2 legacy API).
    pdfReader = PdfFileReader(file)
    count = pdfReader.numPages
    all_page_text = ""
    for i in range(count):
        page = pdfReader.getPage(i)
        all_page_text += page.extractText() + " "
    return all_page_text
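# Note: PdfFileReader.extractText() often returns sparse text for scanned or image-only
# PDFs; the commented-out pdfplumber variant below is an alternative extractor.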
# def read_pdf_with_pdfplumber(file):
# # Open the uploaded PDF file with pdfplumber
# with pdfplumber.open(file) as pdf:
# extracted_text = ''
# for page in pdf.pages:
# extracted_text += page.extract_text()
# # Display the extracted text
# #st.text(extracted_text)
# return extracted_text
def engsum(output):
    # Summarize English text through the hosted t5-base summarization endpoint.
    def query(payload):
        response = requests.post(API_URL1, headers=headers1, json=payload)
        return response.json()
    # Generation options such as min_length go under "parameters" for the Inference API.
    out = query({
        "inputs": output,
        "parameters": {"min_length": 300},
    })
    if isinstance(out, list) and out[0].get("generated_text"):
        text_output = out[0]["generated_text"]
        st.success(text_output)
    else:
        st.warning("Summarization failed or the model is still loading; please try again.")
def bansum(text):
    # Summarize Bangla (or other) text through the multilingual XLSum endpoint.
    def query(payload):
        response = requests.post(API_URL0, headers=headers0, json=payload)
        return response.json()
    out = query({"inputs": text, "parameters": {"min_length": 300}})
    if isinstance(out, list) and out[0].get("summary_text"):
        text_output = out[0]["summary_text"]
        st.success(text_output)
    else:
        st.warning("Summarization failed or the model is still loading; please try again.")
# @st.cache
# def save(l):
# return l
#@st.cache
def main():
    camera_photo = None
    # Track whether the user has supplied a photo (camera or upload) across reruns.
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"
    def change_photo_state():
        st.session_state["photo"] = "done"
    with st.container():
        c1, c2, c3 = st.columns([2, 2, 1])
        message = c1.text_input("Type your text here!")
        Capture = True
        if c2.button("Capture"):
            camera_photo = c2.camera_input("Capture a photo to summarize: ", on_change=change_photo_state)
        if c2.button("Stop Camera"):
            Capture = False
        uploaded_photo = c3.file_uploader("Upload your Images/PDF", type=['jpg', 'png', 'jpeg', 'pdf'], on_change=change_photo_state)
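    # Route the input: uploaded PDF, uploaded image, camera photo, or typed text.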
if st.session_state["photo"]=="done" or message:
if uploaded_photo and uploaded_photo.type=='application/pdf':
tet = read_pdf(uploaded_photo)
# with tempfile.NamedTemporaryFile(delete=False) as temp_file:
# temp_file.write(uploaded_photo.read())
# temp_file_path = temp_file.name
# loader = PyPDFLoader(temp_file_path)
# if loader:
# text.extend(loader.load())
# os.remove(temp_file_path)
# text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len)
# text_chunks = text_splitter.split_documents(text)
            values = st.slider('Select an approximate number of lines to see and summarize', value=[0, len(tet)//(7*100)])
            text = tet[values[0]*7*10:values[1]*10*100] if values[0] != len(tet)//(10*100) else tet[len(tet)//(10*100):]
            #st.success(type(text_chunks))
            if st.button("English PDF Summarize"):
                st.subheader("Selected text to summarize: ")
                st.success(text)
                st.subheader("Summarized Text: ")
                engsum(text)
        elif uploaded_photo and uploaded_photo.type != 'application/pdf':
            text = None
            # Save the uploaded image to disk and re-read it with OpenCV for pytesseract.
            img = Image.open(uploaded_photo)
            img.save("img.png")
            img = cv2.imread("img.png")
            st.text("Select the summarization type:")
            c4, c5 = st.columns([1, 7])
            if c4.button("BENGALI"):
                text = pytesseract.image_to_string(img, lang="ben")
                st.success(text)
                st.subheader("সারাংশ/সারমর্ম")
                bansum(text)
            if c5.button("ENGLISH"):
                text = pytesseract.image_to_string(img)
                st.success(text)
                st.subheader("Summarized Text")
                engsum(text)
        elif camera_photo:
            text = None
            # Save the captured photo to disk and re-read it with OpenCV for pytesseract.
            img = Image.open(camera_photo)
            img.save("img.png")
            img = cv2.imread("img.png")
            #text = pytesseract.image_to_string(img) if st.checkbox("Bangla") else pytesseract.image_to_string(img, lang="ben")
            st.text("Select the summarization type:")
            #st.image(camera_photo, caption='Uploaded Image', use_column_width=True)
            c6, c7 = st.columns([1, 7])
            if c6.button("Bangla"):
                text = pytesseract.image_to_string(img, lang="ben")
                st.success(text)
                st.subheader("সারাংশ/সারমর্ম")
                bansum(text)
            if c7.button("English"):
                text = pytesseract.image_to_string(img)
                st.success(text)
                st.subheader("Summarized Text")
                engsum(text)
        else:
            # No file or photo: summarize the typed message instead.
            text = message
            c8, c9 = st.columns([1, 7])
            if c8.button("Bangla"):
                bansum(text)
            if c9.button("English"):
                engsum(text)
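    # --- Chatbot section: BlenderBot (facebook/blenderbot-400M-distill) via transformers ---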
    with st.container():
        from streamlit_chat import message as st_message
        from transformers import BlenderbotTokenizer
        from transformers import BlenderbotForConditionalGeneration
        st.title("Chatbot!!!")
        @st.experimental_singleton
        def get_models():
            # It may be necessary for other frameworks to cache the model as well;
            # PyTorch seems to keep an internal state of the conversation.
            model_name = "facebook/blenderbot-400M-distill"
            tokenizer = BlenderbotTokenizer.from_pretrained(model_name)
            model = BlenderbotForConditionalGeneration.from_pretrained(model_name)
            return tokenizer, model
        if "history" not in st.session_state:
            st.session_state.history = []
# st.title("Hello bot: ")
        def generate_answer():
            # Generate a BlenderBot reply to the latest user input and store both turns.
            tokenizer, model = get_models()
            user_message = st.session_state.input_text
            inputs = tokenizer(st.session_state.input_text, return_tensors="pt")
            result = model.generate(**inputs)
            message_bot = tokenizer.decode(
                result[0], skip_special_tokens=True
            )  # .replace("<s>", "").replace("</s>", "")
            st.session_state.history.append({"message": user_message, "is_user": True})
            st.session_state.history.append({"message": message_bot, "is_user": False})
        # Replay the chat history, then render the input box and the reset button.
        for chat in st.session_state.history:
            st_message(**chat)
        st.text_input("Talk to the bot", key="input_text", on_change=generate_answer)
        if st.button("Refresh/New Chat"):
            st.session_state.history = []
if __name__ == "__main__":
    main()