Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

App Files Files Community

Text-Summarization-and-NLP-tasks / app.py

Soumen

Update app.py

b808107 over 1 year ago

raw

history blame

7.65 kB

	"""
	#App: NLP App with Streamlit
	Description
	This is a Natural Language Processing (NLP) based application for summarizing
	documents and text extracted from Bangla images and from English images/PDF files.
	"""
	# Core Pkgs
	import os
	os.environ["TOKENIZERS_PARALLELISM"] = "false"
	#os.system('sudo apt-get install tesseract-ocr-eng')
	#os.system('sudo apt-get install tesseract-ocr-ben')

	#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
	#os.system('gunzip ben.traineddata.gz ')
	#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
	#os.system('pip install -q pytesseract')
	#os.system('conda install -c conda-forge poppler')
	import streamlit as st
	st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")
	import torch
	import docx2txt
	from PIL import Image
	from PyPDF2 import PdfFileReader
	from pdf2image import convert_from_bytes
	import pdfplumber
	#from line_cor import mark_region
	import pdf2image
	import requests
	import cv2
	import numpy as np
	import pytesseract
	import line_cor
	import altair as alt
	#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
	from PIL import Image
	# Endpoint of the hosted multilingual summarization model used by bansum().
	API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
	# The access token is read from the HF_TOKEN environment variable instead of
	# being committed to the repository. When the variable is unset the request is
	# sent without an Authorization header.
	# NOTE(review): confirm the endpoint still accepts anonymous requests; otherwise
	# HF_TOKEN must be configured as a secret for the deployment.
	_HF_TOKEN = os.environ.get("HF_TOKEN")
	headers0 = {"Authorization": f"Bearer {_HF_TOKEN}"} if _HF_TOKEN else {}


	def read_pdf(file):
	    """Return the text of every page of a PDF as one space-separated string.

	    Args:
	        file: Object handed straight to ``PdfFileReader`` (the caller in this
	            file passes the Streamlit upload).

	    Returns:
	        A string that starts with a single space, followed by each page's
	        ``extractText()`` result with one trailing space per page.
	    """
	    reader = PdfFileReader(file)
	    page_texts = (
	        reader.getPage(page_index).extractText() + " "
	        for page_index in range(reader.numPages)
	    )
	    return " " + "".join(page_texts)
	# def read_pdf_with_pdfplumber(file):
	# # Open the uploaded PDF file with pdfplumber
	# with pdfplumber.open(file) as pdf:
	# extracted_text = ''
	# for page in pdf.pages:
	# extracted_text += page.extract_text()

	# # Display the extracted text
	# #st.text(extracted_text)
	# return extracted_text
	def engsum(output):
	    """Summarize English text with the hosted T5 model and display the reply.

	    Args:
	        output: Text to summarize; sent as the ``inputs`` field of the request.

	    Returns:
	        None. The parsed JSON reply is shown with ``st.success``. A transport
	        failure, a non-JSON body, or an ``{"error": ...}`` reply is shown with
	        ``st.error`` instead.
	    """
	    api_url = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
	    # The access token is read from the HF_TOKEN environment variable rather than
	    # being hard-coded; without it the request carries no Authorization header.
	    # NOTE(review): confirm anonymous access is acceptable for this endpoint.
	    token = os.environ.get("HF_TOKEN")
	    request_headers = {"Authorization": f"Bearer {token}"} if token else {}

	    try:
	        # A timeout keeps a stalled connection from blocking the script forever.
	        response = requests.post(
	            api_url,
	            headers=request_headers,
	            json={"inputs": output},
	            timeout=60,
	        )
	        result = response.json()
	    except (requests.RequestException, ValueError) as exc:
	        st.error(f"Summarization request failed: {exc}")
	        return

	    if isinstance(result, dict) and "error" in result:
	        # Error payloads were previously rendered as a success message.
	        st.error(result)
	    else:
	        st.success(result)
	def bansum(text):
	    """Summarize text with the model behind ``API_URL0`` and display the reply.

	    Called by ``main`` when the Bangla OCR option was selected.

	    Args:
	        text: Text to summarize; sent as the ``inputs`` field of the request.

	    Returns:
	        None. The parsed JSON reply is shown with ``st.success``. A transport
	        failure, a non-JSON body, or an ``{"error": ...}`` reply is shown with
	        ``st.error`` instead.
	    """
	    try:
	        # A timeout keeps a stalled connection from blocking the script forever.
	        response = requests.post(
	            API_URL0,
	            headers=headers0,
	            json={"inputs": text},
	            timeout=60,
	        )
	        result = response.json()
	    except (requests.RequestException, ValueError) as exc:
	        st.error(f"Summarization request failed: {exc}")
	        return

	    if isinstance(result, dict) and "error" in result:
	        # Error payloads were previously rendered as a success message.
	        st.error(result)
	    else:
	        st.success(result)

	# Page heading. This is a module-level call, so it executes when the script
	# itself runs, ahead of the main() call at the bottom of the file.
	st.title("NLP APPLICATION")
	#@st.cache_resource(experimental_allow_widgets=True)
	def _hf_auth_headers():
	    """Return the Authorization header built from HF_TOKEN, or {} when unset."""
	    # The token is read from the environment instead of being hard-coded.
	    # NOTE(review): confirm anonymous access is acceptable when HF_TOKEN is unset.
	    token = os.environ.get("HF_TOKEN")
	    return {"Authorization": f"Bearer {token}"} if token else {}


	def _generate_english_text(prompt):
	    """POST *prompt* to the hosted GPT-2 endpoint and return the parsed JSON reply.

	    Returns None, after showing ``st.error``, when the request fails or the
	    body is not JSON.
	    """
	    api_url = "https://api-inference.huggingface.co/models/gpt2"
	    try:
	        # A timeout keeps a stalled connection from blocking the script forever.
	        response = requests.post(
	            api_url,
	            headers=_hf_auth_headers(),
	            json={"inputs": prompt},
	            timeout=60,
	        )
	        return response.json()
	    except (requests.RequestException, ValueError) as exc:
	        st.error(f"Text generation request failed: {exc}")
	        return None


	def _ocr_image(image_file):
	    """OCR an uploaded or captured image; return ``(text, is_bangla)``.

	    Renders the "Bangla" checkbox and, when it is ticked, runs Tesseract with
	    ``lang="ben"``; otherwise Tesseract's default language is used.
	    """
	    # Convert in memory to the 3-channel BGR array that cv2.imread() produced
	    # before. The old code round-tripped through a fixed "img.png" in the
	    # working directory, which concurrent sessions would overwrite.
	    rgb_pixels = np.array(Image.open(image_file).convert("RGB"))
	    bgr_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
	    if st.checkbox("Bangla"):
	        return pytesseract.image_to_string(bgr_pixels, lang="ben"), True
	    return pytesseract.image_to_string(bgr_pixels), False


	def main():
	    """Render the input widgets and run the selected NLP services.

	    Text comes from, in priority order: an uploaded PDF, an uploaded image
	    (OCR), a camera photo (OCR), or the text box. Once input is available the
	    page offers GPT-2 text generation and summarization; summarization uses
	    ``bansum`` when the Bangla OCR option was ticked and ``engsum`` otherwise.
	    """
	    is_bangla = False
	    st.markdown("""
	#### Description
	##This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows:
	PDF Document/Image's Text Summarization.
	""")

	    def change_photo_state():
	        # Widget callback: remember that a file or photo has been supplied.
	        st.session_state["photo"] = "done"

	    st.subheader("Please, feed your pdf/images/text, features/services will appear automatically!")
	    message = st.text_input("Type your text here!")
	    uploaded_photo = st.file_uploader("Upload your PDF", type=['jpg', 'png', 'jpeg', 'pdf'], on_change=change_photo_state)
	    camera_photo = st.camera_input("Take a photo, Containing English texts", on_change=change_photo_state)
	    if "photo" not in st.session_state:
	        st.session_state["photo"] = "not done"

	    # Nothing to process until a file/photo callback has fired or text was typed.
	    if st.session_state["photo"] != "done" and not message:
	        return

	    if uploaded_photo and uploaded_photo.type == 'application/pdf':
	        tet = read_pdf(uploaded_photo)
	        values = st.slider('Select a approximate number of lines to see and summarize', value=[0, len(tet) // (7 * 100)])
	        # The multiplication operators are restored here; the previous text
	        # read `values[0]710`, which is not valid Python.
	        # NOTE(review): the slider default is scaled by 7*100 while the slice
	        # is scaled by 7*10 -- confirm the intended characters-per-line factor.
	        if values[0] != len(tet) // (7 * 10):
	            text = tet[values[0] * 7 * 10:values[1] * 7 * 10]
	        else:
	            text = tet[len(tet) // (7 * 100):]
	        st.success(text)
	    elif uploaded_photo:
	        text, is_bangla = _ocr_image(uploaded_photo)
	    elif camera_photo:
	        text, is_bangla = _ocr_image(camera_photo)
	    else:
	        # Neither an upload nor a photo is present: use the typed text.
	        text = message

	    if st.checkbox("English Text Generation"):
	        output = _generate_english_text(text)
	        has_generation = (
	            isinstance(output, list)
	            and len(output) > 0
	            and isinstance(output[0], dict)
	            and output[0].get("generated_text")
	        )
	        if has_generation:
	            text_output = output[0]["generated_text"]
	            st.success(text_output)
	            if st.checkbox("Summarize generated text"):
	                # Previously passed the misspelled name `text_ouput` (NameError).
	                engsum(text_output)
	        elif output is not None:
	            # Surface unexpected replies, such as an error payload, instead of
	            # silently showing nothing.
	            st.error(output)

	    if st.checkbox("Mark for Text Summarization"):
	        if is_bangla:
	            bansum(text)
	        else:
	            engsum(text)


	if __name__ == '__main__':
	    main()