""" #App: NLP App with Streamlit Description This is a Natural Language Processing(NLP) base Application that is useful for Document/Text Summarization from Bangla images and English Images/PDF files. """ # Core Pkgs import os os.environ["TOKENIZERS_PARALLELISM"] = "false" #os.system('sudo apt-get install tesseract-ocr-eng') #os.system('sudo apt-get install tesseract-ocr-ben') #os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata') #os.system('gunzip ben.traineddata.gz ') #os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/') #os.system('pip install -q pytesseract') #os.system('conda install -c conda-forge poppler') import streamlit as st st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded") import torch import docx2txt from PIL import Image from PyPDF2 import PdfFileReader from pdf2image import convert_from_bytes import pdfplumber #from line_cor import mark_region import pdf2image import requests import cv2 import numpy as np import pytesseract import line_cor import altair as alt #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe" from PIL import Image API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum" headers0 = {"Authorization": "Bearer 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"} def read_pdf(file): # images=pdf2image.convert_from_path(file) # # print(type(images)) pdfReader = PdfFileReader(file) count = pdfReader.numPages all_page_text = " " for i in range(count): page = pdfReader.getPage(i) # img = Image.open(page) # img = Image.open(page) # img = img.save("img.png") # image_name = cv2.imread("img.png") # # get co-ordinates to cr # text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name) all_page_text += page.extractText()+" " return all_page_text # def read_pdf_with_pdfplumber(file): # # Open the uploaded PDF file with pdfplumber # with pdfplumber.open(file) as pdf: # extracted_text = '' # for page in pdf.pages: # extracted_text += page.extract_text() # # Display the extracted text # #st.text(extracted_text) # return extracted_text def engsum(output): API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base" headers1 = {"Authorization": "Bearer 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"} def query(payload): response = requests.post(API_URL1, headers=headers1, json=payload) return response.json() output = query({ "inputs": output, }) st.success(output) def bansum(text): def query(payload): response = requests.post(API_URL0, headers=headers0, json=payload) return response.json() output = query({"inputs": text}) st.success(output) st.title("NLP APPLICATION") #@st.cache_resource(experimental_allow_widgets=True) def main(): b=0 #global tokenizer, model #tokenizer = AutoTokenizer.from_pretrained('t5-base') #model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True) """ NLP Based Application with Streamlit """ st.markdown(""" #### Description ##This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows: PDF Document/Image's Text Summarization. """) def change_photo_state(): st.session_state["photo"]="done" st.subheader("Please, feed your pdf/images/text, features/services will appear automatically!") message = st.text_input("Type your text here!") uploaded_photo = st.file_uploader("Upload your PDF",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state) camera_photo = st.camera_input("Take a photo, Containing English texts", on_change=change_photo_state) if "photo" not in st.session_state: st.session_state["photo"]="not done" if st.session_state["photo"]=="done" or message: text=" " if uploaded_photo and uploaded_photo.type=='application/pdf': #file = uploaded_photo.read() # Read the data #image_result = open(uploaded_photo.name, 'wb') # creates a writable image and later we can write the decoded result #image_result.write(file) tet = read_pdf(uploaded_photo) #tet = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img) values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)]) text = tet[values[0]*7*10:values[1]*7*10] if values[0]!=len(tet)//(7*10) else tet[len(tet)//(7*100):] st.success(text) elif uploaded_photo: img = Image.open(uploaded_photo) img = img.save("img.png") img = cv2.imread("img.png") # get co-ordinates to crop the image #imag, lc = line_cor.mark_region(imge) #st.success(*lc) # c = lc # cropping image img = image[y0:y1, x0:x1] #imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]] #plt.figure(figsize=(10,10)) # plt.imshow(img) # convert the image to black and white for better OCR #ret,thresh1 = cv2.threshold(imge,120,255,cv2.THRESH_BINARY) # pytesseract image to string to get results #text = str(pytesseract.image_to_string(img, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6')) if st.checkbox("Bangla"): b=1 text = pytesseract.image_to_string(img, lang="ben") else: b=0 text=pytesseract.image_to_string(img) #st.success(text) elif camera_photo: img = Image.open(camera_photo) img = img.save("img.png") img = cv2.imread("img.png") #text = pytesseract.image_to_string(img) if st.checkbox("Bangla") else pytesseract.image_to_string(img, lang="ben") if st.checkbox("Bangla"): b=1 text = pytesseract.image_to_string(img, lang="ben") else: b=0 text=pytesseract.image_to_string(img) #st.success(text) elif uploaded_photo==None and camera_photo==None: #our_image=load_image("image.jpg") #img = cv2.imread("scholarly_text.jpg") text = message if st.checkbox("English Text Generation"): API_URL2 = "https://api-inference.huggingface.co/models/gpt2" headers2 = {"Authorization": "Bearer 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"} def query(payload): response = requests.post(API_URL2, headers=headers2, json=payload) return response.json() output = query({ "inputs": text, }) if isinstance(output, list) and output[0].get("generated_text"): text_output = output[0]["generated_text"] st.success(text_output) if st.checkbox("Summarize generated text"): engsum(text_ouput) if st.checkbox("Mark for Text Summarization"): if b==1: bansum(text) else: engsum(text) if __name__ == '__main__': main()