""" #App: NLP App with Streamlit Description This is a Natural Language Processing(NLP) base Application that is useful for Document/Text Summarization from Bangla images and English Images/PDF files. """ # Core Pkgs import os os.environ["TOKENIZERS_PARALLELISM"] = "false" #os.system('sudo apt-get install tesseract-ocr-eng') #os.system('sudo apt-get install tesseract-ocr-ben') #os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata') #os.system('gunzip ben.traineddata.gz ') #os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/') #os.system('pip install -q pytesseract') #os.system('conda install -c conda-forge poppler') import streamlit as st st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded") import torch import docx2txt from PIL import Image from PyPDF2 import PdfFileReader from pdf2image import convert_from_bytes import pdfplumber #from line_cor import mark_region import pdf2image import requests import cv2 import numpy as np import pytesseract import line_cor import altair as alt #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe" from PIL import Image API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum" headers0 = {"Authorization": "Bearer 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"} API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base" headers1 = {"Authorization": "Bearer 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"} API_URL2 = "https://api-inference.huggingface.co/models/gpt2" headers2 = {"Authorization": "Bearer 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"} def read_pdf(file): # images=pdf2image.convert_from_path(file) # # print(type(images)) pdfReader = PdfFileReader(file) count = pdfReader.numPages all_page_text = " " for i in range(count): page = pdfReader.getPage(i) all_page_text += page.extractText()+" " return all_page_text def engsum(output): def query(payload): response = requests.post(API_URL1, headers=headers1, json=payload) return response.json() out = query({ "inputs": output, "min_length":300 }) if isinstance(out, list) and out[0].get("generated_text"): text_output = out[0]["generated_text"] st.success(text_output) def bansum(text): def query(payload): response = requests.post(API_URL0, headers=headers0, json=payload) return response.json() out = query({"inputs": text, "min_length":300}) if isinstance(out, list) and out[0].get("summary_text"): text_output = out[0]["summary_text"] st.success(text_output) st.title("Bangla and English Summarizer:") st.subheader("Please, upload your PDF/Images or input texts to summarize: ") #@st.cache_resource(experimental_allow_widgets=True) def main(): """ NLP Based Application with Streamlit """ def change_photo_state(): st.session_state["photo"]="done" message = st.text_input("Type your text here!") uploaded_photo = st.sidebar.file_uploader("Upload your Images/PDF",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state) camera_photo = st.sidebar.camera_input("Capture a photo to summarize: ", on_change=change_photo_state) if "photo" not in st.session_state: st.session_state["photo"]="not done" if st.session_state["photo"]=="done" or message: if uploaded_photo and uploaded_photo.type=='application/pdf': text=None #file = uploaded_photo.read() # Read the data #image_result = open(uploaded_photo.name, 'wb') # creates a writable image and later we can write the decoded result #image_result.write(file) tet = read_pdf(uploaded_photo) #tet = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img) values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)]) st.text("Selected text for summarize: ") text = tet[values[0]*7*10:values[1]*7*10] if values[0]!=len(tet)//(7*10) else tet[len(tet)//(7*100):] st.success(text) st.text("Summarized text: ") engsum(text) if uploaded_photo and uploaded_photo.type !='application/pdf': text=None img = Image.open(uploaded_photo) img = img.save("img.png") img = cv2.imread("img.png") st.text("Select the summarization type:") if st.button("BENGALI"): text = pytesseract.image_to_string(img, lang="ben") bansum(text) if st.button("ENGLISH"): text=pytesseract.image_to_string(img) engsum(text) #st.success(text) if camera_photo: text=None img = Image.open(camera_photo) img = img.save("img.png") img = cv2.imread("img.png") #text = pytesseract.image_to_string(img) if st.checkbox("Bangla") else pytesseract.image_to_string(img, lang="ben") st.text("Select the summarization type:") if st.button("Bangla"): text = pytesseract.image_to_string(img, lang="ben") bansum(text) if st.button("English"): text=pytesseract.image_to_string(img) engsum(text) if message: text=None text = message if st.button("Bangla"): bansum(text) if st.button("English"): engsum(text) # if st.button("English Text Generation"): # def query(payload): # response = requests.post(API_URL2, headers=headers2, json=payload) # return response.json() # out = query({ # "inputs": text, # }) # if isinstance(out, list) and out[0].get("generated_text"): # text_output = out[0]["generated_text"] # st.success(text_output) # #text=text_output if __name__ == '__main__': main()