"""
#App: NLP App with Streamlit

Description
This is a Natural Language Processing (NLP) based application that is useful
for Document/Text Summarization from Bangla images and English images/PDF
files.
"""
# Core Pkgs
import os

# Set ahead of the remaining imports, as in the original ordering
# (presumably so tokenizer-backed libraries see it at import time).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Host setup notes (run manually once; intentionally not executed here):
#   sudo apt-get install tesseract-ocr-eng
#   sudo apt-get install tesseract-ocr-ben
#   wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata
#   gunzip ben.traineddata.gz
#   sudo mv -v ben.traineddata /usr/local/share/tessdata/
#   pip install -q pytesseract
#   conda install -c conda-forge poppler

import streamlit as st

# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")

import torch
import docx2txt
from PIL import Image
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
import tempfile
from pdf2image import convert_from_bytes
import pdfplumber
import pdf2image
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt

# pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"

# NOTE(review): these bearer tokens are committed in source. They should be
# rotated and loaded from the environment or Streamlit secrets instead.
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = {"Authorization": "Bearer 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"}
API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
headers1 = {"Authorization": "Bearer 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"}
# API_URL2 / headers2 are not referenced by any active code in this file.
API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
headers2 = {"Authorization": "Bearer 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"}

# Upper bound (seconds) on a single inference request so a stalled
# connection cannot hang the Streamlit script run indefinitely.
_REQUEST_TIMEOUT = 120


def read_pdf_with_pdfplumber(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: Path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        The text of all pages joined together. Pages for which
        ``extract_text()`` yields nothing contribute an empty string.
    """
    with pdfplumber.open(file) as pdf:
        # ``or ""`` guards against a page whose extraction returns None.
        return "".join(page.extract_text() or "" for page in pdf.pages)


def _summarize(api_url, headers, text, result_key):
    """POST ``text`` to an inference endpoint and render the outcome.

    Args:
        api_url: Endpoint URL to post to.
        headers: HTTP headers (authorization) for the request.
        text: Text to summarize.
        result_key: Key holding the summary in the first element of the
            JSON list response.
    """
    if not text or not text.strip():
        st.warning("No text found to summarize.")
        return

    # NOTE(review): the Inference API documents generation options under a
    # "parameters" object; confirm whether top-level "min_length" is honored.
    payload = {"inputs": text, "min_length": 300}
    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=_REQUEST_TIMEOUT)
        out = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure, timeout, or a non-JSON response body.
        st.error(f"Summarization request failed: {exc}")
        return

    if isinstance(out, list) and out and isinstance(out[0], dict) and out[0].get(result_key):
        st.success(out[0][result_key])
    elif isinstance(out, dict) and out.get("error"):
        # Surface service-side errors instead of silently showing nothing.
        st.error(f"Summarization service error: {out['error']}")
    else:
        st.warning("The summarization service returned no summary.")


def engsum(output):
    """Summarize English text via ``API_URL1`` and display the result.

    Args:
        output: The English text to summarize.
    """
    _summarize(API_URL1, headers1, output, "generated_text")


def bansum(text):
    """Summarize Bangla text via ``API_URL0`` and display the result.

    Args:
        text: The Bangla text to summarize.
    """
    _summarize(API_URL0, headers0, text, "summary_text")


def _load_ocr_image(image_file):
    """Decode an uploaded/captured image into a BGR ``numpy`` array.

    The array has the channel order ``cv2.imread`` would return, but is built
    in memory so no shared scratch file is written to the working directory.
    """
    rgb_image = Image.open(image_file).convert("RGB")
    return cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2BGR)


def _summarize_image(image_file, bangla_label, english_label):
    """Render the language buttons for an image and OCR + summarize on click."""
    img = _load_ocr_image(image_file)
    st.text("Select the summarization type:")
    if st.button(bangla_label):
        bansum(pytesseract.image_to_string(img, lang="ben"))
    if st.button(english_label):
        engsum(pytesseract.image_to_string(img))


def _load_pdf_chunks(uploaded_pdf):
    """Split an uploaded PDF into text chunks using the LangChain loader."""
    # PyPDFLoader needs a filesystem path, so spill the upload to a temp file.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        # getvalue() returns the full content regardless of stream position.
        temp_file.write(uploaded_pdf.getvalue())
        temp_file_path = temp_file.name
    try:
        documents = PyPDFLoader(temp_file_path).load()
    finally:
        # Always clean up, even when parsing the PDF fails.
        os.remove(temp_file_path)

    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len
    )
    return text_splitter.split_documents(documents)


st.title("Bangla and English Summarizer:")


def main():
    """NLP Based Application with Streamlit.

    Collects input from the sidebar (typed text, an uploaded image/PDF, or a
    camera capture) and summarizes it. An uploaded PDF takes precedence, then
    an uploaded image, then a camera photo, and finally the typed text.
    """

    def change_photo_state():
        st.session_state["photo"] = "done"

    message = st.sidebar.text_input("Type your text here!")
    uploaded_photo = st.sidebar.file_uploader(
        "Upload your Images/PDF",
        type=['jpg', 'png', 'jpeg', 'pdf'],
        on_change=change_photo_state,
    )
    camera_photo = st.sidebar.camera_input(
        "Capture a photo to summarize: ", on_change=change_photo_state
    )

    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    # Nothing to do until a file/photo has been provided or text was typed.
    if not (st.session_state["photo"] == "done" or message):
        return

    if uploaded_photo and uploaded_photo.type == 'application/pdf':
        text_chunks = _load_pdf_chunks(uploaded_photo)
        st.text("Selected text for summarize: ")
        if not text_chunks:
            st.warning("No extractable text was found in the PDF.")
            return
        # Chunks are Document objects; the API payload needs their plain text.
        first_chunk = text_chunks[0].page_content
        st.success(first_chunk)
        st.text("Summarized text: ")
        engsum(first_chunk)
    elif uploaded_photo:
        _summarize_image(uploaded_photo, "BENGALI", "ENGLISH")
    elif camera_photo:
        _summarize_image(camera_photo, "Bangla", "English")
    else:
        if st.button("Bangla"):
            bansum(message)
        if st.button("English"):
            engsum(message)


if __name__ == '__main__':
    main()