""" #App: NLP App with Streamlit Description This is a Natural Language Processing(NLP) base Application that is useful for Document/Text Summarization from Bangla images and English Images/PDF files. """ # Core Pkgs import os os.environ["TOKENIZERS_PARALLELISM"] = "false" #os.system('sudo apt-get install tesseract-ocr-eng') #os.system('sudo apt-get install tesseract-ocr-ben') #os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata') #os.system('gunzip ben.traineddata.gz ') #os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/') #os.system('pip install -q pytesseract') #os.system('conda install -c conda-forge poppler') import streamlit as st st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded") st.title("Bangla/English Text Summarizer: Upload Images/Pdf or input texts to summarize!") import torch import docx2txt from PIL import Image from PyPDF2 import PdfFileReader from pdf2image import convert_from_bytes import pdfplumber #from line_cor import mark_region import pdf2image import requests import cv2 import numpy as np import pytesseract import line_cor import altair as alt from transformers import AutoTokenizer, AutoModelWithLMHead #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe" from PIL import Image API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum" headers0 = {"Authorization": "Bearer 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"} API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base" headers1 = {"Authorization": "Bearer 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"} API_URL2 = "https://api-inference.huggingface.co/models/gpt2" headers2 = {"Authorization": "Bearer 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"} @st.cache def read_pdf(file): # images=pdf2image.convert_from_path(file) # # print(type(images)) pdfReader = PdfFileReader(file) count = pdfReader.numPages all_page_text = " " for i in range(count): page = pdfReader.getPage(i) all_page_text += page.extractText()+" " return all_page_text # def read_pdf_with_pdfplumber(file): # # Open the uploaded PDF file with pdfplumber # with pdfplumber.open(file) as pdf: # extracted_text = '' # for page in pdf.pages: # extracted_text += page.extract_text() # # Display the extracted text # #st.text(extracted_text) # return extracted_text tokenizer = AutoTokenizer.from_pretrained('t5-base') model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True) @st.cache(suppress_st_warning=True) def engsum(text): #st.text("Using Google T5 Transformer ..") inputs = tokenizer.encode("summarize: " + text,return_tensors='pt', max_length= 512, truncation=True) summary_ids = model.generate(inputs, max_length=150, min_length=80, length_penalty=5., num_beams=2) summary = tokenizer.decode(summary_ids[0]) st.success(summary[5:-4]) def bansum(text): def query(payload): response = requests.post(API_URL0, headers=headers0, json=payload) return response.json() out = query({"inputs": text, "min_length":300}) if isinstance(out, list) and out[0].get("summary_text"): text_output = out[0]["summary_text"] st.success(text_output) # @st.cache # def save(l): # return l #@st.cache def main(): camera_photo=None import streamlit as st if "photo" not in st.session_state: st.session_state["photo"]="not done" def change_photo_state(): st.session_state["photo"]="done" with st.container(): c1, c2, c3 = st.columns([2,2,1]) message = c1.text_input("Type your text here!") Capture=True if c2.button("Start Camera"): camera_photo = c2.camera_input("Capture a photo to summarize: ", on_change=change_photo_state) if c2.button("Stop Camera"): Capture =False uploaded_photo = c3.file_uploader("Upload your Images/PDF",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state) if st.session_state["photo"]=="done" or message: if uploaded_photo and uploaded_photo.type=='application/pdf': tet = read_pdf(uploaded_photo) # with tempfile.NamedTemporaryFile(delete=False) as temp_file: # temp_file.write(uploaded_photo.read()) # temp_file_path = temp_file.name # loader = PyPDFLoader(temp_file_path) # if loader: # text.extend(loader.load()) # os.remove(temp_file_path) # text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len) # text_chunks = text_splitter.split_documents(text) values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)]) text = tet[values[0]*7*10:values[1]*10*100] if values[0]!=len(tet)//(10*100) else tet[len(tet)//(10*100):] #st.success(type(text_chunks)) if st.button("English Pdf Summarize"): st.subheader("Selected text for summarize: ") st.success(text) st.subheader("Summarized Text: ") engsum(text) elif uploaded_photo and uploaded_photo.type !='application/pdf': text=None img = Image.open(uploaded_photo) img = img.save("img.png") img = cv2.imread("img.png") st.subheader("Select the summarization type:") c4, c5 = st.columns([1,7]) if c4.button("BENGALI"): text = pytesseract.image_to_string(img, lang="ben") st.success(text) st.subheader("সারাংশ/সারমর্ম") bansum(text) if c5.button("ENGLISH"): text=pytesseract.image_to_string(img) st.success(text) st.subheader("Summarized Text") engsum(text) elif camera_photo: text=None img = Image.open(camera_photo) img = img.save("img.png") img = cv2.imread("img.png") #text = pytesseract.image_to_string(img) if st.checkbox("Bangla") else pytesseract.image_to_string(img, lang="ben") st.subheader("Select the summarization type:") #st.image(camera_photo, caption='Uploaded Image', use_column_width=True) c6, c7 = st.columns([1,7]) if c6.button("Bangla"): text = pytesseract.image_to_string(img, lang="ben") st.success(text) st.subheader("সারাংশ/সারমর্ম") bansum(text) if c7.button("English"): text=pytesseract.image_to_string(img) st.success(text) st.subheader("Summarized Text") engsum(text) else: text=None text = message c8, c9 = st.columns([1,7]) if c8.button("Bangla"): bansum(text) if c9.button("English"): engsum(text) if __name__ == "__main__": main()