""" #App: NLP App with Streamlit Description This is a Natural Language Processing(NLP) base Application that is useful for Document/Text Summarization from Bangla images and English Images/PDF files. """ # Core Pkgs import os os.environ["TOKENIZERS_PARALLELISM"] = "false" #os.system('sudo apt-get install tesseract-ocr-eng') #os.system('sudo apt-get install tesseract-ocr-ben') #os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata') #os.system('gunzip ben.traineddata.gz ') #os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/') #os.system('pip install -q pytesseract') #os.system('conda install -c conda-forge poppler') import streamlit as st st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded") st.title("Bangla and English Summarizer: Upload Images/Pdf or input texts to summarize!") import torch import docx2txt from PIL import Image from PyPDF2 import PdfFileReader from pdf2image import convert_from_bytes import pdfplumber #from line_cor import mark_region import pdf2image import requests import cv2 import numpy as np import pytesseract import line_cor import altair as alt #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe" from PIL import Image API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum" headers0 = {"Authorization": "Bearer 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"} API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base" headers1 = {"Authorization": "Bearer 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"} API_URL2 = "https://api-inference.huggingface.co/models/gpt2" headers2 = {"Authorization": "Bearer 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"} @st.cache def read_pdf(file): # images=pdf2image.convert_from_path(file) # # print(type(images)) pdfReader = PdfFileReader(file) count = pdfReader.numPages all_page_text = " " for i in range(count): page = pdfReader.getPage(i) all_page_text += page.extractText()+" " return all_page_text # def read_pdf_with_pdfplumber(file): # # Open the uploaded PDF file with pdfplumber # with pdfplumber.open(file) as pdf: # extracted_text = '' # for page in pdf.pages: # extracted_text += page.extract_text() # # Display the extracted text # #st.text(extracted_text) # return extracted_text def engsum(output): def query(payload): response = requests.post(API_URL1, headers=headers1, json=payload) return response.json() out = query({ "inputs": output, "min_length":300 }) if isinstance(out, list) and out[0].get("generated_text"): text_output = out[0]["generated_text"] st.success(text_output) def bansum(text): def query(payload): response = requests.post(API_URL0, headers=headers0, json=payload) return response.json() out = query({"inputs": text, "min_length":300}) if isinstance(out, list) and out[0].get("summary_text"): text_output = out[0]["summary_text"] st.success(text_output) @st.cache def save(l): return l #@st.cache def main(): import streamlit as st if "photo" not in st.session_state: st.session_state["photo"]="not done" a, b = st.columns([1, 1]) def change_photo_state(): st.session_state["photo"]="done" with st.container(): with a: #import torch from streamlit_option_menu import option_menu from streamlit_chat import message as st_message from transformers import BlenderbotTokenizer from transformers import BlenderbotForConditionalGeneration st.title("Simple Chatbot for fun!") @st.experimental_singleton def get_models(): # it may be necessary for other frameworks to cache the model # 
#@st.cache
def main():
    import streamlit as st
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"
    a, b = st.columns([1, 1])

    def change_photo_state():
        st.session_state["photo"] = "done"

    with st.container():
        with a:
            #import torch
            from streamlit_option_menu import option_menu
            from streamlit_chat import message as st_message
            from transformers import BlenderbotTokenizer
            from transformers import BlenderbotForConditionalGeneration

            st.title("Simple Chatbot for fun!")

            @st.experimental_singleton
            def get_models():
                # it may be necessary for other frameworks to cache the model
                # seems pytorch keeps an internal state of the conversation
                model_name = "facebook/blenderbot-400M-distill"
                tokenizer = BlenderbotTokenizer.from_pretrained(model_name)
                model = BlenderbotForConditionalGeneration.from_pretrained(model_name)
                return tokenizer, model

            if "history" not in st.session_state:
                st.session_state.history = []
            st.title("Hello Chatbot")

            def generate_answer():
                # Run the user's message through BlenderBot and append both sides
                # of the exchange to the chat history kept in session state.
                tokenizer, model = get_models()
                user_message = st.session_state.input_text
                inputs = tokenizer(st.session_state.input_text, return_tensors="pt")
                result = model.generate(**inputs)
                message_bot = tokenizer.decode(
                    result[0], skip_special_tokens=True
                )  # .replace("", "").replace("", "")
                st.session_state.history.append({"message": user_message, "is_user": True})
                st.session_state.history.append({"message": message_bot, "is_user": False})

            st.text_input("Talk to the bot", key="input_text", on_change=generate_answer)

            for chat in st.session_state.history:
                st_message(**chat)

        with b:
            c2, c3 = st.columns([1, 1])
            message = st.text_input("Type your text here!")
            camera_photo = c2.camera_input("Capture a photo to summarize: ", on_change=change_photo_state)
            uploaded_photo = save(c3.file_uploader("Upload your Images/PDF", type=['jpg', 'png', 'jpeg', 'pdf'], on_change=change_photo_state))
            if st.session_state["photo"] == "done" or message:
                if uploaded_photo and uploaded_photo.type == 'application/pdf':
                    tet = read_pdf(uploaded_photo)
                    # with tempfile.NamedTemporaryFile(delete=False) as temp_file:
                    #     temp_file.write(uploaded_photo.read())
                    #     temp_file_path = temp_file.name
                    # loader = PyPDFLoader(temp_file_path)
                    # if loader:
                    #     text.extend(loader.load())
                    #     os.remove(temp_file_path)
                    # text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len)
                    # text_chunks = text_splitter.split_documents(text)
                    values = st.slider('Select an approximate number of lines to see and summarize', value=[0, len(tet)//(7*100)])
                    text = tet[values[0]*7*10:values[1]*10*100] if values[0] != len(tet)//(10*100) else tet[len(tet)//(10*100):]
                    #st.success(type(text_chunks))
                    if st.button("English Pdf Summarize"):
                        st.subheader("Selected text for summarize: ")
                        st.success(text)
                        st.subheader("Summarized Text: ")
                        engsum(text)
                elif uploaded_photo and uploaded_photo.type != 'application/pdf':
                    text = None
                    img = Image.open(uploaded_photo)
                    img.save("img.png")
                    img = cv2.imread("img.png")
                    st.text("Select the summarization type:")
                    c4, c5 = st.columns([1, 1])
                    if c4.button("BENGALI"):
                        text = pytesseract.image_to_string(img, lang="ben")
                        st.subheader("সারাংশ/সারমর্ম")
                        bansum(text)
                    if c5.button("ENGLISH"):
                        text = pytesseract.image_to_string(img)
                        st.subheader("Summarized Text")
                        engsum(text)
                    #st.success(text)
                elif camera_photo:
                    text = None
                    img = Image.open(camera_photo)
                    img.save("img.png")
                    img = cv2.imread("img.png")
                    #text = pytesseract.image_to_string(img) if st.checkbox("Bangla") else pytesseract.image_to_string(img, lang="ben")
                    st.text("Select the summarization type:")
                    c6, c7 = st.columns([1, 1])
                    if c6.button("Bangla"):
                        text = pytesseract.image_to_string(img, lang="ben")
                        st.subheader("সারাংশ/সারমর্ম")
                        bansum(text)
                    if c7.button("English"):
                        text = pytesseract.image_to_string(img)
                        st.subheader("Summarized Text")
                        engsum(text)
                else:
                    text = message
                    c8, c9 = st.columns([1, 1])
                    if c8.button("Bangla"):
                        bansum(text)
                    if c9.button("English"):
                        engsum(text)
            # if st.button("English Text Generation"):
            #     def query(payload):
            #         response = requests.post(API_URL2, headers=headers2, json=payload)
            #         return response.json()
            #     out = query({
            #         "inputs": text,
            #     })
            #     if isinstance(out, list) and out[0].get("generated_text"):
            #         text_output = out[0]["generated_text"]
            #         st.success(text_output)
            #         #text=text_output


if __name__ == "__main__":
    main()
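
# To run the app locally (the file name `app.py` is an assumption):
#   streamlit run app.py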