|
""" |
|
#App: NLP App with Streamlit |
|
Description |
|
This is a Natural Language Processing(NLP) base Application that is useful for |
|
Document/Text Summarization from Bangla images and English Images/PDF files. |
|
""" |
|
|
|
import os |
|
# Exported ahead of the remaining imports so that anything imported below
# sees the value.
# NOTE(review): TOKENIZERS_PARALLELISM is the switch read by the Hugging Face
# `tokenizers` library; no tokenizer is loaded in the visible code, so confirm
# this is still needed.
os.environ.update(TOKENIZERS_PARALLELISM="false")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st |
|
# Page-level settings: browser tab title, wide layout, sidebar open on load.
# NOTE(review): kept directly after the streamlit import and ahead of every
# other Streamlit call in this file; Streamlit expects set_page_config to be
# the first Streamlit command, so confirm before moving it.
st.set_page_config(
    page_title="Summarization Tool",
    layout="wide",
    initial_sidebar_state="expanded",
)
|
import torch |
|
import docx2txt |
|
from PIL import Image |
|
from langchain.document_loaders import PyPDFLoader |
|
from langchain.text_splitter import CharacterTextSplitter |
|
import tempfile |
|
|
|
from pdf2image import convert_from_bytes |
|
import pdfplumber |
|
|
|
import pdf2image |
|
import requests |
|
import cv2 |
|
import numpy as np |
|
import pytesseract |
|
import line_cor |
|
import altair as alt |
|
|
|
from PIL import Image |
|
# Hugging Face Inference API endpoints and their request headers.
#
# SECURITY: bearer tokens used to be hard-coded on these lines. Anything that
# was committed must be treated as compromised and revoked. Tokens are now
# read from the environment: each endpoint first looks at its own variable
# and then falls back to the shared HF_API_TOKEN.


def _hf_auth_headers(*env_names):
    """Build request headers from the first non-empty token variable.

    Args:
        *env_names: Environment variable names to try, in priority order.

    Returns:
        ``{"Authorization": "Bearer <token>"}`` for the first variable that
        holds a non-blank value, or an empty dict when none is set (the
        request is then sent without an Authorization header).
    """
    for env_name in env_names:
        token = os.environ.get(env_name, "").strip()
        if token:
            return {"Authorization": f"Bearer {token}"}
    return {}


# Multilingual XLSum model (used for Bangla summaries).
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = _hf_auth_headers("HF_API_TOKEN_XLSUM", "HF_API_TOKEN")

# T5-base summarizer (used for English summaries).
API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
headers1 = _hf_auth_headers("HF_API_TOKEN_T5", "HF_API_TOKEN")

# GPT-2 endpoint; not referenced by the code visible in this file.
API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
headers2 = _hf_auth_headers("HF_API_TOKEN_GPT2", "HF_API_TOKEN")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_pdf_with_pdfplumber(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: Passed straight to ``pdfplumber.open``.

    Returns:
        str: The page texts joined with no separator; an empty string when no
        page yields any text.
    """
    with pdfplumber.open(file) as pdf:
        # extract_text() can come back as None for a page without a text
        # layer (depends on the pdfplumber version); `or ""` keeps such pages
        # from raising TypeError during concatenation.
        return "".join(page.extract_text() or "" for page in pdf.pages)
|
|
|
def engsum(output):
    """Summarize English text with the hosted T5 model and render the result.

    Posts the text to ``API_URL1`` and shows the ``generated_text`` field of
    the first result with ``st.success``. Empty input, transport errors and
    error responses from the API are reported in the UI instead of being
    dropped silently.

    Args:
        output: The text to summarize. An object exposing ``page_content``
            (the PDF path in this file passes a split document chunk) is
            accepted too; its ``page_content`` is used.

    Returns:
        None. The outcome is only rendered through Streamlit.
    """

    def query(payload):
        # A timeout keeps a stalled API call from hanging the script run.
        response = requests.post(API_URL1, headers=headers1, json=payload, timeout=60)
        return response.json()

    # Document-like objects are not JSON-serialisable; send their text.
    text = getattr(output, "page_content", output)
    if not isinstance(text, str) or not text.strip():
        st.warning("No text was found to summarize.")
        return

    try:
        # Generation options belong under "parameters" in the Inference API
        # payload, not next to "inputs".
        out = query({"inputs": text, "parameters": {"min_length": 300}})
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a response body that is not valid JSON.
        st.error(f"Summarization request failed: {err}")
        return

    first = out[0] if isinstance(out, list) and out else None
    if isinstance(first, dict) and first.get("generated_text"):
        st.success(first["generated_text"])
        return

    # Error responses arrive as a dict carrying an "error" message.
    detail = out.get("error") if isinstance(out, dict) else None
    st.error(f"Summarization failed: {detail or 'unexpected response from the API'}")
|
def bansum(text):
    """Summarize Bangla text with the hosted mT5 XLSum model and render it.

    Posts the text to ``API_URL0`` and shows the ``summary_text`` field of the
    first result with ``st.success``. Empty input, transport errors and error
    responses from the API are reported in the UI instead of being dropped
    silently.

    Args:
        text: The text to summarize.

    Returns:
        None. The outcome is only rendered through Streamlit.
    """

    def query(payload):
        # A timeout keeps a stalled API call from hanging the script run.
        response = requests.post(API_URL0, headers=headers0, json=payload, timeout=60)
        return response.json()

    if not isinstance(text, str) or not text.strip():
        st.warning("No text was found to summarize.")
        return

    try:
        # Generation options belong under "parameters" in the Inference API
        # payload, not next to "inputs".
        out = query({"inputs": text, "parameters": {"min_length": 300}})
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a response body that is not valid JSON.
        st.error(f"Summarization request failed: {err}")
        return

    first = out[0] if isinstance(out, list) and out else None
    if isinstance(first, dict) and first.get("summary_text"):
        st.success(first["summary_text"])
        return

    # Error responses arrive as a dict carrying an "error" message.
    detail = out.get("error") if isinstance(out, dict) else None
    st.error(f"Summarization failed: {detail or 'unexpected response from the API'}")
|
|
|
def _load_pdf_chunks(uploaded_pdf):
    """Load an uploaded PDF and split its pages into overlapping text chunks."""
    # PyPDFLoader is given a filesystem path, so the upload is spooled to a
    # temporary file first.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(uploaded_pdf.read())
        temp_file_path = temp_file.name
    try:
        pages = PyPDFLoader(temp_file_path).load()
    finally:
        # Remove the temp file even when loading fails.
        os.remove(temp_file_path)

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
    )
    return text_splitter.split_documents(pages)


def _summarize_image(image_file, bangla_label, english_label):
    """Show the language buttons for an image and OCR + summarize on click."""
    # The image is decoded in memory and handed to Tesseract directly. The
    # previous round trip through a fixed "img.png" in the working directory
    # was shared by every session and fed channel-swapped (BGR) pixels to OCR.
    image = Image.open(image_file).convert("RGB")

    st.text("Select the summarization type:")
    if st.button(bangla_label):
        bansum(pytesseract.image_to_string(image, lang="ben"))
    if st.button(english_label):
        engsum(pytesseract.image_to_string(image))


def main():
    """Render the summarizer page and dispatch on the provided input.

    The sidebar offers a text box, a file uploader (images or PDF) and a
    camera capture. Once any of them has been used, the first match wins:

    * uploaded PDF   -> first text chunk is shown and summarized in English;
    * uploaded image -> OCR, then Bangla or English summary on button click;
    * camera photo   -> same as an uploaded image;
    * typed message  -> Bangla or English summary on button click.
    """
    # Rendered from main() rather than at import time, so importing this
    # module does not draw the heading as a side effect.
    st.title("Bangla and English Summarizer:")

    def change_photo_state():
        st.session_state["photo"] = "done"

    message = st.sidebar.text_input("Type your text here!")
    uploaded_photo = st.sidebar.file_uploader(
        "Upload your Images/PDF",
        type=['jpg', 'png', 'jpeg', 'pdf'],
        on_change=change_photo_state,
    )
    camera_photo = st.sidebar.camera_input(
        "Capture a photo to summarize: ",
        on_change=change_photo_state,
    )

    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    # Nothing uploaded/captured yet and nothing typed: just show the inputs.
    if st.session_state["photo"] != "done" and not message:
        return

    if uploaded_photo and uploaded_photo.type == 'application/pdf':
        text_chunks = _load_pdf_chunks(uploaded_photo)
        if not text_chunks:
            # E.g. a scanned PDF without a text layer.
            st.warning("No extractable text was found in the uploaded PDF.")
            return

        # Only the first chunk is summarized. Chunks are document objects;
        # the API call needs their plain text.
        first_chunk = text_chunks[0].page_content
        st.text("Selected text for summarize: ")
        st.success(first_chunk)
        st.text("Summarized text: ")
        engsum(first_chunk)
    elif uploaded_photo:
        _summarize_image(uploaded_photo, "BENGALI", "ENGLISH")
    elif camera_photo:
        _summarize_image(camera_photo, "Bangla", "English")
    elif message:
        # The "photo" flag stays "done" after an upload is removed; without
        # this guard an empty message would be sent for summarization.
        if st.button("Bangla"):
            bansum(message)
        if st.button("English"):
            engsum(message)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Entry point: build the page only when this file is executed as the main
# script, not when it is imported.
if __name__ == "__main__":
    main()
|
|