Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

File size: 9,094 Bytes

9c37e72
dba2773
9c37e72
68f40bc
d82d18a
9c37e72
 
6e58c44
4834995
bd18577
 
 
 
 
 
 
2f51bd6
c75cc74
ba45265
3df853f
9c37e72
09d4214
 
29e33a8
06dd768
09d4214
0842639
baf370a
9c37e72
 
 
 
1ecea99
fa73ddc
419e04c
9c37e72
b446f5c
a07988a
af0fae9
 
 
 
ba45265
29e33a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d5dc1c
29e33a8
 
 
1a9fa41
c7a7627
 
 
 
 
3f06691
c7a7627
37287e0
c7a7627
0975d28
 
 
c7a7627
 
 
 
37287e0
0975d28
 
 
b330918
c9df9a0
 
 
ec4347b
9d1426d
b330918
ebcff05
b330918
 
 
 
 
0964e86
b330918
8eacb9c
b33cce8
8eacb9c
2d1cbd4
 
cbcd024
c9df9a0
b330918
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d960c1e
b330918
 
 
 
 
 
2e3a776
b330918
 
ac35f7a
b330918
 
 
 
ac35f7a
b330918
 
 
 
 
 
 
 
 
ac35f7a
2e3a776
b330918
 
ac35f7a
b330918
 
 
 
ac35f7a
b330918
 
 
 
 
2e3a776
b330918
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2e9e23
b330918
 
 
 
 
 
 
 
 
 
 
 
 
a2e9e23
b330918
a2e9e23
d960c1e
9d1426d

"""
#App: NLP App with Streamlit
Description
This is a Natural Language Processing (NLP) based application for
summarizing documents and text from Bangla images and English images/PDF files.
"""
# Core Pkgs
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')

#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
import streamlit as st
st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")
st.title("Chatbot and Bangla/English Text Summarizer: Upload Images/Pdf or input texts to summarize!")
import torch
import docx2txt
from PIL import Image 
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
# Hugging Face Inference API endpoints, each paired with its Authorization header.
# SECURITY NOTE(review): the bearer tokens below are hard-coded in source control.
# Anyone who can read this file can use them; they should be revoked and loaded
# from the environment or a secrets store instead.
# Endpoint/header 0 is used by bansum().
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = {"Authorization": "Bearer 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"}
# Endpoint/header 1 is used by engsum().
API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
headers1 = {"Authorization": "Bearer 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"}
# Endpoint/header 2 is not referenced by any code visible in this file.
API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
headers2 = {"Authorization": "Bearer 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"}
@st.cache
def read_pdf(file):
    """Extract the text of every page of a PDF into one string.

    Args:
        file: A path or binary file-like object accepted by ``PdfFileReader``.

    Returns:
        A string that starts with a single space, followed by each page's
        extracted text with one space appended after every page.
    """
    reader = PdfFileReader(file)
    page_texts = [
        reader.getPage(page_index).extractText()
        for page_index in range(reader.numPages)
    ]
    # Leading space, then "<page text> " per page, in page order.
    return " " + "".join(page_text + " " for page_text in page_texts)
# def read_pdf_with_pdfplumber(file):
#     # Open the uploaded PDF file with pdfplumber
#     with pdfplumber.open(file) as pdf:
#         extracted_text = ''
#         for page in pdf.pages:
#             extracted_text += page.extract_text()

#     # Display the extracted text
#     #st.text(extracted_text)
#     return extracted_text

def engsum(output):
    """Summarize English text with the hosted model and display the result.

    Posts ``output`` to ``API_URL1`` and, when the response is a non-empty
    list whose first item carries ``generated_text``, renders that text with
    ``st.success``. Network failures, non-JSON replies and unexpected
    response shapes are reported with ``st.error`` instead of being dropped
    silently or raising.

    Args:
        output: The text to summarize.

    Returns:
        None. The result is only rendered in the Streamlit page.
    """
    # NOTE(review): "min_length" is sent as a top-level payload key. The
    # Hugging Face Inference API documents generation options under a
    # "parameters" object -- confirm whether this value is actually applied.
    payload = {
        "inputs": output,
        "min_length": 300,
    }
    try:
        # A timeout keeps a stalled request from hanging the page forever.
        response = requests.post(API_URL1, headers=headers1, json=payload, timeout=60)
        out = response.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a response body that is not valid JSON.
        st.error(f"Summarization request failed: {err}")
        return

    first = out[0] if isinstance(out, list) and out else None
    if isinstance(first, dict) and first.get("generated_text"):
        st.success(first["generated_text"])
        return

    # Anything else is a failure; show the "error" field when the reply has one.
    detail = out.get("error") if isinstance(out, dict) else None
    st.error(f"Summarization failed: {detail or 'unexpected response from the model API'}")
def bansum(text):
    """Summarize Bangla text with the hosted model and display the result.

    Posts ``text`` to ``API_URL0`` and, when the response is a non-empty list
    whose first item carries ``summary_text``, renders that text with
    ``st.success``. Network failures, non-JSON replies and unexpected
    response shapes are reported with ``st.error`` instead of being dropped
    silently or raising.

    Args:
        text: The text to summarize.

    Returns:
        None. The result is only rendered in the Streamlit page.
    """
    # NOTE(review): "min_length" is sent as a top-level payload key. The
    # Hugging Face Inference API documents generation options under a
    # "parameters" object -- confirm whether this value is actually applied.
    payload = {"inputs": text, "min_length": 300}
    try:
        # A timeout keeps a stalled request from hanging the page forever.
        response = requests.post(API_URL0, headers=headers0, json=payload, timeout=60)
        out = response.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a response body that is not valid JSON.
        st.error(f"Summarization request failed: {err}")
        return

    first = out[0] if isinstance(out, list) and out else None
    if isinstance(first, dict) and first.get("summary_text"):
        st.success(first["summary_text"])
        return

    # Anything else is a failure; show the "error" field when the reply has one.
    detail = out.get("error") if isinstance(out, dict) else None
    st.error(f"Summarization failed: {detail or 'unexpected response from the model API'}")
    
# @st.cache
# def save(l):
#     return l
#@st.cache
def _summarize_image(image_file, bangla_label, english_label):
    """OCR an image and summarize its text when a language button is pressed.

    Args:
        image_file: Image source accepted by ``PIL.Image.open`` (the uploaded
            file or the camera capture).
        bangla_label: Label of the button that runs Bangla OCR and ``bansum``.
        english_label: Label of the button that runs default-language OCR and
            ``engsum``.
    """
    # Round-trip through a PNG on disk so OpenCV can load the image.
    Image.open(image_file).save("img.png")
    img = cv2.imread("img.png")
    st.text("Select the summarization type:")
    bangla_col, english_col = st.columns([1, 7])
    if bangla_col.button(bangla_label):
        text = pytesseract.image_to_string(img, lang="ben")
        st.success(text)
        st.subheader("সারাংশ/সারমর্ম")
        bansum(text)
    if english_col.button(english_label):
        text = pytesseract.image_to_string(img)
        st.success(text)
        st.subheader("Summarized Text")
        engsum(text)


def main():
    """Render the summarizer section and the chatbot section of the page.

    The first container takes typed text, a camera capture or an uploaded
    image/PDF and summarizes it through ``engsum`` / ``bansum``. The second
    container hosts a BlenderBot chat whose transcript lives in
    ``st.session_state.history``.
    """
    camera_photo = None
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    def change_photo_state():
        # Widget callback: records that a photo/file widget value has changed.
        st.session_state["photo"] = "done"

    with st.container():
        c1, c2, c3 = st.columns([2, 2, 1])
        message = c1.text_input("Type your text here!")
        if c2.button("Capture"):
            camera_photo = c2.camera_input("Capture a photo to summarize: ", on_change=change_photo_state)
        # The return value is unused. NOTE(review): presumably the click only
        # matters because it reruns the script with "Capture" unpressed, so
        # the camera widget is not rendered -- confirm.
        c2.button("Stop Camera")

        uploaded_photo = c3.file_uploader("Upload your Images/PDF", type=['jpg', 'png', 'jpeg', 'pdf'], on_change=change_photo_state)
        if st.session_state["photo"] == "done" or message:
            if uploaded_photo and uploaded_photo.type == 'application/pdf':
                tet = read_pdf(uploaded_photo)
                # NOTE(review): the scale factors below are inconsistent
                # (7*100 for the slider default, 7*10 for the slice start,
                # 10*100 for the slice end and the comparison). Kept as is
                # because the intended mapping is unclear -- confirm.
                values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)])
                text = tet[values[0]*7*10:values[1]*10*100] if values[0]!=len(tet)//(10*100) else tet[len(tet)//(10*100):]
                if st.button("English Pdf Summarize"):
                    st.subheader("Selected text for summarize: ")
                    st.success(text)
                    st.subheader("Summarized Text: ")
                    engsum(text)
            elif uploaded_photo:
                # Any non-PDF upload is treated as an image.
                _summarize_image(uploaded_photo, "BENGALI", "ENGLISH")
            elif camera_photo:
                _summarize_image(camera_photo, "Bangla", "English")
            else:
                # No file or photo: summarize the typed text.
                text = message
                c8, c9 = st.columns([1, 7])
                if c8.button("Bangla"):
                    bansum(text)
                if c9.button("English"):
                    engsum(text)

    with st.container():
        from streamlit_chat import message as st_message
        from transformers import BlenderbotTokenizer
        from transformers import BlenderbotForConditionalGeneration
        st.title("Chatbot!!!")

        @st.experimental_singleton
        def get_models():
            # it may be necessary for other frameworks to cache the model
            # seems pytorch keeps an internal state of the conversation
            model_name = "facebook/blenderbot-400M-distill"
            tokenizer = BlenderbotTokenizer.from_pretrained(model_name)
            model = BlenderbotForConditionalGeneration.from_pretrained(model_name)
            return tokenizer, model

        if "history" not in st.session_state:
            st.session_state.history = []

        def generate_answer():
            # Widget callback: answer the submitted text and log both turns.
            user_message = st.session_state.input_text
            if not user_message:
                # Nothing was typed (e.g. the box was cleared); skip the model.
                return
            tokenizer, model = get_models()
            inputs = tokenizer(user_message, return_tensors="pt")
            result = model.generate(**inputs)
            message_bot = tokenizer.decode(result[0], skip_special_tokens=True)
            st.session_state.history.append({"message": user_message, "is_user": True})
            st.session_state.history.append({"message": message_bot, "is_user": False})

        for chat in st.session_state.history:
            st_message(**chat)
        st.text_input("Talk to the bot", key="input_text", on_change=generate_answer)
        if st.button("Refresh/New Chat"):
            # Reset to an empty list, not None: the key stays present, so the
            # initialisation above would not run again and the loop and
            # append calls would fail on None on the next rerun.
            st.session_state.history = []
            
if __name__ == "__main__":
    main()