"""
#App: NLP App with Streamlit
Description
This is a Natural Language Processing(NLP) base Application that is useful for 
Document/Text Summarization from Bangla images and English Images/PDF files.
"""
# Core Pkgs
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')

#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
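# The commented commands above list the system-level dependencies this app expects:
# the Tesseract OCR engine with its English (eng) and Bengali (ben) language data,
# plus poppler, which pdf2image requires to rasterize PDF pages.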
import streamlit as st
st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")
st.title("Bangla and English Summarizer: Upload Images/Pdf or input texts to summarize!")
import torch
import docx2txt
from PIL import Image 
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = {"Authorization": "Bearer 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"}
API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
headers1 = {"Authorization": "Bearer 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"}
API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
headers2 = {"Authorization": "Bearer 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"}
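# Hosted inference endpoints: API_URL0 (mT5 XLSum) produces the Bangla summaries,
# API_URL1 (T5 base) produces the English summaries, and API_URL2 (GPT-2) is only
# referenced by the commented-out text-generation block at the end of the file.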
@st.cache
def read_pdf(file):
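    """Extract the plain text of every page from an uploaded PDF.

    Note: this uses the legacy PyPDF2 (<3.0) API (PdfFileReader/getPage/extractText).
    """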
#     images=pdf2image.convert_from_path(file)
#     # print(type(images))
    pdfReader = PdfFileReader(file)
    count = pdfReader.numPages
    all_page_text = " "
    for i in range(count):
        page = pdfReader.getPage(i)
        all_page_text += page.extractText()+" "
    return all_page_text
# def read_pdf_with_pdfplumber(file):
#     # Open the uploaded PDF file with pdfplumber
#     with pdfplumber.open(file) as pdf:
#         extracted_text = ''
#         for page in pdf.pages:
#             extracted_text += page.extract_text()

#     # Display the extracted text
#     #st.text(extracted_text)
#     return extracted_text
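# Both summarizers POST a JSON payload to the Hugging Face Inference API;
# generation options such as min_length are nested under the "parameters" key.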

def engsum(output):
    def query(payload):
        response = requests.post(API_URL1, headers=headers1, json=payload)
        return response.json()

    out = query({
        "inputs": output,
        "parameters": {"min_length": 300},
    })
    if isinstance(out, list) and out[0].get("generated_text"):
        text_output = out[0]["generated_text"]
        st.success(text_output)
    else:
        st.warning("No summary was returned (the model may still be loading); please try again.")
def bansum(text):
    def query(payload):
        response = requests.post(API_URL0, headers=headers0, json=payload)
        return response.json()

    out = query({"inputs": text, "parameters": {"min_length": 300}})
    if isinstance(out, list) and out[0].get("summary_text"):
        text_output = out[0]["summary_text"]
        st.success(text_output)
    else:
        st.warning("No summary was returned (the model may still be loading); please try again.")
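# The "photo" flag in session_state records whether the user has supplied an image or
# PDF (upload or camera) so the branches below know when to run OCR / text extraction.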
    
if "photo" not in st.session_state:
    st.session_state["photo"]="not done"
c2, c3 = st.columns([2,1])
def change_photo_state():
    st.session_state["photo"]="done"
message = st.text_input("Type your text here!")
camera_photo = c2.camera_input("Capture a photo to summarize: ", on_change=change_photo_state)
uploaded_photo = c3.file_uploader("Upload your Images/PDF",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state)
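# Three input paths: typed text, a camera capture, or an uploaded image/PDF. PDFs go
# through read_pdf(), images go through Tesseract OCR, and typed text is summarized directly.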
if st.session_state["photo"]=="done" or message:
    if uploaded_photo and uploaded_photo.type=='application/pdf':
        tet = read_pdf(uploaded_photo)
        # with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        #     temp_file.write(uploaded_photo.read())
        #     temp_file_path = temp_file.name
            
        # loader = PyPDFLoader(temp_file_path)
        # if loader:
        #     text.extend(loader.load())
        #     os.remove(temp_file_path)
        # text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len)
        # text_chunks = text_splitter.split_documents(text)
        # Each slider unit corresponds to roughly 700 characters of extracted text.
        values = st.slider('Select an approximate range of lines to view and summarize', value=[0, len(tet)//700])
        start, end = values[0]*700, values[1]*700
        text = tet[start:end] if end > start else tet[start:]
        #st.success(type(text_chunks))
        if st.button("English Pdf Summarize"):
            st.subheader("Selected text for summarize: ")
            st.success(text)
            st.subheader("Summarized Text: ")
            engsum(text)
    
    elif uploaded_photo and uploaded_photo.type !='application/pdf':
        # Round-trip the upload through disk so OpenCV/pytesseract receive a NumPy image.
        img = Image.open(uploaded_photo)
        img.save("img.png")
        img = cv2.imread("img.png")
        st.text("Select the summarization type:")
        c4, c5 = st.columns([1,1])
        if c4.button("BENGALI"):
            text =  pytesseract.image_to_string(img, lang="ben")
            st.subheader("সারাংশ/সারমর্ম")
            bansum(text)
        if c5.button("ENGLISH"): 
            text=pytesseract.image_to_string(img)
            st.subheader("Summarized Text")
            engsum(text)
        #st.success(text)
    elif camera_photo:
        # Round-trip the capture through disk so OpenCV/pytesseract receive a NumPy image.
        img = Image.open(camera_photo)
        img.save("img.png")
        img = cv2.imread("img.png")
        #text = pytesseract.image_to_string(img) if st.checkbox("Bangla") else pytesseract.image_to_string(img, lang="ben")
        st.text("Select the summarization type:")
        c6, c7 = st.columns([1,1])
        if c6.button("Bangla"):
            text =  pytesseract.image_to_string(img, lang="ben")
            st.subheader("সারাংশ/সারমর্ম")
            bansum(text)
        if c7.button("English"): 
            text=pytesseract.image_to_string(img)
            st.subheader("Summarized Text")
            engsum(text)
    else:
        text = message
        c8, c9 = st.columns([1,1])
        if c8.button("Bangla"):
            bansum(text)
        if c9.button("English"): 
            engsum(text)  
    # if st.button("English Text Generation"): 
    #     def query(payload):
    #     	response = requests.post(API_URL2, headers=headers2, json=payload)
    #     	return response.json()
            
    #     out = query({
    #     	"inputs": text,
    #     })
    #     if isinstance(out, list) and out[0].get("generated_text"):
    #         text_output = out[0]["generated_text"]
    #         st.success(text_output)
    #         #text=text_output