File size: 9,094 Bytes
9c37e72 dba2773 9c37e72 68f40bc d82d18a 9c37e72 6e58c44 4834995 bd18577 2f51bd6 c75cc74 ba45265 3df853f 9c37e72 09d4214 29e33a8 06dd768 09d4214 0842639 baf370a 9c37e72 1ecea99 fa73ddc 419e04c 9c37e72 b446f5c a07988a af0fae9 ba45265 29e33a8 9d5dc1c 29e33a8 1a9fa41 c7a7627 3f06691 c7a7627 37287e0 c7a7627 0975d28 c7a7627 37287e0 0975d28 b330918 c9df9a0 ec4347b 9d1426d b330918 ebcff05 b330918 0964e86 b330918 8eacb9c b33cce8 8eacb9c 2d1cbd4 cbcd024 c9df9a0 b330918 d960c1e b330918 2e3a776 b330918 ac35f7a b330918 ac35f7a b330918 ac35f7a 2e3a776 b330918 ac35f7a b330918 ac35f7a b330918 2e3a776 b330918 a2e9e23 b330918 a2e9e23 b330918 a2e9e23 d960c1e 9d1426d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
"""
#App: NLP App with Streamlit
Description
This is a Natural Language Processing (NLP) based application that is useful for
document/text summarization from Bangla images and English images/PDF files.
"""
# Core Pkgs
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')
#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
import streamlit as st
st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")
st.title("Chatbot and Bangla/English Text Summarizer: Upload Images/Pdf or input texts to summarize!")
import torch
import docx2txt
from PIL import Image
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
def _auth_headers(token):
    """Return bearer-token request headers for *token*, or no headers when it is empty."""
    return {"Authorization": f"Bearer {token}"} if token else {}


# Hugging Face Inference API endpoints and the headers sent with each request.
# Credentials must not live in source control: every token is read from the
# environment instead. ``HF_API_TOKEN`` is the shared default, and
# ``HF_API_TOKEN_0`` / ``_1`` / ``_2`` override it per endpoint. When no token
# is configured the request is sent without an Authorization header.
# NOTE(review): confirm unauthenticated calls are acceptable for this
# deployment, and revoke the tokens that were previously committed here.
_HF_DEFAULT_TOKEN = os.environ.get("HF_API_TOKEN", "")

# Multilingual (Bangla) summarization model.
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = _auth_headers(os.environ.get("HF_API_TOKEN_0", _HF_DEFAULT_TOKEN))

# English summarization model.
API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
headers1 = _auth_headers(os.environ.get("HF_API_TOKEN_1", _HF_DEFAULT_TOKEN))

# GPT-2 text generation model.
API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
headers2 = _auth_headers(os.environ.get("HF_API_TOKEN_2", _HF_DEFAULT_TOKEN))
@st.cache
def read_pdf(file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file: The PDF source handed to ``PdfFileReader`` (the caller passes
            the object returned by the Streamlit file uploader).

    Returns:
        A string that starts with a single space, followed by each page's
        extracted text with one trailing space per page.
    """
    pdfReader = PdfFileReader(file)
    count = pdfReader.numPages
    # Build the result with one join instead of repeated ``+=`` so that
    # large documents do not pay for re-copying the growing string.
    return " " + "".join(
        pdfReader.getPage(i).extractText() + " " for i in range(count)
    )
# def read_pdf_with_pdfplumber(file):
# # Open the uploaded PDF file with pdfplumber
# with pdfplumber.open(file) as pdf:
# extracted_text = ''
# for page in pdf.pages:
# extracted_text += page.extract_text()
# # Display the extracted text
# #st.text(extracted_text)
# return extracted_text
def engsum(output):
    """Summarize English text with the model at ``API_URL1`` and show the result.

    The text is posted as JSON to the inference endpoint. On success the
    ``generated_text`` of the first returned item is rendered with
    ``st.success``. Network failures, non-JSON replies and replies without a
    usable summary are reported on the page instead of being dropped silently.

    Args:
        output: The text to summarize.
    """
    # NOTE(review): the Inference API documents generation options under a
    # "parameters" key; confirm whether this top-level "min_length" is honoured.
    payload = {"inputs": output, "min_length": 300}
    try:
        # A timeout keeps a stalled endpoint from hanging the Streamlit run.
        response = requests.post(API_URL1, headers=headers1, json=payload, timeout=60)
        out = response.json()
    except (requests.RequestException, ValueError) as err:
        st.error(f"Summarization request failed: {err}")
        return

    # Guard against an empty list or non-dict items before indexing.
    first = out[0] if isinstance(out, list) and out else None
    if isinstance(first, dict) and first.get("generated_text"):
        st.success(first["generated_text"])
        return

    # Surface the service's own error message when it sends one.
    detail = out.get("error") if isinstance(out, dict) else None
    st.warning(f"No summary was returned. {detail}" if detail else "No summary was returned.")
def bansum(text):
    """Summarize Bangla text with the model at ``API_URL0`` and show the result.

    The text is posted as JSON to the inference endpoint. On success the
    ``summary_text`` of the first returned item is rendered with
    ``st.success``. Network failures, non-JSON replies and replies without a
    usable summary are reported on the page instead of being dropped silently.

    Args:
        text: The text to summarize.
    """
    # NOTE(review): the Inference API documents generation options under a
    # "parameters" key; confirm whether this top-level "min_length" is honoured.
    payload = {"inputs": text, "min_length": 300}
    try:
        # A timeout keeps a stalled endpoint from hanging the Streamlit run.
        response = requests.post(API_URL0, headers=headers0, json=payload, timeout=60)
        out = response.json()
    except (requests.RequestException, ValueError) as err:
        st.error(f"Summarization request failed: {err}")
        return

    # Guard against an empty list or non-dict items before indexing.
    first = out[0] if isinstance(out, list) and out else None
    if isinstance(first, dict) and first.get("summary_text"):
        st.success(first["summary_text"])
        return

    # Surface the service's own error message when it sends one.
    detail = out.get("error") if isinstance(out, dict) else None
    st.warning(f"No summary was returned. {detail}" if detail else "No summary was returned.")
# @st.cache
# def save(l):
# return l
#@st.cache
# Approximate number of characters treated as one "line" by the PDF range
# slider; the same scale is used for both ends of the selection.
_CHARS_PER_LINE = 70


def _load_image_bgr(image_file):
    """Decode an uploaded or captured image into a BGR ``numpy`` array in memory."""
    rgb_pixels = np.array(Image.open(image_file).convert("RGB"))
    # The OCR step previously received the result of ``cv2.imread`` (BGR
    # channel order); keep that order, but skip the round trip through a
    # fixed "img.png" file that concurrent sessions would overwrite.
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)


def _line_count(text, chars_per_line=_CHARS_PER_LINE):
    """Return how many approximate lines *text* spans (always at least 1)."""
    # Ceiling division so that a trailing partial line is still selectable.
    return max(1, -(-len(text) // chars_per_line))


def _select_lines(text, first_line, last_line, chars_per_line=_CHARS_PER_LINE):
    """Return the slice of *text* between two approximate line numbers."""
    return text[first_line * chars_per_line:last_line * chars_per_line]


def _render_pdf_summary(pdf_file):
    """Let the user pick a range of an uploaded PDF's text and summarize it."""
    tet = read_pdf(pdf_file)
    total_lines = _line_count(tet)
    # Explicit bounds: without ``max_value`` the slider is capped at its
    # default maximum, so long documents produced an out-of-range default.
    first_line, last_line = st.slider(
        'Select a approximate number of lines to see and summarize',
        min_value=0,
        max_value=total_lines,
        value=(0, total_lines),
    )
    text = _select_lines(tet, first_line, last_line)
    if st.button("English Pdf Summarize"):
        st.subheader("Selected text for summarize: ")
        st.success(text)
        st.subheader("Summarized Text: ")
        engsum(text)


def _render_ocr_summary(image_file, bengali_label, english_label):
    """OCR an image in the chosen language, then show the text and its summary."""
    img = _load_image_bgr(image_file)
    st.text("Select the summarization type:")
    bengali_col, english_col = st.columns([1, 7])
    if bengali_col.button(bengali_label):
        text = pytesseract.image_to_string(img, lang="ben")
        st.success(text)
        st.subheader("সারাংশ/সারমর্ম")
        bansum(text)
    if english_col.button(english_label):
        text = pytesseract.image_to_string(img)
        st.success(text)
        st.subheader("Summarized Text")
        engsum(text)


def _render_summarizer():
    """Render the text/camera/upload inputs and summarize whichever one is provided."""
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"
    if "camera_on" not in st.session_state:
        st.session_state["camera_on"] = False

    def change_photo_state():
        st.session_state["photo"] = "done"

    def start_camera():
        st.session_state["camera_on"] = True

    def stop_camera():
        st.session_state["camera_on"] = False

    with st.container():
        c1, c2, c3 = st.columns([2, 2, 1])
        message = c1.text_input("Type your text here!")

        # ``st.button`` is True only during the run triggered by its click,
        # so the camera widget used to vanish on the rerun caused by taking
        # a photo. Keep the on/off choice in session state instead.
        camera_photo = None
        c2.button("Capture", on_click=start_camera)
        if st.session_state["camera_on"]:
            camera_photo = c2.camera_input(
                "Capture a photo to summarize: ", on_change=change_photo_state
            )
        c2.button("Stop Camera", on_click=stop_camera)

        uploaded_photo = c3.file_uploader(
            "Upload your Images/PDF",
            type=['jpg', 'png', 'jpeg', 'pdf'],
            on_change=change_photo_state,
        )

        if not (st.session_state["photo"] == "done" or message):
            return

        if uploaded_photo and uploaded_photo.type == 'application/pdf':
            _render_pdf_summary(uploaded_photo)
        elif uploaded_photo:
            _render_ocr_summary(uploaded_photo, "BENGALI", "ENGLISH")
        elif camera_photo:
            _render_ocr_summary(camera_photo, "Bangla", "English")
        else:
            # No file or photo: summarize the typed text directly.
            bengali_col, english_col = st.columns([1, 7])
            if bengali_col.button("Bangla"):
                bansum(message)
            if english_col.button("English"):
                engsum(message)


def _render_chatbot():
    """Render the chatbot section and keep its conversation in session state."""
    with st.container():
        from streamlit_chat import message as st_message
        from transformers import BlenderbotTokenizer
        from transformers import BlenderbotForConditionalGeneration

        st.title("Chatbot!!!")

        @st.experimental_singleton
        def get_models():
            # it may be necessary for other frameworks to cache the model
            # seems pytorch keeps an internal state of the conversation
            model_name = "facebook/blenderbot-400M-distill"
            tokenizer = BlenderbotTokenizer.from_pretrained(model_name)
            model = BlenderbotForConditionalGeneration.from_pretrained(model_name)
            return tokenizer, model

        # Also recover from a stale ``None`` left in an existing session.
        if "history" not in st.session_state or st.session_state.history is None:
            st.session_state.history = []

        def generate_answer():
            user_message = st.session_state.input_text
            if not user_message.strip():
                # Nothing to answer; avoid logging empty turns.
                return
            tokenizer, model = get_models()
            inputs = tokenizer(user_message, return_tensors="pt")
            result = model.generate(**inputs)
            message_bot = tokenizer.decode(result[0], skip_special_tokens=True)
            st.session_state.history.append({"message": user_message, "is_user": True})
            st.session_state.history.append({"message": message_bot, "is_user": False})

        # A per-message key keeps two identical messages from colliding as
        # duplicate widgets.
        for index, chat in enumerate(st.session_state.history):
            st_message(**chat, key=f"chat_{index}")

        st.text_input("Talk to the bot", key="input_text", on_change=generate_answer)
        if st.button("Refresh/New Chat"):
            # Reset to an empty list, not ``None``: the history is iterated
            # and appended to on every later run.
            st.session_state.history = []


def main():
    """Render the page: the summarizer section followed by the chatbot section."""
    _render_summarizer()
    _render_chatbot()
if __name__ == "__main__":
main() |