"""
#App: NLP App with Streamlit
Credits: Streamlit Team, Marc Skov Madsen(For Awesome-streamlit gallery)
Description
This is a Natural Language Processing (NLP) based application that is useful for basic NLP tasks such as the following:

+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy

+ Named Entity Recognition(NER)/Trigger word detection using SpaCy

+ Sentiment Analysis using TextBlob

+ Document/Text Summarization using Gensim/T5 both for Bangla Extractive and English Abstractive.

This is built with Streamlit Framework, an awesome framework for building ML and NLP tools.
Purpose
To perform basic and useful NLP tasks with Streamlit, Spacy, Textblob, and Gensim
"""
# Core Pkgs
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')

#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
import streamlit as st
st.set_page_config(page_title="Anomaly_Detection_Tool", layout="wide", initial_sidebar_state="expanded")
import torch
#from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
import docx2txt
from PIL import Image 
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image


# NLP Pkgs
from textblob import TextBlob 
import spacy
#from gensim.summarization import summarize
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
# Hugging Face Inference API endpoints and the bearer-token headers sent with
# them: API_URL/headers target the gpt2-large model, API_URL0/headers0 target
# csebuetnlp/mT5_multilingual_XLSum.
# NOTE(review): both bearer tokens are hard-coded in source; consider loading
# them from the environment or a secrets store and rotating these values.
API_URL = "https://api-inference.huggingface.co/models/gpt2-large"
headers = {"Authorization": "Bearer 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"}
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = {"Authorization": "Bearer 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"}

def query(payload, timeout=60):
    """Send ``payload`` to the endpoint at ``API_URL`` and return its JSON reply.

    Args:
        payload: JSON-serialisable request body, e.g. ``{"inputs": "some text"}``.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled endpoint would block
            the caller indefinitely.

    Returns:
        The decoded JSON body of the HTTP response.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()
	
# Module-level request: posts a fixed Eiffel Tower passage through query()
# every time this script is executed.
# NOTE(review): the resulting `output` is not read anywhere in this file
# (main() assigns its own local `output`), so this looks like leftover demo
# code that costs one network round trip per run; confirm before removing.
output = query({
	"inputs": "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.",
})
def read_pdf(file):
    """Concatenate the extracted text of every page of a PDF.

    Args:
        file: The PDF source handed straight to ``PdfFileReader``; main()
            passes the object returned by the Streamlit file uploader.

    Returns:
        A string that starts with a single space and then holds each page's
        extracted text followed by one space, in page order.
    """
    reader = PdfFileReader(file)
    # Collect the pieces first and join once at the end.
    pieces = [" "]
    for page_index in range(reader.numPages):
        pieces.append(reader.getPage(page_index).extractText())
        pieces.append(" ")
    return "".join(pieces)
# def read_pdf_with_pdfplumber(file):
#     all_page_text=" "
# #     all_page_text = ""
#     with pdfplumber.open(file) as pdf:
# 	    page = pdf.pages[0]
#     ge=page.to_image()
#     img = Image.open(ge)
#     img = img.save("img.png")
#     image_name = cv2.imread("img.png")
# # get co-ordinates to c
# #         return page.extract_text()
# #         get co-ordinates to cr  
# # # get co-ordinates to cr
#     text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
#     all_page_text += text + " " #page.extractText()
#     return all_page_text
# Page heading; this is a module-level call, so it runs before main() is invoked.
st.title("NLP APPLICATION")
#@st.cache_resource(experimental_allow_widgets=True)
@st.experimental_singleton
def text_analyzer(my_text):
    """Describe every spaCy token of ``my_text`` with its text and lemma.

    Args:
        my_text: The text to run through the ``en_core_web_sm`` pipeline.

    Returns:
        A list with one formatted string per token, each holding the token
        text and its lemma.
    """
    pipeline = spacy.load('en_core_web_sm')
    template = '"Token":{},\n"Lemma":{}'
    rows = []
    for tok in pipeline(my_text):
        rows.append(template.format(tok.text, tok.lemma_))
    return rows
#@st.cache_resource(experimental_allow_widgets=True)
# def load_models():
#     tokenizer = AutoTokenizer.from_pretrained('gpt2-large')
#     model = GPT2LMHeadModel.from_pretrained('gpt2-large')
#     return tokenizer, model

# Function For Extracting Entities
@st.experimental_singleton
def entity_analyzer(my_text):
    """List the tokens and named entities spaCy finds in ``my_text``.

    Args:
        my_text: The text to run through the ``en_core_web_sm`` pipeline.

    Returns:
        A one-element list whose single string shows the list of token texts
        and the list of ``(entity text, entity label)`` pairs.
    """
    parsed = spacy.load('en_core_web_sm')(my_text)
    words = []
    for tok in parsed:
        words.append(tok.text)
    labelled = [(ent.text, ent.label_) for ent in parsed.ents]
    report = '"Token":{},\n"Entities":{}'.format(words, labelled)
    return [report]
def _post_inference(url, auth_headers, payload, timeout=60):
    """POST ``payload`` as JSON to an inference endpoint and return the decoded reply."""
    # A timeout keeps a stalled endpoint from blocking the Streamlit run forever.
    response = requests.post(url, headers=auth_headers, json=payload, timeout=timeout)
    return response.json()


def _ocr_image_file(image_file, use_bangla):
    """Run Tesseract OCR on an uploaded or captured image and return its text."""
    # Round-trip through a PNG on disk so OpenCV can decode it into an array.
    # NOTE(review): the fixed file name is shared by every session; confirm
    # whether concurrent users can overwrite each other's image.
    Image.open(image_file).save("img.png")
    pixels = cv2.imread("img.png")
    if use_bangla:
        return pytesseract.image_to_string(pixels, lang="ben")
    return pytesseract.image_to_string(pixels)


def main():
    """Render the NLP app: collect text, then offer the analysis features.

    Text comes from (in priority order) an uploaded PDF, an uploaded image,
    a camera photo, or the free-text input. Once any input is present, the
    checkboxes for named entities, sentiment, spell correction, text
    generation and summarization are shown and applied to that text.
    """
    st.markdown("""
    	#### Description
    	##This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows:
+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy
+ Named Entity Recognition(NER)/Trigger word detection using SpaCy
+ Sentiment Analysis using TextBlob
+ Document/Text Summarization using T5 for English Abstractive.
    	""")

    def change_photo_state():
        # Widget callback: remember that a file or photo was supplied.
        st.session_state["photo"]="done"

    st.subheader("Please, feed your image/text, features/services will appear automatically!")
    message = st.text_input("Type your text here!")
    uploaded_photo = st.file_uploader("Upload your PDF",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state)
    camera_photo = st.camera_input("Take a photo, Containing English texts", on_change=change_photo_state)
    if "photo" not in st.session_state:
        st.session_state["photo"]="not done"

    # Nothing to analyse yet: no upload/photo event and no typed text.
    if st.session_state["photo"] != "done" and not message:
        return

    chars_per_line = 7 * 10  # rough number of characters treated as one line
    text = " "
    if uploaded_photo and uploaded_photo.type == 'application/pdf':
        tet = read_pdf(uploaded_photo)
        total_lines = len(tet) // chars_per_line
        values = st.slider('Select a approximate number of lines to see and summarize',value=[0, total_lines])
        start, end = values[0], values[1]
        if start != total_lines:
            text = tet[start * chars_per_line:end * chars_per_line]
        else:
            # Start sits on the last (partial) line: show the remainder.
            # The offset must be in characters, like the branch above; the
            # previous code sliced from the line count itself. For texts
            # shorter than one line this still yields the whole text.
            text = tet[start * chars_per_line:]
        st.success(text)
    elif uploaded_photo:
        use_bangla = st.checkbox("Mark for Bangla Image's Text")
        text = _ocr_image_file(uploaded_photo, use_bangla)
    elif camera_photo:
        # Ticking the box now selects Bangla OCR, matching the upload branch;
        # the two language branches were previously swapped here.
        use_bangla = st.checkbox("Mark for Bangla Image's Text")
        text = _ocr_image_file(camera_photo, use_bangla)
    elif uploaded_photo is None and camera_photo is None:
        text = message

    if st.checkbox("Show Named Entities English/Bangla"):
        st.json(entity_analyzer(text))

    if st.checkbox("Show Sentiment Analysis for English"):
        st.success(TextBlob(text).sentiment)
    if st.checkbox("Spell Corrections for English"):
        st.success(TextBlob(text).correct())
    if st.checkbox("Text Generation"):
        st.success(_post_inference(API_URL, headers, {"inputs": text}))
    if st.checkbox("Mark for Text Summarization!"):
        st.success(_post_inference(API_URL0, headers0, {"inputs": text}))
# Build the UI only when this file is executed as the main script, not when imported.
if __name__ == '__main__':
    main()