Update app.py
Browse files
@@ -32,7 +32,7 @@ import docx2txt
32 |
from PIL import Image
33 |
from PyPDF2 import PdfFileReader
34 |
import pdfplumber
35 |
36 |
# NLP packages
37 |
from textblob import TextBlob
38 |
import spacy
@@ -41,6 +41,7 @@ import requests
41 |
import cv2
42 |
import numpy as np
43 |
import pytesseract
44 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
45 |
from PIL import Image
46 |
def read_pdf(file):
@@ -49,14 +50,17 @@ def read_pdf(file):
49 |
all_page_text = ""
50 |
for i in range(count):
51 |
page = pdfReader.getPage(i)
52 |
53 |
54 |
return all_page_text
55 |
56 |
#def read_pdf_with_pdfplumber(file):
57 |
# with pdfplumber.open(file) as pdf:
58 |
# page = pdf.pages[0]
59 |
# return page.extract_text()
60 |
# Module-level call that sets the app's title text
# (NOTE(review): `st` is presumably the Streamlit module; its import is not visible here).
st.title("Streamlit NLP APP")
61 |
62 |
def text_analyzer(my_text):
@@ -107,7 +111,18 @@ def main():
107 |
img = Image.open(uploaded_photo)
108 |
img = img.save("img.png")
109 |
img = cv2.imread("img.png")
110 |
111 |
112 |
elif camera_photo:
113 |
img = Image.open(camera_photo)
32 |
from PIL import Image
33 |
from PyPDF2 import PdfFileReader
34 |
import pdfplumber
35 |
from line_cor import mark_region
36 |
# NLP packages
37 |
from textblob import TextBlob
38 |
import spacy
41 |
import cv2
42 |
import numpy as np
43 |
import pytesseract
44 |
45 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
46 |
from PIL import Image
47 |
def read_pdf(file):
50 |
all_page_text = ""
51 |
for i in range(count):
52 |
page = pdfReader.getPage(i)
53 |
image_name = "Page_" + str(i) + ".jpg"
54 |
page.save(image_name, "JPEG")
55 |
text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
56 |
all_page_text += text + " " #page.extractText()
57 |
58 |
return all_page_text
59 |
60 |
#def read_pdf_with_pdfplumber(file):
61 |
# with pdfplumber.open(file) as pdf:
62 |
# page = pdf.pages[0]
63 |
# return page.extract_text()
64 |
# Module-level call that sets the app's title text
# (NOTE(review): `st` is presumably the Streamlit module; its import is not visible here).
st.title("Streamlit NLP APP")
65 |
66 |
def text_analyzer(my_text):
111 |
img = Image.open(uploaded_photo)
112 |
img = img.save("img.png")
113 |
img = cv2.imread("img.png")
114 |
# Get the coordinates of the region to crop from the image
115 |
image, lc = mark_region(img)
116 |
c = lc[1]
117 |
# Crop the image: img = image[y0:y1, x0:x1] (rows first, then columns)
118 |
img = image[c[0][1]:c[1][1], c[0][0]:c[1][0]]
119 |
120 |
121 |
# Binarize the image (black and white) for better OCR results
122 |
ret,thresh1 = cv2.threshold(img,120,255,cv2.THRESH_BINARY)
123 |
# Run pytesseract's image_to_string on the thresholded image to extract the text
124 |
text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
125 |
#text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
126 |
127 |
elif camera_photo:
128 |
img = Image.open(camera_photo)