Update app.py
app.py
CHANGED
@@ -50,38 +50,38 @@ import line_cor
 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 @st.experimental_singleton
-
+def read_pdf(file):
 # images=pdf2image.convert_from_path(file)
 # # print(type(images))
-
-
-
-
-
-#
-#
-#
-#
-#
+    pdfReader = PdfFileReader(file)
+    count = pdfReader.numPages
+    all_page_text = ""
+    for i in range(count):
+        page = pdfReader.getPage(i)
+        # img = Image.open(page)
+        # img = Image.open(page)
+        # img = img.save("img.png")
+        # image_name = cv2.imread("img.png")
+        # # get co-ordinates to cr
 # text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
-
-# return all_page_text
-def read_pdf_with_pdfplumber(file):
-    all_page_text=" "
-    # all_page_text = ""
-    with pdfplumber.open(file) as pdf:
-        page = pdf.pages[0]
-        ge=page.to_image()
-        img = Image.open(ge)
-        img = img.save("img.png")
-        image_name = cv2.imread("img.png")
-        # get co-ordinates to c
-        # return page.extract_text()
-        # get co-ordinates to cr
-        # # get co-ordinates to cr
-        text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
-        all_page_text += text + " " #page.extractText()
+        all_page_text += page.extractText()+" "
     return all_page_text
+# def read_pdf_with_pdfplumber(file):
+#     all_page_text=" "
+#     # all_page_text = ""
+#     with pdfplumber.open(file) as pdf:
+#         page = pdf.pages[0]
+#         ge=page.to_image()
+#         img = Image.open(ge)
+#         img = img.save("img.png")
+#         image_name = cv2.imread("img.png")
+#         # get co-ordinates to c
+#         # return page.extract_text()
+#         # get co-ordinates to cr
+#         # # get co-ordinates to cr
+#         text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
+#         all_page_text += text + " " #page.extractText()
+#     return all_page_text
 st.title("Streamlit NLP APP")
 @st.experimental_singleton
 def text_analyzer(my_text):
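Note on the hunk above: the new read_pdf() helper replaces the pdfplumber-based function with the legacy PyPDF2 names (PdfFileReader, numPages, getPage, extractText), which current releases of pypdf/PyPDF2 have renamed. As a point of reference only, a minimal sketch of the same per-page text extraction with the modern pypdf API could look like the following; the pypdf import and the helper name read_pdf_pypdf are assumptions, not part of this commit.

# Sketch only: the idea of read_pdf() expressed with the current pypdf API.
from pypdf import PdfReader   # assumes pypdf is installed

def read_pdf_pypdf(file):
    reader = PdfReader(file)                  # accepts a path or a file-like object such as an UploadedFile
    all_page_text = ""
    for page in reader.pages:                 # replaces numPages / getPage(i)
        all_page_text += (page.extract_text() or "") + " "   # replaces extractText()
    return all_page_text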
@@ -119,7 +119,7 @@ def main():
     st.subheader("Please, feed your image/text, features/services will appear automatically!")
     message = st.text_input("Type your text here!")
     camera_photo = st.camera_input("Take a photo, Containing English or Bangla texts", on_change=change_photo_state)
-    uploaded_photo = st.file_uploader("Upload
+    uploaded_photo = st.file_uploader("Upload Bangla or English Image/ English PDF",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state)
     if "photo" not in st.session_state:
         st.session_state["photo"]="not done"
     if st.session_state["photo"]=="done" or message:
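The widened uploader above now accepts jpg/png/jpeg images as well as PDFs, and the later branches decide what to do by inspecting uploaded_photo.type, which Streamlit fills with the browser-reported MIME type. A small sketch of that dispatch, assuming the standard MIME names (application/pdf, image/png, image/jpeg); handle_pdf and handle_image are hypothetical stand-ins for the app's own branches, not functions from this commit.

# Sketch only: dispatch on the MIME type of a Streamlit UploadedFile.
def dispatch_upload(uploaded_photo):
    if uploaded_photo is None:
        return None
    if uploaded_photo.type == "application/pdf":
        return handle_pdf(uploaded_photo)      # hypothetical, e.g. read_pdf(uploaded_photo)
    if uploaded_photo.type in ("image/png", "image/jpeg"):
        return handle_image(uploaded_photo)    # hypothetical, e.g. OCR with pytesseract
    return None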
@@ -128,26 +128,26 @@ def main():
             #file = uploaded_photo.read() # Read the data
             #image_result = open(uploaded_photo.name, 'wb') # creates a writable image and later we can write the decoded result
             #image_result.write(file)
-            text =
+            text = read_pdf(uploaded_photo)
             #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             st.success(text)
         elif uploaded_photo.type != "application/image":
             img = Image.open(uploaded_photo)
             img = img.save("img.png")
-
+            img = cv2.imread("img.png")
             # get co-ordinates to crop the image
-            imag, lc = line_cor.mark_region(imge)
+            #imag, lc = line_cor.mark_region(imge)
             #st.success(*lc)
-
+            # c = lc
             # cropping image img = image[y0:y1, x0:x1]
             #imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
             #plt.figure(figsize=(10,10))
             # plt.imshow(img)
             # convert the image to black and white for better OCR
-            ret,thresh1 = cv2.threshold(imge,120,255,cv2.THRESH_BINARY)
+            #ret,thresh1 = cv2.threshold(imge,120,255,cv2.THRESH_BINARY)
             # pytesseract image to string to get results
-            text = str(pytesseract.image_to_string(
-
+            #text = str(pytesseract.image_to_string(img, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
+            text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             st.success(text)
         elif camera_photo:
             img = Image.open(camera_photo)
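In the hunk above, the image branch now runs pytesseract directly on the cv2.imread result and leaves the line_cor cropping and the cv2.threshold step commented out; one of the commented-out lines sketches a binarised, --psm 6 variant. A hedged sketch of that preprocessing path, assuming a BGR image saved to disk as in the branch; the grayscale conversion and the function name ocr_with_threshold are additions for illustration, not from the commit.

# Sketch only: the thresholded OCR variant that the commented-out lines describe.
import cv2
import pytesseract

def ocr_with_threshold(path="img.png", bangla=False):
    img = cv2.imread(path)                                   # BGR image, as saved in the branch above
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)             # grayscale before binarising (assumption)
    _, thresh1 = cv2.threshold(gray, 120, 255, cv2.THRESH_BINARY)
    lang = "ben" if bangla else "eng"
    return pytesseract.image_to_string(thresh1, lang=lang, config="--psm 6")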
|