Update app.py (#1), opened by Soumen
app.py CHANGED
@@ -32,7 +32,7 @@ import docx2txt
|
|
32 |
from PIL import Image
|
33 |
from PyPDF2 import PdfFileReader
|
34 |
import pdfplumber
|
35 |
-
|
36 |
# NLP Pkgs
|
37 |
from textblob import TextBlob
|
38 |
import spacy
|
@@ -41,6 +41,7 @@ import requests
|
|
41 |
import cv2
|
42 |
import numpy as np
|
43 |
import pytesseract
|
|
|
44 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
45 |
from PIL import Image
|
46 |
def read_pdf(file):
|
@@ -49,14 +50,17 @@ def read_pdf(file):
|
|
49 |
all_page_text = ""
|
50 |
for i in range(count):
|
51 |
page = pdfReader.getPage(i)
|
52 |
-
|
|
|
|
|
|
|
53 |
|
54 |
return all_page_text
|
55 |
|
56 |
#def read_pdf_with_pdfplumber(file):
|
57 |
# with pdfplumber.open(file) as pdf:
|
58 |
# page = pdf.pages[0]
|
59 |
-
# return page.extract_text()
|
60 |
st.title("Streamlit NLP APP")
|
61 |
@st.experimental_singleton
|
62 |
def text_analyzer(my_text):
|
@@ -107,7 +111,18 @@ def main():
|
|
107 |
img = Image.open(uploaded_photo)
|
108 |
img = img.save("img.png")
|
109 |
img = cv2.imread("img.png")
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
st.success(text)
|
112 |
elif camera_photo:
|
113 |
img = Image.open(camera_photo)
|
|
|
32 |
from PIL import Image
|
33 |
from PyPDF2 import PdfFileReader
|
34 |
import pdfplumber
|
35 |
+
from line_cor import mark_region
|
36 |
# NLP Pkgs
|
37 |
from textblob import TextBlob
|
38 |
import spacy
|
|
|
41 |
import cv2
|
42 |
import numpy as np
|
43 |
import pytesseract
|
44 |
+
|
45 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
46 |
from PIL import Image
|
47 |
def read_pdf(file):
|
|
|
50 |
all_page_text = ""
|
51 |
for i in range(count):
|
52 |
page = pdfReader.getPage(i)
|
53 |
+
image_name = "Page_" + str(i) + ".jpg"
|
54 |
+
page.save(image_name, "JPEG")
|
55 |
+
text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
|
56 |
+
all_page_text += text + " " #page.extractText()
|
57 |
|
58 |
return all_page_text
|
59 |
|
60 |
#def read_pdf_with_pdfplumber(file):
|
61 |
# with pdfplumber.open(file) as pdf:
|
62 |
# page = pdf.pages[0]
|
63 |
+
# return page.extract_text()
|
64 |
st.title("Streamlit NLP APP")
|
65 |
@st.experimental_singleton
|
66 |
def text_analyzer(my_text):
|
|
|
111 |
img = Image.open(uploaded_photo)
|
112 |
img = img.save("img.png")
|
113 |
img = cv2.imread("img.png")
|
114 |
+
# get co-ordinates to crop the image
|
115 |
+
image, lc = mark_region(img)
|
116 |
+
c = lc[1]
|
117 |
+
# crop the image to the marked region, i.e. img = image[y0:y1, x0:x1]
|
118 |
+
img = image[c[0][1]:c[1][1], c[0][0]:c[1][0]]
|
119 |
+
plt.figure(figsize=(10,10))
|
120 |
+
plt.imshow(img)
|
121 |
+
# convert the image to black and white for better OCR
|
122 |
+
ret,thresh1 = cv2.threshold(img,120,255,cv2.THRESH_BINARY)
|
123 |
+
# run pytesseract OCR on the thresholded image to extract its text
|
124 |
+
text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
|
125 |
+
#text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
|
126 |
st.success(text)
|
127 |
elif camera_photo:
|
128 |
img = Image.open(camera_photo)
|