Soumen committed on
Commit
dd55b25
1 Parent(s): 7b01ac0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -36
app.py CHANGED
@@ -50,38 +50,38 @@ import line_cor
50
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
51
  from PIL import Image
52
  @st.experimental_singleton
53
- # def read_pdf(file):
54
  # images=pdf2image.convert_from_path(file)
55
  # # print(type(images))
56
- # # pdfReader = PdfFileReader(file)
57
- # # count = pdfReader.numPages
58
- # all_page_text = ""
59
- # for page in images:
60
- # # page = pdfReader.getPage(i)
61
- # #img = Image.open(page)
62
- # img = Image.open(page)
63
- # img = img.save("img.png")
64
- # image_name = cv2.imread("img.png")
65
- # # get co-ordinates to cr
66
  # text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
67
- # all_page_text += text + " " #page.extractText()
68
- # return all_page_text
69
- def read_pdf_with_pdfplumber(file):
70
- all_page_text=" "
71
- # all_page_text = ""
72
- with pdfplumber.open(file) as pdf:
73
- page = pdf.pages[0]
74
- ge=page.to_image()
75
- img = Image.open(ge)
76
- img = img.save("img.png")
77
- image_name = cv2.imread("img.png")
78
- # get co-ordinates to c
79
- # return page.extract_text()
80
- # get co-ordinates to cr
81
- # # get co-ordinates to cr
82
- text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
83
- all_page_text += text + " " #page.extractText()
84
  return all_page_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  st.title("Streamlit NLP APP")
86
  @st.experimental_singleton
87
  def text_analyzer(my_text):
@@ -119,7 +119,7 @@ def main():
119
  st.subheader("Please, feed your image/text, features/services will appear automatically!")
120
  message = st.text_input("Type your text here!")
121
  camera_photo = st.camera_input("Take a photo, Containing English or Bangla texts", on_change=change_photo_state)
122
- uploaded_photo = st.file_uploader("Upload Image/PDF, Containing English or Bangla texts",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state)
123
  if "photo" not in st.session_state:
124
  st.session_state["photo"]="not done"
125
  if st.session_state["photo"]=="done" or message:
@@ -128,26 +128,26 @@ def main():
128
  #file = uploaded_photo.read() # Read the data
129
  #image_result = open(uploaded_photo.name, 'wb') # creates a writable image and later we can write the decoded result
130
  #image_result.write(file)
131
- text = read_pdf_with_pdfplumber(uploaded_photo)
132
  #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
133
  st.success(text)
134
  elif uploaded_photo.type != "application/image":
135
  img = Image.open(uploaded_photo)
136
  img = img.save("img.png")
137
- imge = cv2.imread("img.png")
138
  # get co-ordinates to crop the image
139
- imag, lc = line_cor.mark_region(imge)
140
  #st.success(*lc)
141
- c = lc
142
  # cropping image img = image[y0:y1, x0:x1]
143
  #imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
144
  #plt.figure(figsize=(10,10))
145
  # plt.imshow(img)
146
  # convert the image to black and white for better OCR
147
- ret,thresh1 = cv2.threshold(imge,120,255,cv2.THRESH_BINARY)
148
  # pytesseract image to string to get results
149
- text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
150
- #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
151
  st.success(text)
152
  elif camera_photo:
153
  img = Image.open(camera_photo)
 
50
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
51
  from PIL import Image
52
  @st.experimental_singleton
53
def read_pdf(file):
    """Return the text of every page in a PDF, concatenated in page order.

    Args:
        file: PDF source handed straight to ``PdfFileReader``.

    Returns:
        str: each page's ``extractText()`` output followed by a single
        space; an empty string when the reader reports zero pages.
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText look like
    # the legacy PyPDF2 reader API -- confirm against the file's imports.
    pdf_reader = PdfFileReader(file)

    # The loop header previously read `for i range(count):` (missing `in`),
    # a SyntaxError that prevented the module from being imported at all.
    return "".join(
        pdf_reader.getPage(page_index).extractText() + " "
        for page_index in range(pdf_reader.numPages)
    )
69
+ # def read_pdf_with_pdfplumber(file):
70
+ # all_page_text=" "
71
+ # # all_page_text = ""
72
+ # with pdfplumber.open(file) as pdf:
73
+ # page = pdf.pages[0]
74
+ # ge=page.to_image()
75
+ # img = Image.open(ge)
76
+ # img = img.save("img.png")
77
+ # image_name = cv2.imread("img.png")
78
+ # # get co-ordinates to c
79
+ # # return page.extract_text()
80
+ # # get co-ordinates to cr
81
+ # # # get co-ordinates to cr
82
+ # text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
83
+ # all_page_text += text + " " #page.extractText()
84
+ # return all_page_text
85
  st.title("Streamlit NLP APP")
86
  @st.experimental_singleton
87
  def text_analyzer(my_text):
 
119
  st.subheader("Please, feed your image/text, features/services will appear automatically!")
120
  message = st.text_input("Type your text here!")
121
  camera_photo = st.camera_input("Take a photo, Containing English or Bangla texts", on_change=change_photo_state)
122
+ uploaded_photo = st.file_uploader("Upload Bangla or English Image/ English PDF",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state)
123
  if "photo" not in st.session_state:
124
  st.session_state["photo"]="not done"
125
  if st.session_state["photo"]=="done" or message:
 
128
  #file = uploaded_photo.read() # Read the data
129
  #image_result = open(uploaded_photo.name, 'wb') # creates a writable image and later we can write the decoded result
130
  #image_result.write(file)
131
+ text = read_pdf(uploaded_photo)
132
  #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
133
  st.success(text)
134
  elif uploaded_photo.type != "application/image":
135
  img = Image.open(uploaded_photo)
136
  img = img.save("img.png")
137
+ img = cv2.imread("img.png")
138
  # get co-ordinates to crop the image
139
+ #imag, lc = line_cor.mark_region(imge)
140
  #st.success(*lc)
141
+ # c = lc
142
  # cropping image img = image[y0:y1, x0:x1]
143
  #imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
144
  #plt.figure(figsize=(10,10))
145
  # plt.imshow(img)
146
  # convert the image to black and white for better OCR
147
+ #ret,thresh1 = cv2.threshold(imge,120,255,cv2.THRESH_BINARY)
148
  # pytesseract image to string to get results
149
+ #text = str(pytesseract.image_to_string(img, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
150
+ text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
151
  st.success(text)
152
  elif camera_photo:
153
  img = Image.open(camera_photo)