Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

App Files Files Community

Soumen committed on Nov 25, 2022

Commit

dd55b25

1 Parent(s): 7b01ac0

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -36

app.py CHANGED Viewed

@@ -50,38 +50,38 @@ import line_cor
 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 @st.experimental_singleton
-# def read_pdf(file):
 #     images=pdf2image.convert_from_path(file)
 #     # print(type(images))
-#     # pdfReader = PdfFileReader(file)
-#     # count = pdfReader.numPages
-#     all_page_text = ""
-#     for page in images:
-#        # page = pdfReader.getPage(i)
-#         #img = Image.open(page)
-#         img = Image.open(page)
-#         img = img.save("img.png")
-#         image_name = cv2.imread("img.png")
-#         # get co-ordinates to cr
 #         text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
-#         all_page_text += text + " " #page.extractText()
-#     return all_page_text
-def read_pdf_with_pdfplumber(file):
-    all_page_text=" "
-#     all_page_text = ""
-    with pdfplumber.open(file) as pdf:
-	    page = pdf.pages[0]
-    ge=page.to_image()
-    img = Image.open(ge)
-    img = img.save("img.png")
-    image_name = cv2.imread("img.png")
-# get co-ordinates to c
-#         return page.extract_text()
-#         get co-ordinates to cr
-# # get co-ordinates to cr
-    text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
-    all_page_text += text + " " #page.extractText()
     return all_page_text
 st.title("Streamlit NLP APP")
 @st.experimental_singleton
 def text_analyzer(my_text):
@@ -119,7 +119,7 @@ def main():
     st.subheader("Please, feed your image/text, features/services will appear automatically!")
     message = st.text_input("Type your text here!")
     camera_photo = st.camera_input("Take a photo, Containing English or Bangla texts", on_change=change_photo_state)
-    uploaded_photo = st.file_uploader("Upload Image/PDF, Containing English or Bangla texts",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state)
     if "photo" not in st.session_state:
         st.session_state["photo"]="not done"
     if st.session_state["photo"]=="done" or message:
@@ -128,26 +128,26 @@ def main():
             #file = uploaded_photo.read() # Read the data
             #image_result = open(uploaded_photo.name, 'wb') # creates a writable image and later we can write the decoded result
             #image_result.write(file)
-            text = read_pdf_with_pdfplumber(uploaded_photo)
             #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             st.success(text)
         elif uploaded_photo.type != "application/image":
             img = Image.open(uploaded_photo)
             img = img.save("img.png")
-            imge = cv2.imread("img.png")
             # get co-ordinates to crop the image
-            imag, lc = line_cor.mark_region(imge)
             #st.success(*lc)
-            c = lc
             # cropping image img = image[y0:y1, x0:x1]
             #imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
             #plt.figure(figsize=(10,10))
            # plt.imshow(img)
             # convert the image to black and white for better OCR
-            ret,thresh1 = cv2.threshold(imge,120,255,cv2.THRESH_BINARY)
             # pytesseract image to string to get results
-            text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
-            #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             st.success(text)
         elif camera_photo:
             img = Image.open(camera_photo)

 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 @st.experimental_singleton
+def read_pdf(file):  # Return the extracted text of every PDF page, each followed by a space.
 #     images=pdf2image.convert_from_path(file)
 #     # print(type(images))
+    pdfReader = PdfFileReader(file)
+    count = pdfReader.numPages
+    all_page_text = ""
+    for i in range(count):
+        page = pdfReader.getPage(i)
+        # img = Image.open(page)
+        # img = Image.open(page)
+        # img = img.save("img.png")
+        # image_name = cv2.imread("img.png")
+        # # get co-ordinates to cr
 #         text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
+        all_page_text += page.extractText()+" "
     return all_page_text
+# def read_pdf_with_pdfplumber(file):
+#     all_page_text=" "
+# #     all_page_text = ""
+#     with pdfplumber.open(file) as pdf:
+# 	    page = pdf.pages[0]
+#     ge=page.to_image()
+#     img = Image.open(ge)
+#     img = img.save("img.png")
+#     image_name = cv2.imread("img.png")
+# # get co-ordinates to c
+# #         return page.extract_text()
+# #         get co-ordinates to cr
+# # # get co-ordinates to cr
+#     text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
+#     all_page_text += text + " " #page.extractText()
+#     return all_page_text
 st.title("Streamlit NLP APP")
 @st.experimental_singleton
 def text_analyzer(my_text):
     st.subheader("Please, feed your image/text, features/services will appear automatically!")
     message = st.text_input("Type your text here!")
     camera_photo = st.camera_input("Take a photo, Containing English or Bangla texts", on_change=change_photo_state)
+    uploaded_photo = st.file_uploader("Upload Bangla or English Image/ English PDF",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state)
     if "photo" not in st.session_state:
         st.session_state["photo"]="not done"
     if st.session_state["photo"]=="done" or message:
             #file = uploaded_photo.read() # Read the data
             #image_result = open(uploaded_photo.name, 'wb') # creates a writable image and later we can write the decoded result
             #image_result.write(file)
+            text = read_pdf(uploaded_photo)
             #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             st.success(text)
         elif uploaded_photo.type != "application/image":
             img = Image.open(uploaded_photo)
             img = img.save("img.png")
+            img = cv2.imread("img.png")
             # get co-ordinates to crop the image
+            #imag, lc = line_cor.mark_region(imge)
             #st.success(*lc)
+           # c = lc
             # cropping image img = image[y0:y1, x0:x1]
             #imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
             #plt.figure(figsize=(10,10))
            # plt.imshow(img)
             # convert the image to black and white for better OCR
+            #ret,thresh1 = cv2.threshold(imge,120,255,cv2.THRESH_BINARY)
             # pytesseract image to string to get results
+            #text = str(pytesseract.image_to_string(img, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
+            text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             st.success(text)
         elif camera_photo:
             img = Image.open(camera_photo)