Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

Soumen committed on Nov 25, 2022

Commit

2406036

•

1 Parent(s): baf370a

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -49,24 +49,31 @@ import pytesseract
 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 @st.experimental_singleton
-def read_pdf(file):
-    images=pdf2image.convert_from_bytes(file.read(),"rb")
-    #pdfReader = PdfFileReader(file)
-    #count = pdfReader.numPages
-    all_page_text = ""
-    for im in images:
-        #page = pdfReader.getPage(i)
-        img = Image.open(im)
-        img = img.save("img.png")
-        image_name = cv2.imread("img.png")
         # get co-ordinates to cr
         text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
         all_page_text += text + " " #page.extractText()
-    return all_page_text
-#def read_pdf_with_pdfplumber(file):
-#	with pdfplumber.open(file) as pdf:
-#	    page = pdf.pages[0]
-#	    return page.extract_text()
 st.title("Streamlit NLP APP")
 @st.experimental_singleton
 def text_analyzer(my_text):

 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 @st.experimental_singleton
+# def read_pdf(file):
+#     images=pdf2image.convert_from_bytes(file.read(),"rb")
+#     #pdfReader = PdfFileReader(file)
+#     #count = pdfReader.numPages
+#     all_page_text = ""
+#     for im in images:
+#         #page = pdfReader.getPage(i)
+#         img = Image.open(im)
+#         img = img.save("img.png")
+#         image_name = cv2.imread("img.png")
+#         # get co-ordinates to cr
+#         text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
+#         all_page_text += text + " " #page.extractText()
+#     return all_page_text
+def read_pdf_with_pdfplumber(file):
+	with pdfplumber.open(file) as pdf:
+	    page = pdf.pages[0]
+	    #return page.extract_text()
         # get co-ordinates to cr
+        #img = Image.open(im)
+        img = page.save("img.png")
+        image_name = cv2.imread("img.png")
+## get co-ordinates to cr
         text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
         all_page_text += text + " " #page.extractText()
 st.title("Streamlit NLP APP")
 @st.experimental_singleton
 def text_analyzer(my_text):