Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

App Files Community

Update app.py

by Soumen - opened Nov 25, 2022

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+19

-4

Files changed (1) hide show

app.py +19 -4

app.py CHANGED Viewed

@@ -32,7 +32,7 @@ import docx2txt
 from PIL import Image
 from PyPDF2 import PdfFileReader
 import pdfplumber
 # NLP Pkgs
 from textblob import TextBlob
 import spacy
@@ -41,6 +41,7 @@ import requests
 import cv2
 import numpy as np
 import pytesseract
 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 def read_pdf(file):
@@ -49,14 +50,17 @@ def read_pdf(file):
 	all_page_text = ""
 	for i in range(count):
 		page = pdfReader.getPage(i)
-		all_page_text += page.extractText()
 	return all_page_text
 #def read_pdf_with_pdfplumber(file):
 #	with pdfplumber.open(file) as pdf:
 #	    page = pdf.pages[0]
-#	    return page.extract_text()s
 st.title("Streamlit NLP APP")
 @st.experimental_singleton
 def text_analyzer(my_text):
@@ -107,7 +111,18 @@ def main():
             img = Image.open(uploaded_photo)
             img = img.save("img.png")
             img = cv2.imread("img.png")
-            text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             st.success(text)
         elif camera_photo:
             img = Image.open(camera_photo)

 from PIL import Image
 from PyPDF2 import PdfFileReader
 import pdfplumber
+from line_cor import mark_region
 # NLP Pkgs
 from textblob import TextBlob
 import spacy
 import cv2
 import numpy as np
 import pytesseract
 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 def read_pdf(file):
 	all_page_text = ""
 	for i in range(count):
 		page = pdfReader.getPage(i)
+        image_name = "Page_" + str(i) + ".jpg"
+        page.save(image_name, "JPEG")
+        text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
+		all_page_text += text + " " #page.extractText()
 	return all_page_text
 #def read_pdf_with_pdfplumber(file):
 #	with pdfplumber.open(file) as pdf:
 #	    page = pdf.pages[0]
+#	    return page.extract_text()
 st.title("Streamlit NLP APP")
 @st.experimental_singleton
 def text_analyzer(my_text):
             img = Image.open(uploaded_photo)
             img = img.save("img.png")
             img = cv2.imread("img.png")
+            # get co-ordinates to crop the image
+            image, lc = mark_region(img)
+            c = lc[1]
+            # cropping image img = image[y0:y1, x0:x1]
+            img = image[c[0][1]:c[1][1], c[0][0]:c[1][0]]
+            plt.figure(figsize=(10,10))
+            plt.imshow(img)
+            # convert the image to black and white for better OCR
+            ret,thresh1 = cv2.threshold(img,120,255,cv2.THRESH_BINARY)
+            # pytesseract image to string to get results
+            text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
+            #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             st.success(text)
         elif camera_photo:
             img = Image.open(camera_photo)