Soumen committed on
Commit
2406036
1 Parent(s): baf370a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -15
app.py CHANGED
@@ -49,24 +49,31 @@ import pytesseract
49
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
50
  from PIL import Image
51
  @st.experimental_singleton
52
- def read_pdf(file):
53
- images=pdf2image.convert_from_bytes(file.read(),"rb")
54
- #pdfReader = PdfFileReader(file)
55
- #count = pdfReader.numPages
56
- all_page_text = ""
57
- for im in images:
58
- #page = pdfReader.getPage(i)
59
- img = Image.open(im)
60
- img = img.save("img.png")
61
- image_name = cv2.imread("img.png")
 
 
 
 
 
 
 
 
62
  # get co-ordinates to cr
 
 
 
 
63
  text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
64
  all_page_text += text + " " #page.extractText()
65
- return all_page_text
66
- #def read_pdf_with_pdfplumber(file):
67
- # with pdfplumber.open(file) as pdf:
68
- # page = pdf.pages[0]
69
- # return page.extract_text()
70
  st.title("Streamlit NLP APP")
71
  @st.experimental_singleton
72
  def text_analyzer(my_text):
 
49
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
50
  from PIL import Image
51
  @st.experimental_singleton
52
+ # def read_pdf(file):
53
+ # images=pdf2image.convert_from_bytes(file.read(),"rb")
54
+ # #pdfReader = PdfFileReader(file)
55
+ # #count = pdfReader.numPages
56
+ # all_page_text = ""
57
+ # for im in images:
58
+ # #page = pdfReader.getPage(i)
59
+ # img = Image.open(im)
60
+ # img = img.save("img.png")
61
+ # image_name = cv2.imread("img.png")
62
+ # # get co-ordinates to cr
63
+ # text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
64
+ # all_page_text += text + " " #page.extractText()
65
+ # return all_page_text
66
+ def read_pdf_with_pdfplumber(file):
67
+ with pdfplumber.open(file) as pdf:
68
+ page = pdf.pages[0]
69
+ #return page.extract_text()
70
  # get co-ordinates to cr
71
+ #img = Image.open(im)
72
+ img = page.save("img.png")
73
+ image_name = cv2.imread("img.png")
74
+ ## get co-ordinates to cr
75
  text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
76
  all_page_text += text + " " #page.extractText()
 
 
 
 
 
77
  st.title("Streamlit NLP APP")
78
  @st.experimental_singleton
79
  def text_analyzer(my_text):