Soumen committed on
Commit
8c11fa3
·
1 Parent(s): ed0375d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -29
app.py CHANGED
@@ -25,6 +25,7 @@ import os
25
  #os.system('gunzip ben.traineddata.gz ')
26
  #os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
27
  #os.system('pip install -q pytesseract')
 
28
  import streamlit as st
29
  import torch
30
  from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
@@ -49,35 +50,35 @@ import line_cor
49
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
50
  from PIL import Image
51
  @st.experimental_singleton
52
- def read_pdf(file):
53
- images=pdf2image.convert_from_path(file)
54
- # print(type(images))
55
- # pdfReader = PdfFileReader(file)
56
- # count = pdfReader.numPages
57
- all_page_text = ""
58
- for page in images:
59
- # page = pdfReader.getPage(i)
60
- #img = Image.open(page)
61
- img = Image.open(page)
62
- img = img.save("img.png")
63
- image_name = cv2.imread("img.png")
64
- # get co-ordinates to cr
65
- text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
66
- all_page_text += text + " " #page.extractText()
67
- return all_page_text
68
  def read_pdf_with_pdfplumber(file):
69
- # all_page_text=" "
70
- # # all_page_text = ""
71
- # #with pdfplumber.open(file) as pdf:
72
- # # page = pdf.pages[0]
73
- # ge=page.to_image()
74
- # img = Image.open(ge)
75
- # img = img.save("img.png")
76
- # image_name = cv2.imread("img.png")
77
  # get co-ordinates to c
78
- #return page.extract_text()
79
- # get co-ordinates to cr
80
- ## get co-ordinates to cr
81
  text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
82
  all_page_text += text + " " #page.extractText()
83
  return all_page_text
@@ -139,11 +140,11 @@ def main():
139
  #st.success(*lc)
140
  c = lc
141
  # cropping image img = image[y0:y1, x0:x1]
142
- imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
143
  #plt.figure(figsize=(10,10))
144
  # plt.imshow(img)
145
  # convert the image to black and white for better OCR
146
- ret,thresh1 = cv2.threshold(imgg,120,255,cv2.THRESH_BINARY)
147
  # pytesseract image to string to get results
148
  text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
149
  #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
 
25
  #os.system('gunzip ben.traineddata.gz ')
26
  #os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
27
  #os.system('pip install -q pytesseract')
28
+ os.system('conda install -c conda-forge poppler')
29
  import streamlit as st
30
  import torch
31
  from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
 
50
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
51
  from PIL import Image
52
  @st.experimental_singleton
53
+ # def read_pdf(file):
54
+ # images=pdf2image.convert_from_path(file)
55
+ # # print(type(images))
56
+ # # pdfReader = PdfFileReader(file)
57
+ # # count = pdfReader.numPages
58
+ # all_page_text = ""
59
+ # for page in images:
60
+ # # page = pdfReader.getPage(i)
61
+ # #img = Image.open(page)
62
+ # img = Image.open(page)
63
+ # img = img.save("img.png")
64
+ # image_name = cv2.imread("img.png")
65
+ # # get co-ordinates to cr
66
+ # text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
67
+ # all_page_text += text + " " #page.extractText()
68
+ # return all_page_text
69
  def read_pdf_with_pdfplumber(file):
70
+ all_page_text=" "
71
+ # all_page_text = ""
72
+ with pdfplumber.open(file) as pdf:
73
+ page = pdf.pages[0]
74
+ ge=page.to_image()
75
+ img = Image.open(ge)
76
+ img = img.save("img.png")
77
+ image_name = cv2.imread("img.png")
78
  # get co-ordinates to c
79
+ # return page.extract_text()
80
+ # get co-ordinates to cr
81
+ # # get co-ordinates to cr
82
  text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
83
  all_page_text += text + " " #page.extractText()
84
  return all_page_text
 
140
  #st.success(*lc)
141
  c = lc
142
  # cropping image img = image[y0:y1, x0:x1]
143
+ #imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
144
  #plt.figure(figsize=(10,10))
145
  # plt.imshow(img)
146
  # convert the image to black and white for better OCR
147
+ ret,thresh1 = cv2.threshold(imge,120,255,cv2.THRESH_BINARY)
148
  # pytesseract image to string to get results
149
  text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
150
  #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)