Update app.py
app.py CHANGED
@@ -25,6 +25,7 @@ import os
 #os.system('gunzip ben.traineddata.gz ')
 #os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
 #os.system('pip install -q pytesseract')
+os.system('conda install -c conda-forge poppler')
 import streamlit as st
 import torch
 from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
@@ -49,35 +50,35 @@ import line_cor
 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 @st.experimental_singleton
-def read_pdf(file):
-    images=pdf2image.convert_from_path(file)
-    # print(type(images))
-    # pdfReader = PdfFileReader(file)
-    # count = pdfReader.numPages
-    all_page_text = ""
-    for page in images:
-        # page = pdfReader.getPage(i)
-        #img = Image.open(page)
-        img = Image.open(page)
-        img = img.save("img.png")
-        image_name = cv2.imread("img.png")
-        # get co-ordinates to cr
-        text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
-        all_page_text += text + " " #page.extractText()
-    return all_page_text
+# def read_pdf(file):
+#     images=pdf2image.convert_from_path(file)
+#     # print(type(images))
+#     # pdfReader = PdfFileReader(file)
+#     # count = pdfReader.numPages
+#     all_page_text = ""
+#     for page in images:
+#         # page = pdfReader.getPage(i)
+#         #img = Image.open(page)
+#         img = Image.open(page)
+#         img = img.save("img.png")
+#         image_name = cv2.imread("img.png")
+#         # get co-ordinates to cr
+#         text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
+#         all_page_text += text + " " #page.extractText()
+#     return all_page_text
 def read_pdf_with_pdfplumber(file):
+    all_page_text=" "
+    # all_page_text = ""
+    with pdfplumber.open(file) as pdf:
+        page = pdf.pages[0]
+        ge=page.to_image()
+        img = Image.open(ge)
+        img = img.save("img.png")
+        image_name = cv2.imread("img.png")
     # get co-ordinates to c
+    # return page.extract_text()
+    # get co-ordinates to cr
+    # # get co-ordinates to cr
     text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
     all_page_text += text + " " #page.extractText()
     return all_page_text
@@ -139,11 +140,11 @@ def main():
     #st.success(*lc)
     c = lc
     # cropping image img = image[y0:y1, x0:x1]
-    imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
+    #imgg = imge[c[0][1]:c[1][1], c[0][0]:c[1][0]]
     #plt.figure(figsize=(10,10))
     # plt.imshow(img)
     # convert the image to black and white for better OCR
-    ret,thresh1 = cv2.threshold(
+    ret,thresh1 = cv2.threshold(imge,120,255,cv2.THRESH_BINARY)
     # pytesseract image to string to get results
     text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
     #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
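The added os.system('conda install -c conda-forge poppler') line pulls in poppler, the system library that pdf2image needs to rasterize PDF pages, although the pdf2image-based read_pdf is commented out by this commit. In the new read_pdf_with_pdfplumber body, page.to_image() returns a pdfplumber PageImage object rather than a path or file object, so handing it to PIL's Image.open() is likely to fail at runtime; the PageImage can instead be written to disk with its own save() method. Below is a minimal sketch of that flow, assuming pdfplumber, OpenCV, pytesseract and the Bengali traineddata are installed; the resolution value and the single-page limit are illustrative choices, not taken from the commit, and the st.checkbox language toggle is left out since widgets inside a cached singleton function can behave unexpectedly.

import cv2
import pdfplumber
import pytesseract

def read_pdf_with_pdfplumber(file):
    all_page_text = ""
    with pdfplumber.open(file) as pdf:
        page = pdf.pages[0]
        # to_image() returns a pdfplumber PageImage; save it to disk
        # instead of handing the object to PIL.Image.open()
        page.to_image(resolution=150).save("img.png")
    image_name = cv2.imread("img.png")
    # ben.traineddata must be present in the tessdata directory
    text = pytesseract.image_to_string(image_name, lang="ben")
    all_page_text += text + " "
    return all_page_text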
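In main(), the cropped imgg line is now commented out and cv2.threshold is applied to the full imge array; on a colour image, THRESH_BINARY thresholds each channel independently, and OCR preprocessing usually converts to grayscale first. A small sketch of that step follows, assuming imge is a BGR image loaded with OpenCV; the ocr_region name and the bangla flag are illustrative, not code from the commit.

import cv2
import pytesseract

def ocr_region(imge, bangla=False):
    # convert to a single-channel image before binarizing
    gray = cv2.cvtColor(imge, cv2.COLOR_BGR2GRAY)
    ret, thresh1 = cv2.threshold(gray, 120, 255, cv2.THRESH_BINARY)
    # --psm 6 treats the input as a single uniform block of text
    lang = "ben" if bangla else "eng"
    return str(pytesseract.image_to_string(thresh1, config="--psm 6", lang=lang))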