Update app.py
Browse files
app.py
CHANGED
@@ -31,8 +31,10 @@ from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
|
|
31 |
import docx2txt
|
32 |
from PIL import Image
|
33 |
from PyPDF2 import PdfFileReader
|
|
|
34 |
import pdfplumber
|
35 |
from line_cor import mark_region
|
|
|
36 |
# NLP Pkgs
|
37 |
from textblob import TextBlob
|
38 |
import spacy
|
@@ -45,13 +47,15 @@ import pytesseract
|
|
45 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
46 |
from PIL import Image
|
47 |
def read_pdf(file):
|
48 |
-
|
49 |
-
|
|
|
50 |
all_page_text = ""
|
51 |
-
for i in range(
|
52 |
-
page = pdfReader.getPage(i)
|
53 |
-
|
54 |
-
|
|
|
55 |
text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
|
56 |
all_page_text += text + " " #page.extractText()
|
57 |
return all_page_text
|
|
|
31 |
import docx2txt
|
32 |
from PIL import Image
|
33 |
from PyPDF2 import PdfFileReader
|
34 |
+
from pdf2image import convert_from_bytes
|
35 |
import pdfplumber
|
36 |
from line_cor import mark_region
|
37 |
+
|
38 |
# NLP Pkgs
|
39 |
from textblob import TextBlob
|
40 |
import spacy
|
|
|
47 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
48 |
from PIL import Image
|
49 |
def read_pdf(file):
    """Extract text from a PDF by rasterising each page and running OCR on it.

    Args:
        file: The PDF to read. May be raw ``bytes``, a binary file-like
            object (anything exposing ``read()``, e.g. an uploaded file),
            or a filesystem path. ``None`` falls back to the legacy
            ``'file.pdf'`` in the working directory, which is what this
            function used to read unconditionally.

    Returns:
        The OCR text of all pages concatenated in page order, each page's
        text followed by a single space. An empty string if the PDF
        produced no page images.
    """
    if file is None:
        # Backward compatibility: earlier versions ignored the argument and
        # always read this hard-coded path.
        file = 'file.pdf'

    if isinstance(file, (bytes, bytearray)):
        pdf_bytes = bytes(file)
    elif hasattr(file, "read"):
        # The stream may already have been consumed by the caller; rewind
        # when possible so we do not hand an empty buffer to the converter.
        try:
            file.seek(0)
        except (AttributeError, OSError, ValueError):
            pass
        pdf_bytes = file.read()
    else:
        # Treat anything else as a path; the context manager closes the
        # handle that the original one-liner leaked.
        with open(file, 'rb') as pdf_handle:
            pdf_bytes = pdf_handle.read()

    images = convert_from_bytes(pdf_bytes)
    if not images:
        return ""

    # Create the checkbox exactly once. Creating it inside the page loop
    # registers several identical widgets (same label, no key), which
    # Streamlit rejects as duplicates on any multi-page PDF.
    use_bangla = st.checkbox("Mark to see Bangla Image's Text")
    ocr_kwargs = {"lang": "ben"} if use_bangla else {}

    # pytesseract accepts PIL images directly, so pages are no longer dumped
    # to img_<i>.jpg in the working directory (no leftover files, no lossy
    # JPEG round-trip before OCR).
    return "".join(
        pytesseract.image_to_string(page_image, **ocr_kwargs) + " "
        for page_image in images
    )
|