Soumen committed on
Commit
06dd768
·
1 Parent(s): 25ea9c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -6
app.py CHANGED
@@ -31,8 +31,10 @@ from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
31
  import docx2txt
32
  from PIL import Image
33
  from PyPDF2 import PdfFileReader
 
34
  import pdfplumber
35
  from line_cor import mark_region
 
36
  # NLP Pkgs
37
  from textblob import TextBlob
38
  import spacy
@@ -45,13 +47,15 @@ import pytesseract
45
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
46
  from PIL import Image
47
  def read_pdf(file):
48
- pdfReader = PdfFileReader(file)
49
- count = pdfReader.numPages
 
50
  all_page_text = ""
51
- for i in range(count):
52
- page = pdfReader.getPage(i)
53
- image_name = "Page_" + str(i) + ".jpg"
54
- page.save(image_name, "JPEG")
 
55
  text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
56
  all_page_text += text + " " #page.extractText()
57
  return all_page_text
 
31
  import docx2txt
32
  from PIL import Image
33
  from PyPDF2 import PdfFileReader
34
+ from pdf2image import convert_from_bytes
35
  import pdfplumber
36
  from line_cor import mark_region
37
+
38
  # NLP Pkgs
39
  from textblob import TextBlob
40
  import spacy
 
47
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
48
  from PIL import Image
def read_pdf(file):
    """OCR every page of a PDF and return the concatenated text.

    Each page is rasterised with ``convert_from_bytes``, saved as
    ``img_<index>.jpg`` in the current working directory, and passed to
    Tesseract through ``pytesseract.image_to_string``.

    Args:
        file: The PDF to read. May be raw ``bytes``, a binary file-like
            object exposing ``read()`` (for example an uploaded file), or
            a filesystem path.

    Returns:
        The recognised text of all pages, each page followed by a single
        space. An empty string when the PDF yields no page images.
    """
    # Use the caller's PDF instead of a hard-coded 'file.pdf'.
    if isinstance(file, (bytes, bytearray)):
        pdf_bytes = bytes(file)
    elif hasattr(file, "read"):
        # Rewind first so a stream that was already consumed by the
        # caller is still read in full.
        if hasattr(file, "seek"):
            file.seek(0)
        pdf_bytes = file.read()
    else:
        # Treat anything else as a path; the context manager closes the
        # handle that the previous version leaked.
        with open(file, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()

    images = convert_from_bytes(pdf_bytes)

    # Render the language toggle once. Creating the same checkbox on every
    # loop iteration produces duplicate widgets for multi-page PDFs.
    use_bangla = st.checkbox("Mark to see Bangla Image's Text")
    ocr_options = {"lang": "ben"} if use_bangla else {}

    page_texts = []
    for i, img in enumerate(images):
        # The page image is still written to disk, as before, and OCR is
        # run on the saved file.
        image_name = "img_" + str(i) + ".jpg"
        img.save(image_name, "JPEG")
        text = pytesseract.image_to_string(image_name, **ocr_options)
        page_texts.append(text + " ")
    return "".join(page_texts)