BhagatSurya committed on
Commit
185443c
1 Parent(s): fa29ba8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -2
app.py CHANGED
@@ -14,6 +14,7 @@ from pdf2image.exceptions import (
14
  import fitz # PyMuPDF
15
  from PIL import Image
16
  import io
 
17
 
18
  def clean_text(text):
19
  nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "textcat"])
@@ -38,8 +39,8 @@ def pdf_to_text(file):
38
  image_list = page.get_images(full=True)
39
  for img in image_list:
40
  xref, name, ext, color_space, width, height, bpc, image_data, image_mask, smask_data = img
41
- print(type(image_data)) # Print the type of image_data
42
- print(image_data[:100]) # Print the first 100 characters of image_data
43
  image = Image.open(io.BytesIO(image_data))
44
  latex_code = image_to_latex(image)
45
  page_text += "\n" + latex_code # Add LaTeX code to page text
 
14
  import fitz # PyMuPDF
15
  from PIL import Image
16
  import io
17
+ import base64
18
 
19
  def clean_text(text):
20
  nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "textcat"])
 
39
  image_list = page.get_images(full=True)
40
  for img in image_list:
41
  xref, name, ext, color_space, width, height, bpc, image_data, image_mask, smask_data = img
42
+ # Decode image_data from base64 before opening it
43
+ image_data = base64.b64decode(image_data)
44
  image = Image.open(io.BytesIO(image_data))
45
  latex_code = image_to_latex(image)
46
  page_text += "\n" + latex_code # Add LaTeX code to page text