# app.py - Hugging Face Space source by BhagatSurya
# Commit dd899a3 ("Update app.py"), 2.75 kB
import gradio as gr
import tempfile
import re
import os
import spacy
import pytesseract
import pdf2image
import subprocess
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
)
import fitz # PyMuPDF
from PIL import Image, UnidentifiedImageError
import io
import base64
def clean_text(text):
    """Collapse all whitespace in *text* and trim both ends.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by a single space, so the result never contains a newline.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string without leading or trailing whitespace.
    """
    # Whitespace normalisation is pure regex work, so no NLP pipeline is
    # loaded here. A separate newline-collapsing pass is not needed either:
    # `\s` already matches `\n`.
    return re.sub(r'\s+', ' ', text).strip()
def safe_base64_decode(s):
    """Decode base64 data, tolerating missing ``=`` padding.

    Args:
        s: Base64 data as ``str`` or ``bytes``. Trailing padding may be
            absent; it is restored before decoding.

    Returns:
        The decoded ``bytes``, or ``None`` when *s* cannot be decoded (the
        error is printed).
    """
    # Pad with the same type as the input so bytes input works as well.
    pad_char = b'=' if isinstance(s, (bytes, bytearray)) else '='
    remainder = len(s) % 4
    if remainder:
        s = s + pad_char * (4 - remainder)
    try:
        return base64.b64decode(s)
    except ValueError as e:
        # binascii.Error (raised for malformed base64) is a ValueError
        # subclass; catching the parent also covers non-ASCII str input.
        print("Error decoding base64 string:", e)
        return None
def image_to_latex(image, command=("pix2tex",)):
    """Save *image* to disk, run a command-line tool on it, return its output.

    Args:
        image: Object exposing ``save(path)`` (for example a PIL image).
        command: Program and leading arguments to run; the path of the saved
            image is appended as the last argument. Defaults to ``pix2tex``.

    Returns:
        Whatever the command wrote to standard output, as text. The exit
        status is not inspected.

    Raises:
        OSError: If the command cannot be started (e.g. it is not installed).
    """
    # Use a private directory per call: a fixed shared path would be
    # overwritten by concurrent requests and is never cleaned up.
    with tempfile.TemporaryDirectory() as work_dir:
        image_path = os.path.join(work_dir, "equation.png")
        image.save(image_path)
        result = subprocess.run(
            [*command, image_path], capture_output=True, text=True
        )
    return result.stdout
def pdf_to_text(file):
    """Convert a PDF into a cleaned text file and return the file's name.

    For every page the text layer is extracted, each embedded image is passed
    to ``image_to_latex`` and its output appended, and the result is
    normalised with ``clean_text``. Pages that end up with more than five
    words are written out, each preceded by a
    ``## Metadata: Page Number N`` header.

    Args:
        file: Object whose ``name`` attribute is the path of the PDF.

    Returns:
        Name of the ``.txt`` file written to the current working directory;
        it shares the PDF's base name.
    """
    page_chunks = []
    # The context manager closes the document even if a page fails.
    with fitz.open(file.name) as doc:
        for i, page in enumerate(doc):
            page_text = page.get_text()

            for img in page.get_images(full=True):
                # get_images() yields descriptive tuples, not pixel data; the
                # first field is the xref used to fetch the actual bytes.
                xref = img[0]
                image_bytes = (doc.extract_image(xref) or {}).get("image")
                if not image_bytes:
                    continue
                try:
                    image = Image.open(io.BytesIO(image_bytes))
                    latex_code = image_to_latex(image)
                except UnidentifiedImageError:
                    print(f"Could not identify image on page {i+1}")
                except OSError as e:
                    # Best effort: a missing converter or an unsavable image
                    # mode must not abort the whole document.
                    print(f"Could not convert image on page {i+1} to LaTeX: {e}")
                else:
                    page_text += "\n" + latex_code

            page_text = clean_text(page_text)
            # NOTE(review): the source's indentation was lost; near-empty
            # pages are assumed to be skipped entirely - confirm.
            if len(page_text.split()) > 5:
                page_number = i + 1
                page_chunks.append(
                    "## Metadata: Page Number " + str(page_number) + "\n"
                    + page_text + "\n\n"
                )

    full_text = "".join(page_chunks)
    base_name = os.path.splitext(os.path.basename(file.name))[0]
    output_file_name = base_name + ".txt"
    # Explicit UTF-8 so non-ASCII PDF text does not depend on the locale.
    with open(output_file_name, 'w', encoding='utf-8') as f:
        f.write(full_text)
    return output_file_name
# Gradio front end: one PDF upload in, one downloadable text file out.
_pdf_input = gr.inputs.File(label="Your PDF")
_txt_output = gr.outputs.File(label="Download TXT")

iface = gr.Interface(
    fn=pdf_to_text,
    inputs=_pdf_input,
    outputs=_txt_output,
    title="PDF to TXT",
    description="Convert your PDF files to clean text",
)

# Start the web server as soon as the module is executed.
iface.launch()