Spaces:

BhagatSurya
/

convet_pdf_to_txt

Runtime error

App Files Files Community

convet_pdf_to_txt / app.py

BhagatSurya

Update app.py

dd899a3 over 1 year ago

raw

history blame

2.75 kB

	import gradio as gr
	import tempfile
	import re
	import os
	import spacy
	import pytesseract
	import pdf2image
	import subprocess
	from pdf2image.exceptions import (
	PDFInfoNotInstalledError,
	PDFPageCountError,
	PDFSyntaxError
	)
	import fitz # PyMuPDF
	from PIL import Image, UnidentifiedImageError
	import io
	import base64

	def clean_text(text):
	    """Collapse every run of whitespace in ``text`` into a single space.

	    Newlines, tabs and repeated spaces are all folded into one space, and
	    leading/trailing whitespace is removed.

	    Args:
	        text: The raw string to normalise.

	    Returns:
	        The normalised single-line string (empty if ``text`` held only
	        whitespace).
	    """
	    # No spaCy pipeline is loaded here: the result depends only on the regex
	    # below, and loading a model on every call is slow and raises when the
	    # model is not installed.
	    # ``\s+`` already matches newlines, so one pass is sufficient.
	    return re.sub(r'\s+', ' ', text).strip()

	def safe_base64_decode(s):
	    """Decode a base64 string, tolerating missing ``=`` padding.

	    Args:
	        s: Base64 text whose trailing padding may have been stripped.

	    Returns:
	        The decoded bytes, or ``None`` if the input is not valid base64
	        (a message is printed in that case).
	    """
	    # Imported locally because the file-level imports do not include
	    # ``binascii``; without it the ``except`` clause below would itself
	    # fail with NameError as soon as a decode error occurred.
	    import binascii

	    # Base64 payloads must be a multiple of 4 characters long; restore any
	    # padding that was dropped.
	    remainder = len(s) % 4
	    if remainder:
	        s += '=' * (4 - remainder)
	    try:
	        return base64.b64decode(s)
	    except binascii.Error as e:
	        print("Error decoding base64 string:", e)
	        return None

	def image_to_latex(image):
	    """Save ``image`` to disk and run the ``pix2tex`` command on it.

	    Args:
	        image: An object with a ``save(path)`` method (the caller passes a
	            PIL image).

	    Returns:
	        Whatever ``pix2tex`` wrote to standard output, as text. Presumably
	        this is the recognised LaTeX; the exit status is not checked.

	    Raises:
	        FileNotFoundError: If the ``pix2tex`` executable is not on PATH.
	    """
	    # A per-call temporary directory replaces a fixed shared path so that
	    # concurrent requests cannot overwrite each other's image, and the file
	    # is removed as soon as the command has finished.
	    with tempfile.TemporaryDirectory() as work_dir:
	        image_path = os.path.join(work_dir, "equation.png")
	        image.save(image_path)
	        result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
	    return result.stdout

	def pdf_to_text(file):
	    """Convert a PDF into a cleaned text file and return that file's name.

	    For each page the extracted text is combined with the output of
	    ``image_to_latex`` for every embedded image, normalised with
	    ``clean_text``, and kept only if it contains more than five words.
	    Kept pages are prefixed with a ``## Metadata: Page Number N`` line.

	    Args:
	        file: Either a filesystem path, or an upload object that exposes
	            the path of the PDF as ``file.name``.

	    Returns:
	        The name of the ``.txt`` file written to the current working
	        directory; it shares the PDF's base name.
	    """
	    if isinstance(file, (str, os.PathLike)):
	        pdf_path = os.fspath(file)
	    else:
	        pdf_path = file.name

	    sections = []
	    # The context manager closes the document even if a page fails.
	    with fitz.open(pdf_path) as doc:
	        for page_number, page in enumerate(doc, start=1):
	            page_text = page.get_text()
	            for latex_code in _page_images_to_latex(doc, page, page_number):
	                page_text += "\n" + latex_code  # Add LaTeX code to page text

	            page_text = clean_text(page_text)
	            # Skip pages that are (nearly) empty after cleaning.
	            if len(page_text.split()) > 5:
	                sections.append(
	                    "## Metadata: Page Number " + str(page_number) + "\n" + page_text
	                )

	    full_text = "".join(section + "\n\n" for section in sections)

	    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
	    output_file_name = base_name + ".txt"
	    # Explicit UTF-8 so non-ASCII PDF text does not depend on the locale.
	    with open(output_file_name, 'w', encoding='utf-8') as f:
	        f.write(full_text)

	    return output_file_name


	def _page_images_to_latex(doc, page, page_number):
	    """Return the ``image_to_latex`` output for each image on ``page``."""
	    snippets = []
	    for image_entry in page.get_images(full=True):
	        # Entries from get_images() describe the image (xref first) but do
	        # not contain its bytes; those must be fetched through the xref.
	        xref = image_entry[0]
	        extracted = doc.extract_image(xref)
	        image_bytes = extracted.get("image") if extracted else None
	        try:
	            # Empty data is routed through the same error path as
	            # undecodable data.
	            image = Image.open(io.BytesIO(image_bytes or b""))
	        except UnidentifiedImageError:
	            print(f"Could not identify image on page {page_number}")
	            continue
	        snippets.append(image_to_latex(image))
	    return snippets

	# Build and start the web UI: one PDF upload in, one downloadable TXT out.
	# ``gr.File`` is used for both sides because the ``gr.inputs`` /
	# ``gr.outputs`` namespaces are deprecated and absent from newer Gradio
	# releases.
	iface = gr.Interface(
	    fn=pdf_to_text,
	    inputs=gr.File(label="Your PDF"),
	    outputs=gr.File(label="Download TXT"),
	    title="PDF to TXT",
	    description="Convert your PDF files to clean text",
	)
	iface.launch()