Spaces:

BhagatSurya
/

convet_pdf_to_txt

Runtime error

App Files Files Community

convet_pdf_to_txt / app.py

bhagatsuryainatom

Update app.py

76a5996 over 1 year ago

raw

history blame

1.92 kB

	import gradio as gr
	import tempfile
	import re
	from PyPDF2 import PdfReader, PdfFileReader
	import os
	import spacy
	import pytesseract
	import pdf2image
	import subprocess
	from pdf2image.exceptions import (
	PDFInfoNotInstalledError,
	PDFPageCountError,
	PDFSyntaxError
	)

	def clean_text(text):
	    """Collapse all whitespace in *text* to single spaces and trim the ends.

	    Every run of whitespace (spaces, tabs, newlines) is replaced by one
	    space, so the result is a single line without leading or trailing
	    whitespace.

	    Args:
	        text: The raw text to normalise. ``None`` or an empty string is
	            accepted and yields ``""``.

	    Returns:
	        The normalised string.
	    """
	    # Whitespace normalisation needs no NLP model, so none is loaded here;
	    # loading one per call would be slow and fail when it is not installed.
	    if not text:
	        return ""
	    # ``\s+`` already matches runs of newlines, so one substitution suffices.
	    return re.sub(r"\s+", " ", text).strip()

	def image_to_latex(image):
	    """Run the ``pix2tex`` command-line tool on *image* and return its output.

	    The image is written to a PNG file inside a private temporary directory,
	    ``pix2tex`` is invoked on that file, and the directory is removed again
	    once the command has finished.

	    Args:
	        image: An object exposing ``save(path)``.
	            NOTE(review): presumably a PIL image as produced by pdf2image;
	            confirm against the caller.

	    Returns:
	        Whatever ``pix2tex`` printed to standard output, as text. The exit
	        status is not inspected, so a failed run returns its (possibly
	        empty) stdout.

	    Raises:
	        FileNotFoundError: If the ``pix2tex`` executable is not on ``PATH``.
	    """
	    # A per-call directory keeps concurrent requests from overwriting each
	    # other's image and guarantees the file is cleaned up afterwards.
	    with tempfile.TemporaryDirectory() as work_dir:
	        image_path = os.path.join(work_dir, "equation.png")
	        image.save(image_path)
	        result = subprocess.run(["pix2tex", image_path], capture_output=True, text=True)
	    return result.stdout



	def pdf_to_text(file):
	    """Extract the text of a PDF into a ``.txt`` file in the working directory.

	    Each page's text is taken from the PDF's text layer; when a page has no
	    extractable text it is rendered to an image and passed to
	    ``image_to_latex`` instead. The text is normalised with ``clean_text``,
	    and pages with more than five words are written out, each preceded by a
	    ``## Metadata: Page Number N`` header line.

	    Args:
	        file: The uploaded PDF, either an object exposing the path as
	            ``.name`` or the path itself.

	    Returns:
	        A tuple ``(output_file_name, page_number)``: the name of the written
	        text file (the PDF's base name with a ``.txt`` extension) and the
	        1-based number of the last page that was written, or ``0`` when no
	        page qualified.
	    """
	    # Accept both an upload object carrying ``.name`` and a plain path.
	    pdf_path = getattr(file, "name", file)

	    sections = []
	    # Defined up front so the return value exists even if every page is skipped.
	    page_number = 0

	    with open(pdf_path, 'rb') as f:
	        reader = PdfReader(f)
	        for i, page in enumerate(reader.pages):
	            page_text = page.extract_text()
	            # Pages without a text layer may come back as None or as an
	            # empty/blank string; both need the image-based fallback.
	            if not page_text or not page_text.strip():
	                # first_page/last_page are 1-based and inclusive, so both
	                # are i + 1 to render exactly the current page.
	                images = pdf2image.convert_from_path(
	                    pdf_path, first_page=i + 1, last_page=i + 1
	                )
	                page_text = "\n".join(image_to_latex(image) for image in images)
	            page_text = clean_text(page_text)
	            # NOTE(review): the source's indentation was lost; this assumes
	            # pages with five words or fewer are skipped entirely - confirm.
	            if len(page_text.split()) > 5:
	                page_number = i + 1
	                sections.append(
	                    "## Metadata: Page Number " + str(page_number) + "\n" + page_text + "\n\n"
	                )

	    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
	    output_file_name = base_name + ".txt"
	    # Explicit UTF-8 so non-ASCII text does not depend on the platform default.
	    with open(output_file_name, 'w', encoding="utf-8") as f:
	        f.write("".join(sections))
	    return output_file_name, page_number



	# Web UI: one PDF upload in; the generated text file and the number of the
	# last page that contributed text out. pdf_to_text returns two values, so
	# two output components are declared to receive them.
	# NOTE(review): the gr.inputs / gr.outputs namespaces belong to the legacy
	# Gradio API; confirm the pinned Gradio version still provides them.
	iface = gr.Interface(
	    fn=pdf_to_text,
	    inputs=gr.inputs.File(label="Your PDF"),
	    outputs=[
	        gr.outputs.File(label="Download TXT"),
	        gr.outputs.Textbox(label="Last page with text"),
	    ],
	    title="PDF to TXT",
	    description="Convert your PDF files to clean text",
	)
	iface.launch()