Spaces:
Runtime error
Runtime error
import gradio as gr | |
import tempfile | |
import re | |
from PyPDF2 import PdfReader, PdfFileReader | |
import os | |
import spacy | |
import pytesseract | |
import pdf2image | |
import subprocess | |
from pdf2image.exceptions import ( | |
PDFInfoNotInstalledError, | |
PDFPageCountError, | |
PDFSyntaxError | |
) | |
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Newlines, tabs and repeated spaces are all replaced by one space and
    the result is stripped of leading/trailing whitespace.

    Args:
        text: The raw text to normalise. ``None`` or an empty string is
            accepted and yields an empty string.

    Returns:
        The normalised, single-line string.
    """
    if not text:
        return ""
    # A single ``\s+`` pass already covers newlines, so no separate
    # newline-collapsing step is needed. The spaCy pipeline that used to be
    # loaded here was never used and only added start-up cost per call.
    return re.sub(r'\s+', ' ', text).strip()
def image_to_latex(image):
    """Run the ``pix2tex`` command on *image* and return what it prints.

    The image is written to a PNG file inside a private temporary
    directory, so simultaneous calls never share or overwrite each other's
    file, and the file is removed once the command has finished.

    Args:
        image: An object exposing ``save(path)`` (the caller passes the
            images produced by ``pdf2image``).

    Returns:
        The standard output of the ``pix2tex`` process as text.

    Raises:
        FileNotFoundError: If the ``pix2tex`` executable is not on PATH.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        image_path = os.path.join(work_dir, "equation.png")
        image.save(image_path)
        # Argument list (no shell) keeps the path from being interpreted.
        result = subprocess.run(
            ["pix2tex", image_path], capture_output=True, text=True
        )
    return result.stdout
def _ocr_page_text(pdf_path, page_number):
    """Best-effort text for one page via rasterising + ``image_to_latex``.

    Args:
        pdf_path: Path of the PDF on disk.
        page_number: 1-based number of the page to rasterise.

    Returns:
        The recognised text, or ``""`` when rasterising or recognition
        fails (the failure is logged rather than aborting the conversion).
    """
    import logging  # local import keeps this block self-contained

    try:
        # first_page == last_page so that exactly one page is rendered.
        images = pdf2image.convert_from_path(
            pdf_path, first_page=page_number, last_page=page_number
        )
        return "\n".join(image_to_latex(image) for image in images)
    except (
        PDFInfoNotInstalledError,
        PDFPageCountError,
        PDFSyntaxError,
        OSError,
    ) as exc:
        logging.getLogger(__name__).warning(
            "Image fallback failed for page %d: %s", page_number, exc
        )
        return ""


def pdf_to_text(file):
    """Convert a PDF into a cleaned ``.txt`` file in the working directory.

    Each page's text is extracted with ``PdfReader``; when a page yields no
    text the page is rasterised and passed through ``image_to_latex``
    instead. Pages whose cleaned text has more than five words are written
    out, each preceded by a ``## Metadata: Page Number N`` header.

    Args:
        file: The uploaded file. Either an object with a ``name``
            attribute holding the path, or the path itself as a string.

    Returns:
        A tuple ``(output_file_name, page_number)`` where
        ``output_file_name`` is the written ``<pdf basename>.txt`` and
        ``page_number`` is the 1-based number of the last page that was
        included (``0`` when no page qualified).
    """
    pdf_path = getattr(file, "name", file)

    # Defined up front so the return below cannot hit an unbound name when
    # no page passes the word-count filter.
    page_number = 0
    chunks = []

    with open(pdf_path, 'rb') as f:
        reader = PdfReader(f)
        for index, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            # An empty or whitespace-only result also means "no text layer",
            # not only None.
            if not page_text or not page_text.strip():
                page_text = _ocr_page_text(pdf_path, index)
            page_text = clean_text(page_text)
            if len(page_text.split()) > 5:
                page_number = index
                chunks.append(
                    "## Metadata: Page Number " + str(page_number) + "\n"
                    + page_text + "\n\n"
                )

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_file_name = base_name + ".txt"
    # Explicit encoding: extracted text routinely contains non-ASCII symbols.
    with open(output_file_name, 'w', encoding="utf-8") as f:
        f.write("".join(chunks))
    return output_file_name, page_number
def _pdf_to_txt_file(file):
    """Adapter for the UI: return only the path of the generated text file.

    ``pdf_to_text`` returns ``(output_file_name, page_number)``, but the
    interface below declares a single File output, so only the path is
    forwarded to it.
    """
    output_file_name, _last_page = pdf_to_text(file)
    return output_file_name


# Web UI: one PDF upload in, one downloadable TXT file out.
iface = gr.Interface(
    fn=_pdf_to_txt_file,
    inputs=gr.inputs.File(label="Your PDF"),
    outputs=gr.outputs.File(label="Download TXT"),
    title="PDF to TXT",
    description="Convert your PDF files to clean text",
)
iface.launch()