|
import spaces
|
|
import gradio as gr
|
|
from pypdf import PdfReader
|
|
import ocrmypdf
|
|
|
|
|
|
def extract_text_from_pdf(reader) -> str:
    """Concatenate the extractable text of every page of a PDF.

    Args:
        reader: Object exposing ``pages``, an iterable of page objects that
            each provide ``extract_text()`` (e.g. a ``pypdf.PdfReader``).

    Returns:
        One string in which every page that yielded text is introduced by a
        ``---- Page N ----`` header, where ``N`` is the zero-based index of
        the page, and followed by a blank line. Pages that yield no text are
        skipped. Leading and trailing whitespace of the whole result is
        stripped.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract exactly once per page: text extraction is the expensive
        # step, so the result is reused for both the emptiness check and the
        # output instead of being recomputed.
        text = page.extract_text()
        if not text:
            continue
        sections.append(f"---- Page {idx} ----\n{text}\n\n")

    # Join once at the end rather than growing a string inside the loop.
    return "".join(sections).strip()
|
|
|
|
|
|
@spaces.GPU
def convert(pdf_file):
    """Extract the text and document metadata from an uploaded PDF.

    The embedded text layer is read first. If the document contains at least
    one image and the extracted text is shorter than 1000 characters, the
    file is treated as a scan: it is run through OCR and the text is read
    from the OCR output instead.

    Args:
        pdf_file: Filesystem path of the PDF to process.

    Returns:
        A ``(full_text, metadata)`` tuple. ``full_text`` is the page-delimited
        text produced by ``extract_text_from_pdf``. ``metadata`` is a dict
        with the keys ``author``, ``creator``, ``producer``, ``subject`` and
        ``title``; a value is ``None`` when the PDF does not provide it.
    """
    # Function-scope import so the block does not depend on the module's
    # import section.
    from pathlib import Path

    reader = PdfReader(pdf_file)

    # ``reader.metadata`` is None when the PDF has no document information
    # dictionary; ``getattr`` with a default then yields None for every field
    # instead of raising AttributeError.
    info = reader.metadata
    metadata = {
        key: getattr(info, key, None)
        for key in ("author", "creator", "producer", "subject", "title")
    }

    full_text = extract_text_from_pdf(reader)

    # Only "is there any image at all" matters, so stop at the first hit.
    has_images = any(len(page.images) > 0 for page in reader.pages)

    # Images plus very little text suggests a scanned document.
    if has_images and len(full_text) < 1000:
        # Derive the output name from the final path component only, so a
        # ".pdf" elsewhere in the path is untouched and a differently-cased
        # extension cannot make the output path equal the input path.
        source = Path(pdf_file)
        out_pdf_file = str(source.with_name(f"{source.stem}_ocr.pdf"))
        ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)

        # Read the OCR result, not the original file, which still lacks a
        # text layer.
        reader = PdfReader(out_pdf_file)
        full_text = extract_text_from_pdf(reader)

    return full_text, metadata
|
|
|
|
|
|
# Build the web UI and start serving it: a single PDF upload goes in (handed
# to ``convert`` as a filesystem path), and the extracted text plus the
# document metadata come out.
gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Upload PDF", type="filepath")],
    outputs=[
        gr.Text(label="Markdown"),  # first element returned by convert
        gr.JSON(label="Metadata"),  # second element returned by convert
    ],
).launch()
|
|
|