Spaces:

chensh123
/

pdftoMD

Build error

App Files Files Community

chensh123 committed on Jun 4

Commit

728f241

•

1 Parent(s): 361a436

Upload 4 files

Browse files

Files changed (4) hide show

README.md +7 -12
app.py +59 -0
packages.txt +2 -0
requirements.txt +2 -0

README.md CHANGED Viewed

@@ -1,12 +1,7 @@
----
-title: PdftoMD
-emoji: 📊
-colorFrom: indigo
-colorTo: green
-sdk: gradio
-sdk_version: 4.32.2
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+license: gpl-3.0
+title: PDF to Markdown
+sdk: gradio
+app_file: app.py
+pinned: false
+---

app.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import spaces
+import gradio as gr
+from pypdf import PdfReader
+import ocrmypdf
+def extract_text_from_pdf(reader):
+    full_text = ""
+    for idx, page in enumerate(reader.pages):
+        text = page.extract_text()
+        if len(text) > 0:
+            full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"
+    return full_text.strip()
+@spaces.GPU
+def convert(pdf_file):
+    reader = PdfReader(pdf_file)
+    # Extract metadata
+    metadata = {
+        "author": reader.metadata.author,
+        "creator": reader.metadata.creator,
+        "producer": reader.metadata.producer,
+        "subject": reader.metadata.subject,
+        "title": reader.metadata.title,
+    }
+    # Extract text
+    full_text = extract_text_from_pdf(reader)
+    # Check if there are any images
+    image_count = 0
+    for page in reader.pages:
+        image_count += len(page.images)
+    # If there are images and not much content, perform OCR on the document
+    if image_count > 0 and len(full_text) < 1000:
+        out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
+        ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
+        # Re-extract text
+        reader = PdfReader(pdf_file)
+        full_text = extract_text_from_pdf(reader)
+    return full_text, metadata
+gr.Interface(
+    convert,
+    inputs=[
+        gr.File(label="Upload PDF", type="filepath"),
+    ],
+    outputs=[
+        gr.Text(label="Markdown"),
+        gr.JSON(label="Metadata"),
+    ],
+).launch()

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ ocrmypdf
2	+ tesseract-ocr-eng

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ ocrmypdf==16.3.1
2	+ pypdf==4.2.0