Upload 4 files
Browse files- README.md +7 -12
- app.py +59 -0
- packages.txt +2 -0
- requirements.txt +2 -0
README.md
CHANGED
@@ -1,12 +1,7 @@
|
|
1 |
-
---
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
---
|
2 |
+
license: gpl-3.0
|
3 |
+
title: PDF to Markdown
|
4 |
+
sdk: gradio
|
5 |
+
app_file: app.py
|
6 |
+
pinned: false
|
7 |
+
---
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spaces
|
2 |
+
import gradio as gr
|
3 |
+
from pypdf import PdfReader
|
4 |
+
import ocrmypdf
|
5 |
+
|
6 |
+
|
7 |
+
def extract_text_from_pdf(reader):
    """Return the text of every non-empty page, each under a page header.

    Args:
        reader: An object exposing ``pages``, an iterable of page objects
            that each provide ``extract_text()``.

    Returns:
        The page sections concatenated into one string with surrounding
        whitespace stripped. Each section starts with a
        ``---- Page N ----`` header, where ``N`` is the zero-based page
        index. Pages that yield no text are skipped.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract once per page: text extraction is the expensive step.
        text = page.extract_text()
        if text:
            sections.append(f"---- Page {idx} ----\n{text}\n\n")

    return "".join(sections).strip()
|
15 |
+
|
16 |
+
|
17 |
+
@spaces.GPU
def convert(pdf_file):
    """Extract text and document metadata from a PDF, running OCR if needed.

    Args:
        pdf_file: Filesystem path of the uploaded PDF.

    Returns:
        A ``(full_text, metadata)`` tuple. ``full_text`` is the page text
        produced by ``extract_text_from_pdf``; ``metadata`` is a dict with
        the keys ``author``, ``creator``, ``producer``, ``subject`` and
        ``title`` (values are ``None`` when the PDF does not provide them).

    Raises:
        gr.Error: If no file was provided.
    """
    from pathlib import Path

    if pdf_file is None:
        raise gr.Error("Please upload a PDF file.")

    reader = PdfReader(pdf_file)

    # Extract metadata. ``reader.metadata`` is None when the PDF has no
    # document information dictionary, so fall back to None per field.
    info = reader.metadata
    metadata = {
        field: getattr(info, field, None)
        for field in ("author", "creator", "producer", "subject", "title")
    }

    # Extract text
    full_text = extract_text_from_pdf(reader)

    # If there is not much content but the document contains images,
    # perform OCR on it. The length check comes first so pages are only
    # scanned for images when OCR is actually a candidate.
    needs_ocr = len(full_text) < 1000 and any(
        len(page.images) > 0 for page in reader.pages
    )
    if needs_ocr:
        source = Path(pdf_file)
        # Build the output name from the stem so that only the final
        # extension is affected, whatever its letter case.
        out_pdf_file = str(source.with_name(f"{source.stem}_ocr.pdf"))
        ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)

        # Re-extract text from the OCR output, not from the original file.
        reader = PdfReader(out_pdf_file)
        full_text = extract_text_from_pdf(reader)

    return full_text, metadata
|
48 |
+
|
49 |
+
|
50 |
+
# Build the web UI: one PDF upload in, extracted text plus metadata out.
demo = gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Upload PDF", type="filepath")],
    outputs=[gr.Text(label="Markdown"), gr.JSON(label="Metadata")],
)
demo.launch()
|
packages.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
ocrmypdf
|
2 |
+
tesseract-ocr-eng
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
ocrmypdf==16.3.1
|
2 |
+
pypdf==4.2.0
|