chensh123 committed on
Commit
728f241
1 Parent(s): 361a436

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +7 -12
  2. app.py +59 -0
  3. packages.txt +2 -0
  4. requirements.txt +2 -0
README.md CHANGED
@@ -1,12 +1,7 @@
1
- ---
2
- title: PdftoMD
3
- emoji: 📊
4
- colorFrom: indigo
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 4.32.2
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ license: gpl-3.0
3
+ title: PDF to Markdown
4
+ sdk: gradio
5
+ app_file: app.py
6
+ pinned: false
7
+ ---
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ from pypdf import PdfReader
4
+ import ocrmypdf
5
+
6
+
7
def extract_text_from_pdf(reader):
    """Return the text of every non-empty page, each under a page header.

    Args:
        reader: An object exposing ``pages``, an iterable of page objects
            that each provide ``extract_text()`` (e.g. a ``pypdf.PdfReader``).

    Returns:
        The concatenated page texts with surrounding whitespace stripped.
        Each non-empty page is introduced by ``---- Page N ----`` where N is
        the zero-based page index. Pages with no text are skipped. An empty
        string is returned when no page yields text.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract once per page: text extraction is the expensive step, so
        # the result is reused for both the emptiness check and the output.
        text = page.extract_text()
        if text:
            sections.append(f"---- Page {idx} ----\n{text}\n\n")

    return "".join(sections).strip()
15
+
16
+
17
@spaces.GPU
def convert(pdf_file):
    """Extract the text and document metadata from a PDF file.

    The embedded text layer is read first. If the document contains images
    and fewer than 1000 characters of text were found, the file is run
    through OCR and the text is re-extracted from the OCR output.

    Args:
        pdf_file: Filesystem path of the PDF to convert.

    Returns:
        A ``(full_text, metadata)`` tuple. ``full_text`` is the extracted
        text; ``metadata`` is a dict with the keys ``author``, ``creator``,
        ``producer``, ``subject`` and ``title``. Values are ``None`` when the
        document does not provide them.
    """
    import os

    # Below this many extracted characters a document that contains images
    # is treated as scanned and sent through OCR.
    min_text_length = 1000

    reader = PdfReader(pdf_file)

    # Extract metadata. ``reader.metadata`` is None for documents without an
    # information dictionary, so every field falls back to None in that case.
    info = reader.metadata
    metadata = {
        field: getattr(info, field) if info is not None else None
        for field in ("author", "creator", "producer", "subject", "title")
    }

    # Extract text
    full_text = extract_text_from_pdf(reader)

    # Check if there are any images
    image_count = sum(len(page.images) for page in reader.pages)

    # If there are images and not much content, perform OCR on the document
    if image_count > 0 and len(full_text) < min_text_length:
        # Derive the output path from the extension only, so a ".pdf" that
        # appears elsewhere in the path is untouched and an upper-case
        # ".PDF" extension cannot make the output collide with the input.
        root, _ = os.path.splitext(pdf_file)
        out_pdf_file = f"{root}_ocr.pdf"
        ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)

        # Re-extract text from the OCR output, not from the original input.
        reader = PdfReader(out_pdf_file)
        full_text = extract_text_from_pdf(reader)

    return full_text, metadata
48
+
49
+
50
# Build the UI: a single PDF upload in, extracted text and metadata out.
pdf_input = gr.File(label="Upload PDF", type="filepath")
text_output = gr.Text(label="Markdown")
metadata_output = gr.JSON(label="Metadata")

demo = gr.Interface(
    fn=convert,
    inputs=[pdf_input],
    outputs=[text_output, metadata_output],
)
demo.launch()
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ocrmypdf
2
+ tesseract-ocr-eng
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ocrmypdf==16.3.1
2
+ pypdf==4.2.0