not-lain committed on
Commit
d5b5b0f
·
1 Parent(s): 54dbb32

add pdfitdown

Browse files
Files changed (2) hide show
  1. app.py +64 -1
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,4 +1,7 @@
1
  import gradio as gr
 
 
 
2
 
3
  from base_utils import (
4
  convert_pdf_to_image,
@@ -68,6 +71,65 @@ url_parser = gr.Interface(
68
  )
69
 
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  demo = gr.TabbedInterface(
72
  [
73
  pdf_to_img,
@@ -78,7 +140,7 @@ demo = gr.TabbedInterface(
78
  pptx_to_text,
79
  url_parser,
80
  str_to_json,
81
- # rmbg,
82
  ],
83
  [
84
  "PDF to Image",
@@ -89,6 +151,7 @@ demo = gr.TabbedInterface(
89
  "Extract PPTX Text",
90
  "Extract text from URL",
91
  "Extract Json",
 
92
  ],
93
  )
94
 
 
1
  import gradio as gr
2
+ import warnings
3
+ from typing import List
4
+ from pdfitdown.pdfconversion import convert_to_pdf, convert_markdown_to_pdf
5
 
6
  from base_utils import (
7
  convert_pdf_to_image,
 
71
  )
72
 
73
 
74
class FileNotConvertedWarning(Warning):
    """Warning category for files skipped during PDF conversion.

    Emitted when a file's extension is not one of the formats that can be
    converted to PDF.
    """
78
+
79
+
80
def to_pdf(files: List[str]) -> List[str]:
    """Convert each supported file to PDF and collect the resulting paths.

    Files that already are PDFs are passed through untouched. Markdown files
    go through ``convert_markdown_to_pdf``; ``.docx``, ``.html``, ``.pptx``,
    ``.csv`` and ``.xml`` files go through ``convert_to_pdf``. Any other file
    triggers a ``FileNotConvertedWarning`` and is skipped.

    The extension check is case-insensitive, and the output path is derived
    with ``os.path.splitext`` so that only the final extension is swapped
    (dots or extension-like text elsewhere in the path are left alone).

    Args:
        files: Paths of the files to convert.

    Returns:
        One entry per supported input, in input order: the original path for
        PDFs, otherwise whatever the converter returned for that file.
    """
    import os  # local import: keeps this block self-contained

    # Formats handled by the generic converter (Markdown has its own).
    generic_formats = (".docx", ".html", ".pptx", ".csv", ".xml")

    pdfs = []
    for f in files:
        root, ext = os.path.splitext(f)
        ext = ext.lower()
        if ext == ".pdf":
            # Already a PDF: nothing to convert.
            pdfs.append(f)
        elif ext == ".md":
            # Third argument is the path without extension, as before
            # (presumably used as the document title - confirm in pdfitdown).
            pdfs.append(convert_markdown_to_pdf(f, root + ".pdf", root))
        elif ext in generic_formats:
            pdfs.append(convert_to_pdf(f, root + ".pdf", root))
        else:
            warnings.warn(
                f"File {f} was not converted to PDF because its file format is not included in those that can be converted",
                FileNotConvertedWarning,
            )
    return pdfs
116
+
117
+
118
def convert(file: str) -> str:
    """Convert a single uploaded file to PDF and return the resulting path.

    Thin wrapper around ``to_pdf`` for single-file use.

    Args:
        file: Path of the file to convert. ``None`` (nothing was uploaded)
            is tolerated and yields ``None``.

    Returns:
        The first (and only) entry produced by ``to_pdf`` for ``file``, or
        ``None`` when no file was given.

    Raises:
        ValueError: If ``to_pdf`` produced nothing for ``file``, i.e. its
            format is not one of the convertible ones.
    """
    if file is None:
        # Submitting without an upload: nothing in, nothing out.
        return None
    pdfs = to_pdf([file])
    if not pdfs:
        # Previously this surfaced as an opaque IndexError from pdfs[0].
        raise ValueError(
            f"File {file} could not be converted to PDF: unsupported file format"
        )
    return pdfs[0]
122
+
123
+
124
# Interface that exposes `convert`: one uploaded file in, one PDF file out.
pdf_converter = gr.Interface(
    fn=convert,
    inputs=gr.File(label="Upload your file"),
    outputs=gr.File(label="Converted PDF"),
    title="File to PDF Converter",
    description=(
        "Upload a file in .docx, .pdf, .html, .pptx, .csv, .xml, or .md format, "
        "and get it converted to PDF."
    ),
    # Name under which the endpoint is registered.
    api_name="convert_to_pdf",
)
132
+
133
  demo = gr.TabbedInterface(
134
  [
135
  pdf_to_img,
 
140
  pptx_to_text,
141
  url_parser,
142
  str_to_json,
143
+ pdf_converter,
144
  ],
145
  [
146
  "PDF to Image",
 
151
  "Extract PPTX Text",
152
  "Extract text from URL",
153
  "Extract Json",
154
+ "Convert to PDF",
155
  ],
156
  )
157
 
requirements.txt CHANGED
@@ -6,6 +6,7 @@ pdfplumber
6
  python-docx
7
  gradio
8
  python-pptx
 
9
  # numpy<2
10
  # torch>=2
11
  # spaces
 
6
  python-docx
7
  gradio
8
  python-pptx
9
+ pdfitdown
10
  # numpy<2
11
  # torch>=2
12
  # spaces