Spaces:

not-lain
/

utils

Running

App Files Files Community

not-lain committed 2 days ago

Commit

d5b5b0f

1 Parent(s): 54dbb32

add pdfitdown

Browse files

Files changed (2) hide show

app.py +64 -1
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,4 +1,7 @@
 import gradio as gr
 from base_utils import (
     convert_pdf_to_image,
@@ -68,6 +71,65 @@ url_parser = gr.Interface(
 )
 demo = gr.TabbedInterface(
     [
         pdf_to_img,
@@ -78,7 +140,7 @@ demo = gr.TabbedInterface(
         pptx_to_text,
         url_parser,
         str_to_json,
-        # rmbg,
     ],
     [
         "PDF to Image",
@@ -89,6 +151,7 @@ demo = gr.TabbedInterface(
         "Extract PPTX Text",
         "Extract text from URL",
         "Extract Json",
     ],
 )

 import gradio as gr
+import warnings
+from typing import List
+from pdfitdown.pdfconversion import convert_to_pdf, convert_markdown_to_pdf
 from base_utils import (
     convert_pdf_to_image,
 )
+class FileNotConvertedWarning(Warning):
+    """Warning category used by ``to_pdf`` when a file is skipped because its extension is not one of the formats it converts to PDF."""
+    pass
+def to_pdf(files: List[str]) -> List[str]:
+    pdfs = []
+    for f in files:
+        if f.endswith(".docx"):
+            newfile = f.replace(".docx", ".pdf")
+            file_to_add = convert_to_pdf(f, newfile, newfile.split(".")[0])
+            pdfs.append(file_to_add)
+        elif f.endswith(".pdf"):
+            pdfs.append(f)
+        elif f.endswith(".html"):
+            newfile = f.replace(".html", ".pdf")
+            file_to_add = convert_to_pdf(f, newfile, newfile.split(".")[0])
+            pdfs.append(file_to_add)
+        elif f.endswith(".pptx"):
+            newfile = f.replace(".pptx", ".pdf")
+            file_to_add = convert_to_pdf(f, newfile, newfile.split(".")[0])
+            pdfs.append(file_to_add)
+        elif f.endswith(".csv"):
+            newfile = f.replace(".csv", ".pdf")
+            file_to_add = convert_to_pdf(f, newfile, newfile.split(".")[0])
+            pdfs.append(file_to_add)
+        elif f.endswith(".xml"):
+            newfile = f.replace(".xml", ".pdf")
+            file_to_add = convert_to_pdf(f, newfile, newfile.split(".")[0])
+            pdfs.append(file_to_add)
+        elif f.endswith(".md"):
+            newfile = f.replace(".md", ".pdf")
+            file_to_add = convert_markdown_to_pdf(f, newfile, newfile.split(".")[0])
+            pdfs.append(file_to_add)
+        else:
+            warnings.warn(
+                f"File {f} was not converted to PDF because its file format is not included in those that can be converted",
+                FileNotConvertedWarning,
+            )
+            continue
+    return pdfs
+def convert(file: str) -> str:
+    files = [file]
+    pdfs = to_pdf(files)
+    return pdfs[0]
+# Gradio interface for the PDF conversion tab: a single file input is passed
+# to ``convert`` and the value it returns is shown in a file output component.
+pdf_converter = gr.Interface(
+    fn=convert,
+    inputs=gr.File(label="Upload your file"),
+    outputs=gr.File(label="Converted PDF"),
+    title="File to PDF Converter",
+    description="Upload a file in .docx, .pdf, .html, .pptx, .csv, .xml, or .md format, and get it converted to PDF.",
+    api_name="convert_to_pdf",
+)
 demo = gr.TabbedInterface(
     [
         pdf_to_img,
         pptx_to_text,
         url_parser,
         str_to_json,
+        pdf_converter,
     ],
     [
         "PDF to Image",
         "Extract PPTX Text",
         "Extract text from URL",
         "Extract Json",
+        "Convert to PDF",
     ],
 )

requirements.txt CHANGED Viewed

@@ -6,6 +6,7 @@ pdfplumber
 python-docx
 gradio
 python-pptx
 # numpy<2
 # torch>=2
 # spaces

 python-docx
 gradio
 python-pptx
+pdfitdown
 # numpy<2
 # torch>=2
 # spaces