Spaces:

drewThomasson
/

libreoffice-file-converter

Sleeping

App Files Community

drewThomasson committed on Nov 23, 2024

Commit

e8f95a9

•

1 Parent(s): f6b6deb

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -85

app.py CHANGED Viewed

@@ -3,108 +3,182 @@ import os
 import jpype
 import jpype.imports
 from jpype.types import *
-# Start the JVM and import POI classes
-def init_poi():
-    if not jpype.isJVMStarted():
-        jpype.startJVM()
-    from org.apache.poi.xwpf.usermodel import XWPFDocument
-    from org.apache.poi.hwpf.usermodel import HWPFDocument
-    from org.apache.poi.xssf.usermodel import XSSFWorkbook
-    from org.apache.poi.hssf.usermodel import HSSFWorkbook
-    from org.apache.poi.sl.usermodel import SlideShowFactory
-    from java.io import FileInputStream, FileOutputStream
-    from fr.opensagres.poi.xwpf.converter.pdf import PdfConverter  # Requires poi-converter library
-    return True
-def convert_file_with_progress(input_file, output_format, progress=None):
-    try:
-        input_filepath = input_file.name
-        output_filename = f"{os.path.splitext(os.path.basename(input_filepath))[0]}.{output_format}"
-        output_filepath = os.path.join(os.getcwd(), output_filename)
-        if progress:
-            progress(0.1, "Loading document...")
-        # Initialize POI
-        init_poi()
-        # Open input file with appropriate POI class
-        extension = os.path.splitext(input_filepath)[1].lower()
-        input_stream = FileInputStream(input_filepath)
-        if extension in ['.docx', '.doc']:
-            if extension == '.docx':
-                doc = XWPFDocument(input_stream)
-            else:
-                doc = HWPFDocument(input_stream)
-            if progress:
-                progress(0.3, "Processing document...")
-            # Get total pages/paragraphs for progress tracking
-            total_paragraphs = len(doc.getParagraphs())
-            if output_format == 'pdf':
-                output_stream = FileOutputStream(output_filepath)
-                converter = PdfConverter.getInstance()
-                # Custom progress listener
                 class ProgressListener(jpype.JImplements('fr.opensagres.poi.xwpf.converter.core.IProgressListener')):
                     def onProgress(self, processed_paragraphs):
                         if progress:
                             percent = min((processed_paragraphs / total_paragraphs) * 100, 100)
-                            progress(percent/100, f"Converting: {percent:.1f}%")
-                converter.setProgressListener(ProgressListener())
-                converter.convert(doc, output_stream)
-                output_stream.close()
-        elif extension in ['.xlsx', '.xls']:
-            if extension == '.xlsx':
-                workbook = XSSFWorkbook(input_stream)
-            else:
-                workbook = HSSFWorkbook(input_stream)
-            if progress:
-                progress(0.3, "Processing spreadsheet...")
-            # Get total sheets/cells for progress tracking
-            total_sheets = workbook.getNumberOfSheets()
-            processed_sheets = 0
-            output_stream = FileOutputStream(output_filepath)
-            for sheet_idx in range(total_sheets):
                 if progress:
-                    processed_sheets += 1
-                    percent = (processed_sheets / total_sheets) * 100
-                    progress(percent/100, f"Converting sheet {processed_sheets}/{total_sheets}")
-                # Process sheet here
-                sheet = workbook.getSheetAt(sheet_idx)
-            workbook.write(output_stream)
-            output_stream.close()
-        input_stream.close()
-        if progress:
-            progress(1.0, "Conversion complete!")
-        return output_filepath
-    except Exception as e:
-        if progress:
-            progress(1.0, f"Error: {str(e)}")
-        return f"Error: {str(e)}"
-# Supported formats (limited to what POI handles well)
-supported_formats = [
-    "pdf", "docx", "doc", "xlsx", "xls"
-]
 def gradio_interface():
     with gr.Blocks() as demo:
         with gr.Row():
             with gr.Column():
@@ -123,8 +197,8 @@ def gradio_interface():
         def wrapped_convert(input_file, output_format, progress=None):
             status_text.update("Starting conversion...")
-            result = convert_file_with_progress(input_file, output_format, progress)
-            status_text.update("Complete!" if not result.startswith("Error") else result)
             return result
         convert_button.click(

 import jpype
 import jpype.imports
 from jpype.types import *
+import subprocess
+import uno
+from com.sun.star.beans import PropertyValue
+from com.sun.star.connection import NoConnectException
+import time
+import sys
+import logging
+from pathlib import Path
class DocumentConverter:
    """Convert office documents with Apache POI (via JPype) or LibreOffice (via UNO).

    POI is used only for the conversions its code path can actually produce
    (``.docx`` to PDF, and re-saving a spreadsheet in its own format); every
    other request, and any request for which POI fails, is handled by a
    headless LibreOffice instance reached over a local socket.

    Every ``convert_*`` method returns the output file path on success and a
    string starting with ``"Error: "`` on failure; they do not raise.
    """

    # Socket on which the headless LibreOffice instance accepts UNO connections.
    LO_HOST = "localhost"
    LO_PORT = 2002
    # Upper bound, in seconds, on waiting for a freshly launched LibreOffice.
    LO_STARTUP_TIMEOUT = 15.0

    # Mapping of Java class name -> JPype class proxy, filled in by
    # init_jpype(); None means the POI path is unavailable.
    _poi = None

    def __init__(self):
        """Start the JVM (best effort) and make sure LibreOffice is listening."""
        self.init_jpype()
        self.init_libreoffice()

    def init_jpype(self):
        """Start the JVM if needed and keep the Java classes the POI path uses.

        The classes are stored on the instance because names imported inside
        a method are local to that method and would not be visible to
        ``convert_with_poi``. If the JVM or any class cannot be loaded, the
        failure is logged and the POI path is disabled, so LibreOffice handles
        all conversions instead of the application failing to start.
        """
        try:
            if not jpype.isJVMStarted():
                jpype.startJVM()
            from org.apache.poi.xwpf.usermodel import XWPFDocument
            from org.apache.poi.xssf.usermodel import XSSFWorkbook
            from org.apache.poi.hssf.usermodel import HSSFWorkbook
            from java.io import FileInputStream, FileOutputStream
            from fr.opensagres.poi.xwpf.converter.pdf import PdfConverter
        except Exception:
            logging.exception("POI is unavailable; all conversions will use LibreOffice")
            self._poi = None
            return
        self._poi = {
            "XWPFDocument": XWPFDocument,
            "XSSFWorkbook": XSSFWorkbook,
            "HSSFWorkbook": HSSFWorkbook,
            "FileInputStream": FileInputStream,
            "FileOutputStream": FileOutputStream,
            "PdfConverter": PdfConverter,
        }

    def init_libreoffice(self):
        """Launch headless LibreOffice unless something already listens on the port.

        After launching, poll the port until it accepts connections or
        ``LO_STARTUP_TIMEOUT`` elapses, rather than sleeping a fixed time.
        Failures are logged, not raised.
        """
        if self._libreoffice_listening():
            return
        accept = f"--accept=socket,host={self.LO_HOST},port={self.LO_PORT};urp;"
        try:
            subprocess.Popen(['soffice', '--headless', accept])
        except OSError as e:
            logging.error(f"Failed to start LibreOffice: {e}")
            return
        deadline = time.monotonic() + self.LO_STARTUP_TIMEOUT
        while time.monotonic() < deadline:
            if self._libreoffice_listening():
                return
            time.sleep(0.25)
        logging.error(
            "LibreOffice is not accepting connections on %s:%s after %.0f seconds",
            self.LO_HOST, self.LO_PORT, self.LO_STARTUP_TIMEOUT)

    def _libreoffice_listening(self):
        """Return True if a TCP connection to the LibreOffice port succeeds."""
        import socket  # local import: the module does not import socket at top level

        try:
            with socket.create_connection((self.LO_HOST, self.LO_PORT), timeout=0.5):
                return True
        except OSError:
            return False

    def connect_to_libreoffice(self):
        """Resolve the running LibreOffice instance and return its Desktop service."""
        local_context = uno.getComponentContext()
        resolver = local_context.ServiceManager.createInstanceWithContext(
            "com.sun.star.bridge.UnoUrlResolver", local_context)
        context = resolver.resolve(
            f"uno:socket,host={self.LO_HOST},port={self.LO_PORT};urp;StarOffice.ComponentContext")
        return context.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", context)

    @staticmethod
    def _report(progress, fraction, message):
        """Forward a progress update when a progress callback was supplied."""
        if progress:
            progress(fraction, message)

    @staticmethod
    def _output_path(input_filepath, output_format):
        """Return ``<cwd>/<input stem>.<output_format>``."""
        stem = os.path.splitext(os.path.basename(input_filepath))[0]
        return os.path.join(os.getcwd(), f"{stem}.{output_format}")

    @staticmethod
    def _file_url(path):
        """Return a ``file://`` URL for *path* with special characters escaped."""
        return Path(os.path.abspath(path)).as_uri()

    @staticmethod
    def _close_quietly(resource):
        """Close *resource* if it is set, logging instead of raising on failure."""
        if resource is None:
            return
        try:
            resource.close()
        except Exception as e:
            logging.warning("Failed to close %r: %s", resource, e)

    def _poi_supports(self, extension, output_format):
        """Return True if the POI path can produce *output_format* from *extension*.

        That is ``.docx`` to PDF, or writing an ``.xlsx``/``.xls`` workbook
        back out in the same format. Always False when POI failed to load.
        """
        if self._poi is None:
            return False
        if extension == '.docx':
            return output_format == 'pdf'
        return extension in ('.xlsx', '.xls') and extension == f".{output_format}"

    def convert_with_poi(self, input_file, output_format, progress=None):
        """Convert *input_file* with Apache POI.

        Args:
            input_file: Object whose ``name`` attribute is the input path.
            output_format: Target extension without the dot (e.g. ``"pdf"``).
            progress: Optional callable ``progress(fraction, message)``.

        Returns:
            The output path, or an ``"Error: ..."`` string when the
            conversion is unsupported by this path or fails.
        """
        input_filepath = input_file.name
        extension = os.path.splitext(input_filepath)[1].lower()
        if not self._poi_supports(extension, output_format):
            message = f"POI cannot convert {extension or 'this file'} to {output_format}"
            logging.error("POI conversion error: %s", message)
            return f"Error: {message}"

        poi = self._poi
        output_filepath = self._output_path(input_filepath, output_format)
        input_stream = output_stream = document = None
        try:
            input_stream = poi["FileInputStream"](input_filepath)
            if extension == '.docx':
                self._report(progress, 0.1, "Loading document...")
                document = poi["XWPFDocument"](input_stream)
                # Never 0, so the percentage below cannot divide by zero.
                total_paragraphs = max(document.getParagraphs().size(), 1)

                # JImplements is a class decorator (not a base class) and the
                # interface methods must be marked with JOverride.
                # NOTE(review): confirm that IProgressListener and
                # setProgressListener exist in the bundled converter version.
                @jpype.JImplements('fr.opensagres.poi.xwpf.converter.core.IProgressListener')
                class ProgressListener:
                    @jpype.JOverride
                    def onProgress(self, processed_paragraphs):
                        if progress:
                            percent = min((processed_paragraphs / total_paragraphs) * 100, 100)
                            progress(percent/100, f"Converting paragraph {processed_paragraphs} of {total_paragraphs}")

                output_stream = poi["FileOutputStream"](output_filepath)
                converter = poi["PdfConverter"].getInstance()
                converter.setProgressListener(ProgressListener())
                converter.convert(document, output_stream)
            else:
                self._report(progress, 0.1, "Loading spreadsheet...")
                workbook_class = poi["XSSFWorkbook"] if extension == '.xlsx' else poi["HSSFWorkbook"]
                document = workbook_class(input_stream)
                total_sheets = document.getNumberOfSheets()
                # The workbook is written in a single call below; these
                # updates only count through the sheets for the progress bar.
                for sheet_idx in range(total_sheets):
                    self._report(
                        progress, (sheet_idx + 1) / total_sheets,
                        f"Converting sheet {sheet_idx + 1} of {total_sheets}")
                output_stream = poi["FileOutputStream"](output_filepath)
                document.write(output_stream)
            return output_filepath
        except Exception as e:
            logging.exception("POI conversion error: %s", e)
            return f"Error: {str(e)}"
        finally:
            # Release Java resources on both the success and the error path.
            for resource in (output_stream, document, input_stream):
                self._close_quietly(resource)

    @staticmethod
    def _document_family(doc):
        """Return the LibreOffice module name ('calc', 'impress', 'draw' or 'writer') of *doc*."""
        families = (
            ("com.sun.star.sheet.SpreadsheetDocument", "calc"),
            # Checked before DrawingDocument so presentations are not
            # classified as plain drawings.
            ("com.sun.star.presentation.PresentationDocument", "impress"),
            ("com.sun.star.drawing.DrawingDocument", "draw"),
        )
        for service, family in families:
            if doc.supportsService(service):
                return family
        return "writer"

    def convert_with_libreoffice(self, input_file, output_format, progress=None):
        """Convert *input_file* by loading it into LibreOffice and storing it with an export filter.

        Args:
            input_file: Object whose ``name`` attribute is the input path.
            output_format: Target extension without the dot.
            progress: Optional callable ``progress(fraction, message)``.

        Returns:
            The output path, or an ``"Error: ..."`` string on failure.
        """
        doc = None
        try:
            input_filepath = input_file.name
            output_filepath = self._output_path(input_filepath, output_format)

            self._report(progress, 0.1, "Connecting to LibreOffice...")
            desktop = self.connect_to_libreoffice()

            self._report(progress, 0.2, "Loading document...")
            load_props = (PropertyValue(Name="Hidden", Value=True),)
            doc = desktop.loadComponentFromURL(
                self._file_url(input_filepath), "_blank", 0, load_props)
            if doc is None:
                message = f"LibreOffice could not open {input_filepath}"
                logging.error("LibreOffice conversion error: %s", message)
                return f"Error: {message}"

            self._report(progress, 0.5, "Converting document...")
            filter_name = self.get_filter_name(output_format, self._document_family(doc))
            store_props = (PropertyValue(Name="FilterName", Value=filter_name),)
            doc.storeToURL(self._file_url(output_filepath), store_props)

            self._report(progress, 1.0, "Conversion complete!")
            return output_filepath
        except Exception as e:
            logging.exception("LibreOffice conversion error: %s", e)
            return f"Error: {str(e)}"
        finally:
            # Always release the loaded document, even when storing failed.
            if doc is not None:
                try:
                    doc.close(True)
                except Exception as e:
                    logging.warning("Failed to close LibreOffice document: %s", e)

    def get_filter_name(self, format, family="writer"):
        """Return the LibreOffice export filter name for *format*.

        Args:
            format: Target extension without the dot. (The name shadows the
                builtin but is kept because it is part of the signature.)
            family: LibreOffice module of the loaded document ('writer',
                'calc', 'impress' or 'draw'); it selects the matching PDF
                export filter. Defaults to 'writer'.

        Returns:
            The filter's internal name (not the label shown in the Save
            dialog). Unknown formats fall back to 'writer_pdf_Export'.
        """
        if format == 'pdf':
            return f"{family}_pdf_Export"
        filters = {
            'doc': 'MS Word 97',
            'docx': 'Office Open XML Text',
            'odt': 'writer8',
            'rtf': 'Rich Text Format',
            'txt': 'Text',
            'html': 'HTML (StarWriter)',
            'ppt': 'MS PowerPoint 97',
            'pptx': 'Impress Office Open XML',
            'xls': 'MS Excel 97',
            'xlsx': 'Calc Office Open XML',
        }
        return filters.get(format, 'writer_pdf_Export')

    def convert_file(self, input_file, output_format, progress=None):
        """Convert *input_file* to *output_format*, preferring POI where it applies.

        POI is tried first for the conversions it supports (it reports
        finer-grained progress); if it is not applicable or returns an error,
        the request is handled by LibreOffice.

        Returns:
            The output path, or an ``"Error: ..."`` string on failure.
        """
        extension = os.path.splitext(input_file.name)[1].lower()
        if self._poi_supports(extension, output_format):
            result = self.convert_with_poi(input_file, output_format, progress)
            if not result.startswith("Error: "):
                return result
            logging.warning("POI conversion failed (%s); falling back to LibreOffice", result)
        return self.convert_with_libreoffice(input_file, output_format, progress)
 def gradio_interface():
+    converter = DocumentConverter()
+    supported_formats = [
+        "pdf", "doc", "docx", "html", "txt", "odt", "rtf",
+        "ppt", "pptx", "xls", "xlsx"
+    ]
     with gr.Blocks() as demo:
         with gr.Row():
             with gr.Column():
         def wrapped_convert(input_file, output_format, progress=None):
             status_text.update("Starting conversion...")
+            result = converter.convert_file(input_file, output_format, progress)
+            status_text.update("Complete!" if not isinstance(result, str) else result)
             return result
         convert_button.click(