# app.py (8.63 kB)
# Last change: "Update app.py" by drewThomasson, commit e8f95a9 (verified).
import gradio as gr
import os
import jpype
import jpype.imports
from jpype.types import *
import subprocess
import uno
from com.sun.star.beans import PropertyValue
from com.sun.star.connection import NoConnectException
import time
import sys
import logging
from pathlib import Path
class DocumentConverter:
    """Convert office documents with Apache POI (via JPype) or LibreOffice (via UNO).

    Creating an instance starts the JVM if needed, loads the POI classes and
    launches a headless LibreOffice listening on localhost:2002.

    Every ``convert_*`` method returns the absolute path of the written file on
    success, or a string starting with ``"Error:"`` on failure; they do not
    raise.
    """

    # (source extension, target format) pairs the POI path can really produce:
    # docx -> pdf through the XWPF PDF converter, and a same-format re-save of
    # a workbook. Everything else is routed to LibreOffice.
    _POI_CONVERSIONS = frozenset({
        ('.docx', 'pdf'),
        ('.xlsx', 'xlsx'),
        ('.xls', 'xls'),
    })

    _SPREADSHEET_EXTENSIONS = frozenset({'.xls', '.xlsx', '.ods', '.csv'})
    _PRESENTATION_EXTENSIONS = frozenset({'.ppt', '.pptx', '.odp'})

    _UNO_URL = "uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext"

    def __init__(self):
        self.init_jpype()
        self.init_libreoffice()

    def init_jpype(self):
        """Start the JVM if necessary and keep the Java classes used for conversion.

        The classes are stored on the instance because names imported inside
        this method would otherwise be invisible to ``convert_with_poi``.

        Raises:
            ImportError: if the POI / converter classes are not on the JVM
                classpath.
        """
        if not jpype.isJVMStarted():
            jpype.startJVM()
        # Java packages are only importable once the JVM is running.
        from org.apache.poi.xwpf.usermodel import XWPFDocument
        from org.apache.poi.xssf.usermodel import XSSFWorkbook
        from org.apache.poi.hssf.usermodel import HSSFWorkbook
        from java.io import FileInputStream, FileOutputStream
        from fr.opensagres.poi.xwpf.converter.pdf import PdfConverter, PdfOptions
        self._java = {
            'XWPFDocument': XWPFDocument,
            'XSSFWorkbook': XSSFWorkbook,
            'HSSFWorkbook': HSSFWorkbook,
            'FileInputStream': FileInputStream,
            'FileOutputStream': FileOutputStream,
            'PdfConverter': PdfConverter,
            'PdfOptions': PdfOptions,
        }

    def init_libreoffice(self):
        """Launch a headless LibreOffice that accepts UNO connections on port 2002.

        A failure to spawn the process is logged, not raised; the problem then
        surfaces when ``connect_to_libreoffice`` is called.
        """
        try:
            # Keep the handle so the child process is not orphaned from Python's view.
            self._soffice_process = subprocess.Popen(
                ['soffice', '--headless', '--accept=socket,host=localhost,port=2002;urp;'])
            time.sleep(2)  # Give LibreOffice time to start
        except OSError as e:
            logging.error(f"Failed to start LibreOffice: {e}")

    def connect_to_libreoffice(self, retries=10, delay=0.5):
        """Return the ``com.sun.star.frame.Desktop`` of the listening LibreOffice.

        Args:
            retries: extra connection attempts made after the first one fails.
            delay: seconds to wait between attempts.

        Raises:
            NoConnectException: if the office is still unreachable after all
                attempts.
        """
        local_context = uno.getComponentContext()
        resolver = local_context.ServiceManager.createInstanceWithContext(
            "com.sun.star.bridge.UnoUrlResolver", local_context)
        attempts_left = max(int(retries), 0)
        while True:
            try:
                context = resolver.resolve(self._UNO_URL)
                break
            except NoConnectException:
                # The office may still be starting up; retry before giving up.
                if attempts_left <= 0:
                    raise
                attempts_left -= 1
                time.sleep(delay)
        return context.ServiceManager.createInstanceWithContext(
            "com.sun.star.frame.Desktop", context)

    @staticmethod
    def _resolve_input_path(input_file):
        """Return the filesystem path for an upload object, ``str`` or path-like.

        Raises:
            ValueError: if ``input_file`` is ``None``.
        """
        if input_file is None:
            raise ValueError("No input file provided")
        if isinstance(input_file, (str, os.PathLike)):
            return os.fspath(input_file)
        return input_file.name

    @staticmethod
    def _build_output_path(input_filepath, output_format):
        """Return ``<cwd>/<input stem>.<output_format>``."""
        stem = os.path.splitext(os.path.basename(input_filepath))[0]
        return os.path.join(os.getcwd(), f"{stem}.{output_format}")

    @staticmethod
    def _to_file_url(path):
        """Return a percent-encoded ``file://`` URL for ``path``."""
        # Plain string concatenation breaks on spaces, '#', '%' and non-ASCII names.
        return Path(os.path.abspath(path)).as_uri()

    def _attach_progress_listener(self, converter, total_paragraphs, progress):
        """Best-effort: forward per-paragraph progress from the converter.

        Any failure is logged and ignored so that missing progress support
        never aborts the conversion itself.
        """
        if not progress:
            return
        try:
            # NOTE(review): IProgressListener / setProgressListener could not be
            # confirmed in the converter's public API - verify against the jar.
            @jpype.JImplements('fr.opensagres.poi.xwpf.converter.core.IProgressListener')
            class ProgressListener:
                @jpype.JOverride
                def onProgress(self, processed_paragraphs):
                    if total_paragraphs > 0:
                        fraction = min(processed_paragraphs / total_paragraphs, 1.0)
                    else:
                        # Empty document: nothing to divide by.
                        fraction = 1.0
                    progress(
                        fraction,
                        f"Converting paragraph {processed_paragraphs} of {total_paragraphs}")

            converter.setProgressListener(ProgressListener())
        except Exception as e:
            logging.warning(f"Progress reporting unavailable for POI conversion: {e}")

    def _docx_to_pdf(self, input_stream, output_filepath, progress):
        """Render the .docx read from ``input_stream`` as a PDF at ``output_filepath``."""
        java = self._java
        if progress:
            progress(0.1, "Loading document...")
        doc = java['XWPFDocument'](input_stream)
        total_paragraphs = len(doc.getParagraphs())
        output_stream = java['FileOutputStream'](output_filepath)
        try:
            converter = java['PdfConverter'].getInstance()
            self._attach_progress_listener(converter, total_paragraphs, progress)
            # The converter API takes the options object as a third argument.
            converter.convert(doc, output_stream, java['PdfOptions'].create())
        finally:
            output_stream.close()

    def _resave_workbook(self, input_stream, extension, output_filepath, progress):
        """Load a workbook from ``input_stream`` and write it to ``output_filepath``."""
        java = self._java
        if progress:
            progress(0.1, "Loading spreadsheet...")
        workbook_class = java['XSSFWorkbook'] if extension == '.xlsx' else java['HSSFWorkbook']
        workbook = workbook_class(input_stream)
        total_sheets = workbook.getNumberOfSheets()
        if progress:
            for sheet_number in range(1, total_sheets + 1):
                progress(
                    sheet_number / total_sheets,
                    f"Converting sheet {sheet_number} of {total_sheets}")
        output_stream = java['FileOutputStream'](output_filepath)
        try:
            workbook.write(output_stream)
        finally:
            output_stream.close()

    def convert_with_poi(self, input_file, output_format, progress=None):
        """Convert ``input_file`` with Apache POI.

        Only the pairs in ``_POI_CONVERSIONS`` are supported; any other
        combination yields an error string instead of a mislabelled or missing
        output file.

        Args:
            input_file: upload object exposing ``.name``, or a path.
            output_format: target extension without the dot.
            progress: optional callable ``progress(fraction, message)``.

        Returns:
            The output path, or ``"Error: ..."`` on failure.
        """
        try:
            input_filepath = self._resolve_input_path(input_file)
            extension = os.path.splitext(input_filepath)[1].lower()
            if (extension, output_format) not in self._POI_CONVERSIONS:
                raise ValueError(
                    f"POI cannot convert '{extension}' files to '{output_format}'")
            output_filepath = self._build_output_path(input_filepath, output_format)
            input_stream = self._java['FileInputStream'](input_filepath)
            try:
                if extension == '.docx':
                    self._docx_to_pdf(input_stream, output_filepath, progress)
                else:
                    self._resave_workbook(input_stream, extension, output_filepath, progress)
            finally:
                # Release the file handle even when the conversion fails.
                input_stream.close()
            return output_filepath
        except Exception as e:
            logging.error(f"POI conversion error: {e}")
            return f"Error: {str(e)}"

    def convert_with_libreoffice(self, input_file, output_format, progress=None):
        """Convert ``input_file`` by loading it into LibreOffice and storing it again.

        Args:
            input_file: upload object exposing ``.name``, or a path.
            output_format: target extension without the dot.
            progress: optional callable ``progress(fraction, message)``.

        Returns:
            The output path, or ``"Error: ..."`` on failure.
        """
        try:
            input_filepath = self._resolve_input_path(input_file)
            extension = os.path.splitext(input_filepath)[1].lower()
            output_filepath = self._build_output_path(input_filepath, output_format)
            if progress:
                progress(0.1, "Connecting to LibreOffice...")
            desktop = self.connect_to_libreoffice()
            if progress:
                progress(0.2, "Loading document...")
            # UNO expects a sequence of PropertyValue, hence the one-element tuples.
            load_props = (PropertyValue(Name="Hidden", Value=True),)
            doc = desktop.loadComponentFromURL(
                self._to_file_url(input_filepath), "_blank", 0, load_props)
            if doc is None:
                raise RuntimeError(f"LibreOffice could not open {input_filepath}")
            try:
                if progress:
                    progress(0.5, "Converting document...")
                filter_name = self.get_filter_name(output_format, extension)
                store_props = (PropertyValue(Name="FilterName", Value=filter_name),)
                doc.storeToURL(self._to_file_url(output_filepath), store_props)
            finally:
                # Always release the hidden document, even if storing failed.
                doc.close(True)
            if progress:
                progress(1.0, "Conversion complete!")
            return output_filepath
        except Exception as e:
            logging.error(f"LibreOffice conversion error: {e}")
            return f"Error: {str(e)}"

    def get_filter_name(self, format, source_extension=None):
        """Return the LibreOffice export filter name for ``format``.

        Args:
            format: target extension without the dot.
            source_extension: optional extension of the source file (with the
                dot). When given and the target is PDF, the Calc or Impress
                PDF export filter is chosen for spreadsheets / presentations.

        Returns:
            The filter name; ``'writer_pdf_Export'`` for unknown formats.
        """
        if format == 'pdf' and source_extension:
            source_extension = source_extension.lower()
            if source_extension in self._SPREADSHEET_EXTENSIONS:
                return 'calc_pdf_Export'
            if source_extension in self._PRESENTATION_EXTENSIONS:
                return 'impress_pdf_Export'
        filters = {
            'pdf': 'writer_pdf_Export',
            'doc': 'MS Word 97',
            'docx': 'Office Open XML Text',
            'odt': 'writer8',
            'rtf': 'Rich Text Format',
            'txt': 'Text',
            'html': 'HTML',
            'ppt': 'MS PowerPoint 97',
            'pptx': 'Impress Office Open XML',
            'xls': 'MS Excel 97',
            # Internal filter name; "Office Open XML Spreadsheet" is only its UI label.
            'xlsx': 'Calc Office Open XML',
        }
        return filters.get(format, 'writer_pdf_Export')

    def convert_file(self, input_file, output_format, progress=None):
        """Convert ``input_file`` to ``output_format`` with the suitable backend.

        POI is used for the conversions it can actually perform (better
        progress tracking); everything else goes through LibreOffice.

        Returns:
            The output path, or ``"Error: ..."`` on failure (including a
            missing input file).
        """
        try:
            input_filepath = self._resolve_input_path(input_file)
        except ValueError as e:
            return f"Error: {e}"
        extension = os.path.splitext(input_filepath)[1].lower()
        if (extension, output_format) in self._POI_CONVERSIONS:
            return self.convert_with_poi(input_file, output_format, progress)
        return self.convert_with_libreoffice(input_file, output_format, progress)
def gradio_interface():
    """Build the Gradio UI for the document converter.

    Creates one ``DocumentConverter`` and wires an upload field, a target
    format dropdown and a button to it. The handler fills two outputs: the
    converted file and a status message.

    Returns:
        The ``gr.Blocks`` app, not yet launched.
    """
    converter = DocumentConverter()
    supported_formats = [
        "pdf", "doc", "docx", "html", "txt", "odt", "rtf",
        "ppt", "pptx", "xls", "xlsx",
    ]

    # Gradio injects a live tracker only when the handler declares a parameter
    # whose default is a gr.Progress instance; a plain None default never
    # receives one.
    def wrapped_convert(input_file, output_format, progress=gr.Progress()):
        """Run the conversion and return ``(file_path_or_None, status_message)``."""
        if input_file is None:
            return None, "Error: No input file provided"
        result = converter.convert_file(input_file, output_format, progress)
        # The converter reports failures as "Error: ..." strings; those must not
        # be handed to the File output as if they were a path.
        if isinstance(result, str) and result.startswith("Error:"):
            return None, result
        return result, "Complete!"

    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                file_input = gr.File(label="Upload a file")
                format_dropdown = gr.Dropdown(
                    choices=supported_formats,
                    value="pdf",
                    label="Select Output Format",
                )
                convert_button = gr.Button("Convert")
            with gr.Column():
                output_file = gr.File(label="Converted File")
                status_text = gr.Textbox(label="Status", interactive=False)
        # Component values can only be changed through the handler's return
        # values, so the status box is a second output.
        convert_button.click(
            fn=wrapped_convert,
            inputs=[file_input, format_dropdown],
            outputs=[output_file, status_text],
            show_progress="full",
        )
    return demo
# Script entry point: build the interface and serve it.
if __name__ == "__main__":
    app = gradio_interface()
    app.launch()