drewThomasson committed on
Commit
e8f95a9
1 Parent(s): f6b6deb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -85
app.py CHANGED
@@ -3,108 +3,182 @@ import os
3
  import jpype
4
  import jpype.imports
5
  from jpype.types import *
 
 
 
 
 
 
 
 
6
 
7
- # Start the JVM and import POI classes
8
- def init_poi():
9
- if not jpype.isJVMStarted():
10
- jpype.startJVM()
11
 
12
- from org.apache.poi.xwpf.usermodel import XWPFDocument
13
- from org.apache.poi.hwpf.usermodel import HWPFDocument
14
- from org.apache.poi.xssf.usermodel import XSSFWorkbook
15
- from org.apache.poi.hssf.usermodel import HSSFWorkbook
16
- from org.apache.poi.sl.usermodel import SlideShowFactory
17
- from java.io import FileInputStream, FileOutputStream
18
- from fr.opensagres.poi.xwpf.converter.pdf import PdfConverter # Requires poi-converter library
19
- return True
20
-
21
- def convert_file_with_progress(input_file, output_format, progress=None):
22
- try:
23
- input_filepath = input_file.name
24
- output_filename = f"{os.path.splitext(os.path.basename(input_filepath))[0]}.{output_format}"
25
- output_filepath = os.path.join(os.getcwd(), output_filename)
26
-
27
- if progress:
28
- progress(0.1, "Loading document...")
29
 
30
- # Initialize POI
31
- init_poi()
32
-
33
- # Open input file with appropriate POI class
34
- extension = os.path.splitext(input_filepath)[1].lower()
35
- input_stream = FileInputStream(input_filepath)
 
 
36
 
37
- if extension in ['.docx', '.doc']:
38
- if extension == '.docx':
39
- doc = XWPFDocument(input_stream)
40
- else:
41
- doc = HWPFDocument(input_stream)
42
-
43
- if progress:
44
- progress(0.3, "Processing document...")
45
-
46
- # Get total pages/paragraphs for progress tracking
47
- total_paragraphs = len(doc.getParagraphs())
 
 
 
 
 
 
 
 
 
48
 
49
- if output_format == 'pdf':
50
- output_stream = FileOutputStream(output_filepath)
51
- converter = PdfConverter.getInstance()
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- # Custom progress listener
54
  class ProgressListener(jpype.JImplements('fr.opensagres.poi.xwpf.converter.core.IProgressListener')):
55
  def onProgress(self, processed_paragraphs):
56
  if progress:
57
  percent = min((processed_paragraphs / total_paragraphs) * 100, 100)
58
- progress(percent/100, f"Converting: {percent:.1f}%")
59
-
60
- converter.setProgressListener(ProgressListener())
61
- converter.convert(doc, output_stream)
62
- output_stream.close()
63
-
64
- elif extension in ['.xlsx', '.xls']:
65
- if extension == '.xlsx':
66
- workbook = XSSFWorkbook(input_stream)
67
- else:
68
- workbook = HSSFWorkbook(input_stream)
69
 
70
- if progress:
71
- progress(0.3, "Processing spreadsheet...")
 
 
 
 
72
 
73
- # Get total sheets/cells for progress tracking
74
- total_sheets = workbook.getNumberOfSheets()
75
- processed_sheets = 0
76
-
77
- output_stream = FileOutputStream(output_filepath)
78
- for sheet_idx in range(total_sheets):
79
  if progress:
80
- processed_sheets += 1
81
- percent = (processed_sheets / total_sheets) * 100
82
- progress(percent/100, f"Converting sheet {processed_sheets}/{total_sheets}")
83
 
84
- # Process sheet here
85
- sheet = workbook.getSheetAt(sheet_idx)
 
 
86
 
87
- workbook.write(output_stream)
88
- output_stream.close()
 
 
 
 
 
 
 
 
 
 
89
 
90
- input_stream.close()
91
-
92
- if progress:
93
- progress(1.0, "Conversion complete!")
94
-
95
- return output_filepath
96
-
97
- except Exception as e:
98
- if progress:
99
- progress(1.0, f"Error: {str(e)}")
100
- return f"Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
- # Supported formats (limited to what POI handles well)
103
- supported_formats = [
104
- "pdf", "docx", "doc", "xlsx", "xls"
105
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  def gradio_interface():
 
 
 
 
 
 
 
108
  with gr.Blocks() as demo:
109
  with gr.Row():
110
  with gr.Column():
@@ -123,8 +197,8 @@ def gradio_interface():
123
 
124
  def wrapped_convert(input_file, output_format, progress=None):
125
  status_text.update("Starting conversion...")
126
- result = convert_file_with_progress(input_file, output_format, progress)
127
- status_text.update("Complete!" if not result.startswith("Error") else result)
128
  return result
129
 
130
  convert_button.click(
 
3
  import jpype
4
  import jpype.imports
5
  from jpype.types import *
6
+ import subprocess
7
+ import uno
8
+ from com.sun.star.beans import PropertyValue
9
+ from com.sun.star.connection import NoConnectException
10
+ import time
11
+ import sys
12
+ import logging
13
+ from pathlib import Path
14
 
15
+ class DocumentConverter:
16
+ def __init__(self):
17
+ self.init_jpype()
18
+ self.init_libreoffice()
19
 
20
+ def init_jpype(self):
21
+ if not jpype.isJVMStarted():
22
+ jpype.startJVM()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
+ # Import POI classes
25
+ from org.apache.poi.xwpf.usermodel import XWPFDocument
26
+ from org.apache.poi.hwpf.usermodel import HWPFDocument
27
+ from org.apache.poi.xssf.usermodel import XSSFWorkbook
28
+ from org.apache.poi.hssf.usermodel import HSSFWorkbook
29
+ from org.apache.poi.sl.usermodel import SlideShowFactory
30
+ from java.io import FileInputStream, FileOutputStream
31
+ from fr.opensagres.poi.xwpf.converter.pdf import PdfConverter
32
 
33
+ def init_libreoffice(self):
34
+ # Start LibreOffice in listening mode if not already running
35
+ try:
36
+ subprocess.Popen(['soffice', '--headless', '--accept=socket,host=localhost,port=2002;urp;'])
37
+ time.sleep(2) # Give LibreOffice time to start
38
+ except Exception as e:
39
+ logging.error(f"Failed to start LibreOffice: {e}")
40
+
41
+ def connect_to_libreoffice(self):
42
+ localContext = uno.getComponentContext()
43
+ resolver = localContext.ServiceManager.createInstanceWithContext(
44
+ "com.sun.star.bridge.UnoUrlResolver", localContext)
45
+ context = resolver.resolve("uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext")
46
+ return context.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", context)
47
+
48
+ def convert_with_poi(self, input_file, output_format, progress=None):
49
+ try:
50
+ input_filepath = input_file.name
51
+ output_filename = f"{os.path.splitext(os.path.basename(input_filepath))[0]}.{output_format}"
52
+ output_filepath = os.path.join(os.getcwd(), output_filename)
53
 
54
+ extension = os.path.splitext(input_filepath)[1].lower()
55
+ input_stream = FileInputStream(input_filepath)
56
+
57
+ # Handle Word documents
58
+ if extension in ['.docx', '.doc']:
59
+ if progress:
60
+ progress(0.1, "Loading document...")
61
+
62
+ if extension == '.docx':
63
+ doc = XWPFDocument(input_stream)
64
+ else:
65
+ doc = HWPFDocument(input_stream)
66
+
67
+ total_paragraphs = len(doc.getParagraphs())
68
 
 
69
  class ProgressListener(jpype.JImplements('fr.opensagres.poi.xwpf.converter.core.IProgressListener')):
70
  def onProgress(self, processed_paragraphs):
71
  if progress:
72
  percent = min((processed_paragraphs / total_paragraphs) * 100, 100)
73
+ progress(percent/100, f"Converting paragraph {processed_paragraphs} of {total_paragraphs}")
 
 
 
 
 
 
 
 
 
 
74
 
75
+ if output_format == 'pdf':
76
+ output_stream = FileOutputStream(output_filepath)
77
+ converter = PdfConverter.getInstance()
78
+ converter.setProgressListener(ProgressListener())
79
+ converter.convert(doc, output_stream)
80
+ output_stream.close()
81
 
82
+ # Handle Excel files
83
+ elif extension in ['.xlsx', '.xls']:
 
 
 
 
84
  if progress:
85
+ progress(0.1, "Loading spreadsheet...")
 
 
86
 
87
+ if extension == '.xlsx':
88
+ workbook = XSSFWorkbook(input_stream)
89
+ else:
90
+ workbook = HSSFWorkbook(input_stream)
91
 
92
+ total_sheets = workbook.getNumberOfSheets()
93
+
94
+ for sheet_idx in range(total_sheets):
95
+ if progress:
96
+ percent = ((sheet_idx + 1) / total_sheets) * 100
97
+ progress(percent/100, f"Converting sheet {sheet_idx + 1} of {total_sheets}")
98
+
99
+ sheet = workbook.getSheetAt(sheet_idx)
100
+
101
+ output_stream = FileOutputStream(output_filepath)
102
+ workbook.write(output_stream)
103
+ output_stream.close()
104
 
105
+ input_stream.close()
106
+ return output_filepath
107
+
108
+ except Exception as e:
109
+ logging.error(f"POI conversion error: {e}")
110
+ return f"Error: {str(e)}"
111
+
112
+ def convert_with_libreoffice(self, input_file, output_format, progress=None):
113
+ try:
114
+ input_filepath = input_file.name
115
+ output_filename = f"{os.path.splitext(os.path.basename(input_filepath))[0]}.{output_format}"
116
+ output_filepath = os.path.join(os.getcwd(), output_filename)
117
+
118
+ if progress:
119
+ progress(0.1, "Connecting to LibreOffice...")
120
+
121
+ desktop = self.connect_to_libreoffice()
122
+
123
+ if progress:
124
+ progress(0.2, "Loading document...")
125
+
126
+ props = PropertyValue(Name="Hidden", Value=True),
127
+ doc = desktop.loadComponentFromURL(
128
+ f"file://{os.path.abspath(input_filepath)}", "_blank", 0, props)
129
+
130
+ if progress:
131
+ progress(0.5, "Converting document...")
132
+
133
+ output_url = f"file://{os.path.abspath(output_filepath)}"
134
+ props = PropertyValue(Name="FilterName", Value=self.get_filter_name(output_format)),
135
+
136
+ doc.storeToURL(output_url, props)
137
+ doc.close(True)
138
+
139
+ if progress:
140
+ progress(1.0, "Conversion complete!")
141
+
142
+ return output_filepath
143
+
144
+ except Exception as e:
145
+ logging.error(f"LibreOffice conversion error: {e}")
146
+ return f"Error: {str(e)}"
147
 
148
+ def get_filter_name(self, format):
149
+ filters = {
150
+ 'pdf': 'writer_pdf_Export',
151
+ 'doc': 'MS Word 97',
152
+ 'docx': 'Office Open XML Text',
153
+ 'odt': 'writer8',
154
+ 'rtf': 'Rich Text Format',
155
+ 'txt': 'Text',
156
+ 'html': 'HTML',
157
+ 'ppt': 'MS PowerPoint 97',
158
+ 'pptx': 'Impress Office Open XML',
159
+ 'xls': 'MS Excel 97',
160
+ 'xlsx': 'Office Open XML Spreadsheet'
161
+ }
162
+ return filters.get(format, 'writer_pdf_Export')
163
+
164
+ def convert_file(self, input_file, output_format, progress=None):
165
+ extension = os.path.splitext(input_file.name)[1].lower()
166
+
167
+ # Use POI for Office formats when possible (better progress tracking)
168
+ if extension in ['.docx', '.doc', '.xlsx', '.xls'] and output_format in ['pdf', 'docx', 'xlsx']:
169
+ return self.convert_with_poi(input_file, output_format, progress)
170
+ # Use LibreOffice for other formats
171
+ else:
172
+ return self.convert_with_libreoffice(input_file, output_format, progress)
173
 
174
  def gradio_interface():
175
+ converter = DocumentConverter()
176
+
177
+ supported_formats = [
178
+ "pdf", "doc", "docx", "html", "txt", "odt", "rtf",
179
+ "ppt", "pptx", "xls", "xlsx"
180
+ ]
181
+
182
  with gr.Blocks() as demo:
183
  with gr.Row():
184
  with gr.Column():
 
197
 
198
  def wrapped_convert(input_file, output_format, progress=None):
199
  status_text.update("Starting conversion...")
200
+ result = converter.convert_file(input_file, output_format, progress)
201
+ status_text.update("Complete!" if not isinstance(result, str) else result)
202
  return result
203
 
204
  convert_button.click(