arad1367 committed on
Commit
ccd18dc
1 Parent(s): 3e041fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +254 -232
app.py CHANGED
@@ -1,233 +1,255 @@
1
- import spaces
2
- import gradio as gr
3
- import json
4
- import os
5
- from pathlib import Path
6
- import logging
7
- from docling.document_converter import DocumentConverter
8
- from docling.datamodel.base_models import InputFormat, DocumentStream
9
- from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
10
- from docling.document_converter import PdfFormatOption
11
- import requests
12
- from urllib.parse import urlparse
13
- from datetime import datetime
14
- import tempfile
15
- from docx import Document
16
- from docx.shared import Inches
17
- import markdown
18
-
19
- # Set up logging
20
- logging.basicConfig(level=logging.DEBUG)
21
- logger = logging.getLogger(__name__)
22
-
23
- def is_valid_url(url):
24
- try:
25
- result = urlparse(url)
26
- return all([result.scheme, result.netloc])
27
- except:
28
- return False
29
-
30
- def markdown_to_docx(markdown_content):
31
- """Convert markdown content to DOCX format"""
32
- doc = Document()
33
-
34
- # Split content into lines
35
- lines = markdown_content.split('\n')
36
-
37
- for line in lines:
38
- # Handle headers
39
- if line.startswith('# '):
40
- doc.add_heading(line[2:], level=1)
41
- elif line.startswith('## '):
42
- doc.add_heading(line[3:], level=2)
43
- elif line.startswith('### '):
44
- doc.add_heading(line[4:], level=3)
45
- # Handle lists
46
- elif line.startswith('* ') or line.startswith('- '):
47
- doc.add_paragraph(line[2:], style='List Bullet')
48
- elif line.startswith('1. '):
49
- doc.add_paragraph(line[3:], style='List Number')
50
- # Handle normal text
51
- elif line.strip():
52
- doc.add_paragraph(line)
53
- # Handle empty lines
54
- else:
55
- doc.add_paragraph()
56
-
57
- return doc
58
-
59
- def create_output_files(content, original_name):
60
- """Create temporary files for different formats and return their paths"""
61
- files = {}
62
-
63
- # Generate base filename
64
- base_name = Path(original_name).stem
65
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
66
-
67
- # Create markdown file
68
- md_path = tempfile.NamedTemporaryFile(delete=False, suffix='.md').name
69
- with open(md_path, "w", encoding="utf-8") as f:
70
- f.write(content)
71
- files['markdown'] = md_path
72
-
73
- # Create JSON file
74
- json_content = {
75
- "title": original_name,
76
- "content": content,
77
- "metadata": {
78
- "conversion_date": datetime.now().isoformat()
79
- }
80
- }
81
- json_path = tempfile.NamedTemporaryFile(delete=False, suffix='.json').name
82
- with open(json_path, "w", encoding="utf-8") as f:
83
- json.dump(json_content, f, ensure_ascii=False, indent=2)
84
- files['json'] = json_path
85
-
86
- # Create proper DOCX file
87
- docx_path = tempfile.NamedTemporaryFile(delete=False, suffix='.docx').name
88
- doc = markdown_to_docx(content)
89
- doc.save(docx_path)
90
- files['docx'] = docx_path
91
-
92
- return files
93
-
94
- @spaces.GPU()
95
- def process_document(input_type, file_input, url_input, use_gpu, table_mode):
96
- try:
97
- logger.debug(f"Processing with input type: {input_type}")
98
- logger.debug(f"File input: {file_input}")
99
-
100
- # Configure pipeline
101
- pipeline_options = PdfPipelineOptions(do_table_structure=True)
102
- if table_mode:
103
- pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
104
- else:
105
- pipeline_options.table_structure_options.mode = TableFormerMode.FAST
106
-
107
- converter = DocumentConverter(
108
- format_options={
109
- InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
110
- }
111
- )
112
-
113
- # Handle different input types
114
- if input_type == "file":
115
- if file_input is None:
116
- return None, None, None, None, "Please upload a file"
117
- source = file_input
118
- original_name = Path(file_input).name
119
- elif input_type == "url":
120
- if not url_input or not is_valid_url(url_input):
121
- return None, None, None, None, "Please enter a valid URL"
122
- source = url_input
123
- original_name = Path(urlparse(url_input).path).name or "url_document"
124
- else:
125
- return None, None, None, None, "Invalid input type"
126
-
127
- # Convert document
128
- logger.debug(f"Converting document: {source}")
129
- result = converter.convert(source)
130
-
131
- # Get markdown content
132
- markdown_content = result.document.export_to_markdown()
133
-
134
- # Create output files
135
- output_files = create_output_files(markdown_content, original_name)
136
-
137
- return (
138
- output_files['markdown'],
139
- output_files['json'],
140
- output_files['docx'],
141
- markdown_content,
142
- "Conversion completed successfully! Use the download buttons below to get your files."
143
- )
144
-
145
- except Exception as e:
146
- logger.exception("Error occurred during conversion")
147
- return None, None, None, None, f"Error during conversion: {str(e)}\nCheck the console for detailed error logs."
148
-
149
- # Create title HTML with custom style
150
- title_html = """
151
- <div style="text-align: center; max-width: 800px; margin: 0 auto;">
152
- <h1 style="color: #2C3E50; font-size: 2.5rem; margin-bottom: 0.5rem;">Professional Document Converter</h1>
153
- <p style="color: #34495E; font-size: 1.1rem; margin-bottom: 1.5rem;">Convert documents from files or URLs to various formats</p>
154
- </div>
155
- """
156
-
157
- # Create Gradio interface with custom theme
158
- with gr.Blocks(css="footer {display: none}") as demo:
159
- gr.HTML(title_html)
160
-
161
- with gr.Row():
162
- with gr.Column(scale=1):
163
- input_type = gr.Radio(
164
- choices=["file", "url"],
165
- value="file",
166
- label="Input Type"
167
- )
168
-
169
- # File input with proper file type handling
170
- file_input = gr.File(
171
- label="Upload Document",
172
- file_types=[".pdf", ".PDF"],
173
- type="filepath"
174
- )
175
-
176
- # URL input
177
- url_input = gr.Textbox(
178
- label="Or Enter URL",
179
- placeholder="https://arxiv.org/pdf/2408.09869"
180
- )
181
-
182
- # Processing options
183
- use_gpu = gr.Checkbox(label="Use GPU", value=True)
184
- table_mode = gr.Checkbox(label="Use Accurate Table Mode (Slower but better)", value=False)
185
-
186
- convert_btn = gr.Button("Convert Document", variant="primary")
187
-
188
- with gr.Column(scale=2):
189
- # Status message
190
- status_message = gr.Markdown("")
191
-
192
- # Preview area
193
- preview = gr.Markdown("", label="Preview")
194
-
195
- # Download files
196
- with gr.Group() as download_group:
197
- gr.Markdown("### Download Files")
198
- with gr.Row():
199
- markdown_output = gr.File(label="Download Markdown")
200
- json_output = gr.File(label="Download JSON")
201
- docx_output = gr.File(label="Download DOCX")
202
-
203
- # Define the main conversion event
204
- convert_btn.click(
205
- fn=process_document,
206
- inputs=[input_type, file_input, url_input, use_gpu, table_mode],
207
- outputs=[markdown_output, json_output, docx_output, preview, status_message]
208
- )
209
-
210
- # Add footer
211
- footer = """
212
- <div style="text-align: center; margin: 2rem auto; padding: 1rem; border-top: 1px solid #ddd; max-width: 800px;">
213
- <div style="margin-bottom: 1rem;">
214
- <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank" style="text-decoration: none; color: #2C3E50; margin: 0 10px;">LinkedIn</a> |
215
- <a href="https://github.com/arad1367" target="_blank" style="text-decoration: none; color: #2C3E50; margin: 0 10px;">GitHub</a> |
216
- <a href="https://arad1367.pythonanywhere.com/" target="_blank" style="text-decoration: none; color: #2C3E50; margin: 0 10px;">PhD Defense Demo</a> |
217
- <a href="https://github.com/DS4SD/docling" target="_blank" style="text-decoration: none; color: #2C3E50; margin: 0 10px;">Docling Project</a>
218
- </div>
219
- <p style="color: #7F8C8D; margin-top: 0.5rem;">Made with 💖 by Pejman Ebrahimi</p>
220
- </div>
221
- """
222
- gr.HTML(footer)
223
-
224
- # Launch the app
225
- if __name__ == "__main__":
226
- demo.queue(max_size=5) # Enable queue for better handling of multiple requests
227
- demo.launch(
228
- show_error=True,
229
- share=False,
230
- debug=True,
231
- show_api=False,
232
- server_name="0.0.0.0"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  )
 
1
+ import spaces
2
+ import gradio as gr
3
+ import json
4
+ import os
5
+ from pathlib import Path
6
+ import logging
7
+ from docling.document_converter import DocumentConverter
8
+ from docling.datamodel.base_models import InputFormat, DocumentStream
9
+ from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
10
+ from docling.document_converter import PdfFormatOption
11
+ import requests
12
+ from urllib.parse import urlparse
13
+ from datetime import datetime
14
+ import tempfile
15
+ from docx import Document
16
+ from docx.shared import Inches
17
+ import markdown
18
+
19
+ # Set up logging
20
+ logging.basicConfig(level=logging.DEBUG)
21
+ logger = logging.getLogger(__name__)
22
+
23
def is_valid_url(url):
    """Return True when *url* parses into both a scheme and a network location.

    Args:
        url: Candidate URL, normally the text typed into the URL box.

    Returns:
        bool: True if ``urlparse`` yields a non-empty scheme and netloc,
        False for anything else (empty, malformed, or unparseable input).
    """
    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed input such as an unterminated IPv6 literal.
        # TypeError / AttributeError: input that is neither str nor bytes.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
        return False
    return bool(parsed.scheme and parsed.netloc)
29
+
30
def markdown_to_docx(markdown_content):
    """Convert Markdown text into a python-docx ``Document``.

    Only line-level constructs are recognised: ATX headings (``#`` through
    ``######``), bullet items (``* `` / ``- ``) and numbered items
    (``<digits>. ``). Every other non-blank line becomes a plain paragraph
    and every blank line becomes an empty paragraph. Inline markup (bold,
    links, tables, ...) is copied through as literal text.

    Args:
        markdown_content: Markdown source as a single string.

    Returns:
        The populated ``Document`` object; it is not saved here.
    """
    doc = Document()

    for line in markdown_content.split('\n'):
        # ATX heading: one to six leading '#' characters followed by a space.
        heading_text = line.lstrip('#')
        level = len(line) - len(heading_text)
        if 1 <= level <= 6 and heading_text.startswith(' '):
            doc.add_heading(heading_text[1:], level=level)
            continue

        # Bullet item.
        if line.startswith(('* ', '- ')):
            doc.add_paragraph(line[2:], style='List Bullet')
            continue

        # Numbered item: any run of ASCII digits before ". ", so "2. " and
        # "10. " are list items too, not only the literal "1. ".
        number, separator, item_text = line.partition('. ')
        if separator and number.isascii() and number.isdigit():
            doc.add_paragraph(item_text, style='List Number')
            continue

        if line.strip():
            # Normal text.
            doc.add_paragraph(line)
        else:
            # Blank line: keep the vertical spacing as an empty paragraph.
            doc.add_paragraph()

    return doc
58
+
59
def create_output_files(content, original_name):
    """Write the converted content to temporary Markdown, JSON and DOCX files.

    The files are created with ``delete=False`` so they outlive this call;
    nothing in this function removes them.

    Args:
        content: Markdown text produced by the conversion.
        original_name: Name of the source document; stored as the JSON
            ``"title"`` field.

    Returns:
        dict: File paths keyed by ``'markdown'``, ``'json'`` and ``'docx'``.
    """
    files = {}

    # Markdown: write through the temp-file handle itself so it is closed
    # deterministically instead of being left open and re-opened by path.
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix='.md', delete=False
    ) as md_file:
        md_file.write(content)
    files['markdown'] = md_file.name

    # JSON: the Markdown text wrapped with a title and conversion timestamp.
    json_content = {
        "title": original_name,
        "content": content,
        "metadata": {
            "conversion_date": datetime.now().isoformat()
        }
    }
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix='.json', delete=False
    ) as json_file:
        json.dump(json_content, json_file, ensure_ascii=False, indent=2)
    files['json'] = json_file.name

    # DOCX: reserve a path, close our handle, then let python-docx write to
    # that path.
    with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as docx_file:
        docx_path = docx_file.name
    markdown_to_docx(content).save(docx_path)
    files['docx'] = docx_path

    return files
93
+
94
@spaces.GPU()
def process_document(input_type, file_input, url_input, use_gpu, table_mode):
    """Convert an uploaded file or a URL and return the values for the UI outputs.

    Args:
        input_type: ``"file"`` or ``"url"``; any other value is rejected.
        file_input: Path of the uploaded file, or None when nothing was uploaded.
        url_input: URL text; surrounding whitespace is ignored.
        use_gpu: Value of the "Use GPU" checkbox.
            NOTE(review): accepted for interface compatibility but not read
            anywhere in this function.
        table_mode: Truthy selects ``TableFormerMode.ACCURATE``, falsy selects
            ``TableFormerMode.FAST``.

    Returns:
        tuple: ``(markdown_path, json_path, docx_path, markdown_text, status)``.
        On validation failure or any exception the first four items are None
        and ``status`` carries the message; exceptions are logged, not raised.
    """
    try:
        logger.debug("Processing with input type: %s", input_type)
        logger.debug("File input: %s", file_input)

        # Validate the request first so a rejected request does not pay for
        # building the converter.
        if input_type == "file":
            if file_input is None:
                return None, None, None, None, "Please upload a file"
            source = file_input
            original_name = Path(file_input).name
        elif input_type == "url":
            # Pasted URLs frequently carry stray spaces or a trailing newline.
            url = (url_input or "").strip()
            if not url or not is_valid_url(url):
                return None, None, None, None, "Please enter a valid URL"
            source = url
            original_name = Path(urlparse(url).path).name or "url_document"
        else:
            return None, None, None, None, "Invalid input type"

        # Configure pipeline
        pipeline_options = PdfPipelineOptions(do_table_structure=True)
        pipeline_options.table_structure_options.mode = (
            TableFormerMode.ACCURATE if table_mode else TableFormerMode.FAST
        )

        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )

        # Convert document
        logger.debug("Converting document: %s", source)
        result = converter.convert(source)

        # Get markdown content
        markdown_content = result.document.export_to_markdown()

        # Create output files
        output_files = create_output_files(markdown_content, original_name)

        return (
            output_files['markdown'],
            output_files['json'],
            output_files['docx'],
            markdown_content,
            "Conversion completed successfully! Use the download buttons below to get your files."
        )

    except Exception as e:
        # UI boundary: log the traceback and surface the message in the
        # status area instead of raising.
        logger.exception("Error occurred during conversion")
        return None, None, None, None, f"Error during conversion: {str(e)}\nCheck the console for detailed error logs."
148
+
149
# Header HTML rendered at the top of the page via gr.HTML(title_html).
# Besides the title and two taglines it carries an inline <style> element
# with the rules for the "duplicate-button" class (normal and :hover states).
title_html = """
<div style="text-align: center; max-width: 800px; margin: 0 auto;">
<h1 style="color: #FFD700; font-size: 2.5rem; margin-bottom: 0.5rem;">Professional Document Converter</h1>
<p style="color: #FFA500; font-size: 1.1rem; margin-bottom: 1.5rem;">Convert documents from files or URLs to various formats</p>
<p style="color: #87CEEB; font-size: 0.9rem;">Please like this Space if you find it useful! Your support is appreciated 🙏</p>
</div>
<style>
.duplicate-button {
margin: 0.5em auto 1em;
display: block;
background-color: #FFD700 !important;
color: black !important;
border: none !important;
font-weight: bold !important;
}
.duplicate-button:hover {
background-color: #FFA500 !important;
transform: translateY(-2px);
transition: all 0.2s ease;
}
</style>
"""
172
+
173
# Build the Gradio interface. The css argument is a single rule that hides
# elements matching the "footer" selector; no theme object is passed.
# Components are created in page order inside the nested context managers,
# so the statement order below is significant.
with gr.Blocks(css="footer {display: none}") as demo:
    gr.HTML(title_html)

    # Add duplicate button at the top; elem_classes attaches the
    # "duplicate-button" class styled by the <style> element in title_html.
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_classes="duplicate-button",
    )

    with gr.Row():
        # Left column: inputs and options.
        with gr.Column(scale=1):
            # The two choices are the exact strings process_document
            # compares input_type against.
            input_type = gr.Radio(
                choices=["file", "url"],
                value="file",
                label="Input Type"
            )

            # File input restricted to PDF; type="filepath" requests a path
            # value for the handler.
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".PDF"],
                type="filepath"
            )

            # URL input
            url_input = gr.Textbox(
                label="Or Enter URL",
                placeholder="https://arxiv.org/pdf/2408.09869"
            )

            # Processing options
            # NOTE(review): use_gpu is passed to process_document, which
            # does not read it.
            use_gpu = gr.Checkbox(label="Use GPU", value=True)
            table_mode = gr.Checkbox(label="Use Accurate Table Mode (Slower but better)", value=False)

            convert_btn = gr.Button("Convert Document", variant="primary")

        # Right column: status, preview and downloads.
        with gr.Column(scale=2):
            # Status message
            status_message = gr.Markdown("")

            # Preview area
            preview = gr.Markdown("", label="Preview")

            # Download files
            # NOTE(review): download_group is not referenced after creation.
            with gr.Group() as download_group:
                gr.Markdown("### Download Files")
                with gr.Row():
                    markdown_output = gr.File(label="Download Markdown")
                    json_output = gr.File(label="Download JSON")
                    docx_output = gr.File(label="Download DOCX")

    # Define the main conversion event. The inputs list follows the
    # parameter order of process_document and the outputs list follows the
    # order of its five return values.
    convert_btn.click(
        fn=process_document,
        inputs=[input_type, file_input, url_input, use_gpu, table_mode],
        outputs=[markdown_output, json_output, docx_output, preview, status_message]
    )

    # Author/project links rendered as raw HTML at the bottom of the page.
    footer = """
<div style="text-align: center; margin: 2rem auto; padding: 1rem; border-top: 1px solid #FFD700; max-width: 800px;">
<div style="margin-bottom: 1rem;">
<a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">LinkedIn</a> |
<a href="https://github.com/arad1367" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">GitHub</a> |
<a href="https://arad1367.pythonanywhere.com/" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">PhD Defense Demo</a> |
<a href="https://github.com/DS4SD/docling" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">Docling Project</a>
</div>
<p style="color: #FFA500; margin-top: 0.5rem;">Made with 💖 by Pejman Ebrahimi</p>
</div>
"""
    gr.HTML(footer)
245
+
246
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.queue(max_size=5)  # Enable queue for better handling of multiple requests
    demo.launch(
        show_error=True,
        share=False,
        debug=True,
        show_api=False,
        # "0.0.0.0" makes the server listen on all network interfaces.
        server_name="0.0.0.0"
    )