arad1367 committed on
Commit
3e041fc
1 Parent(s): 7646e21

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +233 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ import json
4
+ import os
5
+ from pathlib import Path
6
+ import logging
7
+ from docling.document_converter import DocumentConverter
8
+ from docling.datamodel.base_models import InputFormat, DocumentStream
9
+ from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
10
+ from docling.document_converter import PdfFormatOption
11
+ import requests
12
+ from urllib.parse import urlparse
13
+ from datetime import datetime
14
+ import tempfile
15
+ from docx import Document
16
+ from docx.shared import Inches
17
+ import markdown
18
+
19
+ # Set up logging
20
+ logging.basicConfig(level=logging.DEBUG)
21
+ logger = logging.getLogger(__name__)
22
+
23
def is_valid_url(url):
    """Return True when *url* has both a scheme and a network location.

    Args:
        url: Candidate URL, normally the raw text typed into the URL box.

    Returns:
        bool: True if ``urlparse`` yields a non-empty scheme and netloc,
        False otherwise, including when the value cannot be parsed at all.
    """
    try:
        parsed = urlparse(url)
    except (ValueError, AttributeError, TypeError):
        # ValueError: malformed input such as an unbalanced IPv6 bracket.
        # AttributeError/TypeError: a non-string value was passed in.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
        return False
    return bool(parsed.scheme and parsed.netloc)
29
+
30
def markdown_to_docx(markdown_content):
    """Convert markdown text into a python-docx ``Document``.

    The conversion is line based. Recognised constructs are:

    * ATX headings ``#`` .. ``######`` followed by a space -> heading of the
      matching level,
    * bullet items starting with ``* `` or ``- `` -> ``List Bullet`` paragraph,
    * numbered items starting with ``<digits>. `` (any number, not only
      ``1.``) -> ``List Number`` paragraph,
    * any other non-blank line -> plain paragraph,
    * blank line -> empty paragraph.

    Inline markup (bold, links, tables, ...) is copied through verbatim.

    Args:
        markdown_content: Markdown source as a single string.

    Returns:
        The populated ``Document`` object (not yet saved to disk).
    """
    import re  # local import: keeps the block usable without touching file imports

    heading_re = re.compile(r'(#{1,6}) (.*)')
    numbered_re = re.compile(r'\d+\. (.*)')

    doc = Document()

    for line in markdown_content.split('\n'):
        heading = heading_re.match(line)
        if heading:
            # Heading level is the number of leading '#' characters.
            doc.add_heading(heading.group(2), level=len(heading.group(1)))
            continue

        if line.startswith(('* ', '- ')):
            doc.add_paragraph(line[2:], style='List Bullet')
            continue

        numbered = numbered_re.match(line)
        if numbered:
            # Drop the "<n>. " marker; Word numbers the list itself.
            doc.add_paragraph(numbered.group(1), style='List Number')
            continue

        if line.strip():
            doc.add_paragraph(line)
        else:
            # Preserve vertical spacing from blank markdown lines.
            doc.add_paragraph()

    return doc
58
+
59
def create_output_files(content, original_name):
    """Write the converted content to temporary .md, .json and .docx files.

    The files are created with ``delete=False``, so they stay on disk after
    this function returns; nothing here removes them.

    Args:
        content: Markdown text produced by the conversion.
        original_name: Name of the source document; stored as ``title`` in the
            JSON output.

    Returns:
        dict: Paths keyed by ``'markdown'``, ``'json'`` and ``'docx'``.
    """
    def _reserve_path(suffix):
        # Create a uniquely named temp file and close its handle immediately,
        # so the path can be reopened below without leaking a descriptor.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
            return handle.name

    files = {}

    # Markdown: the content as-is.
    md_path = _reserve_path('.md')
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(content)
    files['markdown'] = md_path

    # JSON: the content wrapped with a title and the conversion time.
    json_content = {
        "title": original_name,
        "content": content,
        "metadata": {
            "conversion_date": datetime.now().isoformat()
        }
    }
    json_path = _reserve_path('.json')
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(json_content, f, ensure_ascii=False, indent=2)
    files['json'] = json_path

    # DOCX: built from the markdown by markdown_to_docx.
    docx_path = _reserve_path('.docx')
    markdown_to_docx(content).save(docx_path)
    files['docx'] = docx_path

    return files
93
+
94
@spaces.GPU()
def process_document(input_type, file_input, url_input, use_gpu, table_mode):
    """Convert an uploaded file or a URL and return the generated outputs.

    Args:
        input_type: ``"file"`` or ``"url"``; any other value is rejected.
        file_input: Path of the uploaded file (used when input_type is "file").
        url_input: URL of the document (used when input_type is "url").
        use_gpu: Accepted from the UI but not read anywhere in this function.
        table_mode: Truthy selects ``TableFormerMode.ACCURATE``, otherwise
            ``TableFormerMode.FAST``.

    Returns:
        A 5-tuple ``(markdown_path, json_path, docx_path, markdown_text,
        status_message)``. When validation or conversion fails, the first four
        items are ``None`` and the last one carries the message.
    """
    def _failure(message):
        # Same shape as the success tuple, with every output slot empty.
        return None, None, None, None, message

    try:
        logger.debug(f"Processing with input type: {input_type}")
        logger.debug(f"File input: {file_input}")

        # Build the PDF pipeline with table structure recognition enabled.
        pdf_options = PdfPipelineOptions(do_table_structure=True)
        pdf_options.table_structure_options.mode = (
            TableFormerMode.ACCURATE if table_mode else TableFormerMode.FAST
        )
        pdf_format = PdfFormatOption(pipeline_options=pdf_options)
        converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

        # Resolve the conversion source and a display name for it.
        if input_type == "file":
            if file_input is None:
                return _failure("Please upload a file")
            source, original_name = file_input, Path(file_input).name
        elif input_type == "url":
            if not (url_input and is_valid_url(url_input)):
                return _failure("Please enter a valid URL")
            source = url_input
            # Fall back to a fixed name when the URL path has no final segment.
            original_name = Path(urlparse(url_input).path).name or "url_document"
        else:
            return _failure("Invalid input type")

        logger.debug(f"Converting document: {source}")
        conversion = converter.convert(source)
        markdown_content = conversion.document.export_to_markdown()

        paths = create_output_files(markdown_content, original_name)
        success_note = "Conversion completed successfully! Use the download buttons below to get your files."
        return paths['markdown'], paths['json'], paths['docx'], markdown_content, success_note

    except Exception as e:
        # UI boundary: log the traceback and show the error in the status area.
        logger.exception("Error occurred during conversion")
        return _failure(f"Error during conversion: {str(e)}\nCheck the console for detailed error logs.")
148
+
149
+ # Create title HTML with custom style
150
# HTML banner shown at the top of the page.
title_html = """
<div style="text-align: center; max-width: 800px; margin: 0 auto;">
<h1 style="color: #2C3E50; font-size: 2.5rem; margin-bottom: 0.5rem;">Professional Document Converter</h1>
<p style="color: #34495E; font-size: 1.1rem; margin-bottom: 1.5rem;">Convert documents from files or URLs to various formats</p>
</div>
"""

# HTML footer with author and project links, rendered at the bottom of the page.
footer = """
<div style="text-align: center; margin: 2rem auto; padding: 1rem; border-top: 1px solid #ddd; max-width: 800px;">
<div style="margin-bottom: 1rem;">
<a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank" style="text-decoration: none; color: #2C3E50; margin: 0 10px;">LinkedIn</a> |
<a href="https://github.com/arad1367" target="_blank" style="text-decoration: none; color: #2C3E50; margin: 0 10px;">GitHub</a> |
<a href="https://arad1367.pythonanywhere.com/" target="_blank" style="text-decoration: none; color: #2C3E50; margin: 0 10px;">PhD Defense Demo</a> |
<a href="https://github.com/DS4SD/docling" target="_blank" style="text-decoration: none; color: #2C3E50; margin: 0 10px;">Docling Project</a>
</div>
<p style="color: #7F8C8D; margin-top: 0.5rem;">Made with 💖 by Pejman Ebrahimi</p>
</div>
"""

# Gradio layout: inputs in the left column, status/preview/downloads on the right.
with gr.Blocks(css="footer {display: none}") as demo:
    gr.HTML(title_html)

    with gr.Row():
        # Left column: where the document comes from and how to process it.
        with gr.Column(scale=1):
            input_type = gr.Radio(choices=["file", "url"], value="file", label="Input Type")
            file_input = gr.File(label="Upload Document", file_types=[".pdf", ".PDF"], type="filepath")
            url_input = gr.Textbox(label="Or Enter URL", placeholder="https://arxiv.org/pdf/2408.09869")

            use_gpu = gr.Checkbox(label="Use GPU", value=True)
            table_mode = gr.Checkbox(label="Use Accurate Table Mode (Slower but better)", value=False)

            convert_btn = gr.Button("Convert Document", variant="primary")

        # Right column: status line, markdown preview and download slots.
        with gr.Column(scale=2):
            status_message = gr.Markdown("")
            preview = gr.Markdown("", label="Preview")

            with gr.Group() as download_group:
                gr.Markdown("### Download Files")
                with gr.Row():
                    markdown_output = gr.File(label="Download Markdown")
                    json_output = gr.File(label="Download JSON")
                    docx_output = gr.File(label="Download DOCX")

    # Output order must match the 5-tuple returned by process_document.
    convert_btn.click(
        fn=process_document,
        inputs=[input_type, file_input, url_input, use_gpu, table_mode],
        outputs=[markdown_output, json_output, docx_output, preview, status_message],
    )

    gr.HTML(footer)
223
+
224
+ # Launch the app
225
if __name__ == "__main__":
    # Enable the request queue (max_size=5) before starting the server.
    demo.queue(max_size=5)
    # Listen on all interfaces; no share link, no API page, errors shown in the UI.
    demo.launch(server_name="0.0.0.0", share=False, show_api=False, show_error=True, debug=True)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ docling
2
+ gradio
3
+ --extra-index-url https://download.pytorch.org/whl/cu118
4
+ torch
5
+ pytesseract
6
+ python-docx
7
+ markdown
8
+ requests