Falah committed on
Commit
8d6c205
·
verified ·
1 Parent(s): 4635915
Files changed (2) hide show
  1. app.py +295 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from docx import Document
3
+ import datetime
4
+ import json
5
+ import io
6
+ from datasets import DatasetDict, Dataset
7
+ from huggingface_hub import login
8
+
9
+ # Header styling
10
+ st.set_page_config(
11
+ page_title="DOCX to HuggingFace Dataset Converter",
12
+ page_icon="🔄", # Document conversion icon
13
+ layout="wide",
14
+ menu_items={
15
+ 'Get Help': None,
16
+ 'Report a bug': None,
17
+ 'About': None
18
+ },
19
+ initial_sidebar_state="expanded"
20
+ )
21
+
22
+ # Hide Streamlit elements and add custom styling
23
+ st.markdown("""
24
+ <style>
25
+ #MainMenu {visibility: hidden;}
26
+ footer {visibility: hidden;}
27
+ .stDeployButton {display: none;}
28
+
29
+ /* Professional color theme */
30
+ :root {
31
+ --primary-color: #2E4057;
32
+ --secondary-color: #4F7CAC;
33
+ --accent-color: #66A6D1;
34
+ --background-color: #F5F7FA;
35
+ }
36
+
37
+ /* Sidebar styling */
38
+ .css-1d391kg {
39
+ background-color: var(--primary-color);
40
+ }
41
+ .sidebar .sidebar-content {
42
+ background-color: var(--primary-color);
43
+ }
44
+
45
+ /* Logo styling */
46
+ .logo-container {
47
+ display: flex;
48
+ align-items: center;
49
+ padding: 1rem;
50
+ background: var(--primary-color);
51
+ border-radius: 10px;
52
+ margin-bottom: 1rem;
53
+ }
54
+ .logo-text {
55
+ color: white;
56
+ margin-left: 10px;
57
+ font-size: 1.2em;
58
+ font-weight: bold;
59
+ }
60
+ .logo-icon {
61
+ font-size: 2em;
62
+ margin-right: 10px;
63
+ }
64
+
65
+ /* Header styling update */
66
+ .main-header {
67
+ background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
68
+ color: white !important;
69
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
70
+ }
71
+ .main-header h1, .main-header p {
72
+ color: white !important;
73
+ }
74
+ </style>
75
+ """, unsafe_allow_html=True)
76
+
77
+ # Logo in sidebar
78
+ st.sidebar.markdown("""
79
+ <div class='logo-container'>
80
+ <span class='logo-icon'>🔄</span>
81
+ <span class='logo-text'>DOCX2HF</span>
82
+ </div>
83
+ """, unsafe_allow_html=True)
84
+
85
+ # Sidebar content
86
with st.sidebar:
    st.markdown("### ⚙️ Settings")
    hf_token = st.text_input("Enter your Hugging Face Token:", type="password")
    if hf_token:
        # login() raises ValueError when the Hub rejects the token, and HTTP /
        # connection failures surface as OSError subclasses. Left uncaught,
        # either one aborts this script run before repo_name and test_split
        # below are defined, so report the failure and keep rendering.
        # NOTE(review): login() also stores the token in the host's Hugging
        # Face credential cache - confirm that is acceptable on a shared
        # deployment.
        try:
            login(token=hf_token)
        except (ValueError, OSError) as exc:
            st.error(f"Authentication failed: {exc}")
        else:
            st.success("✓ Authenticated")

    st.header("Dataset Settings")
    # Target dataset repository on the Hub, in "username/repository" form.
    repo_name = st.text_input("Repository Name:", "Falah/rag")
    # Percentage of documents routed to the test split (10-50, default 20).
    test_split = st.slider("Test Split %", 10, 50, 20)
96
+
97
+ # Header
98
+ st.markdown("""
99
+ <style>
100
+ .main-header {
101
+ text-align: center;
102
+ padding: 1rem;
103
+ background-color: #f0f2f6;
104
+ border-radius: 0.5rem;
105
+ margin-bottom: 2rem;
106
+ }
107
+ </style>
108
+ <div class="main-header">
109
+ <h1>DOCX to HuggingFace Dataset Converter</h1>
110
+ <p>Convert Word documents to JSONL format and upload directly to HuggingFace</p>
111
+ </div>
112
+ """, unsafe_allow_html=True)
113
+
114
+ # Documentation Section
115
+ st.header("📚 Documentation")
116
+
117
+ tab1, tab2, tab3, tab4 = st.tabs(["Overview", "Features & Use Cases", "User Guide", "Best Practices"])
118
+
119
+ with tab1:
120
+ st.markdown("""
121
+ ### About This Tool
122
+ This application helps researchers and data scientists prepare datasets for Large Language Models (LLM)
123
+ and Natural Language Processing (NLP) tasks by converting DOCX documents into a structured format
124
+ suitable for training and fine-tuning.
125
+
126
+ ### What It Does
127
+ - Converts DOCX files to JSONL format
128
+ - Automatically splits data into train/test sets
129
+ - Uploads directly to HuggingFace Hub
130
+ - Handles multiple documents simultaneously
131
+ """)
132
+
133
+ with tab2:
134
+ col1, col2 = st.columns(2)
135
+
136
+ with col1:
137
+ st.markdown("""
138
+ ### 🔑 Key Features
139
+ - **Automatic Train-Test Split**
140
+ - Configurable ratio
141
+ - Balanced distribution
142
+
143
+ - **HuggingFace Integration**
144
+ - Direct upload to Hub
145
+ - Seamless with transformers
146
+
147
+ - **Batch Processing**
148
+ - Multiple files at once
149
+ - Efficient processing
150
+
151
+ - **Structured Output**
152
+ - Consistent JSONL format
153
+ - Clean data structure
154
+ """)
155
+
156
+ with col2:
157
+ st.markdown("""
158
+ ### 🎯 Use Cases
159
+
160
+ **RAG Systems**
161
+ - Knowledge base creation
162
+ - QA model context prep
163
+ - Document retrieval datasets
164
+
165
+ **LLM Fine-tuning**
166
+ - Custom training data
167
+ - Domain-specific datasets
168
+ - Evaluation sets
169
+ """)
170
+
171
+ with tab3:
172
+ st.markdown("""
173
+ ### 📝 Step-by-Step Guide
174
+
175
+ **1. Authentication**
176
+ - Get your token from [HuggingFace Settings](https://huggingface.co/settings/tokens)
177
+ - Enter it in the sidebar
178
+
179
+ **2. Configuration**
180
+ - Set repository name (username/repository)
181
+ - Adjust test split percentage
182
+
183
+ **3. Upload & Process**
184
+ - Select DOCX files
185
+ - Click Generate & Upload
186
+ - Monitor progress
187
+
188
+ **4. Dataset Format**
189
+ ```json
190
+ {
191
+ "section_title": "RAG Post",
192
+ "content": "Your document content here"
193
+ }
194
+ ```
195
+ """)
196
+
197
+ with tab4:
198
+ st.markdown("""
199
+ ### 💡 Best Practices
200
+
201
+ **Document Preparation**
202
+ - Use well-formatted DOCX files
203
+ - Ensure clean, consistent formatting
204
+ - Remove unnecessary headers/footers
205
+
206
+ **Content Quality**
207
+ - Verify content relevance for NLP/LLM tasks
208
+ - Check for proper text encoding
209
+ - Remove sensitive information
210
+
211
+ **Dataset Configuration**
212
+ - Choose appropriate train/test split
213
+ - Use descriptive repository names
214
+ - Monitor dataset size and balance
215
+
216
+ **Processing**
217
+ - Start with small test batches
218
+ - Verify output format
219
+ - Check HuggingFace space usage
220
+ """)
221
+
222
+ st.markdown("---")
223
+
224
+ # Streamlit UI for file upload
225
+ st.title("Upload DOCX Files to Hugging Face Dataset")
226
+ st.subheader("Convert DOCX to JSONL and Upload to Hugging Face")
227
+
228
+ # Function to extract text from DOCX
229
def extract_text_from_docx(docx_file):
    """Return the non-blank paragraph text of a DOCX file, newline-joined.

    Args:
        docx_file: A binary file-like object whose ``read()`` yields the raw
            bytes of a DOCX document.

    Returns:
        A single string holding each paragraph whose text is not purely
        whitespace, separated by ``"\\n"``. Only ``doc.paragraphs`` is
        consulted.
    """
    raw_bytes = docx_file.read()
    parsed = Document(io.BytesIO(raw_bytes))

    kept_lines = []
    for paragraph in parsed.paragraphs:
        # Skip paragraphs that are empty or whitespace-only.
        if paragraph.text.strip():
            kept_lines.append(paragraph.text)
    return "\n".join(kept_lines)
232
+
233
+ # Function to create JSONL from uploaded files
234
def create_jsonl_data(file_objects, test_percent=None):
    """Build train/test record lists from uploaded DOCX files.

    Every supported file becomes one record of the form
    ``{"section_title": "RAG Post", "content": <extracted text>}``. Files
    whose MIME type is not DOCX are reported with ``st.warning`` and skipped.

    Args:
        file_objects: Iterable of uploaded-file objects exposing a ``type``
            attribute (MIME type) and usable by ``extract_text_from_docx``.
        test_percent: Percentage of file positions routed to the test split.
            When ``None`` (the default) the module-level ``test_split``
            slider value is used.

    Returns:
        A ``(train_data, test_data)`` tuple of lists of record dicts.
    """
    docx_mime = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    percent = test_split if test_percent is None else test_percent

    train_data, test_data = [], []
    for i, uploaded_file in enumerate(file_objects):
        if uploaded_file.type != docx_mime:
            st.warning(f"File type {uploaded_file.type} not supported!")
            continue

        content = extract_text_from_docx(uploaded_file)
        document = {"section_title": "RAG Post", "content": content}

        # Exact integer split: out of every 100 consecutive positions exactly
        # `percent` land in the test set, spread evenly and starting with
        # index 0. The previous float test `i % (100 / percent) == 0` only
        # worked when `percent` divides 100; at 30, for example, 100 / 30 is
        # not an integer and essentially only index 0 ever matched.
        if (i * percent) % 100 < percent:
            test_data.append(document)
        else:
            train_data.append(document)
    return train_data, test_data
247
+
248
+ # Upload section
249
uploaded_files = st.file_uploader("Choose DOCX files", accept_multiple_files=True, type=["docx"])
if uploaded_files:
    st.write(f"Files uploaded: {[file.name for file in uploaded_files]}")

    if st.button("Generate JSONL and Upload"):
        train_data, test_data = create_jsonl_data(uploaded_files)

        # Build a Dataset only for splits that actually received records.
        # Dataset.from_list([]) yields a dataset with no columns, so an empty
        # split (e.g. "train" when a single file is uploaded) would not share
        # the populated split's features and the push is expected to be
        # rejected as a mismatched DatasetDict.
        populated_splits = {
            split_name: Dataset.from_list(records)
            for split_name, records in (("train", train_data), ("test", test_data))
            if records
        }

        if populated_splits:
            dataset_dict = DatasetDict(populated_splits)
            dataset_dict.push_to_hub(repo_name)
            if len(populated_splits) == 2:
                st.success("Dataset uploaded to Hugging Face successfully with train and test splits!")
            else:
                # Be explicit when only one split could be produced.
                only_split = next(iter(populated_splits))
                st.success(
                    f"Dataset uploaded to Hugging Face successfully with only a {only_split} split."
                )
        else:
            st.error("No valid files to process.")
264
+
265
+ # Footer
266
+ st.markdown("---")
267
+
268
+ # Social Links
269
+ st.markdown("""
270
+ <div style='text-align: center; margin-bottom: 1rem;'>
271
+ <a href='https://x.com/FalahGatea' target='_blank' style='text-decoration: none; color: #1DA1F2; margin: 0 10px;'>Twitter</a>
272
+ <a href='https://www.linkedin.com/in/falah-gatea-060a211a7/' target='_blank' style='text-decoration: none; color: #0077B5; margin: 0 10px;'>LinkedIn</a>
273
+ <a href='https://github.com/falahgs' target='_blank' style='text-decoration: none; color: #333; margin: 0 10px;'>GitHub</a>
274
+ <a href='https://www.instagram.com/falah.g.saleih/' target='_blank' style='text-decoration: none; color: #E4405F; margin: 0 10px;'>Instagram</a>
275
+ <a href='https://www.facebook.com/falahgs' target='_blank' style='text-decoration: none; color: #1877F2; margin: 0 10px;'>Facebook</a>
276
+ <a href='https://iraqprogrammer.wordpress.com/' target='_blank' style='text-decoration: none; color: #21759B; margin: 0 10px;'>Blog</a>
277
+ </div>
278
+ <div style='text-align: center; margin-bottom: 1rem;'>
279
+ <a href='https://medium.com/@falahgs' target='_blank' style='text-decoration: none; color: #000000; margin: 0 10px;'>Medium</a>
280
+ <a href='https://pypi.org/user/falahgs/' target='_blank' style='text-decoration: none; color: #3775A9; margin: 0 10px;'>PyPI</a>
281
+ <a href='https://www.youtube.com/@FalahgsGate' target='_blank' style='text-decoration: none; color: #FF0000; margin: 0 10px;'>YouTube</a>
282
+ <a href='https://www.amazon.com/stores/Falah-Gatea-Salieh/author/B0BYHXLP7R' target='_blank' style='text-decoration: none; color: #FF9900; margin: 0 10px;'>Amazon</a>
283
+ <a href='https://huggingface.co/Falah' target='_blank' style='text-decoration: none; color: #FFD21E; margin: 0 10px;'>HuggingFace</a>
284
+ <a href='https://www.kaggle.com/falahgatea' target='_blank' style='text-decoration: none; color: #20BEFF; margin: 0 10px;'>Kaggle</a>
285
+ <a href='https://civitai.com/user/falahgs' target='_blank' style='text-decoration: none; color: #4A90E2; margin: 0 10px;'>CivitAI</a>
286
+ </div>
287
+ """, unsafe_allow_html=True)
288
+
289
+ # Copyright
290
+ st.markdown(f"""
291
+ <div style='text-align: center; color: grey; padding: 1rem;'>
292
+ <p>© {datetime.datetime.now().year} DOCX to HuggingFace Dataset Converter v1.0.0</p>
293
+ <p>Copyright © Falah.G.Salieh 2025</p>
294
+ </div>
295
+ """, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit==1.29.0
2
+ python-docx==1.0.1
3
+ datasets==2.15.0
4
+ huggingface-hub==0.19.4