victormiller committed on
Commit
8061116
1 Parent(s): ec2b3ce

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +162 -0
main.py CHANGED
@@ -175,6 +175,167 @@ def main():
175
  )
176
 
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  dataset_comparison1 = pd.DataFrame(
179
  {
180
  "Dataset": [
@@ -474,6 +635,7 @@ def intro():
474
  H3(
475
  "TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
476
  ),
 
477
  table_div_1,
478
  table_div_2,
479
  P(
 
175
  )
176
 
177
 
178
+ new_dataset_comparison1 = pd.DataFrame(
179
+ {
180
+ "Data Source": [
181
+ "CommonCrawl",
182
+ "Papers",
183
+ "Wikipedia",
184
+ "FreeLaw",
185
+ "DM Math",
186
+ "USPTO",
187
+ "PG-19",
188
+ "HackerNews",
189
+ "Ubuntu IRC",
190
+ "EuroParl",
191
+ "StackExchange",
192
+ "Code",
193
+
194
+ ],
195
+ "TxT360": [
196
+ "99 Snapshots",
197
+ "5 Sources",
198
+ "310+ Languages",
199
+ "Included",
200
+ "Included",
201
+ "Included",
202
+ "Included",
203
+ "Included",
204
+ "Included",
205
+ "Included",
206
+ "Included",
207
+ "**",
208
+ ],
209
+ "FineWeb": [
210
+ "96 Snapshots",
211
+ "-",
212
+ "-",
213
+ "-",
214
+ "-",
215
+ "-",
216
+ "-",
217
+ "-",
218
+ "-",
219
+ "-",
220
+ "-",
221
+ "-",
222
+ ],
223
+ "RefinedWeb": [
224
+ "90 Snapshots",
225
+ "-",
226
+ "-",
227
+ "-",
228
+ "-",
229
+ "-",
230
+ "-",
231
+ "-",
232
+ "-",
233
+ "-",
234
+ "-",
235
+ "-",
236
+ ],
237
+ "PedPajama-V-2": [
238
+ "84 Snapshots",
239
+ "-",
240
+ "-",
241
+ "-",
242
+ "-",
243
+ "-",
244
+ "-",
245
+ "-",
246
+ "-",
247
+ "-",
248
+ "-",
249
+ "-",
250
+ ],
251
+ "C4": [
252
+ "1 Snapshots",
253
+ "-",
254
+ "-",
255
+ "-",
256
+ "-",
257
+ "-",
258
+ "-",
259
+ "-",
260
+ "-",
261
+ "-",
262
+ "-",
263
+ "-",
264
+ ],
265
+ "Dolma": [
266
+ "24 Snapshots",
267
+ "1 Source",
268
+ "checkmark",
269
+ "-",
270
+ "-",
271
+ "-",
272
+ "Included",
273
+ "-",
274
+ "-",
275
+ "-",
276
+ "-",
277
+ "Included",
278
+ ],
279
+ "RedPajama-V-1": [
280
+ "5 Snapshots",
281
+ "1 Source",
282
+ "checkmark",
283
+ "",
284
+ " ",
285
+ "",
286
+ "Included",
287
+ "-",
288
+ "-",
289
+ "-",
290
+ "Included",
291
+ "Included",
292
+ ],
293
+ "The Pile": [
294
+ "0.6% of 74 Snapshots",
295
+ "4 Sources",
296
+ "English Only",
297
+ "Included",
298
+ "Included",
299
+ "Included",
300
+ "Included",
301
+ "Included",
302
+ "Included",
303
+ "Included",
304
+ "Included",
305
+ "Included",
306
+ ],
307
+ }
308
+ )
309
+
310
+
311
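+ # Apply table styling: light green for the first data row (row 0; the header itself is not styled here), then white for odd rows and a light blue-grey, rgb(237, 242, 251), for the remaining even rows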
312
+ styled_table = (
313
+ new_dataset_comparison1.style.set_properties(
314
+ **{"background-color": "#E1EEDB"},
315
+ subset=pd.IndexSlice[0, :], # Row 0 with a light green background
316
+ )
317
+ .apply(
318
+ lambda x: [
319
+ "background-color: #E1EEDB"
320
+ if i == 0
321
+ else (
322
+ "background-color: rgb(237, 242, 251)"
323
+ if i % 2 == 0
324
+ else "background-color: white"
325
+ )
326
+ for i in range(len(x))
327
+ ],
328
+ axis=0,
329
+ )
330
+ .hide(axis="index")
331
+ ) # Hide the row index
332
+
333
+ # Use _repr_html_() method to get the HTML representation of the styled DataFrame
334
+ table_html = styled_table._repr_html_()
335
+ # table_html = dataset_comparison1.to_html(index=False, border=0)
336
+ new_table_div_1 = Div(NotStr(table_html), style="margin: 40px;")
337
+
338
+
339
  dataset_comparison1 = pd.DataFrame(
340
  {
341
  "Dataset": [
 
635
  H3(
636
  "TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
637
  ),
638
+ new_table_div_1,
639
  table_div_1,
640
  table_div_2,
641
  P(