CarisMu committed on
Commit
12b422d
1 Parent(s): 02f8831

change background to lighter color

Browse files

change pink to lighter one
add margin-bottom

Files changed (1) hide show
  1. web.py +55 -10
web.py CHANGED
@@ -240,6 +240,7 @@ def web_data():
240
  border: 1px solid #c3e6cb; /* Green border */
241
  border-radius: 5px;
242
  padding: 15px 15px 0px 15px;
 
243
  """,
244
  ),
245
  H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
@@ -301,6 +302,7 @@ def web_data():
301
  padding: 15px;
302
  # border: 1px solid #949494; /* Grey border */
303
  border-radius: 12px;
 
304
  """, #https://colors.muz.li/palette/d3d3d3/949494/d3d3d3/d3d3d3/949494
305
  ),
306
  #DV2("data/sample_wet.json", "data/sample_warc.json", 3),
@@ -316,9 +318,10 @@ def web_data():
316
  Summary("Non-English Documents"),
317
  DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
318
  style="""
319
- background-color: #FFC0CB; /* Light pink background */
320
  padding: 15px;
321
  border-radius: 12px;
 
322
  """,
323
  ),
324
 
@@ -331,6 +334,7 @@ def web_data():
331
  background-color: #EAFFF1; /* Light green background */
332
  padding: 15px;
333
  border-radius: 12px;
 
334
  """,
335
  ),
336
 
@@ -350,9 +354,10 @@ def web_data():
350
  Summary("24 URL domains with more than 4k matches"),
351
  DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
352
  style="""
353
- background-color: #FFC0CB; /* Light pink background */
354
  padding: 15px;
355
  border-radius: 12px;
 
356
  """,
357
  ),
358
 
@@ -363,9 +368,10 @@ def web_data():
363
  Summary("6 url domains that are removed from the blocklist"),
364
  DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
365
  style="""
366
- background-color: #FFC0CB; /* Light pink background */
367
  padding: 15px;
368
  border-radius: 12px;
 
369
  """,
370
  ),
371
 
@@ -377,9 +383,10 @@ def web_data():
377
  "Sample documents whose urls are blocked by the refined url blocklist",
378
  ),
379
  style="""
380
- background-color: #FFC0CB; /* Light pink background */
381
  padding: 15px;
382
  border-radius: 12px;
 
383
  """,
384
  ),
385
 
@@ -395,9 +402,10 @@ def web_data():
395
  "curated url domains that are excluded from our dataset",
396
  ),
397
  style="""
398
- background-color: #FFC0CB; /* Light pink background */
399
  padding: 15px;
400
  border-radius: 12px;
 
401
  """,
402
  ),
403
 
@@ -408,6 +416,7 @@ def web_data():
408
  background-color: #EAFFF1; /* Light green background */
409
  padding: 15px;
410
  border-radius: 12px;
 
411
  """,
412
  ),
413
 
@@ -438,9 +447,10 @@ def web_data():
438
  "Sample documents with lines that are removed by the rule of terminal punctuation",
439
  ),
440
  style="""
441
- background-color: #FFC0CB; /* Light pink background */
442
  padding: 15px;
443
  border-radius: 12px;
 
444
  """,
445
  ),
446
 
@@ -464,9 +474,10 @@ def web_data():
464
  "Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
465
  ),
466
  style="""
467
- background-color: #FFC0CB; /* Light pink background */
468
  padding: 15px;
469
  border-radius: 12px;
 
470
  """,
471
  ),
472
  H3("2.2 Other Rules from RefinedWeb"),
@@ -487,9 +498,10 @@ def web_data():
487
  "Sample documents with lines that are removed by the RefinedWeb rules",
488
  ),
489
  style="""
490
- background-color: #FFC0CB; /* Light pink background */
491
  padding: 15px;
492
  border-radius: 12px;
 
493
  """,
494
  ),
495
  H3("2.3 Toxic Lines"),
@@ -507,9 +519,10 @@ def web_data():
507
  "Sample documents with toxic lines",
508
  ),
509
  style="""
510
- background-color: #FFC0CB; /* Light pink background */
511
  padding: 15px;
512
  border-radius: 12px;
 
513
  """,
514
  ),
515
 
@@ -527,6 +540,7 @@ def web_data():
527
  background-color: #EAFFF1; /* Light green background */
528
  padding: 15px;
529
  border-radius: 12px;
 
530
  """,
531
  ),
532
  P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
@@ -570,6 +584,7 @@ def web_data():
570
  background-color: #FFFAEA; /* Light yellow background */
571
  padding: 15px;
572
  border-radius: 12px;
 
573
  """,
574
  ),
575
  Details(
@@ -609,6 +624,7 @@ def web_data():
609
  background-color: #FFFAEA; /* Light yellow background */
610
  padding: 15px;
611
  border-radius: 12px;
 
612
  """,
613
  ),
614
  P("""
@@ -655,6 +671,7 @@ def web_data():
655
  background-color: #EAFFF1; /* Light green background */
656
  padding: 15px;
657
  border-radius: 12px;
 
658
  """,
659
  ),
660
  Details(
@@ -668,6 +685,7 @@ def web_data():
668
  background-color: #EAFFF1; /* Light green background */
669
  padding: 15px;
670
  border-radius: 12px;
 
671
  """,
672
  ),
673
  H3("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
@@ -696,6 +714,7 @@ def web_data():
696
  background-color: #FFFAEA; /* Light yellow background */
697
  padding: 15px;
698
  border-radius: 12px;
 
699
  """,
700
  ),
701
  Details(
@@ -739,6 +758,7 @@ def web_data():
739
  background-color: #FFFAEA; /* Light yellow background */
740
  padding: 15px;
741
  border-radius: 12px;
 
742
  """,
743
  ),
744
 
@@ -767,6 +787,7 @@ def web_data():
767
  background-color: #FFFAEA; /* Light yellow background */
768
  padding: 15px;
769
  border-radius: 12px;
 
770
  """,
771
  ),
772
  P("""
@@ -799,6 +820,7 @@ def web_data():
799
  background-color: #EAFFF1; /* Light green background */
800
  padding: 15px;
801
  border-radius: 12px;
 
802
  """,
803
  ),
804
  Details(
@@ -812,6 +834,7 @@ def web_data():
812
  background-color: #EAFFF1; /* Light green background */
813
  padding: 15px;
814
  border-radius: 12px;
 
815
  """,
816
  ),
817
  H3("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
@@ -843,6 +866,7 @@ def web_data():
843
  background-color: #FFFAEA; /* Light yellow background */
844
  padding: 15px;
845
  border-radius: 12px;
 
846
  """,
847
  ),
848
  Details(
@@ -901,6 +925,7 @@ def web_data():
901
  background-color: #FFFAEA; /* Light yellow background */
902
  padding: 15px;
903
  border-radius: 12px;
 
904
  """,
905
  ),
906
 
@@ -931,6 +956,7 @@ def web_data():
931
  background-color: #FFFAEA; /* Light yellow background */
932
  padding: 15px;
933
  border-radius: 12px;
 
934
  """,
935
  ),
936
  P("""
@@ -989,6 +1015,7 @@ def web_data():
989
  background-color: #EAFFF1; /* Light green background */
990
  padding: 15px;
991
  border-radius: 12px;
 
992
  """,
993
  ),
994
  Details(
@@ -1008,6 +1035,7 @@ def web_data():
1008
  background-color: #EAFFF1; /* Light green background */
1009
  padding: 15px;
1010
  border-radius: 12px;
 
1011
  """,
1012
  ),
1013
  H5(
@@ -1024,6 +1052,7 @@ def web_data():
1024
  background-color: #EAFFF1; /* Light green background */
1025
  padding: 15px;
1026
  border-radius: 12px;
 
1027
  """,
1028
  ),
1029
  H3("3.2 Line-wise Heuristics"),
@@ -1055,6 +1084,7 @@ def web_data():
1055
  background-color: #FFFAEA; /* Light yellow background */
1056
  padding: 15px;
1057
  border-radius: 12px;
 
1058
  """,
1059
  ),
1060
  Details(
@@ -1104,6 +1134,7 @@ def web_data():
1104
  background-color: #FFFAEA; /* Light yellow background */
1105
  padding: 15px;
1106
  border-radius: 12px;
 
1107
  """,
1108
  ),
1109
 
@@ -1119,6 +1150,7 @@ def web_data():
1119
  background-color: #EAFFF1; /* Light green background */
1120
  padding: 15px;
1121
  border-radius: 12px;
 
1122
  """,
1123
  ),
1124
 
@@ -1184,6 +1216,7 @@ def web_data():
1184
  background-color: #FFFAEA; /* Light yellow background */
1185
  padding: 15px;
1186
  border-radius: 12px;
 
1187
  """,
1188
  ),
1189
 
@@ -1200,6 +1233,7 @@ def web_data():
1200
  background-color: #FFFAEA; /* Light yellow background */
1201
  padding: 15px;
1202
  border-radius: 12px;
 
1203
  """,
1204
  ),
1205
  P("""
@@ -1249,6 +1283,7 @@ def web_data():
1249
  background-color: #FFFAEA; /* Light yellow background */
1250
  padding: 15px;
1251
  border-radius: 12px;
 
1252
  """,
1253
  ),
1254
  P("""
@@ -1270,6 +1305,7 @@ def web_data():
1270
  background-color: #EAFFF1; /* Light green background */
1271
  padding: 15px;
1272
  border-radius: 12px;
 
1273
  """,
1274
  ),
1275
 
@@ -1291,6 +1327,7 @@ def web_data():
1291
  background-color: #FFFAEA; /* Light yellow background */
1292
  padding: 15px;
1293
  border-radius: 12px;
 
1294
  """,
1295
  ),
1296
  Details(
@@ -1322,6 +1359,7 @@ def web_data():
1322
  background-color: #FFFAEA; /* Light yellow background */
1323
  padding: 15px;
1324
  border-radius: 12px;
 
1325
  """,
1326
  ),
1327
 
@@ -1337,6 +1375,7 @@ def web_data():
1337
  background-color: #FFFAEA; /* Light yellow background */
1338
  padding: 15px;
1339
  border-radius: 12px;
 
1340
  """,
1341
  ),
1342
  Details(
@@ -1352,6 +1391,7 @@ def web_data():
1352
  background-color: #EAFFF1; /* Light green background */
1353
  padding: 15px;
1354
  border-radius: 12px;
 
1355
  """,
1356
  ),
1357
 
@@ -1367,6 +1407,7 @@ def web_data():
1367
  background-color: #FFFAEA; /* Light yellow background */
1368
  padding: 15px;
1369
  border-radius: 12px;
 
1370
  """,
1371
  ),
1372
  Details(
@@ -1396,6 +1437,7 @@ def web_data():
1396
  background-color: #FFFAEA; /* Light yellow background */
1397
  padding: 15px;
1398
  border-radius: 12px;
 
1399
  """,
1400
  ),
1401
  Details(
@@ -1412,6 +1454,7 @@ def web_data():
1412
  background-color: #FFFAEA; /* Light yellow background */
1413
  padding: 15px;
1414
  border-radius: 12px;
 
1415
  """,
1416
  ),
1417
  P("""
@@ -1443,6 +1486,7 @@ def web_data():
1443
  background-color: #EAFFF1; /* Light green background */
1444
  padding: 15px;
1445
  border-radius: 12px;
 
1446
  """,
1447
  ),
1448
  H3("3.4 Others"),
@@ -1455,9 +1499,10 @@ def web_data():
1455
  Summary("Sample documents containing 'lorem ipsum'"),
1456
  DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
1457
  style="""
1458
- background-color: #FFC0CB; /* Light pink background */
1459
  padding: 15px;
1460
  border-radius: 12px;
 
1461
  """,
1462
  ),
1463
  H2("4. Deduplication"),
 
240
  border: 1px solid #c3e6cb; /* Green border */
241
  border-radius: 5px;
242
  padding: 15px 15px 0px 15px;
243
+ margin-bottom: 15px;
244
  """,
245
  ),
246
  H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
 
302
  padding: 15px;
303
  # border: 1px solid #949494; /* Grey border */
304
  border-radius: 12px;
305
+ margin-bottom: 15px;
306
  """, #https://colors.muz.li/palette/d3d3d3/949494/d3d3d3/d3d3d3/949494
307
  ),
308
  #DV2("data/sample_wet.json", "data/sample_warc.json", 3),
 
318
  Summary("Non-English Documents"),
319
  DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
320
  style="""
321
+ background-color: #FAEAEA; /* Light pink background */
322
  padding: 15px;
323
  border-radius: 12px;
324
+ margin-bottom: 15px;
325
  """,
326
  ),
327
 
 
334
  background-color: #EAFFF1; /* Light green background */
335
  padding: 15px;
336
  border-radius: 12px;
337
+ margin-bottom: 15px;
338
  """,
339
  ),
340
 
 
354
  Summary("24 URL domains with more than 4k matches"),
355
  DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
356
  style="""
357
+ background-color: #FAEAEA; /* Light pink background */
358
  padding: 15px;
359
  border-radius: 12px;
360
+ margin-bottom: 15px;
361
  """,
362
  ),
363
 
 
368
  Summary("6 url domains that are removed from the blocklist"),
369
  DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
370
  style="""
371
+ background-color: #FAEAEA; /* Light pink background */
372
  padding: 15px;
373
  border-radius: 12px;
374
+ margin-bottom: 15px;
375
  """,
376
  ),
377
 
 
383
  "Sample documents whose urls are blocked by the refined url blocklist",
384
  ),
385
  style="""
386
+ background-color: #FAEAEA; /* Light pink background */
387
  padding: 15px;
388
  border-radius: 12px;
389
+ margin-bottom: 15px;
390
  """,
391
  ),
392
 
 
402
  "curated url domains that are excluded from our dataset",
403
  ),
404
  style="""
405
+ background-color: #FAEAEA; /* Light pink background */
406
  padding: 15px;
407
  border-radius: 12px;
408
+ margin-bottom: 15px;
409
  """,
410
  ),
411
 
 
416
  background-color: #EAFFF1; /* Light green background */
417
  padding: 15px;
418
  border-radius: 12px;
419
+ margin-bottom: 15px;
420
  """,
421
  ),
422
 
 
447
  "Sample documents with lines that are removed by the rule of terminal punctuation",
448
  ),
449
  style="""
450
+ background-color: #FAEAEA; /* Light pink background */
451
  padding: 15px;
452
  border-radius: 12px;
453
+ margin-bottom: 15px;
454
  """,
455
  ),
456
 
 
474
  "Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
475
  ),
476
  style="""
477
+ background-color: #FAEAEA; /* Light pink background */
478
  padding: 15px;
479
  border-radius: 12px;
480
+ margin-bottom: 15px;
481
  """,
482
  ),
483
  H3("2.2 Other Rules from RefinedWeb"),
 
498
  "Sample documents with lines that are removed by the RefinedWeb rules",
499
  ),
500
  style="""
501
+ background-color: #FAEAEA; /* Light pink background */
502
  padding: 15px;
503
  border-radius: 12px;
504
+ margin-bottom: 15px;
505
  """,
506
  ),
507
  H3("2.3 Toxic Lines"),
 
519
  "Sample documents with toxic lines",
520
  ),
521
  style="""
522
+ background-color: #FAEAEA; /* Light pink background */
523
  padding: 15px;
524
  border-radius: 12px;
525
+ margin-bottom: 15px;
526
  """,
527
  ),
528
 
 
540
  background-color: #EAFFF1; /* Light green background */
541
  padding: 15px;
542
  border-radius: 12px;
543
+ margin-bottom: 15px;
544
  """,
545
  ),
546
  P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
 
584
  background-color: #FFFAEA; /* Light yellow background */
585
  padding: 15px;
586
  border-radius: 12px;
587
+ margin-bottom: 15px;
588
  """,
589
  ),
590
  Details(
 
624
  background-color: #FFFAEA; /* Light yellow background */
625
  padding: 15px;
626
  border-radius: 12px;
627
+ margin-bottom: 15px;
628
  """,
629
  ),
630
  P("""
 
671
  background-color: #EAFFF1; /* Light green background */
672
  padding: 15px;
673
  border-radius: 12px;
674
+ margin-bottom: 15px;
675
  """,
676
  ),
677
  Details(
 
685
  background-color: #EAFFF1; /* Light green background */
686
  padding: 15px;
687
  border-radius: 12px;
688
+ margin-bottom: 15px;
689
  """,
690
  ),
691
  H3("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
 
714
  background-color: #FFFAEA; /* Light yellow background */
715
  padding: 15px;
716
  border-radius: 12px;
717
+ margin-bottom: 15px;
718
  """,
719
  ),
720
  Details(
 
758
  background-color: #FFFAEA; /* Light yellow background */
759
  padding: 15px;
760
  border-radius: 12px;
761
+ margin-bottom: 15px;
762
  """,
763
  ),
764
 
 
787
  background-color: #FFFAEA; /* Light yellow background */
788
  padding: 15px;
789
  border-radius: 12px;
790
+ margin-bottom: 15px;
791
  """,
792
  ),
793
  P("""
 
820
  background-color: #EAFFF1; /* Light green background */
821
  padding: 15px;
822
  border-radius: 12px;
823
+ margin-bottom: 15px;
824
  """,
825
  ),
826
  Details(
 
834
  background-color: #EAFFF1; /* Light green background */
835
  padding: 15px;
836
  border-radius: 12px;
837
+ margin-bottom: 15px;
838
  """,
839
  ),
840
  H3("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
 
866
  background-color: #FFFAEA; /* Light yellow background */
867
  padding: 15px;
868
  border-radius: 12px;
869
+ margin-bottom: 15px;
870
  """,
871
  ),
872
  Details(
 
925
  background-color: #FFFAEA; /* Light yellow background */
926
  padding: 15px;
927
  border-radius: 12px;
928
+ margin-bottom: 15px;
929
  """,
930
  ),
931
 
 
956
  background-color: #FFFAEA; /* Light yellow background */
957
  padding: 15px;
958
  border-radius: 12px;
959
+ margin-bottom: 15px;
960
  """,
961
  ),
962
  P("""
 
1015
  background-color: #EAFFF1; /* Light green background */
1016
  padding: 15px;
1017
  border-radius: 12px;
1018
+ margin-bottom: 15px;
1019
  """,
1020
  ),
1021
  Details(
 
1035
  background-color: #EAFFF1; /* Light green background */
1036
  padding: 15px;
1037
  border-radius: 12px;
1038
+ margin-bottom: 15px;
1039
  """,
1040
  ),
1041
  H5(
 
1052
  background-color: #EAFFF1; /* Light green background */
1053
  padding: 15px;
1054
  border-radius: 12px;
1055
+ margin-bottom: 15px;
1056
  """,
1057
  ),
1058
  H3("3.2 Line-wise Heuristics"),
 
1084
  background-color: #FFFAEA; /* Light yellow background */
1085
  padding: 15px;
1086
  border-radius: 12px;
1087
+ margin-bottom: 15px;
1088
  """,
1089
  ),
1090
  Details(
 
1134
  background-color: #FFFAEA; /* Light yellow background */
1135
  padding: 15px;
1136
  border-radius: 12px;
1137
+ margin-bottom: 15px;
1138
  """,
1139
  ),
1140
 
 
1150
  background-color: #EAFFF1; /* Light green background */
1151
  padding: 15px;
1152
  border-radius: 12px;
1153
+ margin-bottom: 15px;
1154
  """,
1155
  ),
1156
 
 
1216
  background-color: #FFFAEA; /* Light yellow background */
1217
  padding: 15px;
1218
  border-radius: 12px;
1219
+ margin-bottom: 15px;
1220
  """,
1221
  ),
1222
 
 
1233
  background-color: #FFFAEA; /* Light yellow background */
1234
  padding: 15px;
1235
  border-radius: 12px;
1236
+ margin-bottom: 15px;
1237
  """,
1238
  ),
1239
  P("""
 
1283
  background-color: #FFFAEA; /* Light yellow background */
1284
  padding: 15px;
1285
  border-radius: 12px;
1286
+ margin-bottom: 15px;
1287
  """,
1288
  ),
1289
  P("""
 
1305
  background-color: #EAFFF1; /* Light green background */
1306
  padding: 15px;
1307
  border-radius: 12px;
1308
+ margin-bottom: 15px;
1309
  """,
1310
  ),
1311
 
 
1327
  background-color: #FFFAEA; /* Light yellow background */
1328
  padding: 15px;
1329
  border-radius: 12px;
1330
+ margin-bottom: 15px;
1331
  """,
1332
  ),
1333
  Details(
 
1359
  background-color: #FFFAEA; /* Light yellow background */
1360
  padding: 15px;
1361
  border-radius: 12px;
1362
+ margin-bottom: 15px;
1363
  """,
1364
  ),
1365
 
 
1375
  background-color: #FFFAEA; /* Light yellow background */
1376
  padding: 15px;
1377
  border-radius: 12px;
1378
+ margin-bottom: 15px;
1379
  """,
1380
  ),
1381
  Details(
 
1391
  background-color: #EAFFF1; /* Light green background */
1392
  padding: 15px;
1393
  border-radius: 12px;
1394
+ margin-bottom: 15px;
1395
  """,
1396
  ),
1397
 
 
1407
  background-color: #FFFAEA; /* Light yellow background */
1408
  padding: 15px;
1409
  border-radius: 12px;
1410
+ margin-bottom: 15px;
1411
  """,
1412
  ),
1413
  Details(
 
1437
  background-color: #FFFAEA; /* Light yellow background */
1438
  padding: 15px;
1439
  border-radius: 12px;
1440
+ margin-bottom: 15px;
1441
  """,
1442
  ),
1443
  Details(
 
1454
  background-color: #FFFAEA; /* Light yellow background */
1455
  padding: 15px;
1456
  border-radius: 12px;
1457
+ margin-bottom: 15px;
1458
  """,
1459
  ),
1460
  P("""
 
1486
  background-color: #EAFFF1; /* Light green background */
1487
  padding: 15px;
1488
  border-radius: 12px;
1489
+ margin-bottom: 15px;
1490
  """,
1491
  ),
1492
  H3("3.4 Others"),
 
1499
  Summary("Sample documents containing 'lorem ipsum'"),
1500
  DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
1501
  style="""
1502
+ background-color: #FAEAEA; /* Light pink background */
1503
  padding: 15px;
1504
  border-radius: 12px;
1505
+ margin-bottom: 15px;
1506
  """,
1507
  ),
1508
  H2("4. Deduplication"),