Spaces:
Runtime error
Runtime error
victormiller
committed on
Commit
•
ed640d3
1
Parent(s):
f4f88cc
Update web.py
Browse files
web.py
CHANGED
@@ -396,7 +396,7 @@ def web_data():
|
|
396 |
padding: 15px 15px 0px 15px;
|
397 |
""",
|
398 |
),
|
399 |
-
H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets")
|
400 |
P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
|
401 |
table_div_filter_data,
|
402 |
P("The table below provides a comparison of the quality filters that have been applied to each dataset."),
|
@@ -404,8 +404,8 @@ def web_data():
|
|
404 |
P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
|
405 |
Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
|
406 |
P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
|
407 |
-
H3("TxT360 Filter Summary")
|
408 |
-
P("This section provides highlevel details into the filtering that is applied to CommonCrawl in TxT360. Each decision listed is discussed in detail further on in this section.")
|
409 |
P("We adopt rules from RefinedWeb [1] to remove lines if they satisfy any of the following criteria:"),
|
410 |
Ul(
|
411 |
Li("the line is only composed of uppercase characters", style = "margin-bottom: 5px"),
|
|
|
396 |
padding: 15px 15px 0px 15px;
|
397 |
""",
|
398 |
),
|
399 |
+
H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
|
400 |
P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
|
401 |
table_div_filter_data,
|
402 |
P("The table below provides a comparison of the quality filters that have been applied to each dataset."),
|
|
|
404 |
P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
|
405 |
Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
|
406 |
P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
|
407 |
+
H3("TxT360 Filter Summary"),
|
408 |
+
P("This section provides highlevel details into the filtering that is applied to CommonCrawl in TxT360. Each decision listed is discussed in detail further on in this section."),
|
409 |
P("We adopt rules from RefinedWeb [1] to remove lines if they satisfy any of the following criteria:"),
|
410 |
Ul(
|
411 |
Li("the line is only composed of uppercase characters", style = "margin-bottom: 5px"),
|