omwdataset

Runtime error

App Files Files Community

victormiller committed on Oct 3, 2024

Commit

adcd5e6

verified ·

1 Parent(s): 8061116

Update main.py

Browse files

Files changed (1) hide show

main.py +13 -13

main.py CHANGED Viewed

@@ -178,7 +178,7 @@ def main():
 new_dataset_comparison1 = pd.DataFrame(
     {
         "Data Source": [
-            "CommonCrawl",
             "Papers",
             "Wikipedia",
             "FreeLaw",
@@ -193,7 +193,7 @@ new_dataset_comparison1 = pd.DataFrame(
         ],
         "TxT360": [
-            "99 Snapshots",
             "5 Sources",
             "310+ Languages",
             "Included",
@@ -207,7 +207,7 @@ new_dataset_comparison1 = pd.DataFrame(
             "**",
         ],
         "FineWeb": [
-            "96 Snapshots",
             "-",
             "-",
             "-",
@@ -221,7 +221,7 @@ new_dataset_comparison1 = pd.DataFrame(
             "-",
         ],
         "RefinedWeb": [
-            "90 Snapshots",
             "-",
             "-",
             "-",
@@ -234,8 +234,8 @@ new_dataset_comparison1 = pd.DataFrame(
             "-",
             "-",
         ],
-        "PedPajama-V-2": [
-            "84 Snapshots",
             "-",
             "-",
             "-",
@@ -249,7 +249,7 @@ new_dataset_comparison1 = pd.DataFrame(
             "-",
         ],
         "C4": [
-            "1 Snapshots",
             "-",
             "-",
             "-",
@@ -263,7 +263,7 @@ new_dataset_comparison1 = pd.DataFrame(
             "-",
         ],
         "Dolma": [
-            "24 Snapshots",
             "1 Source",
             "checkmark",
             "-",
@@ -276,8 +276,8 @@ new_dataset_comparison1 = pd.DataFrame(
             "-",
             "Included",
         ],
-        "RedPajama-V-1": [
-            "5 Snapshots",
             "1 Source",
             "checkmark",
             "",
@@ -291,7 +291,7 @@ new_dataset_comparison1 = pd.DataFrame(
             "Included",
         ],
         "The Pile": [
-            "0.6% of 74 Snapshots",
             "4 Sources",
             "English Only",
             "Included",
@@ -636,8 +636,8 @@ def intro():
                 "TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
             ),
             new_table_div_1,
-            table_div_1,
-            table_div_2,
             P(
                 "In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Combining both datasets plays a critical role for effective LLM pre-training. By integrating the reach of web data with the quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training. See Results section below."
             ),

 new_dataset_comparison1 = pd.DataFrame(
     {
         "Data Source": [
+            "CommonCrawl Snapshots",
             "Papers",
             "Wikipedia",
             "FreeLaw",
         ],
         "TxT360": [
+            "99",
             "5 Sources",
             "310+ Languages",
             "Included",
             "**",
         ],
         "FineWeb": [
+            "96",
             "-",
             "-",
             "-",
             "-",
         ],
         "RefinedWeb": [
+            "90",
             "-",
             "-",
             "-",
             "-",
             "-",
         ],
+        "PedPajamaV2": [
+            "84",
             "-",
             "-",
             "-",
             "-",
         ],
         "C4": [
+            "1",
             "-",
             "-",
             "-",
             "-",
         ],
         "Dolma": [
+            "24",
             "1 Source",
             "checkmark",
             "-",
             "-",
             "Included",
         ],
+        "RedPajamaV1": [
+            "5",
             "1 Source",
             "checkmark",
             "",
             "Included",
         ],
         "The Pile": [
+            "0.6% of 74",
             "4 Sources",
             "English Only",
             "Included",
                 "TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
             ),
             new_table_div_1,
+            #table_div_1,
+            #table_div_2,
             P(
                 "In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Combining both datasets plays a critical role for effective LLM pre-training. By integrating the reach of web data with the quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training. See Results section below."
             ),