victormiller committed
Commit
0e12ce8
1 Parent(s): 6a336ca

Update curated.py

Files changed (1)
curated.py +5 -5
curated.py CHANGED
@@ -544,7 +544,7 @@ data_preprocessing_div = Div(
     P(
         "The ",
         B("Unigram Log Probability Filter"),
-        " calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but maay not capture the semantic meaning of words. To calculate the average log word probability, we use word frequencies extracted from the ",
+        " calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but may not capture the semantic meaning of words. To calculate the average log word probability, we use word frequencies extracted from the ",
         A("1T Web-gram corpus", href="https://catalog.ldc.upenn.edu/LDC2006T13"),
         ". Specifically, we use the list available created by ",
         A(
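For context on the filter this hunk edits: a minimal sketch of an average-unigram-log-probability check, assuming a word-frequency table in the spirit of the 1T Web-gram corpus. The counts, function names, and threshold below are illustrative placeholders, not the implementation in curated.py.

import math

# Hypothetical unigram counts, e.g. loaded from the Google 1T Web-gram corpus
# (https://catalog.ldc.upenn.edu/LDC2006T13); real tables are far larger.
UNIGRAM_COUNTS = {"the": 23_135_851_162, "of": 13_151_942_776, "model": 45_493_887}
TOTAL_COUNT = sum(UNIGRAM_COUNTS.values())

def avg_unigram_log_prob(text: str) -> float:
    """Average log probability of the words in `text` under a unigram model.

    Words missing from the frequency list are skipped here; another policy
    (e.g. a small floor probability) would be equally plausible.
    """
    log_probs = [
        math.log(UNIGRAM_COUNTS[w] / TOTAL_COUNT)
        for w in text.lower().split()
        if w in UNIGRAM_COUNTS
    ]
    return sum(log_probs) / len(log_probs) if log_probs else float("-inf")

def passes_unigram_filter(text: str, threshold: float = -10.0) -> bool:
    # A document passes if its average log probability is high enough;
    # the threshold is an arbitrary placeholder, not the project's value.
    return avg_unigram_log_prob(text) >= threshold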
@@ -555,7 +555,7 @@ data_preprocessing_div = Div(
     ),
     H3("Data Processing for S2ORC"),
     P(
-        "The formating of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."
+        "The formatting of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."
     ),
     P(
         "The ",
@@ -637,7 +637,7 @@ filtering_process = Div(
     ),
     plotly2fasthtml(diff2_stacked_bar),
     H3(
-        "This section continues belows with the specific filtering steps taken for all 14 curated datasets."
+        "This section continues below with the specific filtering steps taken for all 14 curated datasets."
     ),
 ),
 Section(
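The context line above embeds a Plotly figure with plotly2fasthtml. As a rough sketch of that pattern, assuming plotly2fasthtml comes from the fh-plotly helper (a Plotly Figure in, a FastHTML component out) and using placeholder data rather than the real filtering counts:

import plotly.graph_objects as go
# plotly2fasthtml is assumed to come from the fh-plotly glue package;
# the exact import path in this repo may differ.
from fh_plotly import plotly2fasthtml

# Placeholder per-dataset document counts before/after filtering.
datasets = ["ArXiv", "S2ORC", "PubMed"]
kept = [120_000, 95_000, 80_000]
removed = [30_000, 45_000, 20_000]

diff2_stacked_bar = go.Figure(data=[
    go.Bar(name="Kept", x=datasets, y=kept),
    go.Bar(name="Removed", x=datasets, y=removed),
])
diff2_stacked_bar.update_layout(barmode="stack")

# Embeddable FastHTML component, as used in the hunk above.
chart_component = plotly2fasthtml(diff2_stacked_bar)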
@@ -1188,7 +1188,7 @@ filtering_process = Div(
         language="python",
     ),
     P(
-        "All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priorty was given to plain_text first, followed by the columns in the table in reverse order."
+        "All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."
     ),
     P(B("Unique Data Preparation Challenges: ")),
     Ul(
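The corrected sentence in this hunk describes a Pile-style priority rule for picking one text column per document. A small sketch of that rule; every column name besides plain_text is hypothetical, since the referenced table is not part of this diff.

# Hypothetical priority order: plain_text first, then the table's remaining
# columns in reverse order (the real names live in the table curated.py
# renders alongside this paragraph).
COLUMN_PRIORITY = ["plain_text", "column_c", "column_b", "column_a"]

def select_text(record: dict) -> str | None:
    """Return the highest-priority non-empty text field of a record."""
    for column in COLUMN_PRIORITY:
        value = record.get(column)
        if value:  # skip missing or empty columns
            return value
    return None  # nothing usable; such a record would be dropped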
@@ -1352,7 +1352,7 @@ filtering_process = Div(
     ),
     P(
         B("Download and Extraction: "),
-        "The dataset was downloaded rirectly downloaded from the Huggingface repo: ",
+        "The dataset was downloaded directly from the Huggingface repo: ",
         A(
             "https://huggingface.co/datasets/deepmind/math_dataset",
             href="https://huggingface.co/datasets/deepmind/math_dataset",
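The fixed line points at the deepmind/math_dataset repo on Hugging Face. One plausible way to fetch it with the datasets library; the module name below is just an example config, not necessarily the one the pipeline used.

from datasets import load_dataset

# "algebra__linear_1d" is one of the dataset's per-module configs, used here
# only as an example; newer `datasets` versions may additionally require
# trust_remote_code=True for script-based datasets like this one.
ds = load_dataset("deepmind/math_dataset", "algebra__linear_1d", split="train")
print(ds[0])  # records expose "question" and "answer" fields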
 