victormiller committed
Commit
0e12ce8
1 Parent(s): 6a336ca

Update curated.py

Files changed (1)
curated.py +5 -5
curated.py CHANGED
@@ -544,7 +544,7 @@ data_preprocessing_div = Div(
     P(
         "The ",
         B("Unigram Log Probability Filter"),
-        " calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but maay not capture the semantic meaning of words. To calculate the average log word probability, we use word frequencies extracted from the ",
+        " calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but may not capture the semantic meaning of words. To calculate the average log word probability, we use word frequencies extracted from the ",
         A("1T Web-gram corpus", href="https://catalog.ldc.upenn.edu/LDC2006T13"),
         ". Specifically, we use the list available created by ",
         A(
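For context on the filter this hunk edits: a minimal sketch of an average-unigram-log-probability check, assuming a word-frequency table in the spirit of the 1T Web-gram corpus. The counts, function names, and threshold below are illustrative placeholders, not the implementation in curated.py.

import math

# Hypothetical unigram counts, e.g. loaded from the Google 1T Web-gram corpus
# (https://catalog.ldc.upenn.edu/LDC2006T13); real tables are far larger.
UNIGRAM_COUNTS = {"the": 23_135_851_162, "of": 13_151_942_776, "model": 45_493_887}
TOTAL_COUNT = sum(UNIGRAM_COUNTS.values())

def avg_unigram_log_prob(text: str) -> float:
    """Average log probability of the words in `text` under a unigram model.

    Words missing from the frequency list are skipped here; another policy
    (e.g. a small floor probability) would be equally plausible.
    """
    log_probs = [
        math.log(UNIGRAM_COUNTS[w] / TOTAL_COUNT)
        for w in text.lower().split()
        if w in UNIGRAM_COUNTS
    ]
    return sum(log_probs) / len(log_probs) if log_probs else float("-inf")

def passes_unigram_filter(text: str, threshold: float = -10.0) -> bool:
    # A document passes if its average log probability is high enough;
    # the threshold is an arbitrary placeholder, not the project's value.
    return avg_unigram_log_prob(text) >= threshold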
@@ -555,7 +555,7 @@ data_preprocessing_div = Div(
     ),
     H3("Data Processing for S2ORC"),
     P(
-        "The formating of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."
+        "The formatting of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."
     ),
     P(
         "The ",
@@ -637,7 +637,7 @@ filtering_process = Div(
     ),
     plotly2fasthtml(diff2_stacked_bar),
     H3(
-        "This section continues belows with the specific filtering steps taken for all 14 curated datasets."
+        "This section continues below with the specific filtering steps taken for all 14 curated datasets."
     ),
 ),
 Section(
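The context line above embeds a Plotly figure with plotly2fasthtml. As a rough sketch of that pattern, assuming plotly2fasthtml comes from the fh-plotly helper (a Plotly Figure in, a FastHTML component out) and using placeholder data rather than the real filtering counts:

import plotly.graph_objects as go
# plotly2fasthtml is assumed to come from the fh-plotly glue package;
# the exact import path in this repo may differ.
from fh_plotly import plotly2fasthtml

# Placeholder per-dataset document counts before/after filtering.
datasets = ["ArXiv", "S2ORC", "PubMed"]
kept = [120_000, 95_000, 80_000]
removed = [30_000, 45_000, 20_000]

diff2_stacked_bar = go.Figure(data=[
    go.Bar(name="Kept", x=datasets, y=kept),
    go.Bar(name="Removed", x=datasets, y=removed),
])
diff2_stacked_bar.update_layout(barmode="stack")

# Embeddable FastHTML component, as used in the hunk above.
chart_component = plotly2fasthtml(diff2_stacked_bar)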
@@ -1188,7 +1188,7 @@ filtering_process = Div(
         language="python",
     ),
     P(
-        "All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priorty was given to plain_text first, followed by the columns in the table in reverse order."
+        "All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."
     ),
     P(B("Unique Data Preparation Challenges: ")),
     Ul(
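The corrected sentence in this hunk describes a Pile-style priority rule for picking one text column per document. A small sketch of that rule; every column name besides plain_text is hypothetical, since the referenced table is not part of this diff.

# Hypothetical priority order: plain_text first, then the table's remaining
# columns in reverse order (the real names live in the table curated.py
# renders alongside this paragraph).
COLUMN_PRIORITY = ["plain_text", "column_c", "column_b", "column_a"]

def select_text(record: dict) -> str | None:
    """Return the highest-priority non-empty text field of a record."""
    for column in COLUMN_PRIORITY:
        value = record.get(column)
        if value:  # skip missing or empty columns
            return value
    return None  # nothing usable; such a record would be dropped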
@@ -1352,7 +1352,7 @@ filtering_process = Div(
     ),
     P(
         B("Download and Extraction: "),
-        "The dataset was downloaded rirectly downloaded from the Huggingface repo: ",
+        "The dataset was downloaded directly from the Huggingface repo: ",
         A(
             "https://huggingface.co/datasets/deepmind/math_dataset",
             href="https://huggingface.co/datasets/deepmind/math_dataset",
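The fixed line points at the deepmind/math_dataset repo on Hugging Face. One plausible way to fetch it with the datasets library; the module name below is just an example config, not necessarily the one the pipeline used.

from datasets import load_dataset

# "algebra__linear_1d" is one of the dataset's per-module configs, used here
# only as an example; newer `datasets` versions may additionally require
# trust_remote_code=True for script-based datasets like this one.
ds = load_dataset("deepmind/math_dataset", "algebra__linear_1d", split="train")
print(ds[0])  # records expose "question" and "answer" fields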
 