victormiller committed on
Commit f36591a
1 Parent(s): 564e0a1

Update curated.py

Files changed (1)
  1. curated.py +4 -17
curated.py CHANGED
@@ -445,14 +445,9 @@ filtering_process = Div(
  Section(
  H3("Wikipedia"),
  H4("Download and Extraction"),
- Ol(
- Li("The Wikimedia dataset was downloaded from the official snapshot on Huggingface", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main")),
- Li("Data is originally in parqet format so we used the", D_code("huggingface dataset.to_json"), " function to convert the data to the jsonl format"),
- ),
  H4("Filtering"),
- Ol(
- Li("As we expect the dataset to be already of high quality so only one filter is applied which is to remove all documents (articles) with less than 10 words (not inclusive)"),
- ),
  H4("Local Deduplication Process"),
  Ol(
  Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
@@ -463,12 +458,7 @@ filtering_process = Div(
  Section(
  H3("ArXiv"),
  H4("Download and Extraction"),
- Ol(
- Li("All the data was downloaded in original latex format from Arxiv official S3 dump s3://arxic/src"),
- Li("We try to encode the downloaded data into utf-8 or guess encoding using chardet library"),
- Li("After that pandoc was used to extract information from the latex files and saved as markdown format - code: pandoc -s {tex} -o out/{out_name}.md --wrap=none"),
- Li("All markdowns were combined to create jsonl files"),
- ),
  H4("Filtering"),
  P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
  Ol(
@@ -1207,12 +1197,9 @@ def curated(request):
  H2("Curated Sources Defined"),
  table_desc,
  data_preprocessing_div,
- # plotly2fasthtml(get_chart_28168342()),
- # plotly2fasthtml(get_chart_new()),
- # plotly2fasthtml(stacked_bar),
- # plotly2fasthtml(diff_stacked_bar),
  plotly2fasthtml(diff2_stacked_bar),
  H2("Curated Sources Processing"),
  filtering_process,
  data_preparation_div,
  H2("Local Deduplication"),
 
  Section(
  H3("Wikipedia"),
  H4("Download and Extraction"),
+ P("The Wikimedia dataset was downloaded from the official snapshot on Huggingface: ", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"), ". The ", D_code("huggingface dataset.to_json", language="python"), " function was used to convert the original parquet format to the jsonl format."),
 
 
 
  H4("Filtering"),
+ P("Manual inspection of the dataset demonstrated high-quality content, so only one filter was applied: removing articles with too few words. Based on normal sentence constructs, an article was kept if it contained 10 or more words; any article with fewer than 10 words was removed."),
 
 
  H4("Local Deduplication Process"),
  Ol(
  Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
 
  Section(
  H3("ArXiv"),
  H4("Download and Extraction"),
+ P("All the data was downloaded in the original LaTeX format from ArXiv's official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We tried to decode the downloaded data as UTF-8, or guessed the encoding using the chardet library. Pandoc was then used to extract the content of the LaTeX files and save it as markdown: ", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="bash"), ". All markdown files were combined to create the jsonl files."),
 
 
 
 
 
  H4("Filtering"),
  P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
  Ol(
 
  H2("Curated Sources Defined"),
  table_desc,
  data_preprocessing_div,
  plotly2fasthtml(diff2_stacked_bar),
  H2("Curated Sources Processing"),
+ H3("TALK ABOUT THE DIFFERENT FILTERS BEFORE HAND"),
  filtering_process,
  data_preparation_div,
  H2("Local Deduplication"),