victormiller committed on
Commit
6263148
1 Parent(s): 88c0211

Update web.py

Browse files
Files changed (1) hide show
  1. web.py +32 -16
web.py CHANGED
@@ -586,9 +586,12 @@ def web_data():
586
  P("""
587
  In this section, we introduce all the quality signals that we have used to filter out low-quality documents.
588
  Overview of all the quality signals that are used for filtering."""),
589
- DVS(
590
- json.load(open("data/all_signals.json")),
591
- "Overview of all the quality signals that are used for filtering",
 
 
 
592
  ),
593
  P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
594
  Most of these quality signals were initially introduced by Gopher [2] and subsequently adopted by later
@@ -636,10 +639,13 @@ def web_data():
636
  ensures consistency with the overall document character count calculation.
637
  """),
638
  H5("Our Implementation"),
639
- DV(
640
- "data/repeat_line_frac.jsonl",
641
- 0,
642
- "Sample documents filtered by excessive line repetitions / characters in repeated lines",
 
 
 
643
  ),
644
  H5("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
645
  P("""
@@ -663,10 +669,13 @@ def web_data():
663
  only once — tend to be short.
664
  """),
665
  H5("Our Implementations"),
666
- DV(
667
- "data/sample_top_ngram.json",
668
- 0,
669
- "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
 
 
 
670
  ),
671
  H5("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
672
  P("""
@@ -710,10 +719,13 @@ def web_data():
710
  works ([2], [3], [6]), we remove the documents if more than 30% of the lines end with an ellipsis or more than
711
  90% of lines start with a bullet point.
712
  """),
713
- DV(
714
- "data/line_info.json",
715
- 0,
716
- "Sample documents that are filtered out by line-wise heuristics",
 
 
 
717
  ),
718
  H4("3.3 Statistics-based Heuristics"),
719
  P("""
@@ -806,7 +818,11 @@ def web_data():
806
  Following C4, we remove any page where the phrase “lorem ipsum” appeared since some pages had placeholder “lorem ipsum”
807
  text.
808
  """),
809
- DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum"),
 
 
 
 
810
  H3("4. Deduplication"),
811
  P("..."), # Add detailed content and images as needed
812
  H3("5. PII Removal"),
 
586
  P("""
587
  In this section, we introduce all the quality signals that we have used to filter out low-quality documents.
588
  Overview of all the quality signals that are used for filtering."""),
589
+ Details(
590
+ Summary("Overview of all the quality signals that are used for filtering"),
591
+ DVS(
592
+ json.load(open("data/all_signals.json")),
593
+ "Overview of all the quality signals that are used for filtering",
594
+ ),
595
  ),
596
  P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
597
  Most of these quality signals were initially introduced by Gopher [2] and subsequently adopted by later
 
639
  ensures consistency with the overall document character count calculation.
640
  """),
641
  H5("Our Implementation"),
642
+ Details(
643
+ Summary("Sample documents filtered by excessive line repetitions / characters in repeated lines"),
644
+ DV(
645
+ "data/repeat_line_frac.jsonl",
646
+ 0,
647
+ "Sample documents filtered by excessive line repetitions / characters in repeated lines",
648
+ ),
649
  ),
650
  H5("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
651
  P("""
 
669
  only once — tend to be short.
670
  """),
671
  H5("Our Implementations"),
672
+ Details(
673
+ Summary("Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)"),
674
+ DV(
675
+ "data/sample_top_ngram.json",
676
+ 0,
677
+ "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
678
+ ),
679
  ),
680
  H5("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
681
  P("""
 
719
  works ([2], [3], [6]), we remove the documents if more than 30% of the lines end with an ellipsis or more than
720
  90% of lines start with a bullet point.
721
  """),
722
+ Details(
723
+ Summary("Sample documents that are filtered out by line-wise heuristics"),
724
+ DV(
725
+ "data/line_info.json",
726
+ 0,
727
+ "Sample documents that are filtered out by line-wise heuristics",
728
+ ),
729
  ),
730
  H4("3.3 Statistics-based Heuristics"),
731
  P("""
 
818
  Following C4, we remove any page where the phrase “lorem ipsum” appeared since some pages had placeholder “lorem ipsum”
819
  text.
820
  """),
821
+
822
+ Details(
823
+ Summary("Sample documents containing 'lorem ipsum'"),
824
+ DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
825
+ ),
826
  H3("4. Deduplication"),
827
  P("..."), # Add detailed content and images as needed
828
  H3("5. PII Removal"),