CarisMu committed on
Commit
2767124
·
verified ·
1 Parent(s): e52677f

Update web.py

Browse files
Files changed (1) hide show
  1. web.py +573 -448
web.py CHANGED
@@ -319,7 +319,10 @@ def web_data():
319
 
320
  Details(
321
  Summary("Non-English Documents"),
322
- DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
 
 
 
323
  style="""
324
  background-color: #FAEAEA; /* Light pink background */
325
  padding: 15px;
@@ -332,7 +335,10 @@ def web_data():
332
 
333
  Details(
334
  Summary("English Documents Scoring Lower than 0.65"),
335
- DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
 
 
 
336
  style="""
337
  background-color: #EAFFF1; /* Light green background */
338
  padding: 15px;
@@ -355,7 +361,10 @@ def web_data():
355
 
356
  Details(
357
  Summary("24 URL domains with more than 4k matches"),
358
- DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
 
 
 
359
  style="""
360
  background-color: #FAEAEA; /* Light pink background */
361
  padding: 15px;
@@ -369,7 +378,10 @@ def web_data():
369
  """),
370
  Details(
371
  Summary("6 url domains that are removed from the blocklist"),
372
- DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
 
 
 
373
  style="""
374
  background-color: #FAEAEA; /* Light pink background */
375
  padding: 15px;
@@ -380,11 +392,13 @@ def web_data():
380
 
381
  Details(
382
  Summary("Sample documents whose urls are blocked by the refined url blocklist"),
383
- DV(
 
384
  "data/bad_url_doc.jsonl",
385
  3,
386
  "Sample documents whose urls are blocked by the refined url blocklist",
387
- ),
 
388
  style="""
389
  background-color: #FAEAEA; /* Light pink background */
390
  padding: 15px;
@@ -400,9 +414,12 @@ def web_data():
400
 
401
  Details(
402
  Summary("curated url domains that are excluded from our dataset"),
403
- DVS(
 
404
  non_web_urls,
405
  "curated url domains that are excluded from our dataset",
 
 
406
  ),
407
  style="""
408
  background-color: #FAEAEA; /* Light pink background */
@@ -414,7 +431,10 @@ def web_data():
414
 
415
  Details(
416
  Summary("Sample documents whose urls are in our curated url domain list"),
417
- DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
 
 
 
418
  style="""
419
  background-color: #EAFFF1; /* Light green background */
420
  padding: 15px;
@@ -444,11 +464,14 @@ def web_data():
444
 
445
  Details(
446
  Summary("Sample documents with lines that are removed by the rule of terminal punctuation"),
447
- DV(
448
- "data/sample_terminal_punc.json",
449
- 0,
450
- "Sample documents with lines that are removed by the rule of terminal punctuation",
451
- ),
 
 
 
452
  style="""
453
  background-color: #FAEAEA; /* Light pink background */
454
  padding: 15px;
@@ -471,10 +494,13 @@ def web_data():
471
  """),
472
  Details(
473
  Summary("Sample documents that are removed by original C4 javascript rule but are kept after our refinement"),
474
- DV(
 
475
  "data/sample_java.jsonl",
476
  0,
477
  "Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
 
 
478
  ),
479
  style="""
480
  background-color: #FAEAEA; /* Light pink background */
@@ -495,10 +521,13 @@ def web_data():
495
  ),
496
  Details(
497
  Summary("Sample documents with lines that are removed by the RefinedWeb rules"),
498
- DV(
 
499
  "data/sample_refinedweb_line.json",
500
  0,
501
  "Sample documents with lines that are removed by the RefinedWeb rules",
 
 
502
  ),
503
  style="""
504
  background-color: #FAEAEA; /* Light pink background */
@@ -517,9 +546,12 @@ def web_data():
517
  """),
518
  Details(
519
  Summary("Sample documents with toxic lines"),
520
- DVS(
 
521
  json.load(open("data/toxic_lines.json")),
522
  "Sample documents with toxic lines",
 
 
523
  ),
524
  style="""
525
  background-color: #FAEAEA; /* Light pink background */
@@ -535,9 +567,12 @@ def web_data():
535
  """),
536
  Details(
537
  Summary("Overview of all the quality signals that are used for filtering"),
538
- DVS(
 
539
  json.load(open("data/all_signals.json")),
540
  "Overview of all the quality signals that are used for filtering",
 
 
541
  ),
542
  style="""
543
  background-color: #EAFFF1; /* Light green background */
@@ -567,22 +602,25 @@ def web_data():
567
  """),
568
  Details(
569
  Summary("Implementations from Dolma"),
570
- D_code("""
571
- words = text.split()
572
- word_count = len(words)
573
- character_count = sum(len(word) for word in words)
574
- ...
575
- lines = text.split("\n")
576
- line_count = len(lines)
577
- ...
578
- line_counts = Counter(lines)
579
- attrs.fraction_of_duplicate_lines = sum(count for line, count in line_counts.items() if count > 1) / max(
580
- line_count, 1
581
- )
582
- attrs.fraction_of_characters_in_duplicate_lines = sum(
583
- len(line) * count for line, count in line_counts.items() if count > 1
584
- ) / max(character_count, 1)
585
- """, block="block", language="python"),
 
 
 
586
  style="""
587
  background-color: #FFFAEA; /* Light yellow background */
588
  padding: 15px;
@@ -592,37 +630,40 @@ def web_data():
592
  ),
593
  Details(
594
  Summary("Implementations from DataTrove"),
595
- D_code("""
596
- def find_duplicates(x: list[str]) -> tuple[int, int]:
597
- unique_x = set()
598
- duplicate_chars = 0
599
- duplicate_elements = 0
600
- for element in x:
601
- if element in unique_x:
602
- duplicate_chars += len(element)
603
- duplicate_elements += 1
604
-
605
- else:
606
- unique_x.add(element)
607
- return duplicate_elements, duplicate_chars
608
- ...
609
- self.paragraph_exp = re.compile(r"\n{2,}")
610
- self._line_splitter = re.compile("\n+")
611
- ...
612
- paragraphs = self.paragraph_exp.split(text.strip())
613
- paragraphs_duplicates, char_duplicates = find_duplicates(paragraphs)
614
- if self.dup_para_frac and paragraphs_duplicates / len(paragraphs) > self.dup_para_frac:
615
- return False, "dup_para_frac"
616
- if self.dup_para_char_frac and char_duplicates / len(text) > self.dup_para_char_frac:
617
- return False, "dup_para_char_frac"
618
-
619
- lines = self._line_splitter.split(text)
620
- line_duplicates, char_duplicates = find_duplicates(lines)
621
- if self.dup_line_frac and line_duplicates / len(lines) > self.dup_line_frac:
622
- return False, "dup_line_frac"
623
- if self.dup_line_char_frac and char_duplicates / len(text) > self.dup_line_char_frac:
624
- return False, "dup_line_char_frac"
625
- """, block="block", language="python"),
 
 
 
626
  style="""
627
  background-color: #FFFAEA; /* Light yellow background */
628
  padding: 15px;
@@ -654,22 +695,25 @@ def web_data():
654
  H3("TxT360 Implementation"),
655
  Details(
656
  Summary("TxT360 Implementation"),
657
- D_code("""
658
- words = text.split()
659
- word_count = len(words)
660
- character_count = sum(len(word) for word in words)
661
- ...
662
- lines = text.split("\n")
663
- line_count = len(lines)
664
-
665
- line_counts = Counter(lines)
666
- attrs.fraction_of_duplicate_lines = (
667
- sum((count - 1) for line, count in line_counts.items() if count > 1) / line_count
668
- )
669
- attrs.fraction_of_characters_in_duplicate_lines = (
670
- sum(sum(len(w) for w in line.split()) * (count - 1) for line, count in
671
- line_counts.items() if count > 1) / character_count
672
- """, block="block", language="python"),
 
 
 
673
  style="""
674
  background-color: #EAFFF1; /* Light green background */
675
  padding: 15px;
@@ -679,10 +723,13 @@ def web_data():
679
  ),
680
  Details(
681
  Summary("Sample documents filtered by excessive line repetitions / characters in repeated lines"),
682
- DV(
 
683
  "data/repeat_line_frac.jsonl",
684
  0,
685
  "Sample documents filtered by excessive line repetitions / characters in repeated lines",
 
 
686
  ),
687
  style="""
688
  background-color: #EAFFF1; /* Light green background */
@@ -698,21 +745,24 @@ def web_data():
698
  """),
699
  Details(
700
  Summary("Implementations from Dolma"),
701
- D_code("""
702
- def all_ngram_counts(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
703
- return [(n, Counter(list(zip(*[words[i:] for i in range(n)])))) for n in range(2, 11)]
704
- ...
705
- all_counts = all_ngram_counts(words)
706
-
707
- count_most_common_ngrams = (2, 3, 4)
708
- for n, ngram_counts in all_counts:
709
- if not ngram_counts:
710
- continue
711
- if n in count_most_common_ngrams:
712
- most_common_ngram, count = ngram_counts.most_common(1)[0]
713
- value = count * sum(len(w) for w in most_common_ngram) / max(character_count, 1)
714
- attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
715
- """, block="block", language="python"),
 
 
 
716
  style="""
717
  background-color: #FFFAEA; /* Light yellow background */
718
  padding: 15px;
@@ -722,7 +772,8 @@ def web_data():
722
  ),
723
  Details(
724
  Summary("Implementations from RedPajama-V2"),
725
- D_code("""
 
726
  class Base_RPS_Frac_Chars_In_Top_NGram(RPSBase): # noqa
727
  ## Base class for calculating the fraction of characters in the top N-gram. This operates on the lower-cased, punctation removed content.
728
  NGRAM_SIZE: int = None
@@ -756,7 +807,9 @@ def web_data():
756
  score = sum(len(w) for w in ngram) * count / total_chars
757
  score = round(score, PRECISION)
758
  return [(0, len(document), score)]
759
- """, block="block", language="python"),
 
 
760
  style="""
761
  background-color: #FFFAEA; /* Light yellow background */
762
  padding: 15px;
@@ -767,25 +820,28 @@ def web_data():
767
 
768
  Details(
769
  Summary("Implementations from DataTrove"),
770
- D_code("""
771
- def get_n_grams(words: list[str], n: int) -> list[str]:
772
- return [" ".join(words[i : i + n]) for i in range(len(words) - n + 1)]
773
-
774
- def find_top_duplicate(x: list[str]) -> int:
775
- counter = Counter()
776
- for element in x:
777
- counter[element] += 1
778
- top_n_gram = counter.most_common(1)[0]
779
- return len(top_n_gram[0]) * top_n_gram[1]
780
- ...
781
- for n, n_frac in self.top_n_grams:
782
- n_grams = get_n_grams(words, n)
783
- if not n_grams:
784
- continue
785
- top_char_length = find_top_duplicate(n_grams)
786
- if top_char_length / len(text) > n_frac:
787
- return False, f"top_n_gram"
788
- """, block="block", language="python"),
 
 
 
789
  style="""
790
  background-color: #FFFAEA; /* Light yellow background */
791
  padding: 15px;
@@ -805,20 +861,23 @@ def web_data():
805
  """),
806
  Details(
807
  Summary("TxT360 Implementation"),
808
- D_code("""
809
- def all_ngram_counts_new(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
810
- return [(n, list(zip(*[words[i:] for i in range(n)]))) for n in range(2, 11)]
811
- ...
812
- all_counts = all_ngram_counts_new(words)
813
- count_most_common_ngrams = (2, 3, 4)
814
- for n, ngram_counts in all_counts:
815
- if not ngram_counts:
816
- continue
817
- if n in count_most_common_ngrams:
818
- most_common_ngram, count = Counter(ngram_counts).most_common(1)[0]
819
- value = count * sum(len(w) for w in most_common_ngram) / character_count
820
- attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
821
- """, block="block", language="python"),
 
 
 
822
  style="""
823
  background-color: #EAFFF1; /* Light green background */
824
  padding: 15px;
@@ -828,10 +887,13 @@ def web_data():
828
  ),
829
  Details(
830
  Summary("Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)"),
831
- DV(
 
832
  "data/sample_top_ngram.json",
833
  0,
834
  "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
 
 
835
  ),
836
  style="""
837
  background-color: #EAFFF1; /* Light green background */
@@ -848,23 +910,26 @@ def web_data():
848
  """),
849
  Details(
850
  Summary("Implementations from Dolma"),
851
- D_code("""
852
- def all_ngram_counts(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
853
- return [(n, Counter(list(zip(*[words[i:] for i in range(n)])))) for n in range(2, 11)]
854
- ...
855
- all_counts = all_ngram_counts(words)
856
- for n, ngram_counts in all_counts:
857
- if not ngram_counts:
858
- continue
859
- if n in count_most_common_ngrams:
860
- ...
861
- else:
862
- ng_char_count = sum(count * sum(len(w) for w in ng) for ng, count in ngram_counts.items())
863
- value = sum(
864
- count * sum(len(w) for w in ng) for ng, count in ngram_counts.items() if count > 1
865
- ) / max(ng_char_count, 1)
866
- attrs.fraction_of_characters_in_duplicate_ngrams.append((n, value))
867
- """, block="block", language="python"),
 
 
 
868
  style="""
869
  background-color: #FFFAEA; /* Light yellow background */
870
  padding: 15px;
@@ -874,56 +939,59 @@ def web_data():
874
  ),
875
  Details(
876
  Summary("Implementations from RedPajama-V2"),
877
- D_code("""
878
- class Base_RPS_Frac_Chars_In_Dupe_NGrams(RPSBase): # noqa
879
- ## Base class for calculating the fraction of characters in duplicate word N-grams. This operates on the lower-cased, punctation removed content. The function also ensures that characters in overlapping ngrams are only counted once.
880
- NGRAM_SIZE: int = None
881
- __slots__ = []
882
-
883
- def __call__(self, document: Document) -> SignalType:
884
- if self.NGRAM_SIZE is None:
885
- raise NotImplementedError(
886
- "NGRAM_SIZE must be set in the subclass"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
887
  )
888
-
889
- if len(document.normalized_words) < self.NGRAM_SIZE:
890
- return [(0, len(document), 0.0)]
891
-
892
- # fetch the ngrams from the document if they exist, otherwise
893
- # compute them
894
- doc_n_grams = (
895
- getattr(document, f"norm_self.NGRAM_SIZEgrams", None)
896
- or
897
- tuple(form_ngrams(
898
- iter(document.normalized_words), self.NGRAM_SIZE
899
- ))
900
- )
901
-
902
- # keep only ngrams which occur at least twice
903
- ngram_dupes =
904
- ngram for ngram, count in Counter(doc_n_grams).items() if count > 1
905
-
906
-
907
- duplicated_grams = np.zeros(len(document.normalized_words), dtype=int)
908
-
909
- i = 0
910
- for ngram in doc_n_grams:
911
- if ngram in ngram_dupes:
912
- duplicated_grams[i: i + self.NGRAM_SIZE] = 1
913
-
914
- i += 1
915
-
916
- word_lengths = np.array(list(map(len, document.normalized_words)))
917
- chars_duped = np.sum(word_lengths * duplicated_grams)
918
- total_chars = np.sum(word_lengths)
919
-
920
- if total_chars == 0:
921
- return [(0, len(document), 0.0)]
922
-
923
- score = float(chars_duped / total_chars)
924
- score = round(score, PRECISION)
925
- return [(0, len(document), score)]
926
- """, block="block", language="python"),
927
  style="""
928
  background-color: #FFFAEA; /* Light yellow background */
929
  padding: 15px;
@@ -934,27 +1002,30 @@ def web_data():
934
 
935
  Details(
936
  Summary("Implementations from DataTrove"),
937
- D_code("""
938
- def find_all_duplicate(words: list[str], n: int) -> int:
939
- n_words = len(words)
940
- unique = set()
941
- repeated_chars, idx = 0, 0
942
- while idx < n_words - n + 1:
943
- n_gram = "".join(words[idx : idx + n])
944
- if n_gram in unique:
945
- repeated_chars += len(n_gram)
946
- idx += n
947
- else:
948
- unique.add(n_gram)
949
- idx += 1
950
- assert repeated_chars <= len("".join(words))
951
- return repeated_chars
952
- ...
953
- for n, n_frac in self.dup_n_grams:
954
- n_duplicates_char = find_all_duplicate(words, n)
955
- if n_duplicates_char / len(text) > n_frac:
956
- return False, f"duplicated_n_grams"
957
- """, block="block", language="python"),
 
 
 
958
  style="""
959
  background-color: #FFFAEA; /* Light yellow background */
960
  padding: 15px;
@@ -979,41 +1050,44 @@ def web_data():
979
  """),
980
  Details(
981
  Summary("TxT360 Implementation"),
982
- D_code("""
983
- def get_dup_ngram_frac(n, doc_n_grams, text):
984
- # fetch the ngrams from the document if they exist, otherwise compute them
985
- # doc_n_grams = list(zip(*[words[i:] for i in range(n)]))
986
-
987
- duplicated_grams = np.zeros(len(text.split()), dtype=int)
988
-
989
- unique_ngrams = set()
990
-
991
- for i, ngram in enumerate(doc_n_grams):
992
- if ngram in unique_ngrams:
993
- duplicated_grams[i: i + n] = 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
994
  else:
995
- unique_ngrams.add(ngram)
996
-
997
- word_lengths = np.array(list(map(len, text.split())))
998
- chars_duped = np.sum(word_lengths * duplicated_grams)
999
- total_chars = np.sum(word_lengths)
1000
-
1001
- return float(chars_duped / total_chars)
1002
-
1003
- def all_ngram_counts_new(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
1004
- return [(n, list(zip(*[words[i:] for i in range(n)]))) for n in range(2, 11)]
1005
- ...
1006
- all_counts = all_ngram_counts_new(words)
1007
- count_most_common_ngrams = (2, 3, 4)
1008
- for n, ngram_counts in all_counts:
1009
- if not ngram_counts:
1010
- continue
1011
- if n in count_most_common_ngrams:
1012
- ...
1013
- else:
1014
- score = get_dup_ngram_frac(n, ngram_counts, text)
1015
- attrs.fraction_of_characters_in_duplicate_ngrams.append((n, score))
1016
- """, block="block", language="python"),
1017
  style="""
1018
  background-color: #EAFFF1; /* Light green background */
1019
  padding: 15px;
@@ -1046,10 +1120,13 @@ def web_data():
1046
  ),
1047
  Details(
1048
  Summary("Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)"),
1049
- DV(
 
1050
  "data/sample_dup_ngram.json",
1051
  0,
1052
  "Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
 
 
1053
  ),
1054
  style="""
1055
  background-color: #EAFFF1; /* Light green background */
@@ -1067,22 +1144,25 @@ def web_data():
1067
  """),
1068
  Details(
1069
  Summary("Ellipsis Symbol Identification Implemetations"),
1070
- P("Dolma: "),
1071
- D_code("""
1072
- ELLIPSIS_SYMBOLS = ("")
1073
- """, block="block", language="python"),
1074
- P("RedPajamaV2: "),
1075
- D_code("""
1076
- ELLIPSIS_SYMBOLS = ("...", "…")
1077
- """, block="block", language="python"),
1078
- P("DataTrove: "),
1079
- D_code("""
1080
- ELLIPSIS_SYMBOLS = ("...", "…")
1081
- """, block="block", language="python"),
1082
- P("TxT360: "),
1083
- D_code("""
1084
- ELLIPSIS_SYMBOLS = ("...", "…", "[...]", "[…]")
1085
- """, block="block", language="python"),
 
 
 
1086
  style="""
1087
  background-color: #FFFAEA; /* Light yellow background */
1088
  padding: 15px;
@@ -1092,47 +1172,50 @@ def web_data():
1092
  ),
1093
  Details(
1094
  Summary("Bullet Point Identification Implemetations"),
1095
- P("Dolma: "),
1096
- D_code("""
1097
- BULLET_POINTS = ("*", "-"
1098
- """, block="block", language="python"),
1099
- P("RedPajamaV2: "),
1100
- D_code("""
1101
- BULLET_POINT_SYMBOLS = (
1102
- "•", # bullet point
1103
- "", # triangular bullet point
1104
- "", # black right pointing triangle
1105
- "", # black left pointing triangle
1106
- "", # white bullet point
1107
- "", # black square
1108
- "", # white square
1109
- "", # black small square
1110
- "", # white small square
1111
- "", # en dash
1112
- )
1113
- """, block="block", language="python"),
1114
- P("DataTrove: "),
1115
- D_code("""
1116
- BULLET_POINT_SYMBOLS = ("" , "-")
1117
- """, block="block", language="python"),
1118
- P("TxT360: "),
1119
- D_code("""
1120
- BULLET_POINT_SYMBOLS = (
1121
- "•", # bullet point
1122
- "", # triangular bullet point
1123
- "", # black right pointing triangle
1124
- "", # black left pointing triangle
1125
- "", # white bullet point
1126
- "", # black square
1127
- "", # white square
1128
- "", # black small square
1129
- "", # white small square
1130
- "-", # - en dash
1131
- "", # dash
1132
- "", # zh dash
1133
- "*", # * star
1134
- )
1135
- """, block="block", language="python"),
 
 
 
1136
  style="""
1137
  background-color: #FFFAEA; /* Light yellow background */
1138
  padding: 15px;
@@ -1144,10 +1227,13 @@ def web_data():
1144
 
1145
  Details(
1146
  Summary("Sample documents that are filtered out by line-wise heuristics"),
1147
- DV(
 
1148
  "data/line_info.json",
1149
  0,
1150
  "Sample documents that are filtered out by line-wise heuristics",
 
 
1151
  ),
1152
  style="""
1153
  background-color: #EAFFF1; /* Light green background */
@@ -1186,35 +1272,38 @@ def web_data():
1186
  ),
1187
  Details(
1188
  Summary("Implementations from RedPajama-V2"),
1189
- D_code("""
1190
- # the normalized content: lowercased and punctuation removed
1191
- self._normalized_content = normalize(content)
1192
- self._normalized_words = tuple(self._normalized_content.split())
1193
- self._num_normalized_words = len(self._normalized_words)
1194
-
1195
- ...
1196
- def normalize(
1197
- text: str,
1198
- remove_punct: bool = True,
1199
- lowercase: bool = True,
1200
- nfd_unicode: bool = True,
1201
- white_space: bool = True
1202
- ) -> str:
1203
- #Normalize the text by lowercasing and removing punctuation.
1204
- # remove punctuation
1205
- if remove_punct:
1206
- text = text.translate(TRANSLATION_TABLE_PUNCTUATION)
1207
- # lowercase
1208
- if lowercase:
1209
- text = text.lower()
1210
- if white_space:
1211
- text = text.strip()
1212
- text = re.sub(r"\s+", " ", text)
1213
- # NFD unicode normalization
1214
- if nfd_unicode:
1215
- text = unicodedata.normalize("NFD", text)
1216
- return text
1217
- """, block="block", language="python"),
 
 
 
1218
  style="""
1219
  background-color: #FFFAEA; /* Light yellow background */
1220
  padding: 15px;
@@ -1225,13 +1314,16 @@ def web_data():
1225
 
1226
  Details(
1227
  Summary("Implementations from DataTrove"),
1228
- D_code("""
1229
- words = self.tokenizer.word_tokenize(text)
1230
- n_words = len(words)
1231
-
1232
- non_symbol_words = [w for w in words if any(ch not in PUNCTUATION_SET for ch in w)]
1233
- n_non_symbol_words_words = len(non_symbol_words)
1234
- """, block="block", language="python"),
 
 
 
1235
  style="""
1236
  background-color: #FFFAEA; /* Light yellow background */
1237
  padding: 15px;
@@ -1270,18 +1362,21 @@ def web_data():
1270
  """),
1271
  Details(
1272
  Summary("Implementations from RedPajama-V2"),
1273
- D_code("""
1274
- class RPS_Doc_Num_Sentences(RPSBase): # noqa
1275
- ##The number of sentences in the content. This is calculated using the regex r'[^.!?]+[.!?]*'
1276
- SENT_PATTERN = re.compile(r'[^.!?]+[.!?]*', flags=re.UNICODE)
1277
-
1278
- __slots__ = ()
1279
-
1280
- def __call__(self, document: Document) -> SignalType:
1281
- ##count the number of sentences in the content using regex
1282
- score = float(len(self.SENT_PATTERN.findall(document.raw_content)))
1283
- return [(0, len(document), score)]
1284
- """, block="block", language="python"),
 
 
 
1285
  style="""
1286
  background-color: #FFFAEA; /* Light yellow background */
1287
  padding: 15px;
@@ -1295,15 +1390,18 @@ def web_data():
1295
  """),
1296
  Details(
1297
  Summary("TxT360 Implementation"),
1298
- D_code("""
1299
- from nltk.tokenize import sent_tokenize
1300
- ...
1301
- def count_sentences(text):
1302
- sentences = sent_tokenize(text)
1303
- return len(sentences)
1304
- ...
1305
- attrs.num_of_sentences = count_sentences(text)
1306
- """, block="block", language="python"),
 
 
 
1307
  style="""
1308
  background-color: #EAFFF1; /* Light green background */
1309
  padding: 15px;
@@ -1319,13 +1417,16 @@ def web_data():
1319
  """),
1320
  Details(
1321
  Summary("Implementations from Dolma"),
1322
- D_code("""
1323
- SYMBOLS = ("#", "…")
1324
- ...
1325
- attrs.symbol_to_word_ratio = sum(1 for word in words if any(s in word for s in SYMBOLS)) / max(
1326
- word_count, 1
1327
- )
1328
- """, block="block", language="python"),
 
 
 
1329
  style="""
1330
  background-color: #FFFAEA; /* Light yellow background */
1331
  padding: 15px;
@@ -1335,29 +1436,32 @@ def web_data():
1335
  ),
1336
  Details(
1337
  Summary("Implementations from RedPajama-V2"),
1338
- D_code("""
1339
- class RPS_Doc_Symbol_To_Word_Ratio(RPSBase): # noqa
1340
- ##The ratio of symbols to words in the content. This is analogous to
1341
- ##the signal used in Gopher. Symbols are defined "#", "...", and "…".
1342
- SYMBOLS = ("#", "...", "…")
1343
-
1344
- __slots__ = ()
1345
-
1346
- def __call__(self, document: Document) -> SignalType:
1347
- num_words = document.num_raw_words
1348
-
1349
- if num_words == 0:
1350
- return [(0, len(document), None)]
1351
-
1352
- # count the number of symbols in the content
1353
- num_symbols = float(sum(
1354
- document.raw_content.count(x) for x in self.SYMBOLS
1355
- ))
1356
-
1357
- score = num_symbols / num_words
1358
- score = round(score, PRECISION)
1359
- return [(0, len(document), score)]
1360
- """, block="block", language="python"),
 
 
 
1361
  style="""
1362
  background-color: #FFFAEA; /* Light yellow background */
1363
  padding: 15px;
@@ -1368,12 +1472,15 @@ def web_data():
1368
 
1369
  Details(
1370
  Summary("Implementations from DataTrove"),
1371
- D_code("""
1372
- if self.max_symbol_word_ratio and text.count("#") / n_words > self.max_symbol_word_ratio:
1373
- return False, "gopher_too_many_hashes"
1374
- if self.max_symbol_word_ratio and (text.count("...") + text.count("…")) / n_words > self.max_symbol_word_ratio:
1375
- return False, "gopher_too_many_ellipsis"
1376
- """, block="block", language="python"),
 
 
 
1377
  style="""
1378
  background-color: #FFFAEA; /* Light yellow background */
1379
  padding: 15px;
@@ -1383,13 +1490,16 @@ def web_data():
1383
  ),
1384
  Details(
1385
  Summary("TxT360 Implementation"),
1386
- D_code("""
1387
- SYMBOLS = ("#", "...", "…")
1388
- ...
1389
- symbol_pattern = re.compile("|".join(re.escape(symbol) for symbol in SYMBOLS))
1390
- ...
1391
- attrs.symbol_to_word_ratio = sum(1 for word in words if symbol_pattern.search(word)) / word_count
1392
- """, block="block", language="python"),
 
 
 
1393
  style="""
1394
  background-color: #EAFFF1; /* Light green background */
1395
  padding: 15px;
@@ -1401,11 +1511,14 @@ def web_data():
1401
  H3("Fraction of Alphabetic Words"),
1402
  Details(
1403
  Summary("Implementations from Dolma"),
1404
- D_code("""
1405
- attrs.fraction_of_words_with_alpha_character = sum(
1406
- 1 for word in words if any(c.isalpha() for c in word)
1407
- ) / max(word_count, 1)
1408
- """, block="block", language="python"),
 
 
 
1409
  style="""
1410
  background-color: #FFFAEA; /* Light yellow background */
1411
  padding: 15px;
@@ -1415,27 +1528,30 @@ def web_data():
1415
  ),
1416
  Details(
1417
  Summary("Implementations from RedPajama-V2"),
1418
- D_code("""
1419
- class RPS_Doc_Frac_No_Alph_Words(RPSBase): # noqa
1420
- ALPH_REGEX = re.compile(r"[a-zA-Z]")
1421
-
1422
- __slots__ = ()
1423
-
1424
- def __call__(self, document: Document) -> SignalType:
1425
- num_words = document.num_raw_words
1426
-
1427
- if num_words == 0:
1428
- return [(0, len(document), None)]
1429
-
1430
- num_words_with_alpha = float(sum(
1431
- int(self.ALPH_REGEX.search(word) is not None)
1432
- for word in document.raw_words
1433
- ))
1434
-
1435
- score = 1.0 - num_words_with_alpha / num_words
1436
- score = round(score, PRECISION)
1437
- return [(0, len(document), score)]
1438
- """, block="block", language="python"),
 
 
 
1439
  style="""
1440
  background-color: #FFFAEA; /* Light yellow background */
1441
  padding: 15px;
@@ -1445,14 +1561,17 @@ def web_data():
1445
  ),
1446
  Details(
1447
  Summary("Implementations from DataTrove"),
1448
- D_code("""
1449
- # that 80 % of words in a document contain at least one alphabetic character
1450
- if (
1451
- self.max_non_alpha_words_ratio
1452
- and sum([any((c.isalpha() for c in w)) for w in words]) / n_words < self.max_non_alpha_words_ratio
1453
- ):
1454
- return False, "gopher_below_alpha_threshold"
1455
- """, block="block", language="python"),
 
 
 
1456
  style="""
1457
  background-color: #FFFAEA; /* Light yellow background */
1458
  padding: 15px;
@@ -1480,10 +1599,13 @@ def web_data():
1480
  H3("TxT360 Implementation"),
1481
  Details(
1482
  Summary("Sample documents that are filtered out by statistics-based heuristics"),
1483
- DV(
 
1484
  "data/sample_doc_stat.json",
1485
  0,
1486
  "Sample documents that are filtered out by statistics-based heuristics",
 
 
1487
  ),
1488
  style="""
1489
  background-color: #EAFFF1; /* Light green background */
@@ -1500,7 +1622,10 @@ def web_data():
1500
 
1501
  Details(
1502
  Summary("Sample documents containing 'lorem ipsum'"),
1503
- DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
 
 
 
1504
  style="""
1505
  background-color: #FAEAEA; /* Light pink background */
1506
  padding: 15px;
 
319
 
320
  Details(
321
  Summary("Non-English Documents"),
322
+ Div(
323
+ DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
324
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
325
+ ),
326
  style="""
327
  background-color: #FAEAEA; /* Light pink background */
328
  padding: 15px;
 
335
 
336
  Details(
337
  Summary("English Documents Scoring Lower than 0.65"),
338
+ Div(
339
+ DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
340
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
341
+ ),
342
  style="""
343
  background-color: #EAFFF1; /* Light green background */
344
  padding: 15px;
 
361
 
362
  Details(
363
  Summary("24 URL domains with more than 4k matches"),
364
+ Div (
365
+ DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
366
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
367
+ ),
368
  style="""
369
  background-color: #FAEAEA; /* Light pink background */
370
  padding: 15px;
 
378
  """),
379
  Details(
380
  Summary("6 url domains that are removed from the blocklist"),
381
+ Div (
382
+ DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
383
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
384
+ ),
385
  style="""
386
  background-color: #FAEAEA; /* Light pink background */
387
  padding: 15px;
 
392
 
393
  Details(
394
  Summary("Sample documents whose urls are blocked by the refined url blocklist"),
395
+ Div(
396
+ DV(
397
  "data/bad_url_doc.jsonl",
398
  3,
399
  "Sample documents whose urls are blocked by the refined url blocklist",
400
+ ), style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
401
+ ),
402
  style="""
403
  background-color: #FAEAEA; /* Light pink background */
404
  padding: 15px;
 
414
 
415
  Details(
416
  Summary("curated url domains that are excluded from our dataset"),
417
+ Div (
418
+ DVS(
419
  non_web_urls,
420
  "curated url domains that are excluded from our dataset",
421
+ ),
422
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
423
  ),
424
  style="""
425
  background-color: #FAEAEA; /* Light pink background */
 
431
 
432
  Details(
433
  Summary("Sample documents whose urls are in our curated url domain list"),
434
+ Div (
435
+ DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
436
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
437
+ ),
438
  style="""
439
  background-color: #EAFFF1; /* Light green background */
440
  padding: 15px;
 
464
 
465
  Details(
466
  Summary("Sample documents with lines that are removed by the rule of terminal punctuation"),
467
+ Div (
468
+ DV(
469
+ "data/sample_terminal_punc.json",
470
+ 0,
471
+ "Sample documents with lines that are removed by the rule of terminal punctuation",
472
+ ),
473
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
474
+ ),
475
  style="""
476
  background-color: #FAEAEA; /* Light pink background */
477
  padding: 15px;
 
494
  """),
495
  Details(
496
  Summary("Sample documents that are removed by original C4 javascript rule but are kept after our refinement"),
497
+ Div (
498
+ DV(
499
  "data/sample_java.jsonl",
500
  0,
501
  "Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
502
+ ),
503
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
504
  ),
505
  style="""
506
  background-color: #FAEAEA; /* Light pink background */
 
521
  ),
522
  Details(
523
  Summary("Sample documents with lines that are removed by the RefinedWeb rules"),
524
+ Div (
525
+ DV(
526
  "data/sample_refinedweb_line.json",
527
  0,
528
  "Sample documents with lines that are removed by the RefinedWeb rules",
529
+ ),
530
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
531
  ),
532
  style="""
533
  background-color: #FAEAEA; /* Light pink background */
 
546
  """),
547
  Details(
548
  Summary("Sample documents with toxic lines"),
549
+ Div (
550
+ DVS(
551
  json.load(open("data/toxic_lines.json")),
552
  "Sample documents with toxic lines",
553
+ ),
554
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
555
  ),
556
  style="""
557
  background-color: #FAEAEA; /* Light pink background */
 
567
  """),
568
  Details(
569
  Summary("Overview of all the quality signals that are used for filtering"),
570
+ Div (
571
+ DVS(
572
  json.load(open("data/all_signals.json")),
573
  "Overview of all the quality signals that are used for filtering",
574
+ ),
575
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
576
  ),
577
  style="""
578
  background-color: #EAFFF1; /* Light green background */
 
602
  """),
603
  Details(
604
  Summary("Implementations from Dolma"),
605
+ Div(
606
+ D_code("""
607
+ words = text.split()
608
+ word_count = len(words)
609
+ character_count = sum(len(word) for word in words)
610
+ ...
611
+ lines = text.split("\n")
612
+ line_count = len(lines)
613
+ ...
614
+ line_counts = Counter(lines)
615
+ attrs.fraction_of_duplicate_lines = sum(count for line, count in line_counts.items() if count > 1) / max(
616
+ line_count, 1
617
+ )
618
+ attrs.fraction_of_characters_in_duplicate_lines = sum(
619
+ len(line) * count for line, count in line_counts.items() if count > 1
620
+ ) / max(character_count, 1)
621
+ """, block="block", language="python"),
622
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
623
+ ),
624
  style="""
625
  background-color: #FFFAEA; /* Light yellow background */
626
  padding: 15px;
 
630
  ),
631
  Details(
632
  Summary("Implementations from DataTrove"),
633
+ Div(
634
+ D_code("""
635
+ def find_duplicates(x: list[str]) -> tuple[int, int]:
636
+ unique_x = set()
637
+ duplicate_chars = 0
638
+ duplicate_elements = 0
639
+ for element in x:
640
+ if element in unique_x:
641
+ duplicate_chars += len(element)
642
+ duplicate_elements += 1
643
+
644
+ else:
645
+ unique_x.add(element)
646
+ return duplicate_elements, duplicate_chars
647
+ ...
648
+ self.paragraph_exp = re.compile(r"\n{2,}")
649
+ self._line_splitter = re.compile("\n+")
650
+ ...
651
+ paragraphs = self.paragraph_exp.split(text.strip())
652
+ paragraphs_duplicates, char_duplicates = find_duplicates(paragraphs)
653
+ if self.dup_para_frac and paragraphs_duplicates / len(paragraphs) > self.dup_para_frac:
654
+ return False, "dup_para_frac"
655
+ if self.dup_para_char_frac and char_duplicates / len(text) > self.dup_para_char_frac:
656
+ return False, "dup_para_char_frac"
657
+
658
+ lines = self._line_splitter.split(text)
659
+ line_duplicates, char_duplicates = find_duplicates(lines)
660
+ if self.dup_line_frac and line_duplicates / len(lines) > self.dup_line_frac:
661
+ return False, "dup_line_frac"
662
+ if self.dup_line_char_frac and char_duplicates / len(text) > self.dup_line_char_frac:
663
+ return False, "dup_line_char_frac"
664
+ """, block="block", language="python"),
665
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
666
+ ),
667
  style="""
668
  background-color: #FFFAEA; /* Light yellow background */
669
  padding: 15px;
 
695
  H3("TxT360 Implementation"),
696
  Details(
697
  Summary("TxT360 Implementation"),
698
+ Div(
699
+ D_code("""
700
+ words = text.split()
701
+ word_count = len(words)
702
+ character_count = sum(len(word) for word in words)
703
+ ...
704
+ lines = text.split("\n")
705
+ line_count = len(lines)
706
+
707
+ line_counts = Counter(lines)
708
+ attrs.fraction_of_duplicate_lines = (
709
+ sum((count - 1) for line, count in line_counts.items() if count > 1) / line_count
710
+ )
711
+ attrs.fraction_of_characters_in_duplicate_lines = (
712
+ sum(sum(len(w) for w in line.split()) * (count - 1) for line, count in
713
+ line_counts.items() if count > 1) / character_count
714
+ """, block="block", language="python"),
715
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
716
+ ),
717
  style="""
718
  background-color: #EAFFF1; /* Light green background */
719
  padding: 15px;
 
723
  ),
724
  Details(
725
  Summary("Sample documents filtered by excessive line repetitions / characters in repeated lines"),
726
+ Div(
727
+ DV(
728
  "data/repeat_line_frac.jsonl",
729
  0,
730
  "Sample documents filtered by excessive line repetitions / characters in repeated lines",
731
+ ),
732
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
733
  ),
734
  style="""
735
  background-color: #EAFFF1; /* Light green background */
 
745
  """),
746
  Details(
747
  Summary("Implementations from Dolma"),
748
+ Div(
749
+ D_code("""
750
+ def all_ngram_counts(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
751
+ return [(n, Counter(list(zip(*[words[i:] for i in range(n)])))) for n in range(2, 11)]
752
+ ...
753
+ all_counts = all_ngram_counts(words)
754
+
755
+ count_most_common_ngrams = (2, 3, 4)
756
+ for n, ngram_counts in all_counts:
757
+ if not ngram_counts:
758
+ continue
759
+ if n in count_most_common_ngrams:
760
+ most_common_ngram, count = ngram_counts.most_common(1)[0]
761
+ value = count * sum(len(w) for w in most_common_ngram) / max(character_count, 1)
762
+ attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
763
+ """, block="block", language="python"),
764
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
765
+ ),
766
  style="""
767
  background-color: #FFFAEA; /* Light yellow background */
768
  padding: 15px;
 
772
  ),
773
  Details(
774
  Summary("Implementations from RedPajama-V2"),
775
+ Div(
776
+ D_code("""
777
  class Base_RPS_Frac_Chars_In_Top_NGram(RPSBase): # noqa
778
  ## Base class for calculating the fraction of characters in the top N-gram. This operates on the lower-cased, punctation removed content.
779
  NGRAM_SIZE: int = None
 
807
  score = sum(len(w) for w in ngram) * count / total_chars
808
  score = round(score, PRECISION)
809
  return [(0, len(document), score)]
810
+ """, block="block", language="python"),
811
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
812
+ ),
813
  style="""
814
  background-color: #FFFAEA; /* Light yellow background */
815
  padding: 15px;
 
820
 
821
  Details(
822
  Summary("Implementations from DataTrove"),
823
+ Div(
824
+ D_code("""
825
+ def get_n_grams(words: list[str], n: int) -> list[str]:
826
+ return [" ".join(words[i : i + n]) for i in range(len(words) - n + 1)]
827
+
828
+ def find_top_duplicate(x: list[str]) -> int:
829
+ counter = Counter()
830
+ for element in x:
831
+ counter[element] += 1
832
+ top_n_gram = counter.most_common(1)[0]
833
+ return len(top_n_gram[0]) * top_n_gram[1]
834
+ ...
835
+ for n, n_frac in self.top_n_grams:
836
+ n_grams = get_n_grams(words, n)
837
+ if not n_grams:
838
+ continue
839
+ top_char_length = find_top_duplicate(n_grams)
840
+ if top_char_length / len(text) > n_frac:
841
+ return False, f"top_n_gram"
842
+ """, block="block", language="python"),
843
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
844
+ ),
845
  style="""
846
  background-color: #FFFAEA; /* Light yellow background */
847
  padding: 15px;
 
861
  """),
862
  Details(
863
  Summary("TxT360 Implementation"),
864
+ Div(
865
+ D_code("""
866
+ def all_ngram_counts_new(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
867
+ return [(n, list(zip(*[words[i:] for i in range(n)]))) for n in range(2, 11)]
868
+ ...
869
+ all_counts = all_ngram_counts_new(words)
870
+ count_most_common_ngrams = (2, 3, 4)
871
+ for n, ngram_counts in all_counts:
872
+ if not ngram_counts:
873
+ continue
874
+ if n in count_most_common_ngrams:
875
+ most_common_ngram, count = Counter(ngram_counts).most_common(1)[0]
876
+ value = count * sum(len(w) for w in most_common_ngram) / character_count
877
+ attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
878
+ """, block="block", language="python"),
879
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
880
+ ),
881
  style="""
882
  background-color: #EAFFF1; /* Light green background */
883
  padding: 15px;
 
887
  ),
888
  Details(
889
  Summary("Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)"),
890
+ Div(
891
+ DV(
892
  "data/sample_top_ngram.json",
893
  0,
894
  "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
895
+ ),
896
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
897
  ),
898
  style="""
899
  background-color: #EAFFF1; /* Light green background */
 
910
  """),
911
  Details(
912
  Summary("Implementations from Dolma"),
913
+ Div(
914
+ D_code("""
915
+ def all_ngram_counts(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
916
+ return [(n, Counter(list(zip(*[words[i:] for i in range(n)])))) for n in range(2, 11)]
917
+ ...
918
+ all_counts = all_ngram_counts(words)
919
+ for n, ngram_counts in all_counts:
920
+ if not ngram_counts:
921
+ continue
922
+ if n in count_most_common_ngrams:
923
+ ...
924
+ else:
925
+ ng_char_count = sum(count * sum(len(w) for w in ng) for ng, count in ngram_counts.items())
926
+ value = sum(
927
+ count * sum(len(w) for w in ng) for ng, count in ngram_counts.items() if count > 1
928
+ ) / max(ng_char_count, 1)
929
+ attrs.fraction_of_characters_in_duplicate_ngrams.append((n, value))
930
+ """, block="block", language="python"),
931
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
932
+ ),
933
  style="""
934
  background-color: #FFFAEA; /* Light yellow background */
935
  padding: 15px;
 
939
  ),
940
  Details(
941
  Summary("Implementations from RedPajama-V2"),
942
+ Div(
943
+ D_code("""
944
+ class Base_RPS_Frac_Chars_In_Dupe_NGrams(RPSBase): # noqa
945
+ ## Base class for calculating the fraction of characters in duplicate word N-grams. This operates on the lower-cased, punctation removed content. The function also ensures that characters in overlapping ngrams are only counted once.
946
+ NGRAM_SIZE: int = None
947
+ __slots__ = []
948
+
949
+ def __call__(self, document: Document) -> SignalType:
950
+ if self.NGRAM_SIZE is None:
951
+ raise NotImplementedError(
952
+ "NGRAM_SIZE must be set in the subclass"
953
+ )
954
+
955
+ if len(document.normalized_words) < self.NGRAM_SIZE:
956
+ return [(0, len(document), 0.0)]
957
+
958
+ # fetch the ngrams from the document if they exist, otherwise
959
+ # compute them
960
+ doc_n_grams = (
961
+ getattr(document, f"norm_self.NGRAM_SIZEgrams", None)
962
+ or
963
+ tuple(form_ngrams(
964
+ iter(document.normalized_words), self.NGRAM_SIZE
965
+ ))
966
  )
967
+
968
+ # keep only ngrams which occur at least twice
969
+ ngram_dupes =
970
+ ngram for ngram, count in Counter(doc_n_grams).items() if count > 1
971
+
972
+
973
+ duplicated_grams = np.zeros(len(document.normalized_words), dtype=int)
974
+
975
+ i = 0
976
+ for ngram in doc_n_grams:
977
+ if ngram in ngram_dupes:
978
+ duplicated_grams[i: i + self.NGRAM_SIZE] = 1
979
+
980
+ i += 1
981
+
982
+ word_lengths = np.array(list(map(len, document.normalized_words)))
983
+ chars_duped = np.sum(word_lengths * duplicated_grams)
984
+ total_chars = np.sum(word_lengths)
985
+
986
+ if total_chars == 0:
987
+ return [(0, len(document), 0.0)]
988
+
989
+ score = float(chars_duped / total_chars)
990
+ score = round(score, PRECISION)
991
+ return [(0, len(document), score)]
992
+ """, block="block", language="python"),
993
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
994
+ ),
 
 
 
 
 
 
 
 
 
 
 
995
  style="""
996
  background-color: #FFFAEA; /* Light yellow background */
997
  padding: 15px;
 
1002
 
1003
  Details(
1004
  Summary("Implementations from DataTrove"),
1005
+ Div(
1006
+ D_code("""
1007
+ def find_all_duplicate(words: list[str], n: int) -> int:
1008
+ n_words = len(words)
1009
+ unique = set()
1010
+ repeated_chars, idx = 0, 0
1011
+ while idx < n_words - n + 1:
1012
+ n_gram = "".join(words[idx : idx + n])
1013
+ if n_gram in unique:
1014
+ repeated_chars += len(n_gram)
1015
+ idx += n
1016
+ else:
1017
+ unique.add(n_gram)
1018
+ idx += 1
1019
+ assert repeated_chars <= len("".join(words))
1020
+ return repeated_chars
1021
+ ...
1022
+ for n, n_frac in self.dup_n_grams:
1023
+ n_duplicates_char = find_all_duplicate(words, n)
1024
+ if n_duplicates_char / len(text) > n_frac:
1025
+ return False, f"duplicated_n_grams"
1026
+ """, block="block", language="python"),
1027
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1028
+ ),
1029
  style="""
1030
  background-color: #FFFAEA; /* Light yellow background */
1031
  padding: 15px;
 
1050
  """),
1051
  Details(
1052
  Summary("TxT360 Implementation"),
1053
+ Div(
1054
+ D_code("""
1055
+ def get_dup_ngram_frac(n, doc_n_grams, text):
1056
+ # fetch the ngrams from the document if they exist, otherwise compute them
1057
+ # doc_n_grams = list(zip(*[words[i:] for i in range(n)]))
1058
+
1059
+ duplicated_grams = np.zeros(len(text.split()), dtype=int)
1060
+
1061
+ unique_ngrams = set()
1062
+
1063
+ for i, ngram in enumerate(doc_n_grams):
1064
+ if ngram in unique_ngrams:
1065
+ duplicated_grams[i: i + n] = 1
1066
+ else:
1067
+ unique_ngrams.add(ngram)
1068
+
1069
+ word_lengths = np.array(list(map(len, text.split())))
1070
+ chars_duped = np.sum(word_lengths * duplicated_grams)
1071
+ total_chars = np.sum(word_lengths)
1072
+
1073
+ return float(chars_duped / total_chars)
1074
+
1075
+ def all_ngram_counts_new(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
1076
+ return [(n, list(zip(*[words[i:] for i in range(n)]))) for n in range(2, 11)]
1077
+ ...
1078
+ all_counts = all_ngram_counts_new(words)
1079
+ count_most_common_ngrams = (2, 3, 4)
1080
+ for n, ngram_counts in all_counts:
1081
+ if not ngram_counts:
1082
+ continue
1083
+ if n in count_most_common_ngrams:
1084
+ ...
1085
  else:
1086
+ score = get_dup_ngram_frac(n, ngram_counts, text)
1087
+ attrs.fraction_of_characters_in_duplicate_ngrams.append((n, score))
1088
+ """, block="block", language="python"),
1089
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1090
+ ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1091
  style="""
1092
  background-color: #EAFFF1; /* Light green background */
1093
  padding: 15px;
 
1120
  ),
1121
  Details(
1122
  Summary("Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)"),
1123
+ Div(
1124
+ DV(
1125
  "data/sample_dup_ngram.json",
1126
  0,
1127
  "Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
1128
+ ),
1129
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1130
  ),
1131
  style="""
1132
  background-color: #EAFFF1; /* Light green background */
 
1144
  """),
1145
  Details(
1146
  Summary("Ellipsis Symbol Identification Implemetations"),
1147
+ Div(
1148
+ P("Dolma: "),
1149
+ D_code("""
1150
+ ELLIPSIS_SYMBOLS = ("…")
1151
+ """, block="block", language="python"),
1152
+ P("RedPajamaV2: "),
1153
+ D_code("""
1154
+ ELLIPSIS_SYMBOLS = ("...", "…")
1155
+ """, block="block", language="python"),
1156
+ P("DataTrove: "),
1157
+ D_code("""
1158
+ ELLIPSIS_SYMBOLS = ("...", "…")
1159
+ """, block="block", language="python"),
1160
+ P("TxT360: "),
1161
+ D_code("""
1162
+ ELLIPSIS_SYMBOLS = ("...", "…", "[...]", "[…]")
1163
+ """, block="block", language="python"),
1164
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1165
+ ),
1166
  style="""
1167
  background-color: #FFFAEA; /* Light yellow background */
1168
  padding: 15px;
 
1172
  ),
1173
  Details(
1174
  Summary("Bullet Point Identification Implemetations"),
1175
+ Div(
1176
+ P("Dolma: "),
1177
+ D_code("""
1178
+ BULLET_POINTS = ("*", "-"
1179
+ """, block="block", language="python"),
1180
+ P("RedPajamaV2: "),
1181
+ D_code("""
1182
+ BULLET_POINT_SYMBOLS = (
1183
+ "•", # bullet point
1184
+ "‣", # triangular bullet point
1185
+ "▶", # black right pointing triangle
1186
+ "◀", # black left pointing triangle
1187
+ "◦", # white bullet point
1188
+ "■", # black square
1189
+ "□", # white square
1190
+ "▪", # black small square
1191
+ "▫", # white small square
1192
+ "–", # en dash
1193
+ )
1194
+ """, block="block", language="python"),
1195
+ P("DataTrove: "),
1196
+ D_code("""
1197
+ BULLET_POINT_SYMBOLS = ("•" , "-")
1198
+ """, block="block", language="python"),
1199
+ P("TxT360: "),
1200
+ D_code("""
1201
+ BULLET_POINT_SYMBOLS = (
1202
+ "•", # bullet point
1203
+ "‣", # triangular bullet point
1204
+ "▶", # black right pointing triangle
1205
+ "◀", # black left pointing triangle
1206
+ "◦", # white bullet point
1207
+ "■", # black square
1208
+ "□", # white square
1209
+ "▪", # black small square
1210
+ "▫", # white small square
1211
+ "-", # - en dash
1212
+ "", # dash
1213
+ "", # zh dash
1214
+ "*", # * star
1215
+ )
1216
+ """, block="block", language="python"),
1217
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1218
+ ),
1219
  style="""
1220
  background-color: #FFFAEA; /* Light yellow background */
1221
  padding: 15px;
 
1227
 
1228
  Details(
1229
  Summary("Sample documents that are filtered out by line-wise heuristics"),
1230
+ Div(
1231
+ DV(
1232
  "data/line_info.json",
1233
  0,
1234
  "Sample documents that are filtered out by line-wise heuristics",
1235
+ ),
1236
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1237
  ),
1238
  style="""
1239
  background-color: #EAFFF1; /* Light green background */
 
1272
  ),
1273
  Details(
1274
  Summary("Implementations from RedPajama-V2"),
1275
+ Div(
1276
+ D_code("""
1277
+ # the normalized content: lowercased and punctuation removed
1278
+ self._normalized_content = normalize(content)
1279
+ self._normalized_words = tuple(self._normalized_content.split())
1280
+ self._num_normalized_words = len(self._normalized_words)
1281
+
1282
+ ...
1283
+ def normalize(
1284
+ text: str,
1285
+ remove_punct: bool = True,
1286
+ lowercase: bool = True,
1287
+ nfd_unicode: bool = True,
1288
+ white_space: bool = True
1289
+ ) -> str:
1290
+ #Normalize the text by lowercasing and removing punctuation.
1291
+ # remove punctuation
1292
+ if remove_punct:
1293
+ text = text.translate(TRANSLATION_TABLE_PUNCTUATION)
1294
+ # lowercase
1295
+ if lowercase:
1296
+ text = text.lower()
1297
+ if white_space:
1298
+ text = text.strip()
1299
+ text = re.sub(r"\s+", " ", text)
1300
+ # NFD unicode normalization
1301
+ if nfd_unicode:
1302
+ text = unicodedata.normalize("NFD", text)
1303
+ return text
1304
+ """, block="block", language="python"),
1305
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1306
+ ),
1307
  style="""
1308
  background-color: #FFFAEA; /* Light yellow background */
1309
  padding: 15px;
 
1314
 
1315
  Details(
1316
  Summary("Implementations from DataTrove"),
1317
+ Div(
1318
+ D_code("""
1319
+ words = self.tokenizer.word_tokenize(text)
1320
+ n_words = len(words)
1321
+
1322
+ non_symbol_words = [w for w in words if any(ch not in PUNCTUATION_SET for ch in w)]
1323
+ n_non_symbol_words_words = len(non_symbol_words)
1324
+ """, block="block", language="python"),
1325
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1326
+ ),
1327
  style="""
1328
  background-color: #FFFAEA; /* Light yellow background */
1329
  padding: 15px;
 
1362
  """),
1363
  Details(
1364
  Summary("Implementations from RedPajama-V2"),
1365
+ Div(
1366
+ D_code("""
1367
+ class RPS_Doc_Num_Sentences(RPSBase): # noqa
1368
+ ##The number of sentences in the content. This is calculated using the regex r'[^.!?]+[.!?]*'
1369
+ SENT_PATTERN = re.compile(r'[^.!?]+[.!?]*', flags=re.UNICODE)
1370
+
1371
+ __slots__ = ()
1372
+
1373
+ def __call__(self, document: Document) -> SignalType:
1374
+ ##count the number of sentences in the content using regex
1375
+ score = float(len(self.SENT_PATTERN.findall(document.raw_content)))
1376
+ return [(0, len(document), score)]
1377
+ """, block="block", language="python"),
1378
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1379
+ ),
1380
  style="""
1381
  background-color: #FFFAEA; /* Light yellow background */
1382
  padding: 15px;
 
1390
  """),
1391
  Details(
1392
  Summary("TxT360 Implementation"),
1393
+ Div(
1394
+ D_code("""
1395
+ from nltk.tokenize import sent_tokenize
1396
+ ...
1397
+ def count_sentences(text):
1398
+ sentences = sent_tokenize(text)
1399
+ return len(sentences)
1400
+ ...
1401
+ attrs.num_of_sentences = count_sentences(text)
1402
+ """, block="block", language="python"),
1403
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1404
+ ),
1405
  style="""
1406
  background-color: #EAFFF1; /* Light green background */
1407
  padding: 15px;
 
1417
  """),
1418
  Details(
1419
  Summary("Implementations from Dolma"),
1420
+ Div(
1421
+ D_code("""
1422
+ SYMBOLS = ("#", "…")
1423
+ ...
1424
+ attrs.symbol_to_word_ratio = sum(1 for word in words if any(s in word for s in SYMBOLS)) / max(
1425
+ word_count, 1
1426
+ )
1427
+ """, block="block", language="python"),
1428
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1429
+ ),
1430
  style="""
1431
  background-color: #FFFAEA; /* Light yellow background */
1432
  padding: 15px;
 
1436
  ),
1437
  Details(
1438
  Summary("Implementations from RedPajama-V2"),
1439
+ Div(
1440
+ D_code("""
1441
+ class RPS_Doc_Symbol_To_Word_Ratio(RPSBase): # noqa
1442
+ ##The ratio of symbols to words in the content. This is analogous to
1443
+ ##the signal used in Gopher. Symbols are defined "#", "...", and "…".
1444
+ SYMBOLS = ("#", "...", "…")
1445
+
1446
+ __slots__ = ()
1447
+
1448
+ def __call__(self, document: Document) -> SignalType:
1449
+ num_words = document.num_raw_words
1450
+
1451
+ if num_words == 0:
1452
+ return [(0, len(document), None)]
1453
+
1454
+ # count the number of symbols in the content
1455
+ num_symbols = float(sum(
1456
+ document.raw_content.count(x) for x in self.SYMBOLS
1457
+ ))
1458
+
1459
+ score = num_symbols / num_words
1460
+ score = round(score, PRECISION)
1461
+ return [(0, len(document), score)]
1462
+ """, block="block", language="python"),
1463
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1464
+ ),
1465
  style="""
1466
  background-color: #FFFAEA; /* Light yellow background */
1467
  padding: 15px;
 
1472
 
1473
  Details(
1474
  Summary("Implementations from DataTrove"),
1475
+ Div(
1476
+ D_code("""
1477
+ if self.max_symbol_word_ratio and text.count("#") / n_words > self.max_symbol_word_ratio:
1478
+ return False, "gopher_too_many_hashes"
1479
+ if self.max_symbol_word_ratio and (text.count("...") + text.count("…")) / n_words > self.max_symbol_word_ratio:
1480
+ return False, "gopher_too_many_ellipsis"
1481
+ """, block="block", language="python"),
1482
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1483
+ ),
1484
  style="""
1485
  background-color: #FFFAEA; /* Light yellow background */
1486
  padding: 15px;
 
1490
  ),
1491
  Details(
1492
  Summary("TxT360 Implementation"),
1493
+ Div(
1494
+ D_code("""
1495
+ SYMBOLS = ("#", "...", "…")
1496
+ ...
1497
+ symbol_pattern = re.compile("|".join(re.escape(symbol) for symbol in SYMBOLS))
1498
+ ...
1499
+ attrs.symbol_to_word_ratio = sum(1 for word in words if symbol_pattern.search(word)) / word_count
1500
+ """, block="block", language="python"),
1501
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1502
+ ),
1503
  style="""
1504
  background-color: #EAFFF1; /* Light green background */
1505
  padding: 15px;
 
1511
  H3("Fraction of Alphabetic Words"),
1512
  Details(
1513
  Summary("Implementations from Dolma"),
1514
+ Div(
1515
+ D_code("""
1516
+ attrs.fraction_of_words_with_alpha_character = sum(
1517
+ 1 for word in words if any(c.isalpha() for c in word)
1518
+ ) / max(word_count, 1)
1519
+ """, block="block", language="python"),
1520
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1521
+ ),
1522
  style="""
1523
  background-color: #FFFAEA; /* Light yellow background */
1524
  padding: 15px;
 
1528
  ),
1529
  Details(
1530
  Summary("Implementations from RedPajama-V2"),
1531
+ Div(
1532
+ D_code("""
1533
+ class RPS_Doc_Frac_No_Alph_Words(RPSBase): # noqa
1534
+ ALPH_REGEX = re.compile(r"[a-zA-Z]")
1535
+
1536
+ __slots__ = ()
1537
+
1538
+ def __call__(self, document: Document) -> SignalType:
1539
+ num_words = document.num_raw_words
1540
+
1541
+ if num_words == 0:
1542
+ return [(0, len(document), None)]
1543
+
1544
+ num_words_with_alpha = float(sum(
1545
+ int(self.ALPH_REGEX.search(word) is not None)
1546
+ for word in document.raw_words
1547
+ ))
1548
+
1549
+ score = 1.0 - num_words_with_alpha / num_words
1550
+ score = round(score, PRECISION)
1551
+ return [(0, len(document), score)]
1552
+ """, block="block", language="python"),
1553
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1554
+ ),
1555
  style="""
1556
  background-color: #FFFAEA; /* Light yellow background */
1557
  padding: 15px;
 
1561
  ),
1562
  Details(
1563
  Summary("Implementations from DataTrove"),
1564
+ Div(
1565
+ D_code("""
1566
+ # that 80 % of words in a document contain at least one alphabetic character
1567
+ if (
1568
+ self.max_non_alpha_words_ratio
1569
+ and sum([any((c.isalpha() for c in w)) for w in words]) / n_words < self.max_non_alpha_words_ratio
1570
+ ):
1571
+ return False, "gopher_below_alpha_threshold"
1572
+ """, block="block", language="python"),
1573
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1574
+ ),
1575
  style="""
1576
  background-color: #FFFAEA; /* Light yellow background */
1577
  padding: 15px;
 
1599
  H3("TxT360 Implementation"),
1600
  Details(
1601
  Summary("Sample documents that are filtered out by statistics-based heuristics"),
1602
+ Div(
1603
+ DV(
1604
  "data/sample_doc_stat.json",
1605
  0,
1606
  "Sample documents that are filtered out by statistics-based heuristics",
1607
+ ),
1608
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1609
  ),
1610
  style="""
1611
  background-color: #EAFFF1; /* Light green background */
 
1622
 
1623
  Details(
1624
  Summary("Sample documents containing 'lorem ipsum'"),
1625
+ Div(
1626
+ DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
1627
+ style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
1628
+ ),
1629
  style="""
1630
  background-color: #FAEAEA; /* Light pink background */
1631
  padding: 15px;