HugoLaurencon committed on
Commit
ea01f38
·
1 Parent(s): b37a555

7 languages supported

Browse files
app.py CHANGED
@@ -909,7 +909,7 @@ param_visu_langs = {
909
  "path_sentencepiece_model": f"./{lang_dataset_id}.sp.model",
910
  "path_kenlm_model": f"./{lang_dataset_id}.arpa.bin",
911
  }
912
- for lang_dataset_id in ["en", "zh"]
913
  }
914
 
915
  visualization = Visualization(path_instructions, param_visu_langs)
 
909
  "path_sentencepiece_model": f"./{lang_dataset_id}.sp.model",
910
  "path_kenlm_model": f"./{lang_dataset_id}.arpa.bin",
911
  }
912
+ for lang_dataset_id in ["eu", "ca", "zh", "en", "fr", "id", "es"]
913
  }
914
 
915
  visualization = Visualization(path_instructions, param_visu_langs)
ca.arpa.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ece1e503d4b44409069ea9c5c5125b74792b575143169e08cf9a27248f9a78e
3
+ size 2809368958
ca.sp.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abc6936e2ff5dcdc86962ffaeef48ef66f567d568ef7090d28123ed6618b455c
3
+ size 946977
ca_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4207b45aa366ece2763a06565fcb771b86e433f2a6190248017f97e7534fa4a
3
+ size 103605036
en_examples_with_stats.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d64ccdaa835843aabbcc7d74789140e5b68313dfff6c23f79a60b65d668633e8
3
- size 276101129
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dccf03710e9dc7ec68c676175e711be815bc29a50260f5d334156b03fe2e6d1
3
+ size 241408394
es.arpa.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26964ff8185eb105021fc0e9eaa0a1de590c4a12f8aa3fe12112b29d42281cf3
3
+ size 3828418653
es.sp.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aae545566a995d3374fbc8ac1d4e0c7073008da8ae32acfe7f176136a8efcf37
3
+ size 961535
es_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d52760c4c961ebfe419a603a6d837619ca146656f563f5abbd140dec8fbe28e
3
+ size 148378888
eu.arpa.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d04c4d1233b40044e2facc978987ecd4a6d4f84032f2af3f85f7079676fa08b
3
+ size 774011873
eu.sp.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:447cbd1714e51e6a7b4dd8ff55b7bd975fdb7f6ba873cb6f8a1fe36b5867dbb6
3
+ size 955869
eu_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10a06ac7ed9b4c444f35fb9a3e3636a22689c198a6bdd4fd358b0eec50aa924d
3
+ size 66358003
explanation_filtering_pipeline.pdf CHANGED
Binary files a/explanation_filtering_pipeline.pdf and b/explanation_filtering_pipeline.pdf differ
 
filtering.py CHANGED
@@ -423,33 +423,91 @@ class Filtering:
423
  return cond
424
 
425
  @staticmethod
426
- def compute_repetitions_ratio(document, repetitions_length):
427
- def get_freq_ngrams(document, n):
428
- ngrams = [document[i : i + n] for i in range(len(document) - n + 1)]
429
- freq_ngrams = {}
430
- for ngram in ngrams:
431
- freq_ngrams[ngram] = freq_ngrams.get(ngram, 0) + 1
432
- return freq_ngrams
433
-
434
- freq_ngrams = get_freq_ngrams(document, repetitions_length)
435
- if len(freq_ngrams) == 0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  return 0
437
- freq_ngrams = list(freq_ngrams.values())
438
- freq_ngrams = sorted(freq_ngrams, reverse=True)
439
- num_rep_ngrams = int(np.sqrt(len(freq_ngrams)))
440
- repetitions_ratio = sum(freq_ngrams[:num_rep_ngrams]) / sum(freq_ngrams)
441
- return repetitions_ratio
442
 
443
  @staticmethod
444
- def check_repetitions_removal(
445
  document,
446
- repetitions_length,
447
- repetitions_max_cutoff,
 
 
448
  ):
449
- repetitions_ratio = Filtering.compute_repetitions_ratio(
450
- document, repetitions_length
451
  )
452
- cond = repetitions_ratio <= repetitions_max_cutoff
453
  return cond
454
 
455
  @staticmethod
@@ -629,7 +687,7 @@ class Filtering:
629
  document=document,
630
  remove_non_printing_characters=True,
631
  strip=True,
632
- lower_case=True,
633
  uniform_whitespace=True,
634
  replace_digits_with_zeros=True,
635
  replace_unicode_punctuation=True,
@@ -670,9 +728,12 @@ class Filtering:
670
  strip_characters,
671
  number_words_min_cutoff,
672
  number_words_max_cutoff,
673
- cond_check_repetitions_removal,
674
- repetitions_length,
675
- repetitions_max_cutoff,
 
 
 
676
  cond_check_special_characters,
677
  special_characters,
678
  special_characters_max_cutoff,
@@ -703,11 +764,20 @@ class Filtering:
703
  number_words_max_cutoff,
704
  ):
705
  return False
706
- if cond_check_repetitions_removal:
707
- if not Filtering.check_repetitions_removal(
708
  document,
709
- repetitions_length,
710
- repetitions_max_cutoff,
 
 
 
 
 
 
 
 
 
711
  ):
712
  return False
713
  if cond_check_special_characters:
@@ -797,9 +867,18 @@ class FunctionDatasetFiltering:
797
  strip_characters=self.param["strip_characters"],
798
  number_words_min_cutoff=self.param["number_words_min_cutoff"],
799
  number_words_max_cutoff=self.param["number_words_max_cutoff"],
800
- cond_check_repetitions_removal=self.param["check_repetitions_removal"],
801
- repetitions_length=self.param["repetitions_length"],
802
- repetitions_max_cutoff=self.param["repetitions_max_cutoff"],
 
 
 
 
 
 
 
 
 
803
  cond_check_special_characters=self.param["cond_check_special_characters"],
804
  special_characters=self.param["special_characters"],
805
  special_characters_max_cutoff=self.param["special_characters_max_cutoff"],
@@ -855,10 +934,10 @@ class DatasetFiltering:
855
  self.path_dir_save_dataset = path_dir_save_dataset
856
 
857
  def modifying_documents(self):
858
- dataset_modifying_documents = FunctionDatasetModifyingDocuments(
859
  self.lang_dataset_id
860
  )
861
- self.ds = self.ds.map(dataset_modifying_documents, num_proc=self.num_proc)
862
 
863
  def filtering(self):
864
  func_dataset_filtering = FunctionDatasetFiltering(
 
423
  return cond
424
 
425
  @staticmethod
426
def compute_character_repetition_ratio(document, character_repetition_length):
    """Return the share of character n-gram occurrences taken by the most repeated n-grams.

    Every window of ``character_repetition_length`` consecutive characters of
    ``document`` is counted. The ``k`` most frequent distinct n-grams are then
    selected, where ``k`` is the integer square root of the number of distinct
    n-grams, capped at the number of n-grams that occur more than once.

    Args:
        document: Text to analyse (sliced and measured with ``len``).
        character_repetition_length: Size ``n`` of the character n-grams.

    Returns:
        ``0`` when the document yields no n-gram at all; otherwise a float in
        ``[0, 1]``: occurrences of the selected n-grams divided by the total
        number of n-gram occurrences.
    """
    n = character_repetition_length
    freq_character_ngrams = {}
    for start in range(len(document) - n + 1):
        character_ngram = document[start : start + n]
        freq_character_ngrams[character_ngram] = (
            freq_character_ngrams.get(character_ngram, 0) + 1
        )
    if not freq_character_ngrams:
        return 0

    frequencies = sorted(freq_character_ngrams.values(), reverse=True)
    # Only n-grams seen more than once are repetitions. Capping k by their
    # count keeps singletons out of the numerator, so a text with no repeated
    # n-gram scores 0 and a text made of one repeated n-gram scores 1.
    num_repeated_ngrams = sum(1 for freq in frequencies if freq > 1)
    num_rep_character_ngrams = min(
        int(np.sqrt(len(frequencies))),
        num_repeated_ngrams,
    )
    character_repetition_ratio = sum(
        frequencies[:num_rep_character_ngrams]
    ) / sum(frequencies)
    return character_repetition_ratio
454
+
455
+ @staticmethod
456
def check_character_repetition_removal(
    document,
    character_repetition_length,
    character_repetition_max_cutoff,
):
    """Tell whether ``document`` passes the character-repetition filter.

    Args:
        document: Text to evaluate.
        character_repetition_length: Character n-gram size forwarded to
            ``Filtering.compute_character_repetition_ratio``.
        character_repetition_max_cutoff: Largest ratio still accepted.

    Returns:
        ``True`` when the computed ratio is less than or equal to the cutoff,
        ``False`` otherwise.
    """
    ratio = Filtering.compute_character_repetition_ratio(
        document, character_repetition_length
    )
    return ratio <= character_repetition_max_cutoff
466
+
467
+ @staticmethod
468
def compute_word_repetition_ratio(
    document, sentencepiece_model_tok, strip_characters, word_repetition_length
):
    """Return the fraction of word n-gram occurrences that belong to repeated n-grams.

    The document is split into words by
    ``ModifyingDocuments.get_words_from_document`` (called with
    ``lower_case=True``). Each run of ``word_repetition_length`` consecutive
    words is joined with single spaces and counted.

    Args:
        document: Text to analyse.
        sentencepiece_model_tok: Tokenizer object passed through to the word
            splitter.
        strip_characters: Characters passed through to the word splitter.
        word_repetition_length: Number of words per n-gram.

    Returns:
        ``0`` when no n-gram can be formed; otherwise the summed counts of the
        n-grams seen more than once divided by the summed counts of all
        n-grams.
    """
    tokens = ModifyingDocuments.get_words_from_document(
        document,
        sentencepiece_model_tok,
        lower_case=True,
        strip_characters=strip_characters,
    )
    span = word_repetition_length
    counts = {}
    for start in range(len(tokens) - span + 1):
        key = " ".join(tokens[start : start + span])
        counts[key] = counts.get(key, 0) + 1
    if not counts:
        return 0
    occurrences = list(counts.values())
    repeated_occurrences = sum(count for count in occurrences if count > 1)
    return repeated_occurrences / sum(occurrences)
498
 
499
  @staticmethod
500
def check_word_repetition_removal(
    document,
    sentencepiece_model_tok,
    strip_characters,
    word_repetition_length,
    word_repetition_max_cutoff,
):
    """Tell whether ``document`` passes the word-repetition filter.

    Args:
        document: Text to evaluate.
        sentencepiece_model_tok: Tokenizer object forwarded to
            ``Filtering.compute_word_repetition_ratio``.
        strip_characters: Characters forwarded to the same call.
        word_repetition_length: Number of words per n-gram.
        word_repetition_max_cutoff: Largest ratio still accepted.

    Returns:
        ``True`` when the computed ratio is less than or equal to the cutoff,
        ``False`` otherwise.
    """
    ratio = Filtering.compute_word_repetition_ratio(
        document, sentencepiece_model_tok, strip_characters, word_repetition_length
    )
    return ratio <= word_repetition_max_cutoff
512
 
513
  @staticmethod
 
687
  document=document,
688
  remove_non_printing_characters=True,
689
  strip=True,
690
+ lower_case=False,
691
  uniform_whitespace=True,
692
  replace_digits_with_zeros=True,
693
  replace_unicode_punctuation=True,
 
728
  strip_characters,
729
  number_words_min_cutoff,
730
  number_words_max_cutoff,
731
+ cond_check_character_repetition_removal,
732
+ character_repetition_length,
733
+ character_repetition_max_cutoff,
734
+ cond_check_word_repetition_removal,
735
+ word_repetition_length,
736
+ word_repetition_max_cutoff,
737
  cond_check_special_characters,
738
  special_characters,
739
  special_characters_max_cutoff,
 
764
  number_words_max_cutoff,
765
  ):
766
  return False
767
+ if cond_check_character_repetition_removal:
768
+ if not Filtering.check_character_repetition_removal(
769
  document,
770
+ character_repetition_length,
771
+ character_repetition_max_cutoff,
772
+ ):
773
+ return False
774
+ if cond_check_word_repetition_removal:
775
+ if not Filtering.check_word_repetition_removal(
776
+ document,
777
+ sentencepiece_model_tok,
778
+ strip_characters,
779
+ word_repetition_length,
780
+ word_repetition_max_cutoff,
781
  ):
782
  return False
783
  if cond_check_special_characters:
 
867
  strip_characters=self.param["strip_characters"],
868
  number_words_min_cutoff=self.param["number_words_min_cutoff"],
869
  number_words_max_cutoff=self.param["number_words_max_cutoff"],
870
+ cond_check_character_repetition_removal=self.param[
871
+ "cond_check_character_repetition_removal"
872
+ ],
873
+ character_repetition_length=self.param["character_repetition_length"],
874
+ character_repetition_max_cutoff=self.param[
875
+ "character_repetition_max_cutoff"
876
+ ],
877
+ cond_check_word_repetition_removal=self.param[
878
+ "cond_check_word_repetition_removal"
879
+ ],
880
+ word_repetition_length=self.param["word_repetition_length"],
881
+ word_repetition_max_cutoff=self.param["word_repetition_max_cutoff"],
882
  cond_check_special_characters=self.param["cond_check_special_characters"],
883
  special_characters=self.param["special_characters"],
884
  special_characters_max_cutoff=self.param["special_characters_max_cutoff"],
 
934
  self.path_dir_save_dataset = path_dir_save_dataset
935
 
936
  def modifying_documents(self):
937
+ func_dataset_modifying_documents = FunctionDatasetModifyingDocuments(
938
  self.lang_dataset_id
939
  )
940
+ self.ds = self.ds.map(func_dataset_modifying_documents, num_proc=self.num_proc)
941
 
942
  def filtering(self):
943
  func_dataset_filtering = FunctionDatasetFiltering(
flagged_words.py CHANGED
@@ -81,989 +81,281 @@ english_flagged_words = [
81
  flagged_words = {
82
  "ar": english_flagged_words
83
  + [
 
84
  "احتلام",
 
 
85
  "اغتصاب",
 
 
 
86
  "بز",
87
  "بزاز",
88
  "بظر",
 
89
  "بيضان",
90
- "تمص",
91
  "ثدي",
92
  "جماع",
 
93
  "حلمة",
 
94
  "خنثي",
95
  "خول",
 
96
  "زب",
97
  "سحاق",
98
  "سحاقية",
99
  "سكس",
 
100
  "شاذ",
 
101
  "شرج",
102
  "شرموطة",
 
103
  "شهوة",
104
  "طيز",
 
105
  "عاهرة",
106
  "عرص",
 
107
  "فرج",
108
  "قحبة",
109
  "قضيب",
110
  "كس",
111
- "لبوة",
112
  "لحس",
113
  "لعق",
114
  "لواط",
115
  "لوطي",
116
- "مبادل",
117
  "متناك",
118
  "متناكة",
 
 
119
  "مص",
 
120
  "مفلقسة",
 
 
 
 
121
  "نيك",
122
  ],
123
  "ca": english_flagged_words
124
  + [
125
- "anal",
126
- "anus",
127
- "cul",
128
- "ass-fucker",
129
- "asss",
130
- "asshole",
131
- "assholes",
132
- "bolera",
133
- "boles",
134
- "bastardo",
135
- "bellend",
136
- "bestial",
137
- "bestialitat",
138
- "puta",
139
- "femelles",
140
- "picant",
141
- "sagnant",
142
- "mamada",
143
- "bollok",
144
- "boob",
145
- "pits",
146
- "buceta",
147
- "bum",
148
- "culata",
149
- "catifa muncher",
150
- "picar",
151
- "cipa",
152
- "clitoris",
153
- "polla",
154
- "galletejador",
155
- "gallines",
156
- "coon",
157
- "merda",
158
- "cum",
159
- "correguda",
160
- "cunillingus",
161
- "boig",
162
- "maleït",
163
  "consolador",
164
  "consoladors",
165
- "dink",
166
- "canalla",
167
- "duche",
168
- "dique",
169
- "ejaculació",
170
- "ejaculat",
171
- "ejacula",
172
- "ejaculant",
173
- "fag",
174
- "fagging",
175
- "fagot",
176
- "fagots",
177
- "fanny",
178
- "felching",
179
- "fel.lació",
180
- "brida",
181
- "follar",
182
- "follat",
183
- "escuradents",
184
- "follant",
185
- "folles",
186
- "fucks",
187
- "empacadora de llaminadures",
188
- "déu maldit",
189
- "deu meu",
190
- "infern",
191
- "hore",
192
- "córrer",
193
- "retrocés",
194
- "kock",
195
- "llavis",
196
- "lujuria",
197
- "lució",
198
- "masoquista",
199
- "masturbarse",
200
- "puta mare",
201
- "nazi",
202
- "nigger",
203
- "negres",
204
- "orgasim",
205
  "orgasme",
206
  "orgasmes",
207
- "pecker",
208
- "penis",
209
- "piss",
210
- "mossegat",
211
- "pisser",
212
- "pisses",
213
- "pissing",
214
- "treure de polleguera",
215
- "caca",
216
- "porno",
217
  "pornografia",
218
- "picades",
219
- "pube",
220
- "coques",
221
- "gatet",
222
- "violació",
223
- "violador",
224
- "recte",
225
- "retard",
226
- "rimming",
227
- "sàdic",
228
- "cargolar",
229
- "escrot",
230
  "semen",
231
- "sexe",
232
- "shag",
233
- "borratxos",
234
- "transsexual",
235
- "mossegar",
236
- "shitted",
237
- "skank",
238
- "smegma",
239
- "smut",
240
- "arrebat",
241
- "fill de puta",
242
- "spac",
243
- "spunk",
244
- "testicle",
245
- "tit",
246
- "tetas",
247
- "titt",
248
- "turd",
249
- "vagina",
250
- "viagra",
251
- "vulva",
252
- "wang",
253
- "wank",
254
- "x classificat",
255
  "xxx",
256
  ],
257
  "en": english_flagged_words,
258
  "es": english_flagged_words
259
  + [
260
- "Asesinato",
261
- "Bollera",
262
- "Cabrón",
263
- "Caca",
264
- "Chupada",
265
- "Chupapollas",
266
- "Chupetón",
267
- "Concha de tu madre",
268
- "Coprofagía",
269
- "Coño",
270
- "Culo",
271
- "Drogas",
272
- "Esperma",
273
- "Fiesta de salchichas",
274
- "Follador",
275
- "Follar",
276
- "Gilipichis",
277
- "Gilipollas",
278
- "Hacer una paja",
279
- "Haciendo el amor",
280
- "Heroína",
281
- "Hija de puta",
282
- "Hijaputa",
283
- "Hijo de puta",
284
- "Hijoputa",
285
- "Idiota",
286
- "Imbécil",
287
- "Jilipollas",
288
- "Kapullo",
289
- "Lameculos",
290
- "Maciza",
291
- "Macizorra",
292
- "Mamada",
293
- "Marica",
294
- "Mariconazo",
295
- "Maricón",
296
- "Mierda",
297
- "Nazi",
298
- "Orina",
299
- "Pedo",
300
- "Pendejo",
301
- "Pervertido",
302
- "Pezón",
303
- "Pinche",
304
- "Pis",
305
- "Prostituta",
306
- "Puta",
307
- "Racista",
308
- "Ramera",
309
- "Semen",
310
- "Sexo",
311
- "Sexo oral",
312
- "Soplagaitas",
313
- "Soplapollas",
314
- "Sádico",
315
- "Tetas grandes",
316
- "Travesti",
317
- "Trio",
318
- "Tía buena",
319
- "Verga",
320
- "Vulva",
321
- "aborto",
322
- "agallas",
323
- "anal",
324
- "ano",
325
- "arrebatar",
326
- "asno",
327
- "atornillar",
328
- "bastardo",
329
- "bestial",
330
- "bestialidad",
331
- "bolas",
332
- "bollok",
333
- "bolsa de pelota",
334
- "brida",
335
- "buceta",
336
- "cabron",
337
- "cagadas",
338
- "cagado",
339
- "cagando",
340
- "campana",
341
- "carajo",
342
  "chupar la polla",
343
- "cipa",
344
- "clítoris",
345
- "concha",
346
- "consolador",
347
- "consoladores",
348
- "corrida",
349
- "coño",
350
- "coños",
351
- "culo",
352
- "culos",
353
  "cunillingus",
354
- "córneo",
355
- "de mierda",
356
- "dique",
357
- "duche",
358
- "enojado",
359
- "escroto",
360
- "espacio",
361
- "estúpido",
362
- "extremo",
363
- "eyacula",
364
- "eyaculación",
365
- "eyaculado",
366
- "eyacular",
367
  "fagging",
368
  "felación",
369
  "felching",
370
- "folla",
371
  "follada",
372
  "follador de culo",
373
  "folladores",
374
- "follar",
375
  "fudge packer",
376
- "gallos",
377
- "grieta",
378
  "hacerse una paja",
379
- "hijo de puta",
380
  "hore",
381
- "infierno",
382
  "kock",
383
- "labios vaginales",
384
- "los pechos",
385
- "lujuria",
386
  "madre folladora",
387
- "maldita sea",
388
- "maldito",
389
- "maldito sea",
390
  "mamada",
391
- "mapache",
392
- "maricones",
393
- "maricón",
394
- "martillo",
395
- "masoquista",
396
- "masturbarse",
397
- "mear",
398
- "mierda",
399
- "molesto",
400
- "muncher alfombra",
401
- "nazi",
402
- "negro",
403
- "niggers",
404
- "orgasimo",
405
- "orgasmo",
406
- "orgasmos",
407
- "orinando",
408
- "pelusa",
409
- "pene",
410
- "perra",
411
- "perras",
412
  "perro follador",
413
- "pinchazo",
414
- "pinchazos",
415
  "pisser",
416
- "polla",
417
- "porno",
418
  "pornografía",
419
- "pube",
420
- "puta",
421
- "putas",
422
- "pájaro carpintero",
423
- "quejas",
424
- "recto",
425
- "retardar",
426
- "rimming",
427
- "sangriento",
428
- "semen",
429
- "sexo",
430
  "skank",
431
  "smegma",
432
- "sádico",
433
- "testículo",
434
- "teta",
435
- "tetas",
436
- "tirón",
437
- "tizón",
438
- "tonto",
439
- "vagina",
440
- "vete a la mierda",
441
- "viagra",
442
- "violación",
443
- "violador",
444
- "vulva",
445
- "wang",
446
  "x clasificado",
447
- "xxx",
448
- "zurullo",
449
- ],
450
- "eu": english_flagged_words
451
- + [
452
- "abortu",
453
- "anal",
454
- "ipurdi",
455
- "kabroi",
456
- "puta",
457
- "clitoris",
458
- "cunillingus",
459
- "madarikatu",
460
- "zakil",
461
- "hazia isuri",
462
- "arraio",
463
- "izorratu",
464
- "infernu",
465
- "emagaldu",
466
- "lizunkeri",
467
- "lizun",
468
- "masokista",
469
- "masturbatu",
470
- "nazi",
471
- "beltz",
472
- "orgasmo",
473
- "pixa",
474
- "porno",
475
- "pornografia",
476
- "alu",
477
- "bortxaketa",
478
- "bortxatzaile",
479
- "sadista",
480
- "ipurzulo",
481
- "hazi",
482
- "semen",
483
- "sexu",
484
- "kaka",
485
- "putaseme",
486
- "barrabil",
487
- "titi",
488
- "bagina",
489
- "viagra",
490
  ],
 
491
  "fr": english_flagged_words
492
  + [
493
- "MALPT",
494
- "anal",
495
- "anus",
496
- "arracher",
497
- "baise",
498
- "baiser",
499
- "baiseur de chien",
500
  "baiseurs",
501
- "baisée",
502
- "bander",
503
- "bellend",
504
- "bestial",
505
- "bestialité",
506
- "bigornette",
507
- "bite",
508
- "bitte",
509
- "bloblos",
510
- "bollok",
511
- "boob",
512
- "bordel",
513
- "bourré",
514
- "bourrée",
515
- "bout",
516
- "brackmard",
517
- "branlage",
518
- "branler",
519
  "branlette",
520
- "branleur",
521
  "branleuse",
522
- "bride",
523
- "brouter le cresson",
524
- "buceta",
525
- "caca",
526
- "chatte",
527
- "chattes",
528
- "chiasse",
529
- "chienne",
530
- "chiennes",
531
- "chier",
532
- "chiottes",
533
- "chié",
534
- "cipa",
535
- "clito",
536
- "clitoris",
537
- "clochard",
538
- "cochonneries",
539
- "con",
540
- "connard",
541
- "connards",
542
- "connasse",
543
- "conne",
544
- "convoitise",
545
- "coq",
546
- "coqs",
547
- "corné",
548
- "couilles",
549
- "cramouille",
550
- "cran",
551
- "cul",
552
- "culs",
553
  "cunillingus",
554
- "damné",
555
- "des balles",
556
- "digue",
557
- "duché",
558
- "déconne",
559
- "déconner",
560
- "emballeur de fudge",
561
- "emmerdant",
562
- "emmerder",
563
- "emmerdeur",
564
- "emmerdeuse",
565
- "enculer",
566
- "enculeur",
567
- "enculeurs",
568
- "enculé",
569
  "enculée",
570
- "enfer",
571
- "enfoiré",
572
- "enfoirée",
573
- "espacer",
574
- "fagging",
575
- "fagot",
576
- "fagots",
577
- "faire chier",
578
  "fellation",
579
- "fente",
580
- "fille de pute",
581
- "fils de pute",
582
- "folle",
583
- "foutre",
584
- "fuckings",
585
- "gerbe",
586
- "gerber",
587
- "godemiché",
588
- "godes",
589
- "gouine",
590
- "grande folle",
591
- "grogniasse",
592
- "gueule",
593
- "hore",
594
- "jouir",
595
- "kock",
596
- "la putain de ta mère",
597
- "les lèvres",
598
- "les seins",
599
- "luxure",
600
- "masochiste",
601
- "masturber",
602
- "merde",
603
- "merdeuse",
604
- "merdeux",
605
- "merdique",
606
- "meuf",
607
- "mère enculée",
608
- "ménage à trois",
609
- "mésange",
610
- "nazi",
611
- "negro",
612
- "nique ta mère",
613
- "nique ta race",
614
- "nègre",
615
- "nègres",
616
- "orgasim",
617
- "orgasme",
618
- "orgasmes",
619
- "palucher",
620
- "penchant",
621
- "pipe",
622
- "pipi",
623
- "piquer",
624
- "piqûres",
625
- "pisse",
626
- "pisser",
627
  "porno",
 
628
  "pornographie",
629
- "pouffiasse",
630
- "pousse-crotte",
631
- "pube",
632
- "putain",
633
- "putain de",
634
- "pute",
635
- "pédale",
636
- "pédé",
637
- "pénis",
638
- "péter",
639
- "queue",
640
- "quéquette",
641
- "ramoner",
642
- "rectum",
643
- "retard",
644
- "rimming",
645
- "râpé",
646
- "sac de billes",
647
- "sac à foutre",
648
- "sac à merde",
649
- "sadique",
650
- "salaud",
651
  "salope",
652
  "salopes",
653
- "sanglant",
654
- "scrotum",
655
- "se branler",
656
- "seins",
657
- "sexe",
658
- "skank",
659
- "smegma",
660
- "sperme",
661
- "suce",
662
  "suceuse",
663
- "tanche",
664
- "tapette",
665
- "tapis muncher",
666
- "testicule",
667
- "teuch",
668
- "titt",
669
- "tremper",
670
- "tringler",
671
- "trique",
672
- "troncher",
673
- "trou du cul",
674
- "turlute",
675
- "vagin",
676
- "viagra",
677
- "violeur",
678
- "vulve",
679
- "wang",
680
- "x évalué",
681
  "xxx",
682
- "zigounette",
683
- "zizi",
684
- "zut",
685
- "éjaculant",
686
- "éjaculation",
687
- "éjacule",
688
- "éjaculer",
689
- "éjaculé",
690
- "étron",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
691
  ],
692
  "hi": english_flagged_words
693
  + [
694
- "aand",
695
- "aandu",
696
- "balatkar",
697
- "balatkari",
698
- "behen chod",
699
- "beti chod",
700
- "bhadva",
701
- "bhadve",
702
- "bhandve",
703
- "bhangi",
704
- "bhootni ke",
705
- "bhosad",
706
- "bhosadi ke",
707
- "bitching",
708
- "blowjob",
709
- "bollok",
710
- "boobe",
711
- "buceta",
712
- "chakke",
713
- "chinaal",
714
- "chinki",
715
- "chod",
716
- "chodu",
717
- "chodu bhagat",
718
- "chooche",
719
- "choochi",
720
- "choope",
721
- "choot",
722
- "choot ke baal",
723
- "chootia",
724
- "chootiya",
725
- "chuche",
726
- "chuchi",
727
- "chudaap",
728
- "chudai khanaa",
729
- "chudam chudai",
730
- "chude",
731
- "chut",
732
- "chut ka chuha",
733
- "chut ka churan",
734
- "chut ka mail",
735
- "chut ke baal",
736
- "chut ke dhakkan",
737
- "chut maarli",
738
- "chutad",
739
- "chutadd",
740
- "chutan",
741
- "chutia",
742
- "chutiya",
743
- "cipa",
744
- "cunillingus",
745
- "dink",
746
- "duche",
747
- "ejaculated",
748
- "ejaculates",
749
- "ejaculating",
750
- "fagging",
751
- "fagots",
752
- "felching",
753
- "fuckings",
754
- "fucks",
755
- "gaand",
756
- "gaandfat",
757
- "gaandmasti",
758
- "gaandufad",
759
- "gandfattu",
760
- "gandu",
761
- "gashti",
762
- "gasti",
763
- "ghassa",
764
- "ghasti",
765
- "gucchi",
766
- "gucchu",
767
- "harami",
768
- "haramzade",
769
- "hawas",
770
- "hawas ke pujari",
771
- "hijda",
772
- "hijra",
773
- "jhant",
774
- "jhant chaatu",
775
- "jhant ka keeda",
776
- "jhant ke baal",
777
- "jhant ke pissu",
778
- "jhantu",
779
- "kamine",
780
- "kaminey",
781
- "kanjar",
782
- "kutta",
783
- "kutta kamina",
784
- "kutte ki aulad",
785
- "kutte ki jat",
786
- "kuttiya",
787
- "loda",
788
- "lodu",
789
- "lund",
790
- "lund choos",
791
- "lund ka bakkal",
792
- "lund khajoor",
793
- "lundtopi",
794
- "lundure",
795
- "lusting",
796
- "maa ki chut",
797
- "maal",
798
- "madar chod",
799
- "madarchod",
800
- "madhavchod",
801
- "masochist",
802
- "mooh mein le",
803
- "mutth",
804
- "mutthal",
805
- "najayaz",
806
- "najayaz aulaad",
807
- "najayaz paidaish",
808
- "orgasim",
809
- "paki",
810
- "pataka",
811
- "patakha",
812
- "pisser",
813
- "pisses",
814
- "pissing",
815
- "pube",
816
- "pussies",
817
- "raand",
818
- "randaap",
819
- "randi",
820
- "randi rona",
821
- "rimming",
822
- "saala",
823
- "saala kutta",
824
- "saali kutti",
825
- "saali randi",
826
- "shagging",
827
- "shite",
828
- "shitted",
829
- "shitting",
830
- "shitty",
831
- "skank",
832
- "sluts",
833
- "spac",
834
- "suar",
835
- "suar ke lund",
836
- "suar ki aulad",
837
- "tatte",
838
- "tatti",
839
- "teri maa ka bhosada",
840
- "teri maa ka boba chusu",
841
- "teri maa ki behenchod ",
842
- "teri maa ki chut",
843
- "tharak",
844
- "tharki",
845
- "titt",
846
- "tu chuda",
847
- "turd",
848
- "wank",
849
- "xxx",
850
  "अंडकोश की थैली",
851
- "अंडा",
852
- "अरे नहीं",
853
- "अश्लील",
854
- "उल्लू",
855
  "एक्स रेटेड",
856
  "ओगाज़्म",
857
- "कमबख्त",
858
- "काम करना",
859
  "कामोद्दीपक चित्र",
860
  "कालीन का चूरा",
861
- "किन्नर",
862
- "कुतिया",
863
- "कुत्ते-कमीने",
864
  "कून",
865
  "कॉक",
866
- "गड़बड़",
867
- "गधा कमीने",
868
- "गधे",
869
- "गर्भपात",
870
- "गुदा",
871
  "गेंद का थैला",
872
- "गेंदों",
873
- "गोली चलाने की आवाज़",
874
- "घटिया इंसान",
875
  "चाकलेट का रंग",
876
- "चिंक",
877
- "चुभन",
878
  "चूची",
879
  "चूतड़",
880
- "चोंच",
881
- "छीनना",
882
- "जी में आये करो",
883
  "झटका बंद",
884
  "ठगना पैकर",
885
  "डिल्डो",
886
- "दुष्ट",
887
- "दूर जाने का अभद्र संकेत देना",
888
- "धत् तेरे की",
889
- "नरक",
890
- "नाजी",
891
- "निकला हुआ किनारा",
892
  "नितंब",
893
- "पंगा लेना",
894
  "पिछाड़ी",
895
  "पीड़न कामुक",
896
- "पेशाब",
897
  "पॉर्न",
898
  "फटना",
899
  "फूहड़",
900
- "बकवास",
901
  "बट",
902
- "बलात्कार",
903
  "बहुत मदहोश",
904
- "बांध",
905
- "बिल्ली",
906
  "बेल अंत",
907
- "बेवकूफों",
908
- "बोल पड़ना",
909
  "भगवान-शापित",
910
  "भगशेफ",
911
- "मल",
912
- "मलाशय",
913
  "माँ कमीने",
914
  "मुखमैथुन",
915
- "मुर्गा",
916
- "मुर्गा के",
917
  "मुर्गा चूसने वाला",
918
- "मूर्ख",
919
- "मैल",
920
- "योनि",
921
- "योनी",
922
- "यौन-संबंध",
923
  "रक्तरंजित",
924
- "लानत है",
925
- "लिंग",
926
- "लुटेरा",
927
  "लेबिया",
928
  "वहशी",
929
  "वहशीता",
930
- "वियाग्रा",
931
- "वीर्य",
932
- "वेश्या",
933
  "वैंग",
934
- "वो साले",
935
- "शिफ़्ट को",
936
  "शिश्नमल",
937
  "संभोग सुख",
938
- "सह",
939
  "सह शॉट",
940
- "साहस",
941
- "सिगरेट",
942
  "सींग का बना हुआ",
943
- "स्तन",
944
- "स्तनों",
945
- "हवस",
946
- "हस्तमैथुन",
947
- "होमोसेक्सुअल",
948
  "होर",
 
 
 
949
  ],
950
  "id": english_flagged_words
951
  + [
952
- "abortus",
953
- "anal",
954
- "dubur",
955
- "pantat",
956
- "bajingan",
957
- "keledai",
958
- "keparat",
959
- "tas bola",
960
- "bola",
961
- "bellend",
962
- "kejam",
963
- "kebinatangan",
964
- "menggerutu",
965
- "pelacur",
966
- "berdarah",
967
- "blowjob",
968
- "bollok",
969
- "dada",
970
- "payudara",
971
- "buceta",
972
- "gelandangan",
973
- "pengunyah karpet",
974
- "celah",
975
- "cipa",
976
- "kelentit",
977
- "kokang",
978
- "pengisap ayam",
979
- "ayam",
980
- "coon",
981
- "sampah",
982
- "air mani",
983
- "cumshot",
984
- "cunillingus",
985
- "vagina",
986
- "mengutuk",
987
  "kontol",
988
- "dildo",
989
- "dink",
990
- "anjing-keparat",
991
- "duche",
992
- "tanggul",
993
- "berejakulasi",
994
- "ejakulasi",
995
- "homo",
996
- "fagging",
997
- "kayu bakar",
998
- "penggemar",
999
- "felching",
1000
- "fellatio",
1001
- "flens",
1002
- "brengsek",
1003
- "kacau",
1004
- "sialan",
1005
- "persetan",
1006
- "pengepakan fudge",
1007
- "terkutuk",
1008
- "ya tuhan",
1009
- "neraka",
1010
- "hore",
1011
- "terangsang",
1012
- "kock",
1013
- "labia",
1014
- "nafsu",
1015
- "bernafsu",
1016
- "masokis",
1017
  "masturbasi",
1018
- "keparat ibu",
1019
- "nazi",
1020
- "orang negro",
1021
- "negro",
1022
- "orgasim",
1023
- "orgasme",
1024
- "cotok",
1025
- "penis",
1026
- "kencing",
1027
- "kesal",
1028
- "pisser",
1029
- "bikin",
1030
- "buritan",
1031
- "porno",
1032
  "pornografi",
1033
- "tusukan",
1034
- "menusuk",
1035
- "pube",
1036
- "pussies",
1037
- "memperkosa",
1038
- "pemerkosa",
1039
- "memperlambat",
1040
- "rimming",
1041
- "sadis",
1042
- "meniduri",
1043
- "skrotum",
1044
- "seks",
1045
- "bercinta",
1046
- "waria",
1047
- "kotoran",
1048
- "shite",
1049
- "kengerian",
1050
- "dikirim",
1051
- "buang hajat",
1052
- "menyebalkan",
1053
- "smegma",
1054
- "jelaga",
1055
- "merebut",
1056
- "dasar bajingan",
1057
- "ruang",
1058
- "keberanian",
1059
- "buah pelir",
1060
- "titt",
1061
- "viagra",
1062
- "vulva",
1063
- "wang",
1064
- "terima kasih",
1065
- "x diberi peringkat",
1066
- "xxx",
1067
  ],
1068
  "kn": english_flagged_words
1069
  + [
@@ -1682,6 +974,19 @@ flagged_words = {
1682
  "xxx",
1683
  "ânus",
1684
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
1685
  "te": english_flagged_words
1686
  + [
1687
  "గర్భస్రావం",
@@ -1831,437 +1136,69 @@ flagged_words = {
1831
  ],
1832
  "vi": english_flagged_words
1833
  + [
1834
- "sự phá thai",
1835
- "hậu môn",
1836
- "mông",
1837
- "đồ ngu",
1838
- "lừa",
1839
- "lỗ đít",
1840
- "túi bóng",
1841
- "những quả bóng",
1842
- "đồ khốn",
1843
- "tuyệt vời",
1844
- "mục sư",
1845
- "lòng tốt",
1846
- "chó cái",
1847
- "dính máu",
1848
- "công việc thổi",
1849
- "bollok",
1850
- "boob",
1851
- "ngực",
1852
- "buceta",
1853
- "ăn mày",
1854
- "thảm muncher",
1855
- "sứt mẻ",
1856
- "cipa",
1857
- "âm vật",
1858
- "gà",
1859
- "gà hút",
1860
- "gà trống",
1861
- "coon",
1862
- "tào lao",
1863
- "kiêm",
1864
- "cum",
1865
- "cunillingus",
1866
  "lồn",
1867
- "chỉ trích",
1868
- "tinh ranh",
1869
- "dương vật giả",
1870
- "dink",
1871
- "chó-chó",
1872
- "duche",
1873
- "đê",
1874
- "xuất tinh",
1875
- "fag",
1876
- "đóng băng",
1877
- "fagot",
1878
- "đồ ăn vặt",
1879
- "người hâm mộ",
1880
- "nỉ",
1881
- "thất bại",
1882
- "mặt bích",
1883
- "chết tiệt",
1884
- "quái",
1885
  "đụ",
1886
- "ôm",
1887
- "đóng gói fudge",
1888
- "địa ngục",
1889
- "",
1890
- "sừng",
1891
- "giật",
1892
- "kock",
1893
- "môi âm",
1894
- "ham muốn",
1895
- "khổ dâm",
1896
- "thủ dâm",
1897
- "mẹ kiếp",
1898
- "nazi",
1899
- "người da đen",
1900
- "người mách nước",
1901
- "cực khoái",
1902
- "người mổ",
1903
- "dương vật",
1904
- "đi tiểu",
1905
- "bực mình",
1906
- "đái",
1907
- "phân",
1908
- "khiêu dâm",
1909
- "nội dung khiêu dâm",
1910
- "châm",
1911
- "chích",
1912
- "pube",
1913
- "pussies",
1914
- "âm hộ",
1915
- "hiếp dâm",
1916
- "trực tràng",
1917
- "chậm phát triển",
1918
- "",
1919
- "người tàn bạo",
1920
- "vặn vít",
1921
- "bìu",
1922
- "tinh dịch",
1923
- "tình dục",
1924
- "lông",
1925
- "xáo trộn",
1926
- "đồng tính",
1927
- "cứt",
1928
- "shite",
1929
- "ván trượt",
1930
- "đĩ",
1931
- "quần lót",
1932
- "smegma",
1933
- "xì trum",
1934
- "con trai",
1935
- "spac",
1936
- "spunk",
1937
- "tinh hoàn",
1938
- "ăn miếng trả miếng",
1939
- "titt",
1940
- "cỏ",
1941
- "âm đạo",
1942
- "viagra",
1943
- "âm môn",
1944
- "wang",
1945
- "đã ngủ",
1946
- "con điếm",
1947
- "x đánh giá",
1948
- "xxx",
1949
  ],
1950
  "zh": english_flagged_words
1951
  + [
1952
- "13.",
1953
- "13点",
1954
- "㞗",
1955
  "三级片",
1956
- "下三烂",
1957
- "下贱",
1958
- "个老子的",
1959
- "九游",
1960
- "乳",
1961
  "乳交",
1962
- "乳头",
1963
- "乳房",
1964
- "乳波臀浪",
1965
- "交配",
1966
- "仆街",
1967
- "仆街",
1968
- "他奶奶",
1969
- "他奶奶的",
1970
- "他奶娘的",
1971
- "他妈",
1972
- "他妈ㄉ王八蛋",
1973
- "他妈地",
1974
- "他妈的",
1975
- "他娘",
1976
- "他马的",
1977
- "你个傻比",
1978
- "你他马的",
1979
- "你全家",
1980
- "你奶奶的",
1981
- "你她马的",
1982
- "你妈",
1983
- "你妈的",
1984
- "你娘",
1985
- "你娘卡好",
1986
- "你娘咧",
1987
- "你它妈的",
1988
- "你它马的",
1989
- "你是鸡",
1990
- "你是鸭",
1991
- "你老味",
1992
- "你老母",
1993
- "你老闆",
1994
- "你马的",
1995
- "做爱",
1996
- "傻比",
1997
- "傻逼",
1998
- "册那",
1999
- "冚家拎",
2000
- "冚家鏟",
2001
- "军妓",
2002
- "几八",
2003
- "几叭",
2004
- "几巴",
2005
- "几芭",
2006
- "刚度",
2007
- "刚瘪三",
2008
- "包皮",
2009
- "十三点",
2010
  "卖B",
2011
  "卖比",
2012
  "卖淫",
2013
- "卵",
2014
- "卵子",
2015
- "双峰微颤",
2016
  "口交",
2017
- "口肯",
2018
- "叫床",
2019
- "吃屎",
2020
- "后庭",
2021
- "吹箫",
2022
- "咸家伶",
2023
- "咸家鏟",
2024
- "塞你公",
2025
- "塞你娘",
2026
- "塞你母",
2027
- "塞你爸",
2028
- "塞你老师",
2029
- "塞你老母",
2030
- "处女",
2031
- "外阴",
2032
- "大卵子",
2033
- "大卵泡",
2034
- "大鸡巴",
2035
- "奶",
2036
- "奶奶的熊",
2037
- "奶子",
2038
- "奸",
2039
- "奸你",
2040
- "她妈地",
2041
- "她妈的",
2042
- "她马的",
2043
- "妈B",
2044
- "妈个B",
2045
- "妈个比",
2046
- "妈个老比",
2047
- "妈妈的",
2048
- "妈比",
2049
- "妈的",
2050
- "妈的B",
2051
- "妈逼",
2052
- "妓",
2053
- "妓女",
2054
- "妓院",
2055
- "妳她妈的",
2056
- "妳妈的",
2057
- "妳娘的",
2058
- "妳老母的",
2059
- "妳马的",
2060
- "姘头",
2061
  "姣西",
2062
- "姦",
2063
- "娘个比",
2064
- "娘的",
2065
- "婊子",
2066
- "婊子养的",
2067
- "嫖娼",
2068
- "嫖客",
2069
- "它妈地",
2070
- "它妈的",
2071
- "密洞",
2072
- "射你",
2073
- "射精",
2074
- "小乳头",
2075
- "小卵子",
2076
- "小卵泡",
2077
- "小瘪三",
2078
- "小肉粒",
2079
- "小骚比",
2080
- "小骚货",
2081
- "小鸡巴",
2082
- "小鸡鸡",
2083
- "尻",
2084
- "屁眼",
2085
- "屁股",
2086
- "屄",
2087
- "屌",
2088
- "屎忽",
2089
- "巨乳",
2090
- "干x娘",
2091
- "干七八",
2092
- "干你",
2093
- "干你妈",
2094
- "干你娘",
2095
- "干你老母",
2096
- "干你良",
2097
- "干妳妈",
2098
- "干妳娘",
2099
- "干妳老母",
2100
- "干妳马",
2101
- "干您娘",
2102
- "干机掰",
2103
- "干死CS",
2104
- "干死GM",
2105
- "干死你",
2106
- "干死客服",
2107
- "幹",
2108
- "强奸",
2109
- "强奸你",
2110
- "性",
2111
- "性交",
2112
- "性器",
2113
- "性无能",
2114
- "性爱",
2115
- "情色",
2116
- "想上你",
2117
- "懆您妈",
2118
- "懆您娘",
2119
- "懒8",
2120
- "懒八",
2121
- "懒叫",
2122
- "懒教",
2123
- "成人",
2124
- "我操你祖宗十八代",
2125
- "扒光",
2126
  "打炮",
2127
  "打飞机",
2128
  "抽插",
2129
  "招妓",
2130
  "插你",
2131
  "插死你",
2132
- "撒尿",
2133
- "撚",
2134
- "操你",
2135
- "操你全家",
2136
- "操你奶奶",
2137
- "操你妈",
2138
- "操你娘",
2139
- "操你祖宗",
2140
- "操你老妈",
2141
- "操你老母",
2142
- "操妳",
2143
- "操妳全家",
2144
- "操妳妈",
2145
- "操妳娘",
2146
- "操妳祖宗",
2147
- "操机掰",
2148
- "操比",
2149
- "操逼",
2150
- "放荡",
2151
- "日他娘",
2152
- "日你",
2153
- "日你妈",
2154
- "日你老娘",
2155
- "日你老母",
2156
- "日批",
2157
- "月经",
2158
- "机八",
2159
- "机巴",
2160
- "机机歪歪",
2161
- "杂种",
2162
- "柒",
2163
- "浪叫",
2164
- "淫",
2165
- "淫乱",
2166
- "淫妇",
2167
- "淫棍",
2168
  "淫水",
2169
- "淫秽",
2170
- "淫荡",
2171
- "淫西",
2172
- "湿透的内裤",
2173
- "激情",
2174
- "灨你娘",
2175
- "烂货",
2176
- "烂逼",
2177
- "爛",
2178
- "狗屁",
2179
- "狗日",
2180
- "狗狼养的",
2181
- "玉杵",
2182
- "王八蛋",
2183
- "瓜娃子",
2184
- "瓜婆娘",
2185
- "瓜批",
2186
- "瘪三",
2187
- "白烂",
2188
- "白痴",
2189
- "白癡",
2190
- "硬膠",
2191
- "祖宗",
2192
- "私服",
2193
- "笨實",
2194
- "笨蛋",
2195
- "粉腸",
2196
- "精子",
2197
- "老二",
2198
- "老味",
2199
- "老母",
2200
- "老瘪三",
2201
- "老骚比",
2202
- "老骚货",
2203
  "肉壁",
2204
  "肉棍子",
2205
- "肉棒",
2206
  "肉缝",
2207
  "肏",
2208
  "肛交",
2209
- "肥西",
2210
- "色情",
2211
- "花柳",
2212
- "荡妇",
2213
- "賤",
2214
- "贝肉",
2215
- "贱B",
2216
- "贱人",
2217
- "贱货",
2218
- "贼你妈",
2219
- "赛你老母",
2220
- "赛妳阿母",
2221
- "赣您娘",
2222
- "躝癱",
2223
- "轮奸",
2224
- "迷药",
2225
- "逼",
2226
- "逼样",
2227
- "野鸡",
2228
- "閪",
2229
- "阳具",
2230
- "阳萎",
2231
- "阴唇",
2232
- "阴户",
2233
- "阴核",
2234
- "阴毛",
2235
- "阴茎",
2236
- "阴道",
2237
- "阴部",
2238
- "陰莖",
2239
- "雞巴",
2240
- "靠北",
2241
- "靠母",
2242
- "靠爸",
2243
- "靠背",
2244
- "靠腰",
2245
- "驶你公",
2246
- "驶你娘",
2247
- "驶你母",
2248
- "驶你爸",
2249
- "驶你老师",
2250
- "驶你老母",
2251
- "骚比",
2252
- "骚货",
2253
- "骚逼",
2254
- "鬼公",
2255
- "鳩",
2256
- "鸡8",
2257
- "鸡八",
2258
- "鸡叭",
2259
- "鸡吧",
2260
- "鸡奸",
2261
- "鸡巴",
2262
- "鸡芭",
2263
- "鸡鸡",
2264
- "龟儿子",
2265
- "龟头",
2266
  ],
2267
  }
 
81
  flagged_words = {
82
  "ar": english_flagged_words
83
  + [
84
+ "إباحي",
85
  "احتلام",
86
+ "است",
87
+ "استمناء",
88
  "اغتصاب",
89
+ "أورغازم",
90
+ "إيروتيك",
91
+ "أير",
92
  "بز",
93
  "بزاز",
94
  "بظر",
95
+ "بورن",
96
  "بيضان",
97
+ "مص",
98
  "ثدي",
99
  "جماع",
100
+ "جنس",
101
  "حلمة",
102
+ "خلاعة",
103
  "خنثي",
104
  "خول",
105
+ "دعارة",
106
  "زب",
107
  "سحاق",
108
  "سحاقية",
109
  "سكس",
110
+ "سيكس",
111
  "شاذ",
112
+ "شبق",
113
  "شرج",
114
  "شرموطة",
115
+ "شهواني",
116
  "شهوة",
117
  "طيز",
118
+ "عادة السرية",
119
  "عاهرة",
120
  "عرص",
121
+ "فاسقة",
122
  "فرج",
123
  "قحبة",
124
  "قضيب",
125
  "كس",
 
126
  "لحس",
127
  "لعق",
128
  "لواط",
129
  "لوطي",
130
+ "مؤخرة",
131
  "متناك",
132
  "متناكة",
133
+ "مومس",
134
+ "مثير",
135
  "مص",
136
+ "مضاجعة",
137
  "مفلقسة",
138
+ "مني",
139
+ "مهتاج",
140
+ "نشوة",
141
+ "نكاح",
142
  "نيك",
143
  ],
144
  "ca": english_flagged_words
145
  + [
146
+ "cagarro",
147
+ "cagarros",
148
+ "cipote",
149
+ "cipotes",
150
+ "collons",
151
+ "colló",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  "consolador",
153
  "consoladors",
154
+ "cony",
155
+ "conys",
156
+ "corre's",
157
+ "corre't",
158
+ "corregut",
159
+ "cunillingus",
160
+ "córrer-se",
161
+ "escorreguda",
162
+ "escorregudes",
163
+ "escorregut",
164
+ "escrot",
165
+ "escrots",
166
+ "escórre's",
167
+ "escórre't",
168
+ "escórrer-se",
169
+ "mamada",
170
+ "mamadera",
171
+ "mamaderes",
172
+ "mamades",
173
+ "masturba",
174
+ "masturbacions",
175
+ "masturbació",
176
+ "masturbant",
177
+ "masturbar",
178
+ "masturbar-se",
179
+ "masturbat",
180
+ "masturbats",
181
+ "masturbes",
 
 
 
 
 
 
 
 
 
 
 
 
182
  "orgasme",
183
  "orgasmes",
184
+ "ou",
185
+ "ous",
186
+ "palla",
187
+ "palles",
 
 
 
 
 
 
188
  "pornografia",
 
 
 
 
 
 
 
 
 
 
 
 
189
  "semen",
190
+ "semens",
191
+ "verga",
192
+ "vergues",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  "xxx",
194
  ],
195
  "en": english_flagged_words,
196
  "es": english_flagged_words
197
  + [
198
+ "chupar el coño",
199
+ "chupar la concha",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  "chupar la polla",
201
+ "chupar la verga",
202
+ "comer el coño",
203
+ "comer la concha",
204
+ "comer la polla",
205
+ "comer la verga",
206
+ "coprofagía",
207
+ "correrse",
 
 
 
208
  "cunillingus",
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  "fagging",
210
  "felación",
211
  "felching",
 
212
  "follada",
213
  "follador de culo",
214
  "folladores",
 
215
  "fudge packer",
216
+ "hacer una paja",
 
217
  "hacerse una paja",
 
218
  "hore",
 
219
  "kock",
220
+ "macizorra",
 
 
221
  "madre folladora",
 
 
 
222
  "mamada",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  "perro follador",
 
 
224
  "pisser",
 
 
225
  "pornografía",
226
+ "sado",
227
+ "sadomasoquismo",
228
+ "sadomasoquista",
229
+ "sexo anal",
 
 
 
 
 
 
 
230
  "skank",
231
  "smegma",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  "x clasificado",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  ],
234
+ "eu": english_flagged_words + [],
235
  "fr": english_flagged_words
236
  + [
 
 
 
 
 
 
 
237
  "baiseurs",
238
+ "baiseur",
239
+ "baiseuse",
240
+ "baiseuses",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  "branlette",
242
+ "branlettes",
243
  "branleuse",
244
+ "branleuses",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  "cunillingus",
246
+ "cunilingus",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  "enculée",
248
+ "enculées",
249
+ "enculation",
250
+ "enculations",
251
+ "enculement",
252
+ "enculements",
 
 
 
253
  "fellation",
254
+ "fellations",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  "porno",
256
+ "pornos",
257
  "pornographie",
258
+ "pornographique",
259
+ "pornographiques",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  "salope",
261
  "salopes",
 
 
 
 
 
 
 
 
 
262
  "suceuse",
263
+ "suceuses",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  "xxx",
265
+ ],
266
+ "ha": english_flagged_words
267
+ + [
268
+ "bf",
269
+ "bura",
270
+ "burar",
271
+ "cin gindi",
272
+ "duri",
273
+ "durin",
274
+ "gindi",
275
+ "gindin",
276
+ "guntsu",
277
+ "guntsun",
278
+ "karuwa",
279
+ "karuwai",
280
+ "karuwar",
281
+ "maniyyi",
282
+ "maɗigo",
283
+ "maɗugo",
284
+ "nonuwa",
285
+ "shan gindi",
286
+ "tsuliya",
287
+ "tsuliyariskanci",
288
+ "ɗuwaiwai",
289
+ "ɗuwaiwan",
290
+ "ɗuwawu",
291
+ "ɗuwawun",
292
  ],
293
  "hi": english_flagged_words
294
  + [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  "अंडकोश की थैली",
 
 
 
 
296
  "एक्स रेटेड",
297
  "ओगाज़्म",
 
 
298
  "कामोद्दीपक चित्र",
299
  "कालीन का चूरा",
 
 
 
300
  "कून",
301
  "कॉक",
 
 
 
 
 
302
  "गेंद का थैला",
 
 
 
303
  "चाकलेट का रंग",
 
 
304
  "चूची",
305
  "चूतड़",
 
 
 
306
  "झटका बंद",
307
  "ठगना पैकर",
308
  "डिल्डो",
 
 
 
 
 
 
309
  "नितंब",
 
310
  "पिछाड़ी",
311
  "पीड़न कामुक",
 
312
  "पॉर्न",
313
  "फटना",
314
  "फूहड़",
 
315
  "बट",
 
316
  "बहुत मदहोश",
 
 
317
  "बेल अंत",
 
 
318
  "भगवान-शापित",
319
  "भगशेफ",
 
 
320
  "माँ कमीने",
321
  "मुखमैथुन",
 
 
322
  "मुर्गा चूसने वाला",
 
 
 
 
 
323
  "रक्तरंजित",
 
 
 
324
  "लेबिया",
325
  "वहशी",
326
  "वहशीता",
 
 
 
327
  "वैंग",
 
 
328
  "शिश्नमल",
329
  "संभोग सुख",
 
330
  "सह शॉट",
 
 
331
  "सींग का बना हुआ",
 
 
 
 
 
332
  "होर",
333
+ "घपा घप",
334
+ "चुदाई",
335
+ "चुदक्कड़",
336
  ],
337
  "id": english_flagged_words
338
  + [
339
+ "bokep",
340
+ "coli",
341
+ "colmek",
342
+ "grepe",
343
+ "horni",
344
+ "janda",
345
+ "jembut",
346
+ "jilat memek",
347
+ "jilmek",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  "kontol",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  "masturbasi",
350
+ "memek",
351
+ "ngentot",
352
+ "ngewe",
353
+ "peju",
354
+ "pepek",
 
 
 
 
 
 
 
 
 
355
  "pornografi",
356
+ "sange",
357
+ "sepong",
358
+ "tusbol",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  ],
360
  "kn": english_flagged_words
361
  + [
 
974
  "xxx",
975
  "ânus",
976
  ],
977
+ "ta": english_flagged_words
978
+ + [
979
+ "ஓதா",
980
+ "ஒத்தா",
981
+ "புண்டை",
982
+ "ஒம்மாளே",
983
+ "பக்கி",
984
+ "கூமுட்டை",
985
+ "கருமம்",
986
+ "சனியன்",
987
+ "கஸ்மாலம்",
988
+ "சூத்து",
989
+ ],
990
  "te": english_flagged_words
991
  + [
992
  "గర్భస్రావం",
 
1136
  ],
1137
  "vi": english_flagged_words
1138
  + [
1139
+ "cặc",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1140
  "lồn",
1141
+ "địt",
1142
+ "buồi",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1143
  "đụ",
1144
+ "nứng",
1145
+ "phò",
1146
+ " lol",
1147
+ "bú lon",
1148
+ "bú cac",
1149
+ "bú cak",
1150
+ "bú cax",
1151
+ " buoi",
1152
+ " mút",
1153
+ "húp lol",
1154
+ "húp lon",
1155
+ "húp ",
1156
+ "nung lol",
1157
+ "nung lon",
1158
+ "nug lon",
1159
+ "nung cac",
1160
+ "nung buoi",
1161
+ "thèm dit",
1162
+ "thèm lon",
1163
+ "thèm lol",
1164
+ "thèm buoi",
1165
+ "thèm dit",
1166
+ "thèm chich",
1167
+ "thèm chịch",
1168
+ "thèm cax",
1169
+ "thèm cak",
1170
+ "móc lol",
1171
+ "móc lon",
1172
+ "buscu",
1173
+ "lol non",
1174
+ "bướm non",
1175
+ " to",
1176
+ "vú bự",
1177
+ "lon to",
1178
+ "phim sex",
1179
+ "phim xex",
1180
+ "phim xxx",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1181
  ],
1182
  "zh": english_flagged_words
1183
  + [
 
 
 
1184
  "三级片",
 
 
 
 
 
1185
  "乳交",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1186
  "卖B",
1187
  "卖比",
1188
  "卖淫",
 
 
 
1189
  "口交",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1190
  "姣西",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1191
  "打炮",
1192
  "打飞机",
1193
  "抽插",
1194
  "招妓",
1195
  "插你",
1196
  "插死你",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1197
  "淫水",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1198
  "肉壁",
1199
  "肉棍子",
 
1200
  "肉缝",
1201
  "肏",
1202
  "肛交",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1203
  ],
1204
  }
fr.arpa.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:301c82d52a8e34f63937afc12970794c8783244c8c0b085a8bbfb0d54dcb9374
3
+ size 2829042764
fr.sp.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1b70d5e6556ad245e02ac76919a714ad0b7d288955df65ecd3831a42950b653
3
+ size 942639
fr_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dd605b140e7a4c20a00e06c8c70d90333d2559434acd9c182de054d6b53b13b
3
+ size 140859096
id.arpa.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e099b6216a558d6c6f6108895e2e13fbc6ffd00b59791d16d6a5f85103ac0be
3
+ size 1847280248
id.sp.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b217615a7b185e5e0c967ea5b7156fe149145221e32a54b96dfed15d98b3c807
3
+ size 926624
id_examples_with_stats.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1c05dfc6f847bccf2e79cdb90c0dbb05a7266ae77673cd9f6c3cb811dace8e8
3
+ size 89435039
languages_id.py CHANGED
@@ -26,8 +26,8 @@ langs_id = [
26
  "stopwords_id": None,
27
  "flagged_words_id": None,
28
  "fasttext_id": "arz",
29
- "sentencepiece_id": None,
30
- "kenlm_id": None,
31
  },
32
  {
33
  "lang": "Assamese",
@@ -35,8 +35,8 @@ langs_id = [
35
  "stopwords_id": None,
36
  "flagged_words_id": None,
37
  "fasttext_id": "as",
38
- "sentencepiece_id": None,
39
- "kenlm_id": None,
40
  },
41
  {
42
  "lang": "Bengali",
@@ -80,8 +80,8 @@ langs_id = [
80
  "stopwords_id": "eu",
81
  "flagged_words_id": "eu",
82
  "fasttext_id": "eu",
83
- "sentencepiece_id": None,
84
- "kenlm_id": None,
85
  },
86
  {
87
  "lang": "French",
@@ -155,32 +155,23 @@ langs_id = [
155
  "sentencepiece_id": "pt",
156
  "kenlm_id": "pt",
157
  },
158
- {
159
- "lang": "Somali",
160
- "dataset_id": "so",
161
- "stopwords_id": "so",
162
- "flagged_words_id": None,
163
- "fasttext_id": "so",
164
- "sentencepiece_id": None,
165
- "kenlm_id": None,
166
- },
167
  {
168
  "lang": "Swahili",
169
  "dataset_id": "sw",
170
  "stopwords_id": "sw",
171
  "flagged_words_id": None,
172
  "fasttext_id": "sw",
173
- "sentencepiece_id": None,
174
- "kenlm_id": None,
175
  },
176
  {
177
  "lang": "Tamil",
178
  "dataset_id": "ta",
179
  "stopwords_id": None,
180
- "flagged_words_id": None,
181
  "fasttext_id": "ta",
182
- "sentencepiece_id": None,
183
- "kenlm_id": None,
184
  },
185
  {
186
  "lang": "Telugu",
@@ -188,8 +179,8 @@ langs_id = [
188
  "stopwords_id": None,
189
  "flagged_words_id": "te",
190
  "fasttext_id": "te",
191
- "sentencepiece_id": None,
192
- "kenlm_id": None,
193
  },
194
  {
195
  "lang": "Urdu",
@@ -197,8 +188,8 @@ langs_id = [
197
  "stopwords_id": "ur",
198
  "flagged_words_id": None,
199
  "fasttext_id": "ur",
200
- "sentencepiece_id": None,
201
- "kenlm_id": None,
202
  },
203
  {
204
  "lang": "Vietnamese",
@@ -206,8 +197,8 @@ langs_id = [
206
  "stopwords_id": "vi",
207
  "flagged_words_id": "vi",
208
  "fasttext_id": "vi",
209
- "sentencepiece_id": None,
210
- "kenlm_id": None,
211
  },
212
  {
213
  "lang": "Yoruba",
@@ -215,8 +206,8 @@ langs_id = [
215
  "stopwords_id": "yo",
216
  "flagged_words_id": None,
217
  "fasttext_id": "yo",
218
- "sentencepiece_id": None,
219
- "kenlm_id": None,
220
  },
221
  {
222
  "lang": "Chinese",
 
26
  "stopwords_id": None,
27
  "flagged_words_id": None,
28
  "fasttext_id": "arz",
29
+ "sentencepiece_id": "arz",
30
+ "kenlm_id": "arz",
31
  },
32
  {
33
  "lang": "Assamese",
 
35
  "stopwords_id": None,
36
  "flagged_words_id": None,
37
  "fasttext_id": "as",
38
+ "sentencepiece_id": "as",
39
+ "kenlm_id": "as",
40
  },
41
  {
42
  "lang": "Bengali",
 
80
  "stopwords_id": "eu",
81
  "flagged_words_id": "eu",
82
  "fasttext_id": "eu",
83
+ "sentencepiece_id": "eu",
84
+ "kenlm_id": "eu",
85
  },
86
  {
87
  "lang": "French",
 
155
  "sentencepiece_id": "pt",
156
  "kenlm_id": "pt",
157
  },
 
 
 
 
 
 
 
 
 
158
  {
159
  "lang": "Swahili",
160
  "dataset_id": "sw",
161
  "stopwords_id": "sw",
162
  "flagged_words_id": None,
163
  "fasttext_id": "sw",
164
+ "sentencepiece_id": "sw",
165
+ "kenlm_id": "sw",
166
  },
167
  {
168
  "lang": "Tamil",
169
  "dataset_id": "ta",
170
  "stopwords_id": None,
171
+ "flagged_words_id": "ta",
172
  "fasttext_id": "ta",
173
+ "sentencepiece_id": "ta",
174
+ "kenlm_id": "ta",
175
  },
176
  {
177
  "lang": "Telugu",
 
179
  "stopwords_id": None,
180
  "flagged_words_id": "te",
181
  "fasttext_id": "te",
182
+ "sentencepiece_id": "te",
183
+ "kenlm_id": "te",
184
  },
185
  {
186
  "lang": "Urdu",
 
188
  "stopwords_id": "ur",
189
  "flagged_words_id": None,
190
  "fasttext_id": "ur",
191
+ "sentencepiece_id": "ur",
192
+ "kenlm_id": "ur",
193
  },
194
  {
195
  "lang": "Vietnamese",
 
197
  "stopwords_id": "vi",
198
  "flagged_words_id": "vi",
199
  "fasttext_id": "vi",
200
+ "sentencepiece_id": "vi",
201
+ "kenlm_id": "vi",
202
  },
203
  {
204
  "lang": "Yoruba",
 
206
  "stopwords_id": "yo",
207
  "flagged_words_id": None,
208
  "fasttext_id": "yo",
209
+ "sentencepiece_id": "yo",
210
+ "kenlm_id": "yo",
211
  },
212
  {
213
  "lang": "Chinese",
parameters_filtering.py CHANGED
@@ -28,9 +28,12 @@ parameters_filtering_default = {
28
  "strip_characters": special_characters_default,
29
  "number_words_min_cutoff": 1,
30
  "number_words_max_cutoff": 100000,
31
- "check_repetitions_removal": True,
32
- "repetitions_length": 10,
33
- "repetitions_max_cutoff": 0.106,
 
 
 
34
  "cond_check_special_characters": True,
35
  "special_characters": special_characters_default,
36
  "special_characters_max_cutoff": 0.4,
@@ -59,9 +62,12 @@ parameters_filtering_af = {
59
  "strip_characters": special_characters_default,
60
  "number_words_min_cutoff": 1,
61
  "number_words_max_cutoff": 100000,
62
- "check_repetitions_removal": True,
63
- "repetitions_length": 10,
64
- "repetitions_max_cutoff": 0.106,
 
 
 
65
  "cond_check_special_characters": True,
66
  "special_characters": special_characters_default,
67
  "special_characters_max_cutoff": 0.3,
@@ -90,9 +96,12 @@ parameters_filtering_ar = {
90
  "strip_characters": special_characters_default,
91
  "number_words_min_cutoff": 1,
92
  "number_words_max_cutoff": 100000,
93
- "check_repetitions_removal": True,
94
- "repetitions_length": 10,
95
- "repetitions_max_cutoff": 0.106,
 
 
 
96
  "cond_check_special_characters": True,
97
  "special_characters": special_characters_default,
98
  "special_characters_max_cutoff": 0.45,
@@ -121,9 +130,12 @@ parameters_filtering_arz = {
121
  "strip_characters": special_characters_default,
122
  "number_words_min_cutoff": 1,
123
  "number_words_max_cutoff": 100000,
124
- "check_repetitions_removal": True,
125
- "repetitions_length": 10,
126
- "repetitions_max_cutoff": 0.106,
 
 
 
127
  "cond_check_special_characters": True,
128
  "special_characters": special_characters_default,
129
  "special_characters_max_cutoff": 0.5,
@@ -152,9 +164,12 @@ parameters_filtering_as = {
152
  "strip_characters": special_characters_default,
153
  "number_words_min_cutoff": 1,
154
  "number_words_max_cutoff": 100000,
155
- "check_repetitions_removal": True,
156
- "repetitions_length": 10,
157
- "repetitions_max_cutoff": 0.106,
 
 
 
158
  "cond_check_special_characters": True,
159
  "special_characters": special_characters_default,
160
  "special_characters_max_cutoff": 0.25,
@@ -183,9 +198,12 @@ parameters_filtering_bn = {
183
  "strip_characters": special_characters_default,
184
  "number_words_min_cutoff": 1,
185
  "number_words_max_cutoff": 100000,
186
- "check_repetitions_removal": True,
187
- "repetitions_length": 10,
188
- "repetitions_max_cutoff": 0.106,
 
 
 
189
  "cond_check_special_characters": True,
190
  "special_characters": special_characters_default,
191
  "special_characters_max_cutoff": 0.275,
@@ -214,9 +232,12 @@ parameters_filtering_ca = {
214
  "strip_characters": special_characters_default,
215
  "number_words_min_cutoff": 1,
216
  "number_words_max_cutoff": 100000,
217
- "check_repetitions_removal": True,
218
- "repetitions_length": 10,
219
- "repetitions_max_cutoff": 0.106,
 
 
 
220
  "cond_check_special_characters": True,
221
  "special_characters": special_characters_default,
222
  "special_characters_max_cutoff": 0.35,
@@ -245,9 +266,12 @@ parameters_filtering_en = {
245
  "strip_characters": special_characters_default,
246
  "number_words_min_cutoff": 20,
247
  "number_words_max_cutoff": 100000,
248
- "check_repetitions_removal": True,
249
- "repetitions_length": 10,
250
- "repetitions_max_cutoff": 0.106,
 
 
 
251
  "cond_check_special_characters": True,
252
  "special_characters": special_characters_default,
253
  "special_characters_max_cutoff": 0.4,
@@ -276,9 +300,12 @@ parameters_filtering_es = {
276
  "strip_characters": special_characters_default,
277
  "number_words_min_cutoff": 1,
278
  "number_words_max_cutoff": 100000,
279
- "check_repetitions_removal": True,
280
- "repetitions_length": 10,
281
- "repetitions_max_cutoff": 0.106,
 
 
 
282
  "cond_check_special_characters": True,
283
  "special_characters": special_characters_default,
284
  "special_characters_max_cutoff": 0.3,
@@ -307,9 +334,12 @@ parameters_filtering_eu = {
307
  "strip_characters": special_characters_default,
308
  "number_words_min_cutoff": 1,
309
  "number_words_max_cutoff": 100000,
310
- "check_repetitions_removal": True,
311
- "repetitions_length": 10,
312
- "repetitions_max_cutoff": 0.106,
 
 
 
313
  "cond_check_special_characters": True,
314
  "special_characters": special_characters_default,
315
  "special_characters_max_cutoff": 0.3,
@@ -338,9 +368,12 @@ parameters_filtering_fr = {
338
  "strip_characters": special_characters_default,
339
  "number_words_min_cutoff": 1,
340
  "number_words_max_cutoff": 100000,
341
- "check_repetitions_removal": True,
342
- "repetitions_length": 10,
343
- "repetitions_max_cutoff": 0.106,
 
 
 
344
  "cond_check_special_characters": True,
345
  "special_characters": special_characters_default,
346
  "special_characters_max_cutoff": 0.35,
@@ -369,9 +402,12 @@ parameters_filtering_gu = {
369
  "strip_characters": special_characters_default,
370
  "number_words_min_cutoff": 1,
371
  "number_words_max_cutoff": 100000,
372
- "check_repetitions_removal": True,
373
- "repetitions_length": 10,
374
- "repetitions_max_cutoff": 0.106,
 
 
 
375
  "cond_check_special_characters": True,
376
  "special_characters": special_characters_default,
377
  "special_characters_max_cutoff": 0.3,
@@ -400,9 +436,12 @@ parameters_filtering_hi = {
400
  "strip_characters": special_characters_default,
401
  "number_words_min_cutoff": 1,
402
  "number_words_max_cutoff": 100000,
403
- "check_repetitions_removal": True,
404
- "repetitions_length": 10,
405
- "repetitions_max_cutoff": 0.106,
 
 
 
406
  "cond_check_special_characters": True,
407
  "special_characters": special_characters_default,
408
  "special_characters_max_cutoff": 0.35,
@@ -431,9 +470,12 @@ parameters_filtering_id = {
431
  "strip_characters": special_characters_default,
432
  "number_words_min_cutoff": 1,
433
  "number_words_max_cutoff": 100000,
434
- "check_repetitions_removal": True,
435
- "repetitions_length": 10,
436
- "repetitions_max_cutoff": 0.106,
 
 
 
437
  "cond_check_special_characters": True,
438
  "special_characters": special_characters_default,
439
  "special_characters_max_cutoff": 0.25,
@@ -462,9 +504,12 @@ parameters_filtering_kn = {
462
  "strip_characters": special_characters_default,
463
  "number_words_min_cutoff": 1,
464
  "number_words_max_cutoff": 100000,
465
- "check_repetitions_removal": True,
466
- "repetitions_length": 10,
467
- "repetitions_max_cutoff": 0.106,
 
 
 
468
  "cond_check_special_characters": True,
469
  "special_characters": special_characters_default,
470
  "special_characters_max_cutoff": 0.25,
@@ -493,9 +538,12 @@ parameters_filtering_ml = {
493
  "strip_characters": special_characters_default,
494
  "number_words_min_cutoff": 1,
495
  "number_words_max_cutoff": 100000,
496
- "check_repetitions_removal": True,
497
- "repetitions_length": 10,
498
- "repetitions_max_cutoff": 0.106,
 
 
 
499
  "cond_check_special_characters": True,
500
  "special_characters": special_characters_default,
501
  "special_characters_max_cutoff": 0.2,
@@ -524,9 +572,12 @@ parameters_filtering_mr = {
524
  "strip_characters": special_characters_default,
525
  "number_words_min_cutoff": 1,
526
  "number_words_max_cutoff": 100000,
527
- "check_repetitions_removal": True,
528
- "repetitions_length": 10,
529
- "repetitions_max_cutoff": 0.106,
 
 
 
530
  "cond_check_special_characters": True,
531
  "special_characters": special_characters_default,
532
  "special_characters_max_cutoff": 0.25,
@@ -555,9 +606,12 @@ parameters_filtering_pt = {
555
  "strip_characters": special_characters_default,
556
  "number_words_min_cutoff": 1,
557
  "number_words_max_cutoff": 100000,
558
- "check_repetitions_removal": True,
559
- "repetitions_length": 10,
560
- "repetitions_max_cutoff": 0.106,
 
 
 
561
  "cond_check_special_characters": True,
562
  "special_characters": special_characters_default,
563
  "special_characters_max_cutoff": 0.3,
@@ -574,37 +628,6 @@ parameters_filtering_pt = {
574
  "perplexity_max_cutoff": 3000000,
575
  }
576
 
577
- parameters_filtering_so = {
578
- "cond_uniform_whitespace": True,
579
- "cond_replace_unicode_punctuation": False,
580
- "cond_remove_words_with_incorrect_substrings": False,
581
- "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
582
- "cond_remove_long_words": False,
583
- "length_word_max_cutoff": 1000,
584
- "cond_check_number_words": True,
585
- "tokenization": False,
586
- "strip_characters": special_characters_default,
587
- "number_words_min_cutoff": 1,
588
- "number_words_max_cutoff": 100000,
589
- "check_repetitions_removal": True,
590
- "repetitions_length": 10,
591
- "repetitions_max_cutoff": 0.106,
592
- "cond_check_special_characters": True,
593
- "special_characters": special_characters_default,
594
- "special_characters_max_cutoff": 0.3,
595
- "cond_words_augmentation": False,
596
- "words_augmentation_group_sizes": [],
597
- "words_augmentation_join_char": "",
598
- "cond_check_stopwords": False,
599
- "stopwords_min_cutoff": 0,
600
- "cond_check_flagged_words": False,
601
- "flagged_words_max_cutoff": 0.2,
602
- "cond_check_lang_id": True,
603
- "lang_id_min_cutoff": 0.75,
604
- "cond_check_perplexity": False,
605
- "perplexity_max_cutoff": 3000000,
606
- }
607
-
608
  parameters_filtering_sw = {
609
  "cond_uniform_whitespace": True,
610
  "cond_replace_unicode_punctuation": False,
@@ -617,9 +640,12 @@ parameters_filtering_sw = {
617
  "strip_characters": special_characters_default,
618
  "number_words_min_cutoff": 1,
619
  "number_words_max_cutoff": 100000,
620
- "check_repetitions_removal": True,
621
- "repetitions_length": 10,
622
- "repetitions_max_cutoff": 0.106,
 
 
 
623
  "cond_check_special_characters": True,
624
  "special_characters": special_characters_default,
625
  "special_characters_max_cutoff": 0.275,
@@ -648,9 +674,12 @@ parameters_filtering_ta = {
648
  "strip_characters": special_characters_default,
649
  "number_words_min_cutoff": 1,
650
  "number_words_max_cutoff": 100000,
651
- "check_repetitions_removal": True,
652
- "repetitions_length": 10,
653
- "repetitions_max_cutoff": 0.106,
 
 
 
654
  "cond_check_special_characters": True,
655
  "special_characters": special_characters_default,
656
  "special_characters_max_cutoff": 0.25,
@@ -679,9 +708,12 @@ parameters_filtering_te = {
679
  "strip_characters": special_characters_default,
680
  "number_words_min_cutoff": 1,
681
  "number_words_max_cutoff": 100000,
682
- "check_repetitions_removal": True,
683
- "repetitions_length": 10,
684
- "repetitions_max_cutoff": 0.106,
 
 
 
685
  "cond_check_special_characters": True,
686
  "special_characters": special_characters_default,
687
  "special_characters_max_cutoff": 0.25,
@@ -710,9 +742,12 @@ parameters_filtering_ur = {
710
  "strip_characters": special_characters_default,
711
  "number_words_min_cutoff": 1,
712
  "number_words_max_cutoff": 100000,
713
- "check_repetitions_removal": True,
714
- "repetitions_length": 10,
715
- "repetitions_max_cutoff": 0.106,
 
 
 
716
  "cond_check_special_characters": True,
717
  "special_characters": special_characters_default,
718
  "special_characters_max_cutoff": 0.4,
@@ -741,14 +776,17 @@ parameters_filtering_vi = {
741
  "strip_characters": special_characters_default,
742
  "number_words_min_cutoff": 1,
743
  "number_words_max_cutoff": 100000,
744
- "check_repetitions_removal": True,
745
- "repetitions_length": 10,
746
- "repetitions_max_cutoff": 0.106,
 
 
 
747
  "cond_check_special_characters": True,
748
  "special_characters": special_characters_default,
749
  "special_characters_max_cutoff": 0.35,
750
  "cond_words_augmentation": True,
751
- "words_augmentation_group_sizes": [2, 3],
752
  "words_augmentation_join_char": " ",
753
  "cond_check_stopwords": True,
754
  "stopwords_min_cutoff": 0,
@@ -772,9 +810,12 @@ parameters_filtering_yo = {
772
  "strip_characters": special_characters_default,
773
  "number_words_min_cutoff": 1,
774
  "number_words_max_cutoff": 100000,
775
- "check_repetitions_removal": True,
776
- "repetitions_length": 10,
777
- "repetitions_max_cutoff": 0.106,
 
 
 
778
  "cond_check_special_characters": True,
779
  "special_characters": special_characters_default,
780
  "special_characters_max_cutoff": 0.3,
@@ -803,14 +844,17 @@ parameters_filtering_zh = {
803
  "strip_characters": special_characters_default,
804
  "number_words_min_cutoff": 1,
805
  "number_words_max_cutoff": 100000,
806
- "check_repetitions_removal": True,
807
- "repetitions_length": 10,
808
- "repetitions_max_cutoff": 0.106,
 
 
 
809
  "cond_check_special_characters": True,
810
  "special_characters": special_characters_default,
811
  "special_characters_max_cutoff": 0.4,
812
  "cond_words_augmentation": True,
813
- "words_augmentation_group_sizes": [2, 3],
814
  "words_augmentation_join_char": "",
815
  "cond_check_stopwords": False,
816
  "stopwords_min_cutoff": 0,
@@ -841,7 +885,6 @@ parameters_filtering = {
841
  "ml": parameters_filtering_ml,
842
  "mr": parameters_filtering_mr,
843
  "pt": parameters_filtering_pt,
844
- "so": parameters_filtering_so,
845
  "sw": parameters_filtering_sw,
846
  "ta": parameters_filtering_ta,
847
  "te": parameters_filtering_te,
 
28
  "strip_characters": special_characters_default,
29
  "number_words_min_cutoff": 1,
30
  "number_words_max_cutoff": 100000,
31
+ "cond_check_character_repetition_removal": True,
32
+ "character_repetition_length": 10,
33
+ "character_repetition_max_cutoff": 0.106,
34
+ "cond_check_word_repetition_removal": True,
35
+ "word_repetition_length": 5,
36
+ "word_repetition_max_cutoff": 0.19,
37
  "cond_check_special_characters": True,
38
  "special_characters": special_characters_default,
39
  "special_characters_max_cutoff": 0.4,
 
62
  "strip_characters": special_characters_default,
63
  "number_words_min_cutoff": 1,
64
  "number_words_max_cutoff": 100000,
65
+ "cond_check_character_repetition_removal": True,
66
+ "character_repetition_length": 10,
67
+ "character_repetition_max_cutoff": 0.106,
68
+ "cond_check_word_repetition_removal": True,
69
+ "word_repetition_length": 5,
70
+ "word_repetition_max_cutoff": 0.19,
71
  "cond_check_special_characters": True,
72
  "special_characters": special_characters_default,
73
  "special_characters_max_cutoff": 0.3,
 
96
  "strip_characters": special_characters_default,
97
  "number_words_min_cutoff": 1,
98
  "number_words_max_cutoff": 100000,
99
+ "cond_check_character_repetition_removal": True,
100
+ "character_repetition_length": 10,
101
+ "character_repetition_max_cutoff": 0.106,
102
+ "cond_check_word_repetition_removal": True,
103
+ "word_repetition_length": 5,
104
+ "word_repetition_max_cutoff": 0.19,
105
  "cond_check_special_characters": True,
106
  "special_characters": special_characters_default,
107
  "special_characters_max_cutoff": 0.45,
 
130
  "strip_characters": special_characters_default,
131
  "number_words_min_cutoff": 1,
132
  "number_words_max_cutoff": 100000,
133
+ "cond_check_character_repetition_removal": True,
134
+ "character_repetition_length": 10,
135
+ "character_repetition_max_cutoff": 0.106,
136
+ "cond_check_word_repetition_removal": True,
137
+ "word_repetition_length": 5,
138
+ "word_repetition_max_cutoff": 0.19,
139
  "cond_check_special_characters": True,
140
  "special_characters": special_characters_default,
141
  "special_characters_max_cutoff": 0.5,
 
164
  "strip_characters": special_characters_default,
165
  "number_words_min_cutoff": 1,
166
  "number_words_max_cutoff": 100000,
167
+ "cond_check_character_repetition_removal": True,
168
+ "character_repetition_length": 10,
169
+ "character_repetition_max_cutoff": 0.106,
170
+ "cond_check_word_repetition_removal": True,
171
+ "word_repetition_length": 5,
172
+ "word_repetition_max_cutoff": 0.19,
173
  "cond_check_special_characters": True,
174
  "special_characters": special_characters_default,
175
  "special_characters_max_cutoff": 0.25,
 
198
  "strip_characters": special_characters_default,
199
  "number_words_min_cutoff": 1,
200
  "number_words_max_cutoff": 100000,
201
+ "cond_check_character_repetition_removal": True,
202
+ "character_repetition_length": 10,
203
+ "character_repetition_max_cutoff": 0.106,
204
+ "cond_check_word_repetition_removal": True,
205
+ "word_repetition_length": 5,
206
+ "word_repetition_max_cutoff": 0.19,
207
  "cond_check_special_characters": True,
208
  "special_characters": special_characters_default,
209
  "special_characters_max_cutoff": 0.275,
 
232
  "strip_characters": special_characters_default,
233
  "number_words_min_cutoff": 1,
234
  "number_words_max_cutoff": 100000,
235
+ "cond_check_character_repetition_removal": True,
236
+ "character_repetition_length": 10,
237
+ "character_repetition_max_cutoff": 0.106,
238
+ "cond_check_word_repetition_removal": True,
239
+ "word_repetition_length": 5,
240
+ "word_repetition_max_cutoff": 0.19,
241
  "cond_check_special_characters": True,
242
  "special_characters": special_characters_default,
243
  "special_characters_max_cutoff": 0.35,
 
266
  "strip_characters": special_characters_default,
267
  "number_words_min_cutoff": 20,
268
  "number_words_max_cutoff": 100000,
269
+ "cond_check_character_repetition_removal": True,
270
+ "character_repetition_length": 10,
271
+ "character_repetition_max_cutoff": 0.106,
272
+ "cond_check_word_repetition_removal": True,
273
+ "word_repetition_length": 5,
274
+ "word_repetition_max_cutoff": 0.19,
275
  "cond_check_special_characters": True,
276
  "special_characters": special_characters_default,
277
  "special_characters_max_cutoff": 0.4,
 
300
  "strip_characters": special_characters_default,
301
  "number_words_min_cutoff": 1,
302
  "number_words_max_cutoff": 100000,
303
+ "cond_check_character_repetition_removal": True,
304
+ "character_repetition_length": 10,
305
+ "character_repetition_max_cutoff": 0.106,
306
+ "cond_check_word_repetition_removal": True,
307
+ "word_repetition_length": 5,
308
+ "word_repetition_max_cutoff": 0.19,
309
  "cond_check_special_characters": True,
310
  "special_characters": special_characters_default,
311
  "special_characters_max_cutoff": 0.3,
 
334
  "strip_characters": special_characters_default,
335
  "number_words_min_cutoff": 1,
336
  "number_words_max_cutoff": 100000,
337
+ "cond_check_character_repetition_removal": True,
338
+ "character_repetition_length": 10,
339
+ "character_repetition_max_cutoff": 0.106,
340
+ "cond_check_word_repetition_removal": True,
341
+ "word_repetition_length": 5,
342
+ "word_repetition_max_cutoff": 0.19,
343
  "cond_check_special_characters": True,
344
  "special_characters": special_characters_default,
345
  "special_characters_max_cutoff": 0.3,
 
368
  "strip_characters": special_characters_default,
369
  "number_words_min_cutoff": 1,
370
  "number_words_max_cutoff": 100000,
371
+ "cond_check_character_repetition_removal": True,
372
+ "character_repetition_length": 10,
373
+ "character_repetition_max_cutoff": 0.106,
374
+ "cond_check_word_repetition_removal": True,
375
+ "word_repetition_length": 5,
376
+ "word_repetition_max_cutoff": 0.19,
377
  "cond_check_special_characters": True,
378
  "special_characters": special_characters_default,
379
  "special_characters_max_cutoff": 0.35,
 
402
  "strip_characters": special_characters_default,
403
  "number_words_min_cutoff": 1,
404
  "number_words_max_cutoff": 100000,
405
+ "cond_check_character_repetition_removal": True,
406
+ "character_repetition_length": 10,
407
+ "character_repetition_max_cutoff": 0.106,
408
+ "cond_check_word_repetition_removal": True,
409
+ "word_repetition_length": 5,
410
+ "word_repetition_max_cutoff": 0.19,
411
  "cond_check_special_characters": True,
412
  "special_characters": special_characters_default,
413
  "special_characters_max_cutoff": 0.3,
 
436
  "strip_characters": special_characters_default,
437
  "number_words_min_cutoff": 1,
438
  "number_words_max_cutoff": 100000,
439
+ "cond_check_character_repetition_removal": True,
440
+ "character_repetition_length": 10,
441
+ "character_repetition_max_cutoff": 0.106,
442
+ "cond_check_word_repetition_removal": True,
443
+ "word_repetition_length": 5,
444
+ "word_repetition_max_cutoff": 0.19,
445
  "cond_check_special_characters": True,
446
  "special_characters": special_characters_default,
447
  "special_characters_max_cutoff": 0.35,
 
470
  "strip_characters": special_characters_default,
471
  "number_words_min_cutoff": 1,
472
  "number_words_max_cutoff": 100000,
473
+ "cond_check_character_repetition_removal": True,
474
+ "character_repetition_length": 10,
475
+ "character_repetition_max_cutoff": 0.106,
476
+ "cond_check_word_repetition_removal": True,
477
+ "word_repetition_length": 5,
478
+ "word_repetition_max_cutoff": 0.19,
479
  "cond_check_special_characters": True,
480
  "special_characters": special_characters_default,
481
  "special_characters_max_cutoff": 0.25,
 
504
  "strip_characters": special_characters_default,
505
  "number_words_min_cutoff": 1,
506
  "number_words_max_cutoff": 100000,
507
+ "cond_check_character_repetition_removal": True,
508
+ "character_repetition_length": 10,
509
+ "character_repetition_max_cutoff": 0.106,
510
+ "cond_check_word_repetition_removal": True,
511
+ "word_repetition_length": 5,
512
+ "word_repetition_max_cutoff": 0.19,
513
  "cond_check_special_characters": True,
514
  "special_characters": special_characters_default,
515
  "special_characters_max_cutoff": 0.25,
 
538
  "strip_characters": special_characters_default,
539
  "number_words_min_cutoff": 1,
540
  "number_words_max_cutoff": 100000,
541
+ "cond_check_character_repetition_removal": True,
542
+ "character_repetition_length": 10,
543
+ "character_repetition_max_cutoff": 0.106,
544
+ "cond_check_word_repetition_removal": True,
545
+ "word_repetition_length": 5,
546
+ "word_repetition_max_cutoff": 0.19,
547
  "cond_check_special_characters": True,
548
  "special_characters": special_characters_default,
549
  "special_characters_max_cutoff": 0.2,
 
572
  "strip_characters": special_characters_default,
573
  "number_words_min_cutoff": 1,
574
  "number_words_max_cutoff": 100000,
575
+ "cond_check_character_repetition_removal": True,
576
+ "character_repetition_length": 10,
577
+ "character_repetition_max_cutoff": 0.106,
578
+ "cond_check_word_repetition_removal": True,
579
+ "word_repetition_length": 5,
580
+ "word_repetition_max_cutoff": 0.19,
581
  "cond_check_special_characters": True,
582
  "special_characters": special_characters_default,
583
  "special_characters_max_cutoff": 0.25,
 
606
  "strip_characters": special_characters_default,
607
  "number_words_min_cutoff": 1,
608
  "number_words_max_cutoff": 100000,
609
+ "cond_check_character_repetition_removal": True,
610
+ "character_repetition_length": 10,
611
+ "character_repetition_max_cutoff": 0.106,
612
+ "cond_check_word_repetition_removal": True,
613
+ "word_repetition_length": 5,
614
+ "word_repetition_max_cutoff": 0.19,
615
  "cond_check_special_characters": True,
616
  "special_characters": special_characters_default,
617
  "special_characters_max_cutoff": 0.3,
 
628
  "perplexity_max_cutoff": 3000000,
629
  }
630
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
631
  parameters_filtering_sw = {
632
  "cond_uniform_whitespace": True,
633
  "cond_replace_unicode_punctuation": False,
 
640
  "strip_characters": special_characters_default,
641
  "number_words_min_cutoff": 1,
642
  "number_words_max_cutoff": 100000,
643
+ "cond_check_character_repetition_removal": True,
644
+ "character_repetition_length": 10,
645
+ "character_repetition_max_cutoff": 0.106,
646
+ "cond_check_word_repetition_removal": True,
647
+ "word_repetition_length": 5,
648
+ "word_repetition_max_cutoff": 0.19,
649
  "cond_check_special_characters": True,
650
  "special_characters": special_characters_default,
651
  "special_characters_max_cutoff": 0.275,
 
674
  "strip_characters": special_characters_default,
675
  "number_words_min_cutoff": 1,
676
  "number_words_max_cutoff": 100000,
677
+ "cond_check_character_repetition_removal": True,
678
+ "character_repetition_length": 10,
679
+ "character_repetition_max_cutoff": 0.106,
680
+ "cond_check_word_repetition_removal": True,
681
+ "word_repetition_length": 5,
682
+ "word_repetition_max_cutoff": 0.19,
683
  "cond_check_special_characters": True,
684
  "special_characters": special_characters_default,
685
  "special_characters_max_cutoff": 0.25,
 
708
  "strip_characters": special_characters_default,
709
  "number_words_min_cutoff": 1,
710
  "number_words_max_cutoff": 100000,
711
+ "cond_check_character_repetition_removal": True,
712
+ "character_repetition_length": 10,
713
+ "character_repetition_max_cutoff": 0.106,
714
+ "cond_check_word_repetition_removal": True,
715
+ "word_repetition_length": 5,
716
+ "word_repetition_max_cutoff": 0.19,
717
  "cond_check_special_characters": True,
718
  "special_characters": special_characters_default,
719
  "special_characters_max_cutoff": 0.25,
 
742
  "strip_characters": special_characters_default,
743
  "number_words_min_cutoff": 1,
744
  "number_words_max_cutoff": 100000,
745
+ "cond_check_character_repetition_removal": True,
746
+ "character_repetition_length": 10,
747
+ "character_repetition_max_cutoff": 0.106,
748
+ "cond_check_word_repetition_removal": True,
749
+ "word_repetition_length": 5,
750
+ "word_repetition_max_cutoff": 0.19,
751
  "cond_check_special_characters": True,
752
  "special_characters": special_characters_default,
753
  "special_characters_max_cutoff": 0.4,
 
776
  "strip_characters": special_characters_default,
777
  "number_words_min_cutoff": 1,
778
  "number_words_max_cutoff": 100000,
779
+ "cond_check_character_repetition_removal": True,
780
+ "character_repetition_length": 10,
781
+ "character_repetition_max_cutoff": 0.106,
782
+ "cond_check_word_repetition_removal": True,
783
+ "word_repetition_length": 5,
784
+ "word_repetition_max_cutoff": 0.19,
785
  "cond_check_special_characters": True,
786
  "special_characters": special_characters_default,
787
  "special_characters_max_cutoff": 0.35,
788
  "cond_words_augmentation": True,
789
+ "words_augmentation_group_sizes": [2],
790
  "words_augmentation_join_char": " ",
791
  "cond_check_stopwords": True,
792
  "stopwords_min_cutoff": 0,
 
810
  "strip_characters": special_characters_default,
811
  "number_words_min_cutoff": 1,
812
  "number_words_max_cutoff": 100000,
813
+ "cond_check_character_repetition_removal": True,
814
+ "character_repetition_length": 10,
815
+ "character_repetition_max_cutoff": 0.106,
816
+ "cond_check_word_repetition_removal": True,
817
+ "word_repetition_length": 5,
818
+ "word_repetition_max_cutoff": 0.19,
819
  "cond_check_special_characters": True,
820
  "special_characters": special_characters_default,
821
  "special_characters_max_cutoff": 0.3,
 
844
  "strip_characters": special_characters_default,
845
  "number_words_min_cutoff": 1,
846
  "number_words_max_cutoff": 100000,
847
+ "cond_check_character_repetition_removal": True,
848
+ "character_repetition_length": 10,
849
+ "character_repetition_max_cutoff": 0.106,
850
+ "cond_check_word_repetition_removal": True,
851
+ "word_repetition_length": 5,
852
+ "word_repetition_max_cutoff": 0.19,
853
  "cond_check_special_characters": True,
854
  "special_characters": special_characters_default,
855
  "special_characters_max_cutoff": 0.4,
856
  "cond_words_augmentation": True,
857
+ "words_augmentation_group_sizes": [2],
858
  "words_augmentation_join_char": "",
859
  "cond_check_stopwords": False,
860
  "stopwords_min_cutoff": 0,
 
885
  "ml": parameters_filtering_ml,
886
  "mr": parameters_filtering_mr,
887
  "pt": parameters_filtering_pt,
 
888
  "sw": parameters_filtering_sw,
889
  "ta": parameters_filtering_ta,
890
  "te": parameters_filtering_te,
stopwords.py CHANGED
The diff for this file is too large to render. See raw diff
 
zh_examples_with_stats.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b401f28bfed66f7380c439b311122b734bbf9a1f3012a331a502a83ac18c5ff
3
- size 74919679
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:318cf4641a46c9c7c16fc77171f28475cb8e96935201d3541d493b5231e8d53a
3
+ size 63524762