# (Page-scrape residue, not part of the module) Spaces: Runtime error
import string

import emoji

# Baseline special characters: ASCII punctuation, digits and whitespace taken
# from the standard-library ``string`` constants.
main_special_characters = string.punctuation + string.digits + string.whitespace
# Extra symbols collected by hand. The adjacent literals are concatenated by
# the parser into a single string; every character becomes one set element
# further down.
other_special_characters = (
    " ’“”–ー一▬…✦�£•€«»°·═"
    "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰ ‑≤≥‖"
    "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
    "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
    "」﴾》"
)

# The ``emoji`` package exposed its table as ``UNICODE_EMOJI[<lang>]`` in the
# 1.x series and as ``EMOJI_DATA`` from 2.0 on (where ``UNICODE_EMOJI`` no
# longer exists). Reading the old attribute unconditionally raises
# AttributeError on current releases, so fall back to the newer table.
if hasattr(emoji, "UNICODE_EMOJI"):
    emoji_characters = list(emoji.UNICODE_EMOJI["en"].keys())
else:
    emoji_characters = list(emoji.EMOJI_DATA.keys())

# Backward compatibility: this module has always rebound the name ``emoji``
# to the list of emoji strings (shadowing the imported package). Keep that
# binding so existing importers of ``emoji`` from this module still work.
emoji = emoji_characters

# One shared set used by every parameter dict below as both
# "strip_characters" and "special_characters". Emoji are added as whole
# strings, so multi-codepoint emoji stay single elements.
special_characters_default = set(main_special_characters + other_special_characters)
special_characters_default.update(emoji_characters)
# Filtering parameters stored under the "default" key of
# `parameters_filtering`. Each "cond_*" / "check_*" flag sits next to the
# cutoff(s) it is paired with; how they are applied is decided by the
# consumer of this dict, which is not part of this module.
parameters_filtering_default = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": False,
    "length_word_max_cutoff": 50,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.4,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": False,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.70,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "af" key of `parameters_filtering`.
parameters_filtering_af = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 25,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.3,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.6,
    "cond_check_perplexity": True,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "ar" key of `parameters_filtering`.
parameters_filtering_ar = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 25,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.45,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": True,
    "perplexity_max_cutoff": 1000000,
}
# Filtering parameters stored under the "arz" key of `parameters_filtering`.
parameters_filtering_arz = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 25,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.5,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "as" key of `parameters_filtering`.
parameters_filtering_as = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 25,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.25,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "bn" key of `parameters_filtering`.
parameters_filtering_bn = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 30,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.275,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0.05,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 575000,
}
# Filtering parameters stored under the "ca" key of `parameters_filtering`.
parameters_filtering_ca = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 30,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.35,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": True,
    "perplexity_max_cutoff": 1750000,
}
# Filtering parameters stored under the "en" key of `parameters_filtering`.
# Every "cond_check_*" flag in this dict is switched on, along with the
# incorrect-substring and long-word removal flags.
parameters_filtering_en = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": True,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 25,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 20,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.4,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0.3,
    "cond_check_flagged_words": True,
    "flagged_words_max_cutoff": 0.045,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.80,
    "cond_check_perplexity": True,
    "perplexity_max_cutoff": 2500,
}
# Filtering parameters stored under the "es" key of `parameters_filtering`.
parameters_filtering_es = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 30,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.3,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0.2,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": True,
    "perplexity_max_cutoff": 2500000,
}
# Filtering parameters stored under the "eu" key of `parameters_filtering`.
parameters_filtering_eu = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 35,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.3,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "fr" key of `parameters_filtering`.
parameters_filtering_fr = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 30,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.35,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0.15,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": True,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "gu" key of `parameters_filtering`.
parameters_filtering_gu = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 30,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.3,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": True,
    "perplexity_max_cutoff": 250000,
}
# Filtering parameters stored under the "hi" key of `parameters_filtering`.
parameters_filtering_hi = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 25,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.35,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": True,
    "perplexity_max_cutoff": 600000,
}
# Filtering parameters stored under the "id" key of `parameters_filtering`.
parameters_filtering_id = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 30,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.25,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0.25,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": True,
    "perplexity_max_cutoff": 2500000,
}
# Filtering parameters stored under the "kn" key of `parameters_filtering`.
parameters_filtering_kn = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 50,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.25,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": True,
    "perplexity_max_cutoff": 400000,
}
# Filtering parameters stored under the "ml" key of `parameters_filtering`.
parameters_filtering_ml = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 50,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.2,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": True,
    "perplexity_max_cutoff": 1600000,
}
# Filtering parameters stored under the "mr" key of `parameters_filtering`.
parameters_filtering_mr = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 30,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.25,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": True,
    "perplexity_max_cutoff": 425000,
}
# Filtering parameters stored under the "pt" key of `parameters_filtering`.
parameters_filtering_pt = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 30,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.3,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0.15,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": True,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "so" key of `parameters_filtering`.
# Long-word removal, the stopword check and the perplexity check are all
# switched off in this dict.
parameters_filtering_so = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": False,
    "length_word_max_cutoff": 1000,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.3,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": False,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "sw" key of `parameters_filtering`.
parameters_filtering_sw = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 30,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.275,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "ta" key of `parameters_filtering`.
parameters_filtering_ta = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 50,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.25,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "te" key of `parameters_filtering`.
parameters_filtering_te = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 35,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.25,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "ur" key of `parameters_filtering`.
parameters_filtering_ur = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 30,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.4,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "vi" key of `parameters_filtering`.
# Words augmentation is switched on here with group sizes 2 and 3 and a
# single space as the join character.
parameters_filtering_vi = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 30,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.35,
    "cond_words_augmentation": True,
    "words_augmentation_group_sizes": [2, 3],
    "words_augmentation_join_char": " ",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "yo" key of `parameters_filtering`.
parameters_filtering_yo = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": True,
    "length_word_max_cutoff": 30,
    "cond_check_number_words": True,
    "tokenization": False,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.3,
    "cond_words_augmentation": False,
    "words_augmentation_group_sizes": [],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": True,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 3000000,
}
# Filtering parameters stored under the "zh" key of `parameters_filtering`.
# "tokenization" is True in this dict, and words augmentation is switched on
# with group sizes 2 and 3 joined by the empty string.
parameters_filtering_zh = {
    "cond_uniform_whitespace": True,
    "cond_replace_unicode_punctuation": False,
    "cond_remove_words_with_incorrect_substrings": False,
    "incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
    "cond_remove_long_words": False,
    "length_word_max_cutoff": 1000,
    "cond_check_number_words": True,
    "tokenization": True,
    "strip_characters": special_characters_default,
    "number_words_min_cutoff": 1,
    "number_words_max_cutoff": 100000,
    "check_repetitions_removal": True,
    "repetitions_length": 10,
    "repetitions_max_cutoff": 0.106,
    "cond_check_special_characters": True,
    "special_characters": special_characters_default,
    "special_characters_max_cutoff": 0.4,
    "cond_words_augmentation": True,
    "words_augmentation_group_sizes": [2, 3],
    "words_augmentation_join_char": "",
    "cond_check_stopwords": False,
    "stopwords_min_cutoff": 0,
    "cond_check_flagged_words": False,
    "flagged_words_max_cutoff": 0.2,
    "cond_check_lang_id": True,
    "lang_id_min_cutoff": 0.75,
    "cond_check_perplexity": False,
    "perplexity_max_cutoff": 3000000,
}
# Registry mapping a short key to its parameter dict. "default" holds the
# generic settings; the remaining keys look like language codes
# (presumably ISO 639 — confirm against the code that performs the lookup).
parameters_filtering = {
    "default": parameters_filtering_default,
    "af": parameters_filtering_af,
    "ar": parameters_filtering_ar,
    "arz": parameters_filtering_arz,
    "as": parameters_filtering_as,
    "bn": parameters_filtering_bn,
    "ca": parameters_filtering_ca,
    "en": parameters_filtering_en,
    "es": parameters_filtering_es,
    "eu": parameters_filtering_eu,
    "fr": parameters_filtering_fr,
    "gu": parameters_filtering_gu,
    "hi": parameters_filtering_hi,
    "id": parameters_filtering_id,
    "kn": parameters_filtering_kn,
    "ml": parameters_filtering_ml,
    "mr": parameters_filtering_mr,
    "pt": parameters_filtering_pt,
    "so": parameters_filtering_so,
    "sw": parameters_filtering_sw,
    "ta": parameters_filtering_ta,
    "te": parameters_filtering_te,
    "ur": parameters_filtering_ur,
    "vi": parameters_filtering_vi,
    "yo": parameters_filtering_yo,
    "zh": parameters_filtering_zh,
}