Spaces:
Running
Running
from functools import lru_cache | |
from logging import getLogger | |
from typing import List, Optional | |
from .constant import ( | |
COMMON_SAFE_ASCII_CHARACTERS, | |
TRACE, | |
UNICODE_SECONDARY_RANGE_KEYWORD, | |
) | |
from .utils import ( | |
is_accentuated, | |
is_arabic, | |
is_arabic_isolated_form, | |
is_case_variable, | |
is_cjk, | |
is_emoticon, | |
is_hangul, | |
is_hiragana, | |
is_katakana, | |
is_latin, | |
is_punctuation, | |
is_separator, | |
is_symbol, | |
is_thai, | |
is_unprintable, | |
remove_accent, | |
unicode_range, | |
) | |
class MessDetectorPlugin:
    """
    Contract shared by every mess detection plugin.

    Concrete detectors subclass this type and override all four methods;
    the versions defined here do nothing but raise NotImplementedError.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character is relevant to this detector,
        i.e. whether it should be handed to feed().
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Consume one character and update the detector's internal state.
        This is where the logic deciding that text looks chaotic belongs.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Bring the detector back to its initial state.
        """
        raise NotImplementedError

    def ratio(self) -> float:
        """
        Return the chaos ratio derived from everything feed() has seen.
        The value must never be negative; it has no upper bound.
        """
        raise NotImplementedError  # pragma: nocover
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Detect text in which punctuation and symbols take an unusually large
    share of the printable characters.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Previous character fed, used to ignore immediate repetitions.
        self._last_printable_char: Optional[str] = None
        # Not read anywhere in this class; kept for attribute compatibility.
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        """Only printable characters are analysed."""
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character, and weigh it as punctuation (+1) or symbol (+2)
        unless it repeats the previous character or is listed in
        COMMON_SAFE_ASCII_CHARACTERS.
        """
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Symbols weigh twice as much as punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        """Restore every attribute to the value set by __init__."""
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Without clearing these, the first character fed after a reset could
        # be mistaken for a repetition of the last one seen before the reset.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    def ratio(self) -> float:
        """
        Return (punctuation + symbol weight) / characters seen, or 0.0 when
        nothing was fed or when the share stays below 0.3.
        """
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Measure the share of alphabetic characters that is_accentuated() flags.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        """Only alphabetic characters are analysed."""
        return character.isalpha()

    def feed(self, character: str) -> None:
        """Count the character, and count it again if it is accentuated."""
        self._character_count += 1
        self._accentuated_count += 1 if is_accentuated(character) else 0

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._accentuated_count = 0
        self._character_count = 0

    def ratio(self) -> float:
        """
        Return accentuated / total once at least 8 characters were fed and
        the share reaches 0.35; otherwise 0.0.
        """
        total = self._character_count
        if total < 8:
            return 0.0

        share: float = self._accentuated_count / total
        if share < 0.35:
            return 0.0
        return share
class UnprintablePlugin(MessDetectorPlugin):
    """
    Detect text containing characters that is_unprintable() flags.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is analysed."""
        return True

    def feed(self, character: str) -> None:
        """Count the character, and count it as unprintable when flagged."""
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._unprintable_count = 0
        # The total must be cleared as well, otherwise ratios computed after
        # a reset are diluted by characters counted before it.
        self._character_count = 0

    def ratio(self) -> float:
        """
        Return (unprintable count * 8) / characters seen, or 0.0 when nothing
        was fed. Each unprintable character is weighted eight times.
        """
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Detect accentuated Latin letters that directly follow one another.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0
        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """Only alphabetic characters that is_latin() accepts are analysed."""
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        """
        Count the character and, when both it and the previous one are
        accentuated, add one point if both are upper case and one more if
        they share the same base letter once accents are removed.
        """
        self._character_count += 1
        previous = self._last_latin_character

        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Even worse when the same letter is repeated with another accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        """Forget the previous character and zero both counters."""
        self._last_latin_character = None
        self._character_count = 0
        self._successive_count = 0

    def ratio(self) -> float:
        """
        Return (successive count * 2) / characters seen, or 0.0 when nothing
        was fed.
        """
        total = self._character_count
        if total == 0:
            return 0.0

        return (self._successive_count * 2) / total
class SuspiciousRange(MessDetectorPlugin):
    """
    Detect adjacent characters whose Unicode ranges look suspicious together,
    as judged by is_suspiciously_successive_range().
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """Only printable characters are analysed."""
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character, then compare its Unicode range with the one of
        the previous significant character. Whitespace, punctuation and
        members of COMMON_SAFE_ASCII_CHARACTERS break the chain.
        """
        self._character_count += 1

        breaks_chain = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )
        if breaks_chain:
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        if previous is None:
            # First significant character of a chain: nothing to compare yet.
            self._last_printable_seen = character
            return

        previous_range: Optional[str] = unicode_range(previous)
        current_range: Optional[str] = unicode_range(character)

        if is_suspiciously_successive_range(previous_range, current_range):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        """Forget the previous character and zero both counters."""
        self._last_printable_seen = None
        self._suspicious_successive_range_count = 0
        self._character_count = 0

    def ratio(self) -> float:
        """
        Return (suspicious pair count * 2) / characters seen, or 0.0 while
        24 characters or fewer have been fed.
        """
        total = self._character_count
        if total <= 24:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / total
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Split the text into words and measure how many characters belong to
    words that look abnormal (too many accents, embedded symbols, a trailing
    upper case accentuated letter, or very long foreign-looking words).
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        # State of the word currently being accumulated.
        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is analysed."""
        return True

    def feed(self, character: str) -> None:
        """
        Accumulate alphabetic characters into the current word; close the
        word on whitespace, punctuation or a separator; mark the word as bad
        when a symbol shows up inside it.
        """
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return

        # Nothing buffered: a non-alphabetic character cannot end or taint a word.
        if not self._buffer:
            return

        if character.isspace() or is_punctuation(character) or is_separator(character):
            self._close_current_word()
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def _close_current_word(self) -> None:
        """Score the buffered word, update the counters and clear the buffer."""
        buffer_length: int = len(self._buffer)

        self._word_count += 1
        self._character_count += buffer_length

        if buffer_length >= 4:
            if self._buffer_accent_count / buffer_length > 0.34:
                self._is_current_word_bad = True

            # Word/Buffer ending with an upper case accentuated letter are so rare,
            # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
            last_character = self._buffer[-1]
            if (
                is_accentuated(last_character)
                and last_character.isupper()
                and not all(c.isupper() for c in self._buffer)
            ):
                self._foreign_long_count += 1
                self._is_current_word_bad = True

        if buffer_length >= 24 and self._foreign_long_watch:
            # A long word with a few upper case letters (at most 30%) is
            # probably camelCased rather than garbage.
            upper_count = sum(1 for c in self._buffer if c.isupper())
            probable_camel_cased: bool = (
                upper_count > 0 and upper_count / buffer_length <= 0.3
            )

            if not probable_camel_cased:
                self._foreign_long_count += 1
                self._is_current_word_bad = True

        if self._is_current_word_bad:
            self._bad_word_count += 1
            self._bad_character_count += buffer_length
            self._is_current_word_bad = False

        self._foreign_long_watch = False
        self._buffer = ""
        self._buffer_accent_count = 0

    def reset(self) -> None:  # pragma: no cover
        """Restore every attribute to the value set by __init__."""
        self._buffer = ""
        # Must be cleared together with the buffer, otherwise accents from a
        # discarded partial word would be attributed to the next word.
        self._buffer_accent_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    def ratio(self) -> float:
        """
        Return bad characters / characters counted in closed words.

        Yields 0.0 while 10 words or fewer were closed and no foreign-long
        word was recorded, or when no character was counted at all.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0
        if self._character_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings frequently render the stop character wrongly
    when the content does not fit, which is easy to spot: look for an overuse
    of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is analysed."""
        return True

    def feed(self, character: str) -> None:
        """
        Count '丅' / '丄' as wrong stops; count any other character that
        is_cjk() accepts as a CJK character.
        """
        if character == "丅" or character == "丄":
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._cjk_character_count = 0
        self._wrong_stop_count = 0

    def ratio(self) -> float:
        """
        Return wrong stops / CJK characters, or 0.0 while fewer than 16 CJK
        characters have been seen.
        """
        cjk_total = self._cjk_character_count
        if cjk_total < 16:
            return 0.0

        return self._wrong_stop_count / cjk_total
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Detect chunks of cased letters in which upper and lower case keep
    alternating from one letter to the next.
    """

    def __init__(self) -> None:
        # True right after a single case flip; a second consecutive flip scores.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        """Every character is analysed."""
        return True

    def feed(self, character: str) -> None:
        """
        Track case flips between consecutive characters of the current chunk;
        a character that is not a cased letter closes a non-empty chunk.
        """
        is_concerned = character.isalpha() and is_case_variable(character)

        if is_concerned is False and self._character_count_since_last_sep > 0:
            self._close_chunk(character)
            return

        if self._current_ascii_only and not character.isascii():
            self._current_ascii_only = False

        previous = self._last_alpha_seen
        if previous is not None:
            case_flipped = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )
            if not case_flipped:
                self._buf = False
            elif self._buf is True:
                # Second flip in a row: record it and start over.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                self._buf = True

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def _close_chunk(self, separator: str) -> None:
        """
        End the current chunk on the given separator character: keep its score
        only when the chunk is at most 64 characters long, the separator is
        not a digit and the chunk held a non-ASCII character; then clear the
        per-chunk state.
        """
        keep_score = (
            self._character_count_since_last_sep <= 64
            and separator.isdigit() is False
            and self._current_ascii_only is False
        )
        if keep_score:
            self._successive_upper_lower_count_final += (
                self._successive_upper_lower_count
            )

        self._successive_upper_lower_count = 0
        self._character_count_since_last_sep = 0
        self._last_alpha_seen = None
        self._buf = False
        # The separator itself still counts as a character seen.
        self._character_count += 1
        self._current_ascii_only = True

    def reset(self) -> None:  # pragma: no cover
        """Restore every attribute to the value set by __init__."""
        self._buf = False
        self._current_ascii_only = True
        self._last_alpha_seen = None
        self._successive_upper_lower_count_final = 0
        self._successive_upper_lower_count = 0
        self._character_count_since_last_sep = 0
        self._character_count = 0

    def ratio(self) -> float:
        """
        Return the retained flip score / characters seen, or 0.0 when nothing
        was fed.
        """
        total = self._character_count
        if total == 0:
            return 0.0

        return self._successive_upper_lower_count_final / total
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Measure the share of Arabic characters that is_arabic_isolated_form()
    flags.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._isolated_form_count = 0
        self._character_count = 0

    def eligible(self, character: str) -> bool:
        """Only characters that is_arabic() accepts are analysed."""
        return is_arabic(character)

    def feed(self, character: str) -> None:
        """Count the character, and count it again if it is an isolated form."""
        self._character_count += 1
        self._isolated_form_count += 1 if is_arabic_isolated_form(character) else 0

    def ratio(self) -> float:
        """
        Return isolated forms / Arabic characters seen, or 0.0 while fewer
        than 8 characters have been fed.
        """
        total = self._character_count
        if total < 8:
            return 0.0

        return self._isolated_form_count / total
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Decide whether two Unicode range names, belonging to characters seen next
    to each other, form a suspicious combination.

    Returns True when either range is None or when none of the known
    legitimate combinations below applies; False otherwise.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    first, second = unicode_range_a, unicode_range_b

    def in_either(keyword: str) -> bool:
        # True when the keyword appears in at least one of the two range names.
        return keyword in first or keyword in second

    if first == second:
        return False

    if "Latin" in first and "Latin" in second:
        return False

    if in_either("Emoticons"):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if in_either("Latin") and in_either("Combining"):
        return False

    # Ranges sharing a word of their name are related, unless that word is
    # one of the secondary keywords.
    words_of_second = second.split(" ")
    if any(
        word in words_of_second
        for word in first.split(" ")
        if word not in UNICODE_SECONDARY_RANGE_KEYWORD
    ):
        return False

    kana_ranges = ("Hiragana", "Katakana")
    first_is_kana = first in kana_ranges
    second_is_kana = second in kana_ranges
    involves_cjk = in_either("CJK")
    involves_basic_latin = first == "Basic Latin" or second == "Basic Latin"

    # Japanese Exception
    if (first_is_kana or second_is_kana) and involves_cjk:
        return False
    if first_is_kana and second_is_kana:
        return False

    if in_either("Hangul") and (involves_cjk or involves_basic_latin):
        return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    # (Two kana ranges were already accepted above, so only CJK remains here.)
    if involves_cjk and (
        in_either("Punctuation") or in_either("Forms") or involves_basic_latin
    ):
        return False

    return True
def _detector_ratio(detector: "MessDetectorPlugin") -> float:
    """
    Return the current ratio of a detector, whether ``ratio`` is exposed as a
    plain method or as a property.
    """
    value = detector.ratio
    return value() if callable(value) else value


def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.

    One instance of every direct subclass of MessDetectorPlugin is fed each
    character of the sequence, followed by a final newline. At regular
    intervals (every 32, 64 or 128 characters depending on the length) and on
    the last character, the detector ratios are summed; the loop stops as
    soon as that sum reaches ``maximum_threshold``.

    :param decoded_sequence: The text to analyse.
    :param maximum_threshold: Sum of ratios at which the analysis stops early.
    :param debug: When True, log the details at TRACE level on the
        "charset_normalizer" logger.
    :return: The last computed sum of detector ratios, rounded to 3 decimals.
    """
    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    # +1 accounts for the newline appended below.
    length: int = len(decoded_sequence) + 1
    last_index: int = length - 1

    # Despite its name this holds the SUM of the detector ratios.
    mean_mess_ratio: float = 0.0

    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    # The trailing newline lets word-based detectors close their last word.
    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == last_index:
            # ratio must be evaluated, not referenced: summing bound methods
            # raises TypeError.
            mean_mess_ratio = sum(_detector_ratio(dt) for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {_detector_ratio(dt)}")

    return round(mean_mess_ratio, 3)