import re

from urlextract import URLExtract

from src.start_up.start_up_bad_words_rule import create_bad_word_rule
from src.config import config
from src.rule_based_system.HTMLRule import HTMLRule
from src.rule_based_system.PersonalDetailsRule import PersonalDetailsRule
from src.rule_based_system.RuleBasedSystem import RuleBasedSystem
from src.rule_based_system.TextLengthRule import TextLengthRule
from src.rule_based_system.UrlRule import UrlRule


def create_strong_rbs() -> RuleBasedSystem:
    """Assemble the "strong" rule-based system.

    The system is built from four rules, in this order: a text-length rule,
    a URL rule backed by ``URLExtract``, a personal-details rule carrying a
    single e-mail pattern, and the bad-word rule created from
    ``config['bad_words_strict']``.

    Returns:
        A ``RuleBasedSystem`` wrapping those four rules.
    """
    email_patterns = [r'[\w.+-]+@[\w-]+\.[\w.-]+']

    rules = [
        # TODO: check whether it makes sense to add the length rule here;
        # 500 was our own chosen max length.
        TextLengthRule(),
        UrlRule(URLExtract()),
        # NOTE(review): the meaning of the trailing boolean is defined by
        # PersonalDetailsRule / create_bad_word_rule, not visible here.
        PersonalDetailsRule(email_patterns, True),
        create_bad_word_rule(config['bad_words_strict'], True),
    ]
    return RuleBasedSystem(rules)


def create_weak_rbs() -> RuleBasedSystem:
    """Assemble the "weak" rule-based system.

    The system is built from three rules, in this order: a personal-details
    rule carrying five phone-number patterns, an HTML rule, and the bad-word
    rule created from ``config['bad_words_ambiguous']``.

    Returns:
        A ``RuleBasedSystem`` wrapping those three rules.
    """
    # Fully anchored pattern (``^`` ... ``$``); handed over as a plain string,
    # i.e. it is NOT pre-compiled, unlike the patterns below.
    anchored_phone_pattern = r"(^\+[0-9]{2}|^\+[0-9]{2}\(0\)|^\(\+[0-9]{2}\)\(0\)|^00[0-9]{2}|^0)([0-9]{9}$|[0-9\-\s]{10}$)"

    # Digit-group patterns; consecutive groups may be separated by up to three
    # non-digit characters. Each one is compiled with ``re.S``.
    digit_group_patterns = (
        # home, local: 3-2-2 digits
        r".*?(\(?\d{3}\D{0,3}\d{2}\D{0,3}\d{2}).*?",
        # home: 3-3-2-2 digits
        r".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{2}\D{0,3}\d{2}).*?",
        # mobile: 2-3-3-2 digits
        r".*?(\(?\d{2}\D{0,3}\d{3}\D{0,3}\d{3}\D{0,3}\d{2}).*?",
        # mobile, international: 3-3-3-2 digits
        r".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{3}\D{0,3}\d{2}).*?",
    )

    # Resulting list deliberately keeps the original mix: one raw string
    # followed by four compiled pattern objects.
    phone_patterns = [anchored_phone_pattern]
    phone_patterns.extend(
        re.compile(pattern, re.S) for pattern in digit_group_patterns
    )

    rules = [
        # NOTE(review): the meaning of the trailing boolean is defined by
        # PersonalDetailsRule / create_bad_word_rule, not visible here.
        PersonalDetailsRule(phone_patterns, False),
        HTMLRule(),
        create_bad_word_rule(config['bad_words_ambiguous'], False),
    ]
    return RuleBasedSystem(rules)