|
from src.rule_based_system.Rule import Rule |
|
|
|
from src.rule_based_system.TextLengthRule import TEXT_SIZE_LIMIT |
|
from src.rule_based_system.Verdict import Verdict |
|
|
|
|
|
class BadWordRule(Rule):
    """
    Rule that inspects a comment for words from a configured bad-word list.

    Matching is exact and case-sensitive on whitespace-separated tokens: a
    token such as "word!" is only detected when the configured list contains
    that exact string.

    Bad words obtained from corners of the internet you do not want to visit:
    - https://www.ensie.nl/scheldwoordenboek#
    - https://scheldwoorden.goedbegin.nl/
    - https://nl.wiktionary.org/wiki/Categorie:Scheldwoord_in_het_Nederlands
    - https://www.lannoo.be/sites/default/files/books/issuu/9789401453417.pdf
    - https://www.dutchmultimedia.nl/meest-verschrikkelijke-engelse-scheldwoorden/
    - https://www.dutchmultimedia.nl/scheldwoordenboek-1-000-den-nederlandse-scheldwoorden/
    - https://www.henkyspapiamento.com/10-papiaments-scheldwoorden-die-we-liever-niet-horen/
    - https://volkabulaire.nl/tag/scheldwoorden/
    - https://data.world/wordlists/dirty-naughty-obscene-and-otherwise-bad-words-in-dutch
    """

    # Class-level default; __init__ replaces it on every instance.
    bad_words = None

    def __init__(self, bad_words: list, strict: bool):
        """
        Store the word list and the strictness flag.

        :param bad_words: the words to look for in a comment.
        :param strict: flag returned by is_strict(); it also selects the
            wording used by get_rule_description().
        """
        self.bad_words = bad_words
        self.strict = strict

    def get_verdict(self, comment_text: str) -> Verdict:
        """
        Build a Verdict for the given comment.

        Only the first TEXT_SIZE_LIMIT characters are inspected. The
        truncated text is split on whitespace and each token is checked
        against the configured bad words.

        :param comment_text: the raw comment text.
        :return: a Verdict constructed from a boolean that is True when no
            bad word was detected, and the list of detected words.
        """
        truncated_text = comment_text[:TEXT_SIZE_LIMIT]
        detected = self.find_bad_words(truncated_text.split())
        return Verdict(len(detected) == 0, detected)

    def find_bad_words(self, text: list) -> list:
        """
        Return the tokens of ``text`` that occur in the configured bad words.

        Order and duplicates of the input are preserved: a bad word that
        appears twice in ``text`` is reported twice.

        :param text: the tokens to check.
        :return: the tokens that are configured bad words, in input order.
        """
        if not text:
            # Nothing to check; skip building the lookup altogether.
            return []

        # Membership tests on a list scan the whole list for every token.
        # Build a hash-based lookup once per call so each token costs O(1).
        # It is rebuilt on every call (not cached on the instance) so later
        # changes to self.bad_words are still honoured.
        lookup = self.bad_words
        if not isinstance(lookup, (set, frozenset)):
            lookup = frozenset(lookup)

        return [word for word in text if word in lookup]

    def is_strict(self) -> bool:
        """Return the strictness flag this rule was constructed with."""
        return self.strict

    def get_rule_description(self) -> str:
        """
        Return a human-readable description of this rule.

        The wording depends on is_strict(): 'strictly' for strict rules and
        'ambiguous' otherwise.
        """
        return "Comment text contained %s inappropriate words" % \
            ('strictly' if self.is_strict() else 'ambiguous')
|
|