m3hrdadfi's picture
Prepare for new model
a867f36
raw
history blame
1.79 kB
import re
import string
# Punctuation and symbol characters that are deleted from the text.
# Duplicate entries are harmless: everything ends up in one regex character class.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬", 'ٔ', ",", "?",
    ".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
]
# Escape every entry before building the character class. Joined raw, the "-"
# that sits between "!" and ";" is read as the range "!-;" (U+0021..U+003B),
# which also deletes the digits 0-9 and "$", "&", "*", "+", "/" even though
# none of them is listed above.
chars_to_ignore = f"""[{"".join(map(re.escape, chars_to_ignore))}]"""
# Invisible / combining code points, each replaced by a single plain space:
#   U+200C zero width non-joiner      U+200D zero width joiner
#   U+200E left-to-right mark         U+200F right-to-left mark
#   U+FEFF zero width no-break space  U+0307 combining dot above
dictionary_mapping = dict.fromkeys(
    ("\u200c", "\u200d", "\u200e", "\u200f", "\ufeff", "\u0307"),
    " ",
)
def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each mapping key in ``text`` with its value.

    Args:
        text: Input value; it is converted with ``str()`` before substitution.
        chars_to_mapping: Dict mapping literal substrings to their replacements.

    Returns:
        The stringified input with all keys substituted in a single regex pass.
    """
    text = str(text)
    # An empty mapping would produce the empty pattern, which matches at every
    # position and makes the lookup below raise KeyError('').
    if not chars_to_mapping:
        return text
    # Keys are escaped so they are matched literally, not as regex syntax.
    pattern = "|".join(map(re.escape, chars_to_mapping))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], text)
def remove_special_characters(text, chars_to_ignore_regex):
    """Delete all matches of ``chars_to_ignore_regex`` from ``text``.

    The remaining text is lowercased and returned with one trailing space
    appended.
    """
    stripped = re.sub(chars_to_ignore_regex, '', text)
    return f"{stripped.lower()} "
def normalizer_at_word_level(text):
    """Re-join the whitespace-separated words of ``text`` with single spaces.

    The result always ends with one trailing space. No per-word rule is
    applied at the moment: every word passes through unchanged.
    """
    # Normalizer at word level (currently a pass-through for each token)
    tokens = text.split()
    return f"{' '.join(tokens)} "
def normalizer(batch, return_dict=True, filter_trivials=False, remove_extra_space=False):
    """Normalize ``batch["sentence"]`` and return the batch or the bare text.

    Steps, in order: lowercase and strip the sentence, apply
    ``dictionary_mapping`` via ``multiple_replace``, delete characters matched
    by ``chars_to_ignore``, run ``normalizer_at_word_level``; runs of spaces
    are collapsed after each step.

    Args:
        batch: Mapping with a ``"sentence"`` string; updated in place when
            ``return_dict`` is true.
        return_dict: If true, store the result under ``batch["sentence"]`` and
            return ``batch``; otherwise return the normalized text itself.
        filter_trivials: If true, a result of length 2 or less becomes ``None``.
            NOTE: the length is measured after the trailing space is added, so
            the effective threshold depends on ``remove_extra_space``.
        remove_extra_space: If true, the result is fully stripped; otherwise
            it ends with exactly one trailing space.

    Returns:
        ``batch`` (mutated) or the normalized text / ``None``.
    """
    def collapse_spaces(value):
        return re.sub(" +", " ", value)

    cleaned = batch["sentence"].lower().strip()

    # Dictionary mapping
    cleaned = collapse_spaces(multiple_replace(cleaned, dictionary_mapping))

    # Remove specials
    cleaned = collapse_spaces(remove_special_characters(cleaned, chars_to_ignore))

    # Normalizer at word level
    cleaned = collapse_spaces(normalizer_at_word_level(cleaned))

    cleaned = cleaned.strip()
    if not remove_extra_space:
        cleaned += " "

    if filter_trivials and len(cleaned) <= 2:
        cleaned = None

    if return_dict:
        batch["sentence"] = cleaned
        return batch
    return cleaned