import re
import unicodedata


class ArabicTextNormalizer:
    """Normalize Arabic text into a plain, comparison-friendly form.

    Calling an instance applies, in order: Unicode NFC normalization, removal
    of diacritics and tatweel, phrase replacement, conversion of Arabic-Indic
    digits to ASCII digits, removal of ``.``, filtering to Arabic-block and
    ASCII characters, and whitespace collapsing.

    ``normalize_punctuation`` is provided as a separate helper; ``__call__``
    does not apply it.
    """

    def __init__(self) -> None:
        """Build the lookup tables and patterns used by the normalizer."""
        # Arabic-Indic digits U+0660..U+0669 -> ASCII '0'..'9'.  Generated from
        # code points so the table cannot be damaged by a file re-encoding.
        self.arabic_numerals = {chr(0x0660 + d): str(d) for d in range(10)}

        # Arabic punctuation and guillemets -> ASCII look-alikes.  Written as
        # escapes for the same reason (the mis-decoded copy had collapsed the
        # first three keys into one).
        self.arabic_punctuation = {
            '\u060C': ',',   # Arabic comma
            '\u061B': ';',   # Arabic semicolon
            '\u061F': '?',   # Arabic question mark
            '\u00AB': '"',   # left-pointing guillemet
            '\u00BB': '"',   # right-pointing guillemet
        }

        # Combining marks U+064B..U+065F plus superscript alef U+0670.
        self.removable_diacritics = re.compile(r'[\u064B-\u065F\u0670]')

        # Regex pattern -> replacement for common phrases.  Patterns are
        # written without diacritics or tatweel.
        # NOTE(review): these literals were reconstructed from a mis-decoded
        # copy of the file; confirm the exact spellings.
        self.replacers = {
            r'\bإن شاء الله\b': 'ان شاء الله',
            r'\bبإذن الله\b': 'باذن الله',
            # Currently maps the phrase to itself (a no-op).
            r'\bالسلام عليكم\b': 'السلام عليكم',
        }

    def remove_diacritics(self, text: str) -> str:
        """Return *text* without the marks matched by ``removable_diacritics``."""
        return self.removable_diacritics.sub('', text)

    def normalize_numerals(self, text: str) -> str:
        """Return *text* with every key of ``arabic_numerals`` replaced by its value."""
        for arabic, western in self.arabic_numerals.items():
            text = text.replace(arabic, western)
        return text

    def normalize_punctuation(self, text: str) -> str:
        """Return *text* with every key of ``arabic_punctuation`` replaced by its value."""
        for arabic, western in self.arabic_punctuation.items():
            text = text.replace(arabic, western)
        return text

    def remove_tatweel(self, text: str) -> str:
        """Return *text* without the tatweel (kashida) character U+0640."""
        return text.replace('\u0640', '')

    def remove_dots(self, text: str) -> str:
        """Return *text* with every ASCII full stop removed."""
        return text.replace('.', '')

    def remove_non_arabic(self, text: str) -> str:
        """Drop characters that are neither in U+0600..U+06FF nor ASCII.

        Despite the name, ASCII characters (Latin letters, digits, spaces,
        punctuation) are kept.
        """
        return ''.join(
            c for c in text if '\u0600' <= c <= '\u06FF' or c.isascii()
        )

    def __call__(self, text: str) -> str:
        """Run the full normalization pipeline on *text* and return the result."""
        text = unicodedata.normalize('NFC', text)

        # Strip decoration first: the phrase patterns are undecorated, so they
        # can only match once diacritics and tatweel have been removed.
        text = self.remove_diacritics(text)
        text = self.remove_tatweel(text)

        for pattern, replacement in self.replacers.items():
            text = re.sub(pattern, replacement, text)

        text = self.normalize_numerals(text)
        text = self.remove_dots(text)
        text = self.remove_non_arabic(text)

        # Collapse whitespace runs (including gaps left by dropped characters).
        return re.sub(r'\s+', ' ', text).strip()


if __name__ == "__main__":
    # Demo: print each sample next to its normalized form.
    normalizer = ArabicTextNormalizer()

    # NOTE(review): the sample strings were reconstructed from a mis-decoded
    # copy of the file; confirm them against the intended originals.
    test_texts = [
        "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللهِ وَبَرَكَاتُهُ",
        "إن شــــاء الله سنلتقي في الساعة ٣:٣٠ مساءً",
        "كَانَ هُنَاكَ ١٢٣٤٥ شَخْصًا فِي الْمَلْعَبِ",
    ]

    for text in test_texts:
        normalized = normalizer(text)
        print(f"Original: {text}")
        print(f"Normalized: {normalized}")
        print()