# File size: 2,941 bytes; revision 022d425
# author : Mohammed BOUSHABA
# date : 02/10/2024
import re
import unicodedata
class ArabicTextNormalizer:
    """Normalize Arabic text into a plain, comparison-friendly form.

    Calling an instance applies, in order: Unicode NFC normalization,
    diacritic removal, tatweel removal, fixed phrase replacements,
    Arabic-Indic digit conversion, removal of '.', removal of every
    character that is neither ASCII nor in U+0600..U+06FF, and whitespace
    collapsing.

    All non-ASCII literals are written as ``\\uXXXX`` escapes so the module
    does not depend on the encoding the source file is saved or served in.
    """

    def __init__(self) -> None:
        # Arabic-Indic digits U+0660..U+0669 -> ASCII '0'..'9'.
        self.arabic_numerals = {
            chr(0x0660 + value): str(value) for value in range(10)
        }
        # Arabic comma, semicolon, question mark and both guillemets.
        self.arabic_punctuation = {
            '\u060C': ',',
            '\u061B': ';',
            '\u061F': '?',
            '\u00AB': '"',
            '\u00BB': '"',
        }
        # Combining marks U+064B..U+065F plus superscript alef U+0670.
        self.removable_diacritics = re.compile(r'[\u064B-\u065F\u0670]')
        # Regex pattern -> replacement for fixed phrases. Patterns are
        # matched against text that has already lost its diacritics and
        # tatweel, so they must be written without either.
        self.replacers = {
            # "in sha' Allah": alef-with-hamza-below -> bare alef.
            '\\b\u0625\u0646 \u0634\u0627\u0621 \u0627\u0644\u0644\u0647\\b':
                '\u0627\u0646 \u0634\u0627\u0621 \u0627\u0644\u0644\u0647',
            # "bi-idhn Allah": alef-with-hamza-below -> bare alef.
            '\\b\u0628\u0625\u0630\u0646 \u0627\u0644\u0644\u0647\\b':
                '\u0628\u0627\u0630\u0646 \u0627\u0644\u0644\u0647',
            # "as-salamu alaykum": currently maps to itself.
            '\\b\u0627\u0644\u0633\u0644\u0627\u0645 '
            '\u0639\u0644\u064A\u0643\u0645\\b':
                '\u0627\u0644\u0633\u0644\u0627\u0645 '
                '\u0639\u0644\u064A\u0643\u0645',
            # Add more Arabic-specific phrases here
        }

    def remove_diacritics(self, text: str) -> str:
        """Return *text* without the marks matched by ``removable_diacritics``."""
        return self.removable_diacritics.sub('', text)

    def normalize_numerals(self, text: str) -> str:
        """Return *text* with every key of ``arabic_numerals`` replaced."""
        for arabic, western in self.arabic_numerals.items():
            text = text.replace(arabic, western)
        return text

    def normalize_punctuation(self, text: str) -> str:
        """Return *text* with every key of ``arabic_punctuation`` replaced."""
        for arabic, western in self.arabic_punctuation.items():
            text = text.replace(arabic, western)
        return text

    def remove_tatweel(self, text: str) -> str:
        """Return *text* without tatweel / kashida (U+0640)."""
        return text.replace('\u0640', '')

    def remove_dots(self, text: str) -> str:
        """Return *text* with every ASCII full stop removed."""
        return text.replace('.', '')

    def remove_non_arabic(self, text: str) -> str:
        """Keep only ASCII characters and code points U+0600..U+06FF."""
        return ''.join(
            c for c in text if '\u0600' <= c <= '\u06FF' or c.isascii()
        )

    def __call__(self, text: str) -> str:
        """Run the full normalization pipeline on *text* and return the result."""
        # NFC first, so composed and decomposed spellings compare equal.
        text = unicodedata.normalize('NFC', text)

        # Strip diacritics and tatweel BEFORE the phrase replacements:
        # the patterns are plain letters, so a vocalized or elongated
        # phrase would otherwise never match.
        text = self.remove_diacritics(text)
        text = self.remove_tatweel(text)
        for pattern, replacement in self.replacers.items():
            text = re.sub(pattern, replacement, text)

        text = self.normalize_numerals(text)
        # NOTE: normalize_punctuation() is intentionally not applied here;
        # call it explicitly if ASCII punctuation is wanted.
        text = self.remove_dots(text)

        # Drop everything that is neither ASCII nor in the Arabic block.
        text = self.remove_non_arabic(text)

        # Collapse whitespace runs and trim the ends.
        return re.sub(r'\s+', ' ', text).strip()
# Example usage
def _run_demo():
    """Print a few sample sentences before and after normalization."""
    normalizer = ArabicTextNormalizer()
    # Sample sentences are spelled with \u escapes so they survive any
    # re-encoding of this file.
    test_texts = [
        # Fully vocalized greeting ("peace be upon you and the mercy of
        # God and His blessings").
        "\u0627\u0644\u0633\u064E\u0651\u0644\u064E\u0627\u0645\u064F "
        "\u0639\u064E\u0644\u064E\u064A\u0652\u0643\u064F\u0645\u0652 "
        "\u0648\u064E\u0631\u064E\u062D\u0652\u0645\u064E\u0629\u064F "
        "\u0627\u0644\u0644\u0647\u0650 "
        "\u0648\u064E\u0628\u064E\u0631\u064E\u0643\u064E\u0627\u062A\u064F\u0647\u064F",
        # Elongated, partly vocalized phrase plus a time written with
        # Arabic-Indic digits ("God willing we will meet at 3:30 pm").
        "\u0625\u0646 \u0634\u0640\u064E\u0640\u0640\u0627\u0621 "
        "\u0627\u0644\u0644\u0647 "
        "\u0633\u0646\u0644\u062A\u0642\u064A \u0641\u064A "
        "\u0627\u0644\u0633\u0627\u0639\u0629 "
        "\u0663:\u0663\u0660 "
        "\u0645\u0633\u0627\u0621\u064B",
        # Vocalized sentence containing a multi-digit number ("there
        # were 12345 people in the stadium").
        "\u0643\u064E\u0627\u0646\u064E "
        "\u0647\u064F\u0646\u064E\u0627\u0643\u064E "
        "\u0661\u0662\u0663\u0664\u0665 "
        "\u0634\u064E\u062E\u0652\u0635\u064B\u0627 "
        "\u0641\u0650\u064A "
        "\u0627\u0644\u0652\u0645\u064E\u0644\u0652\u0639\u064E\u0628\u0650",
    ]
    for text in test_texts:
        normalized = normalizer(text)
        print(f"Original: {text}")
        print(f"Normalized: {normalized}")
        print()


if __name__ == "__main__":
    _run_demo()