# Uploaded by mboushaba ("Upload 2 files", commit 022d425, verified) - 2.94 kB
# author : Mohammed BOUSHABA
# date : 02/10/2024
import re
import unicodedata
class ArabicTextNormalizer:
    """Callable normalizer for Arabic text.

    Calling an instance on a string applies, in order: Unicode NFC
    composition, diacritic removal, tatweel removal, fixed-phrase spelling
    replacements, Arabic-Indic digit conversion, full-stop removal,
    filtering to Arabic-block/ASCII characters, and whitespace collapsing.

    Every Arabic literal below is written as a Unicode escape so that the
    tables stay correct regardless of how the source file is encoded.
    """

    # Any run of whitespace; collapsed to a single space by __call__.
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self):
        # Arabic-Indic digits U+0660..U+0669 mapped to ASCII '0'..'9'.
        self.arabic_numerals = {
            chr(0x0660 + value): str(value) for value in range(10)
        }
        self.arabic_punctuation = {
            '\u060C': ',',  # Arabic comma
            '\u061B': ';',  # Arabic semicolon
            '\u061F': '?',  # Arabic question mark
            '\u00AB': '"',  # left-pointing double angle quotation mark
            '\u00BB': '"',  # right-pointing double angle quotation mark
        }
        # Combining marks U+064B..U+065F plus superscript alef U+0670.
        self.removable_diacritics = re.compile(r'[\u064B-\u065F\u0670]')
        # Regex pattern -> replacement text.  The patterns are raw strings;
        # the \uXXXX escapes inside them are interpreted by the re module.
        self.replacers = {
            # Common Arabic contractions and their expansions
            # "in sha' Allah": alef-with-hamza-below (U+0625) -> bare alef
            r'\b\u0625\u0646 \u0634\u0627\u0621 \u0627\u0644\u0644\u0647\b':
                '\u0627\u0646 \u0634\u0627\u0621 \u0627\u0644\u0644\u0647',
            # "bi-idhn Allah": alef-with-hamza-below (U+0625) -> bare alef
            r'\b\u0628\u0625\u0630\u0646 \u0627\u0644\u0644\u0647\b':
                '\u0628\u0627\u0630\u0646 \u0627\u0644\u0644\u0647',
            # "as-salamu alaykum": maps the phrase to itself, so this entry
            # currently changes nothing.
            r'\b\u0627\u0644\u0633\u0644\u0627\u0645 '
            r'\u0639\u0644\u064A\u0643\u0645\b':
                '\u0627\u0644\u0633\u0644\u0627\u0645 '
                '\u0639\u0644\u064A\u0643\u0645',
            # Add more Arabic-specific contractions here
        }

    def remove_diacritics(self, text: str) -> str:
        """Return *text* without the marks matched by ``removable_diacritics``."""
        return self.removable_diacritics.sub('', text)

    def normalize_numerals(self, text: str) -> str:
        """Return *text* with every key of ``arabic_numerals`` replaced by its value."""
        for arabic, western in self.arabic_numerals.items():
            text = text.replace(arabic, western)
        return text

    def normalize_punctuation(self, text: str) -> str:
        """Return *text* with every key of ``arabic_punctuation`` replaced by its value."""
        for arabic, western in self.arabic_punctuation.items():
            text = text.replace(arabic, western)
        return text

    def remove_tatweel(self, text: str) -> str:
        """Return *text* without tatweel/kashida (U+0640) elongation marks."""
        return text.replace('\u0640', '')

    def remove_dots(self, text: str) -> str:
        """Return *text* with every ASCII full stop removed."""
        return text.replace('.', '')

    def remove_non_arabic(self, text: str) -> str:
        """Keep only characters in U+0600..U+06FF or in the ASCII range."""
        return ''.join(
            c for c in text if '\u0600' <= c <= '\u06FF' or c.isascii()
        )

    def __call__(self, text: str) -> str:
        """Run the full normalization pipeline on *text* and return the result."""
        # Convert to NFC form for consistent Unicode representation
        text = unicodedata.normalize('NFC', text)
        # Strip diacritics and tatweel *before* the phrase replacements:
        # the patterns are written with bare letters, so a vocalised or
        # elongated spelling of the same phrase would otherwise never match.
        text = self.remove_diacritics(text)
        text = self.remove_tatweel(text)
        # Apply replacements for common contractions
        for pattern, replacement in self.replacers.items():
            text = re.sub(pattern, replacement, text)
        # Normalize Arabic-specific elements
        text = self.normalize_numerals(text)
        # NOTE: normalize_punctuation() is not part of this pipeline; call
        # it explicitly beforehand if Western punctuation is wanted.
        text = self.remove_dots(text)
        # Remove non-Arabic characters (except ASCII)
        text = self.remove_non_arabic(text)
        # Remove extra whitespace
        return self._WHITESPACE.sub(' ', text).strip()
# Example usage
if __name__ == "__main__":
    normalizer = ArabicTextNormalizer()
    # The samples are spelled with Unicode escapes so they stay correct
    # regardless of how this file is encoded.
    # NOTE(review): a few short-vowel marks could not be recovered from the
    # previously corrupted literals and were restored from the standard
    # spelling of each phrase - confirm against the author's original text.
    test_texts = [
        # Fully vocalised greeting ("as-salamu alaykum wa rahmatu Allah
        # wa barakatuh"): exercises diacritic removal.
        (
            "\u0627\u0644\u0633\u064E\u0651\u0644\u064E\u0627\u0645\u064F "
            "\u0639\u064E\u0644\u064E\u064A\u0652\u0643\u064F\u0645\u0652 "
            "\u0648\u064E\u0631\u064E\u062D\u0652\u0645\u064E\u0629\u064F "
            "\u0627\u0644\u0644\u0647\u0650 "
            "\u0648\u064E\u0628\u064E\u0631\u064E\u0643\u064E\u0627\u062A\u064F\u0647\u064F"
        ),
        # Phrase containing tatweel, Arabic-Indic digits and a tanween mark.
        (
            "\u0625\u0646 \u0634\u0640\u0640\u0640\u0640\u0627\u0621 "
            "\u0627\u0644\u0644\u0647 "
            "\u0633\u0646\u0644\u062A\u0642\u064A \u0641\u064A "
            "\u0627\u0644\u0633\u0627\u0639\u0629 "
            "\u0663:\u0663\u0660 \u0645\u0633\u0627\u0621\u064B"
        ),
        # Vocalised sentence with a five-digit Arabic-Indic number.
        (
            "\u0643\u064E\u0627\u0646\u064E "
            "\u0647\u064F\u0646\u064E\u0627\u0643\u064E "
            "\u0661\u0662\u0663\u0664\u0665 "
            "\u0634\u064E\u062E\u0652\u0635\u064B\u0627 "
            "\u0641\u0650\u064A "
            "\u0627\u0644\u0652\u0645\u064E\u0644\u0652\u0639\u064E\u0628\u0650"
        ),
    ]
    for text in test_texts:
        normalized = normalizer(text)
        print(f"Original: {text}")
        print(f"Normalized: {normalized}")
        print()