# author : Mohammed BOUSHABA
# date : 02/10/2024

import re
import unicodedata


class ArabicTextNormalizer:
    """Callable that normalizes Arabic text to a simplified canonical form.

    Calling an instance on a string runs, in order: Unicode NFC
    normalization, diacritic removal, tatweel removal, phrase replacement
    (``replacers``), Arabic-Indic digit conversion, removal of ``.``,
    removal of characters that are neither ASCII nor in U+0600-U+06FF, and
    whitespace collapsing.

    The lookup tables are plain instance attributes and are read on every
    call, so they can be extended after construction.
    """

    def __init__(self):
        # Arabic-Indic digits (U+0660-U+0669) -> ASCII digits.
        self.arabic_numerals = {
            '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
            '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9',
        }
        # Arabic punctuation and guillemets -> ASCII punctuation.
        # Only used by normalize_punctuation(); __call__ does not apply it.
        self.arabic_punctuation = {
            '،': ',', '؛': ';', '؟': '?', '«': '"', '»': '"',
        }
        # Combining marks U+064B-U+065F plus superscript alef (U+0670).
        self.removable_diacritics = re.compile(r'[\u064B-\u065F\u0670]')
        # Regex pattern -> replacement, applied in insertion order.
        self.replacers = {
            # Common Arabic phrases and their normalized spellings
            r'\bإن شاء الله\b': 'ان شاء الله',
            r'\bبإذن الله\b': 'باذن الله',
            r'\bالسلام عليكم\b': 'السلام عليكم',
            # Add more Arabic-specific replacements here
        }

    def remove_diacritics(self, text: str) -> str:
        """Return *text* with every character matched by ``removable_diacritics`` removed."""
        return self.removable_diacritics.sub('', text)

    def normalize_numerals(self, text: str) -> str:
        """Return *text* with each key of ``arabic_numerals`` replaced by its value."""
        for arabic, western in self.arabic_numerals.items():
            text = text.replace(arabic, western)
        return text

    def normalize_punctuation(self, text: str) -> str:
        """Return *text* with each key of ``arabic_punctuation`` replaced by its value."""
        for arabic, western in self.arabic_punctuation.items():
            text = text.replace(arabic, western)
        return text

    def remove_tatweel(self, text: str) -> str:
        """Return *text* without tatweel / kashida (U+0640) characters."""
        return text.replace('\u0640', '')

    def remove_dots(self, text: str) -> str:
        """Return *text* with every ASCII full stop (``.``) removed."""
        return text.replace('.', '')

    def remove_non_arabic(self, text: str) -> str:
        """Keep only ASCII characters and characters in U+0600-U+06FF."""
        return ''.join(
            c for c in text if '\u0600' <= c <= '\u06FF' or c.isascii()
        )

    def __call__(self, text: str) -> str:
        """Normalize *text* and return the result.

        Args:
            text: The string to normalize.

        Returns:
            The normalized string, with runs of whitespace collapsed to a
            single space and leading/trailing whitespace stripped.
        """
        # Convert to NFC form for consistent Unicode representation
        text = unicodedata.normalize('NFC', text)

        # Strip diacritics and tatweel BEFORE the phrase replacements: the
        # patterns in ``replacers`` are written without either, so vocalized
        # or kashida-stretched spellings would otherwise never match.
        text = self.remove_diacritics(text)
        text = self.remove_tatweel(text)

        # Apply the phrase replacements to the cleaned text
        for pattern, replacement in self.replacers.items():
            text = re.sub(pattern, replacement, text)

        # Normalize the remaining Arabic-specific elements
        text = self.normalize_numerals(text)
        # NOTE: normalize_punctuation() is not part of this pipeline; call
        # it explicitly if ASCII punctuation is wanted.
        text = self.remove_dots(text)

        # Remove non-Arabic characters (except ASCII)
        text = self.remove_non_arabic(text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text


# Example usage
if __name__ == "__main__":
    normalizer = ArabicTextNormalizer()

    test_texts = [
        "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللهِ وَبَرَكَاتُهُ",
        "إن شــــاء الله سنلتقي في الساعة ٣:٣٠ مساءً",
        "كَانَ هُنَاكَ ١٢٣٤٥ شَخْصًا فِي الْمَلْعَبِ",
    ]

    for text in test_texts:
        normalized = normalizer(text)
        print(f"Original: {text}")
        print(f"Normalized: {normalized}")
        print()