File size: 2,941 Bytes
022d425
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# author : Mohammed BOUSHABA
# date : 02/10/2024

import re
import unicodedata

class ArabicTextNormalizer:
    """Callable that normalizes Arabic text.

    Calling an instance applies, in order: Unicode NFC normalization,
    diacritic removal, tatweel removal, the phrase rewrites in
    ``self.replacers``, Arabic-Indic digit conversion, removal of ``.``,
    removal of every character that is neither in U+0600-U+06FF nor ASCII,
    and whitespace collapsing.
    """

    def __init__(self) -> None:
        # Arabic-Indic digits U+0660..U+0669 -> ASCII digits. Written as
        # escapes so the table survives a mis-decoded source file.
        self.arabic_numerals = {
            '\u0660': '0', '\u0661': '1', '\u0662': '2', '\u0663': '3', '\u0664': '4',
            '\u0665': '5', '\u0666': '6', '\u0667': '7', '\u0668': '8', '\u0669': '9'
        }

        # Arabic comma, semicolon and question mark, plus guillemets,
        # mapped to ASCII punctuation.
        self.arabic_punctuation = {
            '\u060C': ',', '\u061B': ';', '\u061F': '?', '\u00AB': '"', '\u00BB': '"'
        }

        # U+064B-U+065F (tanween, short vowels, shadda, sukun, ...) and
        # U+0670 (superscript alef).
        self.removable_diacritics = re.compile(r'[\u064B-\u065F\u0670]')

        # Regex pattern -> replacement, applied with re.sub in __call__.
        self.replacers = {
            # Common Arabic phrases and their normalized spellings
            # (alef-with-hamza-below is rewritten as a bare alef).
            r'\bإن شاء الله\b': 'ان شاء الله',
            r'\bبإذن الله\b': 'باذن الله',
            # Currently maps the phrase to itself.
            r'\bالسلام عليكم\b': 'السلام عليكم',
            # Add more Arabic-specific phrases here
        }

    def remove_diacritics(self, text: str) -> str:
        """Return *text* with every character matched by ``removable_diacritics`` deleted."""
        return self.removable_diacritics.sub('', text)

    def normalize_numerals(self, text: str) -> str:
        """Return *text* with each key of ``arabic_numerals`` replaced by its ASCII digit."""
        for arabic, western in self.arabic_numerals.items():
            text = text.replace(arabic, western)
        return text

    def normalize_punctuation(self, text: str) -> str:
        """Return *text* with each key of ``arabic_punctuation`` replaced by its ASCII mark.

        This method is not called by ``__call__``; invoke it explicitly
        when ASCII punctuation is wanted.
        """
        for arabic, western in self.arabic_punctuation.items():
            text = text.replace(arabic, western)
        return text

    def remove_tatweel(self, text: str) -> str:
        """Return *text* without U+0640 (tatweel / kashida) characters."""
        return text.replace('\u0640', '')

    def remove_dots(self, text: str) -> str:
        """Return *text* with every ASCII full stop (``.``) removed."""
        return text.replace('.', '')

    def remove_non_arabic(self, text: str) -> str:
        """Keep only characters in U+0600-U+06FF or in the ASCII range."""
        return ''.join(c for c in text if '\u0600' <= c <= '\u06FF' or c.isascii())

    def __call__(self, text: str) -> str:
        """Run the full normalization pipeline on *text* and return the result."""
        # Convert to NFC form for consistent Unicode representation
        text = unicodedata.normalize('NFC', text)

        # Strip diacritics and tatweel first so the phrase patterns below
        # also match vocalized or kashida-stretched spellings; otherwise
        # the same phrase normalizes differently depending on how it was
        # typed.
        text = self.remove_diacritics(text)
        text = self.remove_tatweel(text)

        # Apply phrase replacements
        for pattern, replacement in self.replacers.items():
            text = re.sub(pattern, replacement, text)

        # Normalize the remaining Arabic-specific elements.
        # NOTE: normalize_punctuation() is available but not applied here.
        text = self.normalize_numerals(text)
        text = self.remove_dots(text)

        # Remove non-Arabic characters (except ASCII)
        text = self.remove_non_arabic(text)

        # Collapse runs of whitespace and trim both ends
        text = re.sub(r'\s+', ' ', text).strip()

        return text

# Example usage: print each sample before and after normalization.
if __name__ == "__main__":
    normalizer = ArabicTextNormalizer()

    # Samples exercising diacritics, tatweel and Arabic-Indic digits.
    test_texts = [
        "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللهِ وَبَرَكَاتُهُ",
        "إن شــــاء الله سنلتقي في الساعة ٣:٣٠ مساءً",
        "كَانَ هُنَاكَ ١٢٣٤٥ شَخْصًا فِي الْمَلْعَبِ",
    ]

    for text in test_texts:
        normalized = normalizer(text)
        print(f"Original: {text}")
        print(f"Normalized: {normalized}")
        print()