Spaces:

mboushaba
/

whisper-large-v3-vs-turbo-comparaison

Sleeping

File size: 2,941 Bytes

022d425

# author : Mohammed BOUSHABA
# date : 02/10/2024

import re
import unicodedata

class ArabicTextNormalizer:
    """Normalize Arabic text into a canonical form suitable for comparison.

    Calling an instance runs the full pipeline (see ``__call__``). Each step
    is also exposed as a method so it can be used on its own.
    """

    # Matches any run of whitespace (Unicode-aware for ``str`` patterns, so
    # it also covers characters such as the no-break space U+00A0).
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self):
        # Arabic-Indic digits -> ASCII digits.
        self.arabic_numerals = {
            '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
            '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9'
        }

        # Arabic punctuation / guillemets -> ASCII equivalents.
        self.arabic_punctuation = {
            '،': ',', '؛': ';', '؟': '?', '«': '"', '»': '"'
        }

        # Combining marks U+064B..U+065F plus U+0670 (superscript alef).
        self.removable_diacritics = re.compile(r'[\u064B-\u065F\u0670]')

        # Phrase-level rewrites: regex pattern -> replacement text.
        # The patterns contain no diacritics or tatweel, so they are applied
        # only after those have been stripped (see ``__call__``).
        self.replacers = {
            r'\bإن شاء الله\b': 'ان شاء الله',
            r'\bبإذن الله\b': 'باذن الله',
            # Currently maps the phrase to the same text.
            r'\bالسلام عليكم\b': 'السلام عليكم',
            # Add more Arabic-specific rewrites here
        }

    def remove_diacritics(self, text: str) -> str:
        """Return *text* with the marks in ``removable_diacritics`` deleted."""
        return self.removable_diacritics.sub('', text)

    def normalize_numerals(self, text: str) -> str:
        """Return *text* with Arabic-Indic digits replaced by ASCII digits."""
        for arabic, western in self.arabic_numerals.items():
            text = text.replace(arabic, western)
        return text

    def normalize_punctuation(self, text: str) -> str:
        """Return *text* with Arabic punctuation replaced by ASCII punctuation.

        This step is not part of the default pipeline in ``__call__``; call
        it explicitly when ASCII punctuation is wanted.
        """
        for arabic, western in self.arabic_punctuation.items():
            text = text.replace(arabic, western)
        return text

    def remove_tatweel(self, text: str) -> str:
        """Return *text* without tatweel / kashida (U+0640) characters."""
        return text.replace('\u0640', '')

    def remove_dots(self, text: str) -> str:
        """Return *text* with every ASCII full stop removed."""
        return text.replace('.', '')

    def remove_non_arabic(self, text: str) -> str:
        """Keep only characters in U+0600..U+06FF or in the ASCII range."""
        return ''.join(c for c in text if '\u0600' <= c <= '\u06FF' or c.isascii())

    def __call__(self, text: str) -> str:
        """Run the full normalization pipeline on *text* and return the result.

        Steps, in order: NFC normalization, diacritic and tatweel removal,
        phrase rewrites, digit normalization, dot removal, whitespace
        unification, removal of characters that are neither Arabic
        (U+0600..U+06FF) nor ASCII, and a final whitespace collapse/strip.
        """
        # Convert to NFC form for consistent Unicode representation
        text = unicodedata.normalize('NFC', text)

        # Strip diacritics and tatweel BEFORE the phrase rewrites: otherwise
        # a vocalised or stretched spelling (e.g. with kashida between the
        # letters) would never match the plain-text patterns in `replacers`.
        text = self.remove_diacritics(text)
        text = self.remove_tatweel(text)

        # Apply phrase-level rewrites
        for pattern, replacement in self.replacers.items():
            text = re.sub(pattern, replacement, text)

        # Normalize remaining Arabic-specific elements
        text = self.normalize_numerals(text)
        text = self.remove_dots(text)

        # Turn every whitespace run (including non-ASCII whitespace such as
        # U+00A0) into a single ASCII space first, so the character filter
        # below does not delete a word separator and glue two words together.
        text = self._WHITESPACE.sub(' ', text)

        # Remove non-Arabic characters (except ASCII)
        text = self.remove_non_arabic(text)

        # Dropping characters can leave adjacent spaces; collapse again.
        return self._WHITESPACE.sub(' ', text).strip()

# Demo: run the normalizer over a few sample sentences when executed directly.
if __name__ == "__main__":
    # Samples cover full vocalisation, tatweel plus Arabic-Indic digits,
    # and digits embedded in a vocalised sentence.
    samples = (
        "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللهِ وَبَرَكَاتُهُ",
        "إن شــــاء الله سنلتقي في الساعة ٣:٣٠ مساءً",
        "كَانَ هُنَاكَ ١٢٣٤٥ شَخْصًا فِي الْمَلْعَبِ",
    )

    normalize = ArabicTextNormalizer()

    for sample in samples:
        # Normalize before printing so output order matches a failure-free run.
        result = normalize(sample)
        print(f"Original: {sample}")
        print(f"Normalized: {result}")
        print()