Spaces:

mboushaba
/

whisper-large-v3-vs-turbo-comparaison

Running

App Files Files Community

whisper-large-v3-vs-turbo-comparaison / arabic_normalizer.py

mboushaba

Upload 2 files

022d425 verified about 9 hours ago

raw

history blame contribute delete

2.94 kB

	# author : Mohammed BOUSHABA
	# date : 02/10/2024

	import re
	import unicodedata

	class ArabicTextNormalizer:
	    """Normalize Arabic text into a canonical form for text comparison.

	    Calling an instance runs, in order: Unicode NFC composition, diacritic
	    and tatweel removal, fixed-phrase replacement, Arabic-Indic digit
	    conversion, full-stop removal, filtering to Arabic-block/ASCII
	    characters, and whitespace collapsing.
	    """

	    def __init__(self):
	        """Build the lookup tables and patterns used by the pipeline."""
	        # Arabic-Indic digits (U+0660-U+0669) mapped to ASCII digits.
	        self.arabic_numerals = {
	            '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
	            '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9'
	        }

	        # Arabic punctuation and guillemets mapped to ASCII equivalents.
	        self.arabic_punctuation = {
	            '،': ',', '؛': ';', '؟': '?', '«': '"', '»': '"'
	        }

	        # Combining marks U+064B-U+065F plus the superscript alef U+0670.
	        self.removable_diacritics = re.compile(r'[\u064B-\u065F\u0670]')

	        # Regex pattern -> replacement for fixed phrases. The patterns are
	        # written without diacritics or tatweel, so they must be applied to
	        # text that has already had both removed.
	        self.replacers = {
	            r'\bإن شاء الله\b': 'ان شاء الله',
	            r'\bبإذن الله\b': 'باذن الله',
	            # NOTE(review): this entry appears to map the phrase to itself.
	            r'\bالسلام عليكم\b': 'السلام عليكم',
	            # Add more Arabic-specific phrase rewrites here
	        }

	    def remove_diacritics(self, text: str) -> str:
	        """Return *text* with every removable diacritic deleted."""
	        return self.removable_diacritics.sub('', text)

	    def normalize_numerals(self, text: str) -> str:
	        """Return *text* with Arabic-Indic digits replaced by ASCII digits."""
	        for arabic, western in self.arabic_numerals.items():
	            text = text.replace(arabic, western)
	        return text

	    def normalize_punctuation(self, text: str) -> str:
	        """Return *text* with mapped punctuation replaced by ASCII marks."""
	        for arabic, western in self.arabic_punctuation.items():
	            text = text.replace(arabic, western)
	        return text

	    def remove_tatweel(self, text: str) -> str:
	        """Return *text* without the tatweel (kashida) character U+0640."""
	        return text.replace('\u0640', '')

	    def remove_dots(self, text: str) -> str:
	        """Return *text* with every ASCII full stop removed."""
	        return text.replace('.', '')

	    def remove_non_arabic(self, text: str) -> str:
	        """Keep only characters in U+0600-U+06FF or in the ASCII range."""
	        return ''.join(
	            c for c in text if '\u0600' <= c <= '\u06FF' or c.isascii()
	        )

	    def __call__(self, text: str) -> str:
	        """Run the full normalization pipeline on *text*.

	        Args:
	            text: The string to normalize.

	        Returns:
	            The normalized string, with single ASCII spaces between tokens
	            and no leading or trailing whitespace.
	        """
	        # Convert to NFC form for consistent Unicode representation.
	        text = unicodedata.normalize('NFC', text)

	        # Strip diacritics and tatweel BEFORE the phrase replacements: the
	        # replacer patterns contain neither, so vocalised or elongated
	        # spellings of a phrase would otherwise never match.
	        text = self.remove_diacritics(text)
	        text = self.remove_tatweel(text)

	        for pattern, replacement in self.replacers.items():
	            text = re.sub(pattern, replacement, text)

	        text = self.normalize_numerals(text)
	        # NOTE: normalize_punctuation() is available but is not part of
	        # this pipeline, so Arabic-block punctuation is kept as-is.
	        text = self.remove_dots(text)

	        # Turn every whitespace run (including non-ASCII spaces such as
	        # NBSP) into one ASCII space first; otherwise the filter below
	        # would delete the separator and glue neighbouring words together.
	        text = re.sub(r'\s+', ' ', text)

	        # Remove non-Arabic characters (except ASCII).
	        text = self.remove_non_arabic(text)

	        # Dropping characters can leave adjacent spaces; collapse again.
	        return re.sub(r'\s+', ' ', text).strip()

	# Demo: normalize a few sample sentences and show them next to the input.
	if __name__ == "__main__":
	    arabic_normalizer = ArabicTextNormalizer()

	    # Samples cover diacritics, tatweel and Arabic-Indic digits.
	    samples = (
	        "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللهِ وَبَرَكَاتُهُ",
	        "إن شــــاء الله سنلتقي في الساعة ٣:٣٠ مساءً",
	        "كَانَ هُنَاكَ ١٢٣٤٥ شَخْصًا فِي الْمَلْعَبِ",
	    )

	    for sample in samples:
	        # Normalize before printing, so a failure emits no partial output.
	        cleaned = arabic_normalizer(sample)
	        print(f"Original: {sample}")
	        print(f"Normalized: {cleaned}")
	        print()