Spaces:

mboushaba
/

whisper-large-v3-vs-turbo-comparaison

Sleeping

App Files Files Community

mboushaba committed on Oct 2, 2024

Commit

022d425

verified ·

1 Parent(s): b99be23

Upload 2 files

Browse files

Files changed (2) hide show

app.py +121 -0
arabic_normalizer.py +87 -0

app.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import os
+import gradio as gr
+from datasets import Audio
+from datasets import load_dataset
+from jiwer import wer, cer
+from transformers import pipeline
+from arabic_normalizer import ArabicTextNormalizer
# Arabic training split of Common Voice 11, reduced to the two columns the
# demo reads: the audio clip and its reference sentence.
common_voice = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    name="ar",
    split="train",
).select_columns(["audio", "sentence"])

# Generation options handed to both Whisper pipelines (same dict object).
generate_kwargs = dict(language="arabic", task="transcribe")

# The two checkpoints being compared, both created with device index 0.
asr_whisper_large = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=0,
    generate_kwargs=generate_kwargs,
)
asr_whisper_large_turbo = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    device=0,
    generate_kwargs=generate_kwargs,
)

# Normaliser applied to reference and predicted text before scoring.
normalizer = ArabicTextNormalizer()
def generate_audio(index=None):
    """Transcribe one Common Voice sample with both Whisper models and score them.

    Args:
        index: Position of the sample inside ``common_voice``. ``None``
            (the default) draws a position uniformly at random.

    Returns:
        An 8-tuple ``(audio, sentence_info, text_large, wer_large, cer_large,
        text_turbo, wer_turbo, cer_turbo)``. ``audio`` is the pair
        ``(sampling_rate, waveform)``; ``sentence_info`` is the normalised
        reference sentence joined to the sampling rate with ``-``. The WER and
        CER values are computed by ``jiwer`` on the normalised texts.
    """
    import random  # function-scope import: only this function needs it

    global common_voice

    target_rate = 16000
    # Re-cast the audio column only while it is not yet at the target rate;
    # the cast result is stored in the module global, so later calls skip it.
    audio_feature = common_voice.features["audio"]
    if getattr(audio_feature, "sampling_rate", None) != target_rate:
        common_voice = common_voice.cast_column(
            "audio", Audio(sampling_rate=target_rate)
        )

    # Honour an explicit index; otherwise draw one position directly instead
    # of shuffling the whole dataset just to read its first row.
    if index is None:
        index = random.randrange(len(common_voice))
    example = common_voice[index]

    audio = example["audio"]
    waveform = audio["array"]
    sampling_rate = audio["sampling_rate"]

    # Ground-truth transcription used for the WER/CER calculations.
    reference_text = normalizer(example["sentence"])

    def asr_input():
        # Each pipeline gets its own dict (as in the original code), keyed
        # with the documented "raw"/"sampling_rate" names for both models.
        return {"raw": waveform, "sampling_rate": sampling_rate}

    asr_output = asr_whisper_large(asr_input())
    asr_output_turbo = asr_whisper_large_turbo(asr_input())

    # Normalise the hypotheses the same way as the reference before scoring.
    predicted_text = normalizer(asr_output["text"])
    predicted_text_turbo = normalizer(asr_output_turbo["text"])

    wer_score = wer(reference_text, predicted_text)
    cer_score = cer(reference_text, predicted_text)
    wer_score_turbo = wer(reference_text, predicted_text_turbo)
    cer_score_turbo = cer(reference_text, predicted_text_turbo)

    # Label shown above the audio player: "<reference>-<sampling rate>".
    sentence_info = "-".join([reference_text, str(sampling_rate)])

    return (
        (sampling_rate, waveform),
        sentence_info,
        predicted_text,
        wer_score,
        cer_score,
        predicted_text_turbo,
        wer_score_turbo,
        cer_score_turbo,
    )
def update_ui():
    """Return a list of four text boxes labelled ``Label 0`` to ``Label 3``."""
    return [gr.Textbox(label=f"Label {position}") for position in range(4)]
# Page layout: a title, a short description, a sample-count slider and a
# button that triggers the rendering of that many scored samples.
with gr.Blocks() as demo:
    gr.HTML("""
        <h1>Whisper Arabic: ASR Comparison (large and large turbo)</h1>""")
    gr.Markdown("""
        This is a demo to compare the outputs, WER & CER of two ASR models (Whisper large and large turbo) using
        arabic dataset from mozilla-foundation/common_voice_11_0
    """)
    num_samples_input = gr.Slider(
        minimum=1,
        maximum=10,
        step=1,
        value=4,
        label="Number of audio samples",
    )
    generate_button = gr.Button("Generate Samples")

    @gr.render(inputs=num_samples_input, triggers=[generate_button.click])
    def render(num_samples):
        """Build one audio player plus two result columns per requested sample."""
        with gr.Column():
            for _ in range(num_samples):
                # One transcribed and scored sample from the dataset.
                (
                    clip,
                    caption,
                    large_text,
                    large_wer,
                    large_cer,
                    turbo_text,
                    turbo_wer,
                    turbo_cer,
                ) = generate_audio()

                gr.Audio(clip, label=caption)
                with gr.Row():
                    # Left column: whisper-large-v3 results.
                    with gr.Column():
                        gr.Textbox(value=large_text, label="Whisper large output")
                        gr.Textbox(value=f"WER: {large_wer:.2f}", label="Word Error Rate")
                        gr.Textbox(value=f"CER: {large_cer:.2f}", label="Character Error Rate")
                    # Right column: whisper-large-v3-turbo results.
                    with gr.Column():
                        gr.Textbox(value=turbo_text, label="Whisper large turbo output")
                        gr.Textbox(value=f"WER: {turbo_wer:.2f}", label="Word Error Rate - TURBO  ")
                        gr.Textbox(value=f"CER: {turbo_cer:.2f}", label="Character Error Rate - TURBO")


if __name__ == "__main__":
    demo.launch(show_error=True)

arabic_normalizer.py ADDED Viewed

	@@ -0,0 +1,87 @@

+# author : Mohammed BOUSHABA
+# date : 02/10/2024
+import re
+import unicodedata
class ArabicTextNormalizer:
    """Callable that canonicalises Arabic text, e.g. before WER/CER scoring.

    Calling an instance applies, in order: Unicode NFC normalisation,
    diacritic removal, tatweel removal, the phrase replacements in
    ``replacers``, Arabic-Indic digit conversion, removal of ``.``, removal of
    every character that is neither ASCII nor in U+0600-U+06FF, and whitespace
    collapsing.

    ``normalize_punctuation`` is available as a method but is not part of the
    pipeline run by ``__call__``.
    """

    def __init__(self):
        # Arabic-Indic digits and their ASCII counterparts.
        self.arabic_numerals = {
            '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
            '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9',
        }
        # Arabic punctuation marks and their ASCII counterparts.
        self.arabic_punctuation = {
            '،': ',', '؛': ';', '؟': '?', '«': '"', '»': '"',
        }
        # Combining marks U+064B-U+065F plus the superscript alef U+0670.
        self.removable_diacritics = re.compile(r'[\u064B-\u065F\u0670]')
        # Whole-phrase regex patterns mapped to their normalised spelling.
        self.replacers = {
            r'\bإن شاء الله\b': 'ان شاء الله',
            r'\bبإذن الله\b': 'باذن الله',
            r'\bالسلام عليكم\b': 'السلام عليكم',
            # Add more Arabic-specific phrases here
        }

    def remove_diacritics(self, text):
        """Return *text* without the marks matched by ``removable_diacritics``."""
        return self.removable_diacritics.sub('', text)

    def normalize_numerals(self, text):
        """Return *text* with Arabic-Indic digits replaced by ASCII digits."""
        return text.translate(str.maketrans(self.arabic_numerals))

    def normalize_punctuation(self, text):
        """Return *text* with Arabic punctuation replaced by ASCII punctuation."""
        return text.translate(str.maketrans(self.arabic_punctuation))

    def remove_tatweel(self, text):
        """Return *text* without the tatweel (kashida) character U+0640."""
        return text.replace('\u0640', '')

    def remove_dots(self, text):
        """Return *text* with every ``.`` removed."""
        return text.replace('.', '')

    def remove_non_arabic(self, text):
        """Keep only ASCII characters and characters in U+0600-U+06FF."""
        return ''.join(
            c for c in text if '\u0600' <= c <= '\u06FF' or c.isascii()
        )

    def __call__(self, text):
        """Return the normalised form of *text*.

        Args:
            text: The string to normalise.

        Returns:
            The normalised string, with surrounding whitespace stripped and
            internal whitespace runs collapsed to single spaces.
        """
        # NFC gives every character a single, consistent representation.
        text = unicodedata.normalize('NFC', text)

        # Strip diacritics and tatweel BEFORE the phrase replacements: the
        # patterns are written in bare letters, so a vocalised or elongated
        # spelling of a phrase would otherwise never match.
        text = self.remove_diacritics(text)
        text = self.remove_tatweel(text)

        for pattern, replacement in self.replacers.items():
            text = re.sub(pattern, replacement, text)

        text = self.normalize_numerals(text)
        text = self.remove_dots(text)

        # Drop everything that is neither Arabic-block nor ASCII.
        text = self.remove_non_arabic(text)

        # Character removal can leave runs of spaces; collapse them last.
        return re.sub(r'\s+', ' ', text).strip()
# Example usage: print each sample next to its normalised form.
if __name__ == "__main__":
    normalizer = ArabicTextNormalizer()
    test_texts = (
        "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللهِ وَبَرَكَاتُهُ",
        "إن شــــاء الله سنلتقي في الساعة ٣:٣٠ مساءً",
        "كَانَ هُنَاكَ ١٢٣٤٥ شَخْصًا فِي الْمَلْعَبِ",
    )
    for text in test_texts:
        normalized = normalizer(text)
        # Two labelled lines followed by an empty separator line.
        print(f"Original: {text}", f"Normalized: {normalized}", "", sep="\n")