Drop common voice and update rtfx #17
opened by sanchit-gandhi
- README.md +1 -1
- app.py +5 -5
- constants.py +34 -17
- init.py +1 -2
- utils_display.py +1 -2
README.md CHANGED

@@ -4,7 +4,7 @@ emoji: π
 colorFrom: red
 colorTo: blue
 sdk: gradio
-sdk_version:
+sdk_version: 4.41.0
 app_file: app.py
 pinned: true
 tags:
app.py CHANGED

@@ -6,12 +6,12 @@ from init import is_model_on_hub, upload_file, load_all_info_from_dataset_hub
 from utils_display import AutoEvalColumn, fields, make_clickable_model, styled_error, styled_message
 from datetime import datetime, timezone

-LAST_UPDATED = "
+LAST_UPDATED = "Aug 12th 2024"

 column_names = {
     "MODEL": "Model",
     "Avg. WER": "Average WER ⬇️",
-    "
+    "Avg. RTFx": "RTFx ⬆️",
     "AMI WER": "AMI",
     "Earnings22 WER": "Earnings22",
     "Gigaspeech WER": "Gigaspeech",
@@ -20,7 +20,7 @@ column_names = {
     "SPGISpeech WER": "SPGISpeech",
     "Tedlium WER": "Tedlium",
     "Voxpopuli WER": "Voxpopuli",
-
+}

 eval_queue_repo, requested_models, csv_results = load_all_info_from_dataset_hub()

@@ -111,7 +111,6 @@ with gr.Blocks() as demo:
     leaderboard_table = gr.components.Dataframe(
         value=original_df,
         datatype=TYPES,
-        max_rows=None,
         elem_id="leaderboard-table",
         interactive=False,
         visible=True,
@@ -143,6 +142,7 @@ with gr.Blocks() as demo:
         value=CITATION_TEXT, lines=7,
         label="Copy the BibTeX snippet to cite this source",
         elem_id="citation-button",
-
+        show_copy_button=True,
+    )

 demo.launch()
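Note on the app.py changes: dropping `max_rows` and adding `show_copy_button` track the Gradio 4.x API that the README now pins (`sdk_version: 4.41.0`); `max_rows` is presumably no longer accepted by `gr.Dataframe`, while `show_copy_button` adds a copy control to the citation box. The sketch below shows the updated components in isolation, assuming Gradio ≥ 4; the dataframe contents, `TYPES`, and `CITATION_TEXT` here are placeholders, not the leaderboard's real data.

```python
# Standalone sketch of the updated Gradio 4.x components (placeholder data).
import pandas as pd
import gradio as gr

original_df = pd.DataFrame(
    {"Model": ["demo/model"], "Average WER ⬇️": [10.0], "RTFx ⬆️": [100.0]}
)
TYPES = ["markdown", "number", "number"]
CITATION_TEXT = "@misc{open-asr-leaderboard, title={Open ASR Leaderboard}}"

with gr.Blocks() as demo:
    # `max_rows` is gone: the table simply renders all rows it is given.
    leaderboard_table = gr.components.Dataframe(
        value=original_df,
        datatype=TYPES,
        elem_id="leaderboard-table",
        interactive=False,
        visible=True,
    )
    # `show_copy_button=True` adds a one-click copy control to the BibTeX box.
    citation = gr.Textbox(
        value=CITATION_TEXT,
        lines=7,
        label="Copy the BibTeX snippet to cite this source",
        elem_id="citation-button",
        show_copy_button=True,
    )

demo.launch()
```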
constants.py CHANGED

@@ -15,7 +15,7 @@ TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body>

 INTRODUCTION_TEXT = "π The 🤗 Open ASR Leaderboard ranks and evaluates speech recognition models \
 on the Hugging Face Hub. \
-\nWe report the Average [WER](https://huggingface.co/spaces/evaluate-metric/wer) (⬇️) and [
+\nWe report the Average [WER](https://huggingface.co/spaces/evaluate-metric/wer) (⬇️ lower the better) and [RTFx](https://github.com/NVIDIA/DeepLearningExamples/blob/master/Kaldi/SpeechRecognition/README.md#metrics) (⬆️ higher the better). Models are ranked based on their Average WER, from lowest to highest. Check the π Metrics tab to understand how the models are evaluated. \
 \nIf you want results for a model that is not listed here, you can submit a request for it to be included ✍️✨. \
 \nThe leaderboard currently focuses on English speech recognition, and will be expanded to multilingual evaluation in later versions."

@@ -33,34 +33,52 @@ Here you will find details about the speech recognition metrics and datasets rep

 ## Metrics

-
-
-
+Models are evaluated jointly using the Word Error Rate (WER) and Inverse Real Time Factor (RTFx) metrics. The WER metric
+is used to assess the accuracy of a system, and the RTFx the inference speed. Models are ranked in the leaderboard based
+on their WER, lowest to highest.
+
+Crucially, the WER and RTFx values are computed for the same inference run using a single script. The implication of this is two-fold:
+1. The WER and RTFx values are coupled: for a given WER, one can expect to achieve the corresponding RTFx. This allows the proposer to trade off lower WER for higher RTFx should they wish.
+2. The WER and RTFx values are averaged over all audios in the benchmark (in the order of thousands of audios).
+
+For details on reproducing the benchmark numbers, refer to the [Open ASR GitHub repository](https://github.com/huggingface/open_asr_leaderboard#evaluate-a-model).

 ### Word Error Rate (WER)

 Word Error Rate is used to measure the **accuracy** of automatic speech recognition systems. It calculates the percentage
 of words in the system's output that differ from the reference (correct) transcript. **A lower WER value indicates higher accuracy**.

-
-Example: If the reference transcript is "I really love cats," and the ASR system outputs "I don't love dogs,".
-The WER would be `50%` because 2 out of 4 words are incorrect.
-```
+Take the following example:

-
+| Reference:  | the | cat | sat     | on  | the | mat |
+|-------------|-----|-----|---------|-----|-----|-----|
+| Prediction: | the | cat | **sit** | on  | the |     |
+| Label:      | ✅  | ✅  | S       | ✅  | ✅  | D   |

-
+Here, we have:
+* 1 substitution ("sit" instead of "sat")
+* 0 insertions
+* 1 deletion ("mat" is missing)

-
-
-speech as fast as it's spoken, while an RTF of 2 means it takes twice as long. Thus, **a lower RTF value indicates lower latency**.
+This gives 2 errors in total. To get our word error rate, we divide the total number of errors (substitutions + insertions + deletions) by the total number of words in our
+reference (N), which for this example is 6:

 ```
-
-
+WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
+```
+
+Giving a WER of 0.33, or 33%. For a fair comparison, we calculate **zero-shot** (i.e. pre-trained models only) *normalised WER* for all the model checkpoints, meaning punctuation and casing are removed from the references and predictions. You can find the evaluation code on our [Github repository](https://github.com/huggingface/open_asr_leaderboard). To read more about how the WER is computed, refer to the [Audio Transformers Course](https://huggingface.co/learn/audio-course/chapter5/evaluation).
+
+### Inverse Real Time Factor (RTFx)
+
+Inverse Real Time Factor is a measure of the **latency** of automatic speech recognition systems, i.e. how long it takes a
+model to process a given amount of speech. It is defined as:
 ```
+RTFx = (number of seconds of audio inferred) / (compute time in seconds)
+```

-
+Therefore, an RTFx of 1 means a system processes speech as fast as it's spoken, while an RTFx of 2 means it takes half the time.
+Thus, **a higher RTFx value indicates lower latency**.

 ## How to reproduce our results

@@ -86,7 +104,6 @@ are ranked based on their average WER scores, from lowest to highest.
 | Dataset | Domain | Speaking Style | Train (h) | Dev (h) | Test (h) | Transcriptions | License |
 |-----------------------------------------------------------------------------------------|-----------------------------|-----------------------|-----------|---------|----------|--------------------|-----------------|
 | [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) | Audiobook | Narrated | 960 | 11 | 11 | Normalised | CC-BY-4.0 |
-| [Common Voice 9](https://huggingface.co/datasets/mozilla-foundation/common_voice_9_0) | Wikipedia | Narrated | 1409 | 27 | 27 | Punctuated & Cased | CC0-1.0 |
 | [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) | European Parliament | Oratory | 523 | 5 | 5 | Punctuated | CC0 |
 | [TED-LIUM](https://huggingface.co/datasets/LIUM/tedlium) | TED talks | Oratory | 454 | 2 | 3 | Normalised | CC-BY-NC-ND 3.0 |
 | [GigaSpeech](https://huggingface.co/datasets/speechcolab/gigaspeech) | Audiobook, podcast, YouTube | Narrated, spontaneous | 2500 | 12 | 40 | Punctuated | apache-2.0 |
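The WER and RTFx definitions added to the Metrics tab can be sanity-checked with a short standalone script. This is only an illustrative sketch of the two formulas, not the leaderboard's evaluation code (that lives in the linked open_asr_leaderboard repository); the example transcripts and timings are invented.

```python
# Illustrative check of the WER and RTFx formulas described above.

def word_error_rate(reference: str, prediction: str) -> float:
    """WER = (S + I + D) / N, via word-level Levenshtein distance."""
    ref, hyp = reference.split(), prediction.split()
    # dp[i][j] = minimum edits to turn ref[:i] into hyp[:j]
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dp[i][0] = i  # i deletions
    for j in range(len(hyp) + 1):
        dp[0][j] = j  # j insertions
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            substitution = dp[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1])
            dp[i][j] = min(substitution, dp[i - 1][j] + 1, dp[i][j - 1] + 1)
    return dp[len(ref)][len(hyp)] / len(ref)


def rtfx(audio_seconds: float, compute_seconds: float) -> float:
    """RTFx = seconds of audio transcribed / seconds of compute spent."""
    return audio_seconds / compute_seconds


# Worked example from the Metrics text: 1 substitution + 1 deletion over 6 words.
print(word_error_rate("the cat sat on the mat", "the cat sit on the"))  # ~0.333
# A system that transcribes 3600 s of audio in 36 s of compute has RTFx = 100.
print(rtfx(3600.0, 36.0))  # 100.0
```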
init.py CHANGED

@@ -14,7 +14,6 @@ hf_api = HfApi(

 def load_all_info_from_dataset_hub():
     eval_queue_repo = None
-    results_csv_path = None
     requested_models = None

     passed = True
@@ -40,7 +39,7 @@ def load_all_info_from_dataset_hub():
     if csv_results is None:
         passed = False
     if not passed:
-
+        raise ValueError("No Hugging Face token provided. Skipping evaluation requests and results.")

     return eval_queue_repo, requested_models, csv_results

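With this change, init.py fails loudly when the evaluation requests or results cannot be loaded, instead of silently returning empty handles. How the caller surfaces that error is not part of the diff; the snippet below is only a sketch of one possible handling, using the same `from init import ...` style as app.py.

```python
# Sketch only: illustrative error handling around the updated loader.
from init import load_all_info_from_dataset_hub

try:
    eval_queue_repo, requested_models, csv_results = load_all_info_from_dataset_hub()
except ValueError as err:
    # Raised by init.py when the queue repo or results CSV could not be loaded,
    # e.g. because no Hugging Face token is configured for the Space.
    print(f"Startup aborted: {err}")
    raise
```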
utils_display.py CHANGED

@@ -14,7 +14,7 @@ def fields(raw_class):
 class AutoEvalColumn:  # Auto evals column
     model = ColumnContent("Model", "markdown")
     avg_wer = ColumnContent("Average WER ⬇️", "number")
-    rtf = ColumnContent("
+    rtf = ColumnContent("RTFx ⬆️", "number")
     ami_wer = ColumnContent("AMI", "number")
     e22_wer = ColumnContent("Earnings22", "number")
     gs_wer = ColumnContent("Gigaspeech", "number")
@@ -23,7 +23,6 @@ class AutoEvalColumn:  # Auto evals column
     ss_wer = ColumnContent("SPGISpeech", "number")
     tl_wer = ColumnContent("Tedlium", "number")
     vp_wer = ColumnContent("Voxpopuli", "number")
-    cv_wer = ColumnContent("Common Voice", "number")


 def make_clickable_model(model_name):
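For context, `ColumnContent` pairs a display name with a Gradio datatype, and `fields()` iterates over the declarations on `AutoEvalColumn`. The definitions below are a hypothetical reconstruction (they are not shown in this diff), included only to show how the new `rtf` column flows into the header and `TYPES` lists consumed by the Dataframe in app.py.

```python
# Hypothetical reconstruction for illustration; the real ColumnContent and
# fields() definitions in utils_display.py are not part of this diff.
from dataclasses import dataclass


@dataclass
class ColumnContent:
    name: str  # display name shown in the leaderboard header
    type: str  # Gradio datatype: "markdown", "number", ...


def fields(raw_class):
    # Collect the ColumnContent instances declared as class attributes.
    return [v for v in vars(raw_class).values() if isinstance(v, ColumnContent)]


class AutoEvalColumn:  # abbreviated: only a few of the real columns
    model = ColumnContent("Model", "markdown")
    avg_wer = ColumnContent("Average WER ⬇️", "number")
    rtf = ColumnContent("RTFx ⬆️", "number")  # column added by this PR
    ami_wer = ColumnContent("AMI", "number")


# Derive the header names and datatypes the app passes to gr.Dataframe:
COLS = [c.name for c in fields(AutoEvalColumn)]
TYPES = [c.type for c in fields(AutoEvalColumn)]
print(COLS)
print(TYPES)
```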