hf-test/xls-r-300m-sv

Automatic Speech Recognition

Generated from Trainer

hf-asr-leaderboard

mozilla-foundation/common_voice_7_0

robust-speech-event

Inference Endpoints

Model card Files Files and versions Metrics Training metrics Community

patrickvonplaten committed on Jan 18, 2022

Commit

44fabc2

•

1 Parent(s): 814e23f

up

Files changed (1) hide show

eval.py +10 -9

eval.py CHANGED Viewed

@@ -3,6 +3,7 @@ from datasets import load_dataset, load_metric, Audio, Dataset
 from transformers import pipeline, AutoFeatureExtractor
 import re
 import argparse
 from typing import Dict
@@ -52,14 +53,14 @@ def normalize_text(text: str) -> str:
     chars_to_ignore_regex = '[,?.!\-\;\:\"“%‘”�—’…–]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
-    text = re.sub(chars_to_ignore_regex, "", text.lower())
-    # In addition, we can normalize the target text, e.g. removing new lines characters etc...
-    # note that order is important here!
-    token_sequences_to_ignore = ["\n\n", "\n", "   ", "  "]
-    for t in token_sequences_to_ignore:
-        text = " ".join(text.split(t))
     return text
@@ -113,10 +114,10 @@ if __name__ == "__main__":
         "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
     )
     parser.add_argument(
-        "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to 5 seconds."
     )
     parser.add_argument(
-        "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to 1 second."
     )
     parser.add_argument(
         "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."

 from transformers import pipeline, AutoFeatureExtractor
 import re
 import argparse
+import unicodedata
 from typing import Dict
     chars_to_ignore_regex = '[,?.!\-\;\:\"“%‘”�—’…–]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+    text = text.lower()
+    # normalize non-standard (stylized) unicode characters
+    text = unicodedata.normalize('NFKC', text)
+    # remove punctuation
+    text = re.sub(chars_to_ignore_regex, "", text)
+    # Let's also make sure we split on all kinds of newlines, spaces, etc...
+    text = " ".join(text.split())
     return text
         "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
     )
     parser.add_argument(
+        "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
     )
     parser.add_argument(
+        "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
     )
     parser.add_argument(
         "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."