patrickvonplaten committed on
Commit
44fabc2
•
1 Parent(s): 814e23f
Files changed (1) hide show
  1. eval.py +10 -9
eval.py CHANGED
@@ -3,6 +3,7 @@ from datasets import load_dataset, load_metric, Audio, Dataset
3
  from transformers import pipeline, AutoFeatureExtractor
4
  import re
5
  import argparse
 
6
  from typing import Dict
7
 
8
 
@@ -52,14 +53,14 @@ def normalize_text(text: str) -> str:
52
 
53
  chars_to_ignore_regex = '[,?.!\-\;\:\"β€œ%β€˜β€οΏ½β€”β€™β€¦β€“]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
54
 
55
- text = re.sub(chars_to_ignore_regex, "", text.lower())
 
 
 
 
56
 
57
- # In addition, we can normalize the target text, e.g. removing new lines characters etc...
58
- # note that order is important here!
59
- token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
60
-
61
- for t in token_sequences_to_ignore:
62
- text = " ".join(text.split(t))
63
 
64
  return text
65
 
@@ -113,10 +114,10 @@ if __name__ == "__main__":
113
  "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
114
  )
115
  parser.add_argument(
116
- "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to 5 seconds."
117
  )
118
  parser.add_argument(
119
- "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to 1 second."
120
  )
121
  parser.add_argument(
122
  "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
 
3
  from transformers import pipeline, AutoFeatureExtractor
4
  import re
5
  import argparse
6
+ import unicodedata
7
  from typing import Dict
8
 
9
 
 
53
 
54
  chars_to_ignore_regex = '[,?.!\-\;\:\"β€œ%β€˜β€οΏ½β€”β€™β€¦β€“]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
55
 
56
+ text = text.lower()
57
+ # normalize non-standard (stylized) unicode characters
58
+ text = unicodedata.normalize('NFKC', text)
59
+ # remove punctuation
60
+ text = re.sub(chars_to_ignore_regex, "", text)
61
 
62
+ # Let's also make sure we split on all kinds of newlines, spaces, etc...
63
+ text = " ".join(text.split())
 
 
 
 
64
 
65
  return text
66
 
 
114
  "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
115
  )
116
  parser.add_argument(
117
+ "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
118
  )
119
  parser.add_argument(
120
+ "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
121
  )
122
  parser.add_argument(
123
  "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."