Commit
•
44fabc2
1
Parent(s):
814e23f
up
Browse files
eval.py
CHANGED
@@ -3,6 +3,7 @@ from datasets import load_dataset, load_metric, Audio, Dataset
|
|
3 |
from transformers import pipeline, AutoFeatureExtractor
|
4 |
import re
|
5 |
import argparse
|
|
|
6 |
from typing import Dict
|
7 |
|
8 |
|
@@ -52,14 +53,14 @@ def normalize_text(text: str) -> str:
|
|
52 |
|
53 |
chars_to_ignore_regex = '[,?.!\-\;\:\"β%ββοΏ½βββ¦β]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
|
54 |
|
55 |
-
text =
|
|
|
|
|
|
|
|
|
56 |
|
57 |
-
#
|
58 |
-
|
59 |
-
token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
|
60 |
-
|
61 |
-
for t in token_sequences_to_ignore:
|
62 |
-
text = " ".join(text.split(t))
|
63 |
|
64 |
return text
|
65 |
|
@@ -113,10 +114,10 @@ if __name__ == "__main__":
|
|
113 |
"--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
|
114 |
)
|
115 |
parser.add_argument(
|
116 |
-
"--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to 5 seconds."
|
117 |
)
|
118 |
parser.add_argument(
|
119 |
-
"--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to 1
|
120 |
)
|
121 |
parser.add_argument(
|
122 |
"--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
|
|
|
3 |
from transformers import pipeline, AutoFeatureExtractor
|
4 |
import re
|
5 |
import argparse
|
6 |
+
import unicodedata
|
7 |
from typing import Dict
|
8 |
|
9 |
|
|
|
53 |
|
54 |
chars_to_ignore_regex = '[,?.!\-\;\:\"β%ββοΏ½βββ¦β]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
|
55 |
|
56 |
+
text = text.lower()
|
57 |
+
# normalize non-standard (stylized) unicode characters
|
58 |
+
text = unicodedata.normalize('NFKC', text)
|
59 |
+
# remove punctuation
|
60 |
+
text = re.sub(chars_to_ignore_regex, "", text)
|
61 |
|
62 |
+
# Let's also make sure we split on all kinds of newlines, spaces, etc...
|
63 |
+
text = " ".join(text.split())
|
|
|
|
|
|
|
|
|
64 |
|
65 |
return text
|
66 |
|
|
|
114 |
"--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
|
115 |
)
|
116 |
parser.add_argument(
|
117 |
+
"--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
|
118 |
)
|
119 |
parser.add_argument(
|
120 |
+
"--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
|
121 |
)
|
122 |
parser.add_argument(
|
123 |
"--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
|