nynorsk config
Browse files
- run.sh +1 -1
- run_speech_recognition_ctc.py +4 -4
run.sh
CHANGED
@@ -2,7 +2,7 @@ python run_speech_recognition_ctc.py \
|
|
2 |
--dataset_name="NbAiLab/NPSC" \
|
3 |
--model_name_or_path="KBLab/wav2vec2-large-voxrex" \
|
4 |
--hub_model_id="NbAiLab/wav2vec2-large-voxrex-npsc-nynorsk" \
|
5 |
-
--dataset_config_name="
|
6 |
--output_dir="./" \
|
7 |
--overwrite_output_dir \
|
8 |
--num_train_epochs="40" \
|
|
|
2 |
--dataset_name="NbAiLab/NPSC" \
|
3 |
--model_name_or_path="KBLab/wav2vec2-large-voxrex" \
|
4 |
--hub_model_id="NbAiLab/wav2vec2-large-voxrex-npsc-nynorsk" \
|
5 |
+
--dataset_config_name="16K_mp3_nynorsk" \
|
6 |
--output_dir="./" \
|
7 |
--overwrite_output_dir \
|
8 |
--num_train_epochs="40" \
|
run_speech_recognition_ctc.py
CHANGED
@@ -412,8 +412,8 @@ def main():
|
|
412 |
#def filter_inaudible(entry):
|
413 |
# return not re.search("\d|<inaudible>", entry["text"], flags=re.IGNORECASE)
|
414 |
#
|
415 |
-
def filter_nynorsk(entry):
|
416 |
-
return re.search("nn-no", entry["sentence_language_code"], flags=re.IGNORECASE)
|
417 |
|
418 |
def filter_tooshort(entry):
|
419 |
#print(f"The audio sample ({entry["audio"]["path"]}) is too small, and has been omitted. "
|
@@ -449,7 +449,7 @@ def main():
|
|
449 |
split=data_args.train_split_name,
|
450 |
use_auth_token=data_args.use_auth_token,
|
451 |
).shuffle()
|
452 |
-
raw_datasets["train"] = raw_datasets["train"].filter(filter_numeric).filter(filter_nynorsk).filter(filter_tooshort)
|
453 |
raw_datasets["train"] = raw_datasets["train"].map(map_dataset)
|
454 |
|
455 |
if data_args.audio_column_name not in raw_datasets["train"].column_names:
|
@@ -476,7 +476,7 @@ def main():
|
|
476 |
split=data_args.eval_split_name,
|
477 |
use_auth_token=data_args.use_auth_token,
|
478 |
).shuffle()
|
479 |
-
raw_datasets["eval"] = raw_datasets["eval"].filter(filter_numeric).filter(filter_nynorsk).filter(filter_tooshort)
|
480 |
raw_datasets["eval"] = raw_datasets["eval"].map(map_dataset)
|
481 |
|
482 |
if data_args.max_eval_samples is not None:
|
|
|
412 |
#def filter_inaudible(entry):
|
413 |
# return not re.search("\d|<inaudible>", entry["text"], flags=re.IGNORECASE)
|
414 |
#
|
415 |
+
#def filter_nynorsk(entry):
|
416 |
+
# return re.search("nn-no", entry["sentence_language_code"], flags=re.IGNORECASE)
|
417 |
|
418 |
def filter_tooshort(entry):
|
419 |
#print(f"The audio sample ({entry["audio"]["path"]}) is too small, and has been omitted. "
|
|
|
449 |
split=data_args.train_split_name,
|
450 |
use_auth_token=data_args.use_auth_token,
|
451 |
).shuffle()
|
452 |
+
raw_datasets["train"] = raw_datasets["train"].filter(filter_numeric).filter(filter_tooshort)
|
453 |
raw_datasets["train"] = raw_datasets["train"].map(map_dataset)
|
454 |
|
455 |
if data_args.audio_column_name not in raw_datasets["train"].column_names:
|
|
|
476 |
split=data_args.eval_split_name,
|
477 |
use_auth_token=data_args.use_auth_token,
|
478 |
).shuffle()
|
479 |
+
raw_datasets["eval"] = raw_datasets["eval"].filter(filter_numeric).filter(filter_tooshort)
|
480 |
raw_datasets["eval"] = raw_datasets["eval"].map(map_dataset)
|
481 |
|
482 |
if data_args.max_eval_samples is not None:
|