second
Browse files
- run.sh +6 -5
- run_speech_recognition_ctc.py +7 -6
run.sh
CHANGED
@@ -1,15 +1,15 @@
|
|
1 |
-
|
2 |
--dataset_name="NbAiLab/NPSC" \
|
3 |
--model_name_or_path="KBLab/wav2vec2-large-voxrex" \
|
4 |
--hub_model_id="NbAiLab/wav2vec2-large-voxrex-npsc-nynorsk" \
|
5 |
--dataset_config_name="16K_mp3" \
|
6 |
--output_dir="./" \
|
7 |
--overwrite_output_dir \
|
8 |
-
--num_train_epochs="
|
9 |
--per_device_train_batch_size="16" \
|
10 |
--per_device_eval_batch_size="16" \
|
11 |
--gradient_accumulation_steps="2" \
|
12 |
-
--learning_rate="
|
13 |
--warmup_steps="2000" \
|
14 |
--length_column_name="input_length" \
|
15 |
--evaluation_strategy="steps" \
|
@@ -29,7 +29,7 @@ WANDB_ENTITY=NbAiLab WANDB_PROJECT=wav2vec2 python run_speech_recognition_ctc.py
|
|
29 |
--mask_feature_prob="0.25" \
|
30 |
--mask_feature_length="64" \
|
31 |
--gradient_checkpointing \
|
32 |
-
--min_duration_in_seconds="0.
|
33 |
--max_duration_in_seconds="30.0" \
|
34 |
--use_auth_token \
|
35 |
--seed="42" \
|
@@ -37,4 +37,5 @@ WANDB_ENTITY=NbAiLab WANDB_PROJECT=wav2vec2 python run_speech_recognition_ctc.py
|
|
37 |
--group_by_length \
|
38 |
--do_train --do_eval \
|
39 |
--push_to_hub \
|
40 |
-
--preprocessing_num_workers="32"
|
|
|
|
1 |
+
python run_speech_recognition_ctc.py \
|
2 |
--dataset_name="NbAiLab/NPSC" \
|
3 |
--model_name_or_path="KBLab/wav2vec2-large-voxrex" \
|
4 |
--hub_model_id="NbAiLab/wav2vec2-large-voxrex-npsc-nynorsk" \
|
5 |
--dataset_config_name="16K_mp3" \
|
6 |
--output_dir="./" \
|
7 |
--overwrite_output_dir \
|
8 |
+
--num_train_epochs="40" \
|
9 |
--per_device_train_batch_size="16" \
|
10 |
--per_device_eval_batch_size="16" \
|
11 |
--gradient_accumulation_steps="2" \
|
12 |
+
--learning_rate="7.5e-5" \
|
13 |
--warmup_steps="2000" \
|
14 |
--length_column_name="input_length" \
|
15 |
--evaluation_strategy="steps" \
|
|
|
29 |
--mask_feature_prob="0.25" \
|
30 |
--mask_feature_length="64" \
|
31 |
--gradient_checkpointing \
|
32 |
+
--min_duration_in_seconds="0.8" \
|
33 |
--max_duration_in_seconds="30.0" \
|
34 |
--use_auth_token \
|
35 |
--seed="42" \
|
|
|
37 |
--group_by_length \
|
38 |
--do_train --do_eval \
|
39 |
--push_to_hub \
|
40 |
+
--preprocessing_num_workers="32"\
|
41 |
+
--ctc_zero_infinity=True
|
run_speech_recognition_ctc.py
CHANGED
@@ -409,11 +409,11 @@ def main():
|
|
409 |
and "9" not in entry["text"]
|
410 |
)
|
411 |
|
412 |
-
def filter_inaudible(entry):
|
413 |
-
|
414 |
-
|
415 |
def filter_nynorsk(entry):
|
416 |
-
return re.search("
|
417 |
|
418 |
def filter_tooshort(entry):
|
419 |
#print(f"The audio sample ({entry["audio"]["path"]}) is too small, and has been omitted. "
|
@@ -433,6 +433,7 @@ def main():
|
|
433 |
batch["text"] = re.sub('<ee>', 'eee', batch["text"])
|
434 |
batch["text"] = re.sub('<qq>', 'qqq', batch["text"])
|
435 |
batch["text"] = re.sub('<mm>', 'mmm', batch["text"])
|
|
|
436 |
# batch["text"] = re.sub('<inaudible>', '?', batch["text"])
|
437 |
if "<" in batch["text"]:
|
438 |
raise ValueError(batch["text"])
|
@@ -448,7 +449,7 @@ def main():
|
|
448 |
split=data_args.train_split_name,
|
449 |
use_auth_token=data_args.use_auth_token,
|
450 |
).shuffle()
|
451 |
-
raw_datasets["train"] = raw_datasets["train"].filter(filter_numeric).filter(
|
452 |
raw_datasets["train"] = raw_datasets["train"].map(map_dataset)
|
453 |
|
454 |
if data_args.audio_column_name not in raw_datasets["train"].column_names:
|
@@ -475,7 +476,7 @@ def main():
|
|
475 |
split=data_args.eval_split_name,
|
476 |
use_auth_token=data_args.use_auth_token,
|
477 |
).shuffle()
|
478 |
-
raw_datasets["eval"] = raw_datasets["eval"].filter(filter_numeric).filter(
|
479 |
raw_datasets["eval"] = raw_datasets["eval"].map(map_dataset)
|
480 |
|
481 |
if data_args.max_eval_samples is not None:
|
|
|
409 |
and "9" not in entry["text"]
|
410 |
)
|
411 |
|
412 |
+
#def filter_inaudible(entry):
|
413 |
+
# return not re.search("\d|<inaudible>", entry["text"], flags=re.IGNORECASE)
|
414 |
+
#
|
415 |
def filter_nynorsk(entry):
|
416 |
+
return re.search("nn-no", entry["sentence_language_code"], flags=re.IGNORECASE)
|
417 |
|
418 |
def filter_tooshort(entry):
|
419 |
#print(f"The audio sample ({entry["audio"]["path"]}) is too small, and has been omitted. "
|
|
|
433 |
batch["text"] = re.sub('<ee>', 'eee', batch["text"])
|
434 |
batch["text"] = re.sub('<qq>', 'qqq', batch["text"])
|
435 |
batch["text"] = re.sub('<mm>', 'mmm', batch["text"])
|
436 |
+
batch["text"] = re.sub('<inaudible>', 'xxx', batch["text"])
|
437 |
# batch["text"] = re.sub('<inaudible>', '?', batch["text"])
|
438 |
if "<" in batch["text"]:
|
439 |
raise ValueError(batch["text"])
|
|
|
449 |
split=data_args.train_split_name,
|
450 |
use_auth_token=data_args.use_auth_token,
|
451 |
).shuffle()
|
452 |
+
raw_datasets["train"] = raw_datasets["train"].filter(filter_numeric).filter(filter_nynorsk).filter(filter_tooshort)
|
453 |
raw_datasets["train"] = raw_datasets["train"].map(map_dataset)
|
454 |
|
455 |
if data_args.audio_column_name not in raw_datasets["train"].column_names:
|
|
|
476 |
split=data_args.eval_split_name,
|
477 |
use_auth_token=data_args.use_auth_token,
|
478 |
).shuffle()
|
479 |
+
raw_datasets["eval"] = raw_datasets["eval"].filter(filter_numeric).filter(filter_nynorsk).filter(filter_tooshort)
|
480 |
raw_datasets["eval"] = raw_datasets["eval"].map(map_dataset)
|
481 |
|
482 |
if data_args.max_eval_samples is not None:
|