freddy committed on
Commit
6b8e31f
1 Parent(s): 5ef4794

nynorsk config

Browse files
Files changed (2) hide show
  1. run.sh +1 -1
  2. run_speech_recognition_ctc.py +4 -4
run.sh CHANGED
@@ -2,7 +2,7 @@ python run_speech_recognition_ctc.py \
2
  --dataset_name="NbAiLab/NPSC" \
3
  --model_name_or_path="KBLab/wav2vec2-large-voxrex" \
4
  --hub_model_id="NbAiLab/wav2vec2-large-voxrex-npsc-nynorsk" \
5
- --dataset_config_name="16K_mp3" \
6
  --output_dir="./" \
7
  --overwrite_output_dir \
8
  --num_train_epochs="40" \
 
2
  --dataset_name="NbAiLab/NPSC" \
3
  --model_name_or_path="KBLab/wav2vec2-large-voxrex" \
4
  --hub_model_id="NbAiLab/wav2vec2-large-voxrex-npsc-nynorsk" \
5
+ --dataset_config_name="16K_mp3_nynorsk" \
6
  --output_dir="./" \
7
  --overwrite_output_dir \
8
  --num_train_epochs="40" \
run_speech_recognition_ctc.py CHANGED
@@ -412,8 +412,8 @@ def main():
412
  #def filter_inaudible(entry):
413
  # return not re.search("\d|<inaudible>", entry["text"], flags=re.IGNORECASE)
414
  #
415
- def filter_nynorsk(entry):
416
- return re.search("nn-no", entry["sentence_language_code"], flags=re.IGNORECASE)
417
 
418
  def filter_tooshort(entry):
419
  #print(f"The audio sample ({entry["audio"]["path"]}) is too small, and has been omitted. "
@@ -449,7 +449,7 @@ def main():
449
  split=data_args.train_split_name,
450
  use_auth_token=data_args.use_auth_token,
451
  ).shuffle()
452
- raw_datasets["train"] = raw_datasets["train"].filter(filter_numeric).filter(filter_nynorsk).filter(filter_tooshort)
453
  raw_datasets["train"] = raw_datasets["train"].map(map_dataset)
454
 
455
  if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -476,7 +476,7 @@ def main():
476
  split=data_args.eval_split_name,
477
  use_auth_token=data_args.use_auth_token,
478
  ).shuffle()
479
- raw_datasets["eval"] = raw_datasets["eval"].filter(filter_numeric).filter(filter_nynorsk).filter(filter_tooshort)
480
  raw_datasets["eval"] = raw_datasets["eval"].map(map_dataset)
481
 
482
  if data_args.max_eval_samples is not None:
 
412
  #def filter_inaudible(entry):
413
  # return not re.search("\d|<inaudible>", entry["text"], flags=re.IGNORECASE)
414
  #
415
+ #def filter_nynorsk(entry):
416
+ # return re.search("nn-no", entry["sentence_language_code"], flags=re.IGNORECASE)
417
 
418
  def filter_tooshort(entry):
419
  #print(f"The audio sample ({entry["audio"]["path"]}) is too small, and has been omitted. "
 
449
  split=data_args.train_split_name,
450
  use_auth_token=data_args.use_auth_token,
451
  ).shuffle()
452
+ raw_datasets["train"] = raw_datasets["train"].filter(filter_numeric).filter(filter_tooshort)
453
  raw_datasets["train"] = raw_datasets["train"].map(map_dataset)
454
 
455
  if data_args.audio_column_name not in raw_datasets["train"].column_names:
 
476
  split=data_args.eval_split_name,
477
  use_auth_token=data_args.use_auth_token,
478
  ).shuffle()
479
+ raw_datasets["eval"] = raw_datasets["eval"].filter(filter_numeric).filter(filter_tooshort)
480
  raw_datasets["eval"] = raw_datasets["eval"].map(map_dataset)
481
 
482
  if data_args.max_eval_samples is not None: