marinone94
/

xls-r-300m-sv-robust

Automatic Speech Recognition

mozilla-foundation/common_voice_9_0

Generated from Trainer

Inference Endpoints

Model card · Files and versions · Community

marinone94 committed on Jan 31, 2022

Commit

393fd68

•

1 Parent(s): 9be1ce7

correct filtering column

Files changed (2) hide show

run.sh +1 -1
run_speech_recognition_ctc.py +3 -3

run.sh CHANGED Viewed

@@ -17,7 +17,7 @@ python run_speech_recognition_ctc.py \
 	--evaluation_strategy="epoch" \
 	--save_strategy="epoch" \
 	--text_column_name="sentence" \
-	--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – / \\ \
 	--logging_steps="100" \
 	--layerdrop="0.0" \
 	--activation_dropout="0.1" \

 	--evaluation_strategy="epoch" \
 	--save_strategy="epoch" \
 	--text_column_name="sentence" \
+	--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – / \
 	--logging_steps="100" \
 	--layerdrop="0.0" \
 	--activation_dropout="0.1" \

run_speech_recognition_ctc.py CHANGED Viewed

@@ -521,9 +521,9 @@ def main():
     def remove_special_characters(batch):
         if chars_to_ignore_regex is not None:
-            batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).replace("\\\\Punkt", "").replace("\\\\Komma", "").lower().decode("utf-8") + " "
         else:
-            batch["target_text"] = batch[text_column_name].replace("\\\\Punkt", "").replace("\\\\Komma", "").lower().decode("utf-8") + " "
         return batch
     num_workers = data_args.preprocessing_num_workers
@@ -537,7 +537,7 @@ def main():
         raw_datasets = raw_datasets.filter(
             is_text_valid,
             num_proc=num_workers,
-            input_columns=["input_length"],
             desc="remove single words, single chars and 'W O R D S'",
         )

     def remove_special_characters(batch):
         if chars_to_ignore_regex is not None:
+            batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).replace("\\\\Punkt", "").replace("\\\\Komma", "").lower() + " "
         else:
+            batch["target_text"] = batch[text_column_name].replace("\\\\Punkt", "").replace("\\\\Komma", "").lower() + " "
         return batch
     num_workers = data_args.preprocessing_num_workers
         raw_datasets = raw_datasets.filter(
             is_text_valid,
             num_proc=num_workers,
+            input_columns=["target_text"],
             desc="remove single words, single chars and 'W O R D S'",
         )