|
{
|
|
"dataset_name": "mozilla-foundation/common_voice_13_0",
|
|
"model_name_or_path": "facebook/wav2vec2-large-xlsr-53",
|
|
"dataset_config_name": "eo",
|
|
"output_dir": "./wav2vec2-common_voice_13_0-eo-10",
|
|
"train_split_name": "train",
|
|
"eval_split_name": "validation",
|
|
"eval_metrics": ["cer", "wer"],
|
|
"overwrite_output_dir": true,
|
|
"preprocessing_num_workers": 1,
|
|
"num_train_epochs": 5,
|
|
"per_device_train_batch_size": 16,
|
|
"gradient_accumulation_steps": 2,
|
|
"gradient_checkpointing": true,
|
|
"learning_rate": 3e-5,
|
|
"warmup_steps": 500,
|
|
"evaluation_strategy": "steps",
|
|
"text_column_name": "sentence",
|
|
"length_column_name": "input_length",
|
|
"save_steps": 1000,
|
|
"eval_steps": 1000,
|
|
"layerdrop": 0.2,
|
|
"save_total_limit": 3,
|
|
"freeze_feature_encoder": true,
|
|
"chars_to_ignore": "-!\"'(),.:;=?_`¨«¸»ʼ‑–—‘’“”„…‹›♫?",
|
|
"chars_to_substitute": {
|
|
"przy": "pŝe",
|
|
"byn": "bin",
|
|
"cx": "ĉ",
|
|
"sx": "ŝ",
|
|
"ﬁ": "fi",
|
|
"ﬂ": "fl",
|
|
"ǔ": "ŭ",
|
|
"ñ": "nj",
|
|
"á": "a",
|
|
"é": "e",
|
|
"ü": "ŭ",
|
|
"y": "j",
|
|
"qu": "ku"
|
|
},
|
|
"fp16": true,
|
|
"group_by_length": true,
|
|
"push_to_hub": true,
|
|
"do_train": true,
|
|
"do_eval": true
|
|
}
|
|
|