accelerate launch run_pseudo_labelling.py \ | |
--model_name_or_path "openai/whisper-large-v2" \ | |
--dataset_name "mozilla-foundation/common_voice_13_0" \ | |
--dataset_config_name "pt" \ | |
--dataset_split_name "train+validation+test" \ | |
--text_column_name "sentence" \ | |
--id_column_name "path" \ | |
--output_dir "./common_voice_13_0_pt_pseudo_labelled" \ | |
--wandb_project "distil-whisper-labelling" \ | |
--per_device_eval_batch_size 64 \ | |
--dtype "bfloat16" \ | |
--dataloader_num_workers 16 \ | |
--preprocessing_num_workers 16 \ | |
--logging_steps 500 \ | |
--max_label_length 128 \ | |
--report_to "wandb" \ | |
--language "pt" \ | |
--task "transcribe" \ | |
--return_timestamps \ | |
--attn_type "flash_attn" \ | |
--streaming False \ | |
--generation_num_beams 1 \ | |
--decode_token_ids False \ | |
--push_to_hub | |