whisper-large-v3-macedonian-asr / hyperparams.yaml

Update hyperparams.yaml

47ad4b9 verified 2 months ago

4.72 kB

	# Seed needs to be set at top of yaml, before objects with parameters are made
	seed: 2024
	__set_seed: !apply:torch.manual_seed [!ref <seed>]

	skip_training: True

	# Hparams NEEDED
	HPARAMS_NEEDED: ["log_softmax"]
	# Modules Needed
	MODULES_NEEDED: ["whisper"]

	output_folder: !ref output_folder_whisper
	pretrained_path: Macedonian-ASR/whisper-large-v3-macedonian-asr
	output_wer_folder: !ref <output_folder>/
	save_folder: !ref <output_folder>/save
	train_log: !ref <output_folder>/train_log.txt


	# URL for the biggest Fairseq english whisper model.
	whisper_hub: openai/whisper-large-v3

	# Normalize inputs with the same normalization done in the paper (https://cdn.openai.com/papers/whisper.pdf). Refer to Appendix C for further information.
	normalized_transcripts: False
	restore_capitalization: False

	# Data files
	language: "macedonian"
	data_folder: "../../data/combined_data/speechbrain_splits"
	accented_letters: True

	ckpt_interval_minutes: 30 # save checkpoint every N min

	####################### Training Parameters ####################################
	freeze_whisper: False
	freeze_encoder: True
	number_of_epochs: 50
	weight_decay: 0.01
	lr_whisper: 1e-5
	warmup_steps: 500
	max_grad_norm: 2.0
	precision: fp16 # bf16, fp16 or fp32
	eval_precision: fp16
	sample_rate: 16000

	# With data_parallel batch_size is split into N jobs
	batch_size: 6
	test_batch_size: 1
	grad_accumulation_factor: 2


	# Decoding parameters
	min_decode_ratio: 0.0
	max_decode_ratio: 1.0
	test_beam_size: 8

	####################### Model Parameters #######################################
	train_dataloader_opts:
	batch_size: !ref <batch_size>

	valid_dataloader_opts:
	batch_size: !ref <batch_size>

	test_dataloader_opts:
	batch_size: !ref <test_batch_size>

	epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
	limit: !ref <number_of_epochs>

	############################## Augmentations ###################################

	# Speed perturbation
	speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
	orig_freq: 16000
	speeds: [95, 100, 105]

	# Frequency drop: randomly drops a number of frequency bands to zero.
	drop_freq: !new:speechbrain.augment.time_domain.DropFreq
	drop_freq_low: 0
	drop_freq_high: 1
	drop_freq_count_low: 1
	drop_freq_count_high: 3
	drop_freq_width: 0.05

	# Time drop: randomly drops a number of temporal chunks.
	drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
	drop_length_low: 1000
	drop_length_high: 2000
	drop_count_low: 1
	drop_count_high: 5

	# Augmenter: Combines previously defined augmentations to perform data augmentation
	wav_augment: !new:speechbrain.augment.augmenter.Augmenter
	concat_original: False
	min_augmentations: 1
	max_augmentations: 3
	augment_prob: 0.5
	augmentations: [
	!ref <speed_perturb>,
	!ref <drop_freq>,
	!ref <drop_chunk>]


	############################## Models ##########################################


	whisper: !new:speechbrain.lobes.models.huggingface_transformers.whisper.Whisper
	source: !ref <whisper_hub>
	freeze: !ref <freeze_whisper>
	freeze_encoder: !ref <freeze_encoder>
	save_path: !ref <save_folder>/whisper_checkpoint
	language: !ref <language>
	task: "transcribe"

	log_softmax: !new:speechbrain.nnet.activations.Softmax
	apply_log: True

	nll_loss: !name:speechbrain.nnet.losses.nll_loss

	modules:
	whisper: !ref <whisper>

	############################## Decoding & optimiser ############################

	whisper_opt_class: !name:torch.optim.AdamW
	lr: !ref <lr_whisper>
	weight_decay: !ref <weight_decay>

	valid_search: !new:speechbrain.decoders.seq2seq.S2SWhisperGreedySearcher
	model: !ref <whisper>
	min_decode_ratio: !ref <min_decode_ratio>
	max_decode_ratio: !ref <max_decode_ratio>

	test_search: !new:speechbrain.decoders.seq2seq.S2SWhisperBeamSearcher
	module: [!ref <whisper>]
	min_decode_ratio: !ref <min_decode_ratio>
	max_decode_ratio: !ref <max_decode_ratio>
	beam_size: !ref <test_beam_size>

	lr_annealing_whisper: !new:speechbrain.nnet.schedulers.NoamScheduler
	lr_initial: !ref <lr_whisper>
	n_warmup_steps: !ref <warmup_steps>

	############################## Logging and Pretrainer ##########################

	checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
	checkpoints_dir: !ref <save_folder>
	recoverables:
	whisper: !ref <whisper>
	scheduler_whisper: !ref <lr_annealing_whisper>
	counter: !ref <epoch_counter>

	train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
	save_file: !ref <train_log>

	error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

	cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
	split_tokens: True