# ################################
# Model: wav2vec2 + DNN + CTC
# Augmentation: SpecAugment
# Authors: Titouan Parcollet 2021
# ################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
output_folder: !ref semi_wavlm_large_tunisian_ctc/<seed>
wer_file: !ref <output_folder>/wer.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Folder where the pretrained wav2vec2/WavLM checkpoint is stored.
wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint

# Data files
data_folder: /path/to/data  # e.g., /localscratch/cv-corpus-5.1-2020-06-22/fr
train_tsv_file: !ref <data_folder>/train.tsv  # Standard CommonVoice .tsv files
dev_tsv_file: !ref <data_folder>/dev.tsv  # Standard CommonVoice .tsv files
test_tsv_file: !ref <data_folder>/test.tsv  # Standard CommonVoice .tsv files
accented_letters: True
language: fr  # use 'it' for Italian, 'rw' for Kinyarwanda, 'en' for English
test_csv:
  - /path/to/test_data
skip_prep: True  # Skip data preparation

use_language_modelling: True
ngram_lm_path: outdomain.arpa

# We remove utterances longer than 10s in the train/dev/test sets, as
# longer sentences certainly correspond to "open microphones".
avoid_if_longer_than: 10.0
avoid_if_shorter_than: 1.2

# Training parameters
number_of_epochs: 12
lr: 1.0
lr_wav2vec: 0.0001
sorting: ascending
auto_mix_prec: False
sample_rate: 16000
ckpt_interval_minutes: 30  # save checkpoint every N min

# With data_parallel, batch_size is split into N jobs
# With DDP, batch_size is multiplied by N jobs
# Must be 6 per GPU to fit 16GB of VRAM
batch_size: 10
test_batch_size: 4

dataloader_options:
  batch_size: !ref <batch_size>
  num_workers: 6
test_dataloader_options:
  batch_size: !ref <test_batch_size>
  num_workers: 6

# BPE parameters
token_type: char  # ["unigram", "bpe", "char"]
character_coverage: 1.0

# Model parameters
# activation: !name:torch.nn.LeakyReLU
wav2vec_output_dim: 1024
dnn_neurons: 1024
freeze_wav2vec: False
freeze_feature_extractor: True
dropout: 0.15
warmup_steps: 500  # The wav2vec2 model isn't updated for this number of steps

# Outputs
output_neurons: 40  # BPE size, index(blank/eos/bos) = 0

# Decoding parameters
# Be sure that the bos and eos indexes match the BPE ones
blank_index: 0
unk_index: 1

#
# Functions and classes
#
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: !ref <number_of_epochs>

augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: !ref <sample_rate>
  speeds: [95, 100, 105]

enc: !new:speechbrain.nnet.containers.Sequential
  input_shape: [null, null, !ref <wav2vec_output_dim>]
  linear1: !name:speechbrain.nnet.linear.Linear
    n_neurons: !ref <dnn_neurons>
    bias: True
  bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
  activation: !new:torch.nn.LeakyReLU
  drop: !new:torch.nn.Dropout
    p: !ref <dropout>
  linear2: !name:speechbrain.nnet.linear.Linear
    n_neurons: !ref <dnn_neurons>
    bias: True
  bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
  activation2: !new:torch.nn.LeakyReLU
  drop2: !new:torch.nn.Dropout
    p: !ref <dropout>
  linear3: !name:speechbrain.nnet.linear.Linear
    n_neurons: !ref <dnn_neurons>
    bias: True
  bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
  activation3: !new:torch.nn.LeakyReLU

wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
  source: /gpfsstore/rech/nou/uzn19yk/wavlm/
  output_norm: False
  freeze: !ref <freeze_wav2vec>
  freeze_feature_extractor: !ref <freeze_feature_extractor>
  save_path: !ref <wav2vec2_folder>

ctc_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <dnn_neurons>
  n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: True

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: !ref <blank_index>

modules:
  wav2vec2: !ref <wav2vec2>
  enc: !ref <enc>
  ctc_lin: !ref <ctc_lin>
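# Note: the ModuleList below groups the DNN encoder and the CTC head so the
# training script can hand their parameters to a single optimizer
# (model_opt_class, Adadelta), while the wav2vec2 encoder is updated
# separately with its own optimizer (wav2vec_opt_class, Adam).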
model: !new:torch.nn.ModuleList
  - [!ref <enc>, !ref <ctc_lin>]

model_opt_class: !name:torch.optim.Adadelta
  lr: !ref <lr>
  rho: 0.95
  eps: 1.e-8

wav2vec_opt_class: !name:torch.optim.Adam
  lr: !ref <lr_wav2vec>

lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: !ref <lr>
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  patient: 0

lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: !ref <lr_wav2vec>
  improvement_threshold: 0.0025
  annealing_factor: 0.9
  patient: 0

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: !ref <save_folder>
  recoverables:
    wav2vec2: !ref <wav2vec2>
    model: !ref <model>
    scheduler_model: !ref <lr_annealing_model>
    scheduler_wav2vec: !ref <lr_annealing_wav2vec>
    counter: !ref <epoch_counter>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: True
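# ----------------------------------------------------------------------------
# Usage note (kept as comments so the YAML stays valid): a file like this is
# normally consumed by the recipe's train script through hyperpyyaml, which
# resolves the !ref / !new: / !name: tags above into Python objects. A minimal
# sketch, assuming the file is saved as "train_semi.yaml" (the filename and
# overrides below are illustrative, not part of this recipe):
#
#   from hyperpyyaml import load_hyperpyyaml
#
#   with open("train_semi.yaml") as fin:
#       # overrides replace values declared above, e.g. the data location
#       hparams = load_hyperpyyaml(fin, overrides={"data_folder": "/path/to/data"})
#
#   modules = hparams["modules"]          # wav2vec2, enc, ctc_lin
#   checkpointer = hparams["checkpointer"]
# ----------------------------------------------------------------------------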