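# Source: emotion2vec_plus_base / config.yaml
# Hosting-page metadata (not part of the configuration):
#   uploader avatar: BoJack's picture
#   commit message: Upload 8 files
#   commit: c76626b (verified)
#   page links: raw, history, blame
#   file size: 3.17 kB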
# network architecture
# NOTE(review): the nesting below was reconstructed because the flattened form
# repeats start_drop_path_rate, end_drop_path_rate and extractor_mode in a
# single mapping (duplicate keys). The layout follows the data2vec-style
# model -> modalities -> audio -> decoder hierarchy; confirm against the
# consuming model's config schema.
model: Emotion2vec
model_conf:
  loss_beta: 0.0
  loss_scale: null
  depth: 8
  start_drop_path_rate: 0.0
  end_drop_path_rate: 0.0
  num_heads: 12
  # Written with an explicit mantissa dot so that YAML 1.1 loaders
  # (e.g. PyYAML) resolve it as a float instead of the string '1e-05'.
  norm_eps: 1.0e-05
  norm_affine: true
  encoder_dropout: 0.1
  post_mlp_drop: 0.1
  attention_dropout: 0.1
  activation_dropout: 0.0
  dropout_input: 0.0
  layerdrop: 0.05
  embed_dim: 768
  mlp_ratio: 4.0
  layer_norm_first: false
  average_top_k_layers: 8
  end_of_block_targets: false
  clone_batch: 8
  layer_norm_target_layer: false
  batch_norm_target_layer: false
  instance_norm_target_layer: true
  instance_norm_targets: false
  layer_norm_targets: false
  ema_decay: 0.999
  ema_same_dtype: true
  log_norms: true
  ema_end_decay: 0.99999
  ema_anneal_end_step: 20000
  ema_encoder_only: false
  max_update: 100000
  extractor_mode: layer_norm
  shared_decoder: null
  min_target_var: 0.1
  min_pred_var: 0.01
  supported_modality: AUDIO
  mae_init: false
  seed: 1
  skip_ema: false
  cls_loss: 1.0
  recon_loss: 0.0
  d2v_loss: 1.0
  decoder_group: false
  adversarial_training: false
  adversarial_hidden_dim: 128
  adversarial_weight: 0.1
  cls_type: chunk
  normalize: true
  # Explicit null; the original bare `project_dim:` also loads as null but
  # reads like a forgotten value.
  project_dim: null

  # Per-modality settings; only `audio` is configured in this file.
  modalities:
    audio:
      type: AUDIO
      prenet_depth: 4
      prenet_layerdrop: 0.05
      prenet_dropout: 0.1
      start_drop_path_rate: 0.0
      end_drop_path_rate: 0.0
      num_extra_tokens: 10
      init_extra_token_zero: true
      mask_noise_std: 0.01
      mask_prob_min: null
      mask_prob: 0.5
      inverse_mask: false
      mask_prob_adjust: 0.05
      keep_masked_pct: 0.0
      mask_length: 5
      add_masks: false
      remove_masks: false
      mask_dropout: 0.0
      encoder_zero_mask: true
      mask_channel_prob: 0.0
      mask_channel_length: 64
      ema_local_encoder: false
      local_grad_mult: 1.0
      use_alibi_encoder: true
      alibi_scale: 1.0
      learned_alibi: false
      alibi_max_pos: null
      learned_alibi_scale: true
      learned_alibi_scale_per_head: true
      learned_alibi_scale_per_layer: false
      num_alibi_heads: 12
      model_depth: 8
      # NOTE(review): the decoder mapping is assumed to end at
      # projection_ratio; the keys after it are assumed to belong to `audio`.
      decoder:
        decoder_dim: 384
        decoder_groups: 16
        decoder_kernel: 7
        decoder_layers: 4
        input_dropout: 0.1
        add_positions_masked: false
        add_positions_all: false
        decoder_residual: true
        projection_layers: 1
        projection_ratio: 2.0
      extractor_mode: layer_norm
      # Quoted so it loads as one string rather than a YAML flow sequence.
      feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
      conv_pos_width: 95
      conv_pos_groups: 16
      conv_pos_depth: 5
      conv_pos_pre_ln: false
# Tokenizer selection and its options. The options are nested under
# tokenizer_conf; left flat, tokenizer_conf loads as null and its options
# become unrelated top-level keys.
tokenizer: CharTokenizer
tokenizer_conf:
  unk_symbol: <unk>
  split_with_space: true
# Two-element list of strings. The second entry is the literal string
# 'none', not YAML null.
# NOTE(review): presumably a prefix mapping applied when loading checkpoint
# parameters — confirm with the loader that reads scope_map.
scope_map:
  - 'd2v_model.'
  - 'none'