File size: 1,181 Bytes
5381499 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
from easydict import EasyDict as dict
# Shared layer widths, declared once so the sub-configs in this file agree.

# Width used for the feature extractor's channels and as `in_features` of
# both the feature projection and the quantizer.
HIDDEN_SIZE = 512

# Width used as `out_features` of the feature projection and as `d_model`
# of the encoder, its sub-layers, and the quantizer.
D_MODEL = 768
# Context-encoder settings: a feature projection plus an encoder stack.
# The exact meaning of each key is defined by the module that consumes this
# config, which is not visible in this file.
context_encoder = dict(
    # Input/output widths tie the projection to HIDDEN_SIZE and D_MODEL.
    feature_projection=dict(
        in_features=HIDDEN_SIZE,
        out_features=D_MODEL,
        dropout=0.1,
    ),
    encoder=dict(
        d_model=D_MODEL,
        num_layers=12,
        layer_drop=0.05,
        # NOTE(review): `kernel_size`/`groups` suggest a grouped-convolution
        # positional embedding -- confirm against the consuming module.
        pos_embedding=dict(
            d_model=D_MODEL,
            kernel_size=3,
            groups=2,
            dropout=0.1,
        ),
        # Settings shared by every encoder layer (presumably applied
        # `num_layers` times -- verify against the consumer).
        layer=dict(
            d_model=D_MODEL,
            num_heads=8,
            layer_norm_first=False,
            feed_forward_dim=2048,
            dropout=0.1,
        ),
    ),
)
# Feature-extractor settings: three parallel 7-tuples (presumably one entry
# per layer -- confirm against the consuming module).
feature_extractor = dict(
    num_channels=(HIDDEN_SIZE,) * 7,
    kernel_sizes=(10, 3, 3, 3, 3, 2, 2),
    strides=(5, 2, 2, 2, 2, 2, 2),  # product of all strides: 5 * 2**6 = 320
)
# Quantizer settings. Its input width matches the feature extractor's
# channel width (HIDDEN_SIZE) and its `d_model` matches the encoder (D_MODEL).
quantizer = dict(
    in_features=HIDDEN_SIZE,
    # NOTE(review): presumably 2 codebooks with 320 codewords each -- confirm
    # whether `num_codewords` is per codebook or a total.
    num_codebooks=2,
    num_codewords=320,
    d_model=D_MODEL,
)
# Top-level pretraining config: bundles the three sub-configs defined in this
# file with the masking and loss hyperparameters.
# (Fix: removed a stray trailing `|` after the closing parenthesis, which
# left a binary operator without a right operand and made the statement a
# SyntaxError.)
wav2vec2_pretraining = dict(
    # Sub-module configs, passed through by reference.
    context_encoder=context_encoder,
    feature_extractor=feature_extractor,
    quantizer=quantizer,
    # Masking hyperparameters.
    # NOTE(review): exact semantics (e.g. whether `mask_prob` is a per-frame
    # start probability or a target masked fraction) depend on the consumer.
    mask_prob=0.65,
    mask_length=10,
    min_masks=2,
    # Loss hyperparameters -- presumably for a contrastive objective with a
    # codebook-diversity term; verify against the training code.
    num_negatives=100,
    contrastive_logits_temperature=0.1,
    diversity_loss_weight=0.2,
)