File size: 1,181 Bytes
5381499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from easydict import EasyDict as dict

# Widths shared by the sub-configs below.
D_MODEL = 768  # every ``d_model`` entry and the projection's ``out_features``
HIDDEN_SIZE = 512  # extractor channel width; ``in_features`` downstream

# NOTE: ``dict`` here is whatever this module binds that name to (the import
# at the top of the file aliases EasyDict to it), not necessarily the builtin.

# Feature extractor: the three tuples each hold seven entries (presumably one
# per layer -- confirm against the consumer of this config).
feature_extractor = dict(
    num_channels=(HIDDEN_SIZE,) * 7,
    kernel_sizes=(10, 3, 3, 3, 3, 2, 2),
    strides=(5, 2, 2, 2, 2, 2, 2),  # product of the strides is 320
)

# Context encoder: feature projection (HIDDEN_SIZE -> D_MODEL) plus the
# settings for the encoder stack and its per-layer configuration.
context_encoder = dict(
    feature_projection=dict(in_features=HIDDEN_SIZE, out_features=D_MODEL, dropout=0.1),
    encoder=dict(
        d_model=D_MODEL,
        num_layers=12,
        layer_drop=0.05,
        pos_embedding=dict(d_model=D_MODEL, kernel_size=3, groups=2, dropout=0.1),
        layer=dict(
            d_model=D_MODEL,
            num_heads=8,
            layer_norm_first=False,
            feed_forward_dim=2048,
            dropout=0.1,
        ),
    ),
)

# Quantizer: takes HIDDEN_SIZE-wide inputs; 2 codebooks of 320 codewords each.
quantizer = dict(
    in_features=HIDDEN_SIZE,
    num_codebooks=2,
    num_codewords=320,
    d_model=D_MODEL,
)

# Top-level pretraining config. The three sub-configs are stored by reference
# (not copied), followed by the masking / negative-sampling / loss settings.
wav2vec2_pretraining = dict(
    context_encoder=context_encoder,
    feature_extractor=feature_extractor,
    quantizer=quantizer,
    mask_prob=0.65,
    mask_length=10,
    min_masks=2,
    num_negatives=100,
    contrastive_logits_temperature=0.1,
    diversity_loss_weight=0.2,
)