## IO
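# save_data is the prefix for prepared data/vocab artifacts; training stats are
# reported every 100 steps, validation is scored with BLEU, and curves are also
# logged to TensorBoard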
save_data: fr-en/data_spm
overwrite: true
seed: 1234
report_every: 100
valid_metrics: ["BLEU"]
tensorboard: true
tensorboard_log_dir: tensorboard

### Vocab
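# a single 50k vocabulary is shared between source and target; vocab_size_multiple
# pads the vocab size to a multiple of 8 (tensor-core friendly), and n_sample: 0
# skips dumping transformed samples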
src_vocab: fr-en/joint.eole.vocab
tgt_vocab: fr-en/joint.eole.vocab
src_vocab_size: 50000
tgt_vocab_size: 50000
vocab_size_multiple: 8
share_vocab: true
n_sample: 0

data:
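    # parallel text pulled from the Hugging Face Hub via hf:// paths;
    # path_sco presumably points at per-segment scores shipped with the corpus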
    corpus_1:
        path_src: hf://quickmt/quickmt-train.fr-en/fr
        path_tgt: hf://quickmt/quickmt-train.fr-en/en
        path_sco: hf://quickmt/quickmt-train.fr-en/sco
    valid:
        path_src: fr-en/dev.src
        path_tgt: fr-en/dev.tgt

transforms: [sentencepiece, filtertoolong]
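# sentencepiece tokenizes on the fly with the joint SPM model; filtertoolong then
# drops pairs longer than 256 subword tokens on either side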
transforms_configs:
    sentencepiece:
        src_subword_model: "fr-en/joint.spm.model"
        tgt_subword_model: "fr-en/joint.spm.model"
    filtertoolong:
        src_seq_length: 256
        tgt_seq_length: 256

training:
    # Run configuration
    model_path: fr-en/model
    keep_checkpoint: 4
    save_checkpoint_steps: 2000
    train_steps: 100000
    valid_steps: 2000
    
    # Train on a single GPU
    world_size: 1
    gpu_ranks: [0]

    # Batching
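    # ~8192 tokens per batch, gradients accumulated over 16 batches
    # (≈ 131k tokens per optimizer update)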
    batch_type: "tokens"
    batch_size: 8192
    valid_batch_size: 8192
    batch_size_multiple: 8
    accum_count: [16]
    accum_steps: [0]

    # Optimizer & Compute
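    # learning_rate 2.0 is a base value rescaled by the Noam schedule
    # (inverse-sqrt decay after 10k warmup steps); pagedadamw8bit is presumably
    # a paged 8-bit AdamW, which keeps optimizer state in 8 bits to save memory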
    compute_dtype: "bf16"
    optim: "pagedadamw8bit"
    #optim: "adamw"
    learning_rate: 2.0
    warmup_steps: 10000
    decay_method: "noam"
    adam_beta2: 0.998

    # Data loading
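    # bucket_size examples are pooled and length-sorted before batching to cut padding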
    bucket_size: 128000
    num_workers: 4
    prefetch_factor: 100
    
    # Hyperparams
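    # 0.1 dropout / attention dropout and label smoothing, gradient norm clipped at 2;
    # average_decay maintains an exponential moving average of the model weights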
    dropout_steps: [0]
    dropout: [0.1]
    attention_dropout: [0.1]
    max_grad_norm: 2
    label_smoothing: 0.1
    average_decay: 0.0001
    param_init_method: xavier_uniform
    normalization: "tokens"

model:
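    # deep-encoder / shallow-decoder Transformer: 8 encoder layers, 2 decoder layers,
    # hidden size 1024, FFN 4096, 8 heads, shared (tied) embeddings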
    architecture: "transformer"
    layer_norm: standard
    share_embeddings: true
    share_decoder_embeddings: true
    add_ffnbias: true
    mlp_activation_fn: gelu
    add_estimator: false
    add_qkvbias: false
    norm_eps: 1e-6
    hidden_size: 1024
    encoder:
        layers: 8
    decoder:
        layers: 2
    heads: 8
    transformer_ff: 4096
    embeddings:
        word_vec_size: 1024
        position_encoding_type: "SinusoidalInterleaved"