# General config
save_model: model_1/multilingual_transformer
# Training files
data:
    corpus_1:
        path_src: final_train.src.train
        path_tgt: final_train.tgt.train
        transforms: [sentencepiece, filtertoolong, docify]
    valid:
        path_src: final_train.src.dev
        path_tgt: final_train.tgt.dev
        transforms: [sentencepiece, filtertoolong, docify]
save_checkpoint_steps: 5000
keep_checkpoint: 1
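# A checkpoint is written every 5000 steps; keep_checkpoint: 1 retains only
# the most recent checkpoint on disk.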

# Data config
save_data: data_1
src_vocab: data_1/source.vocab
tgt_vocab: data_1/target.vocab
src_vocab_size: 256000
tgt_vocab_size: 256000
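# Note (hedged): src/tgt_vocab_size act as an upper bound on the vocabulary
# read from the .vocab files above; a smaller vocab file simply yields a
# smaller vocabulary.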

# Training parameters
world_size: 2
gpu_ranks: [0, 1]

seed: 3435
train_steps: 30000
valid_steps: 5000
report_every: 1000

# Model parameters
model_type: text
model_dtype: "fp32"
encoder_type: transformer
decoder_type: transformer

enc_layers: 8
dec_layers: 8
heads: 16
hidden_size: 768
word_vec_size: 768
transformer_ff: 3072
dropout: [0.1]
attention_dropout: [0.1]

transforms: [sentencepiece, filtertoolong, docify]
src_subword_type: sentencepiece
tgt_subword_type: sentencepiece
src_seq_length: 512
tgt_seq_length: 512
src_seq_length_trunc: 512
tgt_seq_length_trunc: 512
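# Hedged reminder: filtertoolong drops examples longer than
# src_seq_length / tgt_seq_length (in subword tokens), while the
# *_seq_length_trunc options truncate instead; both limits are 512 here.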

# Optimization parameters
optim: "adam"
bucket_size: 262144
num_workers: 2 
learning_rate: 2.0
warmup_steps: 4000
decay_method: "noam"
adam_beta1: 0.9
adam_beta2: 0.998
max_grad_norm: 0.0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
batch_size: 4096
batch_type: "tokens"
normalization: "tokens"
accum_count: [2]
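# Rough effective batch per optimizer update (assuming data-parallel training):
# 4096 tokens x accum_count 2 x 2 GPUs ≈ 16k tokens.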

src_subword_model: source.model
tgt_subword_model: target.model
src_subword_nbest: 1
src_subword_alpha: 0.0
tgt_subword_nbest: 1
tgt_subword_alpha: 0.0
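# nbest: 1 with alpha: 0.0 keeps SentencePiece segmentation deterministic
# (no subword regularization / sampling) on both source and target.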

# Additional features
position_encoding_type: SinusoidalInterleaved
pos_ffn_activation_fn: gelu
parallel_residual: true
position_encoding: false
max_relative_positions: 32
self_attn_type: scaled-dot
sliding_window: 256
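# Hedged note: sliding_window limits self-attention to a local window of
# 256 tokens; whether it takes effect here depends on the attention
# implementation in the OpenNMT-py version used.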
global_attention: general

# Validation parameters
valid_batch_size: 2048
# Output generator function
generator_function: softmax
# Docify parameters
docify:
    doc_length: 200
    max_context: 1
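# docify builds document-level training segments: doc_length caps a segment
# at 200 tokens and max_context limits the number of context segments to 1
# (per the option names; exact behavior depends on the OpenNMT-py version).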

switchout_temperature: 0.2

temperature: 5.0  # adjust this value to scale the temperature


# Logging
log_file: train_1.log