NeMo · PyTorch · English · seq2seq · masked language modeling
File size: 3,748 bytes

The file below is the model configuration (YAML) for a Megatron-T5 language model trained with NeMo 1.7.1 (target class: nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model):
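# Parallelism and batching: 27 sequences per micro-batch on each model replica;
# each layer is split across 2 GPUs (tensor parallelism), with no pipeline parallelism.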
micro_batch_size: 27
tensor_model_parallel_size: 2
pipeline_model_parallel_size: 1
make_vocab_size_divisible_by: 128
pre_process: true
post_process: true
megatron_amp_O2: false
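# Architecture: transformer encoder and decoder, 24 layers, hidden size 1024,
# 32 attention heads with 128-dim key/value projections, 16384-wide GeLU FFN,
# 512-token sequences.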
seq_length: 512
max_position_embeddings: 512
num_layers: 24
hidden_size: 1024
ffn_hidden_size: 16384
num_attention_heads: 32
init_method_std: 0.015
hidden_dropout: 0.1
attention_dropout: 0.1
kv_channels: 128
apply_query_key_layer_scaling: true
layernorm_epsilon: 1.0e-05
persist_layer_norm: true
gradient_as_bucket_view: true
encoder_arch: transformer
decoder_arch: transformer
activation: gelu
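# Tokenizer: Megatron BertWordPieceCase wordpiece vocabulary loaded from bert_vocab.txt;
# 100 sentinel tokens are added for the T5 span-corruption (masked LM) objective.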
tokenizer:
  library: megatron
  type: BertWordPieceCase
  model: null
  vocab_file: bert_vocab.txt
  merge_file: null
  num_sentinel_tokens: 100
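# Loss scaling, weight initialization, and activation-checkpointing settings.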
native_amp_init_scale: 4294967296
native_amp_growth_interval: 1000
fp32_residual_connection: false
fp16_lm_cross_entropy: false
seed: 1234
use_cpu_initialization: false
onnx_safe: false
activations_checkpoint_method: null
activations_checkpoint_num_layers: 1
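# Data: 30 preprocessed shards (my-t5_00 .. my-t5_29) blended as alternating
# (weight, path) pairs with roughly equal weights (~1/30 each); 512-token encoder
# and 128-token decoder sequences, 15% span masking.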
data:
  data_prefix:
  - 0.0333
  - /preproc_data/my-t5_00_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_01_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_02_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_03_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_04_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_05_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_06_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_07_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_08_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_09_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_10_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_11_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_12_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_13_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_14_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_15_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_16_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_17_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_18_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_19_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_20_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_21_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_22_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_23_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_24_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_25_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_26_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_27_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_28_bert_tokenizer_text_document
  - 0.0334
  - /preproc_data/my-t5_29_bert_tokenizer_text_document
  data_impl: mmap
  splits_string: 99982,9,9
  seq_length: 512
  seq_length_dec: 128
  skip_warmup: true
  num_workers: 4
  dataloader_type: single
  masked_lm_prob: 0.15
  dataset_type: t5
  short_seq_prob: 0.0
  max_ngram_size: 10
  mean_ngram_size: null
  geometric_dist: true
  permutation: false
  whole_word_masking: true
  favor_longer_ngrams: false
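# Optimizer: fused Adam, lr 1e-4, weight decay 0.01, with a WarmupAnnealing schedule
# (warmup over the first 1% of steps, then annealing toward min_lr 1e-5).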
optim:
  name: fused_adam
  lr: 0.0001
  betas:
  - 0.9
  - 0.999
  eps: 1.0e-08
  weight_decay: 0.01
  sched:
    name: WarmupAnnealing
    min_lr: 1.0e-05
    last_epoch: -1
    warmup_ratio: 0.01
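# Checkpoint metadata: bf16 training precision, the NeMo class to instantiate,
# the NeMo version, and the vocab file packaged inside the .nemo archive.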
precision: bf16
target: nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model
nemo_version: 1.7.1
vocab_file: nemo:6b9a052d82a744389fbe256fea20c06f_vocab.txt
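For reference, here is a minimal sketch (not part of this repository) of loading the file with OmegaConf and reconstructing the 30-shard data blend listed above. The local file name model_config.yaml and everything in the snippet other than the field values shown in the config are assumptions for illustration.

# Sketch only: assumes this YAML has been saved locally as "model_config.yaml".
from omegaconf import OmegaConf

cfg = OmegaConf.load("model_config.yaml")
print(cfg.num_layers, cfg.hidden_size, cfg.num_attention_heads)  # 24 1024 32
print(cfg.optim.name, cfg.optim.lr, cfg.optim.sched.name)        # fused_adam 0.0001 WarmupAnnealing

# Rebuild the blended data_prefix list above: alternating (weight, path) pairs
# over the 30 shards my-t5_00 .. my-t5_29, each weighted roughly 1/30.
shards = [f"/preproc_data/my-t5_{i:02d}_bert_tokenizer_text_document" for i in range(30)]
weights = [0.0333] * 29 + [0.0334]
data_prefix = [item for pair in zip(weights, shards) for item in pair]
assert data_prefix == list(cfg.data.data_prefix)

Restoring the actual weights would go through the target class listed above, e.g. MegatronT5Model.restore_from(...); since the checkpoint was trained with tensor_model_parallel_size: 2, that generally requires a matching multi-GPU model-parallel setup and is not shown here.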