micro_batch_size: 27
tensor_model_parallel_size: 2
pipeline_model_parallel_size: 1
make_vocab_size_divisible_by: 128
pre_process: true
post_process: true
megatron_amp_O2: false
seq_length: 512
max_position_embeddings: 512
num_layers: 24
hidden_size: 1024
ffn_hidden_size: 16384
num_attention_heads: 32
init_method_std: 0.015
hidden_dropout: 0.1
attention_dropout: 0.1
kv_channels: 128
apply_query_key_layer_scaling: true
layernorm_epsilon: 1.0e-05
persist_layer_norm: true
gradient_as_bucket_view: true
encoder_arch: transformer
decoder_arch: transformer
activation: gelu
tokenizer:
  library: megatron
  type: BertWordPieceCase
  model: null
  vocab_file: bert_vocab.txt
  merge_file: null
  num_sentinel_tokens: 100
native_amp_init_scale: 4294967296
native_amp_growth_interval: 1000
fp32_residual_connection: false
fp16_lm_cross_entropy: false
seed: 1234
use_cpu_initialization: false
onnx_safe: false
activations_checkpoint_method: null
activations_checkpoint_num_layers: 1
data:
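  # data_prefix alternates a sampling weight with the matching preprocessed
  # dataset path (weight, path, weight, path, ...), one pair per shard.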
  data_prefix:
  - 0.0333
  - /preproc_data/my-t5_00_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_01_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_02_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_03_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_04_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_05_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_06_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_07_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_08_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_09_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_10_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_11_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_12_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_13_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_14_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_15_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_16_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_17_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_18_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_19_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_20_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_21_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_22_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_23_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_24_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_25_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_26_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_27_bert_tokenizer_text_document
  - 0.0333
  - /preproc_data/my-t5_28_bert_tokenizer_text_document
  - 0.0334
  - /preproc_data/my-t5_29_bert_tokenizer_text_document
  data_impl: mmap
  splits_string: 99982,9,9
  seq_length: 512
  seq_length_dec: 128
  skip_warmup: true
  num_workers: 4
  dataloader_type: single
  masked_lm_prob: 0.15
  dataset_type: t5
  short_seq_prob: 0.0
  max_ngram_size: 10
  mean_ngram_size: null
  geometric_dist: true
  permutation: false
  whole_word_masking: true
  favor_longer_ngrams: false
optim:
  name: fused_adam
  lr: 0.0001
  betas:
  - 0.9
  - 0.999
  eps: 1.0e-08
  weight_decay: 0.01
  sched:
    name: WarmupAnnealing
    min_lr: 1.0e-05
    last_epoch: -1
    warmup_ratio: 0.01
precision: bf16
target: nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model
nemo_version: 1.7.1
vocab_file: nemo:6b9a052d82a744389fbe256fea20c06f_vocab.txt
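As a quick sanity check, a config like the one above can be loaded back and inspected with OmegaConf. The sketch below is illustrative only; it assumes the file has been extracted to disk as model_config.yaml (the filename is an assumption, not part of the config shown here).

from omegaconf import OmegaConf

# Load the saved model config (assumed filename; adjust to wherever the file was extracted).
cfg = OmegaConf.load("model_config.yaml")

# Top-level architecture fields
print(cfg.num_layers, cfg.hidden_size, cfg.num_attention_heads)

# Nested blocks: tokenizer, data, and optimizer/scheduler
print(cfg.tokenizer.type)      # BertWordPieceCase
print(cfg.data.dataset_type)   # t5
print(cfg.optim.sched.name)    # WarmupAnnealing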