# nemo-megatron-mt5-3B / model_config.yaml
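# Batching: 1920 / 24 = 80 micro-batches per global batch, spread across
# data-parallel ranks and gradient-accumulation steps.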
micro_batch_size: 24
global_batch_size: 1920
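# Model parallelism: tensor-parallel across 2 GPUs, no pipeline parallelism
# (pipeline_model_parallel_split_rank only matters when pipeline size > 1).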
tensor_model_parallel_size: 2
pipeline_model_parallel_size: 1
resume_from_checkpoint: null
pipeline_model_parallel_split_rank: 0
make_vocab_size_divisible_by: 128
pre_process: true
post_process: true
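# megatron_amp_O2 enables the O2-style mixed-precision path (fp32 master
# weights); see precision: bf16 at the bottom of this file.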
megatron_amp_O2: true
seq_length: 512
max_position_embeddings: 512
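# mT5-XL-scale transformer: hidden 2048, FFN 5120 (geglu), 32 heads;
# kv_channels = hidden_size / num_attention_heads = 2048 / 32 = 64.
# num_layers is the per-stack depth (encoder and decoder each), matching mT5-XL.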
num_layers: 24
hidden_size: 2048
ffn_hidden_size: 5120
num_attention_heads: 32
init_method_std: 0.015
hidden_dropout: 0.1
attention_dropout: 0.1
kv_channels: 64
apply_query_key_layer_scaling: true
layernorm_epsilon: 1.0e-05
persist_layer_norm: true
gradient_as_bucket_view: true
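# Kernel fusions: bias_gelu_fusion is off, consistent with the geglu activation
# below (the fused bias+GELU kernel targets plain GELU, not gated GELU).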
bias_gelu_fusion: false
masked_softmax_fusion: true
encoder_arch: transformer
decoder_arch: transformer
activation: geglu
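# mT5 SentencePiece tokenizer; the nemo: prefix points to the tokenizer
# artifact packed inside the .nemo checkpoint archive. 100 sentinel tokens
# are appended for T5-style span corruption.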
tokenizer:
  library: sentencepiece
  type: null
  model: nemo:d55283aced7944109f3cf68d9452e73b_mt5_tokenizer.model
  vocab_file: null
  merge_file: null
  num_sentinel_tokens: 100
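# Native-AMP gradient scaler: initial loss scale 2^32, growth checked every
# 1000 steps.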
native_amp_init_scale: 4294967296
native_amp_growth_interval: 1000
fp32_residual_connection: false
fp16_lm_cross_entropy: false
seed: 1234
use_cpu_initialization: false
onnx_safe: false
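# 30 corresponds to Python's logging.WARNING.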
apex_transformer_log_level: 30
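# Activation checkpointing is disabled (method: null);
# activations_checkpoint_num_layers only takes effect once a method is set.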
activations_checkpoint_method: null
activations_checkpoint_num_layers: 1
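# Pretraining data: a blend of 24 mC4 language shards. data_prefix alternates
# sampling weight and preprocessed dataset path; the weights sum to ~1.0.
# splits_string gives train/validation/test proportions.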
data:
  data_prefix:
  - 0.056224
  - /preproc_data/mc4_ja_mt5_tokenizer_text_document
  - 0.064717
  - /preproc_data/mc4_en_mt5_tokenizer_text_document
  - 0.055394
  - /preproc_data/mc4_it_mt5_tokenizer_text_document
  - 0.006129
  - /preproc_data/mc4_lv_mt5_tokenizer_text_document
  - 0.156199
  - /preproc_data/mc4_ru_mt5_tokenizer_text_document
  - 0.02047
  - /preproc_data/mc4_hu_mt5_tokenizer_text_document
  - 0.020264
  - /preproc_data/mc4_zh_mt5_tokenizer_text_document
  - 0.047618
  - /preproc_data/mc4_pl_mt5_tokenizer_text_document
  - 0.021716
  - /preproc_data/mc4_el_mt5_tokenizer_text_document
  - 0.094469
  - /preproc_data/mc4_de_mt5_tokenizer_text_document
  - 0.028565
  - /preproc_data/mc4_cs_mt5_tokenizer_text_document
  - 0.015286
  - /preproc_data/mc4_ko_mt5_tokenizer_text_document
  - 0.014667
  - /preproc_data/mc4_hi_mt5_tokenizer_text_document
  - 0.015717
  - /preproc_data/mc4_no_mt5_tokenizer_text_document
  - 0.016761
  - /preproc_data/mc4_da_mt5_tokenizer_text_document
  - 0.011884
  - /preproc_data/mc4_sk_mt5_tokenizer_text_document
  - 0.088899
  - /preproc_data/mc4_fr_mt5_tokenizer_text_document
  - 0.051519
  - /preproc_data/mc4_pt_mt5_tokenizer_text_document
  - 0.008662
  - /preproc_data/mc4_lt_mt5_tokenizer_text_document
  - 0.110217
  - /preproc_data/mc4_es_mt5_tokenizer_text_document
  - 0.031769
  - /preproc_data/mc4_nl_mt5_tokenizer_text_document
  - 0.022698
  - /preproc_data/mc4_sv_mt5_tokenizer_text_document
  - 0.025119
  - /preproc_data/mc4_ro_mt5_tokenizer_text_document
  - 0.015036
  - /preproc_data/mc4_fi_mt5_tokenizer_text_document
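  # T5 span-corruption settings: 15% of tokens masked in n-gram spans
  # (geometric length distribution, max n-gram size 10), with 512-token
  # encoder inputs and 128-token decoder targets.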
  index_mapping_dir: null
  data_impl: mmap
  splits_string: 99892,99,9
  seq_length: 512
  seq_length_dec: 128
  skip_warmup: true
  num_workers: 8
  dataloader_type: single
  masked_lm_prob: 0.15
  dataset_type: t5
  short_seq_prob: 0.0
  max_ngram_size: 10
  mean_ngram_size: null
  geometric_dist: true
  permutation: false
  whole_word_masking: false
  favor_longer_ngrams: false
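# Optimizer: fused Adam with warmup over the first 1% of steps
# (warmup_ratio: 0.01), then annealing toward min_lr.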
optim:
  name: fused_adam
  lr: 0.0001
  betas:
  - 0.9
  - 0.999
  eps: 1.0e-08
  weight_decay: 0.01
  sched:
    name: WarmupAnnealing
    min_lr: 1.0e-05
    last_epoch: -1
    warmup_ratio: 0.01
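# Trained in bf16; target is the NeMo class this config restores into.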
precision: bf16
target: nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model
nemo_version: 1.9.0rc0