MaximumEntropy committed on
Commit
94f4faf
1 Parent(s): dcd1a42

Upload model_config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. model_config.yaml +129 -0
model_config.yaml ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ micro_batch_size: 24
2
+ global_batch_size: 1920
3
+ tensor_model_parallel_size: 2
4
+ pipeline_model_parallel_size: 1
5
+ resume_from_checkpoint: null
6
+ pipeline_model_parallel_split_rank: 0
7
+ make_vocab_size_divisible_by: 128
8
+ pre_process: true
9
+ post_process: true
10
+ megatron_amp_O2: true
11
+ seq_length: 512
12
+ max_position_embeddings: 512
13
+ num_layers: 24
14
+ hidden_size: 2048
15
+ ffn_hidden_size: 5120
16
+ num_attention_heads: 32
17
+ init_method_std: 0.015
18
+ hidden_dropout: 0.1
19
+ attention_dropout: 0.1
20
+ kv_channels: 64
21
+ apply_query_key_layer_scaling: true
22
+ layernorm_epsilon: 1.0e-05
23
+ persist_layer_norm: true
24
+ gradient_as_bucket_view: true
25
+ bias_gelu_fusion: false
26
+ masked_softmax_fusion: true
27
+ encoder_arch: transformer
28
+ decoder_arch: transformer
29
+ activation: geglu
30
+ tokenizer:
31
+ library: sentencepiece
32
+ type: null
33
+ model: nemo:d55283aced7944109f3cf68d9452e73b_mt5_tokenizer.model
34
+ vocab_file: null
35
+ merge_file: null
36
+ num_sentinel_tokens: 100
37
+ native_amp_init_scale: 4294967296
38
+ native_amp_growth_interval: 1000
39
+ fp32_residual_connection: false
40
+ fp16_lm_cross_entropy: false
41
+ seed: 1234
42
+ use_cpu_initialization: false
43
+ onnx_safe: false
44
+ apex_transformer_log_level: 30
45
+ activations_checkpoint_method: null
46
+ activations_checkpoint_num_layers: 1
47
+ data:
48
+ data_prefix:
49
+ - 0.056224
50
+ - /preproc_data/mc4_ja_mt5_tokenizer_text_document
51
+ - 0.064717
52
+ - /preproc_data/mc4_en_mt5_tokenizer_text_document
53
+ - 0.055394
54
+ - /preproc_data/mc4_it_mt5_tokenizer_text_document
55
+ - 0.006129
56
+ - /preproc_data/mc4_lv_mt5_tokenizer_text_document
57
+ - 0.156199
58
+ - /preproc_data/mc4_ru_mt5_tokenizer_text_document
59
+ - 0.02047
60
+ - /preproc_data/mc4_hu_mt5_tokenizer_text_document
61
+ - 0.020264
62
+ - /preproc_data/mc4_zh_mt5_tokenizer_text_document
63
+ - 0.047618
64
+ - /preproc_data/mc4_pl_mt5_tokenizer_text_document
65
+ - 0.021716
66
+ - /preproc_data/mc4_el_mt5_tokenizer_text_document
67
+ - 0.094469
68
+ - /preproc_data/mc4_de_mt5_tokenizer_text_document
69
+ - 0.028565
70
+ - /preproc_data/mc4_cs_mt5_tokenizer_text_document
71
+ - 0.015286
72
+ - /preproc_data/mc4_ko_mt5_tokenizer_text_document
73
+ - 0.014667
74
+ - /preproc_data/mc4_hi_mt5_tokenizer_text_document
75
+ - 0.015717
76
+ - /preproc_data/mc4_no_mt5_tokenizer_text_document
77
+ - 0.016761
78
+ - /preproc_data/mc4_da_mt5_tokenizer_text_document
79
+ - 0.011884
80
+ - /preproc_data/mc4_sk_mt5_tokenizer_text_document
81
+ - 0.088899
82
+ - /preproc_data/mc4_fr_mt5_tokenizer_text_document
83
+ - 0.051519
84
+ - /preproc_data/mc4_pt_mt5_tokenizer_text_document
85
+ - 0.008662
86
+ - /preproc_data/mc4_lt_mt5_tokenizer_text_document
87
+ - 0.110217
88
+ - /preproc_data/mc4_es_mt5_tokenizer_text_document
89
+ - 0.031769
90
+ - /preproc_data/mc4_nl_mt5_tokenizer_text_document
91
+ - 0.022698
92
+ - /preproc_data/mc4_sv_mt5_tokenizer_text_document
93
+ - 0.025119
94
+ - /preproc_data/mc4_ro_mt5_tokenizer_text_document
95
+ - 0.015036
96
+ - /preproc_data/mc4_fi_mt5_tokenizer_text_document
97
+ index_mapping_dir: null
98
+ data_impl: mmap
99
+ splits_string: 99892,99,9
100
+ seq_length: 512
101
+ seq_length_dec: 128
102
+ skip_warmup: true
103
+ num_workers: 8
104
+ dataloader_type: single
105
+ masked_lm_prob: 0.15
106
+ dataset_type: t5
107
+ short_seq_prob: 0.0
108
+ max_ngram_size: 10
109
+ mean_ngram_size: null
110
+ geometric_dist: true
111
+ permutation: false
112
+ whole_word_masking: false
113
+ favor_longer_ngrams: false
114
+ optim:
115
+ name: fused_adam
116
+ lr: 0.0001
117
+ betas:
118
+ - 0.9
119
+ - 0.999
120
+ eps: 1.0e-08
121
+ weight_decay: 0.01
122
+ sched:
123
+ name: WarmupAnnealing
124
+ min_lr: 1.0e-05
125
+ last_epoch: -1
126
+ warmup_ratio: 0.01
127
+ precision: bf16
128
+ target: nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model
129
+ nemo_version: 1.9.0rc0