# Source: Hugging Face upload "nemo_train_params.yaml" by robinq (commit 5d6a887, verified)
---
# NeMo Megatron-BERT pretraining parameters.
# NOTE(review): original file had all indentation stripped; nesting below is
# reconstructed to the standard NeMo `cfg` layout — confirm against the
# training script that consumes this file.
cfg:
  # Batch geometry
  micro_batch_size: 42
  global_batch_size: 8064

  # Model parallelism (both 1 → plain data-parallel training)
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1

  # Architecture (BERT-base-like: 12 layers, 768 hidden, 12 heads)
  encoder_seq_length: 512
  max_position_embeddings: 512
  num_layers: 12
  hidden_size: 768
  ffn_hidden_size: 3072
  num_attention_heads: 12
  init_method_std: 0.02
  hidden_dropout: 0.1
  kv_channels: null  # null → derived from hidden_size / num_attention_heads
  apply_query_key_layer_scaling: true
  layernorm_epsilon: 1.0e-05
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  bert_binary_head: true  # NSP head enabled

  tokenizer:
    library: huggingface
    type: KBLab/unigram-64k-pretok-small_data-tokenizer
    model: null
    vocab_file: null
    merge_file: null

  # Mixed precision / AMP
  native_amp_init_scale: 4294967296  # 2**32
  native_amp_growth_interval: 1000
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: false

  # Gradient communication
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: false

  seed: 666
  use_cpu_initialization: false
  onnx_safe: false
  gradient_as_bucket_view: true

  # Activation checkpointing (all null/false → disabled)
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: false

  data:
    # Alternating [weight, path, weight, path, ...] pairs — NeMo blended
    # dataset format; all corpora weighted equally (1) here.
    data_prefix:
      - 1
      - /project/scratch/$PID/data/unigram-64k-pretok-small_data/wikipedia-unigram-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/unigram-64k-pretok-small_data/edepos_html-unigram-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/unigram-64k-pretok-small_data/oscar-unigram-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/unigram-64k-pretok-small_data/kw3-2017-unigram-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/unigram-64k-pretok-small_data/issues-unigram-64k-pretok-small_data_text_sentence
      - 1
      - /project/scratch/$PID/data/unigram-64k-pretok-small_data/mc4-unigram-64k-pretok-small_data_text_sentence
    index_mapping_dir: /project/scratch/$PID/data/unigram-64k-pretok-small_data/npy_files/
    data_impl: mmap
    # Quoted: comma-separated train/val/test split, must stay a string
    splits_string: "980,10,10"
    seq_length: 512
    skip_warmup: true
    num_workers: 32
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    masked_lm_prob: 0.15
    short_seq_prob: 0.1

  optim:
    name: fused_adam
    lr: 0.0006
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 500
      constant_steps: 500
      min_lr: 2.0e-05

  precision: 16