# Repository: nanotron/mistral-nanotron
# Commit: aa5ff8c "update all" by thomwolf (HF staff), 9 months ago
# File size: 1.15 kB

	checkpoints: null
	data: null
	general:
	benchmark_csv_path: null
	consumed_train_samples: null
	ignore_sanity_checks: false
	project: mistralai
	run: Mistral-7B-v0.1
	seed: 42
	step: 0
	logging: null
	model:
	ddp_bucket_cap_mb: 25
	dtype: bfloat16
	init_method:
	std: 0.025
	make_vocab_size_divisible_by: 1
	model_config:
	attn_pdrop: 0.0
	bos_token_id: 1
	eos_token_id: 2
	hidden_act: silu
	hidden_size: 4096
	initializer_range: 0.02
	intermediate_size: 14336
	is_mistral_config: true
	max_position_embeddings: 32768
	num_attention_heads: 32
	num_hidden_layers: 32
	num_key_value_heads: 8
	pad_token_id: null
	pretraining_tp: 1
	rms_norm_eps: 1.0e-05
	rope_theta: 10000.0
	sliding_window_size: 4096
	tie_word_embeddings: false
	use_cache: true
	vocab_size: 32000
	optimizer: null
	parallelism:
	dp: 1
	pp: 1
	pp_engine: 1f1b
	recompute_granularity: SELECTIVE
	tp: 1
	tp_linear_async_communication: true
	tp_mode: REDUCE_SCATTER
	profiler: null
	tokenizer:
	tokenizer_max_length: null
	tokenizer_name_or_path: mistralai/Mistral-7B-v0.1
	tokenizer_revision: null
	tokens: null