from transformers import LlamaConfig


class CognitivessConfig(LlamaConfig):
    """Configuration class for Cognitivess models.

    Inherits all arguments from ``LlamaConfig``; the defaults below mirror a
    Llama-3-8B-style architecture (128,256-token vocabulary, grouped-query
    attention with 8 key/value heads, rope_theta=500000).
    """

    model_type = "cognitivess"

    def __init__(
        self,
        vocab_size=128256,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=128000,
        eos_token_id=128001,
        tie_word_embeddings=False,
        attention_dropout=0.0,
        pretraining_tp=1,
        rope_theta=500000.0,
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            initializer_range=initializer_range,
            rms_norm_eps=rms_norm_eps,
            use_cache=use_cache,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            pretraining_tp=pretraining_tp,
            rope_theta=rope_theta,
            **kwargs,
        )
        # Stored as a plain attribute after the parent initializer, matching
        # the original code; recent LlamaConfig versions also accept this
        # argument directly.
        self.attention_dropout = attention_dropout
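

# A minimal smoke-test sketch (not part of the original file). Because
# CognitivessConfig subclasses LlamaConfig, a shrunken config can be passed
# straight to LlamaForCausalLM; the reduced sizes below are arbitrary choices
# for a fast local check, not the model's real dimensions.
if __name__ == "__main__":
    from transformers import LlamaForCausalLM

    test_config = CognitivessConfig(
        num_hidden_layers=2,      # keep the test model tiny
        hidden_size=256,
        intermediate_size=512,
        num_attention_heads=8,
        num_key_value_heads=4,
    )
    model = LlamaForCausalLM(test_config)
    print(test_config.model_type, f"{model.num_parameters():,} parameters")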