from transformers import GPT2Config


class NomicBertConfig(GPT2Config):
    """Configuration for the ``nomic_bert`` model type, built on ``GPT2Config``.

    Every named constructor argument is stored on the instance under the same
    name; any remaining keyword arguments are passed to
    ``GPT2Config.__init__`` unchanged.
    """

    model_type = "nomic_bert"

    def __init__(
        self,
        prenorm=False,
        parallel_block=False,
        parallel_block_tied_norm=False,
        rotary_emb_fraction=0.0,
        fused_dropout_add_ln=False,
        fused_bias_fc=False,
        use_flash_attn=False,
        use_xentropy=False,
        qkv_proj_bias=True,
        rotary_emb_base=10_000,
        rotary_emb_scale_base=None,
        rotary_emb_interleaved=False,
        mlp_fc1_bias=True,
        mlp_fc2_bias=True,
        use_rms_norm=False,
        causal=False,
        type_vocab_size=2,
        dense_seq_output=True,
        pad_vocab_size_multiple=1,
        tie_word_embeddings=True,
        rotary_scaling_factor=1.0,
        max_trained_positions=2048,
        **kwargs,
    ):
        """Store the Nomic-BERT specific options, then initialise the base config.

        The precise effect of each option is determined by the model code
        that reads this config, which is not part of this file. All named
        arguments are kept as same-named attributes.

        Args:
            tie_word_embeddings: Forwarded to the base initialiser so that the
                caller's value is the one that ends up on the instance.
            **kwargs: Extra options handed to ``GPT2Config.__init__``.
        """
        # Block layout / normalisation options.
        self.prenorm = prenorm
        self.parallel_block = parallel_block
        self.parallel_block_tied_norm = parallel_block_tied_norm
        self.use_rms_norm = use_rms_norm

        # Fused-kernel and attention implementation switches.
        self.fused_dropout_add_ln = fused_dropout_add_ln
        self.fused_bias_fc = fused_bias_fc
        self.use_flash_attn = use_flash_attn
        self.use_xentropy = use_xentropy
        self.causal = causal

        # Bias toggles for the projection and MLP layers.
        self.qkv_proj_bias = qkv_proj_bias
        self.mlp_fc1_bias = mlp_fc1_bias
        self.mlp_fc2_bias = mlp_fc2_bias

        # Rotary embedding settings.
        self.rotary_emb_fraction = rotary_emb_fraction
        self.rotary_emb_base = rotary_emb_base
        self.rotary_emb_scale_base = rotary_emb_scale_base
        self.rotary_emb_interleaved = rotary_emb_interleaved
        self.rotary_scaling_factor = rotary_scaling_factor
        self.max_trained_positions = max_trained_positions

        # Vocabulary / output options.
        self.type_vocab_size = type_vocab_size
        self.dense_seq_output = dense_seq_output
        self.pad_vocab_size_multiple = pad_vocab_size_multiple

        # `tie_word_embeddings` is a named parameter here, so it is never part
        # of **kwargs. The base config initialiser assigns its own value for
        # this attribute (True when none is supplied), which would overwrite
        # an assignment made before the super() call. Forward it explicitly
        # so that e.g. tie_word_embeddings=False is honoured.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)