arch:
  type: TransformerLMHeadModel
  args:
    transformer_config:
      type: TransformerDecoderOnlyModel
      args:
        embed_config:
          type: TransformerEmbeddingBlock
          args:
            token_embed_config:
              type: TokenEmbedding
              args:
                n_embed: 2048
                n_vocab: 32256
            pos_embed_config: null
            type_embed_config: null
            ln_config: null
            p_drop_embed: 0.0
            concat_strategy: id_first
        decoder_config:
          type: TransformerDecoderBlock
          args:
            attn_config:
              type: LlamaAttention
              args:
                n_embed: 2048
                n_pos: 16384
                n_head: 16
                n_key_value_head: 16
                head_size: 128
                p_drop_attn: 0.0
                p_drop_resid: 0.0
                bias_attn: false
                bias_proj: false
                cross_attn: false
                scale_dot_product: true
                scale_layer_wise: false
                layer_idx: null
                rope_config:
                  type: RotaryPositionEmbedding
                  args:
                    head_size: 128
                    n_pos: 16384
                    base: 100000
                    scaling_type: linear
                    scaling_factor: 4.0
            mlp_config:
              type: LlamaMLP
              args:
                n_embed: 2048
                n_inner: 5504
                act_fn_config:
                  type: SiLUActivation
                  args: {}
            ln_config:
              type: LlamaRMSNorm
              args:
                n_embed: 2048
                ln_eps: 1.0e-06
            n_embed: 2048
            post_norm: false
            add_cross_attn: false
        n_embed: 2048
        n_layer: 24
        n_head: 16
        ln_config:
          type: LlamaRMSNorm
          args:
            n_embed: 2048
            ln_eps: 1.0e-06
        perform_linear_bias: false
        attn_window_size_loop_unit: null
    lm_head_config:
      type: TransformerLMHead
      args:
        n_vocab: 32256
        n_embed: 2048
        perform_transform: false
        act_fn_config: null
        ln_config: null
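
# Reference notes derived from the values above (not part of the original
# config; inferences are marked as such):
# - n_embed = n_head * head_size = 16 * 128 = 2048.
# - n_key_value_head equals n_head (16), so attention is standard
#   multi-head attention rather than grouped-query attention.
# - rope_config uses linear position scaling: position indices are divided
#   by scaling_factor (4.0) before the rotary angles are computed, so
#   n_pos: 16384 presumably extends a native (pre-scaling) window of
#   16384 / 4.0 = 4096 tokens.
# - LlamaMLP with SiLU is presumably a gated (SwiGLU-style) feed-forward
#   block; n_inner: 5504 is its hidden width.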