arch:
  type: TransformerLMHeadModel
  args:
    transformer_config:
      type: TransformerDecoderOnlyModel
      args:
        embed_config:
          type: TransformerEmbeddingBlock
          args:
            token_embed_config:
              type: TokenEmbedding
              args:
                n_embed: 768
                n_vocab: 50257
            pos_embed_config:
              type: PositionEmbedding
              args:
                n_embed: 768
                n_pos: 1024
            type_embed_config: null
            ln_config: null
            p_drop_embed: 0.1
            concat_strategy: id_first
        decoder_config:
          type: TransformerDecoderBlock
          args:
            attn_config:
              type: MultiHeadKeyValueAttention
              args:
                n_embed: 768
                n_pos: 1024
                n_head: 12
                head_size: 64
                p_drop_attn: 0.1
                p_drop_resid: 0.1
                bias_attn: true
                bias_proj: true
                cross_attn: false
                scale_dot_product: true
                scale_layer_wise: false
                layer_idx: null
                perform_linear_bias: false
                perform_bloom_split_head: false
                perform_query_scaling: false
                attn_window_size: null
            mlp_config:
              type: TransformerMLP
              args:
                n_embed: 768
                n_inner: 3072
                act_fn_config:
                  type: NewGELUActivation
                  args: {}
                p_drop_mlp: 0.1
            ln_config:
              type: LayerNorm
              args:
                n_embed: 768
                ln_eps: 1.0e-05
            n_embed: 768
            post_norm: false
            add_cross_attn: false
        n_embed: 768
        n_layer: 12
        n_head: 12
        ln_config:
          type: LayerNorm
          args:
            n_embed: 768
            ln_eps: 1.0e-05
        perform_linear_bias: false
        attn_window_size_loop_unit: null
    lm_head_config:
      type: TransformerLMHead
      args:
        n_vocab: 50257
        n_embed: 768
        perform_transform: false
        act_fn_config: null
        ln_config: null