---
license: mit
---
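A tiny sample model using the `phi` architecture: hidden size 8, 6 decoder layers, 4 attention heads, ~0.87M parameters (most of them in the token embedding and LM head). The full configuration: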
```
PhiConfig {
"attention_dropout": 0.0,
"bos_token_id": 1,
"embd_pdrop": 0.0,
"eos_token_id": 2,
"hidden_act": "gelu_new",
"hidden_size": 8,
"initializer_range": 0.02,
"intermediate_size": 10,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 2048,
"model_type": "phi",
"num_attention_heads": 4,
"num_hidden_layers": 6,
"num_key_value_heads": 2,
"partial_rotary_factor": 0.5,
"qk_layernorm": false,
"resid_pdrop": 0.0,
"rope_scaling": null,
"rope_theta": 10000.0,
"tie_word_embeddings": false,
"transformers_version": "4.38.2",
"use_cache": true,
"vocab_size": 51200
}
```
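A minimal sketch of how an equivalent model can be built with `transformers` (any field not passed below keeps its library default):

```python
from transformers import PhiConfig, PhiForCausalLM

# Recreate the tiny config shown above.
config = PhiConfig(
    vocab_size=51200,
    hidden_size=8,
    intermediate_size=10,
    num_hidden_layers=6,
    num_attention_heads=4,
    num_key_value_heads=2,
    partial_rotary_factor=0.5,
    hidden_act="gelu_new",
    max_position_embeddings=2048,
    tie_word_embeddings=False,
)

# Randomly initialized PhiForCausalLM with this config.
model = PhiForCausalLM(config)
print(model)
```

Printing the model gives the module structure below: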
```
PhiForCausalLM(
(model): PhiModel(
(embed_tokens): Embedding(51200, 8)
(embed_dropout): Dropout(p=0.0, inplace=False)
(layers): ModuleList(
(0-5): 6 x PhiDecoderLayer(
(self_attn): PhiAttention(
(q_proj): Linear(in_features=8, out_features=8, bias=True)
(k_proj): Linear(in_features=8, out_features=4, bias=True)
(v_proj): Linear(in_features=8, out_features=4, bias=True)
(dense): Linear(in_features=8, out_features=8, bias=True)
(rotary_emb): PhiRotaryEmbedding()
)
(mlp): PhiMLP(
(activation_fn): NewGELUActivation()
(fc1): Linear(in_features=8, out_features=10, bias=True)
(fc2): Linear(in_features=10, out_features=8, bias=True)
)
(input_layernorm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
(resid_dropout): Dropout(p=0.0, inplace=False)
)
)
(final_layernorm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
)
(lm_head): Linear(in_features=8, out_features=51200, bias=True)
)
```
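If the weights are on the Hub, the checkpoint can be loaded directly; the repo id below is an assumption based on this repository's name.

```python
from transformers import AutoModelForCausalLM

# Assumed repo id; adjust if the repository is named differently.
model = AutoModelForCausalLM.from_pretrained("if001/sample_phi-2")
```

Per-layer parameter counts: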
```
===========================================================================
Layer (type:depth-idx) Param #
===========================================================================
PhiForCausalLM --
├─PhiModel: 1-1 --
│ └─Embedding: 2-1 409,600
│ └─Dropout: 2-2 --
│ └─ModuleList: 2-3 --
│ │ └─PhiDecoderLayer: 3-1 410
│ │ └─PhiDecoderLayer: 3-2 410
│ │ └─PhiDecoderLayer: 3-3 410
│ │ └─PhiDecoderLayer: 3-4 410
│ │ └─PhiDecoderLayer: 3-5 410
│ │ └─PhiDecoderLayer: 3-6 410
│ └─LayerNorm: 2-4 16
├─Linear: 1-2 460,800
===========================================================================
Total params: 872,876
Trainable params: 872,876
Non-trainable params: 0
===========================================================================
```
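The table above looks like `torchinfo` output; a short sketch to reproduce the count, assuming `model` from the snippets above:

```python
from torchinfo import summary  # pip install torchinfo

# Per-layer parameter table like the one above.
summary(model)

# Manual total: 409,600 (embedding) + 6 * 410 (decoder layers)
#             + 16 (final LayerNorm) + 460,800 (lm_head) = 872,876
print(sum(p.numel() for p in model.parameters()))  # 872876
```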