{
"_name_or_path": "checkpoints/microsoft/phi-1_5",
"anyprec": {
"arch_config": {
"layers_name": "layers",
"model_name": "model",
"module_names": [
"self_attn.q_proj",
"self_attn.k_proj",
"self_attn.v_proj",
"self_attn.dense",
"mlp.fc1",
"mlp.fc2"
]
},
"group_count": 1,
"parent_precision": 4,
"seed_precision": 2,
"sparse_numvals": {
"model.layers.0.mlp.fc1": 1279820,
"model.layers.0.mlp.fc2": 1236307,
"model.layers.0.self_attn.dense": 189187,
"model.layers.0.self_attn.k_proj": 318137,
"model.layers.0.self_attn.q_proj": 285336,
"model.layers.0.self_attn.v_proj": 212019,
"model.layers.1.mlp.fc1": 539655,
"model.layers.1.mlp.fc2": 766840,
"model.layers.1.self_attn.dense": 176195,
"model.layers.1.self_attn.k_proj": 224990,
"model.layers.1.self_attn.q_proj": 219788,
"model.layers.1.self_attn.v_proj": 201422,
"model.layers.10.mlp.fc1": 710241,
"model.layers.10.mlp.fc2": 754886,
"model.layers.10.self_attn.dense": 180978,
"model.layers.10.self_attn.k_proj": 221678,
"model.layers.10.self_attn.q_proj": 212231,
"model.layers.10.self_attn.v_proj": 200830,
"model.layers.11.mlp.fc1": 710220,
"model.layers.11.mlp.fc2": 741443,
"model.layers.11.self_attn.dense": 178022,
"model.layers.11.self_attn.k_proj": 215089,
"model.layers.11.self_attn.q_proj": 207761,
"model.layers.11.self_attn.v_proj": 194583,
"model.layers.12.mlp.fc1": 697047,
"model.layers.12.mlp.fc2": 752645,
"model.layers.12.self_attn.dense": 176274,
"model.layers.12.self_attn.k_proj": 220310,
"model.layers.12.self_attn.q_proj": 213155,
"model.layers.12.self_attn.v_proj": 198321,
"model.layers.13.mlp.fc1": 687659,
"model.layers.13.mlp.fc2": 738003,
"model.layers.13.self_attn.dense": 178018,
"model.layers.13.self_attn.k_proj": 221049,
"model.layers.13.self_attn.q_proj": 210113,
"model.layers.13.self_attn.v_proj": 195362,
"model.layers.14.mlp.fc1": 682287,
"model.layers.14.mlp.fc2": 781981,
"model.layers.14.self_attn.dense": 184895,
"model.layers.14.self_attn.k_proj": 217243,
"model.layers.14.self_attn.q_proj": 213786,
"model.layers.14.self_attn.v_proj": 198728,
"model.layers.15.mlp.fc1": 666136,
"model.layers.15.mlp.fc2": 761731,
"model.layers.15.self_attn.dense": 177136,
"model.layers.15.self_attn.k_proj": 217895,
"model.layers.15.self_attn.q_proj": 232288,
"model.layers.15.self_attn.v_proj": 191081,
"model.layers.16.mlp.fc1": 656390,
"model.layers.16.mlp.fc2": 799995,
"model.layers.16.self_attn.dense": 174656,
"model.layers.16.self_attn.k_proj": 217135,
"model.layers.16.self_attn.q_proj": 219926,
"model.layers.16.self_attn.v_proj": 185798,
"model.layers.17.mlp.fc1": 639288,
"model.layers.17.mlp.fc2": 775904,
"model.layers.17.self_attn.dense": 173271,
"model.layers.17.self_attn.k_proj": 206996,
"model.layers.17.self_attn.q_proj": 205270,
"model.layers.17.self_attn.v_proj": 184931,
"model.layers.18.mlp.fc1": 632334,
"model.layers.18.mlp.fc2": 768287,
"model.layers.18.self_attn.dense": 189412,
"model.layers.18.self_attn.k_proj": 215687,
"model.layers.18.self_attn.q_proj": 242190,
"model.layers.18.self_attn.v_proj": 200154,
"model.layers.19.mlp.fc1": 625021,
"model.layers.19.mlp.fc2": 738002,
"model.layers.19.self_attn.dense": 186977,
"model.layers.19.self_attn.k_proj": 216466,
"model.layers.19.self_attn.q_proj": 240694,
"model.layers.19.self_attn.v_proj": 197648,
"model.layers.2.mlp.fc1": 621667,
"model.layers.2.mlp.fc2": 757420,
"model.layers.2.self_attn.dense": 170986,
"model.layers.2.self_attn.k_proj": 225618,
"model.layers.2.self_attn.q_proj": 217741,
"model.layers.2.self_attn.v_proj": 200228,
"model.layers.20.mlp.fc1": 614692,
"model.layers.20.mlp.fc2": 727978,
"model.layers.20.self_attn.dense": 175731,
"model.layers.20.self_attn.k_proj": 213423,
"model.layers.20.self_attn.q_proj": 236043,
"model.layers.20.self_attn.v_proj": 183771,
"model.layers.21.mlp.fc1": 618662,
"model.layers.21.mlp.fc2": 738785,
"model.layers.21.self_attn.dense": 177493,
"model.layers.21.self_attn.k_proj": 208350,
"model.layers.21.self_attn.q_proj": 237646,
"model.layers.21.self_attn.v_proj": 187251,
"model.layers.22.mlp.fc1": 629352,
"model.layers.22.mlp.fc2": 818793,
"model.layers.22.self_attn.dense": 175140,
"model.layers.22.self_attn.k_proj": 202527,
"model.layers.22.self_attn.q_proj": 284459,
"model.layers.22.self_attn.v_proj": 180999,
"model.layers.23.mlp.fc1": 711633,
"model.layers.23.mlp.fc2": 1103566,
"model.layers.23.self_attn.dense": 219201,
"model.layers.23.self_attn.k_proj": 224644,
"model.layers.23.self_attn.q_proj": 397194,
"model.layers.23.self_attn.v_proj": 230928,
"model.layers.3.mlp.fc1": 663185,
"model.layers.3.mlp.fc2": 761065,
"model.layers.3.self_attn.dense": 185269,
"model.layers.3.self_attn.k_proj": 240041,
"model.layers.3.self_attn.q_proj": 232277,
"model.layers.3.self_attn.v_proj": 214858,
"model.layers.4.mlp.fc1": 716587,
"model.layers.4.mlp.fc2": 767640,
"model.layers.4.self_attn.dense": 179773,
"model.layers.4.self_attn.k_proj": 227913,
"model.layers.4.self_attn.q_proj": 220527,
"model.layers.4.self_attn.v_proj": 211685,
"model.layers.5.mlp.fc1": 707590,
"model.layers.5.mlp.fc2": 780274,
"model.layers.5.self_attn.dense": 178504,
"model.layers.5.self_attn.k_proj": 247977,
"model.layers.5.self_attn.q_proj": 243896,
"model.layers.5.self_attn.v_proj": 207831,
"model.layers.6.mlp.fc1": 710038,
"model.layers.6.mlp.fc2": 763787,
"model.layers.6.self_attn.dense": 190308,
"model.layers.6.self_attn.k_proj": 224045,
"model.layers.6.self_attn.q_proj": 220275,
"model.layers.6.self_attn.v_proj": 212698,
"model.layers.7.mlp.fc1": 715221,
"model.layers.7.mlp.fc2": 758669,
"model.layers.7.self_attn.dense": 175635,
"model.layers.7.self_attn.k_proj": 231017,
"model.layers.7.self_attn.q_proj": 224708,
"model.layers.7.self_attn.v_proj": 200219,
"model.layers.8.mlp.fc1": 722869,
"model.layers.8.mlp.fc2": 747381,
"model.layers.8.self_attn.dense": 184555,
"model.layers.8.self_attn.k_proj": 230928,
"model.layers.8.self_attn.q_proj": 224025,
"model.layers.8.self_attn.v_proj": 206979,
"model.layers.9.mlp.fc1": 719199,
"model.layers.9.mlp.fc2": 748623,
"model.layers.9.self_attn.dense": 174700,
"model.layers.9.self_attn.k_proj": 228878,
"model.layers.9.self_attn.q_proj": 222182,
"model.layers.9.self_attn.v_proj": 199200
}
},
"architectures": [
"PhiForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": null,
"embd_pdrop": 0.0,
"eos_token_id": null,
"hidden_act": "gelu_new",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 8192,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 2048,
"model_type": "phi",
"num_attention_heads": 32,
"num_hidden_layers": 24,
"num_key_value_heads": 32,
"partial_rotary_factor": 0.5,
"qk_layernorm": false,
"resid_pdrop": 0.0,
"rope_scaling": null,
"rope_theta": 10000.0,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.39.3",
"use_cache": true,
"vocab_size": 51200
}