{
  "_name_or_path": "checkpoints/microsoft/phi-1_5",
  "anyprec": {
    "arch_config": {
      "layers_name": "layers",
      "model_name": "model",
      "module_names": [
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.dense",
        "mlp.fc1",
        "mlp.fc2"
      ]
    },
    "group_count": 1,
    "parent_precision": 4,
    "seed_precision": 2,
    "sparse_numvals": {
      "model.layers.0.mlp.fc1": 1279820,
      "model.layers.0.mlp.fc2": 1236307,
      "model.layers.0.self_attn.dense": 189187,
      "model.layers.0.self_attn.k_proj": 318137,
      "model.layers.0.self_attn.q_proj": 285336,
      "model.layers.0.self_attn.v_proj": 212019,
      "model.layers.1.mlp.fc1": 539655,
      "model.layers.1.mlp.fc2": 766840,
      "model.layers.1.self_attn.dense": 176195,
      "model.layers.1.self_attn.k_proj": 224990,
      "model.layers.1.self_attn.q_proj": 219788,
      "model.layers.1.self_attn.v_proj": 201422,
      "model.layers.10.mlp.fc1": 710241,
      "model.layers.10.mlp.fc2": 754886,
      "model.layers.10.self_attn.dense": 180978,
      "model.layers.10.self_attn.k_proj": 221678,
      "model.layers.10.self_attn.q_proj": 212231,
      "model.layers.10.self_attn.v_proj": 200830,
      "model.layers.11.mlp.fc1": 710220,
      "model.layers.11.mlp.fc2": 741443,
      "model.layers.11.self_attn.dense": 178022,
      "model.layers.11.self_attn.k_proj": 215089,
      "model.layers.11.self_attn.q_proj": 207761,
      "model.layers.11.self_attn.v_proj": 194583,
      "model.layers.12.mlp.fc1": 697047,
      "model.layers.12.mlp.fc2": 752645,
      "model.layers.12.self_attn.dense": 176274,
      "model.layers.12.self_attn.k_proj": 220310,
      "model.layers.12.self_attn.q_proj": 213155,
      "model.layers.12.self_attn.v_proj": 198321,
      "model.layers.13.mlp.fc1": 687659,
      "model.layers.13.mlp.fc2": 738003,
      "model.layers.13.self_attn.dense": 178018,
      "model.layers.13.self_attn.k_proj": 221049,
      "model.layers.13.self_attn.q_proj": 210113,
      "model.layers.13.self_attn.v_proj": 195362,
      "model.layers.14.mlp.fc1": 682287,
      "model.layers.14.mlp.fc2": 781981,
      "model.layers.14.self_attn.dense": 184895,
      "model.layers.14.self_attn.k_proj": 217243,
      "model.layers.14.self_attn.q_proj": 213786,
      "model.layers.14.self_attn.v_proj": 198728,
      "model.layers.15.mlp.fc1": 666136,
      "model.layers.15.mlp.fc2": 761731,
      "model.layers.15.self_attn.dense": 177136,
      "model.layers.15.self_attn.k_proj": 217895,
      "model.layers.15.self_attn.q_proj": 232288,
      "model.layers.15.self_attn.v_proj": 191081,
      "model.layers.16.mlp.fc1": 656390,
      "model.layers.16.mlp.fc2": 799995,
      "model.layers.16.self_attn.dense": 174656,
      "model.layers.16.self_attn.k_proj": 217135,
      "model.layers.16.self_attn.q_proj": 219926,
      "model.layers.16.self_attn.v_proj": 185798,
      "model.layers.17.mlp.fc1": 639288,
      "model.layers.17.mlp.fc2": 775904,
      "model.layers.17.self_attn.dense": 173271,
      "model.layers.17.self_attn.k_proj": 206996,
      "model.layers.17.self_attn.q_proj": 205270,
      "model.layers.17.self_attn.v_proj": 184931,
      "model.layers.18.mlp.fc1": 632334,
      "model.layers.18.mlp.fc2": 768287,
      "model.layers.18.self_attn.dense": 189412,
      "model.layers.18.self_attn.k_proj": 215687,
      "model.layers.18.self_attn.q_proj": 242190,
      "model.layers.18.self_attn.v_proj": 200154,
      "model.layers.19.mlp.fc1": 625021,
      "model.layers.19.mlp.fc2": 738002,
      "model.layers.19.self_attn.dense": 186977,
      "model.layers.19.self_attn.k_proj": 216466,
      "model.layers.19.self_attn.q_proj": 240694,
      "model.layers.19.self_attn.v_proj": 197648,
      "model.layers.2.mlp.fc1": 621667,
      "model.layers.2.mlp.fc2": 757420,
      "model.layers.2.self_attn.dense": 170986,
      "model.layers.2.self_attn.k_proj": 225618,
      "model.layers.2.self_attn.q_proj": 217741,
      "model.layers.2.self_attn.v_proj": 200228,
      "model.layers.20.mlp.fc1": 614692,
      "model.layers.20.mlp.fc2": 727978,
      "model.layers.20.self_attn.dense": 175731,
      "model.layers.20.self_attn.k_proj": 213423,
      "model.layers.20.self_attn.q_proj": 236043,
      "model.layers.20.self_attn.v_proj": 183771,
      "model.layers.21.mlp.fc1": 618662,
      "model.layers.21.mlp.fc2": 738785,
      "model.layers.21.self_attn.dense": 177493,
      "model.layers.21.self_attn.k_proj": 208350,
      "model.layers.21.self_attn.q_proj": 237646,
      "model.layers.21.self_attn.v_proj": 187251,
      "model.layers.22.mlp.fc1": 629352,
      "model.layers.22.mlp.fc2": 818793,
      "model.layers.22.self_attn.dense": 175140,
      "model.layers.22.self_attn.k_proj": 202527,
      "model.layers.22.self_attn.q_proj": 284459,
      "model.layers.22.self_attn.v_proj": 180999,
      "model.layers.23.mlp.fc1": 711633,
      "model.layers.23.mlp.fc2": 1103566,
      "model.layers.23.self_attn.dense": 219201,
      "model.layers.23.self_attn.k_proj": 224644,
      "model.layers.23.self_attn.q_proj": 397194,
      "model.layers.23.self_attn.v_proj": 230928,
      "model.layers.3.mlp.fc1": 663185,
      "model.layers.3.mlp.fc2": 761065,
      "model.layers.3.self_attn.dense": 185269,
      "model.layers.3.self_attn.k_proj": 240041,
      "model.layers.3.self_attn.q_proj": 232277,
      "model.layers.3.self_attn.v_proj": 214858,
      "model.layers.4.mlp.fc1": 716587,
      "model.layers.4.mlp.fc2": 767640,
      "model.layers.4.self_attn.dense": 179773,
      "model.layers.4.self_attn.k_proj": 227913,
      "model.layers.4.self_attn.q_proj": 220527,
      "model.layers.4.self_attn.v_proj": 211685,
      "model.layers.5.mlp.fc1": 707590,
      "model.layers.5.mlp.fc2": 780274,
      "model.layers.5.self_attn.dense": 178504,
      "model.layers.5.self_attn.k_proj": 247977,
      "model.layers.5.self_attn.q_proj": 243896,
      "model.layers.5.self_attn.v_proj": 207831,
      "model.layers.6.mlp.fc1": 710038,
      "model.layers.6.mlp.fc2": 763787,
      "model.layers.6.self_attn.dense": 190308,
      "model.layers.6.self_attn.k_proj": 224045,
      "model.layers.6.self_attn.q_proj": 220275,
      "model.layers.6.self_attn.v_proj": 212698,
      "model.layers.7.mlp.fc1": 715221,
      "model.layers.7.mlp.fc2": 758669,
      "model.layers.7.self_attn.dense": 175635,
      "model.layers.7.self_attn.k_proj": 231017,
      "model.layers.7.self_attn.q_proj": 224708,
      "model.layers.7.self_attn.v_proj": 200219,
      "model.layers.8.mlp.fc1": 722869,
      "model.layers.8.mlp.fc2": 747381,
      "model.layers.8.self_attn.dense": 184555,
      "model.layers.8.self_attn.k_proj": 230928,
      "model.layers.8.self_attn.q_proj": 224025,
      "model.layers.8.self_attn.v_proj": 206979,
      "model.layers.9.mlp.fc1": 719199,
      "model.layers.9.mlp.fc2": 748623,
      "model.layers.9.self_attn.dense": 174700,
      "model.layers.9.self_attn.k_proj": 228878,
      "model.layers.9.self_attn.q_proj": 222182,
      "model.layers.9.self_attn.v_proj": 199200
    }
  },
  "architectures": [
    "PhiForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": null,
  "embd_pdrop": 0.0,
  "eos_token_id": null,
  "hidden_act": "gelu_new",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "phi",
  "num_attention_heads": 32,
  "num_hidden_layers": 24,
  "num_key_value_heads": 32,
  "partial_rotary_factor": 0.5,
  "qk_layernorm": false,
  "resid_pdrop": 0.0,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 51200
}