tclf90 committed
Commit ea92b81
Parent(s): cc1a0fe
"decrease gptq group size"

Files changed:
- README.md +11 -3
- config.json +3 -3
- generation_config.json +1 -0
- model-00001-of-00002.safetensors +2 -2
- model-00002-of-00002.safetensors +2 -2
- model.safetensors.index.json +29 -29
- modeling_chatglm.py +1 -1
README.md
CHANGED
@@ -16,15 +16,23 @@ tags:
 
 
 ### 【Model Update Date】
-``` 2024-06-05 ```
+``` 2024-06-05 21:00 ```
 
 ### 【Model Size】
-`6.
+`6.5GB`
 
 ### 【06-05 Temporary Notice】
 
 1. For now, the model must be launched via the vLLM entrypoint.
-2.
+2. If exclamation marks appear in the output, please leave a comment and mention your GPU model.
+
+### 【Changelog】
+
+```
+2024-06-05 21:00
+1. Attempted to fix the "!!!" exclamation-mark output issue.
+2. Adjusted group_size to 64 to reduce quantization precision loss.
+```
 
 ### 【Introduction】
 
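A hedged sketch of item 1 in the notice above (not the author's exact command; flag availability varies by vLLM version): serving this checkpoint through vLLM's OpenAI-compatible server entrypoint, with `--trust-remote-code` for the repo's custom `modeling_chatglm.py`.

```python
# Sketch only: launch vLLM's OpenAI-compatible entrypoint for this GPTQ
# checkpoint. The flags below exist in vLLM but may differ across versions.
import subprocess

subprocess.run([
    "python", "-m", "vllm.entrypoints.openai.api_server",
    "--model", "tclf90/glm-4-9b-chat-GPTQ-Int4",
    "--trust-remote-code",       # needed for the custom modeling_chatglm.py
    "--quantization", "gptq",    # 4-bit GPTQ weights (group_size 64 after this commit)
])
```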
config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "tclf90/glm-4-9b-chat-GPTQ-Int4
+  "_name_or_path": "tclf90/glm-4-9b-chat-GPTQ-Int4",
   "add_bias_linear": false,
   "add_qkv_bias": true,
   "apply_query_key_layer_scaling": true,
@@ -28,7 +28,7 @@
   "hidden_dropout": 0.0,
   "hidden_size": 4096,
   "kv_channels": 128,
-  "layernorm_epsilon":
+  "layernorm_epsilon": 1e-05,
   "model_type": "chatglm",
   "multi_query_attention": true,
   "multi_query_group_num": 2,
@@ -50,7 +50,7 @@
   "exllama_config": {
     "version": 1
   },
-  "group_size":
+  "group_size": 64,
   "max_input_length": null,
   "model_seqlen": null,
   "module_name_preceding_first_block": null,
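The `group_size` change is the substance of this commit: in 4-bit GPTQ, one scale/zero-point pair is stored per group of weights, so shrinking the groups to 64 reduces quantization error at the cost of a slightly larger checkpoint. A hedged sketch of how such a quantization would be requested with transformers' GPTQ integration (the base model id and calibration set below are assumptions, not taken from this repo):

```python
# Sketch only: 4-bit GPTQ quantization with group_size=64 via transformers.
# "THUDM/glm-4-9b-chat" and the "c4" calibration set are assumed here.
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

base = "THUDM/glm-4-9b-chat"
tok = AutoTokenizer.from_pretrained(base, trust_remote_code=True)
cfg = GPTQConfig(bits=4, group_size=64, dataset="c4", tokenizer=tok)

model = AutoModelForCausalLM.from_pretrained(
    base,
    quantization_config=cfg,   # quantizes while loading
    device_map="auto",
    trust_remote_code=True,
)
```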
generation_config.json
CHANGED
@@ -5,5 +5,6 @@
     151336,
     151338
   ],
+  "pad_token_id": 151329,
   "transformers_version": "4.40.2"
 }
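The added `pad_token_id` mainly matters for batched generation, where prompts of different lengths must be padded. A minimal sketch of the effect (assuming a local transformers install):

```python
# Sketch: with pad_token_id set, batched tokenization and generate() can pad
# shorter prompts instead of warning and falling back to the eos token.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "tclf90/glm-4-9b-chat-GPTQ-Int4", trust_remote_code=True
)
batch = tok(["Hello", "What does group_size 64 change?"],
            padding=True, return_tensors="pt")
print(batch["input_ids"].shape)  # both rows padded to the same length
```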
model-00001-of-00002.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:65eadf64a1c6f70038b02fb1a6526e11a115459936ea74f6939b06a9bfe3990f
+size 4995499432
model-00002-of-00002.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:ad97ab92f6ce8fa9f9fc3ab470e930c9aa837fb471d9dda9dd05a53cc0f294d1
+size 1893310768
model.safetensors.index.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size":
+    "total_size": 6888693824
   },
   "weight_map": {
     "transformer.embedding.word_embeddings.weight": "model-00001-of-00002.safetensors",
@@ -622,28 +622,28 @@
     "transformer.encoder.layers.33.self_attention.query_key_value.qzeros": "model-00001-of-00002.safetensors",
     "transformer.encoder.layers.33.self_attention.query_key_value.scales": "model-00001-of-00002.safetensors",
     "transformer.encoder.layers.34.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.mlp.dense_4h_to_h.g_idx": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.mlp.dense_4h_to_h.qweight": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.mlp.dense_4h_to_h.qzeros": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.mlp.dense_4h_to_h.scales": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.mlp.dense_h_to_4h.g_idx": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.mlp.dense_h_to_4h.qweight": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.mlp.dense_h_to_4h.qzeros": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.mlp.dense_h_to_4h.scales": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "transformer.encoder.layers.34.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.mlp.dense_4h_to_h.g_idx": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.mlp.dense_4h_to_h.qweight": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.mlp.dense_4h_to_h.qzeros": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.mlp.dense_4h_to_h.scales": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.mlp.dense_h_to_4h.bias": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.mlp.dense_h_to_4h.g_idx": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.mlp.dense_h_to_4h.qweight": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.mlp.dense_h_to_4h.qzeros": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.mlp.dense_h_to_4h.scales": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
     "transformer.encoder.layers.34.self_attention.dense.bias": "model-00001-of-00002.safetensors",
     "transformer.encoder.layers.34.self_attention.dense.g_idx": "model-00001-of-00002.safetensors",
     "transformer.encoder.layers.34.self_attention.dense.qweight": "model-00001-of-00002.safetensors",
     "transformer.encoder.layers.34.self_attention.dense.qzeros": "model-00001-of-00002.safetensors",
     "transformer.encoder.layers.34.self_attention.dense.scales": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.self_attention.query_key_value.bias": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.self_attention.query_key_value.g_idx": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.self_attention.query_key_value.qweight": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.self_attention.query_key_value.qzeros": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.34.self_attention.query_key_value.scales": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.35.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "transformer.encoder.layers.34.self_attention.query_key_value.bias": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.self_attention.query_key_value.g_idx": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.self_attention.query_key_value.qweight": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.self_attention.query_key_value.qzeros": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.34.self_attention.query_key_value.scales": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
     "transformer.encoder.layers.35.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
     "transformer.encoder.layers.35.mlp.dense_4h_to_h.g_idx": "model-00002-of-00002.safetensors",
     "transformer.encoder.layers.35.mlp.dense_4h_to_h.qweight": "model-00002-of-00002.safetensors",
@@ -654,17 +654,17 @@
     "transformer.encoder.layers.35.mlp.dense_h_to_4h.qweight": "model-00002-of-00002.safetensors",
     "transformer.encoder.layers.35.mlp.dense_h_to_4h.qzeros": "model-00002-of-00002.safetensors",
     "transformer.encoder.layers.35.mlp.dense_h_to_4h.scales": "model-00002-of-00002.safetensors",
-    "transformer.encoder.layers.35.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.35.self_attention.dense.bias": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.35.self_attention.dense.g_idx": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.35.self_attention.dense.qweight": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.35.self_attention.dense.qzeros": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.35.self_attention.dense.scales": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.35.self_attention.query_key_value.bias": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.35.self_attention.query_key_value.g_idx": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.35.self_attention.query_key_value.qweight": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.35.self_attention.query_key_value.qzeros": "model-00001-of-00002.safetensors",
-    "transformer.encoder.layers.35.self_attention.query_key_value.scales": "model-00001-of-00002.safetensors",
+    "transformer.encoder.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.35.self_attention.dense.bias": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.35.self_attention.dense.g_idx": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.35.self_attention.dense.qweight": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.35.self_attention.dense.qzeros": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.35.self_attention.dense.scales": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.35.self_attention.query_key_value.bias": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.35.self_attention.query_key_value.g_idx": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.35.self_attention.query_key_value.qweight": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.35.self_attention.query_key_value.qzeros": "model-00002-of-00002.safetensors",
+    "transformer.encoder.layers.35.self_attention.query_key_value.scales": "model-00002-of-00002.safetensors",
     "transformer.encoder.layers.36.input_layernorm.weight": "model-00002-of-00002.safetensors",
     "transformer.encoder.layers.36.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
     "transformer.encoder.layers.36.mlp.dense_4h_to_h.g_idx": "model-00002-of-00002.safetensors",
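The reshuffled `weight_map` is a mechanical consequence of the size change: group_size 64 stores twice as many scales and zero-points per tensor as a 128-group quantization would, so shard 1 fills up sooner and the late-layer tensors spill into shard 2. A small sketch (assuming a local checkout of this repo) to inspect the mapping:

```python
# Sketch: count tensors per shard and check the declared total tensor size.
# total_size counts tensor bytes only, so it is smaller than the summed
# .safetensors file sizes, which also include per-file headers.
import json
from collections import Counter

with open("model.safetensors.index.json") as f:
    index = json.load(f)

print("total tensor bytes:", index["metadata"]["total_size"])  # 6888693824
print(Counter(index["weight_map"].values()))  # tensors per shard file
```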
modeling_chatglm.py
CHANGED
@@ -324,7 +324,7 @@ class SelfAttention(torch.nn.Module):
         )
 
     def forward(
-        self, hidden_states, attention_mask, rotary_pos_emb
+        self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
     ):
         # hidden_states: [b, sq, h]
 
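The restored `kv_cache`/`use_cache` parameters let callers thread a key/value cache through the attention layer for incremental decoding. A toy stand-in (placeholder names, not the repo's real internals) showing the call pattern the new signature supports:

```python
# Sketch: the first call runs without a cache (prefill); later calls pass the
# returned cache back in so only the new token's keys/values are appended.
import torch

def forward(hidden_states, attention_mask, rotary_pos_emb,
            kv_cache=None, use_cache=True):
    k = v = hidden_states          # stand-in for the real K/V projections
    if kv_cache is not None:
        past_k, past_v = kv_cache
        k = torch.cat((past_k, k), dim=1)   # extend cached keys
        v = torch.cat((past_v, v), dim=1)   # extend cached values
    return hidden_states, ((k, v) if use_cache else None)

h = torch.zeros(1, 1, 4096)                 # [b, sq, h], one new token
out, cache = forward(h, None, None)                   # prefill step
out, cache = forward(h, None, None, kv_cache=cache)   # decode step reuses cache
```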