tclf90 committed on
Commit
ea92b81
1 Parent(s): cc1a0fe

'decrease gptq group size'

Browse files
README.md CHANGED
@@ -16,15 +16,23 @@ tags:
16
 
17
 
18
  ### 【模型更新日期】
19
- ``` 2024-06-05 ```
20
 
21
  ### 【模型大小】
22
- `6.2GB`
23
 
24
  ### 【06-05 临时情况告知】
25
 
26
  1. 目前需要用vllm entrypoint的方式来启动模型。
27
- 2. 如果首次对话出现感叹号,那么添加一些提示词可以规避,近期会跟进寻找修复策略。
 
 
 
 
 
 
 
 
28
 
29
  ### 【介绍】
30
 
 
16
 
17
 
18
  ### 【模型更新日期】
19
+ ``` 2024-06-05 21:00 ```
20
 
21
  ### 【模型大小】
22
+ `6.5GB`
23
 
24
  ### 【06-05 临时情况告知】
25
 
26
  1. 目前需要用vllm entrypoint的方式来启动模型。
27
+ 2. 如果出现感叹号,请留言告知,并告知显卡型号。
28
+
29
+ ### 【更新日志】
30
+
31
+ ```
32
+ 2024-06-05 21:00
33
+ 1. 尝试修复!!!感叹号吐字问题。
34
+ 2. group_size 调整为64,减少量化精度损失。
35
+ ```
36
 
37
  ### 【介绍】
38
 
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "tclf90/glm-4-9b-chat-GPTQ-Int4-G128",
3
  "add_bias_linear": false,
4
  "add_qkv_bias": true,
5
  "apply_query_key_layer_scaling": true,
@@ -28,7 +28,7 @@
28
  "hidden_dropout": 0.0,
29
  "hidden_size": 4096,
30
  "kv_channels": 128,
31
- "layernorm_epsilon": 1.5625e-07,
32
  "model_type": "chatglm",
33
  "multi_query_attention": true,
34
  "multi_query_group_num": 2,
@@ -50,7 +50,7 @@
50
  "exllama_config": {
51
  "version": 1
52
  },
53
- "group_size": 128,
54
  "max_input_length": null,
55
  "model_seqlen": null,
56
  "module_name_preceding_first_block": null,
 
1
  {
2
+ "_name_or_path": "tclf90/glm-4-9b-chat-GPTQ-Int4",
3
  "add_bias_linear": false,
4
  "add_qkv_bias": true,
5
  "apply_query_key_layer_scaling": true,
 
28
  "hidden_dropout": 0.0,
29
  "hidden_size": 4096,
30
  "kv_channels": 128,
31
+ "layernorm_epsilon": 1e-05,
32
  "model_type": "chatglm",
33
  "multi_query_attention": true,
34
  "multi_query_group_num": 2,
 
50
  "exllama_config": {
51
  "version": 1
52
  },
53
+ "group_size": 64,
54
  "max_input_length": null,
55
  "model_seqlen": null,
56
  "module_name_preceding_first_block": null,
generation_config.json CHANGED
@@ -5,5 +5,6 @@
5
  151336,
6
  151338
7
  ],
 
8
  "transformers_version": "4.40.2"
9
  }
 
5
  151336,
6
  151338
7
  ],
8
+ "pad_token_id": 151329,
9
  "transformers_version": "4.40.2"
10
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db41f89442a01385028ac48bee7e767b14681bcd274d4939c6edbe707e27d523
3
- size 4975739320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65eadf64a1c6f70038b02fb1a6526e11a115459936ea74f6939b06a9bfe3990f
3
+ size 4995499432
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6953050103ce8867a55b25ccc84e193a5752c9e79a1207ce123c7f38aec46f1b
3
- size 1753736944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad97ab92f6ce8fa9f9fc3ab470e930c9aa837fb471d9dda9dd05a53cc0f294d1
3
+ size 1893310768
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 6729359424
4
  },
5
  "weight_map": {
6
  "transformer.embedding.word_embeddings.weight": "model-00001-of-00002.safetensors",
@@ -622,28 +622,28 @@
622
  "transformer.encoder.layers.33.self_attention.query_key_value.qzeros": "model-00001-of-00002.safetensors",
623
  "transformer.encoder.layers.33.self_attention.query_key_value.scales": "model-00001-of-00002.safetensors",
624
  "transformer.encoder.layers.34.input_layernorm.weight": "model-00001-of-00002.safetensors",
625
- "transformer.encoder.layers.34.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
626
- "transformer.encoder.layers.34.mlp.dense_4h_to_h.g_idx": "model-00001-of-00002.safetensors",
627
- "transformer.encoder.layers.34.mlp.dense_4h_to_h.qweight": "model-00001-of-00002.safetensors",
628
- "transformer.encoder.layers.34.mlp.dense_4h_to_h.qzeros": "model-00001-of-00002.safetensors",
629
- "transformer.encoder.layers.34.mlp.dense_4h_to_h.scales": "model-00001-of-00002.safetensors",
630
- "transformer.encoder.layers.34.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
631
- "transformer.encoder.layers.34.mlp.dense_h_to_4h.g_idx": "model-00001-of-00002.safetensors",
632
- "transformer.encoder.layers.34.mlp.dense_h_to_4h.qweight": "model-00001-of-00002.safetensors",
633
- "transformer.encoder.layers.34.mlp.dense_h_to_4h.qzeros": "model-00001-of-00002.safetensors",
634
- "transformer.encoder.layers.34.mlp.dense_h_to_4h.scales": "model-00001-of-00002.safetensors",
635
- "transformer.encoder.layers.34.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
636
  "transformer.encoder.layers.34.self_attention.dense.bias": "model-00001-of-00002.safetensors",
637
  "transformer.encoder.layers.34.self_attention.dense.g_idx": "model-00001-of-00002.safetensors",
638
  "transformer.encoder.layers.34.self_attention.dense.qweight": "model-00001-of-00002.safetensors",
639
  "transformer.encoder.layers.34.self_attention.dense.qzeros": "model-00001-of-00002.safetensors",
640
  "transformer.encoder.layers.34.self_attention.dense.scales": "model-00001-of-00002.safetensors",
641
- "transformer.encoder.layers.34.self_attention.query_key_value.bias": "model-00001-of-00002.safetensors",
642
- "transformer.encoder.layers.34.self_attention.query_key_value.g_idx": "model-00001-of-00002.safetensors",
643
- "transformer.encoder.layers.34.self_attention.query_key_value.qweight": "model-00001-of-00002.safetensors",
644
- "transformer.encoder.layers.34.self_attention.query_key_value.qzeros": "model-00001-of-00002.safetensors",
645
- "transformer.encoder.layers.34.self_attention.query_key_value.scales": "model-00001-of-00002.safetensors",
646
- "transformer.encoder.layers.35.input_layernorm.weight": "model-00001-of-00002.safetensors",
647
  "transformer.encoder.layers.35.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
648
  "transformer.encoder.layers.35.mlp.dense_4h_to_h.g_idx": "model-00002-of-00002.safetensors",
649
  "transformer.encoder.layers.35.mlp.dense_4h_to_h.qweight": "model-00002-of-00002.safetensors",
@@ -654,17 +654,17 @@
654
  "transformer.encoder.layers.35.mlp.dense_h_to_4h.qweight": "model-00002-of-00002.safetensors",
655
  "transformer.encoder.layers.35.mlp.dense_h_to_4h.qzeros": "model-00002-of-00002.safetensors",
656
  "transformer.encoder.layers.35.mlp.dense_h_to_4h.scales": "model-00002-of-00002.safetensors",
657
- "transformer.encoder.layers.35.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
658
- "transformer.encoder.layers.35.self_attention.dense.bias": "model-00001-of-00002.safetensors",
659
- "transformer.encoder.layers.35.self_attention.dense.g_idx": "model-00001-of-00002.safetensors",
660
- "transformer.encoder.layers.35.self_attention.dense.qweight": "model-00001-of-00002.safetensors",
661
- "transformer.encoder.layers.35.self_attention.dense.qzeros": "model-00001-of-00002.safetensors",
662
- "transformer.encoder.layers.35.self_attention.dense.scales": "model-00001-of-00002.safetensors",
663
- "transformer.encoder.layers.35.self_attention.query_key_value.bias": "model-00001-of-00002.safetensors",
664
- "transformer.encoder.layers.35.self_attention.query_key_value.g_idx": "model-00001-of-00002.safetensors",
665
- "transformer.encoder.layers.35.self_attention.query_key_value.qweight": "model-00001-of-00002.safetensors",
666
- "transformer.encoder.layers.35.self_attention.query_key_value.qzeros": "model-00001-of-00002.safetensors",
667
- "transformer.encoder.layers.35.self_attention.query_key_value.scales": "model-00001-of-00002.safetensors",
668
  "transformer.encoder.layers.36.input_layernorm.weight": "model-00002-of-00002.safetensors",
669
  "transformer.encoder.layers.36.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
670
  "transformer.encoder.layers.36.mlp.dense_4h_to_h.g_idx": "model-00002-of-00002.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_size": 6888693824
4
  },
5
  "weight_map": {
6
  "transformer.embedding.word_embeddings.weight": "model-00001-of-00002.safetensors",
 
622
  "transformer.encoder.layers.33.self_attention.query_key_value.qzeros": "model-00001-of-00002.safetensors",
623
  "transformer.encoder.layers.33.self_attention.query_key_value.scales": "model-00001-of-00002.safetensors",
624
  "transformer.encoder.layers.34.input_layernorm.weight": "model-00001-of-00002.safetensors",
625
+ "transformer.encoder.layers.34.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
626
+ "transformer.encoder.layers.34.mlp.dense_4h_to_h.g_idx": "model-00002-of-00002.safetensors",
627
+ "transformer.encoder.layers.34.mlp.dense_4h_to_h.qweight": "model-00002-of-00002.safetensors",
628
+ "transformer.encoder.layers.34.mlp.dense_4h_to_h.qzeros": "model-00002-of-00002.safetensors",
629
+ "transformer.encoder.layers.34.mlp.dense_4h_to_h.scales": "model-00002-of-00002.safetensors",
630
+ "transformer.encoder.layers.34.mlp.dense_h_to_4h.bias": "model-00002-of-00002.safetensors",
631
+ "transformer.encoder.layers.34.mlp.dense_h_to_4h.g_idx": "model-00002-of-00002.safetensors",
632
+ "transformer.encoder.layers.34.mlp.dense_h_to_4h.qweight": "model-00002-of-00002.safetensors",
633
+ "transformer.encoder.layers.34.mlp.dense_h_to_4h.qzeros": "model-00002-of-00002.safetensors",
634
+ "transformer.encoder.layers.34.mlp.dense_h_to_4h.scales": "model-00002-of-00002.safetensors",
635
+ "transformer.encoder.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
636
  "transformer.encoder.layers.34.self_attention.dense.bias": "model-00001-of-00002.safetensors",
637
  "transformer.encoder.layers.34.self_attention.dense.g_idx": "model-00001-of-00002.safetensors",
638
  "transformer.encoder.layers.34.self_attention.dense.qweight": "model-00001-of-00002.safetensors",
639
  "transformer.encoder.layers.34.self_attention.dense.qzeros": "model-00001-of-00002.safetensors",
640
  "transformer.encoder.layers.34.self_attention.dense.scales": "model-00001-of-00002.safetensors",
641
+ "transformer.encoder.layers.34.self_attention.query_key_value.bias": "model-00002-of-00002.safetensors",
642
+ "transformer.encoder.layers.34.self_attention.query_key_value.g_idx": "model-00002-of-00002.safetensors",
643
+ "transformer.encoder.layers.34.self_attention.query_key_value.qweight": "model-00002-of-00002.safetensors",
644
+ "transformer.encoder.layers.34.self_attention.query_key_value.qzeros": "model-00002-of-00002.safetensors",
645
+ "transformer.encoder.layers.34.self_attention.query_key_value.scales": "model-00002-of-00002.safetensors",
646
+ "transformer.encoder.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
647
  "transformer.encoder.layers.35.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
648
  "transformer.encoder.layers.35.mlp.dense_4h_to_h.g_idx": "model-00002-of-00002.safetensors",
649
  "transformer.encoder.layers.35.mlp.dense_4h_to_h.qweight": "model-00002-of-00002.safetensors",
 
654
  "transformer.encoder.layers.35.mlp.dense_h_to_4h.qweight": "model-00002-of-00002.safetensors",
655
  "transformer.encoder.layers.35.mlp.dense_h_to_4h.qzeros": "model-00002-of-00002.safetensors",
656
  "transformer.encoder.layers.35.mlp.dense_h_to_4h.scales": "model-00002-of-00002.safetensors",
657
+ "transformer.encoder.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
658
+ "transformer.encoder.layers.35.self_attention.dense.bias": "model-00002-of-00002.safetensors",
659
+ "transformer.encoder.layers.35.self_attention.dense.g_idx": "model-00002-of-00002.safetensors",
660
+ "transformer.encoder.layers.35.self_attention.dense.qweight": "model-00002-of-00002.safetensors",
661
+ "transformer.encoder.layers.35.self_attention.dense.qzeros": "model-00002-of-00002.safetensors",
662
+ "transformer.encoder.layers.35.self_attention.dense.scales": "model-00002-of-00002.safetensors",
663
+ "transformer.encoder.layers.35.self_attention.query_key_value.bias": "model-00002-of-00002.safetensors",
664
+ "transformer.encoder.layers.35.self_attention.query_key_value.g_idx": "model-00002-of-00002.safetensors",
665
+ "transformer.encoder.layers.35.self_attention.query_key_value.qweight": "model-00002-of-00002.safetensors",
666
+ "transformer.encoder.layers.35.self_attention.query_key_value.qzeros": "model-00002-of-00002.safetensors",
667
+ "transformer.encoder.layers.35.self_attention.query_key_value.scales": "model-00002-of-00002.safetensors",
668
  "transformer.encoder.layers.36.input_layernorm.weight": "model-00002-of-00002.safetensors",
669
  "transformer.encoder.layers.36.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
670
  "transformer.encoder.layers.36.mlp.dense_4h_to_h.g_idx": "model-00002-of-00002.safetensors",
modeling_chatglm.py CHANGED
@@ -324,7 +324,7 @@ class SelfAttention(torch.nn.Module):
324
  )
325
 
326
  def forward(
327
- self, hidden_states, attention_mask, rotary_pos_emb=None, kv_cache=None, use_cache=True
328
  ):
329
  # hidden_states: [b, sq, h]
330
 
 
324
  )
325
 
326
  def forward(
327
+ self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
328
  ):
329
  # hidden_states: [b, sq, h]
330