#6 opened by Kpd81 (Owner)

Files changed:
- README.md +3 -6
- config.json +2 -2
- configuration_minicpm.py +0 -2
- modeling_minicpm.py +21 -42
README.md
CHANGED
@@ -344,6 +344,7 @@ When running evaluation on BEIR and C-MTEB/Retrieval, we use instructions in `in
 
 ```
 transformers==4.37.2
+flash-attn>2.3.5
 ```
 
 ### 示例脚本 Demo
@@ -357,9 +358,7 @@ import torch.nn.functional as F
 
 model_name = "openbmb/MiniCPM-Embedding"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
-# You can also use the following line to enable the Flash Attention 2 implementation
-# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
+model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
 model.eval()
 
 # 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
@@ -403,9 +402,7 @@ import torch
 from sentence_transformers import SentenceTransformer
 
 model_name = "openbmb/MiniCPM-Embedding"
-model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
-# You can also use the following line to enable the Flash Attention 2 implementation
-# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})
+model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={"attn_implementation": "flash_attention_2", "torch_dtype": torch.float16})
 
 queries = ["中国的首都是哪里?"]
 passages = ["beijing", "shanghai"]
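Since the revised README snippet loads the model only with `attn_implementation="flash_attention_2"`, readers without `flash-attn` installed lose the previous fallback path. Below is a minimal sketch of a guarded load; the `importlib` availability check and the `sdpa` fallback are illustrative assumptions, not part of this PR.

```python
# Sketch only: fall back to PyTorch SDPA attention when flash-attn is unavailable.
# The availability check and the fallback choice are assumptions, not from the PR.
import importlib.util

import torch
from transformers import AutoModel, AutoTokenizer

model_name = "openbmb/MiniCPM-Embedding"
attn_impl = "flash_attention_2" if importlib.util.find_spec("flash_attn") else "sdpa"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    attn_implementation=attn_impl,
    torch_dtype=torch.float16,
).to("cuda")
model.eval()
```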
config.json
CHANGED
@@ -17,7 +17,7 @@
   "initializer_range": 0.1,
   "intermediate_size": 5760,
   "is_causal": false,
-  "max_position_embeddings":
+  "max_position_embeddings": 2048,
   "num_attention_heads": 36,
   "num_hidden_layers": 40,
   "num_key_value_heads": 36,
@@ -25,7 +25,7 @@
   "rope_scaling": null,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.36.0",
-  "use_cache":
+  "use_cache": true,
   "vocab_size": 122753,
   "scale_emb": 12,
   "dim_model_base": 256,
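A quick way to confirm the two fields touched here is to load the config directly; a minimal sketch, assuming the revision being downloaded already contains this change:

```python
# Sketch: inspect the config fields updated by this PR.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("openbmb/MiniCPM-Embedding", trust_remote_code=True)
print(config.max_position_embeddings)  # 2048 with this change applied
print(config.use_cache)                # True with this change applied
```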
configuration_minicpm.py
CHANGED
@@ -140,7 +140,6 @@ class MiniCPMConfig(PretrainedConfig):
         dim_model_base=1,
         scale_depth=1,
         is_causal=True,
-        adapt_mean_pooling=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -169,7 +168,6 @@ class MiniCPMConfig(PretrainedConfig):
         self.dim_model_base = dim_model_base
         self.scale_depth = scale_depth
         self.is_causal = is_causal
-        self.adapt_mean_pooling = adapt_mean_pooling
 
         super().__init__(
             pad_token_id=pad_token_id,
modeling_minicpm.py
CHANGED
@@ -21,16 +21,12 @@
 import math
 import warnings
 from typing import List, Optional, Tuple, Union, Dict
-
-from tqdm import tqdm
+
 import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-import numpy as np
-from copy import deepcopy
-from transformers import AutoTokenizer
 
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
@@ -39,7 +35,6 @@ from transformers.modeling_attn_mask_utils import (
     _prepare_4d_attention_mask,
     _prepare_4d_causal_attention_mask,
     _prepare_4d_causal_attention_mask_for_sdpa,
-    _prepare_4d_attention_mask_for_sdpa,
 )
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
@@ -325,6 +320,9 @@ class MiniCPMAttention(nn.Module):
         self.rope_theta = config.rope_theta
 
         self.is_causal = config.is_causal
+
+        logger.info(f"self.is_causal = {self.is_causal}")
+
 
         if (self.head_dim * self.num_heads) != self.hidden_size:
             raise ValueError(
@@ -981,8 +979,6 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
-        self.is_causal = config.is_causal
-        self.adapt_mean_pooling = config.adapt_mean_pooling
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1004,7 +1000,6 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-        adapt_mean_pooling: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1049,35 +1044,24 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb
 
         _attention_mask = attention_mask
+
         if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
         elif self._use_sdpa and not output_attentions:
             # output_attentions=True can not be supported when using SDPA, and we fall back on
             # the manual implementation that requires a 4D causal mask in all cases.
-            if self.is_causal:
-                attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                    attention_mask,
-                    (batch_size, seq_length),
-                    inputs_embeds,
-                    past_key_values_length,
-                )
-            else:
-                attention_mask = _prepare_4d_attention_mask_for_sdpa(
-                    attention_mask,
-                    inputs_embeds.dtype,
-                )
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
         else:
             # 4d mask is passed through the layers
-            if self.is_causal:
-                attention_mask = _prepare_4d_causal_attention_mask(
-                    attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
-                )
-            else:
-                attention_mask = _prepare_4d_attention_mask(
-                    attention_mask,
-                    inputs_embeds.dtype,
-                )
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+            )
 
         # embed positions
         hidden_states = inputs_embeds
@@ -1125,18 +1109,14 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = None
-
         # gen weight before mean pooling
-        if adapt_mean_pooling is None:
-            adapt_mean_pooling = self.adapt_mean_pooling
-        if adapt_mean_pooling:
-            attention_mask_ = _attention_mask * _attention_mask.cumsum(dim=1)
-            s = hidden_states * attention_mask_.unsqueeze(-1).float()
-            d = attention_mask_.sum(dim=1, keepdim=True).unsqueeze(1).float() /_attention_mask.sum(dim=1, keepdim=True).unsqueeze(1).float()
-
-            hidden_states = s / d
+        attention_mask_ = _attention_mask * _attention_mask.cumsum(dim=1)
+        s = hidden_states * attention_mask_.unsqueeze(-1).float()
+        d = attention_mask_.sum(dim=1, keepdim=True).unsqueeze(1).float() /_attention_mask.sum(dim=1, keepdim=True).unsqueeze(1).float()
 
+        hidden_states = s / d
+
+        next_cache = None
         if use_cache:
             next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
         if not return_dict:
@@ -1147,8 +1127,7 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
             hidden_states=all_hidden_states,
             attentions=all_self_attns,
         )
-
-
+
 
 class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
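The pooling block that becomes unconditional in this diff scales each token's hidden state by its cumulative position among the non-padding tokens, so that a plain mean pooling afterwards behaves like weighted mean pooling (as the README comment notes). Below is a standalone sketch of the same arithmetic on a toy batch; it is an illustration written for this review, not code taken from the repository.

```python
# Toy illustration of the weighting applied before mean pooling (sketch, not repo code).
import torch

hidden_states = torch.randn(2, 4, 8)            # (batch, seq_len, hidden)
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])   # 1 = real token, 0 = padding

# Same arithmetic as the diff: weight token i by its 1-based position among real tokens.
attention_mask_ = attention_mask * attention_mask.cumsum(dim=1)    # e.g. [1, 2, 3, 0]
s = hidden_states * attention_mask_.unsqueeze(-1).float()
d = attention_mask_.sum(dim=1, keepdim=True).unsqueeze(1).float() / \
    attention_mask.sum(dim=1, keepdim=True).unsqueeze(1).float()   # mean weight per sequence
scaled = s / d                                                     # token i scaled by w_i / mean(w)

# Masked mean pooling over the scaled states now equals sum_i (w_i / sum(w)) * h_i,
# i.e. a weighted mean that emphasizes later non-padding tokens.
pooled = scaled.sum(dim=1) / attention_mask.sum(dim=1, keepdim=True).float()
print(pooled.shape)  # torch.Size([2, 8])
```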