Files changed (4)
  1. README.md +3 -6
  2. config.json +2 -2
  3. configuration_minicpm.py +0 -2
  4. modeling_minicpm.py +21 -42
README.md CHANGED
@@ -344,6 +344,7 @@ When running evaluation on BEIR and C-MTEB/Retrieval, we use instructions in `in
 
 ```
 transformers==4.37.2
+flash-attn>2.3.5
 ```
 
 ### 示例脚本 Demo
@@ -357,9 +358,7 @@ import torch.nn.functional as F
 
 model_name = "openbmb/MiniCPM-Embedding"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
-# You can also use the following line to enable the Flash Attention 2 implementation
-# model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
+model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", torch_dtype=torch.float16).to("cuda")
 model.eval()
 
 # 由于在 `model.forward` 中缩放了最终隐层表示,此处的 mean pooling 实际上起到了 weighted mean pooling 的作用
@@ -403,9 +402,7 @@ import torch
 from sentence_transformers import SentenceTransformer
 
 model_name = "openbmb/MiniCPM-Embedding"
-model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={ "torch_dtype": torch.float16})
-# You can also use the following line to enable the Flash Attention 2 implementation
-# model = SentenceTransformer(model_name, trust_remote_code=True, attn_implementation="flash_attention_2", model_kwargs={ "torch_dtype": torch.float16})
+model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={"attn_implementation": "flash_attention_2", "torch_dtype": torch.float16})
 
 queries = ["中国的首都是哪里?"]
 passages = ["beijing", "shanghai"]
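For reviewers trying the updated README path end to end, a minimal sketch of the Transformers demo after this change is shown below. It assumes flash-attn is installed and a CUDA device is available, omits the retrieval instruction prefix the README describes for BEIR/C-MTEB evaluation, and the `encode` helper is illustrative rather than copied from the README. As the README comment notes, because `model.forward` scales the final hidden states, the plain masked mean pooling here effectively performs weighted mean pooling.

```python
# Illustrative sketch only; assumes flash-attn>2.3.5 and a CUDA device.
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

model_name = "openbmb/MiniCPM-Embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",  # now the default path shown in the README
    torch_dtype=torch.float16,
).to("cuda")
model.eval()


@torch.no_grad()
def encode(texts):
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to("cuda")
    outputs = model(**batch)
    # model.forward already rescales the final hidden states, so a plain
    # mask-averaged pooling here behaves like weighted mean pooling.
    mask = batch["attention_mask"].unsqueeze(-1).float()
    embeddings = (outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1)
    return F.normalize(embeddings, p=2, dim=1)


query_embs = encode(["中国的首都是哪里?"])
passage_embs = encode(["beijing", "shanghai"])
print(query_embs @ passage_embs.T)  # cosine similarities after normalization
```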
config.json CHANGED
@@ -17,7 +17,7 @@
   "initializer_range": 0.1,
   "intermediate_size": 5760,
   "is_causal": false,
-  "max_position_embeddings": 512,
+  "max_position_embeddings": 2048,
   "num_attention_heads": 36,
   "num_hidden_layers": 40,
   "num_key_value_heads": 36,
@@ -25,7 +25,7 @@
   "rope_scaling": null,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.36.0",
-  "use_cache": false,
+  "use_cache": true,
   "vocab_size": 122753,
   "scale_emb": 12,
   "dim_model_base": 256,
configuration_minicpm.py CHANGED
@@ -140,7 +140,6 @@ class MiniCPMConfig(PretrainedConfig):
         dim_model_base=1,
         scale_depth=1,
         is_causal=True,
-        adapt_mean_pooling=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -169,7 +168,6 @@ class MiniCPMConfig(PretrainedConfig):
         self.dim_model_base = dim_model_base
         self.scale_depth = scale_depth
         self.is_causal = is_causal
-        self.adapt_mean_pooling = adapt_mean_pooling
 
         super().__init__(
             pad_token_id=pad_token_id,
modeling_minicpm.py CHANGED
@@ -21,16 +21,12 @@
 import math
 import warnings
 from typing import List, Optional, Tuple, Union, Dict
-import os
-from tqdm import tqdm
+
 import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-import numpy as np
-from copy import deepcopy
-from transformers import AutoTokenizer
 
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
@@ -39,7 +35,6 @@ from transformers.modeling_attn_mask_utils import (
     _prepare_4d_attention_mask,
     _prepare_4d_causal_attention_mask,
     _prepare_4d_causal_attention_mask_for_sdpa,
-    _prepare_4d_attention_mask_for_sdpa,
 )
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
@@ -325,6 +320,9 @@ class MiniCPMAttention(nn.Module):
         self.rope_theta = config.rope_theta
 
         self.is_causal = config.is_causal
+
+        logger.info(f"self.is_causal = {self.is_causal}")
+
 
         if (self.head_dim * self.num_heads) != self.hidden_size:
             raise ValueError(
@@ -981,8 +979,6 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
-        self.is_causal = config.is_causal
-        self.adapt_mean_pooling = config.adapt_mean_pooling
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1004,7 +1000,6 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-        adapt_mean_pooling: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1049,35 +1044,24 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
             inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb
 
         _attention_mask = attention_mask
+
         if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
         elif self._use_sdpa and not output_attentions:
             # output_attentions=True can not be supported when using SDPA, and we fall back on
             # the manual implementation that requires a 4D causal mask in all cases.
-            if self.is_causal:
-                attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                    attention_mask,
-                    (batch_size, seq_length),
-                    inputs_embeds,
-                    past_key_values_length,
-                )
-            else:
-                attention_mask = _prepare_4d_attention_mask_for_sdpa(
-                    attention_mask,
-                    inputs_embeds.dtype,
-                )
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
         else:
             # 4d mask is passed through the layers
-            if self.is_causal:
-                attention_mask = _prepare_4d_causal_attention_mask(
-                    attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
-                )
-            else:
-                attention_mask = _prepare_4d_attention_mask(
-                    attention_mask,
-                    inputs_embeds.dtype,
-                )
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+            )
 
         # embed positions
         hidden_states = inputs_embeds
@@ -1125,18 +1109,14 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = None
-
         # gen weight before mean pooling
-        if adapt_mean_pooling is None:
-            adapt_mean_pooling = self.adapt_mean_pooling
-        if adapt_mean_pooling:
-            attention_mask_ = _attention_mask * _attention_mask.cumsum(dim=1)
-            s = hidden_states * attention_mask_.unsqueeze(-1).float()
-            d = attention_mask_.sum(dim=1, keepdim=True).unsqueeze(1).float() /_attention_mask.sum(dim=1, keepdim=True).unsqueeze(1).float()
-
-            hidden_states = s / d
+        attention_mask_ = _attention_mask * _attention_mask.cumsum(dim=1)
+        s = hidden_states * attention_mask_.unsqueeze(-1).float()
+        d = attention_mask_.sum(dim=1, keepdim=True).unsqueeze(1).float() / _attention_mask.sum(dim=1, keepdim=True).unsqueeze(1).float()
 
+        hidden_states = s / d
+
+        next_cache = None
         if use_cache:
             next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
         if not return_dict:
@@ -1147,8 +1127,7 @@
             hidden_states=all_hidden_states,
             attentions=all_self_attns,
         )
-
-
+
 
 class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
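To make the modeling change concrete: `MiniCPMModel.forward` now always rescales the final hidden states (previously gated by `adapt_mean_pooling`) so that a plain masked mean over `last_hidden_state` equals a position-weighted mean of the original states, with token weights growing linearly with position inside the non-padded span. A self-contained toy check of that equivalence, with variable names mirroring the diff:

```python
# Toy check of the position-weighted pooling now applied unconditionally in forward.
import torch

batch_size, seq_len, hidden_size = 2, 5, 4
hidden_states = torch.randn(batch_size, seq_len, hidden_size)
_attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                                [1, 1, 1, 1, 1]])

# What forward now does before returning last_hidden_state:
attention_mask_ = _attention_mask * _attention_mask.cumsum(dim=1)  # weights 1, 2, 3, ... per real token
s = hidden_states * attention_mask_.unsqueeze(-1).float()
d = attention_mask_.sum(dim=1, keepdim=True).unsqueeze(1).float() / _attention_mask.sum(dim=1, keepdim=True).unsqueeze(1).float()
scaled = s / d

# A plain masked mean over the scaled states ...
mask = _attention_mask.unsqueeze(-1).float()
plain_mean_of_scaled = (scaled * mask).sum(dim=1) / mask.sum(dim=1)

# ... equals a weighted mean of the original states (weights = token positions).
weighted_mean = (
    (hidden_states * attention_mask_.unsqueeze(-1).float()).sum(dim=1)
    / attention_mask_.sum(dim=1, keepdim=True).float()
)

print(torch.allclose(plain_mean_of_scaled, weighted_mean, atol=1e-6))  # True
```

This equivalence is why the README demos, and the removal of the `adapt_mean_pooling` switch, can rely on ordinary mean pooling downstream.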