update for transformers >= 4.29.1

modeling_lsg_roberta.py  CHANGED  (+17 -22)
@@ -188,19 +188,25 @@ class CausalAttentionProduct(nn.Module):
         del key_layer
 
         if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function)
-            attention_scores = attention_scores + attention_mask
-
             # Add causal mask
             causal_shape = (self.block_size, self.block_size) if causal_shape is None else causal_shape
             causal_mask = torch.tril(
                 torch.ones(*causal_shape, device=attention_mask.device, dtype=attention_scores.dtype),
                 diagonal=-1
                 )
-
-
+
+            # Min value
+            dtype_min = torch.tensor(
+                torch.finfo(attention_scores.dtype).min, device=attention_scores.device, dtype=attention_scores.dtype
+            )
+
+            # Build causal + attention_mask
+            causal_mask = torch.nn.functional.pad(causal_mask.T * dtype_min, (attention_mask.size()[-1] - self.block_size, 0), value=0)
+            attention_mask = torch.max(attention_mask + causal_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0), dtype_min)
 
+            attention_scores = attention_scores + attention_mask
             del attention_mask
+            del causal_mask
 
         # Normalize the attention scores to probabilities.
         attention_probs = nn.Softmax(dim=-1)(attention_scores)
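Below is a minimal, self-contained sketch (not the repository code) of what the new masking path computes: the strictly lower-triangular block from torch.tril(..., diagonal=-1) is transposed, scaled to the dtype minimum, left-padded to the key length, folded into the additive attention mask, and the combined mask is applied to the scores in a single addition. Tensor shapes and values here are simplified assumptions for illustration, not the model's actual layout.

# Illustrative sketch of the updated masking scheme; shapes are made up.
import torch

block_size = 4
key_len = 10
dtype = torch.float32

# Additive mask as transformers builds it: 0 = attend, large negative = masked.
attention_mask = torch.zeros(1, 1, 1, key_len, dtype=dtype)          # (batch, heads, 1, key)
attention_scores = torch.randn(1, 1, block_size, key_len, dtype=dtype)

# Strictly lower-triangular block, matching torch.tril(..., diagonal=-1) in the diff
causal_mask = torch.tril(torch.ones(block_size, block_size, dtype=dtype), diagonal=-1)

# Minimum representable value of the score dtype
dtype_min = torch.tensor(torch.finfo(dtype).min, dtype=dtype)

# Transpose, scale to "minus infinity"-like values, and left-pad up to the key length
causal_mask = torch.nn.functional.pad(causal_mask.T * dtype_min, (key_len - block_size, 0), value=0)

# Fold into the additive mask; torch.max floors the sum at dtype_min so two large
# negative terms cannot overflow past the representable range
attention_mask = torch.max(attention_mask + causal_mask.unsqueeze(0).unsqueeze(0), dtype_min)

# Single application of the combined mask, as in the updated forward pass
attention_scores = attention_scores + attention_mask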
@@ -972,6 +978,9 @@ class LSGRobertaPreTrainedModel(RobertaPreTrainedModel):
     """
 
     config_class = LSGRobertaConfig
+    base_model_prefix = "roberta"
+    supports_gradient_checkpointing = True
+    _no_split_modules = []
 
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (RobertaEncoder, LSGRobertaEncoder)):
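base_model_prefix, supports_gradient_checkpointing and _no_split_modules are the standard PreTrainedModel class attributes that recent transformers releases look for on the base class (_no_split_modules is consulted when loading with a device_map). A hedged usage sketch, with a placeholder checkpoint path:

# Hedged usage sketch: with supports_gradient_checkpointing = True declared on the
# base class, the stock transformers API below is expected to toggle checkpointing
# through _set_gradient_checkpointing. The checkpoint path is a placeholder.
from transformers import AutoModel

model = AutoModel.from_pretrained("path/to/lsg-roberta-checkpoint", trust_remote_code=True)
model.gradient_checkpointing_enable()    # sets gradient_checkpointing=True on the encoder
model.gradient_checkpointing_disable()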
@@ -984,9 +993,6 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
     documentation alongside usage examples.
     """
 
-    config_class = LSGRobertaConfig
-
-
     def __init__(self, config, add_pooling_layer=True):
 
         LSGRobertaPreTrainedModel.__init__(self, config)
@@ -1023,9 +1029,7 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
 
 class LSGRobertaForCausalLM(LSGRobertaPreTrainedModel, RobertaForCausalLM):
 
-
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
 
     def __init__(self, config):
 
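In the transformers versions this commit targets, tied parameters are declared through _tied_weights_keys instead of being silenced with _keys_to_ignore_on_load_missing / _keys_to_ignore_on_load_unexpected. For RoBERTa-style heads the decoder projection shares its weight with the input embeddings (and its bias with lm_head.bias), so those tensors never need to exist as separate checkpoint entries. A small illustrative check against plain RoBERTa (downloads roberta-base from the Hub):

# Illustrative check: the MLM head's decoder shares its weight tensor with the
# input embedding matrix, which is why "lm_head.decoder.weight" is listed as a
# tied key rather than ignored at load time.
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM.from_pretrained("roberta-base")
print(model.lm_head.decoder.weight.data_ptr() ==
      model.roberta.embeddings.word_embeddings.weight.data_ptr())   # True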
@@ -1050,9 +1054,7 @@ class LSGRobertaForMaskedLM(LSGRobertaPreTrainedModel, RobertaForMaskedLM):
     documentation alongside usage examples.
     """
 
-
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
 
     def __init__(self, config):
 
@@ -1080,8 +1082,6 @@ class LSGRobertaForSequenceClassification(LSGRobertaPreTrainedModel, RobertaForS
     appropriate documentation alongside usage examples.
     """
 
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
 
         LSGRobertaPreTrainedModel.__init__(self, config)
@@ -1102,8 +1102,6 @@ class LSGRobertaForMultipleChoice(LSGRobertaPreTrainedModel, RobertaForMultipleC
     appropriate documentation alongside usage examples.
     """
 
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
 
         LSGRobertaPreTrainedModel.__init__(self, config)
@@ -1147,10 +1145,7 @@ class LSGRobertaForQuestionAnswering(LSGRobertaPreTrainedModel, RobertaForQuesti
     This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the superclass for the
     appropriate documentation alongside usage examples.
     """
-
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
+
     def __init__(self, config):
 
         LSGRobertaPreTrainedModel.__init__(self, config)
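A hedged smoke test of what this commit is meant to guarantee: the custom modeling code loading cleanly under transformers >= 4.29.1. The model id below is a placeholder, not a claim about an existing repository.

# Hedged smoke test; the model id is a placeholder.
import transformers
from packaging import version
from transformers import AutoModelForMaskedLM, AutoTokenizer

assert version.parse(transformers.__version__) >= version.parse("4.29.1")

name = "some-user/lsg-roberta-checkpoint"     # placeholder model id
tokenizer = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(name, trust_remote_code=True)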