ccdv committed on
Commit
ad79eac
·
1 Parent(s): 90fb5ad
Files changed (1) hide show
  1. modeling_lsg_xlm_roberta.py +37 -78
modeling_lsg_xlm_roberta.py CHANGED
@@ -55,7 +55,8 @@ class LSGXLMRobertaConfig(XLMRobertaConfig):
55
 
56
  if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride", "block_stride"]:
57
  logger.warning(
58
- "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], setting sparsity_type=None, computation will skip sparse attention")
 
59
  self.sparsity_type = None
60
 
61
  if self.sparsity_type in ["stride", "block_stride"]:
@@ -71,7 +72,7 @@ class LSGXLMRobertaConfig(XLMRobertaConfig):
71
  self.num_global_tokens = 1
72
  elif self.num_global_tokens > 512:
73
  logger.warning(
74
- "[WARNING CONFIG]: num_global_tokens > 512 is not compatible, setting num_global_tokens=512"
75
  )
76
  self.num_global_tokens = 512
77
 
@@ -79,6 +80,16 @@ class LSGXLMRobertaConfig(XLMRobertaConfig):
79
  assert self.block_size % self.sparsity_factor == 0, "[ERROR CONFIG]: block_size must be divisible by sparsity_factor"
80
  assert self.block_size//self.sparsity_factor >= 1, "[ERROR CONFIG]: make sure block_size >= sparsity_factor"
81
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  class BaseSelfAttention(nn.Module):
84
 
@@ -436,39 +447,13 @@ class LSGRobertaEmbeddings(RobertaEmbeddings):
436
  return embeddings
437
 
438
 
439
- class LSGRobertaSelfOutput(RobertaSelfOutput):
440
-
441
- def __init__(self, config):
442
- super().__init__(config)
443
-
444
-
445
  class LSGAttention(RobertaAttention):
446
 
447
  def __init__(self, config):
448
 
449
- nn.Module.__init__(self)
450
 
451
  self.self = LSGSelfAttention(config)
452
- self.output = LSGRobertaSelfOutput(config)
453
- self.pruned_heads = set()
454
-
455
-
456
- class LSGRobertaIntermediate(RobertaIntermediate):
457
-
458
- def __init__(self, config):
459
- super().__init__(config)
460
-
461
-
462
- class LSGRobertaOutput(RobertaOutput):
463
-
464
- def __init__(self, config):
465
- super().__init__(config)
466
-
467
-
468
- class LSGRobertaPooler(RobertaPooler):
469
-
470
- def __init__(self, config):
471
- super().__init__(config)
472
 
473
 
474
  class LSGSelfAttention(BaseSelfAttention):
@@ -726,9 +711,7 @@ class LSGSelfAttention(BaseSelfAttention):
726
  attention_mask=attention_mask,
727
  output_attentions=output_attentions
728
  )
729
-
730
- #if head_mask is not None:
731
- # outputs = (outputs[0] * head_mask[:, :, :1, :1], ) + outputs[1:]
732
  return outputs
733
 
734
  def causal_forward(
@@ -898,29 +881,20 @@ class LSGRobertaLayer(RobertaLayer):
898
 
899
  def __init__(self, config):
900
 
901
- nn.Module.__init__(self)
902
 
903
- self.chunk_size_feed_forward = config.chunk_size_feed_forward
904
- self.seq_len_dim = 1
905
  self.attention = LSGAttention(config)
906
- self.is_decoder = config.is_decoder
907
- self.add_cross_attention = config.add_cross_attention
908
  if self.add_cross_attention:
909
  assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
910
  self.crossattention = LSGAttention(config)
911
- self.intermediate = LSGRobertaIntermediate(config)
912
- self.output = LSGRobertaOutput(config)
913
 
914
 
915
  class LSGRobertaEncoder(RobertaEncoder):
916
 
917
  def __init__(self, config):
918
 
919
- nn.Module.__init__(self)
920
-
921
- self.config = config
922
  self.layer = nn.ModuleList([LSGRobertaLayer(config) for _ in range(config.num_hidden_layers)])
923
- self.gradient_checkpointing = False
924
 
925
 
926
  class LSGRobertaPreTrainedModel(RobertaPreTrainedModel):
@@ -945,7 +919,7 @@ class LSGXLMRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
945
  config_class = LSGXLMRobertaConfig
946
 
947
 
948
- def __init__(self, config, add_pooling_layer=False):
949
 
950
  LSGRobertaPreTrainedModel.__init__(self, config)
951
 
@@ -961,7 +935,7 @@ class LSGXLMRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
961
 
962
  self.embeddings = LSGRobertaEmbeddings(config)
963
  self.encoder = LSGRobertaEncoder(config)
964
- self.pooler = LSGRobertaPooler(config) if add_pooling_layer else None
965
 
966
  if config.add_cross_attention:
967
  logger.warning(
@@ -988,6 +962,12 @@ class LSGXLMRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
988
  return_dict=None
989
  ):
990
 
 
 
 
 
 
 
991
  inputs_ = input_ids if input_ids is not None else inputs_embeds
992
  n, t = inputs_.size()[:2]
993
 
@@ -1032,33 +1012,26 @@ class LSGXLMRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
1032
  return_dict=return_dict
1033
  )
1034
 
1035
- context = encoder_outputs[0]
1036
  if self.pool_with_global:
1037
- context[:, self.num_global_tokens] = context[:, 0]
1038
 
1039
  diff = t - t_
1040
- n, _, d = context.size()
1041
- context = context[..., self.num_global_tokens:, :]
1042
 
1043
  # Adapt sequence to initial shape
1044
  if diff < 0:
1045
- context = context[:, :t]
1046
 
1047
- encoder_outputs.last_hidden_state = context
1048
- sequence_output = encoder_outputs[0]
1049
  pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1050
 
1051
  if not return_dict:
1052
  return (sequence_output, pooled_output) + encoder_outputs[1:]
1053
-
1054
- return BaseModelOutputWithPoolingAndCrossAttentions(
1055
- last_hidden_state=sequence_output,
1056
- pooler_output=pooled_output,
1057
- past_key_values=encoder_outputs.past_key_values,
1058
- hidden_states=encoder_outputs.hidden_states,
1059
- attentions=encoder_outputs.attentions,
1060
- cross_attentions=encoder_outputs.cross_attentions,
1061
- )
1062
 
1063
  def get_extended_attention_mask(self, attention_mask, input_shape, device=None):
1064
 
@@ -1093,7 +1066,7 @@ class LSGXLMRobertaForCausalLM(LSGRobertaPreTrainedModel, RobertaForCausalLM):
1093
  logger.warning("If you want to use `LSGRobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
1094
 
1095
  self.roberta = LSGXLMRobertaModel(config, add_pooling_layer=False)
1096
- self.lm_head = LSGRobertaLMHead(config)
1097
 
1098
  # The LM head weights require special treatment only when they are tied with the word embeddings
1099
  self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1123,7 +1096,7 @@ class LSGXLMRobertaForMaskedLM(LSGRobertaPreTrainedModel, RobertaForMaskedLM):
1123
  )
1124
 
1125
  self.roberta = LSGXLMRobertaModel(config, add_pooling_layer=False)
1126
- self.lm_head = LSGRobertaLMHead(config)
1127
 
1128
  # The LM head weights require special treatment only when they are tied with the word embeddings
1129
  self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1132,13 +1105,6 @@ class LSGXLMRobertaForMaskedLM(LSGRobertaPreTrainedModel, RobertaForMaskedLM):
1132
  self.post_init()
1133
 
1134
 
1135
- class LSGRobertaLMHead(RobertaLMHead):
1136
- """LSG Head for masked language modeling."""
1137
-
1138
- def __init__(self, config):
1139
- super().__init__(config)
1140
-
1141
-
1142
  class LSGXLMRobertaForSequenceClassification(LSGRobertaPreTrainedModel, RobertaForSequenceClassification):
1143
  """
1144
  This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the superclass for the
@@ -1155,19 +1121,12 @@ class LSGXLMRobertaForSequenceClassification(LSGRobertaPreTrainedModel, RobertaF
1155
  self.config = config
1156
 
1157
  self.roberta = LSGXLMRobertaModel(config, add_pooling_layer=False)
1158
- self.classifier = LSGRobertaClassificationHead(config)
1159
 
1160
  # Initialize weights and apply final processing
1161
  self.post_init()
1162
 
1163
 
1164
- class LSGRobertaClassificationHead(RobertaClassificationHead):
1165
- """Head for sentence-level classification tasks."""
1166
-
1167
- def __init__(self, config):
1168
- super().__init__(config)
1169
-
1170
-
1171
  class LSGXLMRobertaForMultipleChoice(LSGRobertaPreTrainedModel, RobertaForMultipleChoice):
1172
  """
1173
  This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the superclass for the
 
55
 
56
  if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride", "block_stride"]:
57
  logger.warning(
58
+ "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], \
59
+ setting sparsity_type=None, computation will skip sparse attention")
60
  self.sparsity_type = None
61
 
62
  if self.sparsity_type in ["stride", "block_stride"]:
 
72
  self.num_global_tokens = 1
73
  elif self.num_global_tokens > 512:
74
  logger.warning(
75
+ "[WARNING CONFIG]: num_global_tokens > 512 is not allowed, setting num_global_tokens=512"
76
  )
77
  self.num_global_tokens = 512
78
 
 
80
  assert self.block_size % self.sparsity_factor == 0, "[ERROR CONFIG]: block_size must be divisible by sparsity_factor"
81
  assert self.block_size//self.sparsity_factor >= 1, "[ERROR CONFIG]: make sure block_size >= sparsity_factor"
82
 
83
+ if self.mask_first_token and not pool_with_global:
84
+ logger.warning(
85
+ "[WARNING CONFIG]: pool_with_global==False is not compatible with mask_first_token==True. Setting pool_with_global to True.")
86
+ self.pool_with_global = True
87
+
88
+ if hasattr(self, "position_embedding_type"):
89
+ if self.position_embedding_type != "absolute":
90
+ logger.warning(
91
+ "[WARNING CONFIG]: LSG Attention is not compatible with relative positional embedding and will skip its computation. Set position_embedding_type='absolute' to remove this warning.")
92
+
93
 
94
  class BaseSelfAttention(nn.Module):
95
 
 
447
  return embeddings
448
 
449
 
 
 
 
 
 
 
450
  class LSGAttention(RobertaAttention):
451
 
452
  def __init__(self, config):
453
 
454
+ super().__init__(config)
455
 
456
  self.self = LSGSelfAttention(config)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
 
458
 
459
  class LSGSelfAttention(BaseSelfAttention):
 
711
  attention_mask=attention_mask,
712
  output_attentions=output_attentions
713
  )
714
+
 
 
715
  return outputs
716
 
717
  def causal_forward(
 
881
 
882
  def __init__(self, config):
883
 
884
+ super().__init__(config)
885
 
 
 
886
  self.attention = LSGAttention(config)
 
 
887
  if self.add_cross_attention:
888
  assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
889
  self.crossattention = LSGAttention(config)
 
 
890
 
891
 
892
  class LSGRobertaEncoder(RobertaEncoder):
893
 
894
  def __init__(self, config):
895
 
896
+ super().__init__(config)
 
 
897
  self.layer = nn.ModuleList([LSGRobertaLayer(config) for _ in range(config.num_hidden_layers)])
 
898
 
899
 
900
  class LSGRobertaPreTrainedModel(RobertaPreTrainedModel):
 
919
  config_class = LSGXLMRobertaConfig
920
 
921
 
922
+ def __init__(self, config, add_pooling_layer=True):
923
 
924
  LSGRobertaPreTrainedModel.__init__(self, config)
925
 
 
935
 
936
  self.embeddings = LSGRobertaEmbeddings(config)
937
  self.encoder = LSGRobertaEncoder(config)
938
+ self.pooler = RobertaPooler(config) if add_pooling_layer else None
939
 
940
  if config.add_cross_attention:
941
  logger.warning(
 
962
  return_dict=None
963
  ):
964
 
965
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
966
+ output_hidden_states = (
967
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
968
+ )
969
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
970
+
971
  inputs_ = input_ids if input_ids is not None else inputs_embeds
972
  n, t = inputs_.size()[:2]
973
 
 
1012
  return_dict=return_dict
1013
  )
1014
 
1015
+ sequence_output = encoder_outputs[0]
1016
  if self.pool_with_global:
1017
+ sequence_output[:, self.num_global_tokens] = sequence_output[:, 0]
1018
 
1019
  diff = t - t_
1020
+ n, _, d = sequence_output.size()
1021
+ sequence_output = sequence_output[..., self.num_global_tokens:, :]
1022
 
1023
  # Adapt sequence to initial shape
1024
  if diff < 0:
1025
+ sequence_output = sequence_output[:, :t]
1026
 
 
 
1027
  pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1028
 
1029
  if not return_dict:
1030
  return (sequence_output, pooled_output) + encoder_outputs[1:]
1031
+
1032
+ encoder_outputs.last_hidden_state = sequence_output
1033
+ encoder_outputs.pooler_output = pooled_output
1034
+ return encoder_outputs
 
 
 
 
 
1035
 
1036
  def get_extended_attention_mask(self, attention_mask, input_shape, device=None):
1037
 
 
1066
  logger.warning("If you want to use `LSGRobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
1067
 
1068
  self.roberta = LSGXLMRobertaModel(config, add_pooling_layer=False)
1069
+ self.lm_head = RobertaLMHead(config)
1070
 
1071
  # The LM head weights require special treatment only when they are tied with the word embeddings
1072
  self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
 
1096
  )
1097
 
1098
  self.roberta = LSGXLMRobertaModel(config, add_pooling_layer=False)
1099
+ self.lm_head = RobertaLMHead(config)
1100
 
1101
  # The LM head weights require special treatment only when they are tied with the word embeddings
1102
  self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
 
1105
  self.post_init()
1106
 
1107
 
 
 
 
 
 
 
 
1108
  class LSGXLMRobertaForSequenceClassification(LSGRobertaPreTrainedModel, RobertaForSequenceClassification):
1109
  """
1110
  This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the superclass for the
 
1121
  self.config = config
1122
 
1123
  self.roberta = LSGXLMRobertaModel(config, add_pooling_layer=False)
1124
+ self.classifier = RobertaClassificationHead(config)
1125
 
1126
  # Initialize weights and apply final processing
1127
  self.post_init()
1128
 
1129
 
 
 
 
 
 
 
 
1130
  class LSGXLMRobertaForMultipleChoice(LSGRobertaPreTrainedModel, RobertaForMultipleChoice):
1131
  """
1132
  This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the superclass for the