# -*- coding: utf-8 -*- # @Time : 2021/12/30 8:35 下午 # @Author : JianingWang # @File : mlm.py import logging from typing import Union, Tuple, Optional import torch from torch.nn import CrossEntropyLoss from transformers.modeling_outputs import MaskedLMOutput from transformers.models.bert import BertPreTrainedModel from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel, BertOnlyMLMHead from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaModel, RobertaLMHead from transformers.models.albert.modeling_albert import AlbertPreTrainedModel, AlbertModel, AlbertMLMHead from transformers.models.roformer.modeling_roformer import RoFormerPreTrainedModel, RoFormerModel, RoFormerOnlyMLMHead logger = logging.getLogger(__name__) """ Function: Use MLM to pre-train BERT Notes: - The label of non-masked token is -100, which can be used for cross-entropy function (only calculate loss at not -100) """ class BertForMaskedLM(BertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.bert = BertModel(config, add_pooling_layer=False) self.cls = BertOnlyMLMHead(config) # Initialize weights and apply final processing self.post_init() def forward( self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` kwargs (`Dict[str, any]`, optional, defaults to *{}*): Used to hide legacy arguments that have been deprecated. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) masked_lm_loss = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) if not return_dict: output = (prediction_scores,) + outputs[2:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output return MaskedLMOutput( loss=masked_lm_loss, # () logits=prediction_scores, # (batch_size, seq_len, vocab_size) hidden_states=outputs.hidden_states, # (batch_size, seq_len, hidden_size) attentions=outputs.attentions, ) """ Function: Use MLM to pre-train RoBERTa Notes: - The label of non-masked token is -100, which can be used for cross-entropy function (only calculate loss at not -100) """ class RobertaForMaskedLM(RobertaPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.roberta = BertModel(config, add_pooling_layer=False) self.lm_head = RobertaLMHead(config) # Initialize weights and apply final processing self.post_init() def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, token_type_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` kwargs (`Dict[str, any]`, optional, defaults to *{}*): Used to hide legacy arguments that have been deprecated. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) masked_lm_loss = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) if not return_dict: output = (prediction_scores,) + outputs[2:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output return MaskedLMOutput( loss=masked_lm_loss, # () logits=prediction_scores, # (batch_size, seq_len, vocab_size) hidden_states=outputs.hidden_states, # (batch_size, seq_len, hidden_size) attentions=outputs.attentions, ) """ Function: Use MLM to pre-train ALBERT Notes: - The label of non-masked token is -100, which can be used for cross-entropy function (only calculate loss at not -100) """ class AlbertForMaskedLM(AlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.albert = AlbertModel(config, add_pooling_layer=False) self.predictions = AlbertMLMHead(config) # Initialize weights and apply final processing self.post_init() def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, token_type_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[MaskedLMOutput, Tuple]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` Returns: Example: ```python >>> import torch >>> from transformers import AlbertTokenizer, AlbertForMaskedLM >>> tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2") >>> model = AlbertForMaskedLM.from_pretrained("albert-base-v2") >>> # add mask_token >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt") >>> with torch.no_grad(): ... logits = model(**inputs).logits >>> # retrieve index of [MASK] >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0] >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1) >>> tokenizer.decode(predicted_token_id) "france" ``` ```python >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"] >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100) >>> outputs = model(**inputs, labels=labels) >>> round(outputs.loss.item(), 2) 0.81 ``` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_outputs = outputs[0] prediction_scores = self.predictions(sequence_outputs) masked_lm_loss = None if labels is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) if not return_dict: output = (prediction_scores,) + outputs[2:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) """ Function: Use MLM to pre-train RoFormer Notes: - The label of non-masked token is -100, which can be used for cross-entropy function (only calculate loss at not -100) """ class RoFormerForMaskedLM(RoFormerPreTrainedModel): def __init__(self, config): super().__init__(config) if config.is_decoder: logger.warning( "If you want to use `RoFormerForMaskedLM` make sure `config.is_decoder=False` for " "bi-directional self-attention." ) self.roformer = RoFormerModel(config) self.cls = RoFormerOnlyMLMHead(config) # Initialize weights and apply final processing self.post_init() def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, token_type_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[MaskedLMOutput, Tuple[torch.Tensor]]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.roformer( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) masked_lm_loss = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) if not return_dict: output = (prediction_scores,) + outputs[1:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) if __name__ == "__main__": from transformers.models.bert.tokenization_bert import BertTokenizer tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") model = BertForMaskedLM.from_pretrained("bert-base-uncased") input_text = "Today is a nice day, I will [MASK] to play [MASK] with my friends." inputs = tokenizer(input_text, return_tensors="pt") masked_positions = inputs["input_ids"] == tokenizer.mask_token_id print("inputs=", inputs) """ inputs= {"input_ids": tensor([[ 101, 2651, 2003, 1037, 3835, 2154, 1010, 1045, 2097, 103, 2000, 2377, 103, 2007, 2026, 2814, 1012, 102]]), "token_type_ids": tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), "attention_mask": tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])} """ outputs = model(**inputs) masked_results = outputs.logits.argmax(-1)[masked_positions] masked_results = tokenizer.convert_ids_to_tokens(masked_results) print("masked_results=", masked_results) """ masked_results= ["have", "football"] """